xref: /openbsd-src/gnu/usr.bin/perl/cpan/Unicode-Collate/Collate.pm (revision d59bb9942320b767f2a19aaa7690c8c6e30b724c)
1package Unicode::Collate;
2
3BEGIN {
4    unless ("A" eq pack('U', 0x41)) {
5	die "Unicode::Collate cannot stringify a Unicode code point\n";
6    }
7    unless (0x41 == unpack('U', 'A')) {
8	die "Unicode::Collate cannot get a Unicode code point\n";
9    }
10}
11
12use 5.006;
13use strict;
14use warnings;
15use Carp;
16use File::Spec;
17
18no warnings 'utf8';
19
20our $VERSION = '1.14';
21our $PACKAGE = __PACKAGE__;
22
23### begin XS only ###
24require DynaLoader;
25our @ISA = qw(DynaLoader);
26bootstrap Unicode::Collate $VERSION;
27### end XS only ###
28
29my @Path = qw(Unicode Collate);
30my $KeyFile = "allkeys.txt";
31
32# Perl's boolean
33use constant TRUE  => 1;
34use constant FALSE => "";
35use constant NOMATCHPOS => -1;
36
37# A coderef to get combining class imported from Unicode::Normalize
38# (i.e. \&Unicode::Normalize::getCombinClass).
39# This is also used as a HAS_UNICODE_NORMALIZE flag.
40my $CVgetCombinClass;
41
42# Supported Levels
43use constant MinLevel => 1;
44use constant MaxLevel => 4;
45
46# Minimum weights at level 2 and 3, respectively
47use constant Min2Wt => 0x20;
48use constant Min3Wt => 0x02;
49
50# Shifted weight at 4th level
51use constant Shift4Wt => 0xFFFF;
52
53# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
54use constant VCE_TEMPLATE => 'Cn4';
55
56# A sort key: 16-bit weights
57use constant KEY_TEMPLATE => 'n*';
58
59# The tie-breaking: 32-bit weights
60use constant TIE_TEMPLATE => 'N*';
61
62# Level separator in a sort key:
63# i.e. pack(KEY_TEMPLATE, 0)
64use constant LEVEL_SEP => "\0\0";
65
66# As Unicode code point separator for hash keys.
67# A joined code point string (denoted by JCPS below)
68# like "65;768" is used for internal processing
69# instead of Perl's Unicode string like "\x41\x{300}",
70# as the native code point is different from the Unicode code point
71# on EBCDIC platform.
72# This character must not be included in any stringified
73# representation of an integer.
74use constant CODE_SEP => ';';
75	# NOTE: in regex /;/ is used for $jcps!
76
77# boolean values of variable weights
78use constant NON_VAR => 0; # Non-Variable character
79use constant VAR     => 1; # Variable character
80
81# specific code points
82use constant Hangul_SIni   => 0xAC00;
83use constant Hangul_SFin   => 0xD7A3;
84
85# Logical_Order_Exception in PropList.txt
86my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
87
88# for highestFFFF and minimalFFFE
89my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
90my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
91
92sub UCA_Version { "30" }
93
94sub Base_Unicode_Version { "7.0.0" }
95
96######
97
98sub pack_U {
99    return pack('U*', @_);
100}
101
102sub unpack_U {
103    return unpack('U*', shift(@_).pack('U*'));
104}
105
106######
107
108my (%VariableOK);
109@VariableOK{ qw/
110    blanked  non-ignorable  shifted  shift-trimmed
111  / } = (); # keys lowercased
112
113our @ChangeOK = qw/
114    alternate backwards level normalization rearrange
115    katakana_before_hiragana upper_before_lower ignore_level2
116    overrideCJK overrideHangul overrideOut preprocess UCA_Version
117    hangul_terminator variable identical highestFFFF minimalFFFE
118    long_contraction
119  /;
120
121our @ChangeNG = qw/
122    entry mapping table maxlength contraction
123    ignoreChar ignoreName undefChar undefName rewrite
124    versionTable alternateTable backwardsTable forwardsTable
125    rearrangeTable variableTable
126    derivCode normCode rearrangeHash backwardsFlag
127    suppress suppressHash
128    __useXS /; ### XS only
129# The hash key 'ignored' was deleted at v 0.21.
130# The hash key 'isShift' was deleted at v 0.23.
131# The hash key 'combining' was deleted at v 0.24.
132# The hash key 'entries' was deleted at v 0.30.
133# The hash key 'L3_ignorable' was deleted at v 0.40.
134
135sub version {
136    my $self = shift;
137    return $self->{versionTable} || 'unknown';
138}
139
140my (%ChangeOK, %ChangeNG);
141@ChangeOK{ @ChangeOK } = ();
142@ChangeNG{ @ChangeNG } = ();
143
144sub change {
145    my $self = shift;
146    my %hash = @_;
147    my %old;
148    if (exists $hash{alternate}) {
149	if (exists $hash{variable}) {
150	    delete $hash{alternate};
151	} else {
152	    $hash{variable} = $hash{alternate};
153	}
154    }
155    foreach my $k (keys %hash) {
156	if (exists $ChangeOK{$k}) {
157	    $old{$k} = $self->{$k};
158	    $self->{$k} = $hash{$k};
159	} elsif (exists $ChangeNG{$k}) {
160	    croak "change of $k via change() is not allowed!";
161	}
162	# else => ignored
163    }
164    $self->checkCollator();
165    return wantarray ? %old : $self;
166}
167
168sub _checkLevel {
169    my $level = shift;
170    my $key   = shift; # 'level' or 'backwards'
171    MinLevel <= $level or croak sprintf
172	"Illegal level %d (in value for key '%s') lower than %d.",
173	    $level, $key, MinLevel;
174    $level <= MaxLevel or croak sprintf
175	"Unsupported level %d (in value for key '%s') higher than %d.",
176	    $level, $key, MaxLevel;
177}
178
179my %DerivCode = (
180    8 => \&_derivCE_8,
181    9 => \&_derivCE_9,
182   11 => \&_derivCE_9, # 11 == 9
183   14 => \&_derivCE_14,
184   16 => \&_derivCE_14, # 16 == 14
185   18 => \&_derivCE_18,
186   20 => \&_derivCE_20,
187   22 => \&_derivCE_22,
188   24 => \&_derivCE_24,
189   26 => \&_derivCE_24, # 26 == 24
190   28 => \&_derivCE_24, # 28 == 24
191   30 => \&_derivCE_24, # 30 == 24
192);
193
194sub checkCollator {
195    my $self = shift;
196    _checkLevel($self->{level}, "level");
197
198    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
199	or croak "Illegal UCA version (passed $self->{UCA_Version}).";
200
201    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
202				$self->{alternateTable} || 'shifted';
203    $self->{variable} = $self->{alternate} = lc($self->{variable});
204    exists $VariableOK{ $self->{variable} }
205	or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
206
207    if (! defined $self->{backwards}) {
208	$self->{backwardsFlag} = 0;
209    } elsif (! ref $self->{backwards}) {
210	_checkLevel($self->{backwards}, "backwards");
211	$self->{backwardsFlag} = 1 << $self->{backwards};
212    } else {
213	my %level;
214	$self->{backwardsFlag} = 0;
215	for my $b (@{ $self->{backwards} }) {
216	    _checkLevel($b, "backwards");
217	    $level{$b} = 1;
218	}
219	for my $v (sort keys %level) {
220	    $self->{backwardsFlag} += 1 << $v;
221	}
222    }
223
224    defined $self->{rearrange} or $self->{rearrange} = [];
225    ref $self->{rearrange}
226	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
227
228    # keys of $self->{rearrangeHash} are $self->{rearrange}.
229    $self->{rearrangeHash} = undef;
230
231    if (@{ $self->{rearrange} }) {
232	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
233    }
234
235    $self->{normCode} = undef;
236
237    if (defined $self->{normalization}) {
238	eval { require Unicode::Normalize };
239	$@ and croak "Unicode::Normalize is required to normalize strings";
240
241	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
242
243	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
244	    $self->{normCode} = \&Unicode::Normalize::NFD;
245	}
246	elsif ($self->{normalization} ne 'prenormalized') {
247	    my $norm = $self->{normalization};
248	    $self->{normCode} = sub {
249		Unicode::Normalize::normalize($norm, shift);
250	    };
251	    eval { $self->{normCode}->("") }; # try
252	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
253	}
254    }
255    return;
256}
257
258sub new
259{
260    my $class = shift;
261    my $self = bless { @_ }, $class;
262
263### begin XS only ###
264    if (! exists $self->{table}     && !defined $self->{rewrite} &&
265	!defined $self->{undefName} && !defined $self->{ignoreName} &&
266	!defined $self->{undefChar} && !defined $self->{ignoreChar}) {
267	$self->{__useXS} = \&_fetch_simple;
268    } else {
269	$self->{__useXS} = undef;
270    }
271### end XS only ###
272
273    # keys of $self->{suppressHash} are $self->{suppress}.
274    if ($self->{suppress} && @{ $self->{suppress} }) {
275	@{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
276    } # before read_table()
277
278    # If undef is passed explicitly, no file is read.
279    $self->{table} = $KeyFile if ! exists $self->{table};
280    $self->read_table() if defined $self->{table};
281
282    if ($self->{entry}) {
283	while ($self->{entry} =~ /([^\n]+)/g) {
284	    $self->parseEntry($1, TRUE);
285	}
286    }
287
288    # only in new(), not in change()
289    $self->{level} ||= MaxLevel;
290    $self->{UCA_Version} ||= UCA_Version();
291
292    $self->{overrideHangul} = FALSE
293	if ! exists $self->{overrideHangul};
294    $self->{overrideCJK} = FALSE
295	if ! exists $self->{overrideCJK};
296    $self->{normalization} = 'NFD'
297	if ! exists $self->{normalization};
298    $self->{rearrange} = $self->{rearrangeTable} ||
299	($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
300	if ! exists $self->{rearrange};
301    $self->{backwards} = $self->{backwardsTable}
302	if ! exists $self->{backwards};
303    exists $self->{long_contraction} or $self->{long_contraction}
304	= 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
305
306    # checkCollator() will be called in change()
307    $self->checkCollator();
308
309    return $self;
310}
311
312sub parseAtmark {
313    my $self = shift;
314    my $line = shift; # after s/^\s*\@//
315
316    if ($line =~ /^version\s*(\S*)/) {
317	$self->{versionTable} ||= $1;
318    }
319    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
320	$self->{variableTable} ||= $1;
321    }
322    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
323	$self->{alternateTable} ||= $1;
324    }
325    elsif ($line =~ /^backwards\s+(\S*)/) {
326	push @{ $self->{backwardsTable} }, $1;
327    }
328    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
329	push @{ $self->{forwardsTable} }, $1;
330    }
331    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
332	push @{ $self->{rearrangeTable} }, _getHexArray($1);
333    }
334}
335
336sub read_table {
337    my $self = shift;
338
339### begin XS only ###
340    if ($self->{__useXS}) {
341	my @rest = _fetch_rest(); # complex matter need to parse
342	for my $line (@rest) {
343	    next if $line =~ /^\s*#/;
344
345	    if ($line =~ s/^\s*\@//) {
346		$self->parseAtmark($line);
347	    } else {
348		$self->parseEntry($line);
349	    }
350	}
351	return;
352    }
353### end XS only ###
354
355    my($f, $fh);
356    foreach my $d (@INC) {
357	$f = File::Spec->catfile($d, @Path, $self->{table});
358	last if open($fh, $f);
359	$f = undef;
360    }
361    if (!defined $f) {
362	$f = File::Spec->catfile(@Path, $self->{table});
363	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
364    }
365
366    while (my $line = <$fh>) {
367	next if $line =~ /^\s*#/;
368
369	if ($line =~ s/^\s*\@//) {
370	    $self->parseAtmark($line);
371	} else {
372	    $self->parseEntry($line);
373	}
374    }
375    close $fh;
376}
377
378
379##
380## get $line, parse it, and write an entry in $self
381##
382sub parseEntry
383{
384    my $self = shift;
385    my $line = shift;
386    my $tailoring = shift;
387    my($name, $entry, @uv, @key);
388
389    if (defined $self->{rewrite}) {
390	$line = $self->{rewrite}->($line);
391    }
392
393    return if $line !~ /^\s*[0-9A-Fa-f]/;
394
395    # removes comment and gets name
396    $name = $1
397	if $line =~ s/[#%]\s*(.*)//;
398    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
399
400    # gets element
401    my($e, $k) = split /;/, $line;
402    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
403	if ! $k;
404
405    @uv = _getHexArray($e);
406    return if !@uv;
407    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
408		  exists $self->{suppressHash}{$uv[0]};
409    $entry = join(CODE_SEP, @uv); # in JCPS
410
411    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
412	my $ele = pack_U(@uv);
413
414	# regarded as if it were not stored in the table
415	return
416	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
417
418	# replaced as completely ignorable
419	$k = '[.0000.0000.0000.0000]'
420	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
421    }
422
423    # replaced as completely ignorable
424    $k = '[.0000.0000.0000.0000]'
425	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
426
427    my $is_L3_ignorable = TRUE;
428
429    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
430	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
431	my @wt = _getHexArray($arr);
432	push @key, pack(VCE_TEMPLATE, $var, @wt);
433	$is_L3_ignorable = FALSE
434	    if $wt[0] || $wt[1] || $wt[2];
435	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
436	# is completely ignorable.
437	# For expansion, an entry $is_L3_ignorable
438	# if and only if "all" CEs are [.0000.0000.0000].
439    }
440
441    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
442
443    if (@uv > 1) {
444	if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
445	    $self->{maxlength}{$uv[0]} = @uv;
446	}
447    }
448    while (@uv > 2) {
449	pop @uv;
450	my $fake_entry = join(CODE_SEP, @uv); # in JCPS
451	$self->{contraction}{$fake_entry} = 1;
452    }
453}
454
455
456sub viewSortKey
457{
458    my $self = shift;
459    my $str  = shift;
460    $self->visualizeSortKey($self->getSortKey($str));
461}
462
463
464sub process
465{
466    my $self = shift;
467    my $str  = shift;
468    my $prep = $self->{preprocess};
469    my $norm = $self->{normCode};
470
471    $str = &$prep($str) if ref $prep;
472    $str = &$norm($str) if ref $norm;
473    return $str;
474}
475
476##
477## arrayref of JCPS   = splitEnt(string to be collated)
478## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
479##
480sub splitEnt
481{
482    my $self = shift;
483    my $str  = shift;
484    my $wLen = shift; # with Length
485
486    my $map  = $self->{mapping};
487    my $max  = $self->{maxlength};
488    my $reH  = $self->{rearrangeHash};
489    my $vers = $self->{UCA_Version};
490    my $ver9 = $vers >= 9 && $vers <= 11;
491    my $long = $self->{long_contraction};
492    my $uXS  = $self->{__useXS}; ### XS only
493
494    my @buf;
495
496    # get array of Unicode code point of string.
497    my @src = unpack_U($str);
498
499    # rearrangement:
500    # Character positions are not kept if rearranged,
501    # then neglected if $wLen is true.
502    if ($reH && ! $wLen) {
503	for (my $i = 0; $i < @src; $i++) {
504	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
505		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
506		$i++;
507	    }
508	}
509    }
510
511    # remove a code point marked as a completely ignorable.
512    for (my $i = 0; $i < @src; $i++) {
513	if ($vers <= 20 && _isIllegal($src[$i])) {
514	    $src[$i] = undef;
515	} elsif ($ver9) {
516	    $src[$i] = undef if $map->{ $src[$i] }
517			   ? @{ $map->{ $src[$i] } } == 0
518			   : $uXS && _ignorable_simple($src[$i]); ### XS only
519	}
520    }
521
522    for (my $i = 0; $i < @src; $i++) {
523	my $jcps = $src[$i];
524
525	# skip removed code point
526	if (! defined $jcps) {
527	    if ($wLen && @buf) {
528		$buf[-1][2] = $i + 1;
529	    }
530	    next;
531	}
532
533	my $i_orig = $i;
534
535	# find contraction
536	if ($max->{$jcps}) {
537	    my $temp_jcps = $jcps;
538	    my $jcpsLen = 1;
539	    my $maxLen = $max->{$jcps};
540
541	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
542		next if ! defined $src[$p];
543		$temp_jcps .= CODE_SEP . $src[$p];
544		$jcpsLen++;
545		if ($map->{$temp_jcps}) {
546		    $jcps = $temp_jcps;
547		    $i = $p;
548		}
549	    }
550
551	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
552	# This process requires Unicode::Normalize.
553	# If "normalization" is undef, here should be skipped *always*
554	# (in spite of bool value of $CVgetCombinClass),
555	# since canonical ordering cannot be expected.
556	# Blocked combining character should not be contracted.
557
558	    # $self->{normCode} is false in the case of "prenormalized".
559	    if ($self->{normalization}) {
560		my $cont = $self->{contraction};
561		my $preCC = 0;
562		my $preCC_uc = 0;
563		my $jcps_uc = $jcps;
564		my(@out, @out_uc);
565
566		for (my $p = $i + 1; $p < @src; $p++) {
567		    next if ! defined $src[$p];
568		    my $curCC = $CVgetCombinClass->($src[$p]);
569		    last unless $curCC;
570		    my $tail = CODE_SEP . $src[$p];
571
572		    if ($preCC != $curCC && $map->{$jcps.$tail}) {
573			$jcps .= $tail;
574			push @out, $p;
575		    } else {
576			$preCC = $curCC;
577		    }
578
579		    next if !$long;
580
581		    if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
582					       $cont->{$jcps_uc.$tail})) {
583			$jcps_uc .= $tail;
584			push @out_uc, $p;
585		    } else {
586			$preCC_uc = $curCC;
587		    }
588		}
589
590		if (@out_uc && $map->{$jcps_uc}) {
591		    $jcps = $jcps_uc;
592		    $src[$_] = undef for @out_uc;
593		} else {
594		    $src[$_] = undef for @out;
595		}
596	    }
597	}
598
599	# skip completely ignorable
600	if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
601	    $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
602	    if ($wLen && @buf) {
603		$buf[-1][2] = $i + 1;
604	    }
605	    next;
606	}
607
608	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
609    }
610    return \@buf;
611}
612
613##
614## VCE = _pack_override(input, codepoint, derivCode)
615##
616sub _pack_override ($$$) {
617    my $r = shift;
618    my $u = shift;
619    my $der = shift;
620
621    if (ref $r) {
622	return pack(VCE_TEMPLATE, NON_VAR, @$r);
623    } elsif (defined $r) {
624	return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
625    } else {
626	$u = 0xFFFD if 0x10FFFF < $u;
627	return $der->($u);
628    }
629}
630
631##
632## list of VCE = getWt(JCPS)
633##
634sub getWt
635{
636    my $self = shift;
637    my $u    = shift;
638    my $map  = $self->{mapping};
639    my $der  = $self->{derivCode};
640    my $out  = $self->{overrideOut};
641    my $uXS  = $self->{__useXS}; ### XS only
642
643    return if !defined $u;
644    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
645    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
646    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
647
648    my @ce;
649    if ($map->{$u}) {
650	@ce = @{ $map->{$u} }; # $u may be a contraction
651### begin XS only ###
652    } elsif ($uXS && _exists_simple($u)) {
653	@ce = _fetch_simple($u);
654### end XS only ###
655    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
656	my $hang = $self->{overrideHangul};
657	if ($hang) {
658	    @ce = map _pack_override($_, $u, $der), $hang->($u);
659	} elsif (!defined $hang) {
660	    @ce = $der->($u);
661	} else {
662	    my $max  = $self->{maxlength};
663	    my @decH = _decompHangul($u);
664
665	    if (@decH == 2) {
666		my $contract = join(CODE_SEP, @decH);
667		@decH = ($contract) if $map->{$contract};
668	    } else { # must be <@decH == 3>
669		if ($max->{$decH[0]}) {
670		    my $contract = join(CODE_SEP, @decH);
671		    if ($map->{$contract}) {
672			@decH = ($contract);
673		    } else {
674			$contract = join(CODE_SEP, @decH[0,1]);
675			$map->{$contract} and @decH = ($contract, $decH[2]);
676		    }
677		    # even if V's ignorable, LT contraction is not supported.
678		    # If such a situation were required, NFD should be used.
679		}
680		if (@decH == 3 && $max->{$decH[1]}) {
681		    my $contract = join(CODE_SEP, @decH[1,2]);
682		    $map->{$contract} and @decH = ($decH[0], $contract);
683		}
684	    }
685
686	    @ce = map({
687		    $map->{$_} ? @{ $map->{$_} } :
688		$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
689		    $der->($_);
690		} @decH);
691	}
692    } elsif ($out && 0x10FFFF < $u) {
693	@ce = map _pack_override($_, $u, $der), $out->($u);
694    } else {
695	my $cjk  = $self->{overrideCJK};
696	my $vers = $self->{UCA_Version};
697	if ($cjk && _isUIdeo($u, $vers)) {
698	    @ce = map _pack_override($_, $u, $der), $cjk->($u);
699	} elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
700	    @ce = _uideoCE_8($u);
701	} else {
702	    @ce = $der->($u);
703	}
704    }
705    return map $self->varCE($_), @ce;
706}
707
708
709##
710## string sortkey = getSortKey(string arg)
711##
712sub getSortKey
713{
714    my $self = shift;
715    my $orig = shift;
716    my $str  = $self->process($orig);
717    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
718    my $vers = $self->{UCA_Version};
719    my $term = $self->{hangul_terminator};
720    my $lev  = $self->{level};
721    my $iden = $self->{identical};
722
723    my @buf; # weight arrays
724    if ($term) {
725	my $preHST = '';
726	my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
727	foreach my $jcps (@$rEnt) {
728	    # weird things like VL, TL-contraction are not considered!
729	    my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
730	    if ($preHST && !$curHST || # hangul before non-hangul
731		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
732		$preHST =~ /V\z/ && $curHST =~ /^L/ ||
733		$preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
734		push @buf, $termCE;
735	    }
736	    $preHST = $curHST;
737	    push @buf, $self->getWt($jcps);
738	}
739	push @buf, $termCE if $preHST; # end at hangul
740    } else {
741	foreach my $jcps (@$rEnt) {
742	    push @buf, $self->getWt($jcps);
743	}
744    }
745
746    my $rkey = $self->mk_SortKey(\@buf); ### XS only
747
748    if ($iden || $vers >= 26 && $lev == MaxLevel) {
749	$rkey .= LEVEL_SEP;
750	$rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
751    }
752    return $rkey;
753}
754
755
756##
757## int compare = cmp(string a, string b)
758##
759sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
760sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
761sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
762sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
763sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
764sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
765sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
766
767##
768## list[strings] sorted = sort(list[strings] arg)
769##
770sub sort {
771    my $obj = shift;
772    return
773	map { $_->[1] }
774	    sort{ $a->[0] cmp $b->[0] }
775		map [ $obj->getSortKey($_), $_ ], @_;
776}
777
778
779##
780## bool _nonIgnorAtLevel(arrayref weights, int level)
781##
782sub _nonIgnorAtLevel($$)
783{
784    my $wt = shift;
785    return if ! defined $wt;
786    my $lv = shift;
787    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
788}
789
790##
791## bool _eqArray(
792##    arrayref of arrayref[weights] source,
793##    arrayref of arrayref[weights] substr,
794##    int level)
795## * comparison of graphemes vs graphemes.
796##   @$source >= @$substr must be true (check it before call this);
797##
798sub _eqArray($$$)
799{
800    my $source = shift;
801    my $substr = shift;
802    my $lev = shift;
803
804    for my $g (0..@$substr-1){
805	# Do the $g'th graphemes have the same number of AV weights?
806	return if @{ $source->[$g] } != @{ $substr->[$g] };
807
808	for my $w (0..@{ $substr->[$g] }-1) {
809	    for my $v (0..$lev-1) {
810		return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
811	    }
812	}
813    }
814    return 1;
815}
816
817##
818## (int position, int length)
819## int position = index(string, substring, position, [undoc'ed global])
820##
821## With "global" (only for the list context),
822##  returns list of arrayref[position, length].
823##
824sub index
825{
826    my $self = shift;
827    $self->{preprocess} and
828	croak "Don't use Preprocess with index(), match(), etc.";
829    $self->{normCode} and
830	croak "Don't use Normalization with index(), match(), etc.";
831
832    my $str  = shift;
833    my $len  = length($str);
834    my $sub  = shift;
835    my $subE = $self->splitEnt($sub);
836    my $pos  = @_ ? shift : 0;
837       $pos  = 0 if $pos < 0;
838    my $glob = shift;
839
840    my $lev  = $self->{level};
841    my $v2i  = $self->{UCA_Version} >= 9 &&
842		$self->{variable} ne 'non-ignorable';
843
844    if (! @$subE) {
845	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
846	return $glob
847	    ? map([$_, 0], $temp..$len)
848	    : wantarray ? ($temp,0) : $temp;
849    }
850    $len < $pos
851	and return wantarray ? () : NOMATCHPOS;
852    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
853    @$strE
854	or return wantarray ? () : NOMATCHPOS;
855
856    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
857
858    my $last_is_variable;
859    for my $vwt (map $self->getWt($_), @$subE) {
860	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
861	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
862
863	# "Ignorable (L1, L2) after Variable" since track. v. 9
864	if ($v2i) {
865	    if ($var) {
866		$last_is_variable = TRUE;
867	    }
868	    elsif (!$wt[0]) { # ignorable
869		$to_be_pushed = FALSE if $last_is_variable;
870	    }
871	    else {
872		$last_is_variable = FALSE;
873	    }
874	}
875
876	if (@subWt && !$var && !$wt[0]) {
877	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
878	} elsif ($to_be_pushed) {
879	    push @subWt, [ \@wt ];
880	}
881	# else ===> skipped
882    }
883
884    my $count = 0;
885    my $end = @$strE - 1;
886
887    $last_is_variable = FALSE; # reuse
888    for (my $i = 0; $i <= $end; ) { # no $i++
889	my $found_base = 0;
890
891	# fetch a grapheme
892	while ($i <= $end && $found_base == 0) {
893	    for my $vwt ($self->getWt($strE->[$i][0])) {
894		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
895		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
896
897		# "Ignorable (L1, L2) after Variable" since track. v. 9
898		if ($v2i) {
899		    if ($var) {
900			$last_is_variable = TRUE;
901		    }
902		    elsif (!$wt[0]) { # ignorable
903			$to_be_pushed = FALSE if $last_is_variable;
904		    }
905		    else {
906			$last_is_variable = FALSE;
907		    }
908		}
909
910		if (@strWt && !$var && !$wt[0]) {
911		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
912		    $finPos[-1] = $strE->[$i][2];
913		} elsif ($to_be_pushed) {
914		    push @strWt, [ \@wt ];
915		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
916		    $finPos[-1] = NOMATCHPOS if $found_base;
917		    push @finPos, $strE->[$i][2];
918		    $found_base++;
919		}
920		# else ===> no-op
921	    }
922	    $i++;
923	}
924
925	# try to match
926	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
927	    if ($iniPos[0] != NOMATCHPOS &&
928		    $finPos[$#subWt] != NOMATCHPOS &&
929			_eqArray(\@strWt, \@subWt, $lev)) {
930		my $temp = $iniPos[0] + $pos;
931
932		if ($glob) {
933		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
934		    splice @strWt,  0, $#subWt;
935		    splice @iniPos, 0, $#subWt;
936		    splice @finPos, 0, $#subWt;
937		}
938		else {
939		    return wantarray
940			? ($temp, $finPos[$#subWt] - $iniPos[0])
941			:  $temp;
942		}
943	    }
944	    shift @strWt;
945	    shift @iniPos;
946	    shift @finPos;
947	}
948    }
949
950    return $glob
951	? @g_ret
952	: wantarray ? () : NOMATCHPOS;
953}
954
955##
956## scalarref to matching part = match(string, substring)
957##
958sub match
959{
960    my $self = shift;
961    if (my($pos,$len) = $self->index($_[0], $_[1])) {
962	my $temp = substr($_[0], $pos, $len);
963	return wantarray ? $temp : \$temp;
964	# An lvalue ref \substr should be avoided,
965	# since its value is affected by modification of its referent.
966    }
967    else {
968	return;
969    }
970}
971
972##
973## arrayref matching parts = gmatch(string, substring)
974##
975sub gmatch
976{
977    my $self = shift;
978    my $str  = shift;
979    my $sub  = shift;
980    return map substr($str, $_->[0], $_->[1]),
981		$self->index($str, $sub, 0, 'g');
982}
983
984##
985## bool subst'ed = subst(string, substring, replace)
986##
987sub subst
988{
989    my $self = shift;
990    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
991
992    if (my($pos,$len) = $self->index($_[0], $_[1])) {
993	if ($code) {
994	    my $mat = substr($_[0], $pos, $len);
995	    substr($_[0], $pos, $len, $code->($mat));
996	} else {
997	    substr($_[0], $pos, $len, $_[2]);
998	}
999	return TRUE;
1000    }
1001    else {
1002	return FALSE;
1003    }
1004}
1005
1006##
1007## int count = gsubst(string, substring, replace)
1008##
1009sub gsubst
1010{
1011    my $self = shift;
1012    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1013    my $cnt = 0;
1014
1015    # Replacement is carried out from the end, then use reverse.
1016    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1017	if ($code) {
1018	    my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1019	    substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1020	} else {
1021	    substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1022	}
1023	$cnt++;
1024    }
1025    return $cnt;
1026}
1027
10281;
1029__END__
1030
1031=head1 NAME
1032
1033Unicode::Collate - Unicode Collation Algorithm
1034
1035=head1 SYNOPSIS
1036
1037  use Unicode::Collate;
1038
1039  #construct
1040  $Collator = Unicode::Collate->new(%tailoring);
1041
1042  #sort
1043  @sorted = $Collator->sort(@not_sorted);
1044
1045  #compare
1046  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1047
1048B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1049according to Perl's Unicode support. See L<perlunicode>,
1050L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1051Otherwise you can use C<preprocess> or should decode them before.
1052
1053=head1 DESCRIPTION
1054
1055This module is an implementation of Unicode Technical Standard #10
1056(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1057
1058=head2 Constructor and Tailoring
1059
1060The C<new> method returns a collator object. If new() is called
1061with no parameters, the collator should do the default collation.
1062
1063   $Collator = Unicode::Collate->new(
1064      UCA_Version => $UCA_Version,
1065      alternate => $alternate, # alias for 'variable'
1066      backwards => $levelNumber, # or \@levelNumbers
1067      entry => $element,
1068      hangul_terminator => $term_primary_weight,
1069      highestFFFF => $bool,
1070      identical => $bool,
1071      ignoreName => qr/$ignoreName/,
1072      ignoreChar => qr/$ignoreChar/,
1073      ignore_level2 => $bool,
1074      katakana_before_hiragana => $bool,
1075      level => $collationLevel,
1076      long_contraction => $bool,
1077      minimalFFFE => $bool,
1078      normalization  => $normalization_form,
1079      overrideCJK => \&overrideCJK,
1080      overrideHangul => \&overrideHangul,
1081      preprocess => \&preprocess,
1082      rearrange => \@charList,
1083      rewrite => \&rewrite,
1084      suppress => \@charList,
1085      table => $filename,
1086      undefName => qr/$undefName/,
1087      undefChar => qr/$undefChar/,
1088      upper_before_lower => $bool,
1089      variable => $variable,
1090   );
1091
1092=over 4
1093
1094=item UCA_Version
1095
1096If the revision (previously "tracking version") number of UCA is given,
1097behavior of that revision is emulated on collating.
1098If omitted, the return value of C<UCA_Version()> is used.
1099
1100The following revisions are supported.  The default is 30.
1101
1102     UCA       Unicode Standard         DUCET (@version)
1103   -------------------------------------------------------
1104      8              3.1                3.0.1 (3.0.1d9)
1105      9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1106     11              4.0                4.0.0 (4.0.0)
1107     14             4.1.0               4.1.0 (4.1.0)
1108     16              5.0                5.0.0 (5.0.0)
1109     18             5.1.0               5.1.0 (5.1.0)
1110     20             5.2.0               5.2.0 (5.2.0)
1111     22             6.0.0               6.0.0 (6.0.0)
1112     24             6.1.0               6.1.0 (6.1.0)
1113     26             6.2.0               6.2.0 (6.2.0)
1114     28             6.3.0               6.3.0 (6.3.0)
1115     30             7.0.0               7.0.0 (7.0.0)
1116
1117* See below C<long_contraction> with C<UCA_Version> 22 and 24.
1118
1119* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1120since C<UCA_Version> 22.
1121
1122* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1123and can be overridden since C<UCA_Version> 22.
1124
1125* Fully ignorable characters were ignored, and would not interrupt
1126contractions with C<UCA_Version> 9 and 11.
1127
1128* Treatment of ignorables after variables and some behaviors
1129were changed at C<UCA_Version> 9.
1130
1131* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1132depend on C<UCA_Version>.
1133
1134* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1135C<hangul_terminator>.
1136
1137=item alternate
1138
1139-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1140
1141For backward compatibility, C<alternate> (old name) can be used
1142as an alias for C<variable>.
1143
1144=item backwards
1145
1146-- see 3.4 Backward Accents, UTS #10.
1147
1148     backwards => $levelNumber or \@levelNumbers
1149
1150Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1151If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1152forwards at all the levels.
1153
1154=item entry
1155
1156-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1157
1158If the same character (or a sequence of characters) exists
1159in the collation element table through C<table>,
1160mapping to collation elements is overridden.
1161If it does not exist, the mapping is defined additionally.
1162
1163    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11640063 0068 ; [.0E6A.0020.0002.0063] # ch
11650043 0068 ; [.0E6A.0020.0007.0043] # Ch
11660043 0048 ; [.0E6A.0020.0008.0043] # CH
1167006C 006C ; [.0F4C.0020.0002.006C] # ll
1168004C 006C ; [.0F4C.0020.0007.004C] # Ll
1169004C 004C ; [.0F4C.0020.0008.004C] # LL
117000F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1171006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
117200D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1173004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1174ENTRY
1175
1176    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
117700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
117800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1179ENTRY
1180
1181B<NOTE:> The code point in the UCA file format (before C<';'>)
1182B<must> be a Unicode code point (defined as hexadecimal),
1183but not a native code point.
1184So C<0063> must always denote C<U+0063>,
1185but not a character of C<"\x63">.
1186
1187Weighting may vary depending on collation element table.
1188So ensure the weights defined in C<entry> will be consistent with
1189those in the collation element table loaded via C<table>.
1190
1191In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1192and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1193(as a value between C<0E60> and C<0E6D>)
1194makes ordering as C<C E<lt> CH E<lt> D>.
1195Exactly speaking DUCET already has some characters between C<C> and C<D>:
1196C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1197C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1198and C<c-curl> (C<U+0255>) with C<0E69>.
1199Then primary weight C<0E6A> for C<CH> makes C<CH>
1200ordered between C<c-curl> and C<D>.
1201
1202=item hangul_terminator
1203
1204-- see 7.1.4 Trailing Weights, UTS #10.
1205
1206If a true value is given (non-zero but should be positive),
1207it will be added as a terminator primary weight to the end of
1208every standard Hangul syllable. Secondary and any higher weights
1209for terminator are set to zero.
1210If the value is false or C<hangul_terminator> key does not exist,
1211insertion of terminator weights will not be performed.
1212
1213Boundaries of Hangul syllables are determined
1214according to conjoining Jamo behavior in F<the Unicode Standard>
1215and F<HangulSyllableType.txt>.
1216
1217B<Implementation Note:>
1218(1) For expansion mapping (Unicode character mapped
1219to a sequence of collation elements), a terminator will not be added
1220between collation elements, even if Hangul syllable boundary exists there.
1221Addition of terminator is restricted to the next position
1222to the last collation element.
1223
1224(2) Non-conjoining Hangul letters
1225(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1226automatically terminated with a terminator primary weight.
1227These characters may need terminator included in a collation element
1228table beforehand.
1229
1230=item highestFFFF
1231
1232-- see 5.14 Collation Elements, UTS #35.
1233
1234If the parameter is made true, C<U+FFFF> has a highest primary weight.
1235When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1236C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1237begins with C<"abc">, or another primary equivalent.
1238C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1239such as C<"abc\x{FFFF}xyz">.
1240
1241C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1242almost, but the latter has a problem that you should know which letter is
1243next to C<c>. For a certain language where C<ch> as the next letter,
1244C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1245
1246Note:
1247This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1248Any other character than C<U+FFFF> can be tailored by C<entry>.
1249
1250=item identical
1251
1252-- see A.3 Deterministic Comparison, UTS #10.
1253
1254By default, strings whose weights are equal should be equal,
1255even though their code points are not equal.
1256Completely ignorable characters are ignored.
1257
1258If the parameter is made true, a final, tie-breaking level is used.
1259If no difference of weights is found after the comparison through
1260all the level specified by C<level>, the comparison with code points
1261will be performed.
1262For the tie-breaking comparison, the sort key has code points
1263of the original string appended.
1264Completely ignorable characters are not ignored.
1265
1266If C<preprocess> and/or C<normalization> is applied, the code points
1267of the string after them (in NFD by default) are used.
1268
1269=item ignoreChar
1270
1271=item ignoreName
1272
1273-- see 3.6 Variable Weighting, UTS #10.
1274
1275Makes the entry in the table completely ignorable;
1276i.e. as if the weights were zero at all level.
1277
1278Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1279will be ignored. Through C<ignoreName>, any character whose name
1280(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1281will be ignored.
1282
1283E.g. when 'a' and 'e' are ignorable,
1284'element' is equal to 'lament' (or 'lmnt').
1285
1286=item ignore_level2
1287
1288-- see 5.1 Parametric Tailoring, UTS #10.
1289
1290By default, case-sensitive comparison (that is level 3 difference)
1291won't ignore accents (that is level 2 difference).
1292
1293If the parameter is made true, accents (and other primary ignorable
1294characters) are ignored, even though cases are taken into account.
1295
1296B<NOTE>: C<level> should be 3 or greater.
1297
1298=item katakana_before_hiragana
1299
1300-- see 7.2 Tertiary Weight Table, UTS #10.
1301
1302By default, hiragana is before katakana.
1303If the parameter is made true, this is reversed.
1304
1305B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1306distinctions must occur in level 3, and their weights at level 3 must be
1307same as those mentioned in 7.3.1, UTS #10.
1308If you define your collation elements which violate this requirement,
1309this parameter does not work validly.
1310
1311=item level
1312
1313-- see 4.3 Form Sort Key, UTS #10.
1314
1315Set the maximum level.
1316Any higher levels than the specified one are ignored.
1317
1318  Level 1: alphabetic ordering
1319  Level 2: diacritic ordering
1320  Level 3: case ordering
1321  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1322
1323  ex.level => 2,
1324
1325If omitted, the maximum is the 4th.
1326
1327B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1328But this module only uses weights within 0xFFFF.
1329When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1330and 'shift-trimmed'), the level 4 may be unreliable.
1331
1332See also C<identical>.
1333
1334=item long_contraction
1335
1336-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1337
1338If the parameter is made true, for a contraction with three or more
1339characters (here nicknamed "long contraction"), initial substrings
1340will be handled.
1341For example, a contraction ABC, where A is a starter, and B and C
1342are non-starters (character with non-zero combining character class),
1343will be detected even if there is not AB as a contraction.
1344
1345B<Default:> Usually false.
1346If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1347is not specified in C<new()>, a true value is set implicitly.
1348This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1349
1350C<change()> handles C<long_contraction> explicitly only.
1351If C<long_contraction> is not specified in C<change()>, even though
1352C<UCA_Version> is changed, C<long_contraction> will not be changed.
1353
1354B<Limitation:> Scanning non-starters is one-way (no back tracking).
1355If AB is found but not ABC is not found, other long contraction where
1356the first character is A and the second is not B may not be found.
1357
1358Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1359contractions will be skipped.
1360
1361B<Note:> The following contractions in DUCET are not considered
1362in steps S2.1.1 to S2.1.3, where they are discontiguous.
1363
1364    0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1365    0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1366
1367For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1368(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1369In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1370instead of C<0FB2 0F71 0F80>.
1371Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1372contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1373
1374=item minimalFFFE
1375
1376-- see 5.14 Collation Elements, UTS #35.
1377
1378If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1379The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1380first compares C<$a1> and C<$b1> at level 1, and
1381then C<$a2> and C<$b2> at level 1, as followed.
1382
1383        "ab\x{FFFE}a"
1384        "Ab\x{FFFE}a"
1385        "ab\x{FFFE}c"
1386        "Ab\x{FFFE}c"
1387        "ab\x{FFFE}xyz"
1388        "abc\x{FFFE}def"
1389        "abc\x{FFFE}xYz"
1390        "aBc\x{FFFE}xyz"
1391        "abcX\x{FFFE}def"
1392        "abcx\x{FFFE}xyz"
1393        "b\x{FFFE}aaa"
1394        "bbb\x{FFFE}a"
1395
1396Note:
1397This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1398Any other character than C<U+FFFE> can be tailored by C<entry>.
1399
1400=item normalization
1401
1402-- see 4.1 Normalize, UTS #10.
1403
1404If specified, strings are normalized before preparation of sort keys
1405(the normalization is executed after preprocess).
1406
1407A form name C<Unicode::Normalize::normalize()> accepts will be applied
1408as C<$normalization_form>.
1409Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1410See C<Unicode::Normalize::normalize()> for detail.
1411If omitted, C<'NFD'> is used.
1412
1413C<normalization> is performed after C<preprocess> (if defined).
1414
1415Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1416though they are not concerned with C<Unicode::Normalize::normalize()>.
1417
1418If C<undef> (not a string C<"undef">) is passed explicitly
1419as the value for this key,
1420any normalization is not carried out (this may make tailoring easier
1421if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1422only contiguous contractions are resolved;
1423e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1424C<A-cedilla-ring> would be primary equal to C<A>.
1425In this point,
1426C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1427B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1428
1429In the case of C<(normalization =E<gt> "prenormalized")>,
1430any normalization is not performed, but
1431discontiguous contractions with combining characters are performed.
1432Therefore
1433C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1434B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1435If source strings are finely prenormalized,
1436C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1437
1438Except C<(normalization =E<gt> undef)>,
1439B<Unicode::Normalize> is required (see also B<CAVEAT>).
1440
1441=item overrideCJK
1442
1443-- see 7.1 Derived Collation Elements, UTS #10.
1444
1445By default, CJK unified ideographs are ordered in Unicode codepoint
1446order, but those in the CJK Unified Ideographs block are less than
1447those in the CJK Unified Ideographs Extension A etc.
1448
1449    In the CJK Unified Ideographs block:
1450    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1451    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1452    U+4E00..U+9FC3 if UCA_Version is 18.
1453    U+4E00..U+9FCB if UCA_Version is 20 or 22.
1454    U+4E00..U+9FCC if UCA_Version is 24 or later.
1455
1456    In the CJK Unified Ideographs Extension blocks:
1457    Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1458    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1459    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1460
1461Through C<overrideCJK>, ordering of CJK unified ideographs (including
1462extensions) can be overridden.
1463
1464ex. CJK unified ideographs in the JIS code point order.
1465
1466  overrideCJK => sub {
1467      my $u = shift;             # get a Unicode codepoint
1468      my $b = pack('n', $u);     # to UTF-16BE
1469      my $s = your_unicode_to_sjis_converter($b); # convert
1470      my $n = unpack('n', $s);   # convert sjis to short
1471      [ $n, 0x20, 0x2, $u ];     # return the collation element
1472  },
1473
1474The return value may be an arrayref of 1st to 4th weights as shown
1475above. The return value may be an integer as the primary weight
1476as shown below.  If C<undef> is returned, the default derived
1477collation element will be used.
1478
1479  overrideCJK => sub {
1480      my $u = shift;             # get a Unicode codepoint
1481      my $b = pack('n', $u);     # to UTF-16BE
1482      my $s = your_unicode_to_sjis_converter($b); # convert
1483      my $n = unpack('n', $s);   # convert sjis to short
1484      return $n;                 # return the primary weight
1485  },
1486
1487The return value may be a list containing zero or more of
1488an arrayref, an integer, or C<undef>.
1489
1490ex. ignores all CJK unified ideographs.
1491
1492  overrideCJK => sub {()}, # CODEREF returning empty list
1493
1494   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1495   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1496
1497If a false value (including C<undef>) is passed, C<overrideCJK>
1498has no effect.
1499C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1500
1501But assignment of weight for CJK unified ideographs
1502in C<table> or C<entry> is still valid.
1503If C<undef> is passed explicitly as the value for this key,
1504weights for CJK unified ideographs are treated as undefined.
1505However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1506has no special meaning.
1507
1508B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1509C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1510C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1511ideographs. But they can't be overridden via C<overrideCJK> when you use
1512DUCET, as the table includes weights for them. C<table> or C<entry> has
1513priority over C<overrideCJK>.
1514
1515=item overrideHangul
1516
1517-- see 7.1 Derived Collation Elements, UTS #10.
1518
1519By default, Hangul syllables are decomposed into Hangul Jamo,
1520even if C<(normalization =E<gt> undef)>.
1521But the mapping of Hangul syllables may be overridden.
1522
1523This parameter works like C<overrideCJK>, so see there for examples.
1524
1525If you want to override the mapping of Hangul syllables,
1526NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1527Hangul syllables before overriding. FCD may decompose Hangul syllables
1528as the case may be.
1529
1530If a false value (but not C<undef>) is passed, C<overrideHangul>
1531has no effect.
1532C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1533
1534If C<undef> is passed explicitly as the value for this key,
1535weight for Hangul syllables is treated as undefined
1536without decomposition into Hangul Jamo.
1537But definition of weight for Hangul syllables
1538in C<table> or C<entry> is still valid.
1539
1540=item overrideOut
1541
1542-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1543
1544Perl seems to allow out-of-range values (greater than 0x10FFFF).
1545By default, out-of-range values are replaced with C<U+FFFD>
1546(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1547or ignored when C<UCA_Version> E<lt>= 20.
1548
1549When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1550can be overridden. Though C<table> or C<entry> are available for them,
1551out-of-range values are too many.
1552
1553C<overrideOut> can perform it algorithmically.
1554This parameter works like C<overrideCJK>, so see there for examples.
1555
1556ex. ignores all out-of-range values.
1557
1558  overrideOut => sub {()}, # CODEREF returning empty list
1559
1560If a false value (including C<undef>) is passed, C<overrideOut>
1561has no effect.
1562C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1563
1564B<NOTE ABOUT U+FFFD:>
1565
1566UCA recommends that out-of-range values should not be ignored for security
1567reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1568However, C<U+FFFD> is wrongly mapped to a variable collation element
1569in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1570ignored when C<variable> isn't C<Non-ignorable>.
1571
1572The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1573see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1574(7.1.4 Trailing Weights). Such a correction is reproduced by this.
1575
1576  overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1577
1578This workaround is unnecessary since Unicode 6.3.0.
1579
1580=item preprocess
1581
1582-- see 5.4 Preprocessing, UTS #10.
1583
1584If specified, the coderef is used to preprocess each string
1585before the formation of sort keys.
1586
1587ex. dropping English articles, such as "a" or "the".
1588Then, "the pen" is before "a pencil".
1589
1590     preprocess => sub {
1591           my $str = shift;
1592           $str =~ s/\b(?:an?|the)\s+//gi;
1593           return $str;
1594        },
1595
1596C<preprocess> is performed before C<normalization> (if defined).
1597
1598ex. decoding strings in a legacy encoding such as shift-jis:
1599
1600    $sjis_collator = Unicode::Collate->new(
1601        preprocess => \&your_shiftjis_to_unicode_decoder,
1602    );
1603    @result = $sjis_collator->sort(@shiftjis_strings);
1604
1605B<Note:> Strings returned from the coderef will be interpreted
1606according to Perl's Unicode support. See L<perlunicode>,
1607L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1608
1609=item rearrange
1610
1611-- see 3.5 Rearrangement, UTS #10.
1612
1613Characters that are not coded in logical order and to be rearranged.
1614If C<UCA_Version> is equal to or less than 11, default is:
1615
1616    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1617
1618If you want to disallow any rearrangement, pass C<undef> or C<[]>
1619(a reference to empty list) as the value for this key.
1620
1621If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1622(i.e. no rearrangement).
1623
1624B<According to the version 9 of UCA, this parameter shall not be used;
1625but it is not warned at present.>
1626
1627=item rewrite
1628
1629If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1630The coderef will get each line, and then should return a rewritten line
1631according to the UCA file format.
1632If the coderef returns an empty line, the line will be skipped.
1633
1634e.g. any primary ignorable characters into tertiary ignorable:
1635
1636    rewrite => sub {
1637        my $line = shift;
1638        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1639        return $line;
1640    },
1641
1642This example shows rewriting weights. C<rewrite> is allowed to
1643affect code points, weights, and the name.
1644
1645B<NOTE>: C<table> is available to use another table file;
1646preparing a modified table once would be more efficient than
1647rewriting lines on reading an unmodified table every time.
1648
1649=item suppress
1650
1651-- see suppress contractions in 5.14.11 Special-Purpose Commands,
1652UTS #35 (LDML).
1653
1654Contractions beginning with the specified characters are suppressed,
1655even if those contractions are defined in C<table>.
1656
1657An example for Russian and some languages using the Cyrillic script:
1658
1659    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1660
1661where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1662
1663B<NOTE>: Contractions via C<entry> are not be suppressed.
1664
1665=item table
1666
1667-- see 3.8 Default Unicode Collation Element Table, UTS #10.
1668
1669You can use another collation element table if desired.
1670
1671The table file should locate in the F<Unicode/Collate> directory
1672on C<@INC>. Say, if the filename is F<Foo.txt>,
1673the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1674
1675By default, F<allkeys.txt> (as the filename of DUCET) is used.
1676If you will prepare your own table file, any name other than F<allkeys.txt>
1677may be better to avoid namespace conflict.
1678
1679B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1680module, and it may save time at the run time.
1681Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1682or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1683C<rewrite> will prevent this module from using the compiled DUCET.
1684
1685If C<undef> is passed explicitly as the value for this key,
1686no file is read (but you can define collation elements via C<entry>).
1687
1688A typical way to define a collation element table
1689without any file of table:
1690
1691   $onlyABC = Unicode::Collate->new(
1692       table => undef,
1693       entry => << 'ENTRIES',
16940061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
16950041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
16960062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
16970042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
16980063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
16990043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1700ENTRIES
1701    );
1702
1703If C<ignoreName> or C<undefName> is used, character names should be
1704specified as a comment (following C<#>) on each line.
1705
1706=item undefChar
1707
1708=item undefName
1709
1710-- see 6.3.4 Reducing the Repertoire, UTS #10.
1711
1712Undefines the collation element as if it were unassigned in the C<table>.
1713This reduces the size of the table.
1714If an unassigned character appears in the string to be collated,
1715the sort key is made from its codepoint
1716as a single-character collation element,
1717as it is greater than any other assigned collation elements
1718(in the codepoint order among the unassigned characters).
1719But, it'd be better to ignore characters
1720unfamiliar to you and maybe never used.
1721
1722Through C<undefChar>, any character matching C<qr/$undefChar/>
1723will be undefined. Through C<undefName>, any character whose name
1724(given in the C<table> file as a comment) matches C<qr/$undefName/>
1725will be undefined.
1726
1727ex. Collation weights for beyond-BMP characters are not stored in object:
1728
1729    undefChar => qr/[^\0-\x{fffd}]/,
1730
1731=item upper_before_lower
1732
1733-- see 6.6 Case Comparisons, UTS #10.
1734
1735By default, lowercase is before uppercase.
1736If the parameter is made true, this is reversed.
1737
1738B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1739distinctions must occur in level 3, and their weights at level 3 must be
1740same as those mentioned in 7.3.1, UTS #10.
1741If you define your collation elements which differs from this requirement,
1742this parameter doesn't work validly.
1743
1744=item variable
1745
1746-- see 3.6 Variable Weighting, UTS #10.
1747
1748This key allows for variable weighting of variable collation elements,
1749which are marked with an ASTERISK in the table
1750(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1751
1752   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1753
1754These names are case-insensitive.
1755By default (if specification is omitted), 'shifted' is adopted.
1756
1757   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1758                    considered at the 4th level.
1759
1760   'Non-Ignorable'  Variable elements are not reset to ignorable.
1761
1762   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1763                    their level 4 weight is replaced by the old level 1 weight.
1764                    Level 4 weight for Non-Variable elements is 0xFFFF.
1765
1766   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1767                    are trimmed.
1768
1769=back
1770
1771=head2 Methods for Collation
1772
1773=over 4
1774
1775=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1776
1777Sorts a list of strings.
1778
1779=item C<$result = $Collator-E<gt>cmp($a, $b)>
1780
1781Returns 1 (when C<$a> is greater than C<$b>)
1782or 0 (when C<$a> is equal to C<$b>)
1783or -1 (when C<$a> is less than C<$b>).
1784
1785=item C<$result = $Collator-E<gt>eq($a, $b)>
1786
1787=item C<$result = $Collator-E<gt>ne($a, $b)>
1788
1789=item C<$result = $Collator-E<gt>lt($a, $b)>
1790
1791=item C<$result = $Collator-E<gt>le($a, $b)>
1792
1793=item C<$result = $Collator-E<gt>gt($a, $b)>
1794
1795=item C<$result = $Collator-E<gt>ge($a, $b)>
1796
1797They works like the same name operators as theirs.
1798
1799   eq : whether $a is equal to $b.
1800   ne : whether $a is not equal to $b.
1801   lt : whether $a is less than $b.
1802   le : whether $a is less than $b or equal to $b.
1803   gt : whether $a is greater than $b.
1804   ge : whether $a is greater than $b or equal to $b.
1805
1806=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1807
1808-- see 4.3 Form Sort Key, UTS #10.
1809
1810Returns a sort key.
1811
1812You compare the sort keys using a binary comparison
1813and get the result of the comparison of the strings using UCA.
1814
1815   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1816
1817      is equivalent to
1818
1819   $Collator->cmp($a, $b)
1820
1821=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1822
1823Converts a sorting key into its representation form.
1824If C<UCA_Version> is 8, the output is slightly different.
1825
1826   use Unicode::Collate;
1827   my $c = Unicode::Collate->new();
1828   print $c->viewSortKey("Perl"),"\n";
1829
1830   # output:
1831   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1832   #  Level 1               Level 2               Level 3               Level 4
1833
1834=back
1835
1836=head2 Methods for Searching
1837
1838The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1839like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1840but they are not aware of any pattern, but only a literal substring.
1841
1842B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1843for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1844C<subst>, C<gsubst>) is croaked, as the position and the length might
1845differ from those on the specified string.
1846
1847C<rearrange> and C<hangul_terminator> parameters are neglected.
1848C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1849matching and searching, as it doesn't matter whether greater or less.
1850
1851=over 4
1852
1853=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1854
1855=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1856
1857If C<$substring> matches a part of C<$string>, returns
1858the position of the first occurrence of the matching part in scalar context;
1859in list context, returns a two-element list of
1860the position and the length of the matching part.
1861
1862If C<$substring> does not match any part of C<$string>,
1863returns C<-1> in scalar context and
1864an empty list in list context.
1865
1866e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1867you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1868
1869  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1870                                     # (normalization => undef) is REQUIRED.
1871  my $match;
1872  if (my($pos,$len) = $Collator->index($str, $sub)) {
1873      $match = substr($str, $pos, $len);
1874  }
1875
1876and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1877is primary equal to C<"M>E<uuml>C<SS">.
1878
1879=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1880
1881=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1882
1883If C<$substring> matches a part of C<$string>, in scalar context, returns
1884B<a reference to> the first occurrence of the matching part
1885(C<$match_ref> is always true if matches,
1886since every reference is B<true>);
1887in list context, returns the first occurrence of the matching part.
1888
1889If C<$substring> does not match any part of C<$string>,
1890returns C<undef> in scalar context and
1891an empty list in list context.
1892
1893e.g.
1894
1895    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1896	print "matches [$$match_ref].\n";
1897    } else {
1898	print "doesn't match.\n";
1899    }
1900
1901     or
1902
1903    if (($match) = $Collator->match($str, $sub)) { # list context
1904	print "matches [$match].\n";
1905    } else {
1906	print "doesn't match.\n";
1907    }
1908
1909=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1910
1911If C<$substring> matches a part of C<$string>, returns
1912all the matching parts (or matching count in scalar context).
1913
1914If C<$substring> does not match any part of C<$string>,
1915returns an empty list.
1916
1917=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1918
1919If C<$substring> matches a part of C<$string>,
1920the first occurrence of the matching part is replaced by C<$replacement>
1921(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
1922
1923C<$replacement> can be a C<CODEREF>,
1924taking the matching part as an argument,
1925and returning a string to replace the matching part
1926(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1927
1928=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1929
1930If C<$substring> matches a part of C<$string>,
1931all the occurrences of the matching part are replaced by C<$replacement>
1932(C<$string> is modified) and C<$count> is returned.
1933
1934C<$replacement> can be a C<CODEREF>,
1935taking the matching part as an argument,
1936and returning a string to replace the matching part
1937(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1938
1939e.g.
1940
1941  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1942                                     # (normalization => undef) is REQUIRED.
1943  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1944  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1945
1946  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1947  # i.e., all the camels are made bold-faced.
1948
1949   Examples: levels and ignore_level2 - what does camel match?
1950  ---------------------------------------------------------------------------
1951   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
1952  -----------------------|---------------------------------------------------
1953     1        false      |   yes    yes      yes          yes        yes
1954     2        false      |   yes    yes      no           yes        yes
1955     3        false      |   yes    no       no           yes        yes
1956     4        false      |   yes    no       no           no         yes
1957  -----------------------|---------------------------------------------------
1958     1        true       |   yes    yes      yes          yes        yes
1959     2        true       |   yes    yes      yes          yes        yes
1960     3        true       |   yes    no       yes          yes        yes
1961     4        true       |   yes    no       yes          no         yes
1962  ---------------------------------------------------------------------------
1963   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1964         at any level.
1965
1966=back
1967
1968=head2 Other Methods
1969
1970=over 4
1971
1972=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1973
1974=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1975
1976Changes the value of specified keys and returns the changed part.
1977
1978    $Collator = Unicode::Collate->new(level => 4);
1979
1980    $Collator->eq("perl", "PERL"); # false
1981
1982    %old = $Collator->change(level => 2); # returns (level => 4).
1983
1984    $Collator->eq("perl", "PERL"); # true
1985
1986    $Collator->change(%old); # returns (level => 2).
1987
1988    $Collator->eq("perl", "PERL"); # false
1989
1990Not all C<(key,value)>s are allowed to be changed.
1991See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1992
1993In the scalar context, returns the modified collator
1994(but it is B<not> a clone from the original).
1995
1996    $Collator->change(level => 2)->eq("perl", "PERL"); # true
1997
1998    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1999
2000    $Collator->change(level => 4)->eq("perl", "PERL"); # false
2001
2002=item C<$version = $Collator-E<gt>version()>
2003
2004Returns the version number (a string) of the Unicode Standard
2005which the C<table> file used by the collator object is based on.
2006If the table does not include a version line (starting with C<@version>),
2007returns C<"unknown">.
2008
2009=item C<UCA_Version()>
2010
2011Returns the revision number of UTS #10 this module consults,
2012that should correspond with the DUCET incorporated.
2013
2014=item C<Base_Unicode_Version()>
2015
2016Returns the version number of UTS #10 this module consults,
2017that should correspond with the DUCET incorporated.
2018
2019=back
2020
2021=head1 EXPORT
2022
2023No method will be exported.
2024
2025=head1 INSTALL
2026
2027Though this module can be used without any C<table> file,
2028to use this module easily, it is recommended to install a table file
2029in the UCA format, by copying it under the directory
2030<a place in @INC>/Unicode/Collate.
2031
2032The most preferable one is "The Default Unicode Collation Element Table"
2033(aka DUCET), available from the Unicode Consortium's website:
2034
2035   http://www.unicode.org/Public/UCA/
2036
2037   http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
2038
2039If DUCET is not installed, it is recommended to copy the file
2040from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2041to <a place in @INC>/Unicode/Collate/allkeys.txt
2042manually.
2043
2044=head1 CAVEATS
2045
2046=over 4
2047
2048=item Normalization
2049
2050Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2051module (see L<Unicode::Normalize>).
2052
2053If you need not it (say, in the case when you need not
2054handle any combining characters),
2055assign C<(normalization =E<gt> undef)> explicitly.
2056
2057-- see 6.5 Avoiding Normalization, UTS #10.
2058
2059=item Conformance Test
2060
2061The Conformance Test for the UCA is available
2062under L<http://www.unicode.org/Public/UCA/>.
2063
2064For F<CollationTest_SHIFTED.txt>,
2065a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2066for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2067C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2068
2069If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2070C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2071C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2072C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2073
2074B<Unicode::Normalize is required to try The Conformance Test.>
2075
2076=back
2077
2078=head1 AUTHOR, COPYRIGHT AND LICENSE
2079
2080The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2081<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2014,
2082SADAHIRO Tomoyuki. Japan. All rights reserved.
2083
2084This module is free software; you can redistribute it and/or
2085modify it under the same terms as Perl itself.
2086
2087The file Unicode/Collate/allkeys.txt was copied verbatim
2088from L<http://www.unicode.org/Public/UCA/6.3.0/allkeys.txt>.
2089For this file, Copyright (c) 2001-2012 Unicode, Inc.
2090Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
2091
2092=head1 SEE ALSO
2093
2094=over 4
2095
2096=item Unicode Collation Algorithm - UTS #10
2097
2098L<http://www.unicode.org/reports/tr10/>
2099
2100=item The Default Unicode Collation Element Table (DUCET)
2101
2102L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2103
2104=item The conformance test for the UCA
2105
2106L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2107
2108L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2109
2110=item Hangul Syllable Type
2111
2112L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2113
2114=item Unicode Normalization Forms - UAX #15
2115
2116L<http://www.unicode.org/reports/tr15/>
2117
2118=item Unicode Locale Data Markup Language (LDML) - UTS #35
2119
2120L<http://www.unicode.org/reports/tr35/>
2121
2122=back
2123
2124=cut
2125