xref: /openbsd-src/gnu/usr.bin/perl/cpan/Unicode-Collate/Collate.pm (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1package Unicode::Collate;
2
# Compile-time sanity check: pack('U', 0x41) must produce the string "A".
# If it does not, loading the module is aborted with the message below.
BEGIN {
    unless ("A" eq pack('U', 0x41)) {
	die "Unicode::Collate cannot stringify a Unicode code point\n";
    }
}
8
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13use File::Spec;
14
15no warnings 'utf8';
16
17our $VERSION = '0.97';
18our $PACKAGE = __PACKAGE__;
19
20### begin XS only ###
21require DynaLoader;
22our @ISA = qw(DynaLoader);
23bootstrap Unicode::Collate $VERSION;
24### end XS only ###
25
# Path components appended to each @INC directory, and the default table
# file name, used by read_table() to locate the collation element table.
my @Path = qw(Unicode Collate);
my $KeyFile = "allkeys.txt";

# Perl's boolean
use constant TRUE  => 1;
use constant FALSE => "";
# Returned by index() in scalar context when no match is found.
use constant NOMATCHPOS => -1;

# A coderef to get combining class imported from Unicode::Normalize
# (i.e. \&Unicode::Normalize::getCombinClass).
# This is also used as a HAS_UNICODE_NORMALIZE flag.
# It is assigned lazily in checkCollator() when normalization is requested.
my $CVgetCombinClass;

# Supported Levels
use constant MinLevel => 1;
use constant MaxLevel => 4;

# Minimum weights at level 2 and 3, respectively
use constant Min2Wt => 0x20;
use constant Min3Wt => 0x02;

# Shifted weight at 4th level
use constant Shift4Wt => 0xFFFF;

# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
# (pack template: one unsigned char followed by four 16-bit big-endian values)
use constant VCE_TEMPLATE => 'Cn4';

# A sort key: 16-bit weights
use constant KEY_TEMPLATE => 'n*';

# The tie-breaking: 32-bit weights
use constant TIE_TEMPLATE => 'N*';

# Level separator in a sort key:
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";

# As Unicode code point separator for hash keys.
# A joined code point string (denoted by JCPS below)
# like "65;768" is used for internal processing
# instead of Perl's Unicode string like "\x41\x{300}",
# as the native code point is different from the Unicode code point
# on EBCDIC platform.
# This character must not be included in any stringified
# representation of an integer.
use constant CODE_SEP => ';';
	# NOTE: in regex /;/ is used for $jcps!

# boolean values of variable weights
use constant NON_VAR => 0; # Non-Variable character
use constant VAR     => 1; # Variable character

# specific code points
# (first and last code points of the range tested for Hangul in getWt())
use constant Hangul_SIni   => 0xAC00;
use constant Hangul_SFin   => 0xD7A3;

# Logical_Order_Exception in PropList.txt
# Used by new() as the default 'rearrange' list when UCA_Version <= 11.
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];

# for highestFFFF and minimalFFFE
# Pre-packed collation elements returned by getWt() for U+FFFF / U+FFFE
# when the corresponding option is enabled.
my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
88
# Returns the UCA revision number used when the caller gives no
# UCA_Version parameter (see new()).
sub UCA_Version {
    return "26";
}
90
# Returns the Unicode Standard version string associated with this module.
sub Base_Unicode_Version {
    return "6.2.0";
}
92
93######
94
# Builds a string from a list of code points using the 'U*' pack template.
sub pack_U {
    my @code_points = @_;
    return pack 'U*', @code_points;
}
98
99######
100
# Set of accepted values for the 'variable' (a.k.a. 'alternate') parameter;
# checkCollator() lowercases the parameter and looks it up here.
my (%VariableOK);
@VariableOK{ qw/
    blanked  non-ignorable  shifted  shift-trimmed
  / } = (); # keys lowercased

# Parameter names that change() is allowed to modify.
our @ChangeOK = qw/
    alternate backwards level normalization rearrange
    katakana_before_hiragana upper_before_lower ignore_level2
    overrideHangul overrideCJK preprocess UCA_Version
    hangul_terminator variable identical highestFFFF minimalFFFE
  /;

# Parameter and internal key names for which change() croaks.
our @ChangeNG = qw/
    entry mapping table maxlength contraction
    ignoreChar ignoreName undefChar undefName rewrite
    versionTable alternateTable backwardsTable forwardsTable
    rearrangeTable variableTable
    derivCode normCode rearrangeHash backwardsFlag
    suppress suppressHash
    __useXS /; ### XS only
# The hash key 'ignored' is deleted at v 0.21.
# The hash key 'isShift' is deleted at v 0.23.
# The hash key 'combining' is deleted at v 0.24.
# The hash key 'entries' is deleted at v 0.30.
# The hash key 'L3_ignorable' is deleted at v 0.40.
126
# Returns the version string recorded from the table's "@version" line
# (see parseAtmark), or 'unknown' when none was recorded.
sub version {
    my ($self) = @_;
    my $table_version = $self->{versionTable};
    return $table_version ? $table_version : 'unknown';
}
131
# Lookup sets built from @ChangeOK / @ChangeNG; only key existence is
# tested (in change()), the values are all undef.
my (%ChangeOK, %ChangeNG);
@ChangeOK{ @ChangeOK } = ();
@ChangeNG{ @ChangeNG } = ();
135
# Changes collator parameters after construction.
#   $self->change(%params)
# Keys listed in %ChangeOK are stored in $self; keys listed in %ChangeNG
# make the call croak; any other key is silently ignored.
# 'alternate' is treated as an alias of 'variable' ('variable' wins when
# both are given).  The collator is re-validated with checkCollator().
# Returns the previous values (as a key/value list) in list context,
# or $self in scalar context.
sub change {
    my ($self, %request) = @_;
    my %previous;

    if (exists $request{alternate} && exists $request{variable}) {
        delete $request{alternate};
    }
    elsif (exists $request{alternate}) {
        $request{variable} = $request{alternate};
    }

    for my $name (keys %request) {
        if (exists $ChangeOK{$name}) {
            # remember the old value, then install the new one
            $previous{$name} = $self->{$name};
            $self->{$name}   = $request{$name};
            next;
        }
        croak "change of $name via change() is not allowed!"
            if exists $ChangeNG{$name};
        # anything else is ignored
    }

    $self->checkCollator();

    return %previous if wantarray;
    return $self;
}
159
# Validates a collation level number.
#   _checkLevel($level, $key)
# $key names the parameter being checked ('level' or 'backwards') and is
# only used in the error message.  Croaks when $level lies outside
# MinLevel..MaxLevel.
sub _checkLevel {
    my ($level, $key) = @_;

    croak(sprintf(
        "Illegal level %d (in value for key '%s') lower than %d.",
        $level, $key, MinLevel))
            unless MinLevel <= $level;

    croak(sprintf(
        "Unsupported level %d (in value for key '%s') higher than %d.",
        $level, $key, MaxLevel))
            unless $level <= MaxLevel;
}
170
# Maps a UCA_Version number to the routine stored in $self->{derivCode}
# by checkCollator().  A version missing from this table is rejected there
# as an illegal UCA version.
my %DerivCode = (
    8 => \&_derivCE_8,
    9 => \&_derivCE_9,
   11 => \&_derivCE_9, # 11 == 9
   14 => \&_derivCE_14,
   16 => \&_derivCE_14, # 16 == 14
   18 => \&_derivCE_18,
   20 => \&_derivCE_20,
   22 => \&_derivCE_22,
   24 => \&_derivCE_24,
   26 => \&_derivCE_24, # 26 == 24
);
183
# Validates the collator's parameters and (re)computes the derived fields
# derivCode, variable/alternate, backwardsFlag, rearrangeHash and normCode.
# Called by new() and change().  Croaks on an invalid level, UCA version,
# variable name, rearrange value or normalization form.  Returns nothing.
sub checkCollator {
    my $self = shift;
    _checkLevel($self->{level}, "level");

    # pick the routine registered for the requested UCA version
    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
	or croak "Illegal UCA version (passed $self->{UCA_Version}).";

    # 'variable' falls back to 'alternate', then to the values read from
    # the table, then to 'shifted'; both keys end up with the same
    # lowercased value.
    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
				$self->{alternateTable} || 'shifted';
    $self->{variable} = $self->{alternate} = lc($self->{variable});
    exists $VariableOK{ $self->{variable} }
	or croak "$PACKAGE unknown variable parameter name: $self->{variable}";

    # backwardsFlag is a bit mask with bit N set for each backwards level N.
    if (! defined $self->{backwards}) {
	$self->{backwardsFlag} = 0;
    } elsif (! ref $self->{backwards}) {
	_checkLevel($self->{backwards}, "backwards");
	$self->{backwardsFlag} = 1 << $self->{backwards};
    } else {
	my %level;
	$self->{backwardsFlag} = 0;
	for my $b (@{ $self->{backwards} }) {
	    _checkLevel($b, "backwards");
	    $level{$b} = 1;
	}
	# %level removes duplicates so that each bit is added only once
	for my $v (sort keys %level) {
	    $self->{backwardsFlag} += 1 << $v;
	}
    }

    defined $self->{rearrange} or $self->{rearrange} = [];
    ref $self->{rearrange}
	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";

    # keys of $self->{rearrangeHash} are $self->{rearrange}.
    # It stays undef when the rearrange list is empty.
    $self->{rearrangeHash} = undef;

    if (@{ $self->{rearrange} }) {
	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
    }

    # normCode stays undef when normalization is undef or 'prenormalized'.
    $self->{normCode} = undef;

    if (defined $self->{normalization}) {
	eval { require Unicode::Normalize };
	$@ and croak "Unicode::Normalize is required to normalize strings";

	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
	    $self->{normCode} = \&Unicode::Normalize::NFD;
	}
	elsif ($self->{normalization} ne 'prenormalized') {
	    my $norm = $self->{normalization};
	    $self->{normCode} = sub {
		Unicode::Normalize::normalize($norm, shift);
	    };
	    # a trial call on the empty string detects an unknown form name
	    eval { $self->{normCode}->("") }; # try
	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
	}
    }
    return;
}
247
# Constructor.
#   $collator = Unicode::Collate->new(%params)
# Stores %params in the object, loads the collation element table (unless
# 'table' is explicitly undef), applies 'entry' tailoring lines, fills in
# defaults, and validates everything through checkCollator().
sub new
{
    my ($class, @args) = @_;
    my $self = bless { @args }, $class;

### begin XS only ###
    # The compiled-in table is used only when no custom table is named and
    # none of the options that alter table parsing is defined.
    my $custom_parse = exists($self->{table})
        || grep { defined $self->{$_} }
               qw(rewrite undefName ignoreName undefChar ignoreChar);
    $self->{__useXS} = $custom_parse ? undef : \&_fetch_simple;
### end XS only ###

    # keys of $self->{suppressHash} are $self->{suppress}.
    # This must happen before read_table().
    my $suppress = $self->{suppress};
    if ($suppress && @$suppress) {
        $self->{suppressHash}{$_} = undef for @$suppress;
    }

    # If undef is passed explicitly, no file is read.
    unless (exists $self->{table}) {
        $self->{table} = $KeyFile;
    }
    if (defined $self->{table}) {
        $self->read_table();
    }

    # tailoring entries: one entry per non-empty line
    if ($self->{entry}) {
        for my $entry_line ($self->{entry} =~ /([^\n]+)/g) {
            $self->parseEntry($entry_line, TRUE);
        }
    }

    $self->{level}       ||= MaxLevel;
    $self->{UCA_Version} ||= UCA_Version();

    # defaults applied only when the caller did not pass the key at all
    my %default = (
        overrideHangul => FALSE,
        overrideCJK    => FALSE,
        normalization  => 'NFD',
    );
    for my $option (qw(overrideHangul overrideCJK normalization)) {
        $self->{$option} = $default{$option}
            unless exists $self->{$option};
    }

    unless (exists $self->{rearrange}) {
        $self->{rearrange} = $self->{rearrangeTable}
            || ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []);
    }
    unless (exists $self->{backwards}) {
        $self->{backwards} = $self->{backwardsTable};
    }

    $self->checkCollator();

    return $self;
}
297
# Handles one "@" directive line of a collation element table.
#   $self->parseAtmark($line)
# $line is the directive with its leading "@" already removed.
# version / variable / alternate keep the first value seen;
# backwards / forwards / rearrange accumulate values in array refs.
# Unrecognized directives are ignored.
sub parseAtmark {
    my ($self, $directive) = @_; # after s/^\s*\@//

    if (my ($version) = $directive =~ /^version\s*(\S*)/) {
        $self->{versionTable} ||= $version;
    }
    elsif (my ($variable) = $directive =~ /^variable\s+(\S*)/) { # since UTS #10-9
        $self->{variableTable} ||= $variable;
    }
    elsif (my ($alternate) = $directive =~ /^alternate\s+(\S*)/) { # till UTS #10-8
        $self->{alternateTable} ||= $alternate;
    }
    elsif (my ($backwards) = $directive =~ /^backwards\s+(\S*)/) {
        push @{ $self->{backwardsTable} }, $backwards;
    }
    elsif (my ($forwards) = $directive =~ /^forwards\s+(\S*)/) { # perhaps no use
        push @{ $self->{forwardsTable} }, $forwards;
    }
    elsif (my ($code_list) = $directive =~ /^rearrange\s+(.*)/) { # (\S*) is NG
        push @{ $self->{rearrangeTable} }, _getHexArray($code_list);
    }
}
321
# Loads the collation element table into $self.
#   $self->read_table()
# When $self->{__useXS} is set, only the lines supplied by _fetch_rest()
# are parsed.  Otherwise the file Unicode/Collate/$self->{table} is searched
# for in each @INC directory and parsed line by line: comment lines are
# skipped, "@" directives go to parseAtmark(), everything else goes to
# parseEntry().  Croaks when the table file cannot be opened from any
# @INC directory.
sub read_table {
    my $self = shift;

### begin XS only ###
    if ($self->{__useXS}) {
	my @rest = _fetch_rest(); # complex matter need to parse
	for my $line (@rest) {
	    next if $line =~ /^\s*#/;

	    if ($line =~ s/^\s*\@//) {
		$self->parseAtmark($line);
	    } else {
		$self->parseEntry($line);
	    }
	}
	return;
    }
### end XS only ###

    my($f, $fh);
    foreach my $d (@INC) {
	$f = File::Spec->catfile($d, @Path, $self->{table});
	# Three-argument open: the path is always opened read-only and is
	# never interpreted as a mode or a pipe specification.
	last if open($fh, '<', $f);
	$f = undef;
    }
    if (!defined $f) {
	$f = File::Spec->catfile(@Path, $self->{table});
	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
    }

    while (my $line = <$fh>) {
	next if $line =~ /^\s*#/;

	if ($line =~ s/^\s*\@//) {
	    $self->parseAtmark($line);
	} else {
	    $self->parseEntry($line);
	}
    }
    close $fh;
}
363
364
##
## get $line, parse it, and write an entry in $self
##
##   $self->parseEntry($line, $tailoring)
##
## $line is one line of a collation element table:
##   <charList> ; <collElement>... [# or % comment, used as the name]
## $tailoring is true when the line comes from the 'entry' parameter;
## such lines are not subject to 'suppress'.
## Updates $self->{mapping}, $self->{maxlength} and $self->{contraction}.
## Lines not starting with a hexadecimal digit are ignored.
## Croaks when the line lacks a ';' followed by collation elements.
##
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my $tailoring = shift;
    my($entry, @uv, @key);

    # The name defaults to the empty string so that the undefName and
    # ignoreName patterns below can be applied to lines without a comment
    # without raising "uninitialized value" warnings.
    my $name = '';

    if (defined $self->{rewrite}) {
	$line = $self->{rewrite}->($line);
    }

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes comment and gets name
    $name = $1
	if $line =~ s/[#%]\s*(.*)//;
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
	if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;
    # a suppressed contraction from the table (not from tailoring) is dropped
    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
		  exists $self->{suppressHash}{$uv[0]};
    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
	my $ele = pack_U(@uv);

	# regarded as if it were not entried in the table
	return
	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

	# replaced as completely ignorable
	$k = '[.0000.0000.0000.0000]'
	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
	my @wt = _getHexArray($arr);
	push @key, pack(VCE_TEMPLATE, $var, @wt);
	$is_L3_ignorable = FALSE
	    if $wt[0] || $wt[1] || $wt[2];
	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
	# is completely ignorable.
	# For expansion, an entry $is_L3_ignorable
	# if and only if "all" CEs are [.0000.0000.0000].
    }

    # a completely ignorable entry is stored as an empty list
    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    # remember the longest contraction starting with this code point
    if (@uv > 1) {
	if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
	    $self->{maxlength}{$uv[0]} = @uv;
	}
    }
    # record every shorter prefix of a contraction of three or more
    # code points in $self->{contraction}
    if (@uv > 2) {
	while (@uv) {
	    pop @uv;
	    my $fake_entry = join(CODE_SEP, @uv); # in JCPS
	    $self->{contraction}{$fake_entry} = 1;
	}
    }
}
442
443
# Returns the result of visualizeSortKey() applied to the sort key of $str.
#   $self->viewSortKey($str)
sub viewSortKey
{
    my ($self, $str) = @_;
    return $self->visualizeSortKey($self->getSortKey($str));
}
450
451
# Applies the optional 'preprocess' callback and then the optional
# normalization callback ($self->{normCode}) to a string.
#   $processed = $self->process($str)
# A callback is applied only when the stored value is a reference.
sub process
{
    my ($self, $str) = @_;

    for my $filter ($self->{preprocess}, $self->{normCode}) {
        next unless ref $filter;
        $str = $filter->($str);
    }
    return $str;
}
463
##
## arrayref of JCPS   = splitEnt(string to be collated)
## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
##
## Splits a string into collation entries (single code points or
## contractions found in the mapping), dropping code points that are
## treated as completely ignorable.  With a true second argument each
## entry also carries the index of its first code point and the index
## one past its last code point; rearrangement is skipped in that mode.
##
sub splitEnt
{
    my $self = shift;
    my $str  = shift;
    my $wLen = shift; # with Length

    my $map  = $self->{mapping};
    my $max  = $self->{maxlength};
    my $reH  = $self->{rearrangeHash};
    my $vers = $self->{UCA_Version};
    my $ver9 = $vers >= 9 && $vers <= 11;
    my $uXS  = $self->{__useXS}; ### XS only

    my @buf;

    # get array of Unicode code point of string.
    my @src = unpack_U($str);

    # rearrangement:
    # Character positions are not kept if rearranged,
    # then neglected if $wLen is true.
    if ($reH && ! $wLen) {
	for (my $i = 0; $i < @src; $i++) {
	    # swap a listed code point with its successor, then step over
	    # the pair
	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
		$i++;
	    }
	}
    }

    # remove a code point marked as a completely ignorable.
    for (my $i = 0; $i < @src; $i++) {
	if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) {
	    $src[$i] = undef;
	} elsif ($ver9) {
	    # UCA versions 9..11 only: drop code points whose mapping is
	    # an empty list before contractions are searched
	    $src[$i] = undef if $map->{ $src[$i] }
			   ? @{ $map->{ $src[$i] } } == 0
			   : $uXS && _ignorable_simple($src[$i]); ### XS only
	}
    }

    for (my $i = 0; $i < @src; $i++) {
	my $jcps = $src[$i];

	# skip removed code point
	if (! defined $jcps) {
	    # extend the end position of the previous entry over it
	    if ($wLen && @buf) {
		$buf[-1][2] = $i + 1;
	    }
	    next;
	}

	my $i_orig = $i;

	# find contraction
	if ($max->{$jcps}) {
	    my $temp_jcps = $jcps;
	    my $jcpsLen = 1;
	    my $maxLen = $max->{$jcps};

	    # contiguous contraction: keep the longest mapped sequence
	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
		next if ! defined $src[$p];
		$temp_jcps .= CODE_SEP . $src[$p];
		$jcpsLen++;
		if ($map->{$temp_jcps}) {
		    $jcps = $temp_jcps;
		    $i = $p;
		}
	    }

	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
	# This process requires Unicode::Normalize.
	# If "normalization" is undef, here should be skipped *always*
	# (in spite of bool value of $CVgetCombinClass),
	# since canonical ordering cannot be expected.
	# Blocked combining character should not be contracted.

	    # $self->{normCode} is false in the case of "prenormalized".
	    if ($self->{normalization}) {
		my $cont = $self->{contraction};
		my $preCC = 0;
		my $preCC_uc = 0;
		my $jcps_uc = $jcps;
		my(@out, @out_uc);

		# Two candidates are built in parallel: $jcps_uc may also
		# pass through prefixes recorded in $cont, while $jcps only
		# grows through sequences present in $map.
		for (my $p = $i + 1; $p < @src; $p++) {
		    next if ! defined $src[$p];
		    my $curCC = $CVgetCombinClass->($src[$p]);
		    last unless $curCC;
		    my $tail = CODE_SEP . $src[$p];

		    if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
					       $cont->{$jcps_uc.$tail})) {
			$jcps_uc .= $tail;
			push @out_uc, $p;
		    } else {
			$preCC_uc = $curCC;
		    }

		    if ($preCC != $curCC && $map->{$jcps.$tail}) {
			$jcps .= $tail;
			push @out, $p;
		    } else {
			$preCC = $curCC;
		    }
		}

		# use the $jcps_uc candidate only if it is itself mapped;
		# the consumed code points are removed from @src
		if ($map->{$jcps_uc}) {
		    $jcps = $jcps_uc;
		    $src[$_] = undef for @out_uc;
		} else {
		    $src[$_] = undef for @out;
		}
	    }
	}

	# skip completely ignorable
	if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
	    $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
	    if ($wLen && @buf) {
		$buf[-1][2] = $i + 1;
	    }
	    next;
	}

	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
    }
    return \@buf;
}
597
##
## VCE = _pack_override(input, codepoint, derivCode)
##
## input is one value returned by an overrideHangul/overrideCJK callback:
##   an array ref  => packed as the weights of a non-variable element;
##   a defined scalar => used as the first weight, followed by
##                       Min2Wt, Min3Wt and the code point;
##   undef         => the result of derivCode->(codepoint) is returned.
##
sub _pack_override ($$$) {
    my ($override, $code_point, $derive) = @_;

    return $derive->($code_point)
        unless defined $override;

    my @weights = ref($override)
        ? @$override
        : ($override, Min2Wt, Min3Wt, $code_point);

    return pack(VCE_TEMPLATE, NON_VAR, @weights);
}
614
##
## list of VCE = getWt(JCPS)
##
## Returns the collation elements for one entry produced by splitEnt(),
## each passed through varCE().  The lookup order is: the highestFFFF /
## minimalFFFE special cases, the mapping read from the table, the XS
## built-in table, then the Hangul and CJK overrides or the derived
## collation elements.  Returns an empty list for an undefined argument.
##
sub getWt
{
    my $self = shift;
    my $u    = shift;
    my $map  = $self->{mapping};
    my $der  = $self->{derivCode};
    my $uXS  = $self->{__useXS}; ### XS only

    return if !defined $u;
    # 'eq' is used because $u is a JCPS string, possibly "65;768"
    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
    return map($self->varCE($_), @{ $map->{$u} }) if $map->{$u};
### begin XS only ###
    return map($self->varCE($_), _fetch_simple($u))
	if $uXS && _exists_simple($u);
### end XS only ###

    # JCPS must not be a contraction, then it's a code point.
    if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
	my $hang = $self->{overrideHangul};
	my @hangulCE;
	if ($hang) {
	    # user callback supplies the weights
	    @hangulCE = map _pack_override($_, $u, $der), $hang->($u);
	} elsif (!defined $hang) {
	    # overrideHangul explicitly undef: use derived elements
	    @hangulCE = $der->($u);
	} else {
	    # default (false but defined): decompose the syllable and
	    # look up its parts, honouring contractions among them
	    my $max  = $self->{maxlength};
	    my @decH = _decompHangul($u);

	    if (@decH == 2) {
		my $contract = join(CODE_SEP, @decH);
		@decH = ($contract) if $map->{$contract};
	    } else { # must be <@decH == 3>
		if ($max->{$decH[0]}) {
		    my $contract = join(CODE_SEP, @decH);
		    if ($map->{$contract}) {
			@decH = ($contract);
		    } else {
			$contract = join(CODE_SEP, @decH[0,1]);
			$map->{$contract} and @decH = ($contract, $decH[2]);
		    }
		    # even if V's ignorable, LT contraction is not supported.
		    # If such a situation were required, NFD should be used.
		}
		if (@decH == 3 && $max->{$decH[1]}) {
		    my $contract = join(CODE_SEP, @decH[1,2]);
		    $map->{$contract} and @decH = ($decH[0], $contract);
		}
	    }

	    @hangulCE = map({
		    $map->{$_} ? @{ $map->{$_} } :
		$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
		    $der->($_);
		} @decH);
	}
	return map $self->varCE($_), @hangulCE;
    } else {
	my $cjk  = $self->{overrideCJK};
	my $vers = $self->{UCA_Version};
	if ($cjk && _isUIdeo($u, $vers)) {
	    # user callback supplies the weights
	    my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
	    return map $self->varCE($_), @cjkCE;
	}
	# UCA version 8 with overrideCJK defined but false
	if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
	    return map $self->varCE($_), _uideoCE_8($u);
	}
	return map $self->varCE($_), $der->($u);
    }
}
688
689
##
## string sortkey = getSortKey(string arg)
##
## The string is run through process() and splitEnt(); the weights of
## every entry are collected with getWt() and turned into a key by
## mk_SortKey().  When 'hangul_terminator' is set, a terminator element
## is inserted at the hangul boundaries detected below.  When 'identical'
## is set, or UCA_Version >= 26 at the maximum level, a level separator
## is appended, followed (for 'identical') by the packed code points.
##
sub getSortKey
{
    my ($self, $orig) = @_;
    my $str       = $self->process($orig);
    my $entries   = $self->splitEnt($str); # get an arrayref of JCPS
    my $version   = $self->{UCA_Version};
    my $term_wt   = $self->{hangul_terminator};
    my $level     = $self->{level};
    my $identical = $self->{identical};

    my @weights; # weight arrays
    if ($term_wt) {
        my $prev_hst = '';
        my $term_ce  = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term_wt, 0,0,0));

        for my $jcps (@$entries) {
            # weird things like VL, TL-contraction are not considered!
            my $cur_hst = join '', map getHST($_, $version), split /;/, $jcps;

            my $at_boundary =
                   ($prev_hst && !$cur_hst)    # hangul before non-hangul
                || ($prev_hst =~ /L\z/ && $cur_hst =~ /^T/)
                || ($prev_hst =~ /V\z/ && $cur_hst =~ /^L/)
                || ($prev_hst =~ /T\z/ && $cur_hst =~ /^[LV]/);
            push @weights, $term_ce if $at_boundary;

            $prev_hst = $cur_hst;
            push @weights, $self->getWt($jcps);
        }
        push @weights, $term_ce if $prev_hst; # end at hangul
    }
    else {
        push @weights, map { $self->getWt($_) } @$entries;
    }

    my $key = $self->mk_SortKey(\@weights); ### XS only

    if ($identical || $version >= 26 && $level == MaxLevel) {
        $key .= LEVEL_SEP;
        $key .= pack(TIE_TEMPLATE, unpack_U($str)) if $identical;
    }
    return $key;
}

735
736
##
## int compare = cmp(string a, string b)
##
## cmp and the six relational methods below each compare the sort keys
## of their two string arguments with the Perl string operator of the
## same name.
##
sub cmp {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) cmp $self->getSortKey($rhs);
}

sub eq {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) eq $self->getSortKey($rhs);
}

sub ne {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) ne $self->getSortKey($rhs);
}

sub lt {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) lt $self->getSortKey($rhs);
}

sub le {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) le $self->getSortKey($rhs);
}

sub gt {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) gt $self->getSortKey($rhs);
}

sub ge {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) ge $self->getSortKey($rhs);
}
747
##
## list[strings] sorted = sort(list[strings] arg)
##
## Each sort key is computed once, the keys are compared as strings,
## and the original strings are returned in the resulting order.
##
sub sort {
    my ($obj, @strings) = @_;

    my @keyed   = map { [ $obj->getSortKey($_), $_ ] } @strings;
    my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;

    return map { $_->[1] } @ordered;
}
758
759
##
## bool _nonIgnorAtLevel(arrayref weights, int level)
##
## True when any weight from level MinLevel up to the given level
## is non-zero.  Returns nothing when the weights are undefined.
##
sub _nonIgnorAtLevel($$)
{
    my ($wt, $lv) = @_;
    return if ! defined $wt;

    my @nonzero_levels = grep { $wt->[$_ - 1] != 0 } MinLevel .. $lv;
    return @nonzero_levels ? TRUE : FALSE;
}
770
##
## bool _eqArray(
##    arrayref of arrayref[weights] source,
##    arrayref of arrayref[weights] substr,
##    int level)
## * comparison of graphemes vs graphemes.
##   @$source >= @$substr must be true (check it before call this);
##
## Returns 1 when the first @$substr graphemes of source carry the same
## number of weight arrays as those of substr and all weights agree up to
## the given level; returns nothing otherwise.
##
sub _eqArray($$$)
{
    my ($source, $substr, $lev) = @_;

    for my $g (0 .. $#{$substr}) {
	my $src_g = $source->[$g];
	my $sub_g = $substr->[$g];

	# Do the $g'th graphemes have the same number of AV weights?
	return if @$src_g != @$sub_g;

	for my $w (0 .. $#{$sub_g}) {
	    for my $v (0 .. $lev - 1) {
		return if $src_g->[$w][$v] != $sub_g->[$w][$v];
	    }
	}
    }
    return 1;
}
797
##
## (int position, int length)
## int position = index(string, substring, position, [undoc'ed global])
##
## With "global" (only for the list context),
##  returns list of arrayref[position, length].
##
## Searches string for a part whose collation weights (up to the
## collator's level) equal those of substring, starting at position.
## Croaks when 'preprocess' or a normalization routine is in effect.
## Without a match, returns an empty list in list context or
## NOMATCHPOS in scalar context.
##
sub index
{
    my $self = shift;
    $self->{preprocess} and
	croak "Don't use Preprocess with index(), match(), etc.";
    $self->{normCode} and
	croak "Don't use Normalization with index(), match(), etc.";

    my $str  = shift;
    my $len  = length($str);
    my $sub  = shift;
    my $subE = $self->splitEnt($sub);
    my $pos  = @_ ? shift : 0;
       $pos  = 0 if $pos < 0;
    my $glob = shift;

    my $lev  = $self->{level};
    # true when ignorables following a variable element are to be dropped
    my $v2i  = $self->{UCA_Version} >= 9 &&
		$self->{variable} ne 'non-ignorable';

    # a substring without any entry matches with length 0
    if (! @$subE) {
	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
	return $glob
	    ? map([$_, 0], $temp..$len)
	    : wantarray ? ($temp,0) : $temp;
    }
    $len < $pos
	and return wantarray ? () : NOMATCHPOS;
    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
    @$strE
	or return wantarray ? () : NOMATCHPOS;

    # @strWt / @subWt: one element per grapheme, each a list of weight
    # arrays; @iniPos / @finPos: positions parallel to @strWt.
    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);

    # build the grapheme list of the substring
    my $last_is_variable;
    for my $vwt (map $self->getWt($_), @$subE) {
	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

	# "Ignorable (L1, L2) after Variable" since track. v. 9
	if ($v2i) {
	    if ($var) {
		$last_is_variable = TRUE;
	    }
	    elsif (!$wt[0]) { # ignorable
		$to_be_pushed = FALSE if $last_is_variable;
	    }
	    else {
		$last_is_variable = FALSE;
	    }
	}

	# a non-variable element with zero primary weight is attached to
	# the preceding grapheme; anything else starts a new grapheme
	if (@subWt && !$var && !$wt[0]) {
	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
	} elsif ($to_be_pushed) {
	    push @subWt, [ \@wt ];
	}
	# else ===> skipped
    }

    my $count = 0;
    my $end = @$strE - 1;

    $last_is_variable = FALSE; # reuse
    for (my $i = 0; $i <= $end; ) { # no $i++
	my $found_base = 0;

	# fetch a grapheme
	while ($i <= $end && $found_base == 0) {
	    for my $vwt ($self->getWt($strE->[$i][0])) {
		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

		# "Ignorable (L1, L2) after Variable" since track. v. 9
		if ($v2i) {
		    if ($var) {
			$last_is_variable = TRUE;
		    }
		    elsif (!$wt[0]) { # ignorable
			$to_be_pushed = FALSE if $last_is_variable;
		    }
		    else {
			$last_is_variable = FALSE;
		    }
		}

		if (@strWt && !$var && !$wt[0]) {
		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
		    $finPos[-1] = $strE->[$i][2];
		} elsif ($to_be_pushed) {
		    push @strWt, [ \@wt ];
		    # when one entry yields more than one base element,
		    # only the first gets a start position and only the
		    # last keeps an end position
		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
		    $finPos[-1] = NOMATCHPOS if $found_base;
		    push @finPos, $strE->[$i][2];
		    $found_base++;
		}
		# else ===> no-op
	    }
	    $i++;
	}

	# try to match
	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
	    if ($iniPos[0] != NOMATCHPOS &&
		    $finPos[$#subWt] != NOMATCHPOS &&
			_eqArray(\@strWt, \@subWt, $lev)) {
		my $temp = $iniPos[0] + $pos;

		if ($glob) {
		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
		    # drop all matched graphemes but one; the last one is
		    # removed by the shifts below
		    splice @strWt,  0, $#subWt;
		    splice @iniPos, 0, $#subWt;
		    splice @finPos, 0, $#subWt;
		}
		else {
		    return wantarray
			? ($temp, $finPos[$#subWt] - $iniPos[0])
			:  $temp;
		}
	    }
	    shift @strWt;
	    shift @iniPos;
	    shift @finPos;
	}
    }

    return $glob
	? @g_ret
	: wantarray ? () : NOMATCHPOS;
}
935
##
## scalarref to matching part = match(string, substring)
##
## In list context the matching part itself is returned instead of a
## reference.  Returns nothing when index() finds no match.
##
sub match
{
    my ($self, $str, $sub) = @_;

    my @found = $self->index($str, $sub)
        or return;

    # A copy is taken on purpose: an lvalue ref \substr should be avoided,
    # since its value is affected by modification of its referent.
    my $matched = substr($str, $found[0], $found[1]);
    return wantarray ? $matched : \$matched;
}
952
##
## arrayref matching parts = gmatch(string, substring)
##
## Returns the list of all parts of string reported by index() in its
## "global" mode.
##
sub gmatch
{
    my ($self, $str, $sub) = @_;

    my @spans = $self->index($str, $sub, 0, 'g');
    return map { substr($str, $_->[0], $_->[1]) } @spans;
}
964
##
## bool subst'ed = subst(string, substring, replace)
##
## Replaces, in place, the first part of string matching substring.
## replace is either a replacement string or a code ref that receives
## the matching part and returns its replacement.
##
sub subst
{
    my $self = shift;
    my $callback = ref $_[2] eq 'CODE' ? $_[2] : FALSE;

    # $_[0] is used throughout so that the caller's string is modified
    my ($pos, $len) = $self->index($_[0], $_[1])
        or return FALSE;

    if ($callback) {
        my $matched = substr($_[0], $pos, $len);
        substr($_[0], $pos, $len, $callback->($matched));
    }
    else {
        substr($_[0], $pos, $len, $_[2]);
    }
    return TRUE;
}
986
##
## int count = gsubst(string, substring, replace)
##
## Replaces, in place, every part of string matching substring and
## returns the number of replacements.  replace is either a replacement
## string or a code ref that receives the matching part.
##
sub gsubst
{
    my $self = shift;
    my $callback = ref $_[2] eq 'CODE' ? $_[2] : FALSE;

    # Replacement is carried out from the end, then use reverse.
    my @spans = reverse $self->index($_[0], $_[1], 0, 'g');

    for my $span (@spans) {
        my ($pos, $len) = @$span;
        if ($callback) {
            my $matched = substr($_[0], $pos, $len);
            substr($_[0], $pos, $len, $callback->($matched));
        }
        else {
            substr($_[0], $pos, $len, $_[2]);
        }
    }
    return scalar @spans;
}
1008
10091;
1010__END__
1011
1012=head1 NAME
1013
1014Unicode::Collate - Unicode Collation Algorithm
1015
1016=head1 SYNOPSIS
1017
1018  use Unicode::Collate;
1019
1020  #construct
1021  $Collator = Unicode::Collate->new(%tailoring);
1022
1023  #sort
1024  @sorted = $Collator->sort(@not_sorted);
1025
1026  #compare
1027  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1028
1029B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1030according to Perl's Unicode support. See L<perlunicode>,
1031L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
Otherwise, use C<preprocess> or decode them beforehand.
1033
1034=head1 DESCRIPTION
1035
1036This module is an implementation of Unicode Technical Standard #10
1037(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1038
1039=head2 Constructor and Tailoring
1040
1041The C<new> method returns a collator object. If new() is called
1042with no parameters, the collator should do the default collation.
1043
1044   $Collator = Unicode::Collate->new(
1045      UCA_Version => $UCA_Version,
1046      alternate => $alternate, # alias for 'variable'
1047      backwards => $levelNumber, # or \@levelNumbers
1048      entry => $element,
1049      hangul_terminator => $term_primary_weight,
1050      highestFFFF => $bool,
1051      identical => $bool,
1052      ignoreName => qr/$ignoreName/,
1053      ignoreChar => qr/$ignoreChar/,
1054      ignore_level2 => $bool,
1055      katakana_before_hiragana => $bool,
1056      level => $collationLevel,
1057      minimalFFFE => $bool,
1058      normalization  => $normalization_form,
1059      overrideCJK => \&overrideCJK,
1060      overrideHangul => \&overrideHangul,
1061      preprocess => \&preprocess,
1062      rearrange => \@charList,
1063      rewrite => \&rewrite,
1064      suppress => \@charList,
1065      table => $filename,
1066      undefName => qr/$undefName/,
1067      undefChar => qr/$undefChar/,
1068      upper_before_lower => $bool,
1069      variable => $variable,
1070   );
1071
1072=over 4
1073
1074=item UCA_Version
1075
1076If the revision (previously "tracking version") number of UCA is given,
1077behavior of that revision is emulated on collating.
1078If omitted, the return value of C<UCA_Version()> is used.
1079
1080The following revisions are supported.  The default is 26.
1081
1082     UCA       Unicode Standard         DUCET (@version)
1083   -------------------------------------------------------
1084      8              3.1                3.0.1 (3.0.1d9)
1085      9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1086     11              4.0                4.0.0 (4.0.0)
1087     14             4.1.0               4.1.0 (4.1.0)
1088     16              5.0                5.0.0 (5.0.0)
1089     18             5.1.0               5.1.0 (5.1.0)
1090     20             5.2.0               5.2.0 (5.2.0)
1091     22             6.0.0               6.0.0 (6.0.0)
1092     24             6.1.0               6.1.0 (6.1.0)
1093     26             6.2.0               6.2.0 (6.2.0)
1094
1095* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1096since C<UCA_Version> 22.
1097
1098* Fully ignorable characters were ignored, and would not interrupt
1099contractions with C<UCA_Version> 9 and 11.
1100
1101* Treatment of ignorables after variables and some behaviors
1102were changed at C<UCA_Version> 9.
1103
1104* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1105depend on C<UCA_Version>.
1106
1107* Many hangul jamo are assigned at C<UCA_Version> 20, which will affect
1108C<hangul_terminator>.
1109
1110=item alternate
1111
1112-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1113
1114For backward compatibility, C<alternate> (old name) can be used
1115as an alias for C<variable>.
1116
1117=item backwards
1118
1119-- see 3.4 Backward Accents, UTS #10.
1120
1121     backwards => $levelNumber or \@levelNumbers
1122
1123Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1124If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1125forwards at all the levels.
1126
1127=item entry
1128
1129-- see 5 Tailoring; 3.6.1 File Format, UTS #10.
1130
1131If the same character (or a sequence of characters) exists
1132in the collation element table through C<table>,
1133mapping to collation elements is overridden.
1134If it does not exist, the mapping is defined additionally.
1135
1136    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11370063 0068 ; [.0E6A.0020.0002.0063] # ch
11380043 0068 ; [.0E6A.0020.0007.0043] # Ch
11390043 0048 ; [.0E6A.0020.0008.0043] # CH
1140006C 006C ; [.0F4C.0020.0002.006C] # ll
1141004C 006C ; [.0F4C.0020.0007.004C] # Ll
1142004C 004C ; [.0F4C.0020.0008.004C] # LL
114300F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1144006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
114500D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1146004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1147ENTRY
1148
1149    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
115000E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
115100C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1152ENTRY
1153
1154B<NOTE:> The code point in the UCA file format (before C<';'>)
1155B<must> be a Unicode code point (defined as hexadecimal),
1156but not a native code point.
1157So C<0063> must always denote C<U+0063>,
1158but not a character of C<"\x63">.
1159
1160Weighting may vary depending on collation element table.
1161So ensure the weights defined in C<entry> will be consistent with
1162those in the collation element table loaded via C<table>.
1163
1164In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1165and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1166(as a value between C<0E60> and C<0E6D>)
1167makes ordering as C<C E<lt> CH E<lt> D>.
1168Exactly speaking DUCET already has some characters between C<C> and C<D>:
1169C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1170C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1171and C<c-curl> (C<U+0255>) with C<0E69>.
1172Then primary weight C<0E6A> for C<CH> makes C<CH>
1173ordered between C<c-curl> and C<D>.
1174
1175=item hangul_terminator
1176
1177-- see 7.1.4 Trailing Weights, UTS #10.
1178
1179If a true value is given (non-zero but should be positive),
1180it will be added as a terminator primary weight to the end of
1181every standard Hangul syllable. Secondary and any higher weights
1182for terminator are set to zero.
1183If the value is false or C<hangul_terminator> key does not exist,
1184insertion of terminator weights will not be performed.
1185
1186Boundaries of Hangul syllables are determined
1187according to conjoining Jamo behavior in F<the Unicode Standard>
1188and F<HangulSyllableType.txt>.
1189
1190B<Implementation Note:>
1191(1) For expansion mapping (Unicode character mapped
1192to a sequence of collation elements), a terminator will not be added
1193between collation elements, even if Hangul syllable boundary exists there.
1194Addition of terminator is restricted to the next position
1195to the last collation element.
1196
1197(2) Non-conjoining Hangul letters
1198(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1199automatically terminated with a terminator primary weight.
1200These characters may need terminator included in a collation element
1201table beforehand.
1202
1203=item highestFFFF
1204
1205-- see 5.14 Collation Elements, UTS #35.
1206
1207If the parameter is made true, C<U+FFFF> has a highest primary weight.
1208When both C<$coll-E<gt>ge($str, "abc")> and
1209C<$coll-E<gt>le($str, "abc\x{FFFF}")> are true, it is expected that C<$str>
1210begins with C<"abc">, or another primary equivalent.
1211C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1212such as C<"abc\x{FFFF}xyz">.
1213
1214C<$coll-E<gt>le($str, "abc\x{FFFF}")> works almost like
1215C<$coll-E<gt>lt($str, "abd")>, but the latter requires you to know which
1216letter comes next after C<c>. For a certain language where C<ch> is the next
1217letter, C<"abch"> is greater than C<"abc\x{FFFF}">, but lesser than C<"abd">.
1218
1219Note: This is equivalent to C<entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]'>.
1220Any other character than C<U+FFFF> can be tailored by C<entry>.
1221
1222=item identical
1223
1224-- see A.3 Deterministic Comparison, UTS #10.
1225
1226By default, strings whose weights are equal should be equal,
1227even though their code points are not equal.
1228Completely ignorable characters are ignored.
1229
1230If the parameter is made true, a final, tie-breaking level is used.
1231If no difference of weights is found after the comparison through
1232all the level specified by C<level>, the comparison with code points
1233will be performed.
1234For the tie-breaking comparison, the sort key has code points
1235of the original string appended.
1236Completely ignorable characters are not ignored.
1237
1238If C<preprocess> and/or C<normalization> is applied, the code points
1239of the string after them (in NFD by default) are used.
1240
1241=item ignoreChar
1242
1243=item ignoreName
1244
1245-- see 3.6.2 Variable Weighting, UTS #10.
1246
1247Makes the entry in the table completely ignorable;
1248i.e. as if the weights were zero at all level.
1249
1250Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1251will be ignored. Through C<ignoreName>, any character whose name
1252(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1253will be ignored.
1254
1255E.g. when 'a' and 'e' are ignorable,
1256'element' is equal to 'lament' (or 'lmnt').
1257
1258=item ignore_level2
1259
1260-- see 5.1 Parametric Tailoring, UTS #10.
1261
1262By default, case-sensitive comparison (that is level 3 difference)
1263won't ignore accents (that is level 2 difference).
1264
1265If the parameter is made true, accents (and other primary ignorable
1266characters) are ignored, even though cases are taken into account.
1267
1268B<NOTE>: C<level> should be 3 or greater.
1269
1270=item katakana_before_hiragana
1271
1272-- see 7.2 Tertiary Weight Table, UTS #10.
1273
1274By default, hiragana is before katakana.
1275If the parameter is made true, this is reversed.
1276
1277B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1278distinctions must occur in level 3, and their weights at level 3 must be
1279same as those mentioned in 7.3.1, UTS #10.
1280If you define your collation elements which violate this requirement,
1281this parameter does not work validly.
1282
1283=item level
1284
1285-- see 4.3 Form Sort Key, UTS #10.
1286
1287Set the maximum level.
1288Any higher levels than the specified one are ignored.
1289
1290  Level 1: alphabetic ordering
1291  Level 2: diacritic ordering
1292  Level 3: case ordering
1293  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1294
1295  ex.level => 2,
1296
1297If omitted, the maximum is the 4th.
1298
1299B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1300But this module only uses weights within 0xFFFF.
1301When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1302and 'shift-trimmed'), the level 4 may be unreliable.
1303
1304See also C<identical>.
1305
1306=item minimalFFFE
1307
1308-- see 5.14 Collation Elements, UTS #35.
1309
1310If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1311The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1312first compares C<$a1> and C<$b1> at level 1, and
1313then C<$a2> and C<$b2> at level 1, as follows.
1314
1315        "ab\x{FFFE}a"
1316        "Ab\x{FFFE}a"
1317        "ab\x{FFFE}c"
1318        "Ab\x{FFFE}c"
1319        "ab\x{FFFE}xyz"
1320        "abc\x{FFFE}def"
1321        "abc\x{FFFE}xYz"
1322        "aBc\x{FFFE}xyz"
1323        "abcX\x{FFFE}def"
1324        "abcx\x{FFFE}xyz"
1325        "b\x{FFFE}aaa"
1326        "bbb\x{FFFE}a"
1327
1328Note: This is equivalent to C<entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]'>.
1329Any other character than C<U+FFFE> can be tailored by C<entry>.
1330
1331=item normalization
1332
1333-- see 4.1 Normalize, UTS #10.
1334
1335If specified, strings are normalized before preparation of sort keys
1336(the normalization is executed after preprocess).
1337
1338A form name C<Unicode::Normalize::normalize()> accepts will be applied
1339as C<$normalization_form>.
1340Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1341See C<Unicode::Normalize::normalize()> for detail.
1342If omitted, C<'NFD'> is used.
1343
1344C<normalization> is performed after C<preprocess> (if defined).
1345
1346Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1347though they are not concerned with C<Unicode::Normalize::normalize()>.
1348
1349If C<undef> (not a string C<"undef">) is passed explicitly
1350as the value for this key,
1351no normalization is carried out (this may make tailoring easier
1352if normalization is not desired). Under C<(normalization =E<gt> undef)>,
1353only contiguous contractions are resolved;
1354e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1355C<A-cedilla-ring> would be primary equal to C<A>.
1356In this point,
1357C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1358B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1359
1360In the case of C<(normalization =E<gt> "prenormalized")>,
1361no normalization is performed, but
1362discontiguous contractions with combining characters are performed.
1363Therefore
1364C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1365B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1366If source strings are finely prenormalized,
1367C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1368
1369Except C<(normalization =E<gt> undef)>,
1370B<Unicode::Normalize> is required (see also B<CAVEATS>).
1371
1372=item overrideCJK
1373
1374-- see 7.1 Derived Collation Elements, UTS #10.
1375
1376By default, CJK unified ideographs are ordered in Unicode codepoint
1377order, but those in the CJK Unified Ideographs block are lesser than
1378those in the CJK Unified Ideographs Extension A etc.
1379
1380    In the CJK Unified Ideographs block:
1381    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1382    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1383    U+4E00..U+9FC3 if UCA_Version is 18.
1384    U+4E00..U+9FCB if UCA_Version is 20 or 22.
1385    U+4E00..U+9FCC if UCA_Version is 24 or 26.
1386
1387    In the CJK Unified Ideographs Extension blocks:
1388    Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1389    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or greater.
1390    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater.
1391
1392Through C<overrideCJK>, ordering of CJK unified ideographs (including
1393extensions) can be overridden.
1394
1395ex. CJK unified ideographs in the JIS code point order.
1396
1397  overrideCJK => sub {
1398      my $u = shift;             # get a Unicode codepoint
1399      my $b = pack('n', $u);     # to UTF-16BE
1400      my $s = your_unicode_to_sjis_converter($b); # convert
1401      my $n = unpack('n', $s);   # convert sjis to short
1402      [ $n, 0x20, 0x2, $u ];     # return the collation element
1403  },
1404
1405The return value may be an arrayref of 1st to 4th weights as shown
1406above. The return value may be an integer as the primary weight
1407as shown below.  If C<undef> is returned, the default derived
1408collation element will be used.
1409
1410  overrideCJK => sub {
1411      my $u = shift;             # get a Unicode codepoint
1412      my $b = pack('n', $u);     # to UTF-16BE
1413      my $s = your_unicode_to_sjis_converter($b); # convert
1414      my $n = unpack('n', $s);   # convert sjis to short
1415      return $n;                 # return the primary weight
1416  },
1417
1418The return value may be a list containing zero or more of
1419an arrayref, an integer, or C<undef>.
1420
1421ex. ignores all CJK unified ideographs.
1422
1423  overrideCJK => sub {()}, # CODEREF returning empty list
1424
1425   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1426   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1427
1428If C<undef> is passed explicitly as the value for this key,
1429weights for CJK unified ideographs are treated as undefined.
1430But assignment of weight for CJK unified ideographs
1431in C<table> or C<entry> is still valid.
1432
1433B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1434C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1435C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1436ideographs. But they can't be overridden via C<overrideCJK> when you use
1437DUCET, as the table includes weights for them. C<table> or C<entry> has
1438priority over C<overrideCJK>.
1439
1440=item overrideHangul
1441
1442-- see 7.1 Derived Collation Elements, UTS #10.
1443
1444By default, Hangul syllables are decomposed into Hangul Jamo,
1445even if C<(normalization =E<gt> undef)>.
1446But the mapping of Hangul syllables may be overridden.
1447
1448This parameter works like C<overrideCJK>, so see there for examples.
1449
1450If you want to override the mapping of Hangul syllables,
1451NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1452Hangul syllables before overriding. FCD may decompose Hangul syllables
1453as the case may be.
1454
1455If C<undef> is passed explicitly as the value for this key,
1456weight for Hangul syllables is treated as undefined
1457without decomposition into Hangul Jamo.
1458But definition of weight for Hangul syllables
1459in C<table> or C<entry> is still valid.
1460
1461=item preprocess
1462
1463-- see 5.4 Preprocessing, UTS #10.
1464
1465If specified, the coderef is used to preprocess each string
1466before the formation of sort keys.
1467
1468ex. dropping English articles, such as "a" or "the".
1469Then, "the pen" is before "a pencil".
1470
1471     preprocess => sub {
1472           my $str = shift;
1473           $str =~ s/\b(?:an?|the)\s+//gi;
1474           return $str;
1475        },
1476
1477C<preprocess> is performed before C<normalization> (if defined).
1478
1479ex. decoding strings in a legacy encoding such as shift-jis:
1480
1481    $sjis_collator = Unicode::Collate->new(
1482        preprocess => \&your_shiftjis_to_unicode_decoder,
1483    );
1484    @result = $sjis_collator->sort(@shiftjis_strings);
1485
1486B<Note:> Strings returned from the coderef will be interpreted
1487according to Perl's Unicode support. See L<perlunicode>,
1488L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1489
1490=item rearrange
1491
1492-- see 3.5 Rearrangement, UTS #10.
1493
1494Characters that are not coded in logical order and to be rearranged.
1495If C<UCA_Version> is equal to or lesser than 11, default is:
1496
1497    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1498
1499If you want to disallow any rearrangement, pass C<undef> or C<[]>
1500(a reference to empty list) as the value for this key.
1501
1502If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1503(i.e. no rearrangement).
1504
1505B<According to the version 9 of UCA, this parameter shall not be used;
1506but no warning is issued at present.>
1507
1508=item rewrite
1509
1510If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1511The coderef will get each line, and then should return a rewritten line
1512according to the UCA file format.
1513If the coderef returns an empty line, the line will be skipped.
1514
1515e.g. any primary ignorable characters into tertiary ignorable:
1516
1517    rewrite => sub {
1518        my $line = shift;
1519        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1520        return $line;
1521    },
1522
1523This example shows rewriting weights. C<rewrite> is allowed to
1524affect code points, weights, and the name.
1525
1526B<NOTE>: C<table> is available to use another table file;
1527preparing a modified table once would be more efficient than
1528rewriting lines on reading an unmodified table every time.
1529
1530=item suppress
1531
1532-- see suppress contractions in 5.14.11 Special-Purpose Commands,
1533UTS #35 (LDML).
1534
1535Contractions beginning with the specified characters are suppressed,
1536even if those contractions are defined in C<table>.
1537
1538An example for Russian and some languages using the Cyrillic script:
1539
1540    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1541
1542where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1543
1544B<NOTE>: Contractions via C<entry> will not be suppressed.
1545
1546=item table
1547
1548-- see 3.6 Default Unicode Collation Element Table, UTS #10.
1549
1550You can use another collation element table if desired.
1551
1552The table file should be located in the F<Unicode/Collate> directory
1553on C<@INC>. Say, if the filename is F<Foo.txt>,
1554the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1555
1556By default, F<allkeys.txt> (as the filename of DUCET) is used.
1557If you will prepare your own table file, any name other than F<allkeys.txt>
1558may be better to avoid namespace conflict.
1559
1560B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1561module, and it may save time at the run time.
1562Explicitly saying C<table =E<gt> 'allkeys.txt'> (or using another table),
1563or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1564C<rewrite> will prevent this module from using the compiled DUCET.
1565
1566If C<undef> is passed explicitly as the value for this key,
1567no file is read (but you can define collation elements via C<entry>).
1568
1569A typical way to define a collation element table
1570without any file of table:
1571
1572   $onlyABC = Unicode::Collate->new(
1573       table => undef,
1574       entry => << 'ENTRIES',
15750061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
15760041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
15770062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
15780042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
15790063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
15800043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1581ENTRIES
1582    );
1583
1584If C<ignoreName> or C<undefName> is used, character names should be
1585specified as a comment (following C<#>) on each line.
1586
1587=item undefChar
1588
1589=item undefName
1590
1591-- see 6.3.4 Reducing the Repertoire, UTS #10.
1592
1593Undefines the collation element as if it were unassigned in the C<table>.
1594This reduces the size of the table.
1595If an unassigned character appears in the string to be collated,
1596the sort key is made from its codepoint
1597as a single-character collation element,
1598as it is greater than any other assigned collation elements
1599(in the codepoint order among the unassigned characters).
1600But, it'd be better to ignore characters
1601unfamiliar to you and maybe never used.
1602
1603Through C<undefChar>, any character matching C<qr/$undefChar/>
1604will be undefined. Through C<undefName>, any character whose name
1605(given in the C<table> file as a comment) matches C<qr/$undefName/>
1606will be undefined.
1607
1608ex. Collation weights for beyond-BMP characters are not stored in object:
1609
1610    undefChar => qr/[^\0-\x{fffd}]/,
1611
1612=item upper_before_lower
1613
1614-- see 6.6 Case Comparisons, UTS #10.
1615
1616By default, lowercase is before uppercase.
1617If the parameter is made true, this is reversed.
1618
1619B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1620distinctions must occur in level 3, and their weights at level 3 must be
1621same as those mentioned in 7.3.1, UTS #10.
1622If you define collation elements that differ from this requirement,
1623this parameter will not work correctly.
1624
1625=item variable
1626
1627-- see 3.6.2 Variable Weighting, UTS #10.
1628
1629This key allows for variable weighting of variable collation elements,
1630which are marked with an ASTERISK in the table
1631(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1632
1633   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1634
1635These names are case-insensitive.
1636By default (if specification is omitted), 'shifted' is adopted.
1637
1638   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1639                    considered at the 4th level.
1640
1641   'Non-Ignorable'  Variable elements are not reset to ignorable.
1642
1643   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1644                    their level 4 weight is replaced by the old level 1 weight.
1645                    Level 4 weight for Non-Variable elements is 0xFFFF.
1646
1647   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1648                    are trimmed.
1649
1650=back
1651
1652=head2 Methods for Collation
1653
1654=over 4
1655
1656=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1657
1658Sorts a list of strings.
1659
1660=item C<$result = $Collator-E<gt>cmp($a, $b)>
1661
1662Returns 1 (when C<$a> is greater than C<$b>)
1663or 0 (when C<$a> is equal to C<$b>)
1664or -1 (when C<$a> is lesser than C<$b>).
1665
1666=item C<$result = $Collator-E<gt>eq($a, $b)>
1667
1668=item C<$result = $Collator-E<gt>ne($a, $b)>
1669
1670=item C<$result = $Collator-E<gt>lt($a, $b)>
1671
1672=item C<$result = $Collator-E<gt>le($a, $b)>
1673
1674=item C<$result = $Collator-E<gt>gt($a, $b)>
1675
1676=item C<$result = $Collator-E<gt>ge($a, $b)>
1677
1678They work like the operators of the same names.
1679
1680   eq : whether $a is equal to $b.
1681   ne : whether $a is not equal to $b.
1682   lt : whether $a is lesser than $b.
1683   le : whether $a is lesser than $b or equal to $b.
1684   gt : whether $a is greater than $b.
1685   ge : whether $a is greater than $b or equal to $b.
1686
1687=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1688
1689-- see 4.3 Form Sort Key, UTS #10.
1690
1691Returns a sort key.
1692
1693You compare the sort keys using a binary comparison
1694and get the result of the comparison of the strings using UCA.
1695
1696   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1697
1698      is equivalent to
1699
1700   $Collator->cmp($a, $b)
1701
1702=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1703
1704Converts a sorting key into its representation form.
1705If C<UCA_Version> is 8, the output is slightly different.
1706
1707   use Unicode::Collate;
1708   my $c = Unicode::Collate->new();
1709   print $c->viewSortKey("Perl"),"\n";
1710
1711   # output:
1712   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1713   #  Level 1               Level 2               Level 3               Level 4
1714
1715=back
1716
1717=head2 Methods for Searching
1718
1719The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1720like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1721except that they do not interpret patterns; they match only a literal substring.
1722
1723B<DISCLAIMER:> If the C<preprocess> or C<normalization> parameter is true
1724for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1725C<subst>, C<gsubst>) will croak, as the position and the length might
1726differ from those on the specified string.
1727
1728C<rearrange> and C<hangul_terminator> parameters are neglected.
1729C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1730matching and searching, as it doesn't matter whether greater or lesser.
1731
1732=over 4
1733
1734=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1735
1736=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1737
1738If C<$substring> matches a part of C<$string>, returns
1739the position of the first occurrence of the matching part in scalar context;
1740in list context, returns a two-element list of
1741the position and the length of the matching part.
1742
1743If C<$substring> does not match any part of C<$string>,
1744returns C<-1> in scalar context and
1745an empty list in list context.
1746
1747e.g. you say
1748
1749  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1750                                     # (normalization => undef) is REQUIRED.
1751  my $str = "Ich mu\x{DF} studieren Perl.";
1752  my $sub = "M\x{DC}SS";
1753  my $match;
1754  if (my($pos,$len) = $Collator->index($str, $sub)) {
1755      $match = substr($str, $pos, $len);
1756  }
1757
1758and get C<"muE<szlig>"> in C<$match> since C<"muE<szlig>">
1759is primary equal to C<"ME<Uuml>SS">.
1760
1761=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1762
1763=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1764
1765If C<$substring> matches a part of C<$string>, in scalar context, returns
1766B<a reference to> the first occurrence of the matching part
1767(C<$match_ref> is always true if matches,
1768since every reference is B<true>);
1769in list context, returns the first occurrence of the matching part.
1770
1771If C<$substring> does not match any part of C<$string>,
1772returns C<undef> in scalar context and
1773an empty list in list context.
1774
1775e.g.
1776
1777    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1778	print "matches [$$match_ref].\n";
1779    } else {
1780	print "doesn't match.\n";
1781    }
1782
1783     or
1784
1785    if (($match) = $Collator->match($str, $sub)) { # list context
1786	print "matches [$match].\n";
1787    } else {
1788	print "doesn't match.\n";
1789    }
1790
1791=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1792
1793If C<$substring> matches a part of C<$string>, returns
1794all the matching parts (or matching count in scalar context).
1795
1796If C<$substring> does not match any part of C<$string>,
1797returns an empty list.
1798
1799=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1800
1801If C<$substring> matches a part of C<$string>,
1802the first occurrence of the matching part is replaced by C<$replacement>
1803(C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
1804
1805C<$replacement> can be a C<CODEREF>,
1806taking the matching part as an argument,
1807and returning a string to replace the matching part
1808(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1809
1810=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1811
1812If C<$substring> matches a part of C<$string>,
1813all the occurrences of the matching part are replaced by C<$replacement>
1814(C<$string> is modified) and C<$count> is returned.
1815
1816C<$replacement> can be a C<CODEREF>,
1817taking the matching part as an argument,
1818and returning a string to replace the matching part
1819(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1820
1821e.g.
1822
1823  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1824                                     # (normalization => undef) is REQUIRED.
1825  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1826  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1827
1828  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1829  # i.e., all the camels are made bold-faced.
1830
1831   Examples: levels and ignore_level2 - what does camel match?
1832  ---------------------------------------------------------------------------
1833   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
1834  -----------------------|---------------------------------------------------
1835     1        false      |   yes    yes      yes          yes        yes
1836     2        false      |   yes    yes      no           yes        yes
1837     3        false      |   yes    no       no           yes        yes
1838     4        false      |   yes    no       no           no         yes
1839  -----------------------|---------------------------------------------------
1840     1        true       |   yes    yes      yes          yes        yes
1841     2        true       |   yes    yes      yes          yes        yes
1842     3        true       |   yes    no       yes          yes        yes
1843     4        true       |   yes    no       yes          no         yes
1844  ---------------------------------------------------------------------------
1845   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1846         at any level.
1847
1848=back
1849
1850=head2 Other Methods
1851
1852=over 4
1853
1854=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1855
1856=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1857
1858Changes the value of specified keys and returns the changed part.
1859
1860    $Collator = Unicode::Collate->new(level => 4);
1861
1862    $Collator->eq("perl", "PERL"); # false
1863
1864    %old = $Collator->change(level => 2); # returns (level => 4).
1865
1866    $Collator->eq("perl", "PERL"); # true
1867
1868    $Collator->change(%old); # returns (level => 2).
1869
1870    $Collator->eq("perl", "PERL"); # false
1871
1872Not all C<(key,value)>s are allowed to be changed.
1873See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1874
1875In the scalar context, returns the modified collator
1876(but it is B<not> a clone from the original).
1877
1878    $Collator->change(level => 2)->eq("perl", "PERL"); # true
1879
1880    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1881
1882    $Collator->change(level => 4)->eq("perl", "PERL"); # false
1883
1884=item C<$version = $Collator-E<gt>version()>
1885
1886Returns the version number (a string) of the Unicode Standard
1887which the C<table> file used by the collator object is based on.
1888If the table does not include a version line (starting with C<@version>),
1889returns C<"unknown">.
1890
1891=item C<UCA_Version()>
1892
1893Returns the revision number of UTS #10 this module consults,
1894that should correspond with the DUCET incorporated.
1895
1896=item C<Base_Unicode_Version()>
1897
1898Returns the version number of UTS #10 this module consults,
1899that should correspond with the DUCET incorporated.
1900
1901=back
1902
1903=head1 EXPORT
1904
1905No method will be exported.
1906
1907=head1 INSTALL
1908
1909Though this module can be used without any C<table> file,
1910to use this module easily, it is recommended to install a table file
1911in the UCA format, by copying it under the directory
1912<a place in @INC>/Unicode/Collate.
1913
1914The most preferable one is "The Default Unicode Collation Element Table"
1915(aka DUCET), available from the Unicode Consortium's website:
1916
1917   http://www.unicode.org/Public/UCA/
1918
1919   http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1920
1921If DUCET is not installed, it is recommended to copy the file
1922from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1923to <a place in @INC>/Unicode/Collate/allkeys.txt
1924manually.
1925
1926=head1 CAVEATS
1927
1928=over 4
1929
1930=item Normalization
1931
1932Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1933module (see L<Unicode::Normalize>).
1934
1935If you do not need it (say, when you do not need to
1936handle any combining characters),
1937assign C<normalization =E<gt> undef> explicitly.
1938
1939-- see 6.5 Avoiding Normalization, UTS #10.
1940
1941=item Conformance Test
1942
1943The Conformance Test for the UCA is available
1944under L<http://www.unicode.org/Public/UCA/>.
1945
1946For F<CollationTest_SHIFTED.txt>,
1947a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1948for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1949C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1950
1951If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
1952C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
1953C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
1954C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
1955
1956B<Unicode::Normalize is required to run the Conformance Test.>
1957
1958=back
1959
1960=head1 AUTHOR, COPYRIGHT AND LICENSE
1961
1962The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1963<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2012,
1964SADAHIRO Tomoyuki. Japan. All rights reserved.
1965
1966This module is free software; you can redistribute it and/or
1967modify it under the same terms as Perl itself.
1968
1969The file Unicode/Collate/allkeys.txt was copied verbatim
1970from L<http://www.unicode.org/Public/UCA/6.2.0/allkeys.txt>.
1971For this file, Copyright (c) 2001-2012 Unicode, Inc.
1972Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1973
1974=head1 SEE ALSO
1975
1976=over 4
1977
1978=item Unicode Collation Algorithm - UTS #10
1979
1980L<http://www.unicode.org/reports/tr10/>
1981
1982=item The Default Unicode Collation Element Table (DUCET)
1983
1984L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1985
1986=item The conformance test for the UCA
1987
1988L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1989
1990L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1991
1992=item Hangul Syllable Type
1993
1994L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1995
1996=item Unicode Normalization Forms - UAX #15
1997
1998L<http://www.unicode.org/reports/tr15/>
1999
2000=item Unicode Locale Data Markup Language (LDML) - UTS #35
2001
2002L<http://www.unicode.org/reports/tr35/>
2003
2004=back
2005
2006=cut
2007