xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/lib/Unicode/Collate.pm (revision 0:68f95e015346)
1package Unicode::Collate;
2
3BEGIN {
4    unless ("A" eq pack('U', 0x41)) {
5	die "Unicode::Collate cannot stringify a Unicode code point\n";
6    }
7}
8
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13use File::Spec;
14
15no warnings 'utf8';
16
17require Exporter;
18
19our $VERSION = '0.33';
20our $PACKAGE = __PACKAGE__;
21
22our @ISA = qw(Exporter);
23
24our %EXPORT_TAGS = ();
25our @EXPORT_OK = ();
26our @EXPORT = ();
27
28(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
29our $KeyFile = "allkeys.txt";
30
31# Perl's boolean
32use constant TRUE  => 1;
33use constant FALSE => "";
34use constant NOMATCHPOS => -1;
35
36# A coderef to get combining class imported from Unicode::Normalize
37# (i.e. \&Unicode::Normalize::getCombinClass).
38# This is also used as a HAS_UNICODE_NORMALIZE flag.
39our $CVgetCombinClass;
40
41# Supported Levels
42use constant MinLevel => 1;
43use constant MaxLevel => 4;
44
45# Minimum weights at level 2 and 3, respectively
46use constant Min2Wt => 0x20;
47use constant Min3Wt => 0x02;
48
49# Shifted weight at 4th level
50use constant Shift4Wt => 0xFFFF;
51
52# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
53# PROBLEM: The Default Unicode Collation Element Table
54# has weights over 0xFFFF at the 4th level.
55# The tie-breaking in the variable weights
56# other than "shift" (as well as "shift-trimmed") is unreliable.
57use constant VCE_TEMPLATE => 'Cn4';
58
59# A sort key: 16-bit weights
60# See also the PROBLEM on VCE_TEMPLATE above.
61use constant KEY_TEMPLATE => 'n*';
62
63# Level separator in a sort key:
64# i.e. pack(KEY_TEMPLATE, 0)
65use constant LEVEL_SEP => "\0\0";
66
67# As Unicode code point separator for hash keys.
68# A joined code point string (denoted by JCPS below)
69# like "65;768" is used for internal processing
70# instead of Perl's Unicode string like "\x41\x{300}",
71# as the native code point is different from the Unicode code point
72# on EBCDIC platform.
73# This character must not be included in any stringified
74# representation of an integer.
75use constant CODE_SEP => ';';
76
77# boolean values of variable weights
78use constant NON_VAR => 0; # Non-Variable character
79use constant VAR     => 1; # Variable character
80
81# specific code points
82use constant Hangul_LBase  => 0x1100;
83use constant Hangul_LIni   => 0x1100;
84use constant Hangul_LFin   => 0x1159;
85use constant Hangul_LFill  => 0x115F;
86use constant Hangul_VBase  => 0x1161;
87use constant Hangul_VIni   => 0x1160;
88use constant Hangul_VFin   => 0x11A2;
89use constant Hangul_TBase  => 0x11A7;
90use constant Hangul_TIni   => 0x11A8;
91use constant Hangul_TFin   => 0x11F9;
92use constant Hangul_TCount => 28;
93use constant Hangul_NCount => 588;
94use constant Hangul_SBase  => 0xAC00;
95use constant Hangul_SIni   => 0xAC00;
96use constant Hangul_SFin   => 0xD7A3;
97use constant CJK_UidIni    => 0x4E00;
98use constant CJK_UidFin    => 0x9FA5;
99use constant CJK_ExtAIni   => 0x3400;
100use constant CJK_ExtAFin   => 0x4DB5;
101use constant CJK_ExtBIni   => 0x20000;
102use constant CJK_ExtBFin   => 0x2A6D6;
103use constant BMP_Max       => 0xFFFF;
104
105# Logical_Order_Exception in PropList.txt
106# TODO: synchronization with change of PropList.txt.
107our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
108
109sub UCA_Version { "11" }
110
111sub Base_Unicode_Version { "4.0" }
112
113######
114
115sub pack_U {
116    return pack('U*', @_);
117}
118
119sub unpack_U {
120    return unpack('U*', pack('U*').shift);
121}
122
123######
124
125my (%VariableOK);
126@VariableOK{ qw/
127    blanked  non-ignorable  shifted  shift-trimmed
128  / } = (); # keys lowercased
129
130our @ChangeOK = qw/
131    alternate backwards level normalization rearrange
132    katakana_before_hiragana upper_before_lower
133    overrideHangul overrideCJK preprocess UCA_Version
134    hangul_terminator variable
135  /;
136
137our @ChangeNG = qw/
138    entry mapping table maxlength
139    ignoreChar ignoreName undefChar undefName variableTable
140    versionTable alternateTable backwardsTable forwardsTable rearrangeTable
141    derivCode normCode rearrangeHash L3_ignorable
142    backwardsFlag
143  /;
144# The hash key 'ignored' is deleted at v 0.21.
145# The hash key 'isShift' is deleted at v 0.23.
146# The hash key 'combining' is deleted at v 0.24.
147# The hash key 'entries' is deleted at v 0.30.
148
149sub version {
150    my $self = shift;
151    return $self->{versionTable} || 'unknown';
152}
153
154my (%ChangeOK, %ChangeNG);
155@ChangeOK{ @ChangeOK } = ();
156@ChangeNG{ @ChangeNG } = ();
157
158sub change {
159    my $self = shift;
160    my %hash = @_;
161    my %old;
162    if (exists $hash{variable} && exists $hash{alternate}) {
163	delete $hash{alternate};
164    }
165    elsif (!exists $hash{variable} && exists $hash{alternate}) {
166	$hash{variable} = $hash{alternate};
167    }
168    foreach my $k (keys %hash) {
169	if (exists $ChangeOK{$k}) {
170	    $old{$k} = $self->{$k};
171	    $self->{$k} = $hash{$k};
172	}
173	elsif (exists $ChangeNG{$k}) {
174	    croak "change of $k via change() is not allowed!";
175	}
176	# else => ignored
177    }
178    $self->checkCollator;
179    return wantarray ? %old : $self;
180}
181
182sub _checkLevel {
183    my $level = shift;
184    my $key   = shift; # 'level' or 'backwards'
185    MinLevel <= $level or croak sprintf
186	"Illegal level %d (in value for key '%s') lower than %d.",
187	    $level, $key, MinLevel;
188    $level <= MaxLevel or croak sprintf
189	"Unsupported level %d (in value for key '%s') higher than %d.",
190	    $level, $key, MaxLevel;
191}
192
193my %DerivCode = (
194    8 => \&_derivCE_8,
195    9 => \&_derivCE_9,
196   11 => \&_derivCE_9, # 11 == 9
197);
198
199sub checkCollator {
200    my $self = shift;
201    _checkLevel($self->{level}, "level");
202
203    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
204	or croak "Illegal UCA version (passed $self->{UCA_Version}).";
205
206    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
207				$self->{alternateTable} || 'shifted';
208    $self->{variable} = $self->{alternate} = lc($self->{variable});
209    exists $VariableOK{ $self->{variable} }
210	or croak "$PACKAGE unknown variable tag name: $self->{variable}";
211
212    if (! defined $self->{backwards}) {
213	$self->{backwardsFlag} = 0;
214    }
215    elsif (! ref $self->{backwards}) {
216	_checkLevel($self->{backwards}, "backwards");
217	$self->{backwardsFlag} = 1 << $self->{backwards};
218    }
219    else {
220	my %level;
221	$self->{backwardsFlag} = 0;
222	for my $b (@{ $self->{backwards} }) {
223	    _checkLevel($b, "backwards");
224	    $level{$b} = 1;
225	}
226	for my $v (sort keys %level) {
227	    $self->{backwardsFlag} += 1 << $v;
228	}
229    }
230
231    defined $self->{rearrange} or $self->{rearrange} = [];
232    ref $self->{rearrange}
233	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
234
235    # keys of $self->{rearrangeHash} are $self->{rearrange}.
236    $self->{rearrangeHash} = undef;
237
238    if (@{ $self->{rearrange} }) {
239	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
240    }
241
242    $self->{normCode} = undef;
243
244    if (defined $self->{normalization}) {
245	eval { require Unicode::Normalize };
246	$@ and croak "Unicode::Normalize is required to normalize strings";
247
248	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
249
250	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
251	    $self->{normCode} = \&Unicode::Normalize::NFD;
252	}
253	elsif ($self->{normalization} ne 'prenormalized') {
254	    my $norm = $self->{normalization};
255	    $self->{normCode} = sub {
256		Unicode::Normalize::normalize($norm, shift);
257	    };
258	    eval { $self->{normCode}->("") }; # try
259	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
260	}
261    }
262    return;
263}
264
265sub new
266{
267    my $class = shift;
268    my $self = bless { @_ }, $class;
269
270    # If undef is passed explicitly, no file is read.
271    $self->{table} = $KeyFile if ! exists $self->{table};
272    $self->read_table if defined $self->{table};
273
274    if ($self->{entry}) {
275	$self->parseEntry($_) foreach split /\n/, $self->{entry};
276    }
277
278    $self->{level} ||= MaxLevel;
279    $self->{UCA_Version} ||= UCA_Version();
280
281    $self->{overrideHangul} = FALSE
282	if ! exists $self->{overrideHangul};
283    $self->{overrideCJK} = FALSE
284	if ! exists $self->{overrideCJK};
285    $self->{normalization} = 'NFD'
286	if ! exists $self->{normalization};
287    $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
288	if ! exists $self->{rearrange};
289    $self->{backwards} = $self->{backwardsTable}
290	if ! exists $self->{backwards};
291
292    $self->checkCollator;
293
294    return $self;
295}
296
297sub read_table {
298    my $self = shift;
299
300    my $filepath = File::Spec->catfile($Path, $self->{table});
301    open my $fk, "<$filepath"
302	or croak "File does not exist at $filepath";
303
304    while (<$fk>) {
305	next if /^\s*#/;
306	unless (s/^\s*\@//) {
307	    $self->parseEntry($_);
308	    next;
309	}
310
311	if (/^version\s*(\S*)/) {
312	    $self->{versionTable} ||= $1;
313	}
314	elsif (/^variable\s+(\S*)/) { # since UTS #10-9
315	    $self->{variableTable} ||= $1;
316	}
317	elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
318	    $self->{alternateTable} ||= $1;
319	}
320	elsif (/^backwards\s+(\S*)/) {
321	    push @{ $self->{backwardsTable} }, $1;
322	}
323	elsif (/^forwards\s+(\S*)/) { # parhaps no use
324	    push @{ $self->{forwardsTable} }, $1;
325	}
326	elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
327	    push @{ $self->{rearrangeTable} }, _getHexArray($1);
328	}
329    }
330    close $fk;
331}
332
333
334##
335## get $line, parse it, and write an entry in $self
336##
337sub parseEntry
338{
339    my $self = shift;
340    my $line = shift;
341    my($name, $entry, @uv, @key);
342
343    return if $line !~ /^\s*[0-9A-Fa-f]/;
344
345    # removes comment and gets name
346    $name = $1
347	if $line =~ s/[#%]\s*(.*)//;
348    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
349
350    # gets element
351    my($e, $k) = split /;/, $line;
352    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
353	if ! $k;
354
355    @uv = _getHexArray($e);
356    return if !@uv;
357
358    $entry = join(CODE_SEP, @uv); # in JCPS
359
360    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
361	my $ele = pack_U(@uv);
362
363	# regarded as if it were not entried in the table
364	return
365	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
366
367	# replaced as completely ignorable
368	$k = '[.0000.0000.0000.0000]'
369	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
370    }
371
372    # replaced as completely ignorable
373    $k = '[.0000.0000.0000.0000]'
374	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
375
376    my $is_L3_ignorable = TRUE;
377
378    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
379	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
380	my @wt = _getHexArray($arr);
381	push @key, pack(VCE_TEMPLATE, $var, @wt);
382	$is_L3_ignorable = FALSE
383	    if $wt[0] + $wt[1] + $wt[2] != 0;
384	  # if $arr !~ /[1-9A-Fa-f]/; NG
385	  # Conformance Test shows L3-ignorable is completely ignorable.
386	# For expansion, an entry $is_L3_ignorable
387	# if and only if "all" CEs are [.0000.0000.0000].
388    }
389
390    $self->{mapping}{$entry} = \@key;
391
392    if (@uv > 1) {
393	(!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
394	    and $self->{maxlength}{$uv[0]} = @uv;
395    }
396    else {
397	$is_L3_ignorable
398	    ? ($self->{L3_ignorable}{$uv[0]} = TRUE)
399	    : ($self->{L3_ignorable}{$uv[0]} and
400	       $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key.
401    }
402}
403
404
405##
406## VCE = _varCE(variable term, VCE)
407##
408sub _varCE
409{
410    my $vbl = shift;
411    my $vce = shift;
412    if ($vbl eq 'non-ignorable') {
413	return $vce;
414    }
415    my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
416
417    if ($var) {
418	return pack(VCE_TEMPLATE, $var, 0, 0, 0,
419		$vbl eq 'blanked' ? $wt[3] : $wt[0]);
420    }
421    elsif ($vbl eq 'blanked') {
422	return $vce;
423    }
424    else {
425	return pack(VCE_TEMPLATE, $var, @wt[0..2],
426	    $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
427    }
428}
429
430sub viewSortKey
431{
432    my $self = shift;
433    $self->visualizeSortKey($self->getSortKey(@_));
434}
435
436sub visualizeSortKey
437{
438    my $self = shift;
439    my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
440
441    if ($self->{UCA_Version} <= 8) {
442	$view =~ s/ ?0000 ?/|/g;
443    } else {
444	$view =~ s/\b0000\b/|/g;
445    }
446    return "[$view]";
447}
448
449
450##
451## arrayref of JCPS   = splitEnt(string to be collated)
452## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
453##
454sub splitEnt
455{
456    my $self = shift;
457    my $wLen = $_[1];
458
459    my $code = $self->{preprocess};
460    my $norm = $self->{normCode};
461    my $map  = $self->{mapping};
462    my $max  = $self->{maxlength};
463    my $reH  = $self->{rearrangeHash};
464    my $ign  = $self->{L3_ignorable};
465    my $ver9 = $self->{UCA_Version} >= 9;
466
467    my ($str, @buf);
468
469    if ($wLen) {
470	$code and croak "Preprocess breaks character positions. "
471			. "Don't use with index(), match(), etc.";
472	$norm and croak "Normalization breaks character positions. "
473			. "Don't use with index(), match(), etc.";
474	$str = $_[0];
475    }
476    else {
477	$str = $_[0];
478	$str = &$code($str) if ref $code;
479	$str = &$norm($str) if ref $norm;
480    }
481
482    # get array of Unicode code point of string.
483    my @src = unpack_U($str);
484
485    # rearrangement:
486    # Character positions are not kept if rearranged,
487    # then neglected if $wLen is true.
488    if ($reH && ! $wLen) {
489	for (my $i = 0; $i < @src; $i++) {
490	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
491		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
492		$i++;
493	    }
494	}
495    }
496
497    # To remove a character marked as a completely ignorable.
498    for (my $i = 0; $i < @src; $i++) {
499	$src[$i] = undef
500	    if _isIllegal($src[$i]) || ($ver9 && $ign->{ $src[$i] });
501    }
502
503    for (my $i = 0; $i < @src; $i++) {
504	my $jcps = $src[$i];
505	next if ! defined $jcps;
506	my $i_orig = $i;
507
508	if ($max->{$jcps}) { # contract
509	    my $temp_jcps = $jcps;
510	    my $jcpsLen = 1;
511	    my $maxLen = $max->{$jcps};
512
513	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
514		next if ! defined $src[$p];
515		$temp_jcps .= CODE_SEP . $src[$p];
516		$jcpsLen++;
517		if ($map->{$temp_jcps}) {
518		    $jcps = $temp_jcps;
519		    $i = $p;
520		}
521	    }
522
523	# not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
524	# This process requires Unicode::Normalize.
525	# If "normalization" is undef, here should be skipped *always*
526	# (in spite of bool value of $CVgetCombinClass),
527	# since canonical ordering cannot be expected.
528	# Blocked combining character should not be contracted.
529
530	    if ($self->{normalization})
531	    # $self->{normCode} is false in the case of "prenormalized".
532	    {
533		my $preCC = 0;
534		my $curCC = 0;
535
536		for (my $p = $i + 1; $p < @src; $p++) {
537		    next if ! defined $src[$p];
538		    $curCC = $CVgetCombinClass->($src[$p]);
539		    last unless $curCC;
540		    my $tail = CODE_SEP . $src[$p];
541		    if ($preCC != $curCC && $map->{$jcps.$tail}) {
542			$jcps .= $tail;
543			$src[$p] = undef;
544		    } else {
545			$preCC = $curCC;
546		    }
547		}
548	    }
549	}
550
551	if ($wLen) {
552	    for (; $i + 1 < @src; $i++) {
553		last if defined $src[$i + 1];
554	    }
555	}
556
557	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
558    }
559    return \@buf;
560}
561
562
563##
564## list of VCE = getWt(JCPS)
565##
566sub getWt
567{
568    my $self = shift;
569    my $u    = shift;
570    my $vbl  = $self->{variable};
571    my $map  = $self->{mapping};
572    my $der  = $self->{derivCode};
573
574    return if !defined $u;
575    return map(_varCE($vbl, $_), @{ $map->{$u} })
576	if $map->{$u};
577
578    # JCPS must not be a contraction, then it's a code point.
579    if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
580	my $hang = $self->{overrideHangul};
581	my @hangulCE;
582	if ($hang) {
583	    @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
584	}
585	elsif (!defined $hang) {
586	    @hangulCE = $der->($u);
587	}
588	else {
589	    my $max  = $self->{maxlength};
590	    my @decH = _decompHangul($u);
591
592	    if (@decH == 2) {
593		my $contract = join(CODE_SEP, @decH);
594		@decH = ($contract) if $map->{$contract};
595	    } else { # must be <@decH == 3>
596		if ($max->{$decH[0]}) {
597		    my $contract = join(CODE_SEP, @decH);
598		    if ($map->{$contract}) {
599			@decH = ($contract);
600		    } else {
601			$contract = join(CODE_SEP, @decH[0,1]);
602			$map->{$contract} and @decH = ($contract, $decH[2]);
603		    }
604		    # even if V's ignorable, LT contraction is not supported.
605		    # If such a situatution were required, NFD should be used.
606		}
607		if (@decH == 3 && $max->{$decH[1]}) {
608		    my $contract = join(CODE_SEP, @decH[1,2]);
609		    $map->{$contract} and @decH = ($decH[0], $contract);
610		}
611	    }
612
613	    @hangulCE = map({
614		    $map->{$_} ? @{ $map->{$_} } : $der->($_);
615		} @decH);
616	}
617	return map _varCE($vbl, $_), @hangulCE;
618    }
619    elsif (CJK_UidIni  <= $u && $u <= CJK_UidFin  ||
620	   CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
621	   CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
622	my $cjk  = $self->{overrideCJK};
623	return map _varCE($vbl, $_),
624	    $cjk
625		? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
626		: defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
627		    ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
628		    : $der->($u);
629    }
630    else {
631	return map _varCE($vbl, $_), $der->($u);
632    }
633}
634
635
636##
637## string sortkey = getSortKey(string arg)
638##
639sub getSortKey
640{
641    my $self = shift;
642    my $lev  = $self->{level};
643    my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
644    my $ver9 = $self->{UCA_Version} >= 9;
645    my $v2i  = $ver9 && $self->{variable} ne 'non-ignorable';
646
647    my @buf; # weight arrays
648    if ($self->{hangul_terminator}) {
649	my $preHST = '';
650	foreach my $jcps (@$rEnt) {
651	    # weird things like VL, TL-contraction are not considered!
652	    my $curHST = '';
653	    foreach my $u (split /;/, $jcps) {
654		$curHST .= getHST($u);
655	    }
656	    if ($preHST && !$curHST || # hangul before non-hangul
657		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
658		$preHST =~ /V\z/ && $curHST =~ /^L/ ||
659		$preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
660
661		push @buf, $self->getWtHangulTerm();
662	    }
663	    $preHST = $curHST;
664
665	    push @buf, $self->getWt($jcps);
666	}
667	$preHST # end at hangul
668	    and push @buf, $self->getWtHangulTerm();
669    }
670    else {
671	foreach my $jcps (@$rEnt) {
672	    push @buf, $self->getWt($jcps);
673	}
674    }
675
676    # make sort key
677    my @ret = ([],[],[],[]);
678    my $last_is_variable;
679
680    foreach my $vwt (@buf) {
681	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
682	if ($v2i) {
683	    if ($var) {
684		$last_is_variable = TRUE;
685	    }
686	    elsif (!$wt[0]) { # ignorable
687		next if $last_is_variable;
688	    }
689	    else {
690		$last_is_variable = FALSE;
691	    }
692	}
693	foreach my $v (0..$lev-1) {
694	    0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
695	}
696    }
697
698    # modification of tertiary weights
699    if ($self->{upper_before_lower}) {
700	foreach (@{ $ret[2] }) {
701	    if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
702	    elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
703	    elsif ($_ == 0x1C)             { $_ += 1 } # square upper
704	    elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
705	}
706    }
707    if ($self->{katakana_before_hiragana}) {
708	foreach (@{ $ret[2] }) {
709	    if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
710	    elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
711	}
712    }
713
714    if ($self->{backwardsFlag}) {
715	for (my $v = MinLevel; $v <= MaxLevel; $v++) {
716	    if ($self->{backwardsFlag} & (1 << $v)) {
717		@{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
718	    }
719	}
720    }
721
722    join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
723}
724
725
726##
727## int compare = cmp(string a, string b)
728##
729sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
730sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
731sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
732sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
733sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
734sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
735sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
736
737##
738## list[strings] sorted = sort(list[strings] arg)
739##
740sub sort {
741    my $obj = shift;
742    return
743	map { $_->[1] }
744	    sort{ $a->[0] cmp $b->[0] }
745		map [ $obj->getSortKey($_), $_ ], @_;
746}
747
748
749sub _derivCE_9 {
750    my $u = shift;
751    my $base =
752	(CJK_UidIni  <= $u && $u <= CJK_UidFin)
753	    ? 0xFB40 : # CJK
754	(CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
755	 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
756	    ? 0xFB80   # CJK ext.
757	    : 0xFBC0;  # others
758
759    my $aaaa = $base + ($u >> 15);
760    my $bbbb = ($u & 0x7FFF) | 0x8000;
761    return
762	pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
763	pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
764}
765
766sub _derivCE_8 {
767    my $code = shift;
768    my $aaaa =  0xFF80 + ($code >> 15);
769    my $bbbb = ($code & 0x7FFF) | 0x8000;
770    return
771	pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
772	pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
773}
774
775
776sub getWtHangulTerm {
777    my $self = shift;
778    return _varCE($self->{variable},
779	pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
780}
781
782
783##
784## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
785##
786sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
787
788#
789# $code *must* be in Hangul syllable.
790# Check it before you enter here.
791#
792sub _decompHangul {
793    my $code = shift;
794    my $SIndex = $code - Hangul_SBase;
795    my $LIndex = int( $SIndex / Hangul_NCount);
796    my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
797    my $TIndex =      $SIndex % Hangul_TCount;
798    return (
799	Hangul_LBase + $LIndex,
800	Hangul_VBase + $VIndex,
801	$TIndex ? (Hangul_TBase + $TIndex) : (),
802    );
803}
804
805sub _isIllegal {
806    my $code = shift;
807    return ! defined $code                      # removed
808	|| ($code < 0 || 0x10FFFF < $code)      # out of range
809	|| (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
810	|| (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
811	|| (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
812    ;
813}
814
815# Hangul Syllable Type
816sub getHST {
817    my $u = shift;
818    return
819	Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
820	Hangul_VIni <= $u && $u <= Hangul_VFin	     ? "V" :
821	Hangul_TIni <= $u && $u <= Hangul_TFin	     ? "T" :
822	Hangul_SIni <= $u && $u <= Hangul_SFin ?
823	    ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
824}
825
826
827##
828## bool _nonIgnorAtLevel(arrayref weights, int level)
829##
830sub _nonIgnorAtLevel($$)
831{
832    my $wt = shift;
833    return if ! defined $wt;
834    my $lv = shift;
835    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
836}
837
838##
839## bool _eqArray(
840##    arrayref of arrayref[weights] source,
841##    arrayref of arrayref[weights] substr,
842##    int level)
843## * comparison of graphemes vs graphemes.
844##   @$source >= @$substr must be true (check it before call this);
845##
846sub _eqArray($$$)
847{
848    my $source = shift;
849    my $substr = shift;
850    my $lev = shift;
851
852    for my $g (0..@$substr-1){
853	# Do the $g'th graphemes have the same number of AV weigths?
854	return if @{ $source->[$g] } != @{ $substr->[$g] };
855
856	for my $w (0..@{ $substr->[$g] }-1) {
857	    for my $v (0..$lev-1) {
858		return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
859	    }
860	}
861    }
862    return 1;
863}
864
865##
866## (int position, int length)
867## int position = index(string, substring, position, [undoc'ed grobal])
868##
869## With "grobal" (only for the list context),
870##  returns list of arrayref[position, length].
871##
872sub index
873{
874    my $self = shift;
875    my $str  = shift;
876    my $len  = length($str);
877    my $subE = $self->splitEnt(shift);
878    my $pos  = @_ ? shift : 0;
879       $pos  = 0 if $pos < 0;
880    my $grob = shift;
881
882    my $lev  = $self->{level};
883    my $ver9 = $self->{UCA_Version} >= 9;
884    my $v2i  = $self->{variable} ne 'non-ignorable';
885
886    if (! @$subE) {
887	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
888	return $grob
889	    ? map([$_, 0], $temp..$len)
890	    : wantarray ? ($temp,0) : $temp;
891    }
892    $len < $pos
893	and return wantarray ? () : NOMATCHPOS;
894    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
895    @$strE
896	or return wantarray ? () : NOMATCHPOS;
897
898    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
899
900    my $last_is_variable;
901    for my $vwt (map $self->getWt($_), @$subE) {
902	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
903	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
904
905	if ($v2i && $ver9) {
906	    if ($var) {
907		$last_is_variable = TRUE;
908	    }
909	    elsif (!$wt[0]) { # ignorable
910		$to_be_pushed = FALSE if $last_is_variable;
911	    }
912	    else {
913		$last_is_variable = FALSE;
914	    }
915	}
916
917	if (@subWt && !$var && !$wt[0]) {
918	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
919	} else {
920	    push @subWt, [ \@wt ];
921	}
922    }
923
924    my $count = 0;
925    my $end = @$strE - 1;
926
927    $last_is_variable = FALSE; # reuse
928    for (my $i = 0; $i <= $end; ) { # no $i++
929	my $found_base = 0;
930
931	# fetch a grapheme
932	while ($i <= $end && $found_base == 0) {
933	    for my $vwt ($self->getWt($strE->[$i][0])) {
934		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
935		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
936
937		if ($v2i && $ver9) {
938		    if ($var) {
939			$last_is_variable = TRUE;
940		    }
941		    elsif (!$wt[0]) { # ignorable
942			$to_be_pushed = FALSE if $last_is_variable;
943		    }
944		    else {
945			$last_is_variable = FALSE;
946		    }
947		}
948
949		if (@strWt && !$var && !$wt[0]) {
950		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
951		    $finPos[-1] = $strE->[$i][2];
952		} elsif ($to_be_pushed) {
953		    push @strWt, [ \@wt ];
954		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
955		    $finPos[-1] = NOMATCHPOS if $found_base;
956		    push @finPos, $strE->[$i][2];
957		    $found_base++;
958		}
959		# else ===> no-op
960	    }
961	    $i++;
962	}
963
964	# try to match
965	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
966	    if ($iniPos[0] != NOMATCHPOS &&
967		    $finPos[$#subWt] != NOMATCHPOS &&
968			_eqArray(\@strWt, \@subWt, $lev)) {
969		my $temp = $iniPos[0] + $pos;
970
971		if ($grob) {
972		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
973		    splice @strWt,  0, $#subWt;
974		    splice @iniPos, 0, $#subWt;
975		    splice @finPos, 0, $#subWt;
976		}
977		else {
978		    return wantarray
979			? ($temp, $finPos[$#subWt] - $iniPos[0])
980			:  $temp;
981		}
982	    }
983	    shift @strWt;
984	    shift @iniPos;
985	    shift @finPos;
986	}
987    }
988
989    return $grob
990	? @g_ret
991	: wantarray ? () : NOMATCHPOS;
992}
993
994##
995## scalarref to matching part = match(string, substring)
996##
997sub match
998{
999    my $self = shift;
1000    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1001	my $temp = substr($_[0], $pos, $len);
1002	return wantarray ? $temp : \$temp;
1003	# An lvalue ref \substr should be avoided,
1004	# since its value is affected by modification of its referent.
1005    }
1006    else {
1007	return;
1008    }
1009}
1010
1011##
1012## arrayref matching parts = gmatch(string, substring)
1013##
1014sub gmatch
1015{
1016    my $self = shift;
1017    my $str  = shift;
1018    my $sub  = shift;
1019    return map substr($str, $_->[0], $_->[1]),
1020		$self->index($str, $sub, 0, 'g');
1021}
1022
1023##
1024## bool subst'ed = subst(string, substring, replace)
1025##
1026sub subst
1027{
1028    my $self = shift;
1029    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1030
1031    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1032	if ($code) {
1033	    my $mat = substr($_[0], $pos, $len);
1034	    substr($_[0], $pos, $len, $code->($mat));
1035	} else {
1036	    substr($_[0], $pos, $len, $_[2]);
1037	}
1038	return TRUE;
1039    }
1040    else {
1041	return FALSE;
1042    }
1043}
1044
1045##
1046## int count = gsubst(string, substring, replace)
1047##
1048sub gsubst
1049{
1050    my $self = shift;
1051    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1052    my $cnt = 0;
1053
1054    # Replacement is carried out from the end, then use reverse.
1055    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1056	if ($code) {
1057	    my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1058	    substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1059	} else {
1060	    substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1061	}
1062	$cnt++;
1063    }
1064    return $cnt;
1065}
1066
10671;
1068__END__
1069
1070=head1 NAME
1071
1072Unicode::Collate - Unicode Collation Algorithm
1073
1074=head1 SYNOPSIS
1075
1076  use Unicode::Collate;
1077
1078  #construct
1079  $Collator = Unicode::Collate->new(%tailoring);
1080
1081  #sort
1082  @sorted = $Collator->sort(@not_sorted);
1083
1084  #compare
1085  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1086
1087  # If %tailoring is false (i.e. empty),
1088  # $Collator should do the default collation.
1089
1090=head1 DESCRIPTION
1091
1092This module is an implementation
1093of Unicode Technical Standard #10 (UTS #10)
1094"Unicode Collation Algorithm."
1095
1096=head2 Constructor and Tailoring
1097
1098The C<new> method returns a collator object.
1099
1100   $Collator = Unicode::Collate->new(
1101      UCA_Version => $UCA_Version,
1102      alternate => $alternate, # deprecated: use of 'variable' is recommended.
1103      backwards => $levelNumber, # or \@levelNumbers
1104      entry => $element,
1105      hangul_terminator => $term_primary_weight,
1106      ignoreName => qr/$ignoreName/,
1107      ignoreChar => qr/$ignoreChar/,
1108      katakana_before_hiragana => $bool,
1109      level => $collationLevel,
1110      normalization  => $normalization_form,
1111      overrideCJK => \&overrideCJK,
1112      overrideHangul => \&overrideHangul,
1113      preprocess => \&preprocess,
1114      rearrange => \@charList,
1115      table => $filename,
1116      undefName => qr/$undefName/,
1117      undefChar => qr/$undefChar/,
1118      upper_before_lower => $bool,
1119      variable => $variable,
1120   );
1121
1122=over 4
1123
1124=item UCA_Version
1125
1126If the tracking version number of the older UCA is given,
1127the older behavior of that tracking version is emulated on collating.
1128If omitted, the return value of C<UCA_Version()> is used.
1129
1130The supported tracking version: 8, 9, or 11.
1131
1132B<This parameter may be removed in the future version,
1133as switching the algorithm would affect the performance.>
1134
1135=item backwards
1136
1137-- see 3.1.2 French Accents, UTS #10.
1138
1139     backwards => $levelNumber or \@levelNumbers
1140
1141Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1142If omitted, forwards at all the levels.
1143
1144=item entry
1145
1146-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1147
1148If the same character (or a sequence of characters) exists
1149in the collation element table through C<table>,
1150mapping to collation elements is overrided.
1151If it does not exist, the mapping is defined additionally.
1152
1153    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11540063 0068 ; [.0E6A.0020.0002.0063] # ch
11550043 0068 ; [.0E6A.0020.0007.0043] # Ch
11560043 0048 ; [.0E6A.0020.0008.0043] # CH
1157006C 006C ; [.0F4C.0020.0002.006C] # ll
1158004C 006C ; [.0F4C.0020.0007.004C] # Ll
1159004C 004C ; [.0F4C.0020.0008.004C] # LL
1160006E 0303 ; [.0F7B.0020.0002.006E] # n-tilde
1161004E 0303 ; [.0F7B.0020.0008.004E] # N-tilde
1162ENTRY
1163
1164    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
116500E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
116600C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1167ENTRY
1168
1169B<NOTE:> The code point in the UCA file format (before C<';'>)
1170B<must> be a Unicode code point (defined as hexadecimal),
1171but not a native code point.
1172So C<0063> must always denote C<U+0063>,
1173but not a character of C<"\x63">.
1174
1175Weighting may vary depending on collation element table.
1176So ensure the weights defined in C<entry> will be consistent with
1177those in the collation element table loaded via C<table>.
1178
1179In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1180and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1181(as a value between C<0E60> and C<0E6D>)
1182makes ordering as C<C E<lt> CH E<lt> D>.
1183Exactly speaking DUCET already has some characters between C<C> and C<D>:
1184C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1185C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1186and C<c-curl> (C<U+0255>) with C<0E69>.
1187Then primary weight C<0E6A> for C<CH> makes C<CH>
1188ordered between C<c-curl> and C<D>.
1189
1190=item hangul_terminator
1191
1192-- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1193
1194If a true value is given (non-zero but should be positive),
1195it will be added as a terminator primary weight to the end of
1196every standard Hangul syllable. Secondary and any higher weights
1197for terminator are set to zero.
1198If the value is false or C<hangul_terminator> key does not exist,
1199insertion of terminator weights will not be performed.
1200
1201Boundaries of Hangul syllables are determined
1202according to conjoining Jamo behavior in F<the Unicode Standard>
1203and F<HangulSyllableType.txt>.
1204
1205B<Implementation Note:>
1206(1) For expansion mapping (Unicode character mapped
1207to a sequence of collation elements), a terminator will not be added
1208between collation elements, even if Hangul syllable boundary exists there.
1209Addition of terminator is restricted to the next position
1210to the last collation element.
1211
1212(2) Non-conjoining Hangul letters
1213(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1214automatically terminated with a terminator primary weight.
1215These characters may need terminator included in a collation element
1216table beforehand.
1217
1218=item ignoreName
1219
1220=item ignoreChar
1221
1222-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1223
1224Makes the entry in the table completely ignorable;
1225i.e. as if the weights were zero at all level.
1226
1227E.g. when 'a' and 'e' are ignorable,
1228'element' is equal to 'lament' (or 'lmnt').
1229
1230=item level
1231
1232-- see 4.3 Form a sort key for each string, UTS #10.
1233
1234Set the maximum level.
1235Any higher levels than the specified one are ignored.
1236
1237  Level 1: alphabetic ordering
1238  Level 2: diacritic ordering
1239  Level 3: case ordering
1240  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1241
1242  ex.level => 2,
1243
1244If omitted, the maximum is the 4th.
1245
1246=item normalization
1247
1248-- see 4.1 Normalize each input string, UTS #10.
1249
1250If specified, strings are normalized before preparation of sort keys
1251(the normalization is executed after preprocess).
1252
1253A form name C<Unicode::Normalize::normalize()> accepts will be applied
1254as C<$normalization_form>.
1255Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1256See C<Unicode::Normalize::normalize()> for detail.
1257If omitted, C<'NFD'> is used.
1258
1259C<normalization> is performed after C<preprocess> (if defined).
1260
1261Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1262though they are not concerned with C<Unicode::Normalize::normalize()>.
1263
1264If C<undef> (not a string C<"undef">) is passed explicitly
1265as the value for this key,
1266any normalization is not carried out (this may make tailoring easier
1267if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1268only contiguous contractions are resolved;
1269e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1270C<A-cedilla-ring> would be primary equal to C<A>.
1271In this point,
1272C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1273B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1274
1275In the case of C<(normalization =E<gt> "prenormalized")>,
1276any normalization is not performed, but
1277non-contiguous contractions with combining characters are performed.
1278Therefore
1279C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1280B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1281If source strings are finely prenormalized,
1282C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1283
1284Except C<(normalization =E<gt> undef)>,
1285B<Unicode::Normalize> is required (see also B<CAVEAT>).
1286
1287=item overrideCJK
1288
1289-- see 7.1 Derived Collation Elements, UTS #10.
1290
1291By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1292(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>]  are lesser than
1293C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1294C<U+20000> to C<U+2A6D6>].
1295
1296Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1297
1298ex. CJK Unified Ideographs in the JIS code point order.
1299
1300  overrideCJK => sub {
1301      my $u = shift;             # get a Unicode codepoint
1302      my $b = pack('n', $u);     # to UTF-16BE
1303      my $s = your_unicode_to_sjis_converter($b); # convert
1304      my $n = unpack('n', $s);   # convert sjis to short
1305      [ $n, 0x20, 0x2, $u ];     # return the collation element
1306  },
1307
1308ex. ignores all CJK Unified Ideographs.
1309
1310  overrideCJK => sub {()}, # CODEREF returning empty list
1311
1312   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1313   # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1314
1315If C<undef> is passed explicitly as the value for this key,
1316weights for CJK Unified Ideographs are treated as undefined.
1317But assignment of weight for CJK Unified Ideographs
1318in table or C<entry> is still valid.
1319
1320=item overrideHangul
1321
1322-- see 7.1 Derived Collation Elements, UTS #10.
1323
1324By default, Hangul Syllables are decomposed into Hangul Jamo,
1325even if C<(normalization =E<gt> undef)>.
1326But the mapping of Hangul Syllables may be overrided.
1327
1328This tag works like C<overrideCJK>, so see there for examples.
1329
1330If you want to override the mapping of Hangul Syllables,
1331NFD, NFKD, and FCD are not appropriate,
1332since they will decompose Hangul Syllables before overriding.
1333
1334If C<undef> is passed explicitly as the value for this key,
1335weight for Hangul Syllables is treated as undefined
1336without decomposition into Hangul Jamo.
1337But definition of weight for Hangul Syllables
1338in table or C<entry> is still valid.
1339
1340=item preprocess
1341
1342-- see 5.1 Preprocessing, UTS #10.
1343
1344If specified, the coderef is used to preprocess
1345before the formation of sort keys.
1346
1347ex. dropping English articles, such as "a" or "the".
1348Then, "the pen" is before "a pencil".
1349
1350     preprocess => sub {
1351           my $str = shift;
1352           $str =~ s/\b(?:an?|the)\s+//gi;
1353           return $str;
1354        },
1355
1356C<preprocess> is performed before C<normalization> (if defined).
1357
1358=item rearrange
1359
1360-- see 3.1.3 Rearrangement, UTS #10.
1361
1362Characters that are not coded in logical order and to be rearranged.
1363By default,
1364
1365    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1366
1367If you want to disallow any rearrangement,
1368pass C<undef> or C<[]> (a reference to an empty list)
1369as the value for this key.
1370
1371B<According to the version 9 of UCA, this parameter shall not be used;
1372but it is not warned at present.>
1373
1374=item table
1375
1376-- see 3.2 Default Unicode Collation Element Table, UTS #10.
1377
1378You can use another collation element table if desired.
1379The table file must be put into a directory
1380where F<Unicode/Collate.pm> is installed; e.g. into
1381F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>.
1382
1383By default, the filename F<allkeys.txt> is used.
1384
1385If C<undef> is passed explicitly as the value for this key,
1386no file is read (but you can define collation elements via C<entry>).
1387
1388A typical way to define a collation element table
1389without any file of table:
1390
1391   $onlyABC = Unicode::Collate->new(
1392       table => undef,
1393       entry => << 'ENTRIES',
13940061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
13950041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
13960062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
13970042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
13980063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
13990043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1400ENTRIES
1401    );
1402
1403=item undefName
1404
1405=item undefChar
1406
1407-- see 6.3.4 Reducing the Repertoire, UTS #10.
1408
1409Undefines the collation element as if it were unassigned in the table.
1410This reduces the size of the table.
1411If an unassigned character appears in the string to be collated,
1412the sort key is made from its codepoint
1413as a single-character collation element,
1414as it is greater than any other assigned collation elements
1415(in the codepoint order among the unassigned characters).
1416But, it'd be better to ignore characters
1417unfamiliar to you and maybe never used.
1418
1419=item katakana_before_hiragana
1420
1421=item upper_before_lower
1422
1423-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1424
1425By default, lowercase is before uppercase
1426and hiragana is before katakana.
1427
1428If the tag is made true, this is reversed.
1429
1430B<NOTE>: These tags simplemindedly assume
1431any lowercase/uppercase or hiragana/katakana distinctions
1432must occur in level 3, and their weights at level 3
1433must be same as those mentioned in 7.3.1, UTS #10.
1434If you define your collation elements which violate this requirement,
1435these tags don't work validly.
1436
1437=item variable
1438
1439=item alternate
1440
1441-- see 3.2.2 Variable Weighting, UTS #10.
1442
1443(the title in UCA version 8: Alternate Weighting)
1444
1445This key allows to variable weighting for variable collation elements,
1446which are marked with an ASTERISK in the table
1447(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1448
1449   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1450
1451These names are case-insensitive.
1452By default (if specification is omitted), 'shifted' is adopted.
1453
1454   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1455                    considered at the 4th level.
1456
1457   'Non-Ignorable'  Variable elements are not reset to ignorable.
1458
1459   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1460                    their level 4 weight is replaced by the old level 1 weight.
1461                    Level 4 weight for Non-Variable elements is 0xFFFF.
1462
1463   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1464                    are trimmed.
1465
1466For backward compatibility, C<alternate> can be used as an alias
1467for C<variable>.
1468
1469=back
1470
1471=head2 Methods for Collation
1472
1473=over 4
1474
1475=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1476
1477Sorts a list of strings.
1478
1479=item C<$result = $Collator-E<gt>cmp($a, $b)>
1480
1481Returns 1 (when C<$a> is greater than C<$b>)
1482or 0 (when C<$a> is equal to C<$b>)
1483or -1 (when C<$a> is lesser than C<$b>).
1484
1485=item C<$result = $Collator-E<gt>eq($a, $b)>
1486
1487=item C<$result = $Collator-E<gt>ne($a, $b)>
1488
1489=item C<$result = $Collator-E<gt>lt($a, $b)>
1490
1491=item C<$result = $Collator-E<gt>le($a, $b)>
1492
1493=item C<$result = $Collator-E<gt>gt($a, $b)>
1494
1495=item C<$result = $Collator-E<gt>ge($a, $b)>
1496
1497They works like the same name operators as theirs.
1498
1499   eq : whether $a is equal to $b.
1500   ne : whether $a is not equal to $b.
1501   lt : whether $a is lesser than $b.
1502   le : whether $a is lesser than $b or equal to $b.
1503   gt : whether $a is greater than $b.
1504   ge : whether $a is greater than $b or equal to $b.
1505
1506=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1507
1508-- see 4.3 Form a sort key for each string, UTS #10.
1509
1510Returns a sort key.
1511
1512You compare the sort keys using a binary comparison
1513and get the result of the comparison of the strings using UCA.
1514
1515   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1516
1517      is equivalent to
1518
1519   $Collator->cmp($a, $b)
1520
1521=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1522
1523   use Unicode::Collate;
1524   my $c = Unicode::Collate->new();
1525   print $c->viewSortKey("Perl"),"\n";
1526
1527   # output:
1528   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1529   #  Level 1               Level 2               Level 3               Level 4
1530
1531    (If C<UCA_Version> is 8, the output is slightly different.)
1532
1533=back
1534
1535=head2 Methods for Searching
1536
1537B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1538for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1539C<subst>, C<gsubst>) is croaked,
1540as the position and the length might differ
1541from those on the specified string.
1542(And C<rearrange> and C<hangul_terminator> tags are neglected.)
1543
1544The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1545like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1546but they are not aware of any pattern, but only a literal substring.
1547
1548=over 4
1549
1550=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1551
1552=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1553
1554If C<$substring> matches a part of C<$string>, returns
1555the position of the first occurrence of the matching part in scalar context;
1556in list context, returns a two-element list of
1557the position and the length of the matching part.
1558
1559If C<$substring> does not match any part of C<$string>,
1560returns C<-1> in scalar context and
1561an empty list in list context.
1562
1563e.g. you say
1564
1565  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1566                                     # (normalization => undef) is REQUIRED.
1567  my $str = "Ich mu� studieren Perl.";
1568  my $sub = "M�SS";
1569  my $match;
1570  if (my($pos,$len) = $Collator->index($str, $sub)) {
1571      $match = substr($str, $pos, $len);
1572  }
1573
1574and get C<"mu�"> in C<$match> since C<"mu�">
1575is primary equal to C<"M�SS">.
1576
1577=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1578
1579=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1580
1581If C<$substring> matches a part of C<$string>, in scalar context, returns
1582B<a reference to> the first occurrence of the matching part
1583(C<$match_ref> is always true if matches,
1584since every reference is B<true>);
1585in list context, returns the first occurrence of the matching part.
1586
1587If C<$substring> does not match any part of C<$string>,
1588returns C<undef> in scalar context and
1589an empty list in list context.
1590
1591e.g.
1592
1593    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1594	print "matches [$$match_ref].\n";
1595    } else {
1596	print "doesn't match.\n";
1597    }
1598
1599     or
1600
1601    if (($match) = $Collator->match($str, $sub)) { # list context
1602	print "matches [$match].\n";
1603    } else {
1604	print "doesn't match.\n";
1605    }
1606
1607=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1608
1609If C<$substring> matches a part of C<$string>, returns
1610all the matching parts (or matching count in scalar context).
1611
1612If C<$substring> does not match any part of C<$string>,
1613returns an empty list.
1614
1615=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1616
1617If C<$substring> matches a part of C<$string>,
1618the first occurrence of the matching part is replaced by C<$replacement>
1619(C<$string> is modified) and return C<$count> (always equals to C<1>).
1620
1621C<$replacement> can be a C<CODEREF>,
1622taking the matching part as an argument,
1623and returning a string to replace the matching part
1624(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1625
1626=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1627
1628If C<$substring> matches a part of C<$string>,
1629all the occurrences of the matching part is replaced by C<$replacement>
1630(C<$string> is modified) and return C<$count>.
1631
1632C<$replacement> can be a C<CODEREF>,
1633taking the matching part as an argument,
1634and returning a string to replace the matching part
1635(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1636
1637e.g.
1638
1639  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1640                                     # (normalization => undef) is REQUIRED.
1641  my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1642  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1643
1644  # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1645  # i.e., all the camels are made bold-faced.
1646
1647=back
1648
1649=head2 Other Methods
1650
1651=over 4
1652
1653=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1654
1655Change the value of specified keys and returns the changed part.
1656
1657    $Collator = Unicode::Collate->new(level => 4);
1658
1659    $Collator->eq("perl", "PERL"); # false
1660
1661    %old = $Collator->change(level => 2); # returns (level => 4).
1662
1663    $Collator->eq("perl", "PERL"); # true
1664
1665    $Collator->change(%old); # returns (level => 2).
1666
1667    $Collator->eq("perl", "PERL"); # false
1668
1669Not all C<(key,value)>s are allowed to be changed.
1670See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1671
1672In the scalar context, returns the modified collator
1673(but it is B<not> a clone from the original).
1674
1675    $Collator->change(level => 2)->eq("perl", "PERL"); # true
1676
1677    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1678
1679    $Collator->change(level => 4)->eq("perl", "PERL"); # false
1680
1681=item C<$version = $Collator-E<gt>version()>
1682
1683Returns the version number (a string) of the Unicode Standard
1684which the C<table> file used by the collator object is based on.
1685If the table does not include a version line (starting with C<@version>),
1686returns C<"unknown">.
1687
1688=item C<UCA_Version()>
1689
1690Returns the tracking version number of UTS #10 this module consults.
1691
1692=item C<Base_Unicode_Version()>
1693
1694Returns the version number of UTS #10 this module consults.
1695
1696=back
1697
1698=head2 EXPORT
1699
1700None by default.
1701
1702=head2 CAVEAT
1703
1704Use of the C<normalization> parameter requires
1705the B<Unicode::Normalize> module.
1706
1707If you need not it (say, in the case when you need not
1708handle any combining characters),
1709assign C<normalization =E<gt> undef> explicitly.
1710
1711-- see 6.5 Avoiding Normalization, UTS #10.
1712
1713=head2 Conformance Test
1714
1715The Conformance Test for the UCA is available
1716under L<http://www.unicode.org/Public/UCA/>.
1717
1718For F<CollationTest_SHIFTED.txt>,
1719a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1720for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1721C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1722
1723B<Unicode::Normalize is required to try The Conformance Test.>
1724
1725=head1 AUTHOR
1726
1727SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1728
1729  http://homepage1.nifty.com/nomenclator/perl/
1730
1731  Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
1732
1733  This library is free software; you can redistribute it
1734  and/or modify it under the same terms as Perl itself.
1735
1736=head1 SEE ALSO
1737
1738=over 4
1739
1740=item Unicode Collation Algorithm - UTS #10
1741
1742L<http://www.unicode.org/reports/tr10/>
1743
1744=item The Default Unicode Collation Element Table (DUCET)
1745
1746L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1747
1748=item The conformance test for the UCA
1749
1750L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1751
1752L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1753
1754=item Hangul Syllable Type
1755
1756L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1757
1758=item Unicode Normalization Forms - UAX #15
1759
1760L<http://www.unicode.org/reports/tr15/>
1761
1762=item L<Unicode::Normalize>
1763
1764=back
1765
1766=cut
1767