xref: /openbsd-src/gnu/usr.bin/perl/dist/Unicode-Normalize/Normalize.pm (revision f2a19305cfc49ea4d1a5feb55cd6c283c6f1e031)
package Unicode::Normalize;

use 5.006;
use strict;
use warnings;
use Carp;

no warnings 'utf8';

our $VERSION = '1.32';
our $PACKAGE = __PACKAGE__;

# The four normalization functions are exported by default.
our @EXPORT = qw(NFC NFD NFKC NFKD);

# Everything else has to be requested by name or through a tag.
our @EXPORT_OK = (
    qw(normalize decompose reorder compose),
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
    qw(FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter),
    qw(normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial),
);

# Export tags, usable as e.g. "use Unicode::Normalize qw(:all);".
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
    fast      => [ qw(FCD checkFCD FCC checkFCC composeContiguous) ],
);
295759b3d2Safresh1
305759b3d2Safresh1##
315759b3d2Safresh1## utilities for tests
325759b3d2Safresh1##
335759b3d2Safresh1
# Code point translators between Unicode and the platform's native
# character set.  When $::IS_ASCII is true, or on perls older than 5.8
# (no EBCDIC support on early perls), both are plain pass-throughs;
# otherwise they defer to the utf8:: conversion functions.
# NOTE(review): $::IS_ASCII (i.e. $main::IS_ASCII) is not assigned anywhere
# in the visible part of this file -- presumably the caller sets it; confirm.
if ($::IS_ASCII || $] < 5.008) {
    *to_native   = sub { return $_[0] };
    *from_native = sub { return $_[0] };
}
else {
    *to_native   = sub { return utf8::unicode_to_native($_[0]) };
    *from_native = sub { return utf8::native_to_unicode($_[0]) };
}
42256a93a4Safresh1
# The .t files express every code point in Unicode terms, so each one is
# translated to the native code point before being packed.
# Takes a list of code points; returns the string built by pack('U*', ...).
sub dot_t_pack_U {
    my @native_points = map { to_native($_) } @_;
    return pack('U*', @native_points);
}
475759b3d2Safresh1
# Inverse of dot_t_pack_U: takes one string and returns the list of its
# code points, each translated back from native to Unicode.
sub dot_t_unpack_U {

    # pack('U*') with no arguments yields an empty UTF-8 string; appending it
    # forces the argument into UTF-8.  This keeps the helper working on
    # Perl 5.6, which has no utf8::upgrade().
    my $as_utf8 = shift(@_) . pack('U*');

    my @native_points = unpack('U*', $as_utf8);
    return map { from_native($_) } @native_points;
}
55256a93a4Safresh1
# Render a string for the test diagnostics printed by ok().
#
#   $s - the string to render
#
# Returns $s unchanged when every byte of it is a printable ASCII character
# (this includes the empty string).  Otherwise returns every byte written as
# \xHH, separated by single spaces.
#
# "use bytes" makes the match, the split and ord() below operate on bytes.
sub get_printable_string ($) {
    use bytes;
    my $s = shift;

    # The class matches any byte that is non-ASCII or non-printable.  The
    # string is safe to show verbatim only if no such byte occurs; testing
    # for "contains at least one printable byte" would let mixed strings
    # such as "a\x01" through unescaped.
    return $s unless $s =~ /[[:^ascii:][:^print:]]/;

    return join " ", map { sprintf "\\x%02x", ord $_ } split "", $s;
}
65256a93a4Safresh1
# Minimal TAP-style assertion for the test scripts.
#
#   $count_ref - reference to the running test number; incremented on
#                every call
#   $got       - value under test; used as a plain truth value when no
#                expected value is passed
#   $expected  - optional; when passed, the test succeeds if both values
#                are undef, or both are defined and string-equal
#
# Prints "ok N" or "not ok N" on STDOUT.  On failure it also writes the
# location to STDERR and, when the expected value is defined, the got and
# expected strings.
sub ok ($$;$) {
    my ($count_ref, $got, @rest) = @_;

    my $passed = $got;
    my $expected;
    if (@rest) {
        $expected = $rest[0];
        if (!defined $expected) {
            $passed = !defined $got;
        }
        elsif (!defined $got) {
            $passed = 0;
        }
        else {
            $passed = $got eq $expected;
        }
    }

    my $verdict = $passed ? "ok" : "not ok";
    print $verdict, ' ', ++$$count_ref, "\n";

    return if $passed;

    # caller(1) is the frame one level above whoever called ok() directly.
    # NOTE(review): presumably the .t files reach ok() through a thin
    # wrapper sub, so this reports the test's own line -- confirm.
    my (undef, $file, $line) = caller(1);
    print STDERR "# Failed test $$count_ref at $file line $line\n";

    return unless defined $expected;

    print STDERR "#      got ", get_printable_string($got), "\n";
    print STDERR "# expected ", get_printable_string($expected), "\n";
}
875759b3d2Safresh1
require Exporter;

##### The above part is common to XS and PP #####

# XS flavour: make Exporter a parent class and load the compiled code
# for this module at the version declared above.
use XSLoader ();
our @ISA = ('Exporter');
XSLoader::load( 'Unicode::Normalize', $VERSION );
955759b3d2Safresh1
965759b3d2Safresh1##### The below part is common to XS and PP #####
975759b3d2Safresh1
985759b3d2Safresh1##
995759b3d2Safresh1## normalize
1005759b3d2Safresh1##
1015759b3d2Safresh1
# Return the argument unchanged when checkFCD() accepts it; otherwise
# return its NFD form.
sub FCD ($) {
    my ($input) = @_;

    return $input if checkFCD($input);
    return NFD($input);
}
1065759b3d2Safresh1
# Dispatch table used by normalize(): form name => normalization function.
# Each of the four NF* forms is reachable under its full name and under
# the short name without the "NF" prefix.
our %formNorm = (
    # full names
    NFC  => \&NFC,
    NFD  => \&NFD,
    NFKC => \&NFKC,
    NFKD => \&NFKD,

    # short names
    C  => \&NFC,
    D  => \&NFD,
    KC => \&NFKC,
    KD => \&NFKD,

    # "Fast" forms
    FCD => \&FCD,
    FCC => \&FCC,
);
1145759b3d2Safresh1
# Normalize $str according to the form named by $form.
#   $form - a key of %formNorm
#   $str  - the string to normalize
# Returns whatever the selected function returns; croaks on an unknown
# form name.
sub normalize($$)
{
    my ($form, $str) = @_;

    croak($PACKAGE."::normalize: invalid form name: $form")
        unless exists $formNorm{$form};

    my $normalizer = $formNorm{$form};
    return $normalizer->($str);
}
1245759b3d2Safresh1
1255759b3d2Safresh1##
1265759b3d2Safresh1## partial
1275759b3d2Safresh1##
1285759b3d2Safresh1
# Normalize the second argument in the form named by the first, split the
# result with splitOnLastStarter(), and return the first part.
# Side effect: the caller's second argument is overwritten (through the
# $_[1] alias) with the second part.
# Croaks on a form name that is not a key of %formNorm.
sub normalize_partial ($$) {
    croak($PACKAGE."::normalize_partial: invalid form name: $_[0]")
        unless exists $formNorm{$_[0]};

    my $normalized = normalize($_[0], $_[1]);
    my ($processed, $unprocessed) = splitOnLastStarter($normalized);

    # Assign through @_ so the change is visible to the caller.
    $_[1] = $unprocessed;
    return $processed;
}
1385759b3d2Safresh1
# Fixed-form front ends for normalize_partial().  Each one hands $_[0]
# itself (not a copy) to normalize_partial(), so the caller's variable
# receives the unprocessed remainder.
sub NFD_partial ($) {
    return normalize_partial('NFD', $_[0]);
}

sub NFC_partial ($) {
    return normalize_partial('NFC', $_[0]);
}

sub NFKD_partial($) {
    return normalize_partial('NFKD',$_[0]);
}

sub NFKC_partial($) {
    return normalize_partial('NFKC',$_[0]);
}
1435759b3d2Safresh1
1445759b3d2Safresh1##
1455759b3d2Safresh1## check
1465759b3d2Safresh1##
1475759b3d2Safresh1
# Dispatch table used by check(): form name => checking function.
# Each of the four NF* forms is reachable under its full name and under
# the short name without the "NF" prefix.
our %formCheck = (
    # full names
    NFC  => \&checkNFC,
    NFD  => \&checkNFD,
    NFKC => \&checkNFKC,
    NFKD => \&checkNFKD,

    # short names
    C  => \&checkNFC,
    D  => \&checkNFD,
    KC => \&checkNFKC,
    KD => \&checkNFKD,

    # "Fast" forms
    FCD => \&checkFCD,
    FCC => \&checkFCC,
);
1555759b3d2Safresh1
# Run the check for the form named by $form on $str.
#   $form - a key of %formCheck
#   $str  - the string to examine
# Returns whatever the selected checking function returns; croaks on an
# unknown form name.
sub check($$)
{
    my ($form, $str) = @_;

    croak($PACKAGE."::check: invalid form name: $form")
        unless exists $formCheck{$form};

    my $checker = $formCheck{$form};
    return $checker->($str);
}
1655759b3d2Safresh1
1665759b3d2Safresh11;
1675759b3d2Safresh1__END__
1685759b3d2Safresh1
1695759b3d2Safresh1=head1 NAME
1705759b3d2Safresh1
1715759b3d2Safresh1Unicode::Normalize - Unicode Normalization Forms
1725759b3d2Safresh1
1735759b3d2Safresh1=head1 SYNOPSIS
1745759b3d2Safresh1
1755759b3d2Safresh1(1) using function names exported by default:
1765759b3d2Safresh1
1775759b3d2Safresh1  use Unicode::Normalize;
1785759b3d2Safresh1
1795759b3d2Safresh1  $NFD_string  = NFD($string);  # Normalization Form D
1805759b3d2Safresh1  $NFC_string  = NFC($string);  # Normalization Form C
1815759b3d2Safresh1  $NFKD_string = NFKD($string); # Normalization Form KD
1825759b3d2Safresh1  $NFKC_string = NFKC($string); # Normalization Form KC
1835759b3d2Safresh1
1845759b3d2Safresh1(2) using function names exported on request:
1855759b3d2Safresh1
1865759b3d2Safresh1  use Unicode::Normalize 'normalize';
1875759b3d2Safresh1
1885759b3d2Safresh1  $NFD_string  = normalize('D',  $string);  # Normalization Form D
1895759b3d2Safresh1  $NFC_string  = normalize('C',  $string);  # Normalization Form C
1905759b3d2Safresh1  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
1915759b3d2Safresh1  $NFKC_string = normalize('KC', $string);  # Normalization Form KC
1925759b3d2Safresh1
1935759b3d2Safresh1=head1 DESCRIPTION
1945759b3d2Safresh1
1955759b3d2Safresh1Parameters:
1965759b3d2Safresh1
1975759b3d2Safresh1C<$string> is used as a string under character semantics (see L<perlunicode>).
1985759b3d2Safresh1
1995759b3d2Safresh1C<$code_point> should be an unsigned integer representing a Unicode code point.
2005759b3d2Safresh1
2015759b3d2Safresh1Note: Between XSUB and pure Perl, there is an incompatibility
2025759b3d2Safresh1about the interpretation of C<$code_point> as a decimal number.
2035759b3d2Safresh1XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
Do not use a floating-point number or a negative sign in C<$code_point>.
2055759b3d2Safresh1
2065759b3d2Safresh1=head2 Normalization Forms
2075759b3d2Safresh1
2085759b3d2Safresh1=over 4
2095759b3d2Safresh1
2105759b3d2Safresh1=item C<$NFD_string = NFD($string)>
2115759b3d2Safresh1
2125759b3d2Safresh1It returns the Normalization Form D (formed by canonical decomposition).
2135759b3d2Safresh1
2145759b3d2Safresh1=item C<$NFC_string = NFC($string)>
2155759b3d2Safresh1
2165759b3d2Safresh1It returns the Normalization Form C (formed by canonical decomposition
2175759b3d2Safresh1followed by canonical composition).
2185759b3d2Safresh1
2195759b3d2Safresh1=item C<$NFKD_string = NFKD($string)>
2205759b3d2Safresh1
2215759b3d2Safresh1It returns the Normalization Form KD (formed by compatibility decomposition).
2225759b3d2Safresh1
2235759b3d2Safresh1=item C<$NFKC_string = NFKC($string)>
2245759b3d2Safresh1
2255759b3d2Safresh1It returns the Normalization Form KC (formed by compatibility decomposition
2265759b3d2Safresh1followed by B<canonical> composition).
2275759b3d2Safresh1
2285759b3d2Safresh1=item C<$FCD_string = FCD($string)>
2295759b3d2Safresh1
2305759b3d2Safresh1If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
2315759b3d2Safresh1it returns the string without modification; otherwise it returns an FCD string.
2325759b3d2Safresh1
Note: FCD is not always unique, so several different forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.
2355759b3d2Safresh1
2365759b3d2Safresh1=item C<$FCC_string = FCC($string)>
2375759b3d2Safresh1
2385759b3d2Safresh1It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
2395759b3d2Safresh1
Note: FCC is unique, as are the four normalization forms (NF*).
2415759b3d2Safresh1
2425759b3d2Safresh1=item C<$normalized_string = normalize($form_name, $string)>
2435759b3d2Safresh1
2445759b3d2Safresh1It returns the normalization form of C<$form_name>.
2455759b3d2Safresh1
2465759b3d2Safresh1As C<$form_name>, one of the following names must be given.
2475759b3d2Safresh1
2485759b3d2Safresh1  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
2495759b3d2Safresh1  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
2505759b3d2Safresh1  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
2515759b3d2Safresh1  'KD' or 'NFKD' for Normalization Form KD (UAX #15)
2525759b3d2Safresh1
2535759b3d2Safresh1  'FCD'          for "Fast C or D" Form  (UTN #5)
2545759b3d2Safresh1  'FCC'          for "Fast C Contiguous" (UTN #5)
2555759b3d2Safresh1
2565759b3d2Safresh1=back
2575759b3d2Safresh1
2585759b3d2Safresh1=head2 Decomposition and Composition
2595759b3d2Safresh1
2605759b3d2Safresh1=over 4
2615759b3d2Safresh1
2625759b3d2Safresh1=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
2635759b3d2Safresh1
2645759b3d2Safresh1It returns the concatenation of the decomposition of each character
2655759b3d2Safresh1in the string.
2665759b3d2Safresh1
2675759b3d2Safresh1If the second parameter (a boolean) is omitted or false,
2685759b3d2Safresh1the decomposition is canonical decomposition;
2695759b3d2Safresh1if the second parameter (a boolean) is true,
2705759b3d2Safresh1the decomposition is compatibility decomposition.
2715759b3d2Safresh1
2725759b3d2Safresh1The string returned is not always in NFD/NFKD. Reordering may be required.
2735759b3d2Safresh1
2745759b3d2Safresh1 $NFD_string  = reorder(decompose($string));       # eq. to NFD()
2755759b3d2Safresh1 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
2765759b3d2Safresh1
2775759b3d2Safresh1=item C<$reordered_string = reorder($string)>
2785759b3d2Safresh1
2795759b3d2Safresh1It returns the result of reordering the combining characters
2805759b3d2Safresh1according to Canonical Ordering Behavior.
2815759b3d2Safresh1
2825759b3d2Safresh1For example, when you have a list of NFD/NFKD strings,
2835759b3d2Safresh1you can get the concatenated NFD/NFKD string from them, by saying
2845759b3d2Safresh1
2855759b3d2Safresh1    $concat_NFD  = reorder(join '', @NFD_strings);
2865759b3d2Safresh1    $concat_NFKD = reorder(join '', @NFKD_strings);
2875759b3d2Safresh1
2885759b3d2Safresh1=item C<$composed_string = compose($string)>
2895759b3d2Safresh1
2905759b3d2Safresh1It returns the result of canonical composition
2915759b3d2Safresh1without applying any decomposition.
2925759b3d2Safresh1
2935759b3d2Safresh1For example, when you have a NFD/NFKD string,
2945759b3d2Safresh1you can get its NFC/NFKC string, by saying
2955759b3d2Safresh1
2965759b3d2Safresh1    $NFC_string  = compose($NFD_string);
2975759b3d2Safresh1    $NFKC_string = compose($NFKD_string);
2985759b3d2Safresh1
2995759b3d2Safresh1=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
3005759b3d2Safresh1
It returns two strings: the first one, C<$processed>, is the part
before the last starter, and the second one, C<$unprocessed>, is
the remaining part, from the last starter to the end. A starter is
a character having a combining class of zero (see UAX #15).
3055759b3d2Safresh1
3065759b3d2Safresh1Note that C<$processed> may be empty (when C<$normalized> contains no
3075759b3d2Safresh1starter or starts with the last starter), and then C<$unprocessed>
3085759b3d2Safresh1should be equal to the entire C<$normalized>.
3095759b3d2Safresh1
3105759b3d2Safresh1When you have a C<$normalized> string and an C<$unnormalized> string
3115759b3d2Safresh1following it, a simple concatenation is wrong:
3125759b3d2Safresh1
3135759b3d2Safresh1 $concat = $normalized . normalize($form, $unnormalized); # wrong!
3145759b3d2Safresh1
Instead, do it like this:
3165759b3d2Safresh1
3175759b3d2Safresh1 ($processed, $unprocessed) = splitOnLastStarter($normalized);
3185759b3d2Safresh1 $concat = $processed . normalize($form,$unprocessed.$unnormalized);
3195759b3d2Safresh1
3205759b3d2Safresh1C<splitOnLastStarter()> should be called with a pre-normalized parameter
3215759b3d2Safresh1C<$normalized>, that is in the same form as C<$form> you want.
3225759b3d2Safresh1
3235759b3d2Safresh1If you have an array of C<@string> that should be concatenated and then
3245759b3d2Safresh1normalized, you can do like this:
3255759b3d2Safresh1
3265759b3d2Safresh1    my $result = "";
3275759b3d2Safresh1    my $unproc = "";
3285759b3d2Safresh1    foreach my $str (@string) {
3295759b3d2Safresh1        $unproc .= $str;
3305759b3d2Safresh1        my $n = normalize($form, $unproc);
3315759b3d2Safresh1        my($p, $u) = splitOnLastStarter($n);
3325759b3d2Safresh1        $result .= $p;
3335759b3d2Safresh1        $unproc  = $u;
3345759b3d2Safresh1    }
3355759b3d2Safresh1    $result .= $unproc;
3365759b3d2Safresh1    # instead of normalize($form, join('', @string))
3375759b3d2Safresh1
3385759b3d2Safresh1=item C<$processed = normalize_partial($form, $unprocessed)>
3395759b3d2Safresh1
3405759b3d2Safresh1A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
3415759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect.
3425759b3d2Safresh1
3435759b3d2Safresh1If you have an array of C<@string> that should be concatenated and then
3445759b3d2Safresh1normalized, you can do like this:
3455759b3d2Safresh1
3465759b3d2Safresh1    my $result = "";
3475759b3d2Safresh1    my $unproc = "";
3485759b3d2Safresh1    foreach my $str (@string) {
3495759b3d2Safresh1        $unproc .= $str;
3505759b3d2Safresh1        $result .= normalize_partial($form, $unproc);
3515759b3d2Safresh1    }
3525759b3d2Safresh1    $result .= $unproc;
3535759b3d2Safresh1    # instead of normalize($form, join('', @string))
3545759b3d2Safresh1
3555759b3d2Safresh1=item C<$processed = NFD_partial($unprocessed)>
3565759b3d2Safresh1
It is equivalent to C<normalize_partial('NFD', $unprocessed)>.
3585759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect.
3595759b3d2Safresh1
3605759b3d2Safresh1=item C<$processed = NFC_partial($unprocessed)>
3615759b3d2Safresh1
It is equivalent to C<normalize_partial('NFC', $unprocessed)>.
3635759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect.
3645759b3d2Safresh1
3655759b3d2Safresh1=item C<$processed = NFKD_partial($unprocessed)>
3665759b3d2Safresh1
It is equivalent to C<normalize_partial('NFKD', $unprocessed)>.
3685759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect.
3695759b3d2Safresh1
3705759b3d2Safresh1=item C<$processed = NFKC_partial($unprocessed)>
3715759b3d2Safresh1
It is equivalent to C<normalize_partial('NFKC', $unprocessed)>.
3735759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect.
3745759b3d2Safresh1
3755759b3d2Safresh1=back
3765759b3d2Safresh1
3775759b3d2Safresh1=head2 Quick Check
3785759b3d2Safresh1
379*f2a19305Safresh1(see Annex 8, UAX #15; and F<lib/unicore/DerivedNormalizationProps.txt>)
3805759b3d2Safresh1
3815759b3d2Safresh1The following functions check whether the string is in that normalization form.
3825759b3d2Safresh1
3835759b3d2Safresh1The result returned will be one of the following:
3845759b3d2Safresh1
3855759b3d2Safresh1    YES     The string is in that normalization form.
3865759b3d2Safresh1    NO      The string is not in that normalization form.
3875759b3d2Safresh1    MAYBE   Dubious. Maybe yes, maybe no.
3885759b3d2Safresh1
3895759b3d2Safresh1=over 4
3905759b3d2Safresh1
3915759b3d2Safresh1=item C<$result = checkNFD($string)>
3925759b3d2Safresh1
3935759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
3945759b3d2Safresh1
3955759b3d2Safresh1=item C<$result = checkNFC($string)>
3965759b3d2Safresh1
3975759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
3985759b3d2Safresh1C<undef> if C<MAYBE>.
3995759b3d2Safresh1
4005759b3d2Safresh1=item C<$result = checkNFKD($string)>
4015759b3d2Safresh1
4025759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
4035759b3d2Safresh1
4045759b3d2Safresh1=item C<$result = checkNFKC($string)>
4055759b3d2Safresh1
4065759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
4075759b3d2Safresh1C<undef> if C<MAYBE>.
4085759b3d2Safresh1
4095759b3d2Safresh1=item C<$result = checkFCD($string)>
4105759b3d2Safresh1
4115759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
4125759b3d2Safresh1
4135759b3d2Safresh1=item C<$result = checkFCC($string)>
4145759b3d2Safresh1
4155759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
4165759b3d2Safresh1C<undef> if C<MAYBE>.
4175759b3d2Safresh1
4185759b3d2Safresh1Note: If a string is not in FCD, it must not be in FCC.
4195759b3d2Safresh1So C<checkFCC($not_FCD_string)> should return C<NO>.
4205759b3d2Safresh1
4215759b3d2Safresh1=item C<$result = check($form_name, $string)>
4225759b3d2Safresh1
4235759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
4245759b3d2Safresh1C<undef> if C<MAYBE>.
4255759b3d2Safresh1
4265759b3d2Safresh1As C<$form_name>, one of the following names must be given.
4275759b3d2Safresh1
4285759b3d2Safresh1  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
4295759b3d2Safresh1  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
4305759b3d2Safresh1  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
4315759b3d2Safresh1  'KD' or 'NFKD' for Normalization Form KD (UAX #15)
4325759b3d2Safresh1
4335759b3d2Safresh1  'FCD'          for "Fast C or D" Form  (UTN #5)
4345759b3d2Safresh1  'FCC'          for "Fast C Contiguous" (UTN #5)
4355759b3d2Safresh1
4365759b3d2Safresh1=back
4375759b3d2Safresh1
4385759b3d2Safresh1B<Note>
4395759b3d2Safresh1
4405759b3d2Safresh1In the cases of NFD, NFKD, and FCD, the answer must be
4415759b3d2Safresh1either C<YES> or C<NO>. The answer C<MAYBE> may be returned
4425759b3d2Safresh1in the cases of NFC, NFKC, and FCC.
4435759b3d2Safresh1
4445759b3d2Safresh1A C<MAYBE> string should contain at least one combining character
4455759b3d2Safresh1or the like. For example, C<COMBINING ACUTE ACCENT> has
4465759b3d2Safresh1the MAYBE_NFC/MAYBE_NFKC property.
4475759b3d2Safresh1
4485759b3d2Safresh1Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
4495759b3d2Safresh1and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
4505759b3d2Safresh1C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
4515759b3d2Safresh1(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
4525759b3d2Safresh1while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
4535759b3d2Safresh1
4545759b3d2Safresh1If you want to check exactly, compare the string with its NFC/NFKC/FCC.
4555759b3d2Safresh1
4565759b3d2Safresh1    if ($string eq NFC($string)) {
4575759b3d2Safresh1        # $string is exactly normalized in NFC;
4585759b3d2Safresh1    } else {
4595759b3d2Safresh1        # $string is not normalized in NFC;
4605759b3d2Safresh1    }
4615759b3d2Safresh1
4625759b3d2Safresh1    if ($string eq NFKC($string)) {
4635759b3d2Safresh1        # $string is exactly normalized in NFKC;
4645759b3d2Safresh1    } else {
4655759b3d2Safresh1        # $string is not normalized in NFKC;
4665759b3d2Safresh1    }
4675759b3d2Safresh1
4685759b3d2Safresh1=head2 Character Data
4695759b3d2Safresh1
These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.
4735759b3d2Safresh1
4745759b3d2Safresh1=over 4
4755759b3d2Safresh1
4765759b3d2Safresh1=item C<$canonical_decomposition = getCanon($code_point)>
4775759b3d2Safresh1
4785759b3d2Safresh1If the character is canonically decomposable (including Hangul Syllables),
4795759b3d2Safresh1it returns the (full) canonical decomposition as a string.
4805759b3d2Safresh1Otherwise it returns C<undef>.
4815759b3d2Safresh1
4825759b3d2Safresh1B<Note:> According to the Unicode standard, the canonical decomposition
4835759b3d2Safresh1of the character that is not canonically decomposable is same as
4845759b3d2Safresh1the character itself.
4855759b3d2Safresh1
4865759b3d2Safresh1=item C<$compatibility_decomposition = getCompat($code_point)>
4875759b3d2Safresh1
4885759b3d2Safresh1If the character is compatibility decomposable (including Hangul Syllables),
4895759b3d2Safresh1it returns the (full) compatibility decomposition as a string.
4905759b3d2Safresh1Otherwise it returns C<undef>.
4915759b3d2Safresh1
4925759b3d2Safresh1B<Note:> According to the Unicode standard, the compatibility decomposition
4935759b3d2Safresh1of the character that is not compatibility decomposable is same as
4945759b3d2Safresh1the character itself.
4955759b3d2Safresh1
4965759b3d2Safresh1=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
4975759b3d2Safresh1
4985759b3d2Safresh1If two characters here and next (as code points) are composable
4995759b3d2Safresh1(including Hangul Jamo/Syllables and Composition Exclusions),
5005759b3d2Safresh1it returns the code point of the composite.
5015759b3d2Safresh1
5025759b3d2Safresh1If they are not composable, it returns C<undef>.
5035759b3d2Safresh1
5045759b3d2Safresh1=item C<$combining_class = getCombinClass($code_point)>
5055759b3d2Safresh1
5065759b3d2Safresh1It returns the combining class (as an integer) of the character.
5075759b3d2Safresh1
5085759b3d2Safresh1=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
5095759b3d2Safresh1
5105759b3d2Safresh1It returns a boolean whether the character of the specified codepoint
5115759b3d2Safresh1may be composed with the previous one in a certain composition
5125759b3d2Safresh1(including Hangul Compositions, but excluding
5135759b3d2Safresh1Composition Exclusions and Non-Starter Decompositions).
5145759b3d2Safresh1
5155759b3d2Safresh1=item C<$is_exclusion = isExclusion($code_point)>
5165759b3d2Safresh1
5175759b3d2Safresh1It returns a boolean whether the code point is a composition exclusion.
5185759b3d2Safresh1
5195759b3d2Safresh1=item C<$is_singleton = isSingleton($code_point)>
5205759b3d2Safresh1
It returns a boolean whether the code point is a singleton.
5225759b3d2Safresh1
5235759b3d2Safresh1=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
5245759b3d2Safresh1
5255759b3d2Safresh1It returns a boolean whether the code point has Non-Starter Decomposition.
5265759b3d2Safresh1
5275759b3d2Safresh1=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
5285759b3d2Safresh1
5295759b3d2Safresh1It returns a boolean of the derived property Comp_Ex
5305759b3d2Safresh1(Full_Composition_Exclusion). This property is generated from
5315759b3d2Safresh1Composition Exclusions + Singletons + Non-Starter Decompositions.
5325759b3d2Safresh1
5335759b3d2Safresh1=item C<$NFD_is_NO = isNFD_NO($code_point)>
5345759b3d2Safresh1
5355759b3d2Safresh1It returns a boolean of the derived property NFD_NO
5365759b3d2Safresh1(NFD_Quick_Check=No).
5375759b3d2Safresh1
5385759b3d2Safresh1=item C<$NFC_is_NO = isNFC_NO($code_point)>
5395759b3d2Safresh1
5405759b3d2Safresh1It returns a boolean of the derived property NFC_NO
5415759b3d2Safresh1(NFC_Quick_Check=No).
5425759b3d2Safresh1
5435759b3d2Safresh1=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
5445759b3d2Safresh1
5455759b3d2Safresh1It returns a boolean of the derived property NFC_MAYBE
5465759b3d2Safresh1(NFC_Quick_Check=Maybe).
5475759b3d2Safresh1
5485759b3d2Safresh1=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
5495759b3d2Safresh1
5505759b3d2Safresh1It returns a boolean of the derived property NFKD_NO
5515759b3d2Safresh1(NFKD_Quick_Check=No).
5525759b3d2Safresh1
5535759b3d2Safresh1=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
5545759b3d2Safresh1
5555759b3d2Safresh1It returns a boolean of the derived property NFKC_NO
5565759b3d2Safresh1(NFKC_Quick_Check=No).
5575759b3d2Safresh1
5585759b3d2Safresh1=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
5595759b3d2Safresh1
5605759b3d2Safresh1It returns a boolean of the derived property NFKC_MAYBE
5615759b3d2Safresh1(NFKC_Quick_Check=Maybe).
5625759b3d2Safresh1
5635759b3d2Safresh1=back
5645759b3d2Safresh1
5655759b3d2Safresh1=head1 EXPORT
5665759b3d2Safresh1
5675759b3d2Safresh1C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
5685759b3d2Safresh1
C<normalize> and some other functions: on request.
5705759b3d2Safresh1
5715759b3d2Safresh1=head1 CAVEATS
5725759b3d2Safresh1
5735759b3d2Safresh1=over 4
5745759b3d2Safresh1
5755759b3d2Safresh1=item Perl's version vs. Unicode version
5765759b3d2Safresh1
5775759b3d2Safresh1Since this module refers to perl core's Unicode database in the directory
5785759b3d2Safresh1F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
5795759b3d2Safresh1normalization implemented by this module depends on what has been
5805759b3d2Safresh1compiled into your perl.  The following table lists the default Unicode
5815759b3d2Safresh1version that comes with various perl versions.  (It is possible to change
5825759b3d2Safresh1the Unicode version in any perl version to be any earlier Unicode version,
5835759b3d2Safresh1so one could cause Unicode 3.2 to be used in any perl version starting with
5.8.0.  Read F<C<$Config{privlib}>/unicore/README.perl> for details.)
5855759b3d2Safresh1
5865759b3d2Safresh1    perl's version     implemented Unicode version
5875759b3d2Safresh1       5.6.1              3.0.1
5885759b3d2Safresh1       5.7.2              3.1.0
5895759b3d2Safresh1       5.7.3              3.1.1 (normalization is same as 3.1.0)
5905759b3d2Safresh1       5.8.0              3.2.0
5915759b3d2Safresh1         5.8.1-5.8.3      4.0.0
5925759b3d2Safresh1         5.8.4-5.8.6      4.0.1 (normalization is same as 4.0.0)
5935759b3d2Safresh1         5.8.7-5.8.8      4.1.0
5945759b3d2Safresh1       5.10.0             5.0.0
5955759b3d2Safresh1        5.8.9, 5.10.1     5.1.0
5965759b3d2Safresh1       5.12.x             5.2.0
5975759b3d2Safresh1       5.14.x             6.0.0
5985759b3d2Safresh1       5.16.x             6.1.0
5995759b3d2Safresh1       5.18.x             6.2.0
6005759b3d2Safresh1       5.20.x             6.3.0
6015759b3d2Safresh1       5.22.x             7.0.0
6025759b3d2Safresh1
6035759b3d2Safresh1=item Correction of decomposition mapping
6045759b3d2Safresh1
6055759b3d2Safresh1In older Unicode versions, a small number of characters (all of which are
6065759b3d2Safresh1CJK compatibility ideographs as far as they have been found) may have
607*f2a19305Safresh1an erroneous decomposition mapping (see
608*f2a19305Safresh1F<lib/unicore/NormalizationCorrections.txt>).
609*f2a19305Safresh1Anyhow, this module will neither refer to
610*f2a19305Safresh1F<lib/unicore/NormalizationCorrections.txt>
6115759b3d2Safresh1nor provide any specific version of normalization. Therefore this module
6125759b3d2Safresh1running on an older perl with an older Unicode database may use
6135759b3d2Safresh1the erroneous decomposition mapping blindly conforming to the Unicode database.
6145759b3d2Safresh1
6155759b3d2Safresh1=item Revised definition of canonical composition
6165759b3d2Safresh1
6175759b3d2Safresh1In Unicode 4.1.0, the definition D2 of canonical composition (which
6185759b3d2Safresh1affects NFC and NFKC) has been changed (see Public Review Issue #29
6195759b3d2Safresh1and recent UAX #15). This module has used the newer definition
6205759b3d2Safresh1since the version 0.07 (Oct 31, 2001).
6215759b3d2Safresh1This module will not support the normalization according to the older
6225759b3d2Safresh1definition, even if the Unicode version implemented by perl is
6235759b3d2Safresh1lower than 4.1.0.
6245759b3d2Safresh1
6255759b3d2Safresh1=back
6265759b3d2Safresh1
6275759b3d2Safresh1=head1 AUTHOR
6285759b3d2Safresh1
6295759b3d2Safresh1SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
6305759b3d2Safresh1
6315759b3d2Safresh1Currently maintained by <perl5-porters@perl.org>
6325759b3d2Safresh1
6335759b3d2Safresh1Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
6345759b3d2Safresh1
6355759b3d2Safresh1=head1 LICENSE
6365759b3d2Safresh1
6375759b3d2Safresh1This module is free software; you can redistribute it
6385759b3d2Safresh1and/or modify it under the same terms as Perl itself.
6395759b3d2Safresh1
6405759b3d2Safresh1=head1 SEE ALSO
6415759b3d2Safresh1
6425759b3d2Safresh1=over 4
6435759b3d2Safresh1
644de8cc8edSafresh1=item L<http://www.unicode.org/reports/tr15/>
6455759b3d2Safresh1
6465759b3d2Safresh1Unicode Normalization Forms - UAX #15
6475759b3d2Safresh1
648de8cc8edSafresh1=item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt>
6495759b3d2Safresh1
6505759b3d2Safresh1Composition Exclusion Table
6515759b3d2Safresh1
652de8cc8edSafresh1=item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt>
6535759b3d2Safresh1
6545759b3d2Safresh1Derived Normalization Properties
6555759b3d2Safresh1
656de8cc8edSafresh1=item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt>
6575759b3d2Safresh1
6585759b3d2Safresh1Normalization Corrections
6595759b3d2Safresh1
660de8cc8edSafresh1=item L<http://www.unicode.org/review/pr-29.html>
6615759b3d2Safresh1
6625759b3d2Safresh1Public Review Issue #29: Normalization Issue
6635759b3d2Safresh1
664de8cc8edSafresh1=item L<http://www.unicode.org/notes/tn5/>
6655759b3d2Safresh1
6665759b3d2Safresh1Canonical Equivalence in Applications - UTN #5
6675759b3d2Safresh1
6685759b3d2Safresh1=back
6695759b3d2Safresh1
6705759b3d2Safresh1=cut
671