package Unicode::Normalize;

use 5.006;
use strict;
use warnings;
use Carp;

no warnings 'utf8';

our $VERSION = '1.32';
our $PACKAGE = __PACKAGE__;

our @EXPORT = qw( NFC NFD NFKC NFKD );
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
    normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
);
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);

##
## utilities for tests
##

# No EBCDIC support on early perls
#
# to_native / from_native translate a single code point between Unicode
# and the platform's native numbering.  When $::IS_ASCII is true, or on a
# perl older than 5.8, both are the identity function.
*to_native = ($::IS_ASCII || $] < 5.008)
             ? sub { return shift }
             : sub { utf8::unicode_to_native(shift) };

*from_native = ($::IS_ASCII || $] < 5.008)
             ? sub { return shift }
             : sub { utf8::native_to_unicode(shift) };

# The .t files are all in terms of Unicode, so xlate to/from native

# dot_t_pack_U(@code_points)
#   Returns the string built by pack('U*') from the given Unicode code
#   points, each first passed through to_native().
sub dot_t_pack_U {
    return pack('U*', map { to_native($_) } @_);
}

# dot_t_unpack_U($string)
#   Returns the list of code points of $string, each passed through
#   from_native().
sub dot_t_unpack_U {

    # The empty pack returns an empty UTF-8 string, so the effect is to force
    # the shifted parameter into being UTF-8.  This allows this to work on
    # Perl 5.6, where there is no utf8::upgrade().
    return map { from_native($_) } unpack('U*', shift(@_).pack('U*'));
}

# get_printable_string($string)
#   Returns $string unchanged when every byte of it is an ASCII printable
#   character (this includes the empty string); otherwise returns its bytes
#   as space-separated "\xNN" escapes, so that the result is always safe to
#   print in a diagnostic.
sub get_printable_string ($) {
    use bytes;
    my $s = shift;

    # By DeMorgan's laws the negated class below means "ASCII and printable".
    # The pattern is anchored at both ends so that the WHOLE string has to
    # qualify: an unanchored match would succeed as soon as a single
    # printable character is present and let raw control or non-ASCII bytes
    # through.
    return $s if $s =~ /\A[^[:^ascii:][:^print:]]*\z/;

    return join " ", map { sprintf "\\x%02x", ord $_ } split "", $s;
}

# ok(\$count, $got [, $expected])
#   Minimal test reporter.  Increments the caller's counter through the
#   reference and prints "ok N" or "not ok N" to STDOUT.
#     - With two arguments the test passes when $got is true.
#     - With three arguments it passes when $got and $expected are both
#       undefined, or both defined and string-equal.
#   On failure the location (taken from one frame above the immediate
#   caller) is printed to STDERR, followed by the got/expected values when
#   an expectation was supplied.
sub ok ($$;$) {
    my $count_ref = shift;  # Test number in caller
    my $p = my $r = shift;

    # Remember whether an expectation was passed at all: an expected value
    # of undef is legitimate and must still produce diagnostics on failure.
    my $has_expected = @_ ? 1 : 0;
    my $x;
    if ($has_expected) {
        $x = shift;
        $p = !defined $x ? !defined $r :
             !defined $r ? 0 : $r eq $x;
    }

    print $p ? "ok" : "not ok", ' ', ++$$count_ref, "\n";

    return if $p;

    my (undef, $file, $line) = caller(1);
    print STDERR "# Failed test $$count_ref at $file line $line\n";

    return unless $has_expected;

    # Undefined values are shown as the word "undef" instead of being fed
    # to get_printable_string(), which would warn about an uninitialized
    # value.
    print STDERR "# got ",
        (defined $r ? get_printable_string($r) : 'undef'), "\n";
    print STDERR "# expected ",
        (defined $x ? get_printable_string($x) : 'undef'), "\n";
}

require Exporter;

##### The above part is common to XS and PP #####

our @ISA = qw(Exporter);
use XSLoader ();
XSLoader::load( 'Unicode::Normalize', $VERSION );

##### The below part is common to XS and PP #####

##
## normalize
##

# FCD($string)
#   Returns $string itself when checkFCD() reports true for it, otherwise
#   NFD($string).  checkFCD() and NFD() are not defined in this file
#   (presumably supplied by the object loaded through XSLoader above).
sub FCD ($) {
    my $str = shift;
    return checkFCD($str) ? $str : NFD($str);
}

# Dispatch table: form name (long or short) => normalization function.
our %formNorm = (
    NFC  => \&NFC,	C  => \&NFC,
    NFD  => \&NFD,	D  => \&NFD,
    NFKC => \&NFKC,	KC => \&NFKC,
    NFKD => \&NFKD,	KD => \&NFKD,
    FCD  => \&FCD,	FCC => \&FCC,
);

# normalize($form_name, $string)
#   Returns $string normalized by the function registered in %formNorm
#   under $form_name; croaks on an unknown form name.
sub normalize($$)
{
    my $form = shift;
    my $str = shift;
    if (exists $formNorm{$form}) {
	return $formNorm{$form}->($str);
    }
    croak($PACKAGE."::normalize: invalid form name: $form");
}

##
## partial
##

# normalize_partial($form_name, $unprocessed)
#   Normalizes $unprocessed, splits the result with splitOnLastStarter(),
#   returns the first part and stores the second part back into the
#   caller's variable.  The write goes through $_[1] on purpose: @_ aliases
#   the caller's arguments, so the second argument must be an lvalue.
#   Croaks on an unknown form name.
sub normalize_partial ($$) {
    if (exists $formNorm{$_[0]}) {
	my $n = normalize($_[0], $_[1]);
	my($p, $u) = splitOnLastStarter($n);
	$_[1] = $u;
	return $p;
    }
    croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
}

# Fixed-form wrappers around normalize_partial(); each one modifies its
# argument in place in the same way.
sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }

##
## check
##

# Dispatch table: form name (long or short) => quick-check function.
our %formCheck = (
    NFC  => \&checkNFC, 	C  => \&checkNFC,
    NFD  => \&checkNFD, 	D  => \&checkNFD,
    NFKC => \&checkNFKC,	KC => \&checkNFKC,
    NFKD => \&checkNFKD,	KD => \&checkNFKD,
    FCD  => \&checkFCD, 	FCC => \&checkFCC,
);

# check($form_name, $string)
#   Returns the result of the quick-check function registered in
#   %formCheck under $form_name; croaks on an unknown form name.
sub check($$)
{
    my $form = shift;
    my $str = shift;
    if (exists $formCheck{$form}) {
	return $formCheck{$form}->($str);
    }
    croak($PACKAGE."::check: invalid form name: $form");
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

(1) using function names exported by default:

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

(2) using function names exported on request:

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

Parameters:

C<$string> is used as a string under character semantics (see L<perlunicode>).

C<$code_point> should be an unsigned integer representing a Unicode code point.

Note: Between XSUB and pure Perl, there is an incompatibility
about the interpretation of C<$code_point> as a decimal number.
XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
Do not use a floating-point value or a negative sign in C<$code_point>.

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

It returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

It returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

It returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

It returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$FCD_string = FCD($string)>

If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
it returns the string without modification; otherwise it returns an FCD string.

Note: The FCD form of a string is not always unique; several different
FCD strings may be equivalent to each other. C<FCD()> will return one of
these equivalent forms.
2355759b3d2Safresh1 2365759b3d2Safresh1=item C<$FCC_string = FCC($string)> 2375759b3d2Safresh1 2385759b3d2Safresh1It returns the FCC form ("Fast C Contiguous"; cf. UTN #5). 2395759b3d2Safresh1 2405759b3d2Safresh1Note: FCC is unique, as well as four normalization forms (NF*). 2415759b3d2Safresh1 2425759b3d2Safresh1=item C<$normalized_string = normalize($form_name, $string)> 2435759b3d2Safresh1 2445759b3d2Safresh1It returns the normalization form of C<$form_name>. 2455759b3d2Safresh1 2465759b3d2Safresh1As C<$form_name>, one of the following names must be given. 2475759b3d2Safresh1 2485759b3d2Safresh1 'C' or 'NFC' for Normalization Form C (UAX #15) 2495759b3d2Safresh1 'D' or 'NFD' for Normalization Form D (UAX #15) 2505759b3d2Safresh1 'KC' or 'NFKC' for Normalization Form KC (UAX #15) 2515759b3d2Safresh1 'KD' or 'NFKD' for Normalization Form KD (UAX #15) 2525759b3d2Safresh1 2535759b3d2Safresh1 'FCD' for "Fast C or D" Form (UTN #5) 2545759b3d2Safresh1 'FCC' for "Fast C Contiguous" (UTN #5) 2555759b3d2Safresh1 2565759b3d2Safresh1=back 2575759b3d2Safresh1 2585759b3d2Safresh1=head2 Decomposition and Composition 2595759b3d2Safresh1 2605759b3d2Safresh1=over 4 2615759b3d2Safresh1 2625759b3d2Safresh1=item C<$decomposed_string = decompose($string [, $useCompatMapping])> 2635759b3d2Safresh1 2645759b3d2Safresh1It returns the concatenation of the decomposition of each character 2655759b3d2Safresh1in the string. 2665759b3d2Safresh1 2675759b3d2Safresh1If the second parameter (a boolean) is omitted or false, 2685759b3d2Safresh1the decomposition is canonical decomposition; 2695759b3d2Safresh1if the second parameter (a boolean) is true, 2705759b3d2Safresh1the decomposition is compatibility decomposition. 2715759b3d2Safresh1 2725759b3d2Safresh1The string returned is not always in NFD/NFKD. Reordering may be required. 2735759b3d2Safresh1 2745759b3d2Safresh1 $NFD_string = reorder(decompose($string)); # eq. 
to NFD() 2755759b3d2Safresh1 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() 2765759b3d2Safresh1 2775759b3d2Safresh1=item C<$reordered_string = reorder($string)> 2785759b3d2Safresh1 2795759b3d2Safresh1It returns the result of reordering the combining characters 2805759b3d2Safresh1according to Canonical Ordering Behavior. 2815759b3d2Safresh1 2825759b3d2Safresh1For example, when you have a list of NFD/NFKD strings, 2835759b3d2Safresh1you can get the concatenated NFD/NFKD string from them, by saying 2845759b3d2Safresh1 2855759b3d2Safresh1 $concat_NFD = reorder(join '', @NFD_strings); 2865759b3d2Safresh1 $concat_NFKD = reorder(join '', @NFKD_strings); 2875759b3d2Safresh1 2885759b3d2Safresh1=item C<$composed_string = compose($string)> 2895759b3d2Safresh1 2905759b3d2Safresh1It returns the result of canonical composition 2915759b3d2Safresh1without applying any decomposition. 2925759b3d2Safresh1 2935759b3d2Safresh1For example, when you have a NFD/NFKD string, 2945759b3d2Safresh1you can get its NFC/NFKC string, by saying 2955759b3d2Safresh1 2965759b3d2Safresh1 $NFC_string = compose($NFD_string); 2975759b3d2Safresh1 $NFKC_string = compose($NFKD_string); 2985759b3d2Safresh1 2995759b3d2Safresh1=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)> 3005759b3d2Safresh1 3015759b3d2Safresh1It returns two strings: the first one, C<$processed>, is a part 3025759b3d2Safresh1before the last starter, and the second one, C<$unprocessed> is 3035759b3d2Safresh1another part after the first part. A starter is a character having 3045759b3d2Safresh1a combining class of zero (see UAX #15). 3055759b3d2Safresh1 3065759b3d2Safresh1Note that C<$processed> may be empty (when C<$normalized> contains no 3075759b3d2Safresh1starter or starts with the last starter), and then C<$unprocessed> 3085759b3d2Safresh1should be equal to the entire C<$normalized>. 
3095759b3d2Safresh1 3105759b3d2Safresh1When you have a C<$normalized> string and an C<$unnormalized> string 3115759b3d2Safresh1following it, a simple concatenation is wrong: 3125759b3d2Safresh1 3135759b3d2Safresh1 $concat = $normalized . normalize($form, $unnormalized); # wrong! 3145759b3d2Safresh1 3155759b3d2Safresh1Instead of it, do like this: 3165759b3d2Safresh1 3175759b3d2Safresh1 ($processed, $unprocessed) = splitOnLastStarter($normalized); 3185759b3d2Safresh1 $concat = $processed . normalize($form,$unprocessed.$unnormalized); 3195759b3d2Safresh1 3205759b3d2Safresh1C<splitOnLastStarter()> should be called with a pre-normalized parameter 3215759b3d2Safresh1C<$normalized>, that is in the same form as C<$form> you want. 3225759b3d2Safresh1 3235759b3d2Safresh1If you have an array of C<@string> that should be concatenated and then 3245759b3d2Safresh1normalized, you can do like this: 3255759b3d2Safresh1 3265759b3d2Safresh1 my $result = ""; 3275759b3d2Safresh1 my $unproc = ""; 3285759b3d2Safresh1 foreach my $str (@string) { 3295759b3d2Safresh1 $unproc .= $str; 3305759b3d2Safresh1 my $n = normalize($form, $unproc); 3315759b3d2Safresh1 my($p, $u) = splitOnLastStarter($n); 3325759b3d2Safresh1 $result .= $p; 3335759b3d2Safresh1 $unproc = $u; 3345759b3d2Safresh1 } 3355759b3d2Safresh1 $result .= $unproc; 3365759b3d2Safresh1 # instead of normalize($form, join('', @string)) 3375759b3d2Safresh1 3385759b3d2Safresh1=item C<$processed = normalize_partial($form, $unprocessed)> 3395759b3d2Safresh1 3405759b3d2Safresh1A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>. 3415759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect. 
3425759b3d2Safresh1 3435759b3d2Safresh1If you have an array of C<@string> that should be concatenated and then 3445759b3d2Safresh1normalized, you can do like this: 3455759b3d2Safresh1 3465759b3d2Safresh1 my $result = ""; 3475759b3d2Safresh1 my $unproc = ""; 3485759b3d2Safresh1 foreach my $str (@string) { 3495759b3d2Safresh1 $unproc .= $str; 3505759b3d2Safresh1 $result .= normalize_partial($form, $unproc); 3515759b3d2Safresh1 } 3525759b3d2Safresh1 $result .= $unproc; 3535759b3d2Safresh1 # instead of normalize($form, join('', @string)) 3545759b3d2Safresh1 3555759b3d2Safresh1=item C<$processed = NFD_partial($unprocessed)> 3565759b3d2Safresh1 3575759b3d2Safresh1It does like C<normalize_partial('NFD', $unprocessed)>. 3585759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect. 3595759b3d2Safresh1 3605759b3d2Safresh1=item C<$processed = NFC_partial($unprocessed)> 3615759b3d2Safresh1 3625759b3d2Safresh1It does like C<normalize_partial('NFC', $unprocessed)>. 3635759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect. 3645759b3d2Safresh1 3655759b3d2Safresh1=item C<$processed = NFKD_partial($unprocessed)> 3665759b3d2Safresh1 3675759b3d2Safresh1It does like C<normalize_partial('NFKD', $unprocessed)>. 3685759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect. 3695759b3d2Safresh1 3705759b3d2Safresh1=item C<$processed = NFKC_partial($unprocessed)> 3715759b3d2Safresh1 3725759b3d2Safresh1It does like C<normalize_partial('NFKC', $unprocessed)>. 3735759b3d2Safresh1Note that C<$unprocessed> will be modified as a side-effect. 3745759b3d2Safresh1 3755759b3d2Safresh1=back 3765759b3d2Safresh1 3775759b3d2Safresh1=head2 Quick Check 3785759b3d2Safresh1 379*f2a19305Safresh1(see Annex 8, UAX #15; and F<lib/unicore/DerivedNormalizationProps.txt>) 3805759b3d2Safresh1 3815759b3d2Safresh1The following functions check whether the string is in that normalization form. 
3825759b3d2Safresh1 3835759b3d2Safresh1The result returned will be one of the following: 3845759b3d2Safresh1 3855759b3d2Safresh1 YES The string is in that normalization form. 3865759b3d2Safresh1 NO The string is not in that normalization form. 3875759b3d2Safresh1 MAYBE Dubious. Maybe yes, maybe no. 3885759b3d2Safresh1 3895759b3d2Safresh1=over 4 3905759b3d2Safresh1 3915759b3d2Safresh1=item C<$result = checkNFD($string)> 3925759b3d2Safresh1 3935759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. 3945759b3d2Safresh1 3955759b3d2Safresh1=item C<$result = checkNFC($string)> 3965759b3d2Safresh1 3975759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; 3985759b3d2Safresh1C<undef> if C<MAYBE>. 3995759b3d2Safresh1 4005759b3d2Safresh1=item C<$result = checkNFKD($string)> 4015759b3d2Safresh1 4025759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. 4035759b3d2Safresh1 4045759b3d2Safresh1=item C<$result = checkNFKC($string)> 4055759b3d2Safresh1 4065759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; 4075759b3d2Safresh1C<undef> if C<MAYBE>. 4085759b3d2Safresh1 4095759b3d2Safresh1=item C<$result = checkFCD($string)> 4105759b3d2Safresh1 4115759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. 4125759b3d2Safresh1 4135759b3d2Safresh1=item C<$result = checkFCC($string)> 4145759b3d2Safresh1 4155759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; 4165759b3d2Safresh1C<undef> if C<MAYBE>. 4175759b3d2Safresh1 4185759b3d2Safresh1Note: If a string is not in FCD, it must not be in FCC. 4195759b3d2Safresh1So C<checkFCC($not_FCD_string)> should return C<NO>. 4205759b3d2Safresh1 4215759b3d2Safresh1=item C<$result = check($form_name, $string)> 4225759b3d2Safresh1 4235759b3d2Safresh1It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; 4245759b3d2Safresh1C<undef> if C<MAYBE>. 
4255759b3d2Safresh1 4265759b3d2Safresh1As C<$form_name>, one of the following names must be given. 4275759b3d2Safresh1 4285759b3d2Safresh1 'C' or 'NFC' for Normalization Form C (UAX #15) 4295759b3d2Safresh1 'D' or 'NFD' for Normalization Form D (UAX #15) 4305759b3d2Safresh1 'KC' or 'NFKC' for Normalization Form KC (UAX #15) 4315759b3d2Safresh1 'KD' or 'NFKD' for Normalization Form KD (UAX #15) 4325759b3d2Safresh1 4335759b3d2Safresh1 'FCD' for "Fast C or D" Form (UTN #5) 4345759b3d2Safresh1 'FCC' for "Fast C Contiguous" (UTN #5) 4355759b3d2Safresh1 4365759b3d2Safresh1=back 4375759b3d2Safresh1 4385759b3d2Safresh1B<Note> 4395759b3d2Safresh1 4405759b3d2Safresh1In the cases of NFD, NFKD, and FCD, the answer must be 4415759b3d2Safresh1either C<YES> or C<NO>. The answer C<MAYBE> may be returned 4425759b3d2Safresh1in the cases of NFC, NFKC, and FCC. 4435759b3d2Safresh1 4445759b3d2Safresh1A C<MAYBE> string should contain at least one combining character 4455759b3d2Safresh1or the like. For example, C<COMBINING ACUTE ACCENT> has 4465759b3d2Safresh1the MAYBE_NFC/MAYBE_NFKC property. 4475759b3d2Safresh1 4485759b3d2Safresh1Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> 4495759b3d2Safresh1and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. 4505759b3d2Safresh1C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC 4515759b3d2Safresh1(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), 4525759b3d2Safresh1while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. 4535759b3d2Safresh1 4545759b3d2Safresh1If you want to check exactly, compare the string with its NFC/NFKC/FCC. 

    if ($string eq NFC($string)) {
        # $string is exactly normalized in NFC;
    } else {
        # $string is not normalized in NFC;
    }

    if ($string eq NFKC($string)) {
        # $string is exactly normalized in NFKC;
    } else {
        # $string is not normalized in NFKC;
    }

=head2 Character Data

These functions are the interface to the character data used internally.
If you only want to get Unicode normalization forms, you do not need
to call them yourself.

=over 4

=item C<$canonical_decomposition = getCanon($code_point)>

If the character is canonically decomposable (including Hangul Syllables),
it returns the (full) canonical decomposition as a string.
Otherwise it returns C<undef>.

B<Note:> According to the Unicode standard, the canonical decomposition
of a character that is not canonically decomposable is the same as
the character itself.

=item C<$compatibility_decomposition = getCompat($code_point)>

If the character is compatibility decomposable (including Hangul Syllables),
it returns the (full) compatibility decomposition as a string.
Otherwise it returns C<undef>.

B<Note:> According to the Unicode standard, the compatibility decomposition
of a character that is not compatibility decomposable is the same as
the character itself.

=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>

If two characters here and next (as code points) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
it returns the code point of the composite.

If they are not composable, it returns C<undef>.

=item C<$combining_class = getCombinClass($code_point)>

It returns the combining class (as an integer) of the character.

=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>

It returns a boolean indicating whether the character of the specified
code point may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=item C<$is_exclusion = isExclusion($code_point)>

It returns a boolean indicating whether the code point is a composition
exclusion.

=item C<$is_singleton = isSingleton($code_point)>

It returns a boolean indicating whether the code point is a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>

It returns a boolean indicating whether the code point has a Non-Starter
Decomposition.

=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>

It returns a boolean of the derived property Comp_Ex
(Full_Composition_Exclusion).
This property is generated from
Composition Exclusions + Singletons + Non-Starter Decompositions.

=item C<$NFD_is_NO = isNFD_NO($code_point)>

It returns a boolean of the derived property NFD_NO
(NFD_Quick_Check=No).

=item C<$NFC_is_NO = isNFC_NO($code_point)>

It returns a boolean of the derived property NFC_NO
(NFC_Quick_Check=No).

=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>

It returns a boolean of the derived property NFC_MAYBE
(NFC_Quick_Check=Maybe).

=item C<$NFKD_is_NO = isNFKD_NO($code_point)>

It returns a boolean of the derived property NFKD_NO
(NFKD_Quick_Check=No).

=item C<$NFKC_is_NO = isNFKC_NO($code_point)>

It returns a boolean of the derived property NFKC_NO
(NFKC_Quick_Check=No).

=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>

It returns a boolean of the derived property NFKC_MAYBE
(NFKC_Quick_Check=Maybe).

=back

=head1 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 CAVEATS

=over 4

=item Perl's version vs.
Unicode version

Since this module refers to perl core's Unicode database in the directory
F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
normalization implemented by this module depends on what has been
compiled into your perl.  The following table lists the default Unicode
version that comes with various perl versions.  (It is possible to change
the Unicode version in any perl version to be any earlier Unicode version,
so one could cause Unicode 3.2 to be used in any perl version starting with
5.8.0.  Read F<C<$Config{privlib}>/unicore/README.perl> for details.)

    perl's version     implemented Unicode version
       5.6.1              3.0.1
       5.7.2              3.1.0
       5.7.3              3.1.1 (normalization is same as 3.1.0)
       5.8.0              3.2.0
     5.8.1-5.8.3          4.0.0
     5.8.4-5.8.6          4.0.1 (normalization is same as 4.0.0)
     5.8.7-5.8.8          4.1.0
       5.10.0             5.0.0
     5.8.9, 5.10.1        5.1.0
       5.12.x             5.2.0
       5.14.x             6.0.0
       5.16.x             6.1.0
       5.18.x             6.2.0
       5.20.x             6.3.0
       5.22.x             7.0.0

=item Correction of decomposition mapping

In older Unicode versions, a small number of characters (all of which are
CJK compatibility ideographs as far as they have been found) may have
an erroneous decomposition mapping (see
F<lib/unicore/NormalizationCorrections.txt>).
609*f2a19305Safresh1Anyhow, this module will neither refer to 610*f2a19305Safresh1F<lib/unicore/NormalizationCorrections.txt> 6115759b3d2Safresh1nor provide any specific version of normalization. Therefore this module 6125759b3d2Safresh1running on an older perl with an older Unicode database may use 6135759b3d2Safresh1the erroneous decomposition mapping blindly conforming to the Unicode database. 6145759b3d2Safresh1 6155759b3d2Safresh1=item Revised definition of canonical composition 6165759b3d2Safresh1 6175759b3d2Safresh1In Unicode 4.1.0, the definition D2 of canonical composition (which 6185759b3d2Safresh1affects NFC and NFKC) has been changed (see Public Review Issue #29 6195759b3d2Safresh1and recent UAX #15). This module has used the newer definition 6205759b3d2Safresh1since the version 0.07 (Oct 31, 2001). 6215759b3d2Safresh1This module will not support the normalization according to the older 6225759b3d2Safresh1definition, even if the Unicode version implemented by perl is 6235759b3d2Safresh1lower than 4.1.0. 6245759b3d2Safresh1 6255759b3d2Safresh1=back 6265759b3d2Safresh1 6275759b3d2Safresh1=head1 AUTHOR 6285759b3d2Safresh1 6295759b3d2Safresh1SADAHIRO Tomoyuki <SADAHIRO@cpan.org> 6305759b3d2Safresh1 6315759b3d2Safresh1Currently maintained by <perl5-porters@perl.org> 6325759b3d2Safresh1 6335759b3d2Safresh1Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved. 6345759b3d2Safresh1 6355759b3d2Safresh1=head1 LICENSE 6365759b3d2Safresh1 6375759b3d2Safresh1This module is free software; you can redistribute it 6385759b3d2Safresh1and/or modify it under the same terms as Perl itself. 
6395759b3d2Safresh1 6405759b3d2Safresh1=head1 SEE ALSO 6415759b3d2Safresh1 6425759b3d2Safresh1=over 4 6435759b3d2Safresh1 644de8cc8edSafresh1=item L<http://www.unicode.org/reports/tr15/> 6455759b3d2Safresh1 6465759b3d2Safresh1Unicode Normalization Forms - UAX #15 6475759b3d2Safresh1 648de8cc8edSafresh1=item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt> 6495759b3d2Safresh1 6505759b3d2Safresh1Composition Exclusion Table 6515759b3d2Safresh1 652de8cc8edSafresh1=item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt> 6535759b3d2Safresh1 6545759b3d2Safresh1Derived Normalization Properties 6555759b3d2Safresh1 656de8cc8edSafresh1=item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt> 6575759b3d2Safresh1 6585759b3d2Safresh1Normalization Corrections 6595759b3d2Safresh1 660de8cc8edSafresh1=item L<http://www.unicode.org/review/pr-29.html> 6615759b3d2Safresh1 6625759b3d2Safresh1Public Review Issue #29: Normalization Issue 6635759b3d2Safresh1 664de8cc8edSafresh1=item L<http://www.unicode.org/notes/tn5/> 6655759b3d2Safresh1 6665759b3d2Safresh1Canonical Equivalence in Applications - UTN #5 6675759b3d2Safresh1 6685759b3d2Safresh1=back 6695759b3d2Safresh1 6705759b3d2Safresh1=cut 671