package Unicode::Normalize;

# Compile-time sanity check: refuse to load unless pack('U', 0x41)
# yields the one-character string "A".
BEGIN {
    unless ("A" eq pack('U', 0x41)) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

use 5.006;
use strict;
use warnings;
use Carp;

# Silence the 'utf8' warning category for the rest of this file.
# NOTE(review): presumably because arbitrary code points (surrogates,
# non-characters) may pass through pack/unpack here -- confirm.
no warnings 'utf8';

our $VERSION = '0.28';
our $PACKAGE = __PACKAGE__;    # used as the prefix of croak messages

require Exporter;
require DynaLoader;

our @ISA = qw(Exporter DynaLoader);

# The four UAX #15 normalization forms are exported by default;
# everything else must be requested by name or by tag.
our @EXPORT = qw( NFC NFD NFKC NFKD );
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous
    splitOnLastStarter
);
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);

######

# Load the compiled (XS) part of the module.  Subs used below that are
# not defined in this file (decompose, reorder, compose,
# composeContiguous, the check* and get*/is* functions) are presumably
# provided by that object -- verify against the XS source.
# Written as an explicit class-method call instead of the
# indirect-object form "bootstrap Unicode::Normalize $VERSION";
# the call resolves to the same inherited bootstrap method via @ISA.
__PACKAGE__->bootstrap($VERSION);

######

# pack_U(@codepoints)
# Returns the string made of the characters with the given code points.
sub pack_U {
    return pack('U*', @_);
}

# unpack_U($string)
# Returns the list of code points of the characters in $string.
# The empty pack('U*') prepended to the argument is deliberate.
# NOTE(review): presumably it forces the concatenated string into
# perl's character (UTF-8) representation so that unpack('U*') sees
# code points rather than bytes -- do not simplify without checking
# the oldest supported perl.
sub unpack_U {
    return unpack('U*', pack('U*') . shift);
}


##
## normalization forms
##

# Second argument to decompose(): a true value selects the
# compatibility decomposition mapping (see the POD for decompose).
use constant COMPAT => 1;

# The ($) prototypes are part of the public interface: they impose
# scalar context on the single argument.  Each form is a fixed pipeline
# of decompose -> reorder [-> compose].
sub NFD  ($) { reorder(decompose($_[0])) }
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
sub NFC  ($) { compose(reorder(decompose($_[0]))) }
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }

# FCD($string)
# Returns $string unchanged when checkFCD() reports true for it;
# otherwise returns NFD($string).
sub FCD ($) {
    my $str = shift;
    return checkFCD($str) ? $str : NFD($str);
}

# FCC($string)
# Same pipeline as NFC, but composes with composeContiguous().
sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) }

# Dispatch table for normalize(): form name (long or short) => sub.
our %formNorm = (
    NFC  => \&NFC,      C  => \&NFC,
    NFD  => \&NFD,      D  => \&NFD,
    NFKC => \&NFKC,     KC => \&NFKC,
    NFKD => \&NFKD,     KD => \&NFKD,
    FCD  => \&FCD,      FCC => \&FCC,
);

# normalize($form_name, $string)
# Looks $form_name up in %formNorm and returns the result of applying
# the corresponding normalization sub to $string.
# Croaks (reported from the caller's perspective) when $form_name is
# not a key of %formNorm.
sub normalize($$)
{
    my $form = shift;
    my $str = shift;

    croak $PACKAGE."::normalize: invalid form name: $form"
        unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}


##
## quick check
##

# Dispatch table for check(): form name (long or short) => check sub.
our %formCheck = (
    NFC  => \&checkNFC,         C  => \&checkNFC,
    NFD  => \&checkNFD,         D  => \&checkNFD,
    NFKC => \&checkNFKC,        KC => \&checkNFKC,
    NFKD => \&checkNFKD,        KD => \&checkNFKD,
    FCD  => \&checkFCD,         FCC => \&checkFCC,
);

# check($form_name, $string)
# Looks $form_name up in %formCheck and returns whatever the
# corresponding check sub returns for $string.
# Croaks when $form_name is not a key of %formCheck.
sub check($$)
{
    my $form = shift;
    my $str = shift;

    croak $PACKAGE."::check: invalid form name: $form"
        unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

   or

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

Parameters:

C<$string> is used as a string under character semantics
(see F<perlunicode>).

C<$codepoint> should be an unsigned integer
representing a Unicode code point.

Note: The XS edition and the pure Perl edition are incompatible in how
they interpret C<$codepoint> as a decimal number.
XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating-point number or a negative sign in C<$codepoint>.

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$FCD_string = FCD($string)>

If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
returns it without modification; otherwise returns an FCD string.

Note: FCD is not always unique, so several different forms may be
equivalent to each other. C<FCD()> will return one of these equivalent forms.

=item C<$FCC_string = FCC($string)>

returns the FCC form ("Fast C Contiguous"; cf. UTN #5).

Note: FCC is unique, as are the four normalization forms (NF*).

=item C<$normalized_string = normalize($form_name, $string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string)>

=item C<$decomposed_string = decompose($string, $useCompatMapping)>

Decomposes the specified string and returns the result.

If the second parameter (a boolean) is omitted or false, decomposes it
using the Canonical Decomposition Mapping.
If true, decomposes it using the Compatibility Decomposition Mapping.

The string returned is not always in NFD/NFKD.
Reordering may be required.

    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
    $NFKD_string = reorder(decompose($string, 1));    # eq. to NFKD()

=item C<$reordered_string = reorder($string)>

Reorders the combining characters and the like in the canonical ordering
and returns the result.

E.g., when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them, saying

    $concat_NFD  = reorder(join '', @NFD_strings);
    $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string = compose($string)>

Returns the string where composable pairs are composed.

E.g., when you have a NFD/NFKD string,
you can get its NFC/NFKC string, saying

    $NFC_string  = compose($NFD_string);
    $NFKC_string = compose($NFKD_string);

=back

=head2 Quick Check

(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in that normalization form.

The result returned will be:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = checkNFKD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFKC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = checkFCD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkFCC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

If a string is not in FCD, it must not be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.

=item C<$result = check($form_name, $string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

C<$form_name> takes the same values as for C<normalize()>.

=back

B<Note>

In the cases of NFD, NFKD, and FCD, the answer must be
either C<YES> or C<NO>. The answer C<MAYBE> may be returned
in the cases of NFC, NFKC, and FCC.

A C<MAYBE> string should contain at least one combining character
or the like. For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.

Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC/FCC;
i.e.,

    $string eq NFC($string)    # more thorough than checkNFC($string)
    $string eq NFKC($string)   # more thorough than checkNFKC($string)
    $string eq FCC($string)    # more thorough than checkFCC($string)

=head2 Character Data

These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you do not need
to call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

If the character of the specified codepoint is canonically
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string canonically equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is compatibility
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string compatibility equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>

If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.

If they are not composable, returns C<undef>.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean indicating whether the character of the specified
codepoint is a composition exclusion.

=item C<$is_singleton = isSingleton($codepoint)>

Returns a boolean indicating whether the character of the specified
codepoint is a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>

Returns a boolean indicating whether the canonical decomposition
of the character of the specified codepoint
is a Non-Starter Decomposition.

=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>

Returns a boolean indicating whether the character of the specified
codepoint may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=back

=head2 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 AUTHOR

SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This module is free software; you can redistribute it
  and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/reports/tr15/

Unicode Normalization Forms - UAX #15

=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt

Derived Normalization Properties

=item http://www.unicode.org/notes/tn5/

Canonical Equivalence in Applications - UTN #5

=back

=cut
