package Unicode::Normalize;

# Refuse to load unless pack/unpack with the 'U' template can convert
# between the code point 0x41 and the character 'A' in both directions.
BEGIN {
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
        if 'A' ne pack('U', 0x41);
    die "Unicode::Normalize cannot get Unicode code point\n"
        if 0x41 != unpack('U', 'A');
}

use 5.006;
use strict;
use warnings;
use Carp;

no warnings 'utf8';

our $VERSION = '1.27';
our $PACKAGE = __PACKAGE__;

# Names exported by default, names exported on request, and tag groups.
our @EXPORT = qw( NFC NFD NFKC NFKD );
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
    normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
);
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);

##
## utilities for tests
##

# Pack a list of code points into a string using the 'U*' template.
sub pack_U {
    my @code_points = @_;
    return pack 'U*', @code_points;
}

# Unpack a string into its code points using the 'U*' template.
sub unpack_U {

    # pack('U*') with no values yields an empty UTF-8 string; appending it
    # forces the argument into UTF-8.  This keeps the routine working on
    # Perl 5.6, where there is no utf8::upgrade().
    my $as_utf8 = shift(@_) . pack('U*');
    return unpack 'U*', $as_utf8;
}

require Exporter;

##### The above part is common to XS and PP #####

our @ISA = qw(Exporter);
use XSLoader ();
XSLoader::load( 'Unicode::Normalize', $VERSION );

##### The below part is common to XS and PP #####

##
## normalize
##

# Return $str itself when checkFCD() is true for it; otherwise return
# NFD($str).
sub FCD ($) {
    my ($str) = @_;
    return $str if checkFCD($str);
    return NFD($str);
}

# Dispatch table: accepted form name => normalization routine.
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,
    NFD  => \&NFD,
    D    => \&NFD,
    NFKC => \&NFKC,
    KC   => \&NFKC,
    NFKD => \&NFKD,
    KD   => \&NFKD,
    FCD  => \&FCD,
    FCC  => \&FCC,
);

# normalize($form, $str): call the routine registered for $form in
# %formNorm on $str and return its result.  Croaks when $form is not a
# key of %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;
    croak($PACKAGE."::normalize: invalid form name: $form")
        unless exists $formNorm{$form};
    return $formNorm{$form}->($str);
}

##
## partial
##

# normalize_partial($form, $unprocessed): normalize the second argument,
# split the result with splitOnLastStarter(), return the first part and
# store the second part back into the caller's second argument.
# Croaks when $form is not a key of %formNorm.
sub normalize_partial ($$) {
    croak($PACKAGE."::normalize_partial: invalid form name: $_[0]")
        unless exists $formNorm{$_[0]};

    my $normalized = normalize($_[0], $_[1]);
    my ($processed, $unprocessed) = splitOnLastStarter($normalized);
    $_[1] = $unprocessed;    # assignment goes through the @_ alias
    return $processed;
}

# Fixed-form wrappers.  Each hands $_[0] itself to normalize_partial(),
# so the write-back performed there lands in the caller's variable.
sub NFD_partial ($) {
    return normalize_partial('NFD', $_[0]);
}

sub NFC_partial ($) {
    return normalize_partial('NFC', $_[0]);
}

sub NFKD_partial($) {
    return normalize_partial('NFKD',$_[0]);
}

sub NFKC_partial($) {
    return normalize_partial('NFKC',$_[0]);
}

##
## check
##

# Dispatch table: accepted form name => check routine.
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,
    NFD  => \&checkNFD,
    D    => \&checkNFD,
    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,
    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);

# check($form, $str): call the routine registered for $form in
# %formCheck on $str and return its result.  Croaks when $form is not a
# key of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;
    croak($PACKAGE."::check: invalid form name: $form")
        unless exists $formCheck{$form};
    return $formCheck{$form}->($str);
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

(1) using function names exported by default:

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

(2) using function names exported on request:

  use Unicode::Normalize 'normalize';

  $NFD_string  =
      normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

Parameters:

C<$string> is used as a string under character semantics (see L<perlunicode>).

C<$code_point> should be an unsigned integer representing a Unicode code point.

Note: Between XSUB and pure Perl, there is an incompatibility
about the interpretation of C<$code_point> as a decimal number.
XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
Do not use a floating-point value or a negative sign in C<$code_point>.

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

It returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

It returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

It returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

It returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$FCD_string = FCD($string)>

If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
it returns the string without modification; otherwise it returns an FCD string.

Note: FCD is not always unique, so several forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.

=item C<$FCC_string = FCC($string)>

It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).

Note: FCC is unique, as are the four normalization forms (NF*).

=item C<$normalized_string = normalize($form_name, $string)>

It returns C<$string> normalized to the form named by C<$form_name>.

C<$form_name> must be one of the following names:

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string [, $useCompatMapping])>

It returns the concatenation of the decomposition of each character
in the string.

If the second parameter (a boolean) is omitted or false,
the decomposition is canonical decomposition;
if the second parameter (a boolean) is true,
the decomposition is compatibility decomposition.

The string returned is not always in NFD/NFKD. Reordering may be required.

  $NFD_string  = reorder(decompose($string));       # equivalent to NFD()
  $NFKD_string = reorder(decompose($string, TRUE)); # equivalent to NFKD()

=item C<$reordered_string = reorder($string)>

It returns the result of reordering the combining characters
according to Canonical Ordering Behavior.

For example, when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them by saying:

  $concat_NFD  = reorder(join '', @NFD_strings);
  $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string = compose($string)>

It returns the result of canonical composition
without applying any decomposition.

For example, when you have an NFD/NFKD string,
you can get its NFC/NFKC string by saying:

  $NFC_string  = compose($NFD_string);
  $NFKC_string = compose($NFKD_string);

=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>

It returns two strings: the first one, C<$processed>, is the part
before the last starter, and the second one, C<$unprocessed>, is
the remainder of the string after the first part. A starter is a character
having a combining class of zero (see UAX #15).

Note that C<$processed> may be empty (when C<$normalized> contains no
starter or starts with the last starter), and then C<$unprocessed>
should be equal to the entire C<$normalized>.

When you have a C<$normalized> string and an C<$unnormalized> string
following it, a simple concatenation is wrong:

  $concat = $normalized . normalize($form, $unnormalized); # wrong!

Instead, do this:

  ($processed, $unprocessed) = splitOnLastStarter($normalized);
  $concat = $processed . normalize($form,$unprocessed.$unnormalized);

C<splitOnLastStarter()> should be called with a pre-normalized parameter
C<$normalized>, which is in the same form as the C<$form> you want.

If you have an array of C<@string> that should be concatenated and then
normalized, you can do it like this:

  my $result = "";
  my $unproc = "";
  foreach my $str (@string) {
      $unproc .= $str;
      my $n = normalize($form, $unproc);
      my($p, $u) = splitOnLastStarter($n);
      $result .= $p;
      $unproc  = $u;
  }
  $result .= $unproc;
  # instead of normalize($form, join('', @string))

=item C<$processed = normalize_partial($form, $unprocessed)>

A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
Note that C<$unprocessed> will be modified as a side effect.

If you have an array of C<@string> that should be concatenated and then
normalized, you can do it like this:

  my $result = "";
  my $unproc = "";
  foreach my $str (@string) {
      $unproc .= $str;
      $result .= normalize_partial($form, $unproc);
  }
  $result .= $unproc;
  # instead of normalize($form, join('', @string))

=item C<$processed = NFD_partial($unprocessed)>

It behaves like C<normalize_partial('NFD', $unprocessed)>.
Note that C<$unprocessed> will be modified as a side effect.

=item C<$processed = NFC_partial($unprocessed)>

It behaves like C<normalize_partial('NFC', $unprocessed)>.
Note that C<$unprocessed> will be modified as a side effect.

=item C<$processed = NFKD_partial($unprocessed)>

It behaves like C<normalize_partial('NFKD', $unprocessed)>.
Note that C<$unprocessed> will be modified as a side effect.

=item C<$processed = NFKC_partial($unprocessed)>

It behaves like C<normalize_partial('NFKC', $unprocessed)>.
Note that C<$unprocessed> will be modified as a side effect.

=back

=head2 Quick Check

(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in that normalization form.

The result returned will be one of the following:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkNFC($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

=item C<$result = checkNFKD($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkNFKC($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

=item C<$result = checkFCD($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkFCC($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

Note: If a string is not in FCD, it cannot be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.

=item C<$result = check($form_name, $string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

C<$form_name> must be one of the following names:

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

B<Note>

In the cases of NFD, NFKD, and FCD, the answer must be
either C<YES> or C<NO>. The answer C<MAYBE> may be returned
in the cases of NFC, NFKC, and FCC.

A C<MAYBE> string should contain at least one combining character
or the like. For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.

Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC/FCC.

    if ($string eq NFC($string)) {
        # $string is exactly normalized in NFC;
    } else {
        # $string is not normalized in NFC;
    }

    if ($string eq NFKC($string)) {
        # $string is exactly normalized in NFKC;
    } else {
        # $string is not normalized in NFKC;
    }

=head2 Character Data

These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you do not need
to call them yourself.

=over 4

=item C<$canonical_decomposition = getCanon($code_point)>

If the character is canonically decomposable (including Hangul Syllables),
it returns the (full) canonical decomposition as a string.
Otherwise it returns C<undef>.

B<Note:> According to the Unicode standard, the canonical decomposition
of a character that is not canonically decomposable is the same as
the character itself.

=item C<$compatibility_decomposition = getCompat($code_point)>

If the character is compatibility decomposable (including Hangul Syllables),
it returns the (full) compatibility decomposition as a string.
Otherwise it returns C<undef>.

B<Note:> According to the Unicode standard, the compatibility decomposition
of a character that is not compatibility decomposable is the same as
the character itself.

=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>

If two characters here and next (as code points) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
it returns the code point of the composite.

If they are not composable, it returns C<undef>.

=item C<$combining_class = getCombinClass($code_point)>

It returns the combining class (as an integer) of the character.

=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>

It returns a boolean indicating whether the character of the specified
code point may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=item C<$is_exclusion = isExclusion($code_point)>

It returns a boolean indicating whether the code point is a composition
exclusion.

=item C<$is_singleton = isSingleton($code_point)>

It returns a boolean indicating whether the code point is a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>

It returns a boolean indicating whether the code point has a Non-Starter
Decomposition.

=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>

It returns a boolean of the derived property Comp_Ex
(Full_Composition_Exclusion). This property is generated from
Composition Exclusions + Singletons + Non-Starter Decompositions.

=item C<$NFD_is_NO = isNFD_NO($code_point)>

It returns a boolean of the derived property NFD_NO
(NFD_Quick_Check=No).

=item C<$NFC_is_NO = isNFC_NO($code_point)>

It returns a boolean of the derived property NFC_NO
(NFC_Quick_Check=No).

=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>

It returns a boolean of the derived property NFC_MAYBE
(NFC_Quick_Check=Maybe).

=item C<$NFKD_is_NO = isNFKD_NO($code_point)>

It returns a boolean of the derived property NFKD_NO
(NFKD_Quick_Check=No).

=item C<$NFKC_is_NO = isNFKC_NO($code_point)>

It returns a boolean of the derived property NFKC_NO
(NFKC_Quick_Check=No).

=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>

It returns a boolean of the derived property NFKC_MAYBE
(NFKC_Quick_Check=Maybe).

=back

=head1 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 CAVEATS

=over 4

=item Perl's version vs. Unicode version

Since this module refers to perl core's Unicode database in the directory
F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
normalization implemented by this module depends on what has been
compiled into your perl.  The following table lists the default Unicode
version that comes with various perl versions.  (It is possible to change
the Unicode version in any perl version to be any earlier Unicode version,
so one could cause Unicode 3.2 to be used in any perl version starting with
5.8.0.  Read F<C<$Config{privlib}>/unicore/README.perl> for details.)

    perl's version     implemented Unicode version
       5.6.1              3.0.1
       5.7.2              3.1.0
       5.7.3              3.1.1 (normalization is same as 3.1.0)
       5.8.0              3.2.0
       5.8.1-5.8.3        4.0.0
       5.8.4-5.8.6        4.0.1 (normalization is same as 4.0.0)
       5.8.7-5.8.8        4.1.0
       5.10.0             5.0.0
       5.8.9, 5.10.1      5.1.0
       5.12.x             5.2.0
       5.14.x             6.0.0
       5.16.x             6.1.0
       5.18.x             6.2.0
       5.20.x             6.3.0
       5.22.x             7.0.0

=item Correction of decomposition mapping

In older Unicode versions, a small number of characters (all of which are
CJK compatibility ideographs as far as they have been found) may have
an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
nor provide any specific version of normalization. Therefore this module
running on an older perl with an older Unicode database may use
the erroneous decomposition mapping blindly conforming to the Unicode database.

=item Revised definition of canonical composition

In Unicode 4.1.0, the definition D2 of canonical composition (which
affects NFC and NFKC) has been changed (see Public Review Issue #29
and recent UAX #15). This module has used the newer definition
since the version 0.07 (Oct 31, 2001).
This module will not support normalization according to the older
definition, even if the Unicode version implemented by perl is
lower than 4.1.0.

=back

=head1 AUTHOR

SADAHIRO Tomoyuki <SADAHIRO@cpan.org>

Currently maintained by <perl5-porters@perl.org>

Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.

=head1 LICENSE

This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item L<http://www.unicode.org/reports/tr15/>

Unicode Normalization Forms - UAX #15

=item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt>

Composition Exclusion Table

=item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt>

Derived Normalization Properties

=item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt>

Normalization Corrections

=item L<http://www.unicode.org/review/pr-29.html>

Public Review Issue #29: Normalization Issue

=item L<http://www.unicode.org/notes/tn5/>

Canonical Equivalence in Applications - UTN #5

=back

=cut