xref: /openbsd-src/gnu/usr.bin/perl/dist/Unicode-Normalize/Normalize.pm (revision d0fc3bb68efd6c434b4053cd7adb29023cbec341)
1package Unicode::Normalize;
2
BEGIN {
    # Compile-time sanity check: refuse to load unless the 'U' pack template
    # converts between a code point and its character in both directions.
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
        if pack('U', 0x41) ne 'A';

    die "Unicode::Normalize cannot get Unicode code point\n"
        if unpack('U', 'A') != 0x41;
}
11
12use 5.006;
13use strict;
14use warnings;
15use Carp;
16
17no warnings 'utf8';
18
# Module version; the same value is handed to XSLoader::load() further down.
our $VERSION = '1.27';

# Package name, used as the prefix of the croak() messages in this file.
our $PACKAGE = __PACKAGE__;

# The four standard normalization forms are exported by default.
our @EXPORT = qw(NFC NFD NFKC NFKD);

# Everything else is exported only on request.
our @EXPORT_OK = (
    qw(normalize decompose reorder compose),
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
    qw(FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter),
    qw(normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial),
);

# Import tags: ":all", ":normalize", ":check" and ":fast".
our %EXPORT_TAGS = (
    'all'       => [ @EXPORT, @EXPORT_OK ],
    'normalize' => [ @EXPORT, qw(normalize decompose reorder compose) ],
    'check'     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
    'fast'      => [ qw(FCD checkFCD FCC checkFCC composeContiguous) ],
);
38
39##
40## utilities for tests
41##
42
# pack_U(@code_points)
#
# Builds a string from a list of Unicode code points using the 'U*' template.
sub pack_U {
    my @code_points = @_;
    return pack('U*', @code_points);
}
46
# unpack_U($string)
#
# Returns the Unicode code points of the characters in $string.
sub unpack_U {
    # pack('U*') with no values yields an empty UTF-8 string.  Appending it
    # forces the argument into UTF-8 before unpacking, which also works on
    # Perl 5.6, where utf8::upgrade() does not exist.
    my $utf8_empty = pack('U*');
    return unpack('U*', shift(@_) . $utf8_empty);
}
54
55require Exporter;
56
57##### The above part is common to XS and PP #####
58
59our @ISA = qw(Exporter);
60use XSLoader ();
61XSLoader::load( 'Unicode::Normalize', $VERSION );
62
63##### The below part is common to XS and PP #####
64
65##
66## normalize
67##
68
# FCD($string)
#
# Returns $string untouched when checkFCD() accepts it; otherwise returns
# NFD($string).  checkFCD() and NFD() are not defined in this file
# (presumably supplied by the XS code loaded through XSLoader).
sub FCD ($) {
    my ($text) = @_;
    return $text if checkFCD($text);
    return NFD($text);
}
73
# Dispatch table used by normalize() and normalize_partial():
# form name => normalization routine.
our %formNorm = (
    # full names of the four standard forms
    NFC  => \&NFC,
    NFD  => \&NFD,
    NFKC => \&NFKC,
    NFKD => \&NFKD,

    # short aliases for the same routines
    C    => \&NFC,
    D    => \&NFD,
    KC   => \&NFKC,
    KD   => \&NFKD,

    # FCD and FCC have no short alias
    FCD  => \&FCD,
    FCC  => \&FCC,
);
81
# normalize($form, $string)
#
# Looks $form up in %formNorm and returns the result of applying the
# matching routine to $string.  Croaks when $form is not a key of %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;

    croak($PACKAGE."::normalize: invalid form name: $form")
        unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}
91
92##
93## partial
94##
95
# normalize_partial($form, $unprocessed)
#
# Normalizes $unprocessed in the given form, splits the result with
# splitOnLastStarter(), returns the first part and stores the second part
# back into the caller's $unprocessed.  Croaks on an unknown form name.
sub normalize_partial ($$) {
    # Reject an unknown form before the caller's variable is touched.
    croak($PACKAGE."::normalize_partial: invalid form name: $_[0]")
        unless exists $formNorm{$_[0]};

    my $normalized = normalize($_[0], $_[1]);
    my ($processed, $remainder) = splitOnLastStarter($normalized);

    # $_[1] aliases the caller's argument, so this assignment is the
    # documented side effect on $unprocessed.
    $_[1] = $remainder;
    return $processed;
}
105
# Fixed-form shortcuts for normalize_partial().  Each one passes $_[0]
# straight through, so the caller's variable is still updated in place.
sub NFD_partial ($) {
    return normalize_partial(NFD => $_[0]);
}

sub NFC_partial ($) {
    return normalize_partial(NFC => $_[0]);
}

sub NFKD_partial($) {
    return normalize_partial(NFKD => $_[0]);
}

sub NFKC_partial($) {
    return normalize_partial(NFKC => $_[0]);
}
110
111##
112## check
113##
114
# Dispatch table used by check(): form name => checking routine.
our %formCheck = (
    # full names of the four standard forms
    NFC  => \&checkNFC,
    NFD  => \&checkNFD,
    NFKC => \&checkNFKC,
    NFKD => \&checkNFKD,

    # short aliases for the same routines
    C    => \&checkNFC,
    D    => \&checkNFD,
    KC   => \&checkNFKC,
    KD   => \&checkNFKD,

    # FCD and FCC have no short alias
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);
122
# check($form, $string)
#
# Looks $form up in %formCheck and returns whatever the matching routine
# returns for $string.  Croaks when $form is not a key of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;

    croak($PACKAGE."::check: invalid form name: $form")
        unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}
132
1331;
134__END__
135
136=head1 NAME
137
138Unicode::Normalize - Unicode Normalization Forms
139
140=head1 SYNOPSIS
141
142(1) using function names exported by default:
143
144  use Unicode::Normalize;
145
146  $NFD_string  = NFD($string);  # Normalization Form D
147  $NFC_string  = NFC($string);  # Normalization Form C
148  $NFKD_string = NFKD($string); # Normalization Form KD
149  $NFKC_string = NFKC($string); # Normalization Form KC
150
151(2) using function names exported on request:
152
153  use Unicode::Normalize 'normalize';
154
155  $NFD_string  = normalize('D',  $string);  # Normalization Form D
156  $NFC_string  = normalize('C',  $string);  # Normalization Form C
157  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
158  $NFKC_string = normalize('KC', $string);  # Normalization Form KC
159
160=head1 DESCRIPTION
161
162Parameters:
163
164C<$string> is used as a string under character semantics (see L<perlunicode>).
165
166C<$code_point> should be an unsigned integer representing a Unicode code point.
167
168Note: Between XSUB and pure Perl, there is an incompatibility
169about the interpretation of C<$code_point> as a decimal number.
170XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
Do not pass a floating-point number or a negative number as C<$code_point>.
172
173=head2 Normalization Forms
174
175=over 4
176
177=item C<$NFD_string = NFD($string)>
178
179It returns the Normalization Form D (formed by canonical decomposition).
180
181=item C<$NFC_string = NFC($string)>
182
183It returns the Normalization Form C (formed by canonical decomposition
184followed by canonical composition).
185
186=item C<$NFKD_string = NFKD($string)>
187
188It returns the Normalization Form KD (formed by compatibility decomposition).
189
190=item C<$NFKC_string = NFKC($string)>
191
192It returns the Normalization Form KC (formed by compatibility decomposition
193followed by B<canonical> composition).
194
195=item C<$FCD_string = FCD($string)>
196
197If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
198it returns the string without modification; otherwise it returns an FCD string.
199
Note: FCD is not always unique, so several different forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.
202
203=item C<$FCC_string = FCC($string)>
204
205It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
206
207Note: FCC is unique, as well as four normalization forms (NF*).
208
209=item C<$normalized_string = normalize($form_name, $string)>
210
211It returns the normalization form of C<$form_name>.
212
213As C<$form_name>, one of the following names must be given.
214
215  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
216  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
217  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
218  'KD' or 'NFKD' for Normalization Form KD (UAX #15)
219
220  'FCD'          for "Fast C or D" Form  (UTN #5)
221  'FCC'          for "Fast C Contiguous" (UTN #5)
222
223=back
224
225=head2 Decomposition and Composition
226
227=over 4
228
229=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
230
231It returns the concatenation of the decomposition of each character
232in the string.
233
234If the second parameter (a boolean) is omitted or false,
235the decomposition is canonical decomposition;
236if the second parameter (a boolean) is true,
237the decomposition is compatibility decomposition.
238
239The string returned is not always in NFD/NFKD. Reordering may be required.
240
241 $NFD_string  = reorder(decompose($string));       # eq. to NFD()
242 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
243
244=item C<$reordered_string = reorder($string)>
245
246It returns the result of reordering the combining characters
247according to Canonical Ordering Behavior.
248
249For example, when you have a list of NFD/NFKD strings,
250you can get the concatenated NFD/NFKD string from them, by saying
251
252    $concat_NFD  = reorder(join '', @NFD_strings);
253    $concat_NFKD = reorder(join '', @NFKD_strings);
254
255=item C<$composed_string = compose($string)>
256
257It returns the result of canonical composition
258without applying any decomposition.
259
260For example, when you have a NFD/NFKD string,
261you can get its NFC/NFKC string, by saying
262
263    $NFC_string  = compose($NFD_string);
264    $NFKC_string = compose($NFKD_string);
265
266=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
267
268It returns two strings: the first one, C<$processed>, is a part
269before the last starter, and the second one, C<$unprocessed> is
270another part after the first part. A starter is a character having
271a combining class of zero (see UAX #15).
272
273Note that C<$processed> may be empty (when C<$normalized> contains no
274starter or starts with the last starter), and then C<$unprocessed>
275should be equal to the entire C<$normalized>.
276
277When you have a C<$normalized> string and an C<$unnormalized> string
278following it, a simple concatenation is wrong:
279
280 $concat = $normalized . normalize($form, $unnormalized); # wrong!
281
Instead, do it like this:
283
284 ($processed, $unprocessed) = splitOnLastStarter($normalized);
285 $concat = $processed . normalize($form,$unprocessed.$unnormalized);
286
287C<splitOnLastStarter()> should be called with a pre-normalized parameter
288C<$normalized>, that is in the same form as C<$form> you want.
289
290If you have an array of C<@string> that should be concatenated and then
291normalized, you can do like this:
292
293    my $result = "";
294    my $unproc = "";
295    foreach my $str (@string) {
296        $unproc .= $str;
297        my $n = normalize($form, $unproc);
298        my($p, $u) = splitOnLastStarter($n);
299        $result .= $p;
300        $unproc  = $u;
301    }
302    $result .= $unproc;
303    # instead of normalize($form, join('', @string))
304
305=item C<$processed = normalize_partial($form, $unprocessed)>
306
307A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
308Note that C<$unprocessed> will be modified as a side-effect.
309
310If you have an array of C<@string> that should be concatenated and then
311normalized, you can do like this:
312
313    my $result = "";
314    my $unproc = "";
315    foreach my $str (@string) {
316        $unproc .= $str;
317        $result .= normalize_partial($form, $unproc);
318    }
319    $result .= $unproc;
320    # instead of normalize($form, join('', @string))
321
322=item C<$processed = NFD_partial($unprocessed)>
323
324It does like C<normalize_partial('NFD', $unprocessed)>.
325Note that C<$unprocessed> will be modified as a side-effect.
326
327=item C<$processed = NFC_partial($unprocessed)>
328
329It does like C<normalize_partial('NFC', $unprocessed)>.
330Note that C<$unprocessed> will be modified as a side-effect.
331
332=item C<$processed = NFKD_partial($unprocessed)>
333
334It does like C<normalize_partial('NFKD', $unprocessed)>.
335Note that C<$unprocessed> will be modified as a side-effect.
336
337=item C<$processed = NFKC_partial($unprocessed)>
338
339It does like C<normalize_partial('NFKC', $unprocessed)>.
340Note that C<$unprocessed> will be modified as a side-effect.
341
342=back
343
344=head2 Quick Check
345
346(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
347
348The following functions check whether the string is in that normalization form.
349
350The result returned will be one of the following:
351
352    YES     The string is in that normalization form.
353    NO      The string is not in that normalization form.
354    MAYBE   Dubious. Maybe yes, maybe no.
355
356=over 4
357
358=item C<$result = checkNFD($string)>
359
360It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
361
362=item C<$result = checkNFC($string)>
363
364It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
365C<undef> if C<MAYBE>.
366
367=item C<$result = checkNFKD($string)>
368
369It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
370
371=item C<$result = checkNFKC($string)>
372
373It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
374C<undef> if C<MAYBE>.
375
376=item C<$result = checkFCD($string)>
377
378It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
379
380=item C<$result = checkFCC($string)>
381
382It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
383C<undef> if C<MAYBE>.
384
Note: If a string is not in FCD, it cannot be in FCC either.
So C<checkFCC($not_FCD_string)> should return C<NO>.
387
388=item C<$result = check($form_name, $string)>
389
390It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
391C<undef> if C<MAYBE>.
392
393As C<$form_name>, one of the following names must be given.
394
395  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
396  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
397  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
398  'KD' or 'NFKD' for Normalization Form KD (UAX #15)
399
400  'FCD'          for "Fast C or D" Form  (UTN #5)
401  'FCC'          for "Fast C Contiguous" (UTN #5)
402
403=back
404
405B<Note>
406
407In the cases of NFD, NFKD, and FCD, the answer must be
408either C<YES> or C<NO>. The answer C<MAYBE> may be returned
409in the cases of NFC, NFKC, and FCC.
410
411A C<MAYBE> string should contain at least one combining character
412or the like. For example, C<COMBINING ACUTE ACCENT> has
413the MAYBE_NFC/MAYBE_NFKC property.
414
415Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
416and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
417C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
418(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
419while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
420
421If you want to check exactly, compare the string with its NFC/NFKC/FCC.
422
423    if ($string eq NFC($string)) {
424        # $string is exactly normalized in NFC;
425    } else {
426        # $string is not normalized in NFC;
427    }
428
429    if ($string eq NFKC($string)) {
430        # $string is exactly normalized in NFKC;
431    } else {
432        # $string is not normalized in NFKC;
433    }
434
435=head2 Character Data
436
These functions are the interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.
440
441=over 4
442
443=item C<$canonical_decomposition = getCanon($code_point)>
444
445If the character is canonically decomposable (including Hangul Syllables),
446it returns the (full) canonical decomposition as a string.
447Otherwise it returns C<undef>.
448
B<Note:> According to the Unicode standard, the canonical decomposition
of a character that is not canonically decomposable is the same as
the character itself.
452
453=item C<$compatibility_decomposition = getCompat($code_point)>
454
455If the character is compatibility decomposable (including Hangul Syllables),
456it returns the (full) compatibility decomposition as a string.
457Otherwise it returns C<undef>.
458
B<Note:> According to the Unicode standard, the compatibility decomposition
of a character that is not compatibility decomposable is the same as
the character itself.
462
463=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
464
465If two characters here and next (as code points) are composable
466(including Hangul Jamo/Syllables and Composition Exclusions),
467it returns the code point of the composite.
468
469If they are not composable, it returns C<undef>.
470
471=item C<$combining_class = getCombinClass($code_point)>
472
473It returns the combining class (as an integer) of the character.
474
475=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
476
477It returns a boolean whether the character of the specified codepoint
478may be composed with the previous one in a certain composition
479(including Hangul Compositions, but excluding
480Composition Exclusions and Non-Starter Decompositions).
481
482=item C<$is_exclusion = isExclusion($code_point)>
483
484It returns a boolean whether the code point is a composition exclusion.
485
486=item C<$is_singleton = isSingleton($code_point)>
487
It returns a boolean whether the code point is a singleton.
489
490=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
491
492It returns a boolean whether the code point has Non-Starter Decomposition.
493
494=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
495
496It returns a boolean of the derived property Comp_Ex
497(Full_Composition_Exclusion). This property is generated from
498Composition Exclusions + Singletons + Non-Starter Decompositions.
499
500=item C<$NFD_is_NO = isNFD_NO($code_point)>
501
502It returns a boolean of the derived property NFD_NO
503(NFD_Quick_Check=No).
504
505=item C<$NFC_is_NO = isNFC_NO($code_point)>
506
507It returns a boolean of the derived property NFC_NO
508(NFC_Quick_Check=No).
509
510=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
511
512It returns a boolean of the derived property NFC_MAYBE
513(NFC_Quick_Check=Maybe).
514
515=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
516
517It returns a boolean of the derived property NFKD_NO
518(NFKD_Quick_Check=No).
519
520=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
521
522It returns a boolean of the derived property NFKC_NO
523(NFKC_Quick_Check=No).
524
525=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
526
527It returns a boolean of the derived property NFKC_MAYBE
528(NFKC_Quick_Check=Maybe).
529
530=back
531
532=head1 EXPORT
533
534C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
535
C<normalize> and some other functions: on request.
537
538=head1 CAVEATS
539
540=over 4
541
542=item Perl's version vs. Unicode version
543
544Since this module refers to perl core's Unicode database in the directory
545F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
546normalization implemented by this module depends on what has been
547compiled into your perl.  The following table lists the default Unicode
version that comes with various perl versions.  (It is possible to change
the Unicode version in any perl version to be any earlier Unicode version,
so one could cause Unicode 3.2 to be used in any perl version starting with
5.8.0.  Read F<C<$Config{privlib}>/unicore/README.perl> for details.)
552
553    perl's version     implemented Unicode version
554       5.6.1              3.0.1
555       5.7.2              3.1.0
556       5.7.3              3.1.1 (normalization is same as 3.1.0)
557       5.8.0              3.2.0
558         5.8.1-5.8.3      4.0.0
559         5.8.4-5.8.6      4.0.1 (normalization is same as 4.0.0)
560         5.8.7-5.8.8      4.1.0
561       5.10.0             5.0.0
562        5.8.9, 5.10.1     5.1.0
563       5.12.x             5.2.0
564       5.14.x             6.0.0
565       5.16.x             6.1.0
566       5.18.x             6.2.0
567       5.20.x             6.3.0
568       5.22.x             7.0.0
569
570=item Correction of decomposition mapping
571
572In older Unicode versions, a small number of characters (all of which are
573CJK compatibility ideographs as far as they have been found) may have
574an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
575Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
576nor provide any specific version of normalization. Therefore this module
577running on an older perl with an older Unicode database may use
578the erroneous decomposition mapping blindly conforming to the Unicode database.
579
580=item Revised definition of canonical composition
581
582In Unicode 4.1.0, the definition D2 of canonical composition (which
583affects NFC and NFKC) has been changed (see Public Review Issue #29
584and recent UAX #15). This module has used the newer definition
585since the version 0.07 (Oct 31, 2001).
586This module will not support the normalization according to the older
587definition, even if the Unicode version implemented by perl is
588lower than 4.1.0.
589
590=back
591
592=head1 AUTHOR
593
594SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
595
596Currently maintained by <perl5-porters@perl.org>
597
598Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
599
600=head1 LICENSE
601
602This module is free software; you can redistribute it
603and/or modify it under the same terms as Perl itself.
604
605=head1 SEE ALSO
606
607=over 4
608
609=item L<http://www.unicode.org/reports/tr15/>
610
611Unicode Normalization Forms - UAX #15
612
613=item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt>
614
615Composition Exclusion Table
616
617=item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt>
618
619Derived Normalization Properties
620
621=item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt>
622
623Normalization Corrections
624
625=item L<http://www.unicode.org/review/pr-29.html>
626
627Public Review Issue #29: Normalization Issue
628
629=item L<http://www.unicode.org/notes/tn5/>
630
631Canonical Equivalence in Applications - UTN #5
632
633=back
634
635=cut
636