1package Unicode::Normalize;
2
BEGIN {
    # Compile-time sanity check: refuse to load the module at all when
    # pack('U', 0x41) does not yield the one-character string "A".
    if (pack('U', 0x41) ne "A") {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}
8
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
14no warnings 'utf8';
15
# Module version; the same value is handed to the bootstrap call below.
our $VERSION = '0.28';
# Package name, used as the prefix of the croak() messages raised by
# normalize() and check().
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;

# Inherit from Exporter (symbol export) and DynaLoader (bootstrap).
our @ISA = qw(Exporter DynaLoader);
# Symbols exported by default.
our @EXPORT = qw( NFC NFD NFKC NFKD );
# Symbols exported only on request.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous
    splitOnLastStarter
);
# Export tags: "all" is every exportable symbol; "normalize", "check" and
# "fast" are named subsets of the lists above.
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);
39
40######
41
# Load the compiled (XS) part of the module through the bootstrap method
# inherited from DynaLoader, passing the module version along.
__PACKAGE__->bootstrap($VERSION);
43
44######
45
# pack_U(@codepoints)
#
# Builds a string from a list of code points using the 'U*' pack template
# and returns it.  An empty argument list gives an empty string.
sub pack_U {
    my @codepoints = @_;
    return pack('U*', @codepoints);
}
49
# unpack_U($string)
#
# Returns the code points of $string as unpacked by the 'U*' template.
# The argument is appended to the (empty) result of pack('U*') before
# unpacking -- presumably so that the string is handled as character data
# rather than raw bytes; see the pack entry in perlfunc.
sub unpack_U {
    my $string = shift;
    my $prefix = pack('U*');
    return unpack('U*', $prefix . $string);
}
53
54
55##
56## normalization forms
57##
58
# True flag passed as the second argument of decompose() by NFKD() and
# NFKC(); the POD below documents a true second argument as selecting the
# Compatibility Decomposition Mapping.
use constant COMPAT => 1;
60
# NFD($string): decompose the string (no compatibility flag), then reorder.
sub NFD ($) {
    my ($str) = @_;
    return reorder(decompose($str));
}
# NFKD($string): decompose the string with the COMPAT flag, then reorder.
sub NFKD ($) {
    my ($str) = @_;
    return reorder(decompose($str, COMPAT));
}
# NFC($string): decompose (no compatibility flag), reorder, then compose.
sub NFC ($) {
    my ($str) = @_;
    return compose(reorder(decompose($str)));
}
# NFKC($string): decompose with the COMPAT flag, reorder, then compose.
sub NFKC ($) {
    my ($str) = @_;
    return compose(reorder(decompose($str, COMPAT)));
}
65
# FCD($string)
#
# Returns the string unchanged when checkFCD() reports true for it;
# otherwise returns NFD($string).
sub FCD ($) {
    my ($str) = @_;
    return $str if checkFCD($str);
    return NFD($str);
}
# FCC($string): decompose (no compatibility flag), reorder, then apply
# composeContiguous() to the result.
sub FCC ($) {
    my ($str) = @_;
    return composeContiguous(reorder(decompose($str)));
}
71
# Dispatch table for normalize(): form name => normalization routine.
# The four NF* forms are reachable under both a long and a short name.
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,

    NFD  => \&NFD,
    D    => \&NFD,

    NFKC => \&NFKC,
    KC   => \&NFKC,

    NFKD => \&NFKD,
    KD   => \&NFKD,

    FCD  => \&FCD,
    FCC  => \&FCC,
);
79
# normalize($form_name, $string)
#
# Looks $form_name up in %formNorm and returns the result of calling the
# matching routine on $string.  Croaks when the name is not a key of
# %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;

    croak $PACKAGE."::normalize: invalid form name: $form"
        unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}
88
89
90##
91## quick check
92##
93
# Dispatch table for check(): form name => quick-check routine.
# The four NF* forms are reachable under both a long and a short name.
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,

    NFD  => \&checkNFD,
    D    => \&checkNFD,

    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,

    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,

    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);
101
# check($form_name, $string)
#
# Looks $form_name up in %formCheck and returns the result of calling the
# matching quick-check routine on $string.  Croaks when the name is not a
# key of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;

    croak $PACKAGE."::check: invalid form name: $form"
        unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}
110
1111;
112__END__
113
114=head1 NAME
115
116Unicode::Normalize - Unicode Normalization Forms
117
118=head1 SYNOPSIS
119
120  use Unicode::Normalize;
121
122  $NFD_string  = NFD($string);  # Normalization Form D
123  $NFC_string  = NFC($string);  # Normalization Form C
124  $NFKD_string = NFKD($string); # Normalization Form KD
125  $NFKC_string = NFKC($string); # Normalization Form KC
126
127   or
128
129  use Unicode::Normalize 'normalize';
130
131  $NFD_string  = normalize('D',  $string);  # Normalization Form D
132  $NFC_string  = normalize('C',  $string);  # Normalization Form C
133  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
134  $NFKC_string = normalize('KC', $string);  # Normalization Form KC
135
136=head1 DESCRIPTION
137
138Parameters:
139
140C<$string> is used as a string under character semantics
141(see F<perlunicode>).
142
143C<$codepoint> should be an unsigned integer
144representing a Unicode code point.
145
Note: The XS edition and the pure Perl edition are incompatible in how
they interpret C<$codepoint> as a decimal number.
XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating point number or a negative sign in C<$codepoint>.
150
151=head2 Normalization Forms
152
153=over 4
154
155=item C<$NFD_string = NFD($string)>
156
157returns the Normalization Form D (formed by canonical decomposition).
158
159=item C<$NFC_string = NFC($string)>
160
161returns the Normalization Form C (formed by canonical decomposition
162followed by canonical composition).
163
164=item C<$NFKD_string = NFKD($string)>
165
166returns the Normalization Form KD (formed by compatibility decomposition).
167
168=item C<$NFKC_string = NFKC($string)>
169
170returns the Normalization Form KC (formed by compatibility decomposition
171followed by B<canonical> composition).
172
173=item C<$FCD_string = FCD($string)>
174
175If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
176returns it without modification; otherwise returns an FCD string.
177
Note: FCD is not always unique; several distinct forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.
180
181=item C<$FCC_string = FCC($string)>
182
183returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
184
Note: FCC is unique, as are the four normalization forms (NF*).
186
187=item C<$normalized_string = normalize($form_name, $string)>
188
189As C<$form_name>, one of the following names must be given.
190
191  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
192  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
193  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
194  'KD' or 'NFKD' for Normalization Form KD (UAX #15)
195
196  'FCD'          for "Fast C or D" Form  (UTN #5)
197  'FCC'          for "Fast C Contiguous" (UTN #5)
198
199=back
200
201=head2 Decomposition and Composition
202
203=over 4
204
205=item C<$decomposed_string = decompose($string)>
206
207=item C<$decomposed_string = decompose($string, $useCompatMapping)>
208
209Decomposes the specified string and returns the result.
210
211If the second parameter (a boolean) is omitted or false, decomposes it
212using the Canonical Decomposition Mapping.
213If true, decomposes it using the Compatibility Decomposition Mapping.
214
215The string returned is not always in NFD/NFKD.
216Reordering may be required.
217
218    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
219    $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
220
221=item C<$reordered_string  = reorder($string)>
222
223Reorders the combining characters and the like in the canonical ordering
224and returns the result.
225
226E.g., when you have a list of NFD/NFKD strings,
227you can get the concatenated NFD/NFKD string from them, saying
228
229    $concat_NFD  = reorder(join '', @NFD_strings);
230    $concat_NFKD = reorder(join '', @NFKD_strings);
231
232=item C<$composed_string   = compose($string)>
233
234Returns the string where composable pairs are composed.
235
236E.g., when you have a NFD/NFKD string,
237you can get its NFC/NFKC string, saying
238
239    $NFC_string  = compose($NFD_string);
240    $NFKC_string = compose($NFKD_string);
241
242=back
243
244=head2 Quick Check
245
246(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
247
248The following functions check whether the string is in that normalization form.
249
250The result returned will be:
251
252    YES     The string is in that normalization form.
253    NO      The string is not in that normalization form.
254    MAYBE   Dubious. Maybe yes, maybe no.
255
256=over 4
257
258=item C<$result = checkNFD($string)>
259
260returns C<YES> (C<1>) or C<NO> (C<empty string>).
261
262=item C<$result = checkNFC($string)>
263
264returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
265
266=item C<$result = checkNFKD($string)>
267
268returns C<YES> (C<1>) or C<NO> (C<empty string>).
269
270=item C<$result = checkNFKC($string)>
271
272returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
273
274=item C<$result = checkFCD($string)>
275
276returns C<YES> (C<1>) or C<NO> (C<empty string>).
277
278=item C<$result = checkFCC($string)>
279
280returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
281
If a string is not in FCD, it cannot be in FCC either.
So C<checkFCC($not_FCD_string)> should return C<NO>.
284
285=item C<$result = check($form_name, $string)>
286
287returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
288
C<$form_name> takes the same values as for C<normalize()>.
290
291=back
292
293B<Note>
294
295In the cases of NFD, NFKD, and FCD, the answer must be
296either C<YES> or C<NO>. The answer C<MAYBE> may be returned
297in the cases of NFC, NFKC, and FCC.
298
299A C<MAYBE> string should contain at least one combining character
300or the like. For example, C<COMBINING ACUTE ACCENT> has
301the MAYBE_NFC/MAYBE_NFKC property.
302
303Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
304and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
305C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
306(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
307while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
308
309If you want to check exactly, compare the string with its NFC/NFKC/FCC;
310i.e.,
311
    $string eq NFC($string)    # more thorough than checkNFC($string)
    $string eq NFKC($string)   # more thorough than checkNFKC($string)
    $string eq FCC($string)    # more thorough than checkFCC($string)
315
316=head2 Character Data
317
These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.
321
322=over 4
323
324=item C<$canonical_decomposed = getCanon($codepoint)>
325
326If the character of the specified codepoint is canonically
327decomposable (including Hangul Syllables),
328returns the B<completely decomposed> string canonically equivalent to it.
329
330If it is not decomposable, returns C<undef>.
331
332=item C<$compatibility_decomposed = getCompat($codepoint)>
333
334If the character of the specified codepoint is compatibility
335decomposable (including Hangul Syllables),
336returns the B<completely decomposed> string compatibility equivalent to it.
337
338If it is not decomposable, returns C<undef>.
339
340=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
341
342If two characters here and next (as codepoints) are composable
343(including Hangul Jamo/Syllables and Composition Exclusions),
344returns the codepoint of the composite.
345
346If they are not composable, returns C<undef>.
347
348=item C<$combining_class = getCombinClass($codepoint)>
349
350Returns the combining class of the character as an integer.
351
352=item C<$is_exclusion = isExclusion($codepoint)>
353
354Returns a boolean whether the character of the specified codepoint
355is a composition exclusion.
356
357=item C<$is_singleton = isSingleton($codepoint)>
358
359Returns a boolean whether the character of the specified codepoint is
360a singleton.
361
362=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
363
364Returns a boolean whether the canonical decomposition
365of the character of the specified codepoint
366is a Non-Starter Decomposition.
367
368=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
369
370Returns a boolean whether the character of the specified codepoint
371may be composed with the previous one in a certain composition
372(including Hangul Compositions, but excluding
373Composition Exclusions and Non-Starter Decompositions).
374
375=back
376
377=head2 EXPORT
378
379C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
380
C<normalize> and some other functions: on request.
382
383=head1 AUTHOR
384
385SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
386
387  http://homepage1.nifty.com/nomenclator/perl/
388
389  Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
390
391  This module is free software; you can redistribute it
392  and/or modify it under the same terms as Perl itself.
393
394=head1 SEE ALSO
395
396=over 4
397
398=item http://www.unicode.org/reports/tr15/
399
400Unicode Normalization Forms - UAX #15
401
402=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
403
404Derived Normalization Properties
405
406=item http://www.unicode.org/notes/tn5/
407
408Canonical Equivalence in Applications - UTN #5
409
410=back
411
412=cut
413
414