xref: /openbsd-src/gnu/usr.bin/perl/dist/Unicode-Normalize/t/func.t (revision 256a93a44f36679bee503f12e49566c2183f6181)
1
2BEGIN {
3    if ($ENV{PERL_CORE}) {
4        chdir('t') if -d 't';
5        @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
6    }
7}
8
9#########################
10
11use strict;
12use warnings;
13BEGIN { $| = 1; print "1..217\n"; }
14my $count = 0;
15sub ok { Unicode::Normalize::ok(\$count, @_) }
16
17use Unicode::Normalize qw(:all);
18
19ok(1);
20
21sub _pack_U { Unicode::Normalize::dot_t_pack_U(@_) }
22sub hexU { _pack_U map hex, split ' ', shift }
23
24# This won't work on EBCDIC platforms prior to v5.8.0, which is when this
25# translation function was defined
26*to_native = (defined &utf8::unicode_to_native)
27             ? \&utf8::unicode_to_native
28             : sub { return shift };
29
30#########################
31
32ok(getCombinClass( to_native(0)),   0);
33ok(getCombinClass(to_native(41)),   0);
34ok(getCombinClass(to_native(65)),   0);
35ok(getCombinClass( 768), 230);
36ok(getCombinClass(1809),  36);
37
38ok(getCanon(to_native(   0)), undef);
39ok(getCanon(to_native(0x29)), undef);
40ok(getCanon(to_native(0x41)), undef);
41ok(getCanon(to_native(0x00C0)), _pack_U(0x0041, 0x0300));
42ok(getCanon(to_native(0x00EF)), _pack_U(0x0069, 0x0308));
43ok(getCanon(0x304C), _pack_U(0x304B, 0x3099));
44ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301));
45ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345));
46ok(getCanon(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345));
47ok(getCanon(0xAC00), _pack_U(0x1100, 0x1161));
48ok(getCanon(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF));
49ok(getCanon(0x212C), undef);
50ok(getCanon(0x3243), undef);
51ok(getCanon(0xFA2D), _pack_U(0x9DB4));
52
53# 20
54
55ok(getCompat(to_native(   0)), undef);
56ok(getCompat(to_native(0x29)), undef);
57ok(getCompat(to_native(0x41)), undef);
58ok(getCompat(to_native(0x00C0)), _pack_U(0x0041, 0x0300));
59ok(getCompat(to_native(0x00EF)), _pack_U(0x0069, 0x0308));
60ok(getCompat(0x304C), _pack_U(0x304B, 0x3099));
61ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301));
62ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345));
63ok(getCompat(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345));
64ok(getCompat(0x212C), _pack_U(0x0042));
65ok(getCompat(0x3243), _pack_U(0x0028, 0x81F3, 0x0029));
66ok(getCompat(0xAC00), _pack_U(0x1100, 0x1161));
67ok(getCompat(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF));
68ok(getCompat(0xFA2D), _pack_U(0x9DB4));
69
70# 34
71
72ok(getComposite(to_native(   0), to_native(   0)), undef);
73ok(getComposite(to_native(   0), to_native(0x29)), undef);
74ok(getComposite(to_native(0x29), to_native(   0)), undef);
75ok(getComposite(to_native(0x29), to_native(0x29)), undef);
76ok(getComposite(to_native(   0), to_native(0x41)), undef);
77ok(getComposite(to_native(0x41), to_native(   0)), undef);
78ok(getComposite(to_native(0x41), to_native(0x41)), undef);
79ok(getComposite(to_native(12), to_native(0x0300)), undef);
80ok(getComposite(to_native(0x0055), 0xFF00), undef);
81ok(getComposite(to_native(0x0041), 0x0300), to_native(0x00C0));
82ok(getComposite(to_native(0x0055), 0x0300), to_native(0x00D9));
83ok(getComposite(0x0112, 0x0300), 0x1E14);
84ok(getComposite(0x1100, 0x1161), 0xAC00);
85ok(getComposite(0x1100, 0x1173), 0xADF8);
86ok(getComposite(0x1100, 0x11AF), undef);
87ok(getComposite(0x1173, 0x11AF), undef);
88ok(getComposite(0xAC00, 0x11A7), undef);
89ok(getComposite(0xAC00, 0x11A8), 0xAC01);
90ok(getComposite(0xADF8, 0x11AF), 0xAE00);
91
92# 53
93
94sub uprops {
95  my $uv = shift;
96  my $r = "";
97     $r .= isExclusion($uv)   ? 'X' : 'x';
98     $r .= isSingleton($uv)   ? 'S' : 's';
99     $r .= isNonStDecomp($uv) ? 'N' : 'n'; # Non-Starter Decomposition
100     $r .= isComp_Ex($uv)     ? 'F' : 'f'; # Full exclusion (X + S + N)
101     $r .= isComp2nd($uv)     ? 'B' : 'b'; # B = M = Y
102     $r .= isNFD_NO($uv)      ? 'D' : 'd';
103     $r .= isNFC_MAYBE($uv)   ? 'M' : 'm'; # Maybe
104     $r .= isNFC_NO($uv)      ? 'C' : 'c';
105     $r .= isNFKD_NO($uv)     ? 'K' : 'k';
106     $r .= isNFKC_MAYBE($uv)  ? 'Y' : 'y'; # maYbe
107     $r .= isNFKC_NO($uv)     ? 'G' : 'g';
108  return $r;
109}
110
111ok(uprops(to_native(0x0000)), 'xsnfbdmckyg'); # NULL
112ok(uprops(to_native(0x0029)), 'xsnfbdmckyg'); # RIGHT PARENTHESIS
113ok(uprops(to_native(0x0041)), 'xsnfbdmckyg'); # LATIN CAPITAL LETTER A
114ok(uprops(to_native(0x00A0)), 'xsnfbdmcKyG'); # NO-BREAK SPACE
115ok(uprops(to_native(0x00C0)), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE
116ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT
117ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS
118ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA
119ok(uprops(0x0958), 'XsnFbDmCKyG'); # DEVANAGARI LETTER QA
120ok(uprops(0x0F43), 'XsnFbDmCKyG'); # TIBETAN LETTER GHA
121ok(uprops(0x1100), 'xsnfbdmckyg'); # HANGUL CHOSEONG KIYEOK
122ok(uprops(0x1161), 'xsnfBdMckYg'); # HANGUL JUNGSEONG A
123ok(uprops(0x11AF), 'xsnfBdMckYg'); # HANGUL JONGSEONG RIEUL
124ok(uprops(0x212B), 'xSnFbDmCKyG'); # ANGSTROM SIGN
125ok(uprops(0xAC00), 'xsnfbDmcKyg'); # HANGUL SYLLABLE GA
126ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900
127ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE
128ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A
129
130# 71
131
132ok(decompose(""), "");
133ok(decompose("A"), "A");
134ok(decompose("", 1), "");
135ok(decompose("A", 1), "A");
136
137ok(decompose(hexU("1E14 AC01")), hexU("0045 0304 0300 1100 1161 11A8"));
138ok(decompose(hexU("AC00 AE00")), hexU("1100 1161 1100 1173 11AF"));
139ok(decompose(hexU("304C FF76")), hexU("304B 3099 FF76"));
140
141ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8"));
142ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF"));
143ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB"));
144
145# don't modify the source
146my $sDec = "\x{FA19}";
147ok(decompose($sDec), "\x{795E}");
148ok($sDec, "\x{FA19}");
149
150# 83
151
152ok(reorder(""), "");
153ok(reorder("A"), "A");
154ok(reorder(hexU("0041 0300 0315 0313 031b 0061")),
155	   hexU("0041 031b 0300 0313 0315 0061"));
156ok(reorder(hexU("00C1 0300 0315 0313 031b 0061 309A 3099")),
157	   hexU("00C1 031b 0300 0313 0315 0061 309A 3099"));
158
159# don't modify the source
160my $sReord = "\x{3000}\x{300}\x{31b}";
161ok(reorder($sReord), "\x{3000}\x{31b}\x{300}");
162ok($sReord, "\x{3000}\x{300}\x{31b}");
163
164# 89
165
166ok(compose(""), "");
167ok(compose("A"), "A");
168ok(compose(hexU("0061 0300")),      hexU("00E0"));
169ok(compose(hexU("0061 0300 031B")), hexU("00E0 031B"));
170ok(compose(hexU("0061 0300 0315")), hexU("00E0 0315"));
171ok(compose(hexU("0061 0300 0313")), hexU("00E0 0313"));
172ok(compose(hexU("0061 031B 0300")), hexU("00E0 031B"));
173ok(compose(hexU("0061 0315 0300")), hexU("0061 0315 0300"));
174ok(compose(hexU("0061 0313 0300")), hexU("0061 0313 0300"));
175
176# don't modify the source
177my $sCom = "\x{304B}\x{3099}";
178ok(compose($sCom), "\x{304C}");
179ok($sCom, "\x{304B}\x{3099}");
180
181# 100
182
183ok(composeContiguous(""), "");
184ok(composeContiguous("A"), "A");
185ok(composeContiguous(hexU("0061 0300")),      hexU("00E0"));
186ok(composeContiguous(hexU("0061 0300 031B")), hexU("00E0 031B"));
187ok(composeContiguous(hexU("0061 0300 0315")), hexU("00E0 0315"));
188ok(composeContiguous(hexU("0061 0300 0313")), hexU("00E0 0313"));
189ok(composeContiguous(hexU("0061 031B 0300")), hexU("0061 031B 0300"));
190ok(composeContiguous(hexU("0061 0315 0300")), hexU("0061 0315 0300"));
191ok(composeContiguous(hexU("0061 0313 0300")), hexU("0061 0313 0300"));
192
193# don't modify the source
194my $sCtg = "\x{30DB}\x{309A}";
195ok(composeContiguous($sCtg), "\x{30DD}");
196ok($sCtg, "\x{30DB}\x{309A}");
197
198# 111
199
200sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" }
201
202ok(answer(checkNFD("")),  "YES");
203ok(answer(checkNFC("")),  "YES");
204ok(answer(checkNFKD("")), "YES");
205ok(answer(checkNFKC("")), "YES");
206ok(answer(check("NFD", "")), "YES");
207ok(answer(check("NFC", "")), "YES");
208ok(answer(check("NFKD","")), "YES");
209ok(answer(check("NFKC","")), "YES");
210
211# U+0000 to U+007F are prenormalized in all the normalization forms.
212ok(answer(checkNFD("AZaz\t12!#`")),  "YES");
213ok(answer(checkNFC("AZaz\t12!#`")),  "YES");
214ok(answer(checkNFKD("AZaz\t12!#`")), "YES");
215ok(answer(checkNFKC("AZaz\t12!#`")), "YES");
216ok(answer(check("D", "AZaz\t12!#`")), "YES");
217ok(answer(check("C", "AZaz\t12!#`")), "YES");
218ok(answer(check("KD","AZaz\t12!#`")), "YES");
219ok(answer(check("KC","AZaz\t12!#`")), "YES");
220
221ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES");
222ok(answer(checkNFD(hexU("20 C1 1100 1173 11AF"))), "NO");
223ok(answer(checkNFC(hexU("20 C1 1173 11AF"))), "MAYBE");
224ok(answer(checkNFC(hexU("20 C1 AE00 1100"))), "YES");
225ok(answer(checkNFC(hexU("20 C1 AE00 1100 0300"))), "MAYBE");
226ok(answer(checkNFC(hexU("212B 1100 0300"))), "NO");
227ok(answer(checkNFC(hexU("1100 0300 212B"))), "NO");
228ok(answer(checkNFC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring
229ok(answer(checkNFC(hexU("0041 030A 0327"))), "NO");    # A+ring+cedilla
230ok(answer(checkNFC(hexU("20 C1 FF71 2025"))),"YES");
231ok(answer(check("NFC", hexU("20 C1 212B 300"))), "NO");
232ok(answer(checkNFKD(hexU("20 C1 FF71 2025"))),   "NO");
233ok(answer(checkNFKC(hexU("20 C1 AE00 2025"))), "NO");
234ok(answer(checkNFKC(hexU("212B 1100 0300"))), "NO");
235ok(answer(checkNFKC(hexU("1100 0300 212B"))), "NO");
236ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring
237ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO");    # A+ring+cedilla
238ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO");
239
240# 145
241
242"012ABC" =~ /(\d+)(\w+)/;
243ok("012" eq NFC $1 && "ABC" eq NFC $2);
244
245ok(normalize('C', $1), "012");
246ok(normalize('C', $2), "ABC");
247
248ok(normalize('NFC', $1), "012");
249ok(normalize('NFC', $2), "ABC");
250 # s/^NF// in normalize() must not prevent using $1, $&, etc.
251
252# 150
253
254# a string with initial zero should be treated like a number
255
256# LATIN CAPITAL LETTER A WITH GRAVE
257ok(getCombinClass(sprintf("0%d", to_native(192))), 0);
258ok(getCanon (sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300));
259ok(getCompat(sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300));
260my $lead_zero = sprintf "0%d", to_native(65);
261ok(getComposite($lead_zero, "0768"), to_native(192));
262ok(isNFD_NO (sprintf("0%d", to_native(192))));
263ok(isNFKD_NO(sprintf("0%d", to_native(192))));
264
265# DEVANAGARI LETTER QA
266ok(isExclusion("02392"));
267ok(isComp_Ex  ("02392"));
268ok(isNFC_NO   ("02392"));
269ok(isNFKC_NO  ("02392"));
270ok(isNFD_NO   ("02392"));
271ok(isNFKD_NO  ("02392"));
272
273# ANGSTROM SIGN
274ok(isSingleton("08491"));
275ok(isComp_Ex  ("08491"));
276ok(isNFC_NO   ("08491"));
277ok(isNFKC_NO  ("08491"));
278ok(isNFD_NO   ("08491"));
279ok(isNFKD_NO  ("08491"));
280
281# COMBINING GREEK DIALYTIKA TONOS
282ok(isNonStDecomp("0836"));
283ok(isComp_Ex    ("0836"));
284ok(isNFC_NO     ("0836"));
285ok(isNFKC_NO    ("0836"));
286ok(isNFD_NO     ("0836"));
287ok(isNFKD_NO    ("0836"));
288
289# COMBINING GRAVE ACCENT
290ok(getCombinClass("0768"), 230);
291ok(isComp2nd   ("0768"));
292ok(isNFC_MAYBE ("0768"));
293ok(isNFKC_MAYBE("0768"));
294
295# HANGUL SYLLABLE GA
296ok(getCombinClass("044032"), 0);
297ok(getCanon("044032"),  _pack_U(0x1100, 0x1161));
298ok(getCompat("044032"), _pack_U(0x1100, 0x1161));
299ok(getComposite("04352", "04449"), 0xAC00);
300
301# 182
302
303# string with 22 combining characters: (0x300..0x315)
304my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042);
305ok(decompose($str_cc22), $str_cc22);
306ok(reorder($str_cc22), $str_cc22);
307ok(compose($str_cc22), $str_cc22);
308ok(composeContiguous($str_cc22), $str_cc22);
309ok(NFD($str_cc22), $str_cc22);
310ok(NFC($str_cc22), $str_cc22);
311ok(NFKD($str_cc22), $str_cc22);
312ok(NFKC($str_cc22), $str_cc22);
313ok(FCD($str_cc22), $str_cc22);
314ok(FCC($str_cc22), $str_cc22);
315
316# 192
317
318# string with 40 combining characters of the same class: (0x300..0x313)x2
319my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042);
320ok(decompose($str_cc40), $str_cc40);
321ok(reorder($str_cc40), $str_cc40);
322ok(compose($str_cc40), $str_cc40);
323ok(composeContiguous($str_cc40), $str_cc40);
324ok(NFD($str_cc40), $str_cc40);
325ok(NFC($str_cc40), $str_cc40);
326ok(NFKD($str_cc40), $str_cc40);
327ok(NFKC($str_cc40), $str_cc40);
328ok(FCD($str_cc40), $str_cc40);
329ok(FCC($str_cc40), $str_cc40);
330
331# 202
332
333my $precomp = hexU("304C 304E 3050 3052 3054");
334my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099");
335ok(decompose($precomp x 5),  $combseq x 5);
336ok(decompose($precomp x 10), $combseq x 10);
337ok(decompose($precomp x 20), $combseq x 20);
338
339my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8");
340my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161");
341ok(decompose($hangsyl x 5), $jamoseq x 5);
342ok(decompose($hangsyl x 10), $jamoseq x 10);
343ok(decompose($hangsyl x 20), $jamoseq x 20);
344
345my $notcomp = hexU("304B 304D 304F 3051 3053");
346ok(decompose($precomp . $notcomp),     $combseq . $notcomp);
347ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5);
348ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10);
349
350# 211
351
352my $preUnicode3_1 = !defined getCanon(0x1D15E);
353my $preUnicode3_2 = !defined getCanon(0x2ADC);
354
355# HEBREW LETTER YOD WITH HIRIQ
356ok($preUnicode3_1 xor isExclusion(0xFB1D));
357ok($preUnicode3_1 xor isComp_Ex  (0xFB1D));
358
359# MUSICAL SYMBOL HALF NOTE
360ok($preUnicode3_1 xor isExclusion(0x1D15E));
361ok($preUnicode3_1 xor isComp_Ex  (0x1D15E));
362
363# FORKING
364ok($preUnicode3_2 xor isExclusion(0x2ADC));
365ok($preUnicode3_2 xor isComp_Ex  (0x2ADC));
366
367# 217
368
369