1 2BEGIN { 3 if ($ENV{PERL_CORE}) { 4 chdir('t') if -d 't'; 5 @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); 6 } 7} 8 9######################### 10 11use strict; 12use warnings; 13BEGIN { $| = 1; print "1..217\n"; } 14my $count = 0; 15sub ok { Unicode::Normalize::ok(\$count, @_) } 16 17use Unicode::Normalize qw(:all); 18 19ok(1); 20 21sub _pack_U { Unicode::Normalize::dot_t_pack_U(@_) } 22sub hexU { _pack_U map hex, split ' ', shift } 23 24# This won't work on EBCDIC platforms prior to v5.8.0, which is when this 25# translation function was defined 26*to_native = (defined &utf8::unicode_to_native) 27 ? \&utf8::unicode_to_native 28 : sub { return shift }; 29 30######################### 31 32ok(getCombinClass( to_native(0)), 0); 33ok(getCombinClass(to_native(41)), 0); 34ok(getCombinClass(to_native(65)), 0); 35ok(getCombinClass( 768), 230); 36ok(getCombinClass(1809), 36); 37 38ok(getCanon(to_native( 0)), undef); 39ok(getCanon(to_native(0x29)), undef); 40ok(getCanon(to_native(0x41)), undef); 41ok(getCanon(to_native(0x00C0)), _pack_U(0x0041, 0x0300)); 42ok(getCanon(to_native(0x00EF)), _pack_U(0x0069, 0x0308)); 43ok(getCanon(0x304C), _pack_U(0x304B, 0x3099)); 44ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); 45ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); 46ok(getCanon(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); 47ok(getCanon(0xAC00), _pack_U(0x1100, 0x1161)); 48ok(getCanon(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); 49ok(getCanon(0x212C), undef); 50ok(getCanon(0x3243), undef); 51ok(getCanon(0xFA2D), _pack_U(0x9DB4)); 52 53# 20 54 55ok(getCompat(to_native( 0)), undef); 56ok(getCompat(to_native(0x29)), undef); 57ok(getCompat(to_native(0x41)), undef); 58ok(getCompat(to_native(0x00C0)), _pack_U(0x0041, 0x0300)); 59ok(getCompat(to_native(0x00EF)), _pack_U(0x0069, 0x0308)); 60ok(getCompat(0x304C), _pack_U(0x304B, 0x3099)); 61ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); 62ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); 63ok(getCompat(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); 64ok(getCompat(0x212C), _pack_U(0x0042)); 65ok(getCompat(0x3243), _pack_U(0x0028, 0x81F3, 0x0029)); 66ok(getCompat(0xAC00), _pack_U(0x1100, 0x1161)); 67ok(getCompat(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); 68ok(getCompat(0xFA2D), _pack_U(0x9DB4)); 69 70# 34 71 72ok(getComposite(to_native( 0), to_native( 0)), undef); 73ok(getComposite(to_native( 0), to_native(0x29)), undef); 74ok(getComposite(to_native(0x29), to_native( 0)), undef); 75ok(getComposite(to_native(0x29), to_native(0x29)), undef); 76ok(getComposite(to_native( 0), to_native(0x41)), undef); 77ok(getComposite(to_native(0x41), to_native( 0)), undef); 78ok(getComposite(to_native(0x41), to_native(0x41)), undef); 79ok(getComposite(to_native(12), to_native(0x0300)), undef); 80ok(getComposite(to_native(0x0055), 0xFF00), undef); 81ok(getComposite(to_native(0x0041), 0x0300), to_native(0x00C0)); 82ok(getComposite(to_native(0x0055), 0x0300), to_native(0x00D9)); 83ok(getComposite(0x0112, 0x0300), 0x1E14); 84ok(getComposite(0x1100, 0x1161), 0xAC00); 85ok(getComposite(0x1100, 0x1173), 0xADF8); 86ok(getComposite(0x1100, 0x11AF), undef); 87ok(getComposite(0x1173, 0x11AF), undef); 88ok(getComposite(0xAC00, 0x11A7), undef); 89ok(getComposite(0xAC00, 0x11A8), 0xAC01); 90ok(getComposite(0xADF8, 0x11AF), 0xAE00); 91 92# 53 93 94sub uprops { 95 my $uv = shift; 96 my $r = ""; 97 $r .= isExclusion($uv) ? 'X' : 'x'; 98 $r .= isSingleton($uv) ? 'S' : 's'; 99 $r .= isNonStDecomp($uv) ? 'N' : 'n'; # Non-Starter Decomposition 100 $r .= isComp_Ex($uv) ? 'F' : 'f'; # Full exclusion (X + S + N) 101 $r .= isComp2nd($uv) ? 'B' : 'b'; # B = M = Y 102 $r .= isNFD_NO($uv) ? 'D' : 'd'; 103 $r .= isNFC_MAYBE($uv) ? 'M' : 'm'; # Maybe 104 $r .= isNFC_NO($uv) ? 'C' : 'c'; 105 $r .= isNFKD_NO($uv) ? 'K' : 'k'; 106 $r .= isNFKC_MAYBE($uv) ? 'Y' : 'y'; # maYbe 107 $r .= isNFKC_NO($uv) ? 'G' : 'g'; 108 return $r; 109} 110 111ok(uprops(to_native(0x0000)), 'xsnfbdmckyg'); # NULL 112ok(uprops(to_native(0x0029)), 'xsnfbdmckyg'); # RIGHT PARENTHESIS 113ok(uprops(to_native(0x0041)), 'xsnfbdmckyg'); # LATIN CAPITAL LETTER A 114ok(uprops(to_native(0x00A0)), 'xsnfbdmcKyG'); # NO-BREAK SPACE 115ok(uprops(to_native(0x00C0)), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE 116ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT 117ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS 118ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA 119ok(uprops(0x0958), 'XsnFbDmCKyG'); # DEVANAGARI LETTER QA 120ok(uprops(0x0F43), 'XsnFbDmCKyG'); # TIBETAN LETTER GHA 121ok(uprops(0x1100), 'xsnfbdmckyg'); # HANGUL CHOSEONG KIYEOK 122ok(uprops(0x1161), 'xsnfBdMckYg'); # HANGUL JUNGSEONG A 123ok(uprops(0x11AF), 'xsnfBdMckYg'); # HANGUL JONGSEONG RIEUL 124ok(uprops(0x212B), 'xSnFbDmCKyG'); # ANGSTROM SIGN 125ok(uprops(0xAC00), 'xsnfbDmcKyg'); # HANGUL SYLLABLE GA 126ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900 127ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE 128ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A 129 130# 71 131 132ok(decompose(""), ""); 133ok(decompose("A"), "A"); 134ok(decompose("", 1), ""); 135ok(decompose("A", 1), "A"); 136 137ok(decompose(hexU("1E14 AC01")), hexU("0045 0304 0300 1100 1161 11A8")); 138ok(decompose(hexU("AC00 AE00")), hexU("1100 1161 1100 1173 11AF")); 139ok(decompose(hexU("304C FF76")), hexU("304B 3099 FF76")); 140 141ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8")); 142ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF")); 143ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB")); 144 145# don't modify the source 146my $sDec = "\x{FA19}"; 147ok(decompose($sDec), "\x{795E}"); 148ok($sDec, "\x{FA19}"); 149 150# 83 151 152ok(reorder(""), ""); 153ok(reorder("A"), "A"); 154ok(reorder(hexU("0041 0300 0315 0313 031b 0061")), 155 hexU("0041 031b 0300 0313 0315 0061")); 156ok(reorder(hexU("00C1 0300 0315 0313 031b 0061 309A 3099")), 157 hexU("00C1 031b 0300 0313 0315 0061 309A 3099")); 158 159# don't modify the source 160my $sReord = "\x{3000}\x{300}\x{31b}"; 161ok(reorder($sReord), "\x{3000}\x{31b}\x{300}"); 162ok($sReord, "\x{3000}\x{300}\x{31b}"); 163 164# 89 165 166ok(compose(""), ""); 167ok(compose("A"), "A"); 168ok(compose(hexU("0061 0300")), hexU("00E0")); 169ok(compose(hexU("0061 0300 031B")), hexU("00E0 031B")); 170ok(compose(hexU("0061 0300 0315")), hexU("00E0 0315")); 171ok(compose(hexU("0061 0300 0313")), hexU("00E0 0313")); 172ok(compose(hexU("0061 031B 0300")), hexU("00E0 031B")); 173ok(compose(hexU("0061 0315 0300")), hexU("0061 0315 0300")); 174ok(compose(hexU("0061 0313 0300")), hexU("0061 0313 0300")); 175 176# don't modify the source 177my $sCom = "\x{304B}\x{3099}"; 178ok(compose($sCom), "\x{304C}"); 179ok($sCom, "\x{304B}\x{3099}"); 180 181# 100 182 183ok(composeContiguous(""), ""); 184ok(composeContiguous("A"), "A"); 185ok(composeContiguous(hexU("0061 0300")), hexU("00E0")); 186ok(composeContiguous(hexU("0061 0300 031B")), hexU("00E0 031B")); 187ok(composeContiguous(hexU("0061 0300 0315")), hexU("00E0 0315")); 188ok(composeContiguous(hexU("0061 0300 0313")), hexU("00E0 0313")); 189ok(composeContiguous(hexU("0061 031B 0300")), hexU("0061 031B 0300")); 190ok(composeContiguous(hexU("0061 0315 0300")), hexU("0061 0315 0300")); 191ok(composeContiguous(hexU("0061 0313 0300")), hexU("0061 0313 0300")); 192 193# don't modify the source 194my $sCtg = "\x{30DB}\x{309A}"; 195ok(composeContiguous($sCtg), "\x{30DD}"); 196ok($sCtg, "\x{30DB}\x{309A}"); 197 198# 111 199 200sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } 201 202ok(answer(checkNFD("")), "YES"); 203ok(answer(checkNFC("")), "YES"); 204ok(answer(checkNFKD("")), "YES"); 205ok(answer(checkNFKC("")), "YES"); 206ok(answer(check("NFD", "")), "YES"); 207ok(answer(check("NFC", "")), "YES"); 208ok(answer(check("NFKD","")), "YES"); 209ok(answer(check("NFKC","")), "YES"); 210 211# U+0000 to U+007F are prenormalized in all the normalization forms. 212ok(answer(checkNFD("AZaz\t12!#`")), "YES"); 213ok(answer(checkNFC("AZaz\t12!#`")), "YES"); 214ok(answer(checkNFKD("AZaz\t12!#`")), "YES"); 215ok(answer(checkNFKC("AZaz\t12!#`")), "YES"); 216ok(answer(check("D", "AZaz\t12!#`")), "YES"); 217ok(answer(check("C", "AZaz\t12!#`")), "YES"); 218ok(answer(check("KD","AZaz\t12!#`")), "YES"); 219ok(answer(check("KC","AZaz\t12!#`")), "YES"); 220 221ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES"); 222ok(answer(checkNFD(hexU("20 C1 1100 1173 11AF"))), "NO"); 223ok(answer(checkNFC(hexU("20 C1 1173 11AF"))), "MAYBE"); 224ok(answer(checkNFC(hexU("20 C1 AE00 1100"))), "YES"); 225ok(answer(checkNFC(hexU("20 C1 AE00 1100 0300"))), "MAYBE"); 226ok(answer(checkNFC(hexU("212B 1100 0300"))), "NO"); 227ok(answer(checkNFC(hexU("1100 0300 212B"))), "NO"); 228ok(answer(checkNFC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring 229ok(answer(checkNFC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla 230ok(answer(checkNFC(hexU("20 C1 FF71 2025"))),"YES"); 231ok(answer(check("NFC", hexU("20 C1 212B 300"))), "NO"); 232ok(answer(checkNFKD(hexU("20 C1 FF71 2025"))), "NO"); 233ok(answer(checkNFKC(hexU("20 C1 AE00 2025"))), "NO"); 234ok(answer(checkNFKC(hexU("212B 1100 0300"))), "NO"); 235ok(answer(checkNFKC(hexU("1100 0300 212B"))), "NO"); 236ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring 237ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla 238ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO"); 239 240# 145 241 242"012ABC" =~ /(\d+)(\w+)/; 243ok("012" eq NFC $1 && "ABC" eq NFC $2); 244 245ok(normalize('C', $1), "012"); 246ok(normalize('C', $2), "ABC"); 247 248ok(normalize('NFC', $1), "012"); 249ok(normalize('NFC', $2), "ABC"); 250 # s/^NF// in normalize() must not prevent using $1, $&, etc. 251 252# 150 253 254# a string with initial zero should be treated like a number 255 256# LATIN CAPITAL LETTER A WITH GRAVE 257ok(getCombinClass(sprintf("0%d", to_native(192))), 0); 258ok(getCanon (sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300)); 259ok(getCompat(sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300)); 260my $lead_zero = sprintf "0%d", to_native(65); 261ok(getComposite($lead_zero, "0768"), to_native(192)); 262ok(isNFD_NO (sprintf("0%d", to_native(192)))); 263ok(isNFKD_NO(sprintf("0%d", to_native(192)))); 264 265# DEVANAGARI LETTER QA 266ok(isExclusion("02392")); 267ok(isComp_Ex ("02392")); 268ok(isNFC_NO ("02392")); 269ok(isNFKC_NO ("02392")); 270ok(isNFD_NO ("02392")); 271ok(isNFKD_NO ("02392")); 272 273# ANGSTROM SIGN 274ok(isSingleton("08491")); 275ok(isComp_Ex ("08491")); 276ok(isNFC_NO ("08491")); 277ok(isNFKC_NO ("08491")); 278ok(isNFD_NO ("08491")); 279ok(isNFKD_NO ("08491")); 280 281# COMBINING GREEK DIALYTIKA TONOS 282ok(isNonStDecomp("0836")); 283ok(isComp_Ex ("0836")); 284ok(isNFC_NO ("0836")); 285ok(isNFKC_NO ("0836")); 286ok(isNFD_NO ("0836")); 287ok(isNFKD_NO ("0836")); 288 289# COMBINING GRAVE ACCENT 290ok(getCombinClass("0768"), 230); 291ok(isComp2nd ("0768")); 292ok(isNFC_MAYBE ("0768")); 293ok(isNFKC_MAYBE("0768")); 294 295# HANGUL SYLLABLE GA 296ok(getCombinClass("044032"), 0); 297ok(getCanon("044032"), _pack_U(0x1100, 0x1161)); 298ok(getCompat("044032"), _pack_U(0x1100, 0x1161)); 299ok(getComposite("04352", "04449"), 0xAC00); 300 301# 182 302 303# string with 22 combining characters: (0x300..0x315) 304my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042); 305ok(decompose($str_cc22), $str_cc22); 306ok(reorder($str_cc22), $str_cc22); 307ok(compose($str_cc22), $str_cc22); 308ok(composeContiguous($str_cc22), $str_cc22); 309ok(NFD($str_cc22), $str_cc22); 310ok(NFC($str_cc22), $str_cc22); 311ok(NFKD($str_cc22), $str_cc22); 312ok(NFKC($str_cc22), $str_cc22); 313ok(FCD($str_cc22), $str_cc22); 314ok(FCC($str_cc22), $str_cc22); 315 316# 192 317 318# string with 40 combining characters of the same class: (0x300..0x313)x2 319my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042); 320ok(decompose($str_cc40), $str_cc40); 321ok(reorder($str_cc40), $str_cc40); 322ok(compose($str_cc40), $str_cc40); 323ok(composeContiguous($str_cc40), $str_cc40); 324ok(NFD($str_cc40), $str_cc40); 325ok(NFC($str_cc40), $str_cc40); 326ok(NFKD($str_cc40), $str_cc40); 327ok(NFKC($str_cc40), $str_cc40); 328ok(FCD($str_cc40), $str_cc40); 329ok(FCC($str_cc40), $str_cc40); 330 331# 202 332 333my $precomp = hexU("304C 304E 3050 3052 3054"); 334my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099"); 335ok(decompose($precomp x 5), $combseq x 5); 336ok(decompose($precomp x 10), $combseq x 10); 337ok(decompose($precomp x 20), $combseq x 20); 338 339my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8"); 340my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161"); 341ok(decompose($hangsyl x 5), $jamoseq x 5); 342ok(decompose($hangsyl x 10), $jamoseq x 10); 343ok(decompose($hangsyl x 20), $jamoseq x 20); 344 345my $notcomp = hexU("304B 304D 304F 3051 3053"); 346ok(decompose($precomp . $notcomp), $combseq . $notcomp); 347ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5); 348ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10); 349 350# 211 351 352my $preUnicode3_1 = !defined getCanon(0x1D15E); 353my $preUnicode3_2 = !defined getCanon(0x2ADC); 354 355# HEBREW LETTER YOD WITH HIRIQ 356ok($preUnicode3_1 xor isExclusion(0xFB1D)); 357ok($preUnicode3_1 xor isComp_Ex (0xFB1D)); 358 359# MUSICAL SYMBOL HALF NOTE 360ok($preUnicode3_1 xor isExclusion(0x1D15E)); 361ok($preUnicode3_1 xor isComp_Ex (0x1D15E)); 362 363# FORKING 364ok($preUnicode3_2 xor isExclusion(0x2ADC)); 365ok($preUnicode3_2 xor isComp_Ex (0x2ADC)); 366 367# 217 368 369