1 //===- unittests/Support/UnicodeTest.cpp - Unicode.h tests ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/Support/Unicode.h" 10 #include "llvm/ADT/StringExtras.h" 11 #include "llvm/ADT/edit_distance.h" 12 #include "llvm/Support/ConvertUTF.h" 13 #include "gmock/gmock.h" 14 #include "gtest/gtest.h" 15 16 namespace llvm { 17 namespace sys { 18 namespace unicode { 19 namespace { 20 21 TEST(Unicode, columnWidthUTF8) { 22 EXPECT_EQ(0, columnWidthUTF8("")); 23 EXPECT_EQ(1, columnWidthUTF8(" ")); 24 EXPECT_EQ(1, columnWidthUTF8("a")); 25 EXPECT_EQ(1, columnWidthUTF8("~")); 26 27 EXPECT_EQ(6, columnWidthUTF8("abcdef")); 28 29 EXPECT_EQ(-1, columnWidthUTF8("\x01")); 30 EXPECT_EQ(-1, columnWidthUTF8("\t")); 31 EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01")); 32 EXPECT_EQ(-1, columnWidthUTF8("\342\200\213")); // 200B ZERO WIDTH SPACE 33 34 // 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some 35 // text editors display it only when a line is broken at it, some use it as a 36 // line-break hint, but don't display. We choose terminal-oriented 37 // interpretation. 38 EXPECT_EQ(1, columnWidthUTF8("\302\255")); 39 40 EXPECT_EQ(0, columnWidthUTF8("\314\200")); // 0300 COMBINING GRAVE ACCENT 41 EXPECT_EQ(1, columnWidthUTF8("\340\270\201")); // 0E01 THAI CHARACTER KO KAI 42 EXPECT_EQ(2, columnWidthUTF8("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00 43 44 EXPECT_EQ(4, columnWidthUTF8("\344\270\200\344\270\200")); 45 EXPECT_EQ(3, columnWidthUTF8("q\344\270\200")); 46 EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200")); 47 48 EXPECT_EQ(2, columnWidthUTF8("\u231A")); // WATCH (emoji) 49 EXPECT_EQ(2, columnWidthUTF8("\U0001FADB")); // PEA POD (Unicode 15 emoji) 50 EXPECT_EQ(2, columnWidthUTF8("\U0001B132")); // HIRAGANA LETTER SMALL KO 51 EXPECT_EQ(2, columnWidthUTF8("\U00017042")); // TANGUT IDEOGRAPH 52 53 // Invalid UTF-8 strings, columnWidthUTF8 should error out. 54 EXPECT_EQ(-2, columnWidthUTF8("\344")); 55 EXPECT_EQ(-2, columnWidthUTF8("\344\270")); 56 EXPECT_EQ(-2, columnWidthUTF8("\344\270\033")); 57 EXPECT_EQ(-2, columnWidthUTF8("\344\270\300")); 58 EXPECT_EQ(-2, columnWidthUTF8("\377\366\355")); 59 60 EXPECT_EQ(-2, columnWidthUTF8("qwer\344")); 61 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270")); 62 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\033")); 63 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\300")); 64 EXPECT_EQ(-2, columnWidthUTF8("qwer\377\366\355")); 65 66 // UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode 67 // characters. 68 EXPECT_EQ(-2, columnWidthUTF8("\370\200\200\200\200")); // U+200000 69 EXPECT_EQ(-2, columnWidthUTF8("\374\200\200\200\200\200")); // U+4000000 70 } 71 72 TEST(Unicode, isPrintable) { 73 EXPECT_FALSE(isPrintable(0)); // <control-0000>-<control-001F> 74 EXPECT_FALSE(isPrintable(0x01)); 75 EXPECT_FALSE(isPrintable(0x1F)); 76 EXPECT_TRUE(isPrintable(' ')); 77 EXPECT_TRUE(isPrintable('A')); 78 EXPECT_TRUE(isPrintable('~')); 79 EXPECT_FALSE(isPrintable(0x7F)); // <control-007F>..<control-009F> 80 EXPECT_FALSE(isPrintable(0x90)); 81 EXPECT_FALSE(isPrintable(0x9F)); 82 83 EXPECT_TRUE(isPrintable(0xAC)); 84 EXPECT_TRUE(isPrintable(0xAD)); // SOFT HYPHEN is displayed on most terminals 85 // as either a space or a dash. 86 EXPECT_TRUE(isPrintable(0xAE)); 87 88 EXPECT_TRUE(isPrintable(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 89 EXPECT_FALSE(isPrintable(0x0378)); // <reserved-0378>..<reserved-0379> 90 91 EXPECT_FALSE(isPrintable(0x0600)); // ARABIC NUMBER SIGN 92 93 EXPECT_FALSE(isPrintable(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF> 94 EXPECT_TRUE(isPrintable(0x20000)); // CJK UNIFIED IDEOGRAPH-20000 95 96 EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter 97 98 // test the validity of a fast path in columnWidthUTF8 99 for (unsigned char c = 0; c < 128; ++c) { 100 const UTF8 buf8[2] = {c, 0}; 101 const UTF8 *Target8 = &buf8[0]; 102 UTF32 buf32[1]; 103 UTF32 *Target32 = &buf32[0]; 104 auto status = ConvertUTF8toUTF32(&Target8, Target8 + 1, &Target32, 105 Target32 + 1, strictConversion); 106 EXPECT_EQ(status, conversionOK); 107 EXPECT_EQ((columnWidthUTF8(reinterpret_cast<const char *>(buf8)) == 1), 108 (bool)isPrintable(buf32[0])); 109 } 110 } 111 112 TEST(Unicode, nameToCodepointStrict) { 113 auto map = [](StringRef Str) { 114 return nameToCodepointStrict(Str).value_or(0xFFFF'FFFF); 115 }; 116 117 // generated codepoints 118 EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400")); 119 EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF")); 120 EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00")); 121 EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC")); 122 EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000")); 123 EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD")); 124 EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700")); 125 EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740")); 126 EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D")); 127 EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820")); 128 EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1")); 129 EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0")); 130 EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0")); 131 EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000")); 132 EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A")); 133 EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000")); 134 EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7")); 135 EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00")); 136 EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08")); 137 EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00")); 138 EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5")); 139 EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170")); 140 EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB")); 141 EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900")); 142 EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D")); 143 EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70")); 144 EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9")); 145 EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800")); 146 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D")); 147 EXPECT_EQ(0x31350u, map("CJK UNIFIED IDEOGRAPH-31350")); // Unicode 15.0 148 149 EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA")); 150 EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS")); 151 EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH")); 152 EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB")); 153 EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA")); 154 EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A")); 155 EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E")); 156 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I")); 157 158 EXPECT_EQ(0x1F984u, map("UNICORN FACE")); 159 EXPECT_EQ(0x00640u, map("ARABIC TATWEEL")); 160 EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU")); 161 EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001")); 162 EXPECT_EQ(0x02235u, map("BECAUSE")); 163 EXPECT_EQ(0x1F514u, map("BELL")); 164 EXPECT_EQ(0x1F9A9u, map("FLAMINGO")); 165 EXPECT_EQ(0x1F9A9u, map("FLAMINGO")); 166 EXPECT_EQ(0x1F402u, map("OX")); // 2 characters 167 EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 168 "ABOVE WITH ALEF MAKSURA ISOLATED FORM")); 169 EXPECT_EQ(0x11F04u, map("KAWI LETTER A")); // Unicode 15.0 170 EXPECT_EQ(0x1FA77u, map("PINK HEART")); // Unicode 15.0 171 172 // Aliases 173 EXPECT_EQ(0x0000u, map("NULL")); 174 EXPECT_EQ(0x0007u, map("ALERT")); 175 EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION")); 176 EXPECT_EQ(0x0009u, map("CHARACTER TABULATION")); 177 EXPECT_EQ(0x000Au, map("LINE FEED")); 178 EXPECT_EQ(0x000Au, map("NEW LINE")); 179 EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION")); 180 EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION")); 181 EXPECT_EQ(0x2118u, 182 map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction 183 EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction 184 EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate 185 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate 186 187 // Should perform exact case match 188 EXPECT_EQ(0xFFFFFFFFu, map("")); 189 EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER")); 190 EXPECT_EQ(0xFFFFFFFFu, map("unicorn face")); 191 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN FaCE")); 192 EXPECT_EQ(0xFFFFFFFFu, map("UNICORNFaCE")); 193 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN")); 194 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE i")); 195 EXPECT_EQ(0xFFFFFFFFu, map("hANGUL SYLLABLE i")); 196 EXPECT_EQ(0xFFFFFFFFu, map("HANGULSYLLABLEI")); 197 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE")); 198 EXPECT_EQ(0xFFFFFFFFu, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D")); 199 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d")); 200 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D")); 201 EXPECT_EQ(0xFFFFFFFF, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER")); 202 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1")); 203 EXPECT_EQ(0xFFFFFFFFu, map("ZERO WIDTH NO BREAK SPACE")); 204 205 // Should not support abbreviations or figments 206 EXPECT_EQ(0xFFFFFFFFu, map("FVS1")); 207 EXPECT_EQ(0xFFFFFFFFu, map("HIGH OCTET PRESET")); 208 EXPECT_EQ(0xFFFFFFFFu, map("BEL")); 209 } 210 211 TEST(Unicode, nameToCodepointLoose) { 212 auto map = [](StringRef Str) { 213 auto Opt = nameToCodepointLooseMatching(Str); 214 if (!Opt) 215 return char32_t(0xFFFF'FFFF); 216 return Opt->CodePoint; 217 }; 218 219 // generated codepoints 220 EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF")); 221 EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00")); 222 EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC")); 223 EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000")); 224 EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD")); 225 EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700")); 226 EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740")); 227 EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400")); 228 EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D")); 229 EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820")); 230 EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1")); 231 EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0")); 232 EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0")); 233 EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000")); 234 EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A")); 235 EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000")); 236 EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7")); 237 EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00")); 238 EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08")); 239 EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00")); 240 EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5")); 241 EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170")); 242 EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB")); 243 EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900")); 244 EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D")); 245 EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70")); 246 EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9")); 247 EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800")); 248 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D")); 249 250 EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA")); 251 EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS")); 252 EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH")); 253 EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB")); 254 EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA")); 255 EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A")); 256 EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E")); 257 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I")); 258 259 EXPECT_EQ(0x1F984u, map("UNICORN FACE")); 260 EXPECT_EQ(0x00640u, map("ARABIC TATWEEL")); 261 EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU")); 262 EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001")); 263 EXPECT_EQ(0x02235u, map("BECAUSE")); 264 EXPECT_EQ(0x1F514u, map("BELL")); 265 EXPECT_EQ(0x1F9A9u, map("FLAMINGO")); 266 EXPECT_EQ(0x1F402u, map("OX")); // 2 characters 267 EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 268 "ABOVE WITH ALEF MAKSURA ISOLATED FORM")); 269 270 // Aliases 271 EXPECT_EQ(0x0000u, map("NULL")); 272 EXPECT_EQ(0x0007u, map("ALERT")); 273 EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION")); 274 EXPECT_EQ(0x0009u, map("CHARACTER TABULATION")); 275 EXPECT_EQ(0x000Au, map("LINE FEED")); 276 EXPECT_EQ(0x000Au, map("NEW LINE")); 277 EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION")); 278 EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION")); 279 EXPECT_EQ(0x2118u, 280 map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction 281 EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction 282 EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate 283 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate 284 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO BREAK SPACE")); // alternate 285 286 // Should perform loose matching 287 EXPECT_EQ(0xFFFFFFFFu, map("")); 288 EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER")); 289 EXPECT_EQ(0x0001F984u, map("unicorn face")); 290 EXPECT_EQ(0x0001F984u, map("UNICORN FaCE")); 291 EXPECT_EQ(0x0001F984u, map("UNICORNFaCE")); 292 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN")); 293 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE i")); 294 EXPECT_EQ(0xC774u, map("hANGUL SYLLABLE i")); 295 EXPECT_EQ(0xC774u, map("HANGULSYLLABLEI")); 296 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE")); 297 298 EXPECT_EQ(0x2FA1Du, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D")); 299 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d")); 300 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D")); 301 302 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER")); 303 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1")); 304 305 // https://unicode.org/reports/tr44/#Matching_Names 306 // UAX44-LM2: Medial hypens are ignored, non medial hyphens are not 307 EXPECT_EQ(0x1FBC5u, map("S-T-I-C-K-F-I-G-U-R-E")); 308 EXPECT_EQ(0xFFFFFFFFu, map("-STICK FIGURE")); 309 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE-")); 310 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE -")); 311 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE --")); 312 EXPECT_EQ(0xFFFFFFFFu, map("STICK--FIGURE")); 313 314 EXPECT_EQ(0x0F68u, map("TIBETAN LETTER A")); 315 EXPECT_EQ(0x0F68u, map("TIBETAN LETTERA")); 316 EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A")); 317 EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A")); 318 EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A")); 319 ; 320 321 // special case 322 EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E")); 323 EXPECT_EQ(0x116Cu, map("HANGUL JUNGSEONG OE")); 324 325 // names that are prefix to existing characters should not match 326 EXPECT_FALSE(nameToCodepointLooseMatching("B")); 327 EXPECT_FALSE(nameToCodepointLooseMatching("BE")); 328 EXPECT_FALSE(nameToCodepointLooseMatching("BEE")); 329 EXPECT_FALSE(nameToCodepointLooseMatching("BEET")); 330 EXPECT_FALSE(nameToCodepointLooseMatching("BEETL")); 331 EXPECT_TRUE(nameToCodepointLooseMatching("BEETLE")); 332 } 333 334 } // namespace 335 336 bool operator==(MatchForCodepointName a, MatchForCodepointName b) { 337 return a.Name == b.Name && a.Distance == b.Distance && a.Value == b.Value; 338 } 339 340 namespace { 341 342 TEST(Unicode, nearestMatchesForCodepointName) { 343 auto Normalize = [](StringRef Name) { 344 std::string Out; 345 Out.reserve(Name.size()); 346 for (char C : Name) { 347 if (isAlnum(C)) 348 Out.push_back(toUpper(C)); 349 } 350 return Out; 351 }; 352 353 auto L = [&](StringRef name) { 354 auto v = nearestMatchesForCodepointName(name, 3); 355 for (auto &r : v) { 356 auto A = Normalize(r.Name); 357 auto B = Normalize(name); 358 EXPECT_EQ(StringRef(A).edit_distance(B, true), r.Distance); 359 } 360 return v; 361 }; 362 using ::testing::ElementsAre; 363 using M = MatchForCodepointName; 364 365 ASSERT_THAT(L(""), ElementsAre(M{"OX", 2, 0x1F402}, M{"ANT", 3, 0x1F41C}, 366 M{"ARC", 3, 0x2312})); 367 // shortest name 368 ASSERT_THAT(L("OX"), ElementsAre(M{"OX", 0, 0x1F402}, M{"AXE", 2, 0x1FA93}, 369 M{"BOY", 2, 0x1F466})); 370 371 // longest name 372 ASSERT_THAT(L("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF " 373 "MAKSURA INITIAL FORM"), 374 ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 375 "ABOVE WITH ALEF MAKSURA INITIAL FORM", 376 0, 0xFBFB}, 377 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 378 "ABOVE WITH ALEF MAKSURA FINAL FORM", 379 4, 0xFBFA}, 380 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 381 "ABOVE WITH ALEF MAKSURA ISOLATED FORM", 382 7, 0xFBF9})); 383 384 // same result with underscore, spaces, etc 385 ASSERT_THAT(L("______ARABICLIGATUREUIGHUR KIRGHIZ YEH with HAMZA ABOVE WITH " 386 "ALEF MAKsURAINITIAL form_"), 387 ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 388 "ABOVE WITH ALEF MAKSURA INITIAL FORM", 389 0, 0xFBFB}, 390 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 391 "ABOVE WITH ALEF MAKSURA FINAL FORM", 392 4, 0xFBFA}, 393 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 394 "ABOVE WITH ALEF MAKSURA ISOLATED FORM", 395 7, 0xFBF9})); 396 397 ASSERT_THAT(L("GREEK CAPITAL LETTER LAMBDA"), 398 ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B}, 399 M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393}, 400 M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391})); 401 402 ASSERT_THAT(L("greekcapitalletter-lambda"), 403 ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B}, 404 M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393}, 405 M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391})); 406 407 // typo http://www.unicode.org/notes/tn27/tn27-5.html 408 ASSERT_THAT( 409 L("PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET"), 410 ElementsAre( 411 M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET", 0, 412 0xFE18}, // typo 413 M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 2, 414 0xFE18}, // correction 415 M{"PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET", 6, 416 0xFE17})); 417 418 // typo http://www.unicode.org/notes/tn27/tn27-5.html 419 ASSERT_THAT( 420 L("BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"), 421 ElementsAre( 422 M{"BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 0, 0x1D0C5}, 423 M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 2, 0x1D0C5}, 424 M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA SYNAFI", 7, 425 0x1D0C6})); 426 } 427 428 } // namespace 429 } // namespace unicode 430 } // namespace sys 431 } // namespace llvm 432