1 //===- unittests/Support/UnicodeTest.cpp - Unicode.h tests ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/Support/Unicode.h" 10 #include "llvm/ADT/StringExtras.h" 11 #include "llvm/ADT/edit_distance.h" 12 #include "llvm/Support/ConvertUTF.h" 13 #include "gmock/gmock.h" 14 #include "gtest/gtest.h" 15 16 namespace llvm { 17 namespace sys { 18 namespace unicode { 19 namespace { 20 21 TEST(Unicode, columnWidthUTF8) { 22 EXPECT_EQ(0, columnWidthUTF8("")); 23 EXPECT_EQ(1, columnWidthUTF8(" ")); 24 EXPECT_EQ(1, columnWidthUTF8("a")); 25 EXPECT_EQ(1, columnWidthUTF8("~")); 26 27 EXPECT_EQ(6, columnWidthUTF8("abcdef")); 28 29 EXPECT_EQ(-1, columnWidthUTF8("\x01")); 30 EXPECT_EQ(-1, columnWidthUTF8("\t")); 31 EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01")); 32 EXPECT_EQ(-1, columnWidthUTF8("\342\200\213")); // 200B ZERO WIDTH SPACE 33 34 // 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some 35 // text editors display it only when a line is broken at it, some use it as a 36 // line-break hint, but don't display. We choose terminal-oriented 37 // interpretation. 38 EXPECT_EQ(1, columnWidthUTF8("\302\255")); 39 40 EXPECT_EQ(0, columnWidthUTF8("\314\200")); // 0300 COMBINING GRAVE ACCENT 41 EXPECT_EQ(1, columnWidthUTF8("\340\270\201")); // 0E01 THAI CHARACTER KO KAI 42 EXPECT_EQ(2, columnWidthUTF8("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00 43 44 EXPECT_EQ(4, columnWidthUTF8("\344\270\200\344\270\200")); 45 EXPECT_EQ(3, columnWidthUTF8("q\344\270\200")); 46 EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200")); 47 48 // Invalid UTF-8 strings, columnWidthUTF8 should error out. 49 EXPECT_EQ(-2, columnWidthUTF8("\344")); 50 EXPECT_EQ(-2, columnWidthUTF8("\344\270")); 51 EXPECT_EQ(-2, columnWidthUTF8("\344\270\033")); 52 EXPECT_EQ(-2, columnWidthUTF8("\344\270\300")); 53 EXPECT_EQ(-2, columnWidthUTF8("\377\366\355")); 54 55 EXPECT_EQ(-2, columnWidthUTF8("qwer\344")); 56 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270")); 57 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\033")); 58 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\300")); 59 EXPECT_EQ(-2, columnWidthUTF8("qwer\377\366\355")); 60 61 // UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode 62 // characters. 63 EXPECT_EQ(-2, columnWidthUTF8("\370\200\200\200\200")); // U+200000 64 EXPECT_EQ(-2, columnWidthUTF8("\374\200\200\200\200\200")); // U+4000000 65 } 66 67 TEST(Unicode, isPrintable) { 68 EXPECT_FALSE(isPrintable(0)); // <control-0000>-<control-001F> 69 EXPECT_FALSE(isPrintable(0x01)); 70 EXPECT_FALSE(isPrintable(0x1F)); 71 EXPECT_TRUE(isPrintable(' ')); 72 EXPECT_TRUE(isPrintable('A')); 73 EXPECT_TRUE(isPrintable('~')); 74 EXPECT_FALSE(isPrintable(0x7F)); // <control-007F>..<control-009F> 75 EXPECT_FALSE(isPrintable(0x90)); 76 EXPECT_FALSE(isPrintable(0x9F)); 77 78 EXPECT_TRUE(isPrintable(0xAC)); 79 EXPECT_TRUE(isPrintable(0xAD)); // SOFT HYPHEN is displayed on most terminals 80 // as either a space or a dash. 81 EXPECT_TRUE(isPrintable(0xAE)); 82 83 EXPECT_TRUE(isPrintable(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 84 EXPECT_FALSE(isPrintable(0x0378)); // <reserved-0378>..<reserved-0379> 85 86 EXPECT_FALSE(isPrintable(0x0600)); // ARABIC NUMBER SIGN 87 88 EXPECT_FALSE(isPrintable(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF> 89 EXPECT_TRUE(isPrintable(0x20000)); // CJK UNIFIED IDEOGRAPH-20000 90 91 EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter 92 93 // test the validity of a fast path in columnWidthUTF8 94 for (unsigned char c = 0; c < 128; ++c) { 95 const UTF8 buf8[2] = {c, 0}; 96 const UTF8 *Target8 = &buf8[0]; 97 UTF32 buf32[1]; 98 UTF32 *Target32 = &buf32[0]; 99 auto status = ConvertUTF8toUTF32(&Target8, Target8 + 1, &Target32, 100 Target32 + 1, strictConversion); 101 EXPECT_EQ(status, conversionOK); 102 EXPECT_EQ((columnWidthUTF8(reinterpret_cast<const char *>(buf8)) == 1), 103 (bool)isPrintable(buf32[0])); 104 } 105 } 106 107 TEST(Unicode, nameToCodepointStrict) { 108 auto map = [](StringRef Str) { 109 return nameToCodepointStrict(Str).value_or(0xFFFF'FFFF); 110 }; 111 112 // generated codepoints 113 EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400")); 114 EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF")); 115 EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00")); 116 EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC")); 117 EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000")); 118 EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD")); 119 EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700")); 120 EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740")); 121 EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D")); 122 EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820")); 123 EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1")); 124 EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0")); 125 EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0")); 126 EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000")); 127 EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A")); 128 EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000")); 129 EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7")); 130 EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00")); 131 EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08")); 132 EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00")); 133 EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5")); 134 EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170")); 135 EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB")); 136 EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900")); 137 EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D")); 138 EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70")); 139 EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9")); 140 EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800")); 141 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D")); 142 EXPECT_EQ(0x31350u, map("CJK UNIFIED IDEOGRAPH-31350")); // Unicode 15.0 143 144 EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA")); 145 EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS")); 146 EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH")); 147 EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB")); 148 EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA")); 149 EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A")); 150 EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E")); 151 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I")); 152 153 EXPECT_EQ(0x1F984u, map("UNICORN FACE")); 154 EXPECT_EQ(0x00640u, map("ARABIC TATWEEL")); 155 EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU")); 156 EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001")); 157 EXPECT_EQ(0x02235u, map("BECAUSE")); 158 EXPECT_EQ(0x1F514u, map("BELL")); 159 EXPECT_EQ(0x1F9A9u, map("FLAMINGO")); 160 EXPECT_EQ(0x1F9A9u, map("FLAMINGO")); 161 EXPECT_EQ(0x1F402u, map("OX")); // 2 characters 162 EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 163 "ABOVE WITH ALEF MAKSURA ISOLATED FORM")); 164 EXPECT_EQ(0x11F04u, map("KAWI LETTER A")); // Unicode 15.0 165 EXPECT_EQ(0x1FA77u, map("PINK HEART")); // Unicode 15.0 166 167 // Aliases 168 EXPECT_EQ(0x0000u, map("NULL")); 169 EXPECT_EQ(0x0007u, map("ALERT")); 170 EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION")); 171 EXPECT_EQ(0x0009u, map("CHARACTER TABULATION")); 172 EXPECT_EQ(0x000Au, map("LINE FEED")); 173 EXPECT_EQ(0x000Au, map("NEW LINE")); 174 EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION")); 175 EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION")); 176 EXPECT_EQ(0x2118u, 177 map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction 178 EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction 179 EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate 180 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate 181 182 // Should perform exact case match 183 EXPECT_EQ(0xFFFFFFFFu, map("")); 184 EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER")); 185 EXPECT_EQ(0xFFFFFFFFu, map("unicorn face")); 186 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN FaCE")); 187 EXPECT_EQ(0xFFFFFFFFu, map("UNICORNFaCE")); 188 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN")); 189 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE i")); 190 EXPECT_EQ(0xFFFFFFFFu, map("hANGUL SYLLABLE i")); 191 EXPECT_EQ(0xFFFFFFFFu, map("HANGULSYLLABLEI")); 192 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE")); 193 EXPECT_EQ(0xFFFFFFFFu, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D")); 194 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d")); 195 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D")); 196 EXPECT_EQ(0xFFFFFFFF, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER")); 197 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1")); 198 EXPECT_EQ(0xFFFFFFFFu, map("ZERO WIDTH NO BREAK SPACE")); 199 200 // Should not support abbreviations or figments 201 EXPECT_EQ(0xFFFFFFFFu, map("FVS1")); 202 EXPECT_EQ(0xFFFFFFFFu, map("HIGH OCTET PRESET")); 203 EXPECT_EQ(0xFFFFFFFFu, map("BEL")); 204 } 205 206 TEST(Unicode, nameToCodepointLoose) { 207 auto map = [](StringRef Str) { 208 auto Opt = nameToCodepointLooseMatching(Str); 209 if (!Opt) 210 return char32_t(0xFFFF'FFFF); 211 return Opt->CodePoint; 212 }; 213 214 // generated codepoints 215 EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF")); 216 EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00")); 217 EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC")); 218 EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000")); 219 EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD")); 220 EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700")); 221 EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740")); 222 EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400")); 223 EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D")); 224 EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820")); 225 EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1")); 226 EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0")); 227 EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0")); 228 EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000")); 229 EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A")); 230 EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000")); 231 EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7")); 232 EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00")); 233 EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08")); 234 EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00")); 235 EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5")); 236 EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170")); 237 EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB")); 238 EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900")); 239 EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D")); 240 EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70")); 241 EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9")); 242 EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800")); 243 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D")); 244 245 EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA")); 246 EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS")); 247 EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH")); 248 EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB")); 249 EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA")); 250 EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A")); 251 EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E")); 252 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I")); 253 254 EXPECT_EQ(0x1F984u, map("UNICORN FACE")); 255 EXPECT_EQ(0x00640u, map("ARABIC TATWEEL")); 256 EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU")); 257 EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001")); 258 EXPECT_EQ(0x02235u, map("BECAUSE")); 259 EXPECT_EQ(0x1F514u, map("BELL")); 260 EXPECT_EQ(0x1F9A9u, map("FLAMINGO")); 261 EXPECT_EQ(0x1F402u, map("OX")); // 2 characters 262 EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 263 "ABOVE WITH ALEF MAKSURA ISOLATED FORM")); 264 265 // Aliases 266 EXPECT_EQ(0x0000u, map("NULL")); 267 EXPECT_EQ(0x0007u, map("ALERT")); 268 EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION")); 269 EXPECT_EQ(0x0009u, map("CHARACTER TABULATION")); 270 EXPECT_EQ(0x000Au, map("LINE FEED")); 271 EXPECT_EQ(0x000Au, map("NEW LINE")); 272 EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION")); 273 EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION")); 274 EXPECT_EQ(0x2118u, 275 map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction 276 EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction 277 EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate 278 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate 279 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO BREAK SPACE")); // alternate 280 281 // Should perform loose matching 282 EXPECT_EQ(0xFFFFFFFFu, map("")); 283 EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER")); 284 EXPECT_EQ(0x0001F984u, map("unicorn face")); 285 EXPECT_EQ(0x0001F984u, map("UNICORN FaCE")); 286 EXPECT_EQ(0x0001F984u, map("UNICORNFaCE")); 287 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN")); 288 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE i")); 289 EXPECT_EQ(0xC774u, map("hANGUL SYLLABLE i")); 290 EXPECT_EQ(0xC774u, map("HANGULSYLLABLEI")); 291 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE")); 292 293 EXPECT_EQ(0x2FA1Du, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D")); 294 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d")); 295 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D")); 296 297 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER")); 298 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1")); 299 300 // https://unicode.org/reports/tr44/#Matching_Names 301 // UAX44-LM2: Medial hypens are ignored, non medial hyphens are not 302 EXPECT_EQ(0x1FBC5u, map("S-T-I-C-K-F-I-G-U-R-E")); 303 EXPECT_EQ(0xFFFFFFFFu, map("-STICK FIGURE")); 304 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE-")); 305 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE -")); 306 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE --")); 307 EXPECT_EQ(0xFFFFFFFFu, map("STICK--FIGURE")); 308 309 EXPECT_EQ(0x0F68u, map("TIBETAN LETTER A")); 310 EXPECT_EQ(0x0F68u, map("TIBETAN LETTERA")); 311 EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A")); 312 EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A")); 313 EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A")); 314 ; 315 316 // special case 317 EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E")); 318 EXPECT_EQ(0x116Cu, map("HANGUL JUNGSEONG OE")); 319 320 // names that are prefix to existing characters should not match 321 EXPECT_FALSE(nameToCodepointLooseMatching("B")); 322 EXPECT_FALSE(nameToCodepointLooseMatching("BE")); 323 EXPECT_FALSE(nameToCodepointLooseMatching("BEE")); 324 EXPECT_FALSE(nameToCodepointLooseMatching("BEET")); 325 EXPECT_FALSE(nameToCodepointLooseMatching("BEETL")); 326 EXPECT_TRUE(nameToCodepointLooseMatching("BEETLE")); 327 } 328 329 } // namespace 330 331 bool operator==(MatchForCodepointName a, MatchForCodepointName b) { 332 return a.Name == b.Name && a.Distance == b.Distance && a.Value == b.Value; 333 } 334 335 namespace { 336 337 TEST(Unicode, nearestMatchesForCodepointName) { 338 auto Normalize = [](StringRef Name) { 339 std::string Out; 340 Out.reserve(Name.size()); 341 for (char C : Name) { 342 if (isAlnum(C)) 343 Out.push_back(toUpper(C)); 344 } 345 return Out; 346 }; 347 348 auto L = [&](StringRef name) { 349 auto v = nearestMatchesForCodepointName(name, 3); 350 for (auto &r : v) { 351 auto A = Normalize(r.Name); 352 auto B = Normalize(name); 353 EXPECT_EQ(StringRef(A).edit_distance(B, true), r.Distance); 354 } 355 return v; 356 }; 357 using ::testing::ElementsAre; 358 using M = MatchForCodepointName; 359 360 ASSERT_THAT(L(""), ElementsAre(M{"OX", 2, 0x1F402}, M{"ANT", 3, 0x1F41C}, 361 M{"ARC", 3, 0x2312})); 362 // shortest name 363 ASSERT_THAT(L("OX"), ElementsAre(M{"OX", 0, 0x1F402}, M{"AXE", 2, 0x1FA93}, 364 M{"BOY", 2, 0x1F466})); 365 366 // longest name 367 ASSERT_THAT(L("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF " 368 "MAKSURA INITIAL FORM"), 369 ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 370 "ABOVE WITH ALEF MAKSURA INITIAL FORM", 371 0, 0xFBFB}, 372 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 373 "ABOVE WITH ALEF MAKSURA FINAL FORM", 374 4, 0xFBFA}, 375 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 376 "ABOVE WITH ALEF MAKSURA ISOLATED FORM", 377 7, 0xFBF9})); 378 379 // same result with underscore, spaces, etc 380 ASSERT_THAT(L("______ARABICLIGATUREUIGHUR KIRGHIZ YEH with HAMZA ABOVE WITH " 381 "ALEF MAKsURAINITIAL form_"), 382 ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 383 "ABOVE WITH ALEF MAKSURA INITIAL FORM", 384 0, 0xFBFB}, 385 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 386 "ABOVE WITH ALEF MAKSURA FINAL FORM", 387 4, 0xFBFA}, 388 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 389 "ABOVE WITH ALEF MAKSURA ISOLATED FORM", 390 7, 0xFBF9})); 391 392 ASSERT_THAT(L("GREEK CAPITAL LETTER LAMBDA"), 393 ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B}, 394 M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393}, 395 M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391})); 396 397 ASSERT_THAT(L("greekcapitalletter-lambda"), 398 ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B}, 399 M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393}, 400 M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391})); 401 402 // typo http://www.unicode.org/notes/tn27/tn27-5.html 403 ASSERT_THAT( 404 L("PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET"), 405 ElementsAre( 406 M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET", 0, 407 0xFE18}, // typo 408 M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 2, 409 0xFE18}, // correction 410 M{"PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET", 6, 411 0xFE17})); 412 413 // typo http://www.unicode.org/notes/tn27/tn27-5.html 414 ASSERT_THAT( 415 L("BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"), 416 ElementsAre( 417 M{"BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 0, 0x1D0C5}, 418 M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 2, 0x1D0C5}, 419 M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA SYNAFI", 7, 420 0x1D0C6})); 421 } 422 423 } // namespace 424 } // namespace unicode 425 } // namespace sys 426 } // namespace llvm 427