1 //===- unittests/Support/UnicodeTest.cpp - Unicode.h tests ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/Support/Unicode.h" 10 #include "llvm/ADT/StringExtras.h" 11 #include "llvm/ADT/edit_distance.h" 12 #include "llvm/Support/ConvertUTF.h" 13 #include "gmock/gmock.h" 14 #include "gtest/gtest.h" 15 16 namespace llvm { 17 namespace sys { 18 namespace unicode { 19 namespace { 20 21 TEST(Unicode, columnWidthUTF8) { 22 EXPECT_EQ(0, columnWidthUTF8("")); 23 EXPECT_EQ(1, columnWidthUTF8(" ")); 24 EXPECT_EQ(1, columnWidthUTF8("a")); 25 EXPECT_EQ(1, columnWidthUTF8("~")); 26 27 EXPECT_EQ(6, columnWidthUTF8("abcdef")); 28 29 EXPECT_EQ(-1, columnWidthUTF8("\x01")); 30 EXPECT_EQ(-1, columnWidthUTF8("\t")); 31 EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01")); 32 EXPECT_EQ(-1, columnWidthUTF8("\342\200\213")); // 200B ZERO WIDTH SPACE 33 34 // 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some 35 // text editors display it only when a line is broken at it, some use it as a 36 // line-break hint, but don't display. We choose terminal-oriented 37 // interpretation. 38 EXPECT_EQ(1, columnWidthUTF8("\302\255")); 39 40 EXPECT_EQ(0, columnWidthUTF8("\314\200")); // 0300 COMBINING GRAVE ACCENT 41 EXPECT_EQ(1, columnWidthUTF8("\340\270\201")); // 0E01 THAI CHARACTER KO KAI 42 EXPECT_EQ(2, columnWidthUTF8("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00 43 44 EXPECT_EQ(4, columnWidthUTF8("\344\270\200\344\270\200")); 45 EXPECT_EQ(3, columnWidthUTF8("q\344\270\200")); 46 EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200")); 47 48 // Invalid UTF-8 strings, columnWidthUTF8 should error out. 49 EXPECT_EQ(-2, columnWidthUTF8("\344")); 50 EXPECT_EQ(-2, columnWidthUTF8("\344\270")); 51 EXPECT_EQ(-2, columnWidthUTF8("\344\270\033")); 52 EXPECT_EQ(-2, columnWidthUTF8("\344\270\300")); 53 EXPECT_EQ(-2, columnWidthUTF8("\377\366\355")); 54 55 EXPECT_EQ(-2, columnWidthUTF8("qwer\344")); 56 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270")); 57 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\033")); 58 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\300")); 59 EXPECT_EQ(-2, columnWidthUTF8("qwer\377\366\355")); 60 61 // UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode 62 // characters. 63 EXPECT_EQ(-2, columnWidthUTF8("\370\200\200\200\200")); // U+200000 64 EXPECT_EQ(-2, columnWidthUTF8("\374\200\200\200\200\200")); // U+4000000 65 } 66 67 TEST(Unicode, isPrintable) { 68 EXPECT_FALSE(isPrintable(0)); // <control-0000>-<control-001F> 69 EXPECT_FALSE(isPrintable(0x01)); 70 EXPECT_FALSE(isPrintable(0x1F)); 71 EXPECT_TRUE(isPrintable(' ')); 72 EXPECT_TRUE(isPrintable('A')); 73 EXPECT_TRUE(isPrintable('~')); 74 EXPECT_FALSE(isPrintable(0x7F)); // <control-007F>..<control-009F> 75 EXPECT_FALSE(isPrintable(0x90)); 76 EXPECT_FALSE(isPrintable(0x9F)); 77 78 EXPECT_TRUE(isPrintable(0xAC)); 79 EXPECT_TRUE(isPrintable(0xAD)); // SOFT HYPHEN is displayed on most terminals 80 // as either a space or a dash. 81 EXPECT_TRUE(isPrintable(0xAE)); 82 83 EXPECT_TRUE(isPrintable(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 84 EXPECT_FALSE(isPrintable(0x0378)); // <reserved-0378>..<reserved-0379> 85 86 EXPECT_FALSE(isPrintable(0x0600)); // ARABIC NUMBER SIGN 87 88 EXPECT_FALSE(isPrintable(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF> 89 EXPECT_TRUE(isPrintable(0x20000)); // CJK UNIFIED IDEOGRAPH-20000 90 91 EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter 92 93 // test the validity of a fast path in columnWidthUTF8 94 for (unsigned char c = 0; c < 128; ++c) { 95 const UTF8 buf8[2] = {c, 0}; 96 const UTF8 *Target8 = &buf8[0]; 97 UTF32 buf32[1]; 98 UTF32 *Target32 = &buf32[0]; 99 auto status = ConvertUTF8toUTF32(&Target8, Target8 + 1, &Target32, 100 Target32 + 1, strictConversion); 101 EXPECT_EQ(status, conversionOK); 102 EXPECT_EQ((columnWidthUTF8(reinterpret_cast<const char *>(buf8)) == 1), 103 (bool)isPrintable(buf32[0])); 104 } 105 } 106 107 TEST(Unicode, nameToCodepointStrict) { 108 auto map = [](StringRef Str) { 109 return nameToCodepointStrict(Str).getValueOr(0xFFFF'FFFF); 110 }; 111 112 // generated codepoints 113 EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400")); 114 EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF")); 115 EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00")); 116 EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC")); 117 EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000")); 118 EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD")); 119 EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700")); 120 EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740")); 121 EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D")); 122 EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820")); 123 EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1")); 124 EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0")); 125 EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0")); 126 EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000")); 127 EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A")); 128 EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000")); 129 EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7")); 130 EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00")); 131 EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08")); 132 EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00")); 133 EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5")); 134 EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170")); 135 EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB")); 136 EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900")); 137 EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D")); 138 EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70")); 139 EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9")); 140 EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800")); 141 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D")); 142 143 EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA")); 144 EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS")); 145 EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH")); 146 EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB")); 147 EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA")); 148 EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A")); 149 EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E")); 150 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I")); 151 152 EXPECT_EQ(0x1F984u, map("UNICORN FACE")); 153 EXPECT_EQ(0x00640u, map("ARABIC TATWEEL")); 154 EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU")); 155 EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001")); 156 EXPECT_EQ(0x02235u, map("BECAUSE")); 157 EXPECT_EQ(0x1F514u, map("BELL")); 158 EXPECT_EQ(0x1F9A9u, map("FLAMINGO")); 159 EXPECT_EQ(0x1F402u, map("OX")); // 2 characters 160 EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 161 "ABOVE WITH ALEF MAKSURA ISOLATED FORM")); 162 163 // Aliases 164 EXPECT_EQ(0x0000u, map("NULL")); 165 EXPECT_EQ(0x0007u, map("ALERT")); 166 EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION")); 167 EXPECT_EQ(0x0009u, map("CHARACTER TABULATION")); 168 EXPECT_EQ(0x000Au, map("LINE FEED")); 169 EXPECT_EQ(0x000Au, map("NEW LINE")); 170 EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION")); 171 EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION")); 172 EXPECT_EQ(0x2118u, 173 map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction 174 EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction 175 EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate 176 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate 177 178 // Should perform exact case match 179 EXPECT_EQ(0xFFFFFFFFu, map("")); 180 EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER")); 181 EXPECT_EQ(0xFFFFFFFFu, map("unicorn face")); 182 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN FaCE")); 183 EXPECT_EQ(0xFFFFFFFFu, map("UNICORNFaCE")); 184 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN")); 185 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE i")); 186 EXPECT_EQ(0xFFFFFFFFu, map("hANGUL SYLLABLE i")); 187 EXPECT_EQ(0xFFFFFFFFu, map("HANGULSYLLABLEI")); 188 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE")); 189 EXPECT_EQ(0xFFFFFFFFu, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D")); 190 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d")); 191 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D")); 192 EXPECT_EQ(0xFFFFFFFF, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER")); 193 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1")); 194 EXPECT_EQ(0xFFFFFFFFu, map("ZERO WIDTH NO BREAK SPACE")); 195 196 // Should not support abbreviations or figments 197 EXPECT_EQ(0xFFFFFFFFu, map("FVS1")); 198 EXPECT_EQ(0xFFFFFFFFu, map("HIGH OCTET PRESET")); 199 EXPECT_EQ(0xFFFFFFFFu, map("BEL")); 200 } 201 202 TEST(Unicode, nameToCodepointLoose) { 203 auto map = [](StringRef Str) { 204 auto Opt = nameToCodepointLooseMatching(Str); 205 if (!Opt) 206 return char32_t(0xFFFF'FFFF); 207 return Opt->CodePoint; 208 }; 209 210 // generated codepoints 211 EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF")); 212 EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00")); 213 EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC")); 214 EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000")); 215 EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD")); 216 EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700")); 217 EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740")); 218 EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400")); 219 EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D")); 220 EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820")); 221 EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1")); 222 EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0")); 223 EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0")); 224 EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000")); 225 EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A")); 226 EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000")); 227 EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7")); 228 EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00")); 229 EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08")); 230 EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00")); 231 EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5")); 232 EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170")); 233 EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB")); 234 EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900")); 235 EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D")); 236 EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70")); 237 EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9")); 238 EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800")); 239 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D")); 240 241 EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA")); 242 EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS")); 243 EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH")); 244 EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB")); 245 EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA")); 246 EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A")); 247 EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E")); 248 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I")); 249 250 EXPECT_EQ(0x1F984u, map("UNICORN FACE")); 251 EXPECT_EQ(0x00640u, map("ARABIC TATWEEL")); 252 EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU")); 253 EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001")); 254 EXPECT_EQ(0x02235u, map("BECAUSE")); 255 EXPECT_EQ(0x1F514u, map("BELL")); 256 EXPECT_EQ(0x1F9A9u, map("FLAMINGO")); 257 EXPECT_EQ(0x1F402u, map("OX")); // 2 characters 258 EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 259 "ABOVE WITH ALEF MAKSURA ISOLATED FORM")); 260 261 // Aliases 262 EXPECT_EQ(0x0000u, map("NULL")); 263 EXPECT_EQ(0x0007u, map("ALERT")); 264 EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION")); 265 EXPECT_EQ(0x0009u, map("CHARACTER TABULATION")); 266 EXPECT_EQ(0x000Au, map("LINE FEED")); 267 EXPECT_EQ(0x000Au, map("NEW LINE")); 268 EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION")); 269 EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION")); 270 EXPECT_EQ(0x2118u, 271 map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction 272 EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction 273 EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate 274 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate 275 EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO BREAK SPACE")); // alternate 276 277 // Should perform loose matching 278 EXPECT_EQ(0xFFFFFFFFu, map("")); 279 EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER")); 280 EXPECT_EQ(0x0001F984u, map("unicorn face")); 281 EXPECT_EQ(0x0001F984u, map("UNICORN FaCE")); 282 EXPECT_EQ(0x0001F984u, map("UNICORNFaCE")); 283 EXPECT_EQ(0xFFFFFFFFu, map("UNICORN")); 284 EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE i")); 285 EXPECT_EQ(0xC774u, map("hANGUL SYLLABLE i")); 286 EXPECT_EQ(0xC774u, map("HANGULSYLLABLEI")); 287 EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE")); 288 289 EXPECT_EQ(0x2FA1Du, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D")); 290 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d")); 291 EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D")); 292 293 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER")); 294 EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1")); 295 296 // https://unicode.org/reports/tr44/#Matching_Names 297 // UAX44-LM2: Medial hypens are ignored, non medial hyphens are not 298 EXPECT_EQ(0x1FBC5u, map("S-T-I-C-K-F-I-G-U-R-E")); 299 EXPECT_EQ(0xFFFFFFFFu, map("-STICK FIGURE")); 300 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE-")); 301 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE -")); 302 EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE --")); 303 EXPECT_EQ(0xFFFFFFFFu, map("STICK--FIGURE")); 304 305 EXPECT_EQ(0x0F68u, map("TIBETAN LETTER A")); 306 EXPECT_EQ(0x0F68u, map("TIBETAN LETTERA")); 307 EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A")); 308 EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A")); 309 EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A")); 310 ; 311 312 // special case 313 EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E")); 314 EXPECT_EQ(0x116Cu, map("HANGUL JUNGSEONG OE")); 315 316 // names that are prefix to existing characters should not match 317 EXPECT_FALSE(nameToCodepointLooseMatching("B")); 318 EXPECT_FALSE(nameToCodepointLooseMatching("BE")); 319 EXPECT_FALSE(nameToCodepointLooseMatching("BEE")); 320 EXPECT_FALSE(nameToCodepointLooseMatching("BEET")); 321 EXPECT_FALSE(nameToCodepointLooseMatching("BEETL")); 322 EXPECT_TRUE(nameToCodepointLooseMatching("BEETLE")); 323 } 324 325 } // namespace 326 327 bool operator==(MatchForCodepointName a, MatchForCodepointName b) { 328 return a.Name == b.Name && a.Distance == b.Distance && a.Value == b.Value; 329 } 330 331 namespace { 332 333 TEST(Unicode, nearestMatchesForCodepointName) { 334 auto Normalize = [](StringRef Name) { 335 std::string Out; 336 Out.reserve(Name.size()); 337 for (char C : Name) { 338 if (isAlnum(C)) 339 Out.push_back(toUpper(C)); 340 } 341 return Out; 342 }; 343 344 auto L = [&](StringRef name) { 345 auto v = nearestMatchesForCodepointName(name, 3); 346 for (auto &r : v) { 347 auto A = Normalize(r.Name); 348 auto B = Normalize(name); 349 EXPECT_EQ(StringRef(A).edit_distance(B, true), r.Distance); 350 } 351 return v; 352 }; 353 using ::testing::ElementsAre; 354 using M = MatchForCodepointName; 355 356 ASSERT_THAT(L(""), ElementsAre(M{"OX", 2, 0x1F402}, M{"ANT", 3, 0x1F41C}, 357 M{"ARC", 3, 0x2312})); 358 // shortest name 359 ASSERT_THAT(L("OX"), ElementsAre(M{"OX", 0, 0x1F402}, M{"AXE", 2, 0x1FA93}, 360 M{"BOY", 2, 0x1F466})); 361 362 // longest name 363 ASSERT_THAT(L("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF " 364 "MAKSURA INITIAL FORM"), 365 ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 366 "ABOVE WITH ALEF MAKSURA INITIAL FORM", 367 0, 0xFBFB}, 368 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 369 "ABOVE WITH ALEF MAKSURA FINAL FORM", 370 4, 0xFBFA}, 371 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 372 "ABOVE WITH ALEF MAKSURA ISOLATED FORM", 373 7, 0xFBF9})); 374 375 // same result with underscore, spaces, etc 376 ASSERT_THAT(L("______ARABICLIGATUREUIGHUR KIRGHIZ YEH with HAMZA ABOVE WITH " 377 "ALEF MAKsURAINITIAL form_"), 378 ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 379 "ABOVE WITH ALEF MAKSURA INITIAL FORM", 380 0, 0xFBFB}, 381 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 382 "ABOVE WITH ALEF MAKSURA FINAL FORM", 383 4, 0xFBFA}, 384 M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA " 385 "ABOVE WITH ALEF MAKSURA ISOLATED FORM", 386 7, 0xFBF9})); 387 388 ASSERT_THAT(L("GREEK CAPITAL LETTER LAMBDA"), 389 ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B}, 390 M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393}, 391 M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391})); 392 393 ASSERT_THAT(L("greekcapitalletter-lambda"), 394 ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B}, 395 M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393}, 396 M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391})); 397 398 // typo http://www.unicode.org/notes/tn27/tn27-5.html 399 ASSERT_THAT( 400 L("PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET"), 401 ElementsAre( 402 M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET", 0, 403 0xFE18}, // typo 404 M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 2, 405 0xFE18}, // correction 406 M{"PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET", 6, 407 0xFE17})); 408 409 // typo http://www.unicode.org/notes/tn27/tn27-5.html 410 ASSERT_THAT( 411 L("BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"), 412 ElementsAre( 413 M{"BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 0, 0x1D0C5}, 414 M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 2, 0x1D0C5}, 415 M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA SYNAFI", 7, 416 0x1D0C6})); 417 } 418 419 } // namespace 420 } // namespace unicode 421 } // namespace sys 422 } // namespace llvm 423