1 //===- unittests/Support/UnicodeTest.cpp - Unicode.h tests ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/Support/Unicode.h" 10 #include "llvm/Support/ConvertUTF.h" 11 #include "gtest/gtest.h" 12 13 namespace llvm { 14 namespace sys { 15 namespace unicode { 16 namespace { 17 18 TEST(Unicode, columnWidthUTF8) { 19 EXPECT_EQ(0, columnWidthUTF8("")); 20 EXPECT_EQ(1, columnWidthUTF8(" ")); 21 EXPECT_EQ(1, columnWidthUTF8("a")); 22 EXPECT_EQ(1, columnWidthUTF8("~")); 23 24 EXPECT_EQ(6, columnWidthUTF8("abcdef")); 25 26 EXPECT_EQ(-1, columnWidthUTF8("\x01")); 27 EXPECT_EQ(-1, columnWidthUTF8("\t")); 28 EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01")); 29 EXPECT_EQ(-1, columnWidthUTF8("\342\200\213")); // 200B ZERO WIDTH SPACE 30 31 // 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some 32 // text editors display it only when a line is broken at it, some use it as a 33 // line-break hint, but don't display. We choose terminal-oriented 34 // interpretation. 35 EXPECT_EQ(1, columnWidthUTF8("\302\255")); 36 37 EXPECT_EQ(0, columnWidthUTF8("\314\200")); // 0300 COMBINING GRAVE ACCENT 38 EXPECT_EQ(1, columnWidthUTF8("\340\270\201")); // 0E01 THAI CHARACTER KO KAI 39 EXPECT_EQ(2, columnWidthUTF8("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00 40 41 EXPECT_EQ(4, columnWidthUTF8("\344\270\200\344\270\200")); 42 EXPECT_EQ(3, columnWidthUTF8("q\344\270\200")); 43 EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200")); 44 45 // Invalid UTF-8 strings, columnWidthUTF8 should error out. 46 EXPECT_EQ(-2, columnWidthUTF8("\344")); 47 EXPECT_EQ(-2, columnWidthUTF8("\344\270")); 48 EXPECT_EQ(-2, columnWidthUTF8("\344\270\033")); 49 EXPECT_EQ(-2, columnWidthUTF8("\344\270\300")); 50 EXPECT_EQ(-2, columnWidthUTF8("\377\366\355")); 51 52 EXPECT_EQ(-2, columnWidthUTF8("qwer\344")); 53 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270")); 54 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\033")); 55 EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\300")); 56 EXPECT_EQ(-2, columnWidthUTF8("qwer\377\366\355")); 57 58 // UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode 59 // characters. 60 EXPECT_EQ(-2, columnWidthUTF8("\370\200\200\200\200")); // U+200000 61 EXPECT_EQ(-2, columnWidthUTF8("\374\200\200\200\200\200")); // U+4000000 62 } 63 64 TEST(Unicode, isPrintable) { 65 EXPECT_FALSE(isPrintable(0)); // <control-0000>-<control-001F> 66 EXPECT_FALSE(isPrintable(0x01)); 67 EXPECT_FALSE(isPrintable(0x1F)); 68 EXPECT_TRUE(isPrintable(' ')); 69 EXPECT_TRUE(isPrintable('A')); 70 EXPECT_TRUE(isPrintable('~')); 71 EXPECT_FALSE(isPrintable(0x7F)); // <control-007F>..<control-009F> 72 EXPECT_FALSE(isPrintable(0x90)); 73 EXPECT_FALSE(isPrintable(0x9F)); 74 75 EXPECT_TRUE(isPrintable(0xAC)); 76 EXPECT_TRUE(isPrintable(0xAD)); // SOFT HYPHEN is displayed on most terminals 77 // as either a space or a dash. 78 EXPECT_TRUE(isPrintable(0xAE)); 79 80 EXPECT_TRUE(isPrintable(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 81 EXPECT_FALSE(isPrintable(0x0378)); // <reserved-0378>..<reserved-0379> 82 83 EXPECT_FALSE(isPrintable(0x0600)); // ARABIC NUMBER SIGN 84 85 EXPECT_FALSE(isPrintable(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF> 86 EXPECT_TRUE(isPrintable(0x20000)); // CJK UNIFIED IDEOGRAPH-20000 87 88 EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter 89 90 // test the validity of a fast path in columnWidthUTF8 91 for (unsigned char c = 0; c < 128; ++c) { 92 const UTF8 buf8[2] = {c, 0}; 93 const UTF8 *Target8 = &buf8[0]; 94 UTF32 buf32[1]; 95 UTF32 *Target32 = &buf32[0]; 96 auto status = ConvertUTF8toUTF32(&Target8, Target8 + 1, &Target32, 97 Target32 + 1, strictConversion); 98 EXPECT_TRUE(status == conversionOK); 99 EXPECT_TRUE((columnWidthUTF8(reinterpret_cast<const char *>(buf8)) == 1) == 100 (bool)isPrintable(buf32[0])); 101 } 102 } 103 104 } // namespace 105 } // namespace unicode 106 } // namespace sys 107 } // namespace llvm 108