1*7330f729Sjoerg //===--- Encoding.h - Format C++ code ---------------------------*- C++ -*-===//
2*7330f729Sjoerg //
3*7330f729Sjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*7330f729Sjoerg // See https://llvm.org/LICENSE.txt for license information.
5*7330f729Sjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*7330f729Sjoerg //
7*7330f729Sjoerg //===----------------------------------------------------------------------===//
8*7330f729Sjoerg ///
9*7330f729Sjoerg /// \file
10*7330f729Sjoerg /// Contains functions for text encoding manipulation. Supports UTF-8,
11*7330f729Sjoerg /// 8-bit encodings and escape sequences in C++ string literals.
12*7330f729Sjoerg ///
13*7330f729Sjoerg //===----------------------------------------------------------------------===//
14*7330f729Sjoerg
15*7330f729Sjoerg #ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
16*7330f729Sjoerg #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
17*7330f729Sjoerg
18*7330f729Sjoerg #include "clang/Basic/LLVM.h"
19*7330f729Sjoerg #include "llvm/ADT/StringRef.h"
20*7330f729Sjoerg #include "llvm/Support/ConvertUTF.h"
21*7330f729Sjoerg #include "llvm/Support/Unicode.h"
22*7330f729Sjoerg
23*7330f729Sjoerg namespace clang {
24*7330f729Sjoerg namespace format {
25*7330f729Sjoerg namespace encoding {
26*7330f729Sjoerg
/// Text encodings distinguished by the helpers in this file.
enum Encoding {
  /// The text decodes as UTF-8.
  Encoding_UTF8,
  /// Anything else; we treat all other encodings as 8-bit encodings.
  Encoding_Unknown
};
31*7330f729Sjoerg
32*7330f729Sjoerg /// Detects encoding of the Text. If the Text can be decoded using UTF-8,
33*7330f729Sjoerg /// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
detectEncoding(StringRef Text)34*7330f729Sjoerg inline Encoding detectEncoding(StringRef Text) {
35*7330f729Sjoerg const llvm::UTF8 *Ptr = reinterpret_cast<const llvm::UTF8 *>(Text.begin());
36*7330f729Sjoerg const llvm::UTF8 *BufEnd = reinterpret_cast<const llvm::UTF8 *>(Text.end());
37*7330f729Sjoerg if (llvm::isLegalUTF8String(&Ptr, BufEnd))
38*7330f729Sjoerg return Encoding_UTF8;
39*7330f729Sjoerg return Encoding_Unknown;
40*7330f729Sjoerg }
41*7330f729Sjoerg
42*7330f729Sjoerg /// Returns the number of columns required to display the \p Text on a
43*7330f729Sjoerg /// generic Unicode-capable terminal. Text is assumed to use the specified
44*7330f729Sjoerg /// \p Encoding.
columnWidth(StringRef Text,Encoding Encoding)45*7330f729Sjoerg inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
46*7330f729Sjoerg if (Encoding == Encoding_UTF8) {
47*7330f729Sjoerg int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
48*7330f729Sjoerg // FIXME: Figure out the correct way to handle this in the presence of both
49*7330f729Sjoerg // printable and unprintable multi-byte UTF-8 characters. Falling back to
50*7330f729Sjoerg // returning the number of bytes may cause problems, as columnWidth suddenly
51*7330f729Sjoerg // becomes non-additive.
52*7330f729Sjoerg if (ContentWidth >= 0)
53*7330f729Sjoerg return ContentWidth;
54*7330f729Sjoerg }
55*7330f729Sjoerg return Text.size();
56*7330f729Sjoerg }
57*7330f729Sjoerg
58*7330f729Sjoerg /// Returns the number of columns required to display the \p Text,
59*7330f729Sjoerg /// starting from the \p StartColumn on a terminal with the \p TabWidth. The
60*7330f729Sjoerg /// text is assumed to use the specified \p Encoding.
columnWidthWithTabs(StringRef Text,unsigned StartColumn,unsigned TabWidth,Encoding Encoding)61*7330f729Sjoerg inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
62*7330f729Sjoerg unsigned TabWidth, Encoding Encoding) {
63*7330f729Sjoerg unsigned TotalWidth = 0;
64*7330f729Sjoerg StringRef Tail = Text;
65*7330f729Sjoerg for (;;) {
66*7330f729Sjoerg StringRef::size_type TabPos = Tail.find('\t');
67*7330f729Sjoerg if (TabPos == StringRef::npos)
68*7330f729Sjoerg return TotalWidth + columnWidth(Tail, Encoding);
69*7330f729Sjoerg TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
70*7330f729Sjoerg if (TabWidth)
71*7330f729Sjoerg TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
72*7330f729Sjoerg Tail = Tail.substr(TabPos + 1);
73*7330f729Sjoerg }
74*7330f729Sjoerg }
75*7330f729Sjoerg
76*7330f729Sjoerg /// Gets the number of bytes in a sequence representing a single
77*7330f729Sjoerg /// codepoint and starting with FirstChar in the specified Encoding.
getCodePointNumBytes(char FirstChar,Encoding Encoding)78*7330f729Sjoerg inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
79*7330f729Sjoerg switch (Encoding) {
80*7330f729Sjoerg case Encoding_UTF8:
81*7330f729Sjoerg return llvm::getNumBytesForUTF8(FirstChar);
82*7330f729Sjoerg default:
83*7330f729Sjoerg return 1;
84*7330f729Sjoerg }
85*7330f729Sjoerg }
86*7330f729Sjoerg
/// Returns true if \p c is an octal digit ('0' through '7').
inline bool isOctDigit(char c) {
  return c >= '0' && c <= '7';
}
88*7330f729Sjoerg
/// Returns true if \p c is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
inline bool isHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
93*7330f729Sjoerg
94*7330f729Sjoerg /// Gets the length of an escape sequence inside a C++ string literal.
95*7330f729Sjoerg /// Text should span from the beginning of the escape sequence (starting with a
96*7330f729Sjoerg /// backslash) to the end of the string literal.
getEscapeSequenceLength(StringRef Text)97*7330f729Sjoerg inline unsigned getEscapeSequenceLength(StringRef Text) {
98*7330f729Sjoerg assert(Text[0] == '\\');
99*7330f729Sjoerg if (Text.size() < 2)
100*7330f729Sjoerg return 1;
101*7330f729Sjoerg
102*7330f729Sjoerg switch (Text[1]) {
103*7330f729Sjoerg case 'u':
104*7330f729Sjoerg return 6;
105*7330f729Sjoerg case 'U':
106*7330f729Sjoerg return 10;
107*7330f729Sjoerg case 'x': {
108*7330f729Sjoerg unsigned I = 2; // Point after '\x'.
109*7330f729Sjoerg while (I < Text.size() && isHexDigit(Text[I]))
110*7330f729Sjoerg ++I;
111*7330f729Sjoerg return I;
112*7330f729Sjoerg }
113*7330f729Sjoerg default:
114*7330f729Sjoerg if (isOctDigit(Text[1])) {
115*7330f729Sjoerg unsigned I = 1;
116*7330f729Sjoerg while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
117*7330f729Sjoerg ++I;
118*7330f729Sjoerg return I;
119*7330f729Sjoerg }
120*7330f729Sjoerg return 1 + llvm::getNumBytesForUTF8(Text[1]);
121*7330f729Sjoerg }
122*7330f729Sjoerg }
123*7330f729Sjoerg
124*7330f729Sjoerg } // namespace encoding
125*7330f729Sjoerg } // namespace format
126*7330f729Sjoerg } // namespace clang
127*7330f729Sjoerg
128*7330f729Sjoerg #endif
129