//===--- Encoding.h - Format C++ code -------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
/// 8-bit encodings and escape sequences in C++ string literals.
///
//===----------------------------------------------------------------------===//
15f4a2713aSLionel Sambuc
16*0a6a1f1dSLionel Sambuc #ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
17*0a6a1f1dSLionel Sambuc #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
18f4a2713aSLionel Sambuc
19f4a2713aSLionel Sambuc #include "clang/Basic/LLVM.h"
20f4a2713aSLionel Sambuc #include "llvm/Support/ConvertUTF.h"
21f4a2713aSLionel Sambuc #include "llvm/Support/Unicode.h"
22f4a2713aSLionel Sambuc
23f4a2713aSLionel Sambuc namespace clang {
24f4a2713aSLionel Sambuc namespace format {
25f4a2713aSLionel Sambuc namespace encoding {
26f4a2713aSLionel Sambuc
/// \brief Text encodings distinguished by the functions in this file.
enum Encoding {
  Encoding_UTF8,
  Encoding_Unknown // We treat all other encodings as 8-bit encodings.
};
31f4a2713aSLionel Sambuc
32f4a2713aSLionel Sambuc /// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
33f4a2713aSLionel Sambuc /// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
detectEncoding(StringRef Text)34f4a2713aSLionel Sambuc inline Encoding detectEncoding(StringRef Text) {
35f4a2713aSLionel Sambuc const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
36f4a2713aSLionel Sambuc const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
37f4a2713aSLionel Sambuc if (::isLegalUTF8String(&Ptr, BufEnd))
38f4a2713aSLionel Sambuc return Encoding_UTF8;
39f4a2713aSLionel Sambuc return Encoding_Unknown;
40f4a2713aSLionel Sambuc }
41f4a2713aSLionel Sambuc
getCodePointCountUTF8(StringRef Text)42f4a2713aSLionel Sambuc inline unsigned getCodePointCountUTF8(StringRef Text) {
43f4a2713aSLionel Sambuc unsigned CodePoints = 0;
44f4a2713aSLionel Sambuc for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
45f4a2713aSLionel Sambuc ++CodePoints;
46f4a2713aSLionel Sambuc }
47f4a2713aSLionel Sambuc return CodePoints;
48f4a2713aSLionel Sambuc }
49f4a2713aSLionel Sambuc
50f4a2713aSLionel Sambuc /// \brief Gets the number of code points in the Text using the specified
51f4a2713aSLionel Sambuc /// Encoding.
getCodePointCount(StringRef Text,Encoding Encoding)52f4a2713aSLionel Sambuc inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
53f4a2713aSLionel Sambuc switch (Encoding) {
54f4a2713aSLionel Sambuc case Encoding_UTF8:
55f4a2713aSLionel Sambuc return getCodePointCountUTF8(Text);
56f4a2713aSLionel Sambuc default:
57f4a2713aSLionel Sambuc return Text.size();
58f4a2713aSLionel Sambuc }
59f4a2713aSLionel Sambuc }
60f4a2713aSLionel Sambuc
61f4a2713aSLionel Sambuc /// \brief Returns the number of columns required to display the \p Text on a
62f4a2713aSLionel Sambuc /// generic Unicode-capable terminal. Text is assumed to use the specified
63f4a2713aSLionel Sambuc /// \p Encoding.
columnWidth(StringRef Text,Encoding Encoding)64f4a2713aSLionel Sambuc inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
65f4a2713aSLionel Sambuc if (Encoding == Encoding_UTF8) {
66f4a2713aSLionel Sambuc int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
67*0a6a1f1dSLionel Sambuc // FIXME: Figure out the correct way to handle this in the presence of both
68*0a6a1f1dSLionel Sambuc // printable and unprintable multi-byte UTF-8 characters. Falling back to
69*0a6a1f1dSLionel Sambuc // returning the number of bytes may cause problems, as columnWidth suddenly
70*0a6a1f1dSLionel Sambuc // becomes non-additive.
71f4a2713aSLionel Sambuc if (ContentWidth >= 0)
72f4a2713aSLionel Sambuc return ContentWidth;
73f4a2713aSLionel Sambuc }
74f4a2713aSLionel Sambuc return Text.size();
75f4a2713aSLionel Sambuc }
76f4a2713aSLionel Sambuc
77f4a2713aSLionel Sambuc /// \brief Returns the number of columns required to display the \p Text,
78f4a2713aSLionel Sambuc /// starting from the \p StartColumn on a terminal with the \p TabWidth. The
79f4a2713aSLionel Sambuc /// text is assumed to use the specified \p Encoding.
columnWidthWithTabs(StringRef Text,unsigned StartColumn,unsigned TabWidth,Encoding Encoding)80f4a2713aSLionel Sambuc inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
81f4a2713aSLionel Sambuc unsigned TabWidth, Encoding Encoding) {
82f4a2713aSLionel Sambuc unsigned TotalWidth = 0;
83f4a2713aSLionel Sambuc StringRef Tail = Text;
84f4a2713aSLionel Sambuc for (;;) {
85f4a2713aSLionel Sambuc StringRef::size_type TabPos = Tail.find('\t');
86f4a2713aSLionel Sambuc if (TabPos == StringRef::npos)
87f4a2713aSLionel Sambuc return TotalWidth + columnWidth(Tail, Encoding);
88*0a6a1f1dSLionel Sambuc TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
89f4a2713aSLionel Sambuc TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
90f4a2713aSLionel Sambuc Tail = Tail.substr(TabPos + 1);
91f4a2713aSLionel Sambuc }
92f4a2713aSLionel Sambuc }
93f4a2713aSLionel Sambuc
94f4a2713aSLionel Sambuc /// \brief Gets the number of bytes in a sequence representing a single
95f4a2713aSLionel Sambuc /// codepoint and starting with FirstChar in the specified Encoding.
getCodePointNumBytes(char FirstChar,Encoding Encoding)96f4a2713aSLionel Sambuc inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
97f4a2713aSLionel Sambuc switch (Encoding) {
98f4a2713aSLionel Sambuc case Encoding_UTF8:
99f4a2713aSLionel Sambuc return getNumBytesForUTF8(FirstChar);
100f4a2713aSLionel Sambuc default:
101f4a2713aSLionel Sambuc return 1;
102f4a2713aSLionel Sambuc }
103f4a2713aSLionel Sambuc }
104f4a2713aSLionel Sambuc
/// \brief Returns true if \p c is one of the characters '0' through '7'.
inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; }
106f4a2713aSLionel Sambuc
/// \brief Returns true if \p c is a hexadecimal digit: '0'-'9', 'a'-'f' or
/// 'A'-'F'.
inline bool isHexDigit(char c) {
  return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
         ('A' <= c && c <= 'F');
}
111f4a2713aSLionel Sambuc
112f4a2713aSLionel Sambuc /// \brief Gets the length of an escape sequence inside a C++ string literal.
113f4a2713aSLionel Sambuc /// Text should span from the beginning of the escape sequence (starting with a
114f4a2713aSLionel Sambuc /// backslash) to the end of the string literal.
getEscapeSequenceLength(StringRef Text)115f4a2713aSLionel Sambuc inline unsigned getEscapeSequenceLength(StringRef Text) {
116f4a2713aSLionel Sambuc assert(Text[0] == '\\');
117f4a2713aSLionel Sambuc if (Text.size() < 2)
118f4a2713aSLionel Sambuc return 1;
119f4a2713aSLionel Sambuc
120f4a2713aSLionel Sambuc switch (Text[1]) {
121f4a2713aSLionel Sambuc case 'u':
122f4a2713aSLionel Sambuc return 6;
123f4a2713aSLionel Sambuc case 'U':
124f4a2713aSLionel Sambuc return 10;
125f4a2713aSLionel Sambuc case 'x': {
126f4a2713aSLionel Sambuc unsigned I = 2; // Point after '\x'.
127f4a2713aSLionel Sambuc while (I < Text.size() && isHexDigit(Text[I]))
128f4a2713aSLionel Sambuc ++I;
129f4a2713aSLionel Sambuc return I;
130f4a2713aSLionel Sambuc }
131f4a2713aSLionel Sambuc default:
132f4a2713aSLionel Sambuc if (isOctDigit(Text[1])) {
133f4a2713aSLionel Sambuc unsigned I = 1;
134f4a2713aSLionel Sambuc while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
135f4a2713aSLionel Sambuc ++I;
136f4a2713aSLionel Sambuc return I;
137f4a2713aSLionel Sambuc }
138f4a2713aSLionel Sambuc return 2;
139f4a2713aSLionel Sambuc }
140f4a2713aSLionel Sambuc }
141f4a2713aSLionel Sambuc
142f4a2713aSLionel Sambuc } // namespace encoding
143f4a2713aSLionel Sambuc } // namespace format
144f4a2713aSLionel Sambuc } // namespace clang
145f4a2713aSLionel Sambuc
146*0a6a1f1dSLionel Sambuc #endif
147