xref: /llvm-project/clang/lib/Format/BreakableToken.h (revision 0fba8381d2a71ff440fdf0ae30d59a0bf07fea75)
1 //===--- BreakableToken.h - Format C++ code ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// Declares BreakableToken, BreakableStringLiteral, BreakableComment,
11 /// BreakableBlockComment and BreakableLineCommentSection classes, that contain
12 /// token type-specific logic to break long lines in tokens and reflow content
13 /// between tokens.
14 ///
15 //===----------------------------------------------------------------------===//
16 
17 #ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
18 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
19 
20 #include "Encoding.h"
21 #include "WhitespaceManager.h"
22 #include "llvm/ADT/StringSet.h"
23 
24 namespace clang {
25 namespace format {
26 
27 /// Checks if \p Token switches formatting, like /* clang-format off */.
28 /// \p Token must be a comment.
29 bool switchesFormatting(const FormatToken &Token);
30 
   // Forward declaration of the formatting style type; the breakable token
   // classes below store a reference to it.
31 struct FormatStyle;
32 
33 /// Base class for tokens / ranges of tokens that can allow breaking
34 /// within the tokens - for example, to avoid whitespace beyond the column
35 /// limit, or to reflow text.
36 ///
37 /// Generally, a breakable token consists of logical lines, addressed by a line
38 /// index. For example, in a sequence of line comments, each line comment is its
39 /// own logical line; similarly, for a block comment, each line in the block
40 /// comment is on its own logical line.
41 ///
42 /// There are two methods to compute the layout of the token:
43 /// - getRangeLength measures the number of columns needed for a range of text
44 ///   within a logical line, and
45 /// - getContentStartColumn returns the start column at which we want the
46 ///   content of a logical line to start (potentially after introducing a line
47 ///   break).
48 ///
49 /// The mechanism to adapt the layout of the breakable token is organised
50 /// around the concept of a \c Split, which is a whitespace range that signifies
51 /// a position of the content of a token where a reformatting might be done.
52 ///
53 /// Operating with splits is divided into two operations:
54 /// - getSplit, for finding a split starting at a position,
55 /// - insertBreak, for executing the split using a whitespace manager.
56 ///
57 /// There is a pair of operations that are used to compress a long whitespace
58 /// range into a single space if that will bring the line length under the
59 /// column limit:
60 /// - getLengthAfterCompression, for calculating the size in columns of the
61 ///   line after a whitespace range has been compressed, and
62 /// - compressWhitespace, for executing the whitespace compression using a
63 ///   whitespace manager; note that the compressed whitespace may be in the
64 ///   middle of the original line and of the reformatted line.
65 ///
66 /// For tokens where the whitespace before each line needs to be also
67 /// reformatted, for example for tokens supporting reflow, there are analogous
68 /// operations that might be executed before the main line breaking occurs:
69 /// - getReflowSplit, for finding a split such that the content preceding it
70 ///   needs to be specially reflown,
71 /// - reflow, for executing the split using a whitespace manager,
72 /// - introducesBreakBeforeToken, for checking if reformatting the beginning
73 ///   of the content introduces a line break before it,
74 /// - adaptStartOfLine, for executing the reflow using a whitespace
75 ///   manager.
76 ///
77 /// For tokens that require the whitespace after the last line to be
78 /// reformatted, for example in multiline jsdoc comments that require the
79 /// trailing '*/' to be on a line of itself, there are analogous operations
80 /// that might be executed after the last line has been reformatted:
81 /// - getSplitAfterLastLine, for finding a split after the last line that needs
82 ///   to be reflown,
83 /// - replaceWhitespaceAfterLastLine, for executing the reflow using a
84 ///   whitespace manager.
85 ///
86 class BreakableToken {
87 public:
88   /// Contains starting character index and length of split.
89   typedef std::pair<StringRef::size_type, unsigned> Split;
90 
     /// Virtual destructor, since this class is used as a polymorphic base.
91   virtual ~BreakableToken() {}
92 
93   /// Returns the number of lines in this token in the original code.
94   virtual unsigned getLineCount() const = 0;
95 
96   /// Returns the number of columns required to format the text in the
97   /// byte range [\p Offset, \p Offset \c + \p Length).
98   ///
99   /// \p Offset is the byte offset from the start of the content of the line
100   ///    at \p LineIndex.
101   ///
102   /// \p StartColumn is the column at which the text starts in the formatted
103   ///    file, needed to compute tab stops correctly.
104   virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
105                                   StringRef::size_type Length,
106                                   unsigned StartColumn) const = 0;
107 
108   /// Returns the number of columns required to format the text following
109   /// the byte \p Offset in the line \p LineIndex, including potentially
110   /// unbreakable sequences of tokens following after the end of the token.
111   ///
112   /// \p Offset is the byte offset from the start of the content of the line
113   ///    at \p LineIndex.
114   ///
115   /// \p StartColumn is the column at which the text starts in the formatted
116   ///    file, needed to compute tab stops correctly.
117   ///
118   /// For breakable tokens that never use extra space at the end of a line, this
119   /// is equivalent to getRangeLength with a Length of StringRef::npos.
120   virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
121                                       unsigned StartColumn) const {
122     return getRangeLength(LineIndex, Offset, StringRef::npos, StartColumn);
123   }
124 
125   /// Returns the column at which content in line \p LineIndex starts,
126   /// assuming no reflow.
127   ///
128   /// If \p Break is true, returns the column at which the line should start
129   /// after the line break.
130   /// If \p Break is false, returns the column at which the line itself will
131   /// start.
132   virtual unsigned getContentStartColumn(unsigned LineIndex,
133                                          bool Break) const = 0;
134 
135   /// Returns additional content indent required for the second line after the
136   /// content at line \p LineIndex is broken.
      ///
      /// The default implementation returns 0.
137   ///
138   // (Next lines do not start with `///` since otherwise -Wdocumentation picks
139   // up the example annotations and generates warnings for them)
140   // For example, Javadoc @param annotations require an indent of 4 spaces and
141   // in this example getContentIndent(1) returns 4.
142   // /**
143   //  * @param loooooooooooooong line
144   //  *     continuation
145   //  */
146   virtual unsigned getContentIndent(unsigned LineIndex) const { return 0; }
147 
148   /// Returns a range (offset, length) at which to break the line at
149   /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
150   /// violate \p ColumnLimit, assuming the text starting at \p TailOffset in
151   /// the token is formatted starting at ContentStartColumn in the reformatted
152   /// file.
153   virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
154                          unsigned ColumnLimit, unsigned ContentStartColumn,
155                          const llvm::Regex &CommentPragmasRegex) const = 0;
156 
157   /// Emits the previously retrieved \p Split via \p Whitespaces.
158   virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
159                            unsigned ContentIndent,
160                            WhitespaceManager &Whitespaces) const = 0;
161 
162   /// Returns the number of columns needed to format
163   /// \p RemainingTokenColumns, assuming that Split is within the range measured
164   /// by \p RemainingTokenColumns, and that the whitespace in Split is reduced
165   /// to a single space.
166   unsigned getLengthAfterCompression(unsigned RemainingTokenColumns,
167                                      Split Split) const;
168 
169   /// Replaces the whitespace range described by \p Split with a single
170   /// space.
171   virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset,
172                                   Split Split,
173                                   WhitespaceManager &Whitespaces) const = 0;
174 
175   /// Returns whether the token supports reflowing text.
      ///
      /// The default implementation returns false.
176   virtual bool supportsReflow() const { return false; }
177 
178   /// Returns a whitespace range (offset, length) of the content at \p
179   /// LineIndex such that the content of that line is reflown to the end of the
180   /// previous one.
181   ///
182   /// Returning (StringRef::npos, 0) indicates reflowing is not possible.
      /// This is what the default implementation returns.
183   ///
184   /// The range will include any whitespace preceding the specified line's
185   /// content.
186   ///
187   /// If the split is not contained within one token, for example when reflowing
188   /// line comments, returns (0, <length>).
189   virtual Split getReflowSplit(unsigned LineIndex,
190                                const llvm::Regex &CommentPragmasRegex) const {
191     return Split(StringRef::npos, 0);
192   }
193 
194   /// Reflows the current line into the end of the previous one.
      ///
      /// The default implementation does nothing.
195   virtual void reflow(unsigned LineIndex,
196                       WhitespaceManager &Whitespaces) const {}
197 
198   /// Returns whether there will be a line break at the start of the
199   /// token.
      ///
      /// The default implementation returns false.
200   virtual bool introducesBreakBeforeToken() const { return false; }
201 
202   /// Replaces the whitespace between \p LineIndex-1 and \p LineIndex.
      ///
      /// The default implementation does nothing.
203   virtual void adaptStartOfLine(unsigned LineIndex,
204                                 WhitespaceManager &Whitespaces) const {}
205 
206   /// Returns a whitespace range (offset, length) of the content at
207   /// the last line that needs to be reformatted after the last line has been
208   /// reformatted.
209   ///
210   /// A result having offset == StringRef::npos means that no reformat is
211   /// necessary.
      /// The default implementation returns (StringRef::npos, 0).
212   virtual Split getSplitAfterLastLine(unsigned TailOffset) const {
213     return Split(StringRef::npos, 0);
214   }
215 
216   /// Replaces the whitespace from \p SplitAfterLastLine on the last line
217   /// after the last line has been formatted by performing a reformatting.
      ///
      /// This forwards to insertBreak for the last line (index
      /// getLineCount() - 1) with a content indent of 0.
218   void replaceWhitespaceAfterLastLine(unsigned TailOffset,
219                                       Split SplitAfterLastLine,
220                                       WhitespaceManager &Whitespaces) const {
221     insertBreak(getLineCount() - 1, TailOffset, SplitAfterLastLine,
222                 /*ContentIndent=*/0, Whitespaces);
223   }
224 
225   /// Updates the next token of \p State to the next token after this
226   /// one. This can be used when this token manages a set of underlying tokens
227   /// as a unit and is responsible for the formatting of them.
      ///
      /// The default implementation does nothing.
228   virtual void updateNextToken(LineState &State) const {}
229 
230   /// Adds replacements that are needed when the token is broken, such as
231   /// wrapping a JavaScript string in parentheses after it gets broken with plus
232   /// signs.
      ///
      /// The default implementation does nothing.
233   virtual void updateAfterBroken(WhitespaceManager &Whitespaces) const {}
234 
235 protected:
      /// Stores the given arguments in the members below. The constructor is
      /// protected, so objects are only created through derived classes.
236   BreakableToken(const FormatToken &Tok, bool InPPDirective,
237                  encoding::Encoding Encoding, const FormatStyle &Style)
238       : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
239         Style(Style) {}
240 
      // Values captured at construction. Tok and Style are held by reference, so
      // the referenced objects have to outlive this object.
241   const FormatToken &Tok;
242   const bool InPPDirective;
243   const encoding::Encoding Encoding;
244   const FormatStyle &Style;
245 };
246 
/// Breakable token for a single line string literal.
///
/// compressWhitespace is overridden with an empty body, so whitespace
/// compression does nothing for this token type.
247 class BreakableStringLiteral : public BreakableToken {
248 public:
249   /// Creates a breakable token for a single line string literal.
250   ///
251   /// \p StartColumn specifies the column in which the token will start
252   /// after formatting.
253   BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
254                          StringRef Prefix, StringRef Postfix,
255                          unsigned UnbreakableTailLength, bool InPPDirective,
256                          encoding::Encoding Encoding, const FormatStyle &Style);
257 
258   Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
259                  unsigned ContentStartColumn,
260                  const llvm::Regex &CommentPragmasRegex) const override;
261   void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
262                    unsigned ContentIndent,
263                    WhitespaceManager &Whitespaces) const override;
      // Deliberately empty: there is nothing to do for this token type.
264   void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
265                           WhitespaceManager &Whitespaces) const override {}
266   unsigned getLineCount() const override;
267   unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
268                           StringRef::size_type Length,
269                           unsigned StartColumn) const override;
      // NOTE(review): presumably accounts for UnbreakableTailLength; the
      // definition is not in this header, so confirm there.
270   unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
271                               unsigned StartColumn) const override;
272   unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
273 
274 protected:
275   // The column in which the token starts.
276   unsigned StartColumn;
277   // The prefix a line needs after a break in the token.
278   StringRef Prefix;
279   // The postfix a line needs before introducing a break.
280   StringRef Postfix;
281   // The token text excluding the prefix and postfix.
282   StringRef Line;
283   // Length of the sequence of tokens after this string literal that cannot
284   // contain line breaks.
285   unsigned UnbreakableTailLength;
286 };
287 
/// Breakable token for a single line string literal in C#, Java, JavaScript,
/// or Verilog (the languages named in the constructor documentation).
///
/// NOTE(review): judging by the class name and the member comments, the pieces
/// of a broken literal are joined with operators and may be wrapped in braces
/// or parentheses; the definitions are not in this header, so confirm there.
288 class BreakableStringLiteralUsingOperators : public BreakableStringLiteral {
289 public:
      /// The kind of quoting used by the string literal.
290   enum QuoteStyleType {
291     DoubleQuotes,   // The string is quoted with double quotes.
292     SingleQuotes,   // The JavaScript string is quoted with single quotes.
293     AtDoubleQuotes, // The C# verbatim string is quoted with the at sign and
294                     // double quotes.
295   };
296   /// Creates a breakable token for a single line string literal for C#, Java,
297   /// JavaScript, or Verilog.
298   ///
299   /// \p StartColumn specifies the column in which the token will start
300   /// after formatting.
301   BreakableStringLiteralUsingOperators(
302       const FormatToken &Tok, QuoteStyleType QuoteStyle, bool UnindentPlus,
303       unsigned StartColumn, unsigned UnbreakableTailLength, bool InPPDirective,
304       encoding::Encoding Encoding, const FormatStyle &Style);
305   unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
306                               unsigned StartColumn) const override;
307   unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
308   void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
309                    unsigned ContentIndent,
310                    WhitespaceManager &Whitespaces) const override;
311   void updateAfterBroken(WhitespaceManager &Whitespaces) const override;
312 
313 protected:
314   // Whether braces or parentheses should be inserted around the string to form
315   // a concatenation.
316   bool BracesNeeded;
      // The quoting kind of the literal, as passed to the constructor.
317   QuoteStyleType QuoteStyle;
318   // The braces or parentheses along with the first character which they
319   // replace, either a quote or at sign.
320   StringRef LeftBraceQuote;
321   StringRef RightBraceQuote;
322   // Width added to the left due to the added brace or parenthesis. Does not
323   // apply to the first line.
324   int ContinuationIndent;
325 };
326 
/// Common base class of BreakableBlockComment and BreakableLineCommentSection.
///
/// Reports that reflowing is supported and holds the per-line state (Lines,
/// Content, Tokens, ContentColumn) used by both kinds of comments.
327 class BreakableComment : public BreakableToken {
328 protected:
329   /// Creates a breakable token for a comment.
330   ///
331   /// \p StartColumn specifies the column in which the comment will start after
332   /// formatting.
333   BreakableComment(const FormatToken &Token, unsigned StartColumn,
334                    bool InPPDirective, encoding::Encoding Encoding,
335                    const FormatStyle &Style);
336 
337 public:
      // Comments support reflowing, unlike the BreakableToken default.
338   bool supportsReflow() const override { return true; }
339   unsigned getLineCount() const override;
340   Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
341                  unsigned ContentStartColumn,
342                  const llvm::Regex &CommentPragmasRegex) const override;
343   void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
344                           WhitespaceManager &Whitespaces) const override;
345 
346 protected:
347   // Returns the token containing the line at LineIndex.
348   const FormatToken &tokenAt(unsigned LineIndex) const;
349 
350   // Checks if the content of line LineIndex may be reflown with the previous
351   // line.
352   virtual bool mayReflow(unsigned LineIndex,
353                          const llvm::Regex &CommentPragmasRegex) const = 0;
354 
355   // Contains the original text of the lines of the comment.
356   //
357   // In case of block comments, excludes the leading /* in the first line and
358   // trailing */ in the last line. In case of line comments, excludes the
359   // leading // and spaces.
360   SmallVector<StringRef, 16> Lines;
361 
362   // Contains the text of the lines excluding all leading and trailing
363   // whitespace between the lines. Note that the decoration (if present) is also
364   // not considered part of the text.
365   SmallVector<StringRef, 16> Content;
366 
367   // Tokens[i] contains a pointer to the token containing Lines[i] if the
368   // whitespace range before that token is managed by this block.
369   // Otherwise, Tokens[i] is a null pointer.
370   SmallVector<FormatToken *, 16> Tokens;
371 
372   // ContentColumn[i] is the target column at which Content[i] should be.
373   // Note that this excludes a leading "* " or "*" in case of block comments
374   // where all lines have a "*" prefix, or the leading "// " or "//" in case of
375   // line comments.
376   //
377   // In block comments, the first line's target column is always positive. The
378   // remaining lines' target columns are relative to the first line to allow
379   // correct indentation of comments in \c WhitespaceManager. Thus they can be
380   // negative as well (in case the first line needs to be unindented more than
381   // there's actual whitespace in another line).
382   SmallVector<int, 16> ContentColumn;
383 
384   // The intended start column of the first line of text from this section.
385   unsigned StartColumn;
386 
      // True if the style's ReflowComments option is FormatStyle::RCS_Always.
387   const bool AlwaysReflow = Style.ReflowComments == FormatStyle::RCS_Always;
388 
389   // The prefix to use in front of a line that has been reflown up.
390   // For example, when reflowing the second line after the first here:
391   // // comment 1
392   // // comment 2
393   // we expect:
394   // // comment 1 comment 2
395   // and not:
396   // // comment 1comment 2
397   StringRef ReflowPrefix = " ";
398 };
399 
/// Breakable token for a block comment; each line in the block comment is its
/// own logical line.
400 class BreakableBlockComment : public BreakableComment {
401 public:
      /// Creates a breakable token for a block comment.
      ///
      /// NOTE(review): \p StartColumn is presumably the column in which the
      /// comment will start after formatting, as documented for BreakableComment;
      /// the meaning of \p OriginalStartColumn, \p FirstInLine and \p UseCRLF is
      /// not visible in this header - confirm against the definition.
402   BreakableBlockComment(const FormatToken &Token, unsigned StartColumn,
403                         unsigned OriginalStartColumn, bool FirstInLine,
404                         bool InPPDirective, encoding::Encoding Encoding,
405                         const FormatStyle &Style, bool UseCRLF);
406 
407   Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
408                  unsigned ContentStartColumn,
409                  const llvm::Regex &CommentPragmasRegex) const override;
410   unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
411                           StringRef::size_type Length,
412                           unsigned StartColumn) const override;
413   unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
414                               unsigned StartColumn) const override;
415   unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
416   unsigned getContentIndent(unsigned LineIndex) const override;
417   void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
418                    unsigned ContentIndent,
419                    WhitespaceManager &Whitespaces) const override;
420   Split getReflowSplit(unsigned LineIndex,
421                        const llvm::Regex &CommentPragmasRegex) const override;
422   void reflow(unsigned LineIndex,
423               WhitespaceManager &Whitespaces) const override;
424   bool introducesBreakBeforeToken() const override;
425   void adaptStartOfLine(unsigned LineIndex,
426                         WhitespaceManager &Whitespaces) const override;
427   Split getSplitAfterLastLine(unsigned TailOffset) const override;
428 
429   bool mayReflow(unsigned LineIndex,
430                  const llvm::Regex &CommentPragmasRegex) const override;
431 
432   // Contains Javadoc annotations that require additional indent when continued
433   // on multiple lines.
434   static const llvm::StringSet<> ContentIndentingJavadocAnnotations;
435 
436 private:
437   // Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex].
438   //
439   // Updates Content[LineIndex-1] and Content[LineIndex] by stripping off
440   // leading and trailing whitespace.
441   //
442   // Sets ContentColumn to the intended column in which the text at
443   // Lines[LineIndex] starts (note that the decoration, if present, is not
444   // considered part of the text).
445   void adjustWhitespace(unsigned LineIndex, int IndentDelta);
446 
447   // The column at which the text of a broken line should start.
448   // Note that an optional decoration would go before that column.
449   // IndentAtLineBreak is a uniform position for all lines in a block comment,
450   // regardless of their relative position.
451   // FIXME: Revisit the decision to do this; the main reason was to support
452   // patterns like
453   // /**************//**
454   //  * Comment
455   // We could also support such patterns by special casing the first line
456   // instead.
457   unsigned IndentAtLineBreak;
458 
459   // This is to distinguish between the case when the last line was empty and
460   // the case when it started with a decoration ("*" or "* ").
461   bool LastLineNeedsDecoration;
462 
463   // Either "* " if all lines begin with a "*", or empty.
464   StringRef Decoration;
465 
466   // If this block comment has decorations, this is the column of the start of
467   // the decorations.
468   unsigned DecorationColumn;
469 
470   // If true, make sure that the opening '/**' and the closing '*/' each end up
471   // on a line of their own. Styles like jsdoc require this for multiline
      // comments.
472   bool DelimitersOnNewline;
473 
474   // Length of the sequence of tokens after this block comment that cannot
475   // contain line breaks.
476   unsigned UnbreakableTailLength;
477 };
478 
/// Breakable token for a sequence of line comments; each line comment is its
/// own logical line.
479 class BreakableLineCommentSection : public BreakableComment {
480 public:
      /// Creates a breakable token for a section of line comments.
      ///
      /// NOTE(review): \p StartColumn is presumably the column in which the
      /// comment will start after formatting, as documented for BreakableComment
      /// - confirm against the definition.
481   BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn,
482                               bool InPPDirective, encoding::Encoding Encoding,
483                               const FormatStyle &Style);
484 
485   unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
486                           StringRef::size_type Length,
487                           unsigned StartColumn) const override;
488   unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
489   void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
490                    unsigned ContentIndent,
491                    WhitespaceManager &Whitespaces) const override;
492   Split getReflowSplit(unsigned LineIndex,
493                        const llvm::Regex &CommentPragmasRegex) const override;
494   void reflow(unsigned LineIndex,
495               WhitespaceManager &Whitespaces) const override;
496   void adaptStartOfLine(unsigned LineIndex,
497                         WhitespaceManager &Whitespaces) const override;
498   void updateNextToken(LineState &State) const override;
499   bool mayReflow(unsigned LineIndex,
500                  const llvm::Regex &CommentPragmasRegex) const override;
501 
502 private:
503   // OriginalPrefix[i] contains the original prefix of line i, including
504   // trailing whitespace before the start of the content. The indentation
505   // preceding the prefix is not included.
506   // For example, if the line is:
507   // // content
508   // then the original prefix is "// ".
509   SmallVector<StringRef, 16> OriginalPrefix;
510 
511   /// Prefix[i] + SpacesToAdd[i] contains the intended leading "//" with
512   /// trailing spaces to account for the indentation of content within the
513   /// comment at line i after formatting. It can be different than the original
514   /// prefix.
      /// NOTE(review): no member named SpacesToAdd is declared in this class; the
      /// neighbouring member is PrefixSpaceChange - confirm which one is meant.
515   /// When the original line starts like this:
516   /// //content
517   /// Then the OriginalPrefix[i] is "//", but the Prefix[i] is "// " in the LLVM
518   /// style.
519   /// When the line starts like:
520   /// // content
521   /// And we want to remove the spaces the OriginalPrefix[i] is "// " and
522   /// Prefix[i] is "//".
523   SmallVector<std::string, 16> Prefix;
524 
525   /// How many spaces are added or removed from the OriginalPrefix to form
526   /// Prefix.
527   SmallVector<int, 16> PrefixSpaceChange;
528 
529   /// The token to which the last line of this breakable token belongs;
530   /// nullptr if that token is the initial token.
531   ///
532   /// The distinction is because if the token of the last line of this breakable
533   /// token is distinct from the initial token, this breakable token owns the
534   /// whitespace before the token of the last line, and the whitespace manager
535   /// must be able to modify it.
536   FormatToken *LastLineTok = nullptr;
537 };
538 } // namespace format
539 } // namespace clang
540 
541 #endif
542