xref: /llvm-project/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp (revision cbdc3e1bf9da09911ba353bcd20c6709bda43893)
135cca45bSserge-sans-paille //===--- MisleadingBidirectional.cpp - clang-tidy -------------------------===//
235cca45bSserge-sans-paille //
335cca45bSserge-sans-paille // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
435cca45bSserge-sans-paille // See https://llvm.org/LICENSE.txt for license information.
535cca45bSserge-sans-paille // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
635cca45bSserge-sans-paille //
735cca45bSserge-sans-paille //===----------------------------------------------------------------------===//
835cca45bSserge-sans-paille 
935cca45bSserge-sans-paille #include "MisleadingBidirectional.h"
1035cca45bSserge-sans-paille 
1135cca45bSserge-sans-paille #include "clang/Frontend/CompilerInstance.h"
1235cca45bSserge-sans-paille #include "clang/Lex/Preprocessor.h"
1335cca45bSserge-sans-paille #include "llvm/Support/ConvertUTF.h"
14c589730aSKrzysztof Parzyszek #include <optional>
1535cca45bSserge-sans-paille 
1635cca45bSserge-sans-paille using namespace clang;
1735cca45bSserge-sans-paille using namespace clang::tidy::misc;
1835cca45bSserge-sans-paille 
containsMisleadingBidi(StringRef Buffer,bool HonorLineBreaks=true)1935cca45bSserge-sans-paille static bool containsMisleadingBidi(StringRef Buffer,
2035cca45bSserge-sans-paille                                    bool HonorLineBreaks = true) {
2135cca45bSserge-sans-paille   const char *CurPtr = Buffer.begin();
2235cca45bSserge-sans-paille 
2335cca45bSserge-sans-paille   enum BidiChar {
2435cca45bSserge-sans-paille     PS = 0x2029,
2535cca45bSserge-sans-paille     RLO = 0x202E,
2635cca45bSserge-sans-paille     RLE = 0x202B,
2735cca45bSserge-sans-paille     LRO = 0x202D,
2835cca45bSserge-sans-paille     LRE = 0x202A,
2935cca45bSserge-sans-paille     PDF = 0x202C,
3035cca45bSserge-sans-paille     RLI = 0x2067,
3135cca45bSserge-sans-paille     LRI = 0x2066,
3235cca45bSserge-sans-paille     FSI = 0x2068,
3335cca45bSserge-sans-paille     PDI = 0x2069
3435cca45bSserge-sans-paille   };
3535cca45bSserge-sans-paille 
3635cca45bSserge-sans-paille   SmallVector<BidiChar> BidiContexts;
3735cca45bSserge-sans-paille 
3835cca45bSserge-sans-paille   // Scan each character while maintaining a stack of opened bidi context.
3935cca45bSserge-sans-paille   // RLO/RLE/LRO/LRE all are closed by PDF while RLI LRI and FSI are closed by
4035cca45bSserge-sans-paille   // PDI. New lines reset the context count. Extra PDF / PDI are ignored.
4135cca45bSserge-sans-paille   //
4235cca45bSserge-sans-paille   // Warn if we end up with an unclosed context.
4335cca45bSserge-sans-paille   while (CurPtr < Buffer.end()) {
4435cca45bSserge-sans-paille     unsigned char C = *CurPtr;
4535cca45bSserge-sans-paille     if (isASCII(C)) {
4635cca45bSserge-sans-paille       ++CurPtr;
4735cca45bSserge-sans-paille       bool IsParagrapSep =
4835cca45bSserge-sans-paille           (C == 0xA || C == 0xD || (0x1C <= C && C <= 0x1E) || C == 0x85);
4935cca45bSserge-sans-paille       bool IsSegmentSep = (C == 0x9 || C == 0xB || C == 0x1F);
5035cca45bSserge-sans-paille       if (IsParagrapSep || IsSegmentSep)
5135cca45bSserge-sans-paille         BidiContexts.clear();
5235cca45bSserge-sans-paille       continue;
5335cca45bSserge-sans-paille     }
54*cbdc3e1bSPiotr Zegar     llvm::UTF32 CodePoint = 0;
5535cca45bSserge-sans-paille     llvm::ConversionResult Result = llvm::convertUTF8Sequence(
5635cca45bSserge-sans-paille         (const llvm::UTF8 **)&CurPtr, (const llvm::UTF8 *)Buffer.end(),
5735cca45bSserge-sans-paille         &CodePoint, llvm::strictConversion);
5835cca45bSserge-sans-paille 
5935cca45bSserge-sans-paille     // If conversion fails, utf-8 is designed so that we can just try next char.
6035cca45bSserge-sans-paille     if (Result != llvm::conversionOK) {
6135cca45bSserge-sans-paille       ++CurPtr;
6235cca45bSserge-sans-paille       continue;
6335cca45bSserge-sans-paille     }
6435cca45bSserge-sans-paille 
6535cca45bSserge-sans-paille     // Open a PDF context.
6635cca45bSserge-sans-paille     if (CodePoint == RLO || CodePoint == RLE || CodePoint == LRO ||
6735cca45bSserge-sans-paille         CodePoint == LRE)
6835cca45bSserge-sans-paille       BidiContexts.push_back(PDF);
6935cca45bSserge-sans-paille     // Close PDF Context.
7035cca45bSserge-sans-paille     else if (CodePoint == PDF) {
7135cca45bSserge-sans-paille       if (!BidiContexts.empty() && BidiContexts.back() == PDF)
7235cca45bSserge-sans-paille         BidiContexts.pop_back();
7335cca45bSserge-sans-paille     }
7435cca45bSserge-sans-paille     // Open a PDI Context.
7535cca45bSserge-sans-paille     else if (CodePoint == RLI || CodePoint == LRI || CodePoint == FSI)
7635cca45bSserge-sans-paille       BidiContexts.push_back(PDI);
7735cca45bSserge-sans-paille     // Close a PDI Context.
7835cca45bSserge-sans-paille     else if (CodePoint == PDI) {
79e125e6c4SKazu Hirata       auto R = llvm::find(llvm::reverse(BidiContexts), PDI);
8035cca45bSserge-sans-paille       if (R != BidiContexts.rend())
8135cca45bSserge-sans-paille         BidiContexts.resize(BidiContexts.rend() - R - 1);
8235cca45bSserge-sans-paille     }
8335cca45bSserge-sans-paille     // Line break or equivalent
8435cca45bSserge-sans-paille     else if (CodePoint == PS)
8535cca45bSserge-sans-paille       BidiContexts.clear();
8635cca45bSserge-sans-paille   }
8735cca45bSserge-sans-paille   return !BidiContexts.empty();
8835cca45bSserge-sans-paille }
8935cca45bSserge-sans-paille 
9035cca45bSserge-sans-paille class MisleadingBidirectionalCheck::MisleadingBidirectionalHandler
9135cca45bSserge-sans-paille     : public CommentHandler {
9235cca45bSserge-sans-paille public:
MisleadingBidirectionalHandler(MisleadingBidirectionalCheck & Check)93c0d0b123SCarlos Galvez   MisleadingBidirectionalHandler(MisleadingBidirectionalCheck &Check)
9435cca45bSserge-sans-paille       : Check(Check) {}
9535cca45bSserge-sans-paille 
HandleComment(Preprocessor & PP,SourceRange Range)9635cca45bSserge-sans-paille   bool HandleComment(Preprocessor &PP, SourceRange Range) override {
9735cca45bSserge-sans-paille     // FIXME: check that we are in a /* */ comment
9835cca45bSserge-sans-paille     StringRef Text =
9935cca45bSserge-sans-paille         Lexer::getSourceText(CharSourceRange::getCharRange(Range),
10035cca45bSserge-sans-paille                              PP.getSourceManager(), PP.getLangOpts());
10135cca45bSserge-sans-paille 
10235cca45bSserge-sans-paille     if (containsMisleadingBidi(Text, true))
10335cca45bSserge-sans-paille       Check.diag(
10435cca45bSserge-sans-paille           Range.getBegin(),
10535cca45bSserge-sans-paille           "comment contains misleading bidirectional Unicode characters");
10635cca45bSserge-sans-paille     return false;
10735cca45bSserge-sans-paille   }
10835cca45bSserge-sans-paille 
10935cca45bSserge-sans-paille private:
11035cca45bSserge-sans-paille   MisleadingBidirectionalCheck &Check;
11135cca45bSserge-sans-paille };
11235cca45bSserge-sans-paille 
MisleadingBidirectionalCheck(StringRef Name,ClangTidyContext * Context)11335cca45bSserge-sans-paille MisleadingBidirectionalCheck::MisleadingBidirectionalCheck(
11435cca45bSserge-sans-paille     StringRef Name, ClangTidyContext *Context)
11535cca45bSserge-sans-paille     : ClangTidyCheck(Name, Context),
116c0d0b123SCarlos Galvez       Handler(std::make_unique<MisleadingBidirectionalHandler>(*this)) {}
11735cca45bSserge-sans-paille 
11835cca45bSserge-sans-paille MisleadingBidirectionalCheck::~MisleadingBidirectionalCheck() = default;
11935cca45bSserge-sans-paille 
registerPPCallbacks(const SourceManager & SM,Preprocessor * PP,Preprocessor * ModuleExpanderPP)12035cca45bSserge-sans-paille void MisleadingBidirectionalCheck::registerPPCallbacks(
12135cca45bSserge-sans-paille     const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
12235cca45bSserge-sans-paille   PP->addCommentHandler(Handler.get());
12335cca45bSserge-sans-paille }
12435cca45bSserge-sans-paille 
check(const ast_matchers::MatchFinder::MatchResult & Result)12535cca45bSserge-sans-paille void MisleadingBidirectionalCheck::check(
12635cca45bSserge-sans-paille     const ast_matchers::MatchFinder::MatchResult &Result) {
12735cca45bSserge-sans-paille   if (const auto *SL = Result.Nodes.getNodeAs<StringLiteral>("strlit")) {
12835cca45bSserge-sans-paille     StringRef Literal = SL->getBytes();
12935cca45bSserge-sans-paille     if (containsMisleadingBidi(Literal, false))
13035cca45bSserge-sans-paille       diag(SL->getBeginLoc(), "string literal contains misleading "
13135cca45bSserge-sans-paille                               "bidirectional Unicode characters");
13235cca45bSserge-sans-paille   }
13335cca45bSserge-sans-paille }
13435cca45bSserge-sans-paille 
registerMatchers(ast_matchers::MatchFinder * Finder)13535cca45bSserge-sans-paille void MisleadingBidirectionalCheck::registerMatchers(
13635cca45bSserge-sans-paille     ast_matchers::MatchFinder *Finder) {
13735cca45bSserge-sans-paille   Finder->addMatcher(ast_matchers::stringLiteral().bind("strlit"), this);
13835cca45bSserge-sans-paille }
139