xref: /llvm-project/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp (revision 16ef4968dc9e93f9c7b361b9ad0645445bd1210e)
1 //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10 // which looks for an out-of-bound array element access.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/AST/CharUnits.h"
15 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
16 #include "clang/StaticAnalyzer/Checkers/Taint.h"
17 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
18 #include "clang/StaticAnalyzer/Core/Checker.h"
19 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
20 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
21 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
22 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
23 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
24 #include "llvm/ADT/SmallString.h"
25 #include "llvm/Support/FormatVariadic.h"
26 #include "llvm/Support/raw_ostream.h"
27 #include <optional>
28 
29 using namespace clang;
30 using namespace ento;
31 using namespace taint;
32 using llvm::formatv;
33 
34 namespace {
35 enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
36 
37 class ArrayBoundCheckerV2 :
38     public Checker<check::Location> {
39   BugType BT{this, "Out-of-bound access"};
40   BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
41 
42   void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
43                  NonLoc Offset, std::string RegName, std::string Msg) const;
44 
45   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
46 
47 public:
48   void checkLocation(SVal l, bool isLoad, const Stmt *S,
49                      CheckerContext &C) const;
50 };
51 } // anonymous namespace
52 
53 /// For a given Location that can be represented as a symbolic expression
54 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
55 /// Arr and the distance of Location from the beginning of Arr (expressed in a
56 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these
57 /// cannot be determined.
58 static std::optional<std::pair<const SubRegion *, NonLoc>>
59 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
60   QualType T = SVB.getArrayIndexType();
61   auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
62     // We will use this utility to add and multiply values.
63     return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
64   };
65 
66   const SubRegion *OwnerRegion = nullptr;
67   std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
68 
69   const ElementRegion *CurRegion =
70       dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
71 
72   while (CurRegion) {
73     const auto Index = CurRegion->getIndex().getAs<NonLoc>();
74     if (!Index)
75       return std::nullopt;
76 
77     QualType ElemType = CurRegion->getElementType();
78 
79     // FIXME: The following early return was presumably added to safeguard the
80     // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
81     // it seems that `ElemType` cannot be incomplete at this point.
82     if (ElemType->isIncompleteType())
83       return std::nullopt;
84 
85     // Calculate Delta = Index * sizeof(ElemType).
86     NonLoc Size = SVB.makeArrayIndex(
87         SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
88     auto Delta = EvalBinOp(BO_Mul, *Index, Size);
89     if (!Delta)
90       return std::nullopt;
91 
92     // Perform Offset += Delta.
93     Offset = EvalBinOp(BO_Add, *Offset, *Delta);
94     if (!Offset)
95       return std::nullopt;
96 
97     OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
98     // When this is just another ElementRegion layer, we need to continue the
99     // offset calculations:
100     CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
101   }
102 
103   if (OwnerRegion)
104     return std::make_pair(OwnerRegion, *Offset);
105 
106   return std::nullopt;
107 }
108 
109 // TODO: once the constraint manager is smart enough to handle non simplified
110 // symbolic expressions remove this function. Note that this can not be used in
111 // the constraint manager as is, since this does not handle overflows. It is
112 // safe to assume, however, that memory offsets will not overflow.
113 // NOTE: callers of this function need to be aware of the effects of overflows
114 // and signed<->unsigned conversions!
115 static std::pair<NonLoc, nonloc::ConcreteInt>
116 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
117                      SValBuilder &svalBuilder) {
118   std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
119   if (SymVal && SymVal->isExpression()) {
120     if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
121       llvm::APSInt constant =
122           APSIntType(extent.getValue()).convert(SIE->getRHS());
123       switch (SIE->getOpcode()) {
124       case BO_Mul:
125         // The constant should never be 0 here, becasue multiplication by zero
126         // is simplified by the engine.
127         if ((extent.getValue() % constant) != 0)
128           return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
129         else
130           return getSimplifiedOffsets(
131               nonloc::SymbolVal(SIE->getLHS()),
132               svalBuilder.makeIntVal(extent.getValue() / constant),
133               svalBuilder);
134       case BO_Add:
135         return getSimplifiedOffsets(
136             nonloc::SymbolVal(SIE->getLHS()),
137             svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder);
138       default:
139         break;
140       }
141     }
142   }
143 
144   return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
145 }
146 
147 // Evaluate the comparison Value < Threshold with the help of the custom
148 // simplification algorithm defined for this checker. Return a pair of states,
149 // where the first one corresponds to "value below threshold" and the second
150 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
151 // the case when the evaluation fails.
152 static std::pair<ProgramStateRef, ProgramStateRef>
153 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
154                         SValBuilder &SVB) {
155   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
156     std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
157   }
158   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
159     QualType T = Value.getType(SVB.getContext());
160     if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) {
161       // In this case we reduced the bound check to a comparison of the form
162       //   (symbol or value with unsigned type) < (negative number)
163       // which is always false. We are handling these cases separately because
164       // evalBinOpNN can perform a signed->unsigned conversion that turns the
165       // negative number into a huge positive value and leads to wildly
166       // inaccurate conclusions.
167       return {nullptr, State};
168     }
169   }
170   auto BelowThreshold =
171       SVB.evalBinOpNN(State, BO_LT, Value, Threshold, SVB.getConditionType()).getAs<NonLoc>();
172 
173   if (BelowThreshold)
174     return State->assume(*BelowThreshold);
175 
176   return {nullptr, nullptr};
177 }
178 
179 static std::string getRegionName(const SubRegion *Region) {
180   if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
181     return RegName;
182 
183   // Field regions only have descriptive names when their parent has a
184   // descriptive name; so we provide a fallback representation for them:
185   if (const auto *FR = Region->getAs<FieldRegion>()) {
186     if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
187       return formatv("the field '{0}'", Name);
188     return "the unnamed field";
189   }
190 
191   if (isa<AllocaRegion>(Region))
192     return "the memory returned by 'alloca'";
193 
194   if (isa<SymbolicRegion>(Region) &&
195       isa<HeapSpaceRegion>(Region->getMemorySpace()))
196     return "the heap area";
197 
198   if (isa<StringRegion>(Region))
199     return "the string literal";
200 
201   return "the region";
202 }
203 
204 static std::optional<int64_t> getConcreteValue(NonLoc SV) {
205   if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
206     return ConcreteVal->getValue().tryExtValue();
207   }
208   return std::nullopt;
209 }
210 
211 static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
212   static const char *ShortMsgTemplates[] = {
213       "Out of bound access to memory preceding {0}",
214       "Out of bound access to memory after the end of {0}",
215       "Potential out of bound access to {0} with tainted offset"};
216 
217   return formatv(ShortMsgTemplates[Kind], RegName);
218 }
219 
220 static std::string getPrecedesMsg(std::string RegName, NonLoc Offset) {
221   SmallString<128> Buf;
222   llvm::raw_svector_ostream Out(Buf);
223   Out << "Access of " << RegName << " at negative byte offset";
224   if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
225     Out << ' ' << ConcreteIdx->getValue();
226   return std::string(Buf);
227 }
228 static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
229                                  NonLoc Offset, NonLoc Extent, SVal Location) {
230   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
231   assert(EReg && "this checker only handles element access");
232   QualType ElemType = EReg->getElementType();
233 
234   std::optional<int64_t> OffsetN = getConcreteValue(Offset);
235   std::optional<int64_t> ExtentN = getConcreteValue(Extent);
236 
237   bool UseByteOffsets = true;
238   if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) {
239     const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize;
240     const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize;
241     if (!OffsetHasRemainder && !ExtentHasRemainder) {
242       UseByteOffsets = false;
243       if (OffsetN)
244         *OffsetN /= ElemSize;
245       if (ExtentN)
246         *ExtentN /= ElemSize;
247     }
248   }
249 
250   SmallString<256> Buf;
251   llvm::raw_svector_ostream Out(Buf);
252   Out << "Access of ";
253   if (!ExtentN && !UseByteOffsets)
254     Out << "'" << ElemType.getAsString() << "' element in ";
255   Out << RegName << " at ";
256   if (OffsetN) {
257     Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN;
258   } else {
259     Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index");
260   }
261   if (ExtentN) {
262     Out << ", while it holds only ";
263     if (*ExtentN != 1)
264       Out << *ExtentN;
265     else
266       Out << "a single";
267     if (UseByteOffsets)
268       Out << " byte";
269     else
270       Out << " '" << ElemType.getAsString() << "' element";
271 
272     if (*ExtentN > 1)
273       Out << "s";
274   }
275 
276   return std::string(Buf);
277 }
278 static std::string getTaintMsg(std::string RegName) {
279   SmallString<128> Buf;
280   llvm::raw_svector_ostream Out(Buf);
281   Out << "Access of " << RegName
282       << " with a tainted offset that may be too large";
283   return std::string(Buf);
284 }
285 
286 void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
287                                         const Stmt *LoadS,
288                                         CheckerContext &C) const {
289 
290   // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
291   // some new logic here that reasons directly about memory region extents.
292   // Once that logic is more mature, we can bring it back to assumeInBound()
293   // for all clients to use.
294   //
295   // The algorithm we are using here for bounds checking is to see if the
296   // memory access is within the extent of the base region.  Since we
297   // have some flexibility in defining the base region, we can achieve
298   // various levels of conservatism in our buffer overflow checking.
299 
300   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
301   //   #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
302   // and incomplete analysis of these leads to false positives. As even
303   // accurate reports would be confusing for the users, just disable reports
304   // from these macros:
305   if (isFromCtypeMacro(LoadS, C.getASTContext()))
306     return;
307 
308   ProgramStateRef State = C.getState();
309   SValBuilder &SVB = C.getSValBuilder();
310 
311   const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
312       computeOffset(State, SVB, Location);
313 
314   if (!RawOffset)
315     return;
316 
317   auto [Reg, ByteOffset] = *RawOffset;
318 
319   // CHECK LOWER BOUND
320   const MemSpaceRegion *Space = Reg->getMemorySpace();
321   if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
322     // A symbolic region in unknown space represents an unknown pointer that
323     // may point into the middle of an array, so we don't look for underflows.
324     // Both conditions are significant because we want to check underflows in
325     // symbolic regions on the heap (which may be introduced by checkers like
326     // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
327     // non-symbolic regions (e.g. a field subregion of a symbolic region) in
328     // unknown space.
329     auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
330         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
331 
332     if (PrecedesLowerBound && !WithinLowerBound) {
333       // We know that the index definitely precedes the lower bound.
334       std::string RegName = getRegionName(Reg);
335       std::string Msg = getPrecedesMsg(RegName, ByteOffset);
336       reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, RegName, Msg);
337       return;
338     }
339 
340     if (WithinLowerBound)
341       State = WithinLowerBound;
342   }
343 
344   // CHECK UPPER BOUND
345   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
346   if (auto KnownSize = Size.getAs<NonLoc>()) {
347     auto [WithinUpperBound, ExceedsUpperBound] =
348         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
349 
350     if (ExceedsUpperBound) {
351       if (!WithinUpperBound) {
352         // We know that the index definitely exceeds the upper bound.
353         std::string RegName = getRegionName(Reg);
354         std::string Msg = getExceedsMsg(C.getASTContext(), RegName, ByteOffset,
355                                         *KnownSize, Location);
356         reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, RegName, Msg);
357         return;
358       }
359       if (isTainted(State, ByteOffset)) {
360         // Both cases are possible, but the index is tainted, so report.
361         std::string RegName = getRegionName(Reg);
362         std::string Msg = getTaintMsg(RegName);
363         reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, RegName, Msg);
364         return;
365       }
366     }
367 
368     if (WithinUpperBound)
369       State = WithinUpperBound;
370   }
371 
372   C.addTransition(State);
373 }
374 
375 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
376                                     ProgramStateRef ErrorState, OOB_Kind Kind,
377                                     NonLoc Offset, std::string RegName,
378                                     std::string Msg) const {
379 
380   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
381   if (!ErrorNode)
382     return;
383 
384   std::string ShortMsg = getShortMsg(Kind, RegName);
385 
386   auto BR = std::make_unique<PathSensitiveBugReport>(
387       Kind == OOB_Taint ? TaintBT : BT, ShortMsg, Msg, ErrorNode);
388 
389   // Track back the propagation of taintedness.
390   if (Kind == OOB_Taint)
391     for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset))
392       BR->markInteresting(Sym);
393 
394   C.emitReport(std::move(BR));
395 }
396 
397 bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
398   SourceLocation Loc = S->getBeginLoc();
399   if (!Loc.isMacroID())
400     return false;
401 
402   StringRef MacroName = Lexer::getImmediateMacroName(
403       Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
404 
405   if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
406     return false;
407 
408   return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
409           (MacroName == "isblank") || (MacroName == "isdigit") ||
410           (MacroName == "isgraph") || (MacroName == "islower") ||
411           (MacroName == "isnctrl") || (MacroName == "isprint") ||
412           (MacroName == "ispunct") || (MacroName == "isspace") ||
413           (MacroName == "isupper") || (MacroName == "isxdigit"));
414 }
415 
416 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
417   mgr.registerChecker<ArrayBoundCheckerV2>();
418 }
419 
420 bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
421   return true;
422 }
423