xref: /llvm-project/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp (revision fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0)
1 //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10 // which looks for an out-of-bound array element access.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/AST/CharUnits.h"
15 #include "clang/AST/ParentMapContext.h"
16 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
17 #include "clang/StaticAnalyzer/Checkers/Taint.h"
18 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
19 #include "clang/StaticAnalyzer/Core/Checker.h"
20 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
21 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
22 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
23 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
24 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/Support/FormatVariadic.h"
27 #include "llvm/Support/raw_ostream.h"
28 #include <optional>
29 
30 using namespace clang;
31 using namespace ento;
32 using namespace taint;
33 using llvm::formatv;
34 
35 namespace {
36 /// If `E` is a "clean" array subscript expression, return the type of the
37 /// accessed element. If the base of the subscript expression is modified by
38 /// pointer arithmetic (and not the beginning of a "full" memory region), this
39 /// always returns nullopt because that's the right (or the least bad) thing to
40 /// do for the diagnostic output that's relying on this.
41 static std::optional<QualType> determineElementType(const Expr *E,
42                                                     const CheckerContext &C) {
43   const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
44   if (!ASE)
45     return std::nullopt;
46 
47   const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
48   if (!SubscriptBaseReg)
49     return std::nullopt;
50 
51   // The base of the subscript expression is affected by pointer arithmetics,
52   // so we want to report byte offsets instead of indices.
53   if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
54     return std::nullopt;
55 
56   return ASE->getType();
57 }
58 
59 static std::optional<int64_t>
60 determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
61   if (!T)
62     return std::nullopt;
63   return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
64 }
65 
66 class StateUpdateReporter {
67   const SubRegion *Reg;
68   const NonLoc ByteOffsetVal;
69   const std::optional<QualType> ElementType;
70   const std::optional<int64_t> ElementSize;
71   bool AssumedNonNegative = false;
72   std::optional<NonLoc> AssumedUpperBound = std::nullopt;
73 
74 public:
75   StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
76                       CheckerContext &C)
77       : Reg(R), ByteOffsetVal(ByteOffsVal),
78         ElementType(determineElementType(E, C)),
79         ElementSize(determineElementSize(ElementType, C)) {}
80 
81   void recordNonNegativeAssumption() { AssumedNonNegative = true; }
82   void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
83     AssumedUpperBound = UpperBoundVal;
84   }
85 
86   const NoteTag *createNoteTag(CheckerContext &C) const;
87 
88 private:
89   std::string getMessage(PathSensitiveBugReport &BR) const;
90 
91   /// Return true if information about the value of `Sym` can put constraints
92   /// on some symbol which is interesting within the bug report `BR`.
93   /// In particular, this returns true when `Sym` is interesting within `BR`;
94   /// but it also returns true if `Sym` is an expression that contains integer
95   /// constants and a single symbolic operand which is interesting (in `BR`).
96   /// We need to use this instead of plain `BR.isInteresting()` because if we
97   /// are analyzing code like
98   ///   int array[10];
99   ///   int f(int arg) {
100   ///     return array[arg] && array[arg + 10];
101   ///   }
102   /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
103   /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
104   /// to detect this out of bounds access).
105   static bool providesInformationAboutInteresting(SymbolRef Sym,
106                                                   PathSensitiveBugReport &BR);
107   static bool providesInformationAboutInteresting(SVal SV,
108                                                   PathSensitiveBugReport &BR) {
109     return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
110   }
111 };
112 
113 struct Messages {
114   std::string Short, Full;
115 };
116 
117 // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
118 // instead of `PreStmt` because the current implementation passes the whole
119 // expression to `CheckerContext::getSVal()` which only works after the
120 // symbolic evaluation of the expression. (To turn them into `PreStmt`
121 // callbacks, we'd need to duplicate the logic that evaluates these
122 // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
123 // defined as `PostStmt` for the sake of consistency with the other callbacks.
124 class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
125                                            check::PostStmt<UnaryOperator>,
126                                            check::PostStmt<MemberExpr>> {
127   BugType BT{this, "Out-of-bound access"};
128   BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
129 
130   void performCheck(const Expr *E, CheckerContext &C) const;
131 
132   void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
133                  NonLoc Offset, std::optional<NonLoc> Extent,
134                  bool IsTaintBug = false) const;
135 
136   static void markPartsInteresting(PathSensitiveBugReport &BR,
137                                    ProgramStateRef ErrorState, NonLoc Val,
138                                    bool MarkTaint);
139 
140   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
141 
142   static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
143                                        NonLoc Offset, NonLoc Limit,
144                                        CheckerContext &C);
145   static bool isInAddressOf(const Stmt *S, ASTContext &AC);
146 
147 public:
148   void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
149     performCheck(E, C);
150   }
151   void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
152     if (E->getOpcode() == UO_Deref)
153       performCheck(E, C);
154   }
155   void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
156     if (E->isArrow())
157       performCheck(E->getBase(), C);
158   }
159 };
160 
161 } // anonymous namespace
162 
163 /// For a given Location that can be represented as a symbolic expression
164 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
165 /// Arr and the distance of Location from the beginning of Arr (expressed in a
166 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these
167 /// cannot be determined.
168 static std::optional<std::pair<const SubRegion *, NonLoc>>
169 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
170   QualType T = SVB.getArrayIndexType();
171   auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
172     // We will use this utility to add and multiply values.
173     return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
174   };
175 
176   const SubRegion *OwnerRegion = nullptr;
177   std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
178 
179   const ElementRegion *CurRegion =
180       dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
181 
182   while (CurRegion) {
183     const auto Index = CurRegion->getIndex().getAs<NonLoc>();
184     if (!Index)
185       return std::nullopt;
186 
187     QualType ElemType = CurRegion->getElementType();
188 
189     // FIXME: The following early return was presumably added to safeguard the
190     // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
191     // it seems that `ElemType` cannot be incomplete at this point.
192     if (ElemType->isIncompleteType())
193       return std::nullopt;
194 
195     // Calculate Delta = Index * sizeof(ElemType).
196     NonLoc Size = SVB.makeArrayIndex(
197         SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
198     auto Delta = EvalBinOp(BO_Mul, *Index, Size);
199     if (!Delta)
200       return std::nullopt;
201 
202     // Perform Offset += Delta.
203     Offset = EvalBinOp(BO_Add, *Offset, *Delta);
204     if (!Offset)
205       return std::nullopt;
206 
207     OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
208     // When this is just another ElementRegion layer, we need to continue the
209     // offset calculations:
210     CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
211   }
212 
213   if (OwnerRegion)
214     return std::make_pair(OwnerRegion, *Offset);
215 
216   return std::nullopt;
217 }
218 
219 // NOTE: This function is the "heart" of this checker. It simplifies
220 // inequalities with transformations that are valid (and very elementary) in
221 // pure mathematics, but become invalid if we use them in C++ number model
222 // where the calculations may overflow.
223 // Due to the overflow issues I think it's impossible (or at least not
224 // practical) to integrate this kind of simplification into the resolution of
225 // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
226 // produces valid results when the calculations are handling memory offsets
227 // and every value is well below SIZE_MAX.
228 // TODO: This algorithm should be moved to a central location where it's
229 // available for other checkers that need to compare memory offsets.
230 // NOTE: the simplification preserves the order of the two operands in a
231 // mathematical sense, but it may change the result produced by a C++
232 // comparison operator (and the automatic type conversions).
233 // For example, consider a comparison "X+1 < 0", where the LHS is stored as a
234 // size_t and the RHS is stored in an int. (As size_t is unsigned, this
235 // comparison is false for all values of "X".) However, the simplification may
236 // turn it into "X < -1", which is still always false in a mathematical sense,
237 // but can produce a true result when evaluated by `evalBinOp` (which follows
238 // the rules of C++ and casts -1 to SIZE_MAX).
239 static std::pair<NonLoc, nonloc::ConcreteInt>
240 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
241                      SValBuilder &svalBuilder) {
242   std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
243   if (SymVal && SymVal->isExpression()) {
244     if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
245       llvm::APSInt constant =
246           APSIntType(extent.getValue()).convert(SIE->getRHS());
247       switch (SIE->getOpcode()) {
248       case BO_Mul:
249         // The constant should never be 0 here, becasue multiplication by zero
250         // is simplified by the engine.
251         if ((extent.getValue() % constant) != 0)
252           return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
253         else
254           return getSimplifiedOffsets(
255               nonloc::SymbolVal(SIE->getLHS()),
256               svalBuilder.makeIntVal(extent.getValue() / constant),
257               svalBuilder);
258       case BO_Add:
259         return getSimplifiedOffsets(
260             nonloc::SymbolVal(SIE->getLHS()),
261             svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder);
262       default:
263         break;
264       }
265     }
266   }
267 
268   return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
269 }
270 
271 // Evaluate the comparison Value < Threshold with the help of the custom
272 // simplification algorithm defined for this checker. Return a pair of states,
273 // where the first one corresponds to "value below threshold" and the second
274 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
275 // the case when the evaluation fails.
276 // If the optional argument CheckEquality is true, then use BO_EQ instead of
277 // the default BO_LT after consistently applying the same simplification steps.
278 static std::pair<ProgramStateRef, ProgramStateRef>
279 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
280                         SValBuilder &SVB, bool CheckEquality = false) {
281   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
282     std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
283   }
284   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
285     QualType T = Value.getType(SVB.getContext());
286     if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) {
287       // In this case we reduced the bound check to a comparison of the form
288       //   (symbol or value with unsigned type) < (negative number)
289       // which is always false. We are handling these cases separately because
290       // evalBinOpNN can perform a signed->unsigned conversion that turns the
291       // negative number into a huge positive value and leads to wildly
292       // inaccurate conclusions.
293       return {nullptr, State};
294     }
295   }
296   const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
297   auto BelowThreshold =
298       SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
299           .getAs<NonLoc>();
300 
301   if (BelowThreshold)
302     return State->assume(*BelowThreshold);
303 
304   return {nullptr, nullptr};
305 }
306 
307 static std::string getRegionName(const SubRegion *Region) {
308   if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
309     return RegName;
310 
311   // Field regions only have descriptive names when their parent has a
312   // descriptive name; so we provide a fallback representation for them:
313   if (const auto *FR = Region->getAs<FieldRegion>()) {
314     if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
315       return formatv("the field '{0}'", Name);
316     return "the unnamed field";
317   }
318 
319   if (isa<AllocaRegion>(Region))
320     return "the memory returned by 'alloca'";
321 
322   if (isa<SymbolicRegion>(Region) &&
323       isa<HeapSpaceRegion>(Region->getMemorySpace()))
324     return "the heap area";
325 
326   if (isa<StringRegion>(Region))
327     return "the string literal";
328 
329   return "the region";
330 }
331 
332 static std::optional<int64_t> getConcreteValue(NonLoc SV) {
333   if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
334     return ConcreteVal->getValue().tryExtValue();
335   }
336   return std::nullopt;
337 }
338 
339 static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
340   return SV ? getConcreteValue(*SV) : std::nullopt;
341 }
342 
343 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
344   std::string RegName = getRegionName(Region);
345   SmallString<128> Buf;
346   llvm::raw_svector_ostream Out(Buf);
347   Out << "Access of " << RegName << " at negative byte offset";
348   if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
349     Out << ' ' << ConcreteIdx->getValue();
350   return {formatv("Out of bound access to memory preceding {0}", RegName),
351           std::string(Buf)};
352 }
353 
354 /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
355 /// it can be performed (`Divisor` is nonzero and there is no remainder). The
356 /// values `Val1` and `Val2` may be nullopt and in that case the corresponding
357 /// division is considered to be successful.
358 static bool tryDividePair(std::optional<int64_t> &Val1,
359                           std::optional<int64_t> &Val2, int64_t Divisor) {
360   if (!Divisor)
361     return false;
362   const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
363   const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
364   if (!Val1HasRemainder && !Val2HasRemainder) {
365     if (Val1)
366       *Val1 /= Divisor;
367     if (Val2)
368       *Val2 /= Divisor;
369     return true;
370   }
371   return false;
372 }
373 
374 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
375                                NonLoc Offset, NonLoc Extent, SVal Location) {
376   std::string RegName = getRegionName(Region);
377   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
378   assert(EReg && "this checker only handles element access");
379   QualType ElemType = EReg->getElementType();
380 
381   std::optional<int64_t> OffsetN = getConcreteValue(Offset);
382   std::optional<int64_t> ExtentN = getConcreteValue(Extent);
383 
384   int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
385 
386   bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
387 
388   SmallString<256> Buf;
389   llvm::raw_svector_ostream Out(Buf);
390   Out << "Access of ";
391   if (!ExtentN && !UseByteOffsets)
392     Out << "'" << ElemType.getAsString() << "' element in ";
393   Out << RegName << " at ";
394   if (OffsetN) {
395     Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN;
396   } else {
397     Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index");
398   }
399   if (ExtentN) {
400     Out << ", while it holds only ";
401     if (*ExtentN != 1)
402       Out << *ExtentN;
403     else
404       Out << "a single";
405     if (UseByteOffsets)
406       Out << " byte";
407     else
408       Out << " '" << ElemType.getAsString() << "' element";
409 
410     if (*ExtentN > 1)
411       Out << "s";
412   }
413 
414   return {
415       formatv("Out of bound access to memory after the end of {0}", RegName),
416       std::string(Buf)};
417 }
418 
419 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
420   std::string RegName = getRegionName(Region);
421   return {formatv("Potential out of bound access to {0} with tainted {1}",
422                   RegName, OffsetName),
423           formatv("Access of {0} with a tainted {1} that may be too large",
424                   RegName, OffsetName)};
425 }
426 
427 const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
428   // Don't create a note tag if we didn't assume anything:
429   if (!AssumedNonNegative && !AssumedUpperBound)
430     return nullptr;
431 
432   return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
433     return getMessage(BR);
434   });
435 }
436 
437 std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
438   bool ShouldReportNonNegative = AssumedNonNegative;
439   if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
440     if (AssumedUpperBound &&
441         providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
442       // Even if the byte offset isn't interesting (e.g. it's a constant value),
443       // the assumption can still be interesting if it provides information
444       // about an interesting symbolic upper bound.
445       ShouldReportNonNegative = false;
446     } else {
447       // We don't have anything interesting, don't report the assumption.
448       return "";
449     }
450   }
451 
452   std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
453   std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
454 
455   const bool UseIndex =
456       ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
457 
458   SmallString<256> Buf;
459   llvm::raw_svector_ostream Out(Buf);
460   Out << "Assuming ";
461   if (UseIndex) {
462     Out << "index ";
463     if (OffsetN)
464       Out << "'" << OffsetN << "' ";
465   } else if (AssumedUpperBound) {
466     Out << "byte offset ";
467     if (OffsetN)
468       Out << "'" << OffsetN << "' ";
469   } else {
470     Out << "offset ";
471   }
472 
473   Out << "is";
474   if (ShouldReportNonNegative) {
475     Out << " non-negative";
476   }
477   if (AssumedUpperBound) {
478     if (ShouldReportNonNegative)
479       Out << " and";
480     Out << " less than ";
481     if (ExtentN)
482       Out << *ExtentN << ", ";
483     if (UseIndex && ElementType)
484       Out << "the number of '" << ElementType->getAsString()
485           << "' elements in ";
486     else
487       Out << "the extent of ";
488     Out << getRegionName(Reg);
489   }
490   return std::string(Out.str());
491 }
492 
493 bool StateUpdateReporter::providesInformationAboutInteresting(
494     SymbolRef Sym, PathSensitiveBugReport &BR) {
495   if (!Sym)
496     return false;
497   for (SymbolRef PartSym : Sym->symbols()) {
498     // The interestingess mark may appear on any layer as we're stripping off
499     // the SymIntExpr, UnarySymExpr etc. layers...
500     if (BR.isInteresting(PartSym))
501       return true;
502     // ...but if both sides of the expression are symbolic, then there is no
503     // practical algorithm to produce separate constraints for the two
504     // operands (from the single combined result).
505     if (isa<SymSymExpr>(PartSym))
506       return false;
507   }
508   return false;
509 }
510 
511 void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
512   const SVal Location = C.getSVal(E);
513 
514   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
515   //   #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
516   // and incomplete analysis of these leads to false positives. As even
517   // accurate reports would be confusing for the users, just disable reports
518   // from these macros:
519   if (isFromCtypeMacro(E, C.getASTContext()))
520     return;
521 
522   ProgramStateRef State = C.getState();
523   SValBuilder &SVB = C.getSValBuilder();
524 
525   const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
526       computeOffset(State, SVB, Location);
527 
528   if (!RawOffset)
529     return;
530 
531   auto [Reg, ByteOffset] = *RawOffset;
532 
533   // The state updates will be reported as a single note tag, which will be
534   // composed by this helper class.
535   StateUpdateReporter SUR(Reg, ByteOffset, E, C);
536 
537   // CHECK LOWER BOUND
538   const MemSpaceRegion *Space = Reg->getMemorySpace();
539   if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
540     // A symbolic region in unknown space represents an unknown pointer that
541     // may point into the middle of an array, so we don't look for underflows.
542     // Both conditions are significant because we want to check underflows in
543     // symbolic regions on the heap (which may be introduced by checkers like
544     // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
545     // non-symbolic regions (e.g. a field subregion of a symbolic region) in
546     // unknown space.
547     auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
548         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
549 
550     if (PrecedesLowerBound) {
551       // The offset may be invalid (negative)...
552       if (!WithinLowerBound) {
553         // ...and it cannot be valid (>= 0), so report an error.
554         Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
555         reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
556         return;
557       }
558       // ...but it can be valid as well, so the checker will (optimistically)
559       // assume that it's valid and mention this in the note tag.
560       SUR.recordNonNegativeAssumption();
561     }
562 
563     // Actually update the state. The "if" only fails in the extremely unlikely
564     // case when compareValueToThreshold returns {nullptr, nullptr} becasue
565     // evalBinOpNN fails to evaluate the less-than operator.
566     if (WithinLowerBound)
567       State = WithinLowerBound;
568   }
569 
570   // CHECK UPPER BOUND
571   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
572   if (auto KnownSize = Size.getAs<NonLoc>()) {
573     auto [WithinUpperBound, ExceedsUpperBound] =
574         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
575 
576     if (ExceedsUpperBound) {
577       // The offset may be invalid (>= Size)...
578       if (!WithinUpperBound) {
579         // ...and it cannot be within bounds, so report an error, unless we can
580         // definitely determine that this is an idiomatic `&array[size]`
581         // expression that calculates the past-the-end pointer.
582         if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
583                                      *KnownSize, C)) {
584           C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
585           return;
586         }
587 
588         Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
589                                        *KnownSize, Location);
590         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
591         return;
592       }
593       // ...and it can be valid as well...
594       if (isTainted(State, ByteOffset)) {
595         // ...but it's tainted, so report an error.
596 
597         // Diagnostic detail: saying "tainted offset" is always correct, but
598         // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
599         // nicer to say "tainted index".
600         const char *OffsetName = "offset";
601         if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
602           if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
603             OffsetName = "index";
604 
605         Messages Msgs = getTaintMsgs(Reg, OffsetName);
606         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
607                   /*IsTaintBug=*/true);
608         return;
609       }
610       // ...and it isn't tainted, so the checker will (optimistically) assume
611       // that the offset is in bounds and mention this in the note tag.
612       SUR.recordUpperBoundAssumption(*KnownSize);
613     }
614 
615     // Actually update the state. The "if" only fails in the extremely unlikely
616     // case when compareValueToThreshold returns {nullptr, nullptr} becasue
617     // evalBinOpNN fails to evaluate the less-than operator.
618     if (WithinUpperBound)
619       State = WithinUpperBound;
620   }
621 
622   // Add a transition, reporting the state updates that we accumulated.
623   C.addTransition(State, SUR.createNoteTag(C));
624 }
625 
626 void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR,
627                                                ProgramStateRef ErrorState,
628                                                NonLoc Val, bool MarkTaint) {
629   if (SymbolRef Sym = Val.getAsSymbol()) {
630     // If the offset is a symbolic value, iterate over its "parts" with
631     // `SymExpr::symbols()` and mark each of them as interesting.
632     // For example, if the offset is `x*4 + y` then we put interestingness onto
633     // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
634     // `x` and `y`.
635     for (SymbolRef PartSym : Sym->symbols())
636       BR.markInteresting(PartSym);
637   }
638 
639   if (MarkTaint) {
640     // If the issue that we're reporting depends on the taintedness of the
641     // offset, then put interestingness onto symbols that could be the origin
642     // of the taint. Note that this may find symbols that did not appear in
643     // `Sym->symbols()` (because they're only loosely connected to `Val`).
644     for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
645       BR.markInteresting(Sym);
646   }
647 }
648 
649 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
650                                     ProgramStateRef ErrorState, Messages Msgs,
651                                     NonLoc Offset, std::optional<NonLoc> Extent,
652                                     bool IsTaintBug /*=false*/) const {
653 
654   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
655   if (!ErrorNode)
656     return;
657 
658   auto BR = std::make_unique<PathSensitiveBugReport>(
659       IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
660 
661   // FIXME: ideally we would just call trackExpressionValue() and that would
662   // "do the right thing": mark the relevant symbols as interesting, track the
663   // control dependencies and statements storing the relevant values and add
664   // helpful diagnostic pieces. However, right now trackExpressionValue() is
665   // a heap of unreliable heuristics, so it would cause several issues:
666   // - Interestingness is not applied consistently, e.g. if `array[x+10]`
667   //   causes an overflow, then `x` is not marked as interesting.
668   // - We get irrelevant diagnostic pieces, e.g. in the code
669   //   `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
670   //   it places a "Storing uninitialized value" note on the `malloc` call
671   //   (which is technically true, but irrelevant).
672   // If trackExpressionValue() becomes reliable, it should be applied instead
673   // of this custom markPartsInteresting().
674   markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
675   if (Extent)
676     markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
677 
678   C.emitReport(std::move(BR));
679 }
680 
681 bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
682   SourceLocation Loc = S->getBeginLoc();
683   if (!Loc.isMacroID())
684     return false;
685 
686   StringRef MacroName = Lexer::getImmediateMacroName(
687       Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
688 
689   if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
690     return false;
691 
692   return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
693           (MacroName == "isblank") || (MacroName == "isdigit") ||
694           (MacroName == "isgraph") || (MacroName == "islower") ||
695           (MacroName == "isnctrl") || (MacroName == "isprint") ||
696           (MacroName == "ispunct") || (MacroName == "isspace") ||
697           (MacroName == "isupper") || (MacroName == "isxdigit"));
698 }
699 
700 bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
701   ParentMapContext &ParentCtx = ACtx.getParentMapContext();
702   do {
703     const DynTypedNodeList Parents = ParentCtx.getParents(*S);
704     if (Parents.empty())
705       return false;
706     S = Parents[0].get<Stmt>();
707   } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
708   const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
709   return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
710 }
711 
712 bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
713                                                    ProgramStateRef State,
714                                                    NonLoc Offset, NonLoc Limit,
715                                                    CheckerContext &C) {
716   if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
717     auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
718         State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
719     return EqualsToThreshold && !NotEqualToThreshold;
720   }
721   return false;
722 }
723 
724 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
725   mgr.registerChecker<ArrayBoundCheckerV2>();
726 }
727 
728 bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
729   return true;
730 }
731