1 //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines ArrayBoundCheckerV2, which is a path-sensitive check 10 // which looks for an out-of-bound array element access. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "clang/AST/CharUnits.h" 15 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" 16 #include "clang/StaticAnalyzer/Checkers/Taint.h" 17 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" 18 #include "clang/StaticAnalyzer/Core/Checker.h" 19 #include "clang/StaticAnalyzer/Core/CheckerManager.h" 20 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h" 21 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" 22 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h" 23 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h" 24 #include "llvm/ADT/SmallString.h" 25 #include "llvm/Support/FormatVariadic.h" 26 #include "llvm/Support/raw_ostream.h" 27 #include <optional> 28 29 using namespace clang; 30 using namespace ento; 31 using namespace taint; 32 using llvm::formatv; 33 34 namespace { 35 enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint }; 36 37 class ArrayBoundCheckerV2 : 38 public Checker<check::Location> { 39 BugType BT{this, "Out-of-bound access"}; 40 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData}; 41 42 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind, 43 NonLoc Offset, std::string RegName, std::string Msg) const; 44 45 static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC); 46 47 public: 48 void checkLocation(SVal l, bool isLoad, const Stmt *S, 49 CheckerContext &C) const; 50 }; 51 } // anonymous namespace 52 53 /// For a given Location that can be represented as a symbolic expression 54 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block 55 /// Arr and the distance of Location from the beginning of Arr (expressed in a 56 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these 57 /// cannot be determined. 58 static std::optional<std::pair<const SubRegion *, NonLoc>> 59 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) { 60 QualType T = SVB.getArrayIndexType(); 61 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) { 62 // We will use this utility to add and multiply values. 63 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>(); 64 }; 65 66 const SubRegion *OwnerRegion = nullptr; 67 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex(); 68 69 const ElementRegion *CurRegion = 70 dyn_cast_or_null<ElementRegion>(Location.getAsRegion()); 71 72 while (CurRegion) { 73 const auto Index = CurRegion->getIndex().getAs<NonLoc>(); 74 if (!Index) 75 return std::nullopt; 76 77 QualType ElemType = CurRegion->getElementType(); 78 79 // FIXME: The following early return was presumably added to safeguard the 80 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but 81 // it seems that `ElemType` cannot be incomplete at this point. 82 if (ElemType->isIncompleteType()) 83 return std::nullopt; 84 85 // Calculate Delta = Index * sizeof(ElemType). 86 NonLoc Size = SVB.makeArrayIndex( 87 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity()); 88 auto Delta = EvalBinOp(BO_Mul, *Index, Size); 89 if (!Delta) 90 return std::nullopt; 91 92 // Perform Offset += Delta. 93 Offset = EvalBinOp(BO_Add, *Offset, *Delta); 94 if (!Offset) 95 return std::nullopt; 96 97 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>(); 98 // When this is just another ElementRegion layer, we need to continue the 99 // offset calculations: 100 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion); 101 } 102 103 if (OwnerRegion) 104 return std::make_pair(OwnerRegion, *Offset); 105 106 return std::nullopt; 107 } 108 109 // TODO: once the constraint manager is smart enough to handle non simplified 110 // symbolic expressions remove this function. Note that this can not be used in 111 // the constraint manager as is, since this does not handle overflows. It is 112 // safe to assume, however, that memory offsets will not overflow. 113 // NOTE: callers of this function need to be aware of the effects of overflows 114 // and signed<->unsigned conversions! 115 static std::pair<NonLoc, nonloc::ConcreteInt> 116 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, 117 SValBuilder &svalBuilder) { 118 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>(); 119 if (SymVal && SymVal->isExpression()) { 120 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) { 121 llvm::APSInt constant = 122 APSIntType(extent.getValue()).convert(SIE->getRHS()); 123 switch (SIE->getOpcode()) { 124 case BO_Mul: 125 // The constant should never be 0 here, becasue multiplication by zero 126 // is simplified by the engine. 127 if ((extent.getValue() % constant) != 0) 128 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); 129 else 130 return getSimplifiedOffsets( 131 nonloc::SymbolVal(SIE->getLHS()), 132 svalBuilder.makeIntVal(extent.getValue() / constant), 133 svalBuilder); 134 case BO_Add: 135 return getSimplifiedOffsets( 136 nonloc::SymbolVal(SIE->getLHS()), 137 svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder); 138 default: 139 break; 140 } 141 } 142 } 143 144 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); 145 } 146 147 // Evaluate the comparison Value < Threshold with the help of the custom 148 // simplification algorithm defined for this checker. Return a pair of states, 149 // where the first one corresponds to "value below threshold" and the second 150 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in 151 // the case when the evaluation fails. 152 static std::pair<ProgramStateRef, ProgramStateRef> 153 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, 154 SValBuilder &SVB) { 155 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) { 156 std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB); 157 } 158 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) { 159 QualType T = Value.getType(SVB.getContext()); 160 if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) { 161 // In this case we reduced the bound check to a comparison of the form 162 // (symbol or value with unsigned type) < (negative number) 163 // which is always false. We are handling these cases separately because 164 // evalBinOpNN can perform a signed->unsigned conversion that turns the 165 // negative number into a huge positive value and leads to wildly 166 // inaccurate conclusions. 167 return {nullptr, State}; 168 } 169 } 170 auto BelowThreshold = 171 SVB.evalBinOpNN(State, BO_LT, Value, Threshold, SVB.getConditionType()).getAs<NonLoc>(); 172 173 if (BelowThreshold) 174 return State->assume(*BelowThreshold); 175 176 return {nullptr, nullptr}; 177 } 178 179 static std::string getRegionName(const SubRegion *Region) { 180 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty()) 181 return RegName; 182 183 // Field regions only have descriptive names when their parent has a 184 // descriptive name; so we provide a fallback representation for them: 185 if (const auto *FR = Region->getAs<FieldRegion>()) { 186 if (StringRef Name = FR->getDecl()->getName(); !Name.empty()) 187 return formatv("the field '{0}'", Name); 188 return "the unnamed field"; 189 } 190 191 if (isa<AllocaRegion>(Region)) 192 return "the memory returned by 'alloca'"; 193 194 if (isa<SymbolicRegion>(Region) && 195 isa<HeapSpaceRegion>(Region->getMemorySpace())) 196 return "the heap area"; 197 198 if (isa<StringRegion>(Region)) 199 return "the string literal"; 200 201 return "the region"; 202 } 203 204 static std::optional<int64_t> getConcreteValue(NonLoc SV) { 205 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) { 206 return ConcreteVal->getValue().tryExtValue(); 207 } 208 return std::nullopt; 209 } 210 211 static std::string getShortMsg(OOB_Kind Kind, std::string RegName) { 212 static const char *ShortMsgTemplates[] = { 213 "Out of bound access to memory preceding {0}", 214 "Out of bound access to memory after the end of {0}", 215 "Potential out of bound access to {0} with tainted offset"}; 216 217 return formatv(ShortMsgTemplates[Kind], RegName); 218 } 219 220 static std::string getPrecedesMsg(std::string RegName, NonLoc Offset) { 221 SmallString<128> Buf; 222 llvm::raw_svector_ostream Out(Buf); 223 Out << "Access of " << RegName << " at negative byte offset"; 224 if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>()) 225 Out << ' ' << ConcreteIdx->getValue(); 226 return std::string(Buf); 227 } 228 static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName, 229 NonLoc Offset, NonLoc Extent, SVal Location) { 230 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>(); 231 assert(EReg && "this checker only handles element access"); 232 QualType ElemType = EReg->getElementType(); 233 234 std::optional<int64_t> OffsetN = getConcreteValue(Offset); 235 std::optional<int64_t> ExtentN = getConcreteValue(Extent); 236 237 bool UseByteOffsets = true; 238 if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) { 239 const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize; 240 const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize; 241 if (!OffsetHasRemainder && !ExtentHasRemainder) { 242 UseByteOffsets = false; 243 if (OffsetN) 244 *OffsetN /= ElemSize; 245 if (ExtentN) 246 *ExtentN /= ElemSize; 247 } 248 } 249 250 SmallString<256> Buf; 251 llvm::raw_svector_ostream Out(Buf); 252 Out << "Access of "; 253 if (!ExtentN && !UseByteOffsets) 254 Out << "'" << ElemType.getAsString() << "' element in "; 255 Out << RegName << " at "; 256 if (OffsetN) { 257 Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN; 258 } else { 259 Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index"); 260 } 261 if (ExtentN) { 262 Out << ", while it holds only "; 263 if (*ExtentN != 1) 264 Out << *ExtentN; 265 else 266 Out << "a single"; 267 if (UseByteOffsets) 268 Out << " byte"; 269 else 270 Out << " '" << ElemType.getAsString() << "' element"; 271 272 if (*ExtentN > 1) 273 Out << "s"; 274 } 275 276 return std::string(Buf); 277 } 278 static std::string getTaintMsg(std::string RegName) { 279 SmallString<128> Buf; 280 llvm::raw_svector_ostream Out(Buf); 281 Out << "Access of " << RegName 282 << " with a tainted offset that may be too large"; 283 return std::string(Buf); 284 } 285 286 void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad, 287 const Stmt *LoadS, 288 CheckerContext &C) const { 289 290 // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping 291 // some new logic here that reasons directly about memory region extents. 292 // Once that logic is more mature, we can bring it back to assumeInBound() 293 // for all clients to use. 294 // 295 // The algorithm we are using here for bounds checking is to see if the 296 // memory access is within the extent of the base region. Since we 297 // have some flexibility in defining the base region, we can achieve 298 // various levels of conservatism in our buffer overflow checking. 299 300 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as 301 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX) 302 // and incomplete analysis of these leads to false positives. As even 303 // accurate reports would be confusing for the users, just disable reports 304 // from these macros: 305 if (isFromCtypeMacro(LoadS, C.getASTContext())) 306 return; 307 308 ProgramStateRef State = C.getState(); 309 SValBuilder &SVB = C.getSValBuilder(); 310 311 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset = 312 computeOffset(State, SVB, Location); 313 314 if (!RawOffset) 315 return; 316 317 auto [Reg, ByteOffset] = *RawOffset; 318 319 // CHECK LOWER BOUND 320 const MemSpaceRegion *Space = Reg->getMemorySpace(); 321 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) { 322 // A symbolic region in unknown space represents an unknown pointer that 323 // may point into the middle of an array, so we don't look for underflows. 324 // Both conditions are significant because we want to check underflows in 325 // symbolic regions on the heap (which may be introduced by checkers like 326 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and 327 // non-symbolic regions (e.g. a field subregion of a symbolic region) in 328 // unknown space. 329 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold( 330 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB); 331 332 if (PrecedesLowerBound && !WithinLowerBound) { 333 // We know that the index definitely precedes the lower bound. 334 std::string RegName = getRegionName(Reg); 335 std::string Msg = getPrecedesMsg(RegName, ByteOffset); 336 reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, RegName, Msg); 337 return; 338 } 339 340 if (WithinLowerBound) 341 State = WithinLowerBound; 342 } 343 344 // CHECK UPPER BOUND 345 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB); 346 if (auto KnownSize = Size.getAs<NonLoc>()) { 347 auto [WithinUpperBound, ExceedsUpperBound] = 348 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB); 349 350 if (ExceedsUpperBound) { 351 if (!WithinUpperBound) { 352 // We know that the index definitely exceeds the upper bound. 353 std::string RegName = getRegionName(Reg); 354 std::string Msg = getExceedsMsg(C.getASTContext(), RegName, ByteOffset, 355 *KnownSize, Location); 356 reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, RegName, Msg); 357 return; 358 } 359 if (isTainted(State, ByteOffset)) { 360 // Both cases are possible, but the index is tainted, so report. 361 std::string RegName = getRegionName(Reg); 362 std::string Msg = getTaintMsg(RegName); 363 reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, RegName, Msg); 364 return; 365 } 366 } 367 368 if (WithinUpperBound) 369 State = WithinUpperBound; 370 } 371 372 C.addTransition(State); 373 } 374 375 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C, 376 ProgramStateRef ErrorState, OOB_Kind Kind, 377 NonLoc Offset, std::string RegName, 378 std::string Msg) const { 379 380 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState); 381 if (!ErrorNode) 382 return; 383 384 std::string ShortMsg = getShortMsg(Kind, RegName); 385 386 auto BR = std::make_unique<PathSensitiveBugReport>( 387 Kind == OOB_Taint ? TaintBT : BT, ShortMsg, Msg, ErrorNode); 388 389 // Track back the propagation of taintedness. 390 if (Kind == OOB_Taint) 391 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset)) 392 BR->markInteresting(Sym); 393 394 C.emitReport(std::move(BR)); 395 } 396 397 bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) { 398 SourceLocation Loc = S->getBeginLoc(); 399 if (!Loc.isMacroID()) 400 return false; 401 402 StringRef MacroName = Lexer::getImmediateMacroName( 403 Loc, ACtx.getSourceManager(), ACtx.getLangOpts()); 404 405 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's') 406 return false; 407 408 return ((MacroName == "isalnum") || (MacroName == "isalpha") || 409 (MacroName == "isblank") || (MacroName == "isdigit") || 410 (MacroName == "isgraph") || (MacroName == "islower") || 411 (MacroName == "isnctrl") || (MacroName == "isprint") || 412 (MacroName == "ispunct") || (MacroName == "isspace") || 413 (MacroName == "isupper") || (MacroName == "isxdigit")); 414 } 415 416 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) { 417 mgr.registerChecker<ArrayBoundCheckerV2>(); 418 } 419 420 bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) { 421 return true; 422 } 423