1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // fuzzer::InputCorpus 10 //===----------------------------------------------------------------------===// 11 12 #ifndef LLVM_FUZZER_CORPUS 13 #define LLVM_FUZZER_CORPUS 14 15 #include "FuzzerDataFlowTrace.h" 16 #include "FuzzerDefs.h" 17 #include "FuzzerIO.h" 18 #include "FuzzerRandom.h" 19 #include "FuzzerSHA1.h" 20 #include "FuzzerTracePC.h" 21 #include <algorithm> 22 #include <numeric> 23 #include <random> 24 #include <unordered_set> 25 26 namespace fuzzer { 27 28 struct InputInfo { 29 Unit U; // The actual input data. 30 uint8_t Sha1[kSHA1NumBytes]; // Checksum. 31 // Number of features that this input has and no smaller input has. 32 size_t NumFeatures = 0; 33 size_t Tmp = 0; // Used by ValidateFeatureSet. 34 // Stats. 35 size_t NumExecutedMutations = 0; 36 size_t NumSuccessfullMutations = 0; 37 bool MayDeleteFile = false; 38 bool Reduced = false; 39 bool HasFocusFunction = false; 40 Vector<uint32_t> UniqFeatureSet; 41 Vector<uint8_t> DataFlowTraceForFocusFunction; 42 }; 43 44 class InputCorpus { 45 static const size_t kFeatureSetSize = 1 << 21; 46 public: InputCorpus(const std::string & OutputCorpus)47 InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) { 48 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 49 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 50 } ~InputCorpus()51 ~InputCorpus() { 52 for (auto II : Inputs) 53 delete II; 54 } size()55 size_t size() const { return Inputs.size(); } SizeInBytes()56 size_t SizeInBytes() const { 57 size_t Res = 0; 58 for (auto II : Inputs) 59 Res += II->U.size(); 60 return Res; 61 } NumActiveUnits()62 size_t NumActiveUnits() const { 63 size_t Res = 0; 64 for (auto II : Inputs) 65 Res += !II->U.empty(); 66 return Res; 67 } MaxInputSize()68 size_t MaxInputSize() const { 69 size_t Res = 0; 70 for (auto II : Inputs) 71 Res = std::max(Res, II->U.size()); 72 return Res; 73 } 74 NumInputsThatTouchFocusFunction()75 size_t NumInputsThatTouchFocusFunction() { 76 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 77 return II->HasFocusFunction; 78 }); 79 } 80 NumInputsWithDataFlowTrace()81 size_t NumInputsWithDataFlowTrace() { 82 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 83 return !II->DataFlowTraceForFocusFunction.empty(); 84 }); 85 } 86 empty()87 bool empty() const { return Inputs.empty(); } 88 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } AddToCorpus(const Unit & U,size_t NumFeatures,bool MayDeleteFile,bool HasFocusFunction,const Vector<uint32_t> & FeatureSet,const DataFlowTrace & DFT,const InputInfo * BaseII)89 void AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 90 bool HasFocusFunction, const Vector<uint32_t> &FeatureSet, 91 const DataFlowTrace &DFT, const InputInfo *BaseII) { 92 assert(!U.empty()); 93 if (FeatureDebug) 94 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 95 Inputs.push_back(new InputInfo()); 96 InputInfo &II = *Inputs.back(); 97 II.U = U; 98 II.NumFeatures = NumFeatures; 99 II.MayDeleteFile = MayDeleteFile; 100 II.UniqFeatureSet = FeatureSet; 101 II.HasFocusFunction = HasFocusFunction; 102 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 103 ComputeSHA1(U.data(), U.size(), II.Sha1); 104 auto Sha1Str = Sha1ToString(II.Sha1); 105 Hashes.insert(Sha1Str); 106 if (HasFocusFunction) 107 if (auto V = DFT.Get(Sha1Str)) 108 II.DataFlowTraceForFocusFunction = *V; 109 // This is a gross heuristic. 110 // Ideally, when we add an element to a corpus we need to know its DFT. 111 // But if we don't, we'll use the DFT of its base input. 112 if (II.DataFlowTraceForFocusFunction.empty() && BaseII) 113 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction; 114 UpdateCorpusDistribution(); 115 PrintCorpus(); 116 // ValidateFeatureSet(); 117 } 118 119 // Debug-only PrintUnit(const Unit & U)120 void PrintUnit(const Unit &U) { 121 if (!FeatureDebug) return; 122 for (uint8_t C : U) { 123 if (C != 'F' && C != 'U' && C != 'Z') 124 C = '.'; 125 Printf("%c", C); 126 } 127 } 128 129 // Debug-only PrintFeatureSet(const Vector<uint32_t> & FeatureSet)130 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) { 131 if (!FeatureDebug) return; 132 Printf("{"); 133 for (uint32_t Feature: FeatureSet) 134 Printf("%u,", Feature); 135 Printf("}"); 136 } 137 138 // Debug-only PrintCorpus()139 void PrintCorpus() { 140 if (!FeatureDebug) return; 141 Printf("======= CORPUS:\n"); 142 int i = 0; 143 for (auto II : Inputs) { 144 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 145 Printf("[%2d] ", i); 146 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 147 PrintUnit(II->U); 148 Printf(" "); 149 PrintFeatureSet(II->UniqFeatureSet); 150 Printf("\n"); 151 } 152 i++; 153 } 154 } 155 Replace(InputInfo * II,const Unit & U)156 void Replace(InputInfo *II, const Unit &U) { 157 assert(II->U.size() > U.size()); 158 Hashes.erase(Sha1ToString(II->Sha1)); 159 DeleteFile(*II); 160 ComputeSHA1(U.data(), U.size(), II->Sha1); 161 Hashes.insert(Sha1ToString(II->Sha1)); 162 II->U = U; 163 II->Reduced = true; 164 UpdateCorpusDistribution(); 165 } 166 HasUnit(const Unit & U)167 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } HasUnit(const std::string & H)168 bool HasUnit(const std::string &H) { return Hashes.count(H); } ChooseUnitToMutate(Random & Rand)169 InputInfo &ChooseUnitToMutate(Random &Rand) { 170 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 171 assert(!II.U.empty()); 172 return II; 173 }; 174 175 // Returns an index of random unit from the corpus to mutate. ChooseUnitIdxToMutate(Random & Rand)176 size_t ChooseUnitIdxToMutate(Random &Rand) { 177 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 178 assert(Idx < Inputs.size()); 179 return Idx; 180 } 181 PrintStats()182 void PrintStats() { 183 for (size_t i = 0; i < Inputs.size(); i++) { 184 const auto &II = *Inputs[i]; 185 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i, 186 Sha1ToString(II.Sha1).c_str(), II.U.size(), 187 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction); 188 } 189 } 190 PrintFeatureSet()191 void PrintFeatureSet() { 192 for (size_t i = 0; i < kFeatureSetSize; i++) { 193 if(size_t Sz = GetFeature(i)) 194 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 195 } 196 Printf("\n\t"); 197 for (size_t i = 0; i < Inputs.size(); i++) 198 if (size_t N = Inputs[i]->NumFeatures) 199 Printf(" %zd=>%zd ", i, N); 200 Printf("\n"); 201 } 202 DeleteFile(const InputInfo & II)203 void DeleteFile(const InputInfo &II) { 204 if (!OutputCorpus.empty() && II.MayDeleteFile) 205 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 206 } 207 DeleteInput(size_t Idx)208 void DeleteInput(size_t Idx) { 209 InputInfo &II = *Inputs[Idx]; 210 DeleteFile(II); 211 Unit().swap(II.U); 212 if (FeatureDebug) 213 Printf("EVICTED %zd\n", Idx); 214 } 215 AddFeature(size_t Idx,uint32_t NewSize,bool Shrink)216 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 217 assert(NewSize); 218 Idx = Idx % kFeatureSetSize; 219 uint32_t OldSize = GetFeature(Idx); 220 if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 221 if (OldSize > 0) { 222 size_t OldIdx = SmallestElementPerFeature[Idx]; 223 InputInfo &II = *Inputs[OldIdx]; 224 assert(II.NumFeatures > 0); 225 II.NumFeatures--; 226 if (II.NumFeatures == 0) 227 DeleteInput(OldIdx); 228 } else { 229 NumAddedFeatures++; 230 } 231 NumUpdatedFeatures++; 232 if (FeatureDebug) 233 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 234 SmallestElementPerFeature[Idx] = Inputs.size(); 235 InputSizesPerFeature[Idx] = NewSize; 236 return true; 237 } 238 return false; 239 } 240 IsFeatureNew(size_t Idx,uint32_t NewSize,bool Shrink)241 bool IsFeatureNew(size_t Idx, uint32_t NewSize, bool Shrink) { 242 assert(NewSize); 243 uint32_t OldSize = GetFeature(Idx % kFeatureSetSize); 244 return OldSize == 0 || (Shrink && OldSize > NewSize); 245 } 246 NumFeatures()247 size_t NumFeatures() const { return NumAddedFeatures; } NumFeatureUpdates()248 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 249 250 private: 251 252 static const bool FeatureDebug = false; 253 GetFeature(size_t Idx)254 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 255 ValidateFeatureSet()256 void ValidateFeatureSet() { 257 if (FeatureDebug) 258 PrintFeatureSet(); 259 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 260 if (GetFeature(Idx)) 261 Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 262 for (auto II: Inputs) { 263 if (II->Tmp != II->NumFeatures) 264 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 265 assert(II->Tmp == II->NumFeatures); 266 II->Tmp = 0; 267 } 268 } 269 270 // Updates the probability distribution for the units in the corpus. 271 // Must be called whenever the corpus or unit weights are changed. 272 // 273 // Hypothesis: units added to the corpus last are more interesting. 274 // 275 // Hypothesis: inputs with infrequent features are more interesting. UpdateCorpusDistribution()276 void UpdateCorpusDistribution() { 277 size_t N = Inputs.size(); 278 assert(N); 279 Intervals.resize(N + 1); 280 Weights.resize(N); 281 std::iota(Intervals.begin(), Intervals.end(), 0); 282 for (size_t i = 0; i < N; i++) 283 Weights[i] = Inputs[i]->NumFeatures 284 ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1) 285 : 0.; 286 if (FeatureDebug) { 287 for (size_t i = 0; i < N; i++) 288 Printf("%zd ", Inputs[i]->NumFeatures); 289 Printf("SCORE\n"); 290 for (size_t i = 0; i < N; i++) 291 Printf("%f ", Weights[i]); 292 Printf("Weights\n"); 293 } 294 CorpusDistribution = std::piecewise_constant_distribution<double>( 295 Intervals.begin(), Intervals.end(), Weights.begin()); 296 } 297 std::piecewise_constant_distribution<double> CorpusDistribution; 298 299 Vector<double> Intervals; 300 Vector<double> Weights; 301 302 std::unordered_set<std::string> Hashes; 303 Vector<InputInfo*> Inputs; 304 305 size_t NumAddedFeatures = 0; 306 size_t NumUpdatedFeatures = 0; 307 uint32_t InputSizesPerFeature[kFeatureSetSize]; 308 uint32_t SmallestElementPerFeature[kFeatureSetSize]; 309 310 std::string OutputCorpus; 311 }; 312 313 } // namespace fuzzer 314 315 #endif // LLVM_FUZZER_CORPUS 316