10b57cec5SDimitry Andric //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // fuzzer::InputCorpus 90b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 100b57cec5SDimitry Andric 110b57cec5SDimitry Andric #ifndef LLVM_FUZZER_CORPUS 120b57cec5SDimitry Andric #define LLVM_FUZZER_CORPUS 130b57cec5SDimitry Andric 140b57cec5SDimitry Andric #include "FuzzerDataFlowTrace.h" 150b57cec5SDimitry Andric #include "FuzzerDefs.h" 160b57cec5SDimitry Andric #include "FuzzerIO.h" 170b57cec5SDimitry Andric #include "FuzzerRandom.h" 180b57cec5SDimitry Andric #include "FuzzerSHA1.h" 190b57cec5SDimitry Andric #include "FuzzerTracePC.h" 200b57cec5SDimitry Andric #include <algorithm> 21*5f757f3fSDimitry Andric #include <bitset> 22e8d8bef9SDimitry Andric #include <chrono> 230b57cec5SDimitry Andric #include <numeric> 240b57cec5SDimitry Andric #include <random> 250b57cec5SDimitry Andric #include <unordered_set> 260b57cec5SDimitry Andric 270b57cec5SDimitry Andric namespace fuzzer { 280b57cec5SDimitry Andric 290b57cec5SDimitry Andric struct InputInfo { 300b57cec5SDimitry Andric Unit U; // The actual input data. 31e8d8bef9SDimitry Andric std::chrono::microseconds TimeOfUnit; 320b57cec5SDimitry Andric uint8_t Sha1[kSHA1NumBytes]; // Checksum. 330b57cec5SDimitry Andric // Number of features that this input has and no smaller input has. 340b57cec5SDimitry Andric size_t NumFeatures = 0; 350b57cec5SDimitry Andric size_t Tmp = 0; // Used by ValidateFeatureSet. 360b57cec5SDimitry Andric // Stats. 370b57cec5SDimitry Andric size_t NumExecutedMutations = 0; 380b57cec5SDimitry Andric size_t NumSuccessfullMutations = 0; 39e8d8bef9SDimitry Andric bool NeverReduce = false; 400b57cec5SDimitry Andric bool MayDeleteFile = false; 410b57cec5SDimitry Andric bool Reduced = false; 420b57cec5SDimitry Andric bool HasFocusFunction = false; 43349cc55cSDimitry Andric std::vector<uint32_t> UniqFeatureSet; 44349cc55cSDimitry Andric std::vector<uint8_t> DataFlowTraceForFocusFunction; 455ffd83dbSDimitry Andric // Power schedule. 465ffd83dbSDimitry Andric bool NeedsEnergyUpdate = false; 475ffd83dbSDimitry Andric double Energy = 0.0; 48fe6060f1SDimitry Andric double SumIncidence = 0.0; 49349cc55cSDimitry Andric std::vector<std::pair<uint32_t, uint16_t>> FeatureFreqs; 505ffd83dbSDimitry Andric 515ffd83dbSDimitry Andric // Delete feature Idx and its frequency from FeatureFreqs. DeleteFeatureFreqInputInfo525ffd83dbSDimitry Andric bool DeleteFeatureFreq(uint32_t Idx) { 535ffd83dbSDimitry Andric if (FeatureFreqs.empty()) 545ffd83dbSDimitry Andric return false; 555ffd83dbSDimitry Andric 565ffd83dbSDimitry Andric // Binary search over local feature frequencies sorted by index. 575ffd83dbSDimitry Andric auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 585ffd83dbSDimitry Andric std::pair<uint32_t, uint16_t>(Idx, 0)); 595ffd83dbSDimitry Andric 605ffd83dbSDimitry Andric if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 615ffd83dbSDimitry Andric FeatureFreqs.erase(Lower); 625ffd83dbSDimitry Andric return true; 635ffd83dbSDimitry Andric } 645ffd83dbSDimitry Andric return false; 655ffd83dbSDimitry Andric } 665ffd83dbSDimitry Andric 675ffd83dbSDimitry Andric // Assign more energy to a high-entropy seed, i.e., that reveals more 68e8d8bef9SDimitry Andric // information about the globally rare features in the neighborhood of the 69e8d8bef9SDimitry Andric // seed. Since we do not know the entropy of a seed that has never been 70e8d8bef9SDimitry Andric // executed we assign fresh seeds maximum entropy and let II->Energy approach 71e8d8bef9SDimitry Andric // the true entropy from above. If ScalePerExecTime is true, the computed 72e8d8bef9SDimitry Andric // entropy is scaled based on how fast this input executes compared to the 73e8d8bef9SDimitry Andric // average execution time of inputs. The faster an input executes, the more 74e8d8bef9SDimitry Andric // energy gets assigned to the input. UpdateEnergyInputInfo75e8d8bef9SDimitry Andric void UpdateEnergy(size_t GlobalNumberOfFeatures, bool ScalePerExecTime, 76e8d8bef9SDimitry Andric std::chrono::microseconds AverageUnitExecutionTime) { 775ffd83dbSDimitry Andric Energy = 0.0; 78fe6060f1SDimitry Andric SumIncidence = 0.0; 795ffd83dbSDimitry Andric 805ffd83dbSDimitry Andric // Apply add-one smoothing to locally discovered features. 8106c3fb27SDimitry Andric for (const auto &F : FeatureFreqs) { 82fe6060f1SDimitry Andric double LocalIncidence = F.second + 1; 83fe6060f1SDimitry Andric Energy -= LocalIncidence * log(LocalIncidence); 845ffd83dbSDimitry Andric SumIncidence += LocalIncidence; 855ffd83dbSDimitry Andric } 865ffd83dbSDimitry Andric 875ffd83dbSDimitry Andric // Apply add-one smoothing to locally undiscovered features. 88fe6060f1SDimitry Andric // PreciseEnergy -= 0; // since log(1.0) == 0) 89fe6060f1SDimitry Andric SumIncidence += 90fe6060f1SDimitry Andric static_cast<double>(GlobalNumberOfFeatures - FeatureFreqs.size()); 915ffd83dbSDimitry Andric 925ffd83dbSDimitry Andric // Add a single locally abundant feature apply add-one smoothing. 93fe6060f1SDimitry Andric double AbdIncidence = static_cast<double>(NumExecutedMutations + 1); 94fe6060f1SDimitry Andric Energy -= AbdIncidence * log(AbdIncidence); 955ffd83dbSDimitry Andric SumIncidence += AbdIncidence; 965ffd83dbSDimitry Andric 975ffd83dbSDimitry Andric // Normalize. 985ffd83dbSDimitry Andric if (SumIncidence != 0) 99fe6060f1SDimitry Andric Energy = Energy / SumIncidence + log(SumIncidence); 100e8d8bef9SDimitry Andric 101e8d8bef9SDimitry Andric if (ScalePerExecTime) { 102e8d8bef9SDimitry Andric // Scaling to favor inputs with lower execution time. 103e8d8bef9SDimitry Andric uint32_t PerfScore = 100; 104e8d8bef9SDimitry Andric if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 10) 105e8d8bef9SDimitry Andric PerfScore = 10; 106e8d8bef9SDimitry Andric else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 4) 107e8d8bef9SDimitry Andric PerfScore = 25; 108e8d8bef9SDimitry Andric else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 2) 109e8d8bef9SDimitry Andric PerfScore = 50; 110e8d8bef9SDimitry Andric else if (TimeOfUnit.count() * 3 > AverageUnitExecutionTime.count() * 4) 111e8d8bef9SDimitry Andric PerfScore = 75; 112e8d8bef9SDimitry Andric else if (TimeOfUnit.count() * 4 < AverageUnitExecutionTime.count()) 113e8d8bef9SDimitry Andric PerfScore = 300; 114e8d8bef9SDimitry Andric else if (TimeOfUnit.count() * 3 < AverageUnitExecutionTime.count()) 115e8d8bef9SDimitry Andric PerfScore = 200; 116e8d8bef9SDimitry Andric else if (TimeOfUnit.count() * 2 < AverageUnitExecutionTime.count()) 117e8d8bef9SDimitry Andric PerfScore = 150; 118e8d8bef9SDimitry Andric 119e8d8bef9SDimitry Andric Energy *= PerfScore; 120e8d8bef9SDimitry Andric } 1215ffd83dbSDimitry Andric } 1225ffd83dbSDimitry Andric 1235ffd83dbSDimitry Andric // Increment the frequency of the feature Idx. UpdateFeatureFrequencyInputInfo1245ffd83dbSDimitry Andric void UpdateFeatureFrequency(uint32_t Idx) { 1255ffd83dbSDimitry Andric NeedsEnergyUpdate = true; 1265ffd83dbSDimitry Andric 1275ffd83dbSDimitry Andric // The local feature frequencies is an ordered vector of pairs. 1285ffd83dbSDimitry Andric // If there are no local feature frequencies, push_back preserves order. 1295ffd83dbSDimitry Andric // Set the feature frequency for feature Idx32 to 1. 1305ffd83dbSDimitry Andric if (FeatureFreqs.empty()) { 1315ffd83dbSDimitry Andric FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1)); 1325ffd83dbSDimitry Andric return; 1335ffd83dbSDimitry Andric } 1345ffd83dbSDimitry Andric 1355ffd83dbSDimitry Andric // Binary search over local feature frequencies sorted by index. 1365ffd83dbSDimitry Andric auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 1375ffd83dbSDimitry Andric std::pair<uint32_t, uint16_t>(Idx, 0)); 1385ffd83dbSDimitry Andric 1395ffd83dbSDimitry Andric // If feature Idx32 already exists, increment its frequency. 1405ffd83dbSDimitry Andric // Otherwise, insert a new pair right after the next lower index. 1415ffd83dbSDimitry Andric if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 1425ffd83dbSDimitry Andric Lower->second++; 1435ffd83dbSDimitry Andric } else { 1445ffd83dbSDimitry Andric FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1)); 1455ffd83dbSDimitry Andric } 1465ffd83dbSDimitry Andric } 1475ffd83dbSDimitry Andric }; 1485ffd83dbSDimitry Andric 1495ffd83dbSDimitry Andric struct EntropicOptions { 1505ffd83dbSDimitry Andric bool Enabled; 1515ffd83dbSDimitry Andric size_t NumberOfRarestFeatures; 1525ffd83dbSDimitry Andric size_t FeatureFrequencyThreshold; 153e8d8bef9SDimitry Andric bool ScalePerExecTime; 1540b57cec5SDimitry Andric }; 1550b57cec5SDimitry Andric 1560b57cec5SDimitry Andric class InputCorpus { 1575ffd83dbSDimitry Andric static const uint32_t kFeatureSetSize = 1 << 21; 1585ffd83dbSDimitry Andric static const uint8_t kMaxMutationFactor = 20; 1595ffd83dbSDimitry Andric static const size_t kSparseEnergyUpdates = 100; 1605ffd83dbSDimitry Andric 1615ffd83dbSDimitry Andric size_t NumExecutedMutations = 0; 1625ffd83dbSDimitry Andric 1635ffd83dbSDimitry Andric EntropicOptions Entropic; 1645ffd83dbSDimitry Andric 1650b57cec5SDimitry Andric public: InputCorpus(const std::string & OutputCorpus,EntropicOptions Entropic)1665ffd83dbSDimitry Andric InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic) 1675ffd83dbSDimitry Andric : Entropic(Entropic), OutputCorpus(OutputCorpus) { 1680b57cec5SDimitry Andric memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 1690b57cec5SDimitry Andric memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 1700b57cec5SDimitry Andric } ~InputCorpus()1710b57cec5SDimitry Andric ~InputCorpus() { 1720b57cec5SDimitry Andric for (auto II : Inputs) 1730b57cec5SDimitry Andric delete II; 1740b57cec5SDimitry Andric } size()1750b57cec5SDimitry Andric size_t size() const { return Inputs.size(); } SizeInBytes()1760b57cec5SDimitry Andric size_t SizeInBytes() const { 1770b57cec5SDimitry Andric size_t Res = 0; 1780b57cec5SDimitry Andric for (auto II : Inputs) 1790b57cec5SDimitry Andric Res += II->U.size(); 1800b57cec5SDimitry Andric return Res; 1810b57cec5SDimitry Andric } NumActiveUnits()1820b57cec5SDimitry Andric size_t NumActiveUnits() const { 1830b57cec5SDimitry Andric size_t Res = 0; 1840b57cec5SDimitry Andric for (auto II : Inputs) 1850b57cec5SDimitry Andric Res += !II->U.empty(); 1860b57cec5SDimitry Andric return Res; 1870b57cec5SDimitry Andric } MaxInputSize()1880b57cec5SDimitry Andric size_t MaxInputSize() const { 1890b57cec5SDimitry Andric size_t Res = 0; 1900b57cec5SDimitry Andric for (auto II : Inputs) 1910b57cec5SDimitry Andric Res = std::max(Res, II->U.size()); 1920b57cec5SDimitry Andric return Res; 1930b57cec5SDimitry Andric } IncrementNumExecutedMutations()1945ffd83dbSDimitry Andric void IncrementNumExecutedMutations() { NumExecutedMutations++; } 1950b57cec5SDimitry Andric NumInputsThatTouchFocusFunction()1960b57cec5SDimitry Andric size_t NumInputsThatTouchFocusFunction() { 1970b57cec5SDimitry Andric return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 1980b57cec5SDimitry Andric return II->HasFocusFunction; 1990b57cec5SDimitry Andric }); 2000b57cec5SDimitry Andric } 2010b57cec5SDimitry Andric NumInputsWithDataFlowTrace()2020b57cec5SDimitry Andric size_t NumInputsWithDataFlowTrace() { 2030b57cec5SDimitry Andric return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 2040b57cec5SDimitry Andric return !II->DataFlowTraceForFocusFunction.empty(); 2050b57cec5SDimitry Andric }); 2060b57cec5SDimitry Andric } 2070b57cec5SDimitry Andric empty()2080b57cec5SDimitry Andric bool empty() const { return Inputs.empty(); } 2090b57cec5SDimitry Andric const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } AddToCorpus(const Unit & U,size_t NumFeatures,bool MayDeleteFile,bool HasFocusFunction,bool NeverReduce,std::chrono::microseconds TimeOfUnit,const std::vector<uint32_t> & FeatureSet,const DataFlowTrace & DFT,const InputInfo * BaseII)2100b57cec5SDimitry Andric InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 211e8d8bef9SDimitry Andric bool HasFocusFunction, bool NeverReduce, 212e8d8bef9SDimitry Andric std::chrono::microseconds TimeOfUnit, 213349cc55cSDimitry Andric const std::vector<uint32_t> &FeatureSet, 2140b57cec5SDimitry Andric const DataFlowTrace &DFT, const InputInfo *BaseII) { 2150b57cec5SDimitry Andric assert(!U.empty()); 2160b57cec5SDimitry Andric if (FeatureDebug) 2170b57cec5SDimitry Andric Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 218fe6060f1SDimitry Andric // Inputs.size() is cast to uint32_t below. 219fe6060f1SDimitry Andric assert(Inputs.size() < std::numeric_limits<uint32_t>::max()); 2200b57cec5SDimitry Andric Inputs.push_back(new InputInfo()); 2210b57cec5SDimitry Andric InputInfo &II = *Inputs.back(); 2220b57cec5SDimitry Andric II.U = U; 2230b57cec5SDimitry Andric II.NumFeatures = NumFeatures; 224e8d8bef9SDimitry Andric II.NeverReduce = NeverReduce; 225e8d8bef9SDimitry Andric II.TimeOfUnit = TimeOfUnit; 2260b57cec5SDimitry Andric II.MayDeleteFile = MayDeleteFile; 2270b57cec5SDimitry Andric II.UniqFeatureSet = FeatureSet; 2280b57cec5SDimitry Andric II.HasFocusFunction = HasFocusFunction; 2295ffd83dbSDimitry Andric // Assign maximal energy to the new seed. 2305ffd83dbSDimitry Andric II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size()); 231fe6060f1SDimitry Andric II.SumIncidence = static_cast<double>(RareFeatures.size()); 2325ffd83dbSDimitry Andric II.NeedsEnergyUpdate = false; 2330b57cec5SDimitry Andric std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 2340b57cec5SDimitry Andric ComputeSHA1(U.data(), U.size(), II.Sha1); 2350b57cec5SDimitry Andric auto Sha1Str = Sha1ToString(II.Sha1); 2360b57cec5SDimitry Andric Hashes.insert(Sha1Str); 2370b57cec5SDimitry Andric if (HasFocusFunction) 2380b57cec5SDimitry Andric if (auto V = DFT.Get(Sha1Str)) 2390b57cec5SDimitry Andric II.DataFlowTraceForFocusFunction = *V; 2400b57cec5SDimitry Andric // This is a gross heuristic. 2410b57cec5SDimitry Andric // Ideally, when we add an element to a corpus we need to know its DFT. 2420b57cec5SDimitry Andric // But if we don't, we'll use the DFT of its base input. 2430b57cec5SDimitry Andric if (II.DataFlowTraceForFocusFunction.empty() && BaseII) 2440b57cec5SDimitry Andric II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction; 2455ffd83dbSDimitry Andric DistributionNeedsUpdate = true; 2460b57cec5SDimitry Andric PrintCorpus(); 2470b57cec5SDimitry Andric // ValidateFeatureSet(); 2480b57cec5SDimitry Andric return &II; 2490b57cec5SDimitry Andric } 2500b57cec5SDimitry Andric 2510b57cec5SDimitry Andric // Debug-only PrintUnit(const Unit & U)2520b57cec5SDimitry Andric void PrintUnit(const Unit &U) { 2530b57cec5SDimitry Andric if (!FeatureDebug) return; 2540b57cec5SDimitry Andric for (uint8_t C : U) { 2550b57cec5SDimitry Andric if (C != 'F' && C != 'U' && C != 'Z') 2560b57cec5SDimitry Andric C = '.'; 2570b57cec5SDimitry Andric Printf("%c", C); 2580b57cec5SDimitry Andric } 2590b57cec5SDimitry Andric } 2600b57cec5SDimitry Andric 2610b57cec5SDimitry Andric // Debug-only PrintFeatureSet(const std::vector<uint32_t> & FeatureSet)262349cc55cSDimitry Andric void PrintFeatureSet(const std::vector<uint32_t> &FeatureSet) { 2630b57cec5SDimitry Andric if (!FeatureDebug) return; 2640b57cec5SDimitry Andric Printf("{"); 2650b57cec5SDimitry Andric for (uint32_t Feature: FeatureSet) 2660b57cec5SDimitry Andric Printf("%u,", Feature); 2670b57cec5SDimitry Andric Printf("}"); 2680b57cec5SDimitry Andric } 2690b57cec5SDimitry Andric 2700b57cec5SDimitry Andric // Debug-only PrintCorpus()2710b57cec5SDimitry Andric void PrintCorpus() { 2720b57cec5SDimitry Andric if (!FeatureDebug) return; 2730b57cec5SDimitry Andric Printf("======= CORPUS:\n"); 2740b57cec5SDimitry Andric int i = 0; 2750b57cec5SDimitry Andric for (auto II : Inputs) { 2760b57cec5SDimitry Andric if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 2770b57cec5SDimitry Andric Printf("[%2d] ", i); 2780b57cec5SDimitry Andric Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 2790b57cec5SDimitry Andric PrintUnit(II->U); 2800b57cec5SDimitry Andric Printf(" "); 2810b57cec5SDimitry Andric PrintFeatureSet(II->UniqFeatureSet); 2820b57cec5SDimitry Andric Printf("\n"); 2830b57cec5SDimitry Andric } 2840b57cec5SDimitry Andric i++; 2850b57cec5SDimitry Andric } 2860b57cec5SDimitry Andric } 2870b57cec5SDimitry Andric Replace(InputInfo * II,const Unit & U,std::chrono::microseconds TimeOfUnit)288349cc55cSDimitry Andric void Replace(InputInfo *II, const Unit &U, 289349cc55cSDimitry Andric std::chrono::microseconds TimeOfUnit) { 2900b57cec5SDimitry Andric assert(II->U.size() > U.size()); 2910b57cec5SDimitry Andric Hashes.erase(Sha1ToString(II->Sha1)); 2920b57cec5SDimitry Andric DeleteFile(*II); 2930b57cec5SDimitry Andric ComputeSHA1(U.data(), U.size(), II->Sha1); 2940b57cec5SDimitry Andric Hashes.insert(Sha1ToString(II->Sha1)); 2950b57cec5SDimitry Andric II->U = U; 2960b57cec5SDimitry Andric II->Reduced = true; 297349cc55cSDimitry Andric II->TimeOfUnit = TimeOfUnit; 2985ffd83dbSDimitry Andric DistributionNeedsUpdate = true; 2990b57cec5SDimitry Andric } 3000b57cec5SDimitry Andric HasUnit(const Unit & U)3010b57cec5SDimitry Andric bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } HasUnit(const std::string & H)3020b57cec5SDimitry Andric bool HasUnit(const std::string &H) { return Hashes.count(H); } ChooseUnitToMutate(Random & Rand)3030b57cec5SDimitry Andric InputInfo &ChooseUnitToMutate(Random &Rand) { 3040b57cec5SDimitry Andric InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 3050b57cec5SDimitry Andric assert(!II.U.empty()); 3060b57cec5SDimitry Andric return II; 3070b57cec5SDimitry Andric } 3080b57cec5SDimitry Andric ChooseUnitToCrossOverWith(Random & Rand,bool UniformDist)309e8d8bef9SDimitry Andric InputInfo &ChooseUnitToCrossOverWith(Random &Rand, bool UniformDist) { 310e8d8bef9SDimitry Andric if (!UniformDist) { 311e8d8bef9SDimitry Andric return ChooseUnitToMutate(Rand); 312e8d8bef9SDimitry Andric } 313e8d8bef9SDimitry Andric InputInfo &II = *Inputs[Rand(Inputs.size())]; 314e8d8bef9SDimitry Andric assert(!II.U.empty()); 315e8d8bef9SDimitry Andric return II; 316e8d8bef9SDimitry Andric } 317e8d8bef9SDimitry Andric 3180b57cec5SDimitry Andric // Returns an index of random unit from the corpus to mutate. ChooseUnitIdxToMutate(Random & Rand)3190b57cec5SDimitry Andric size_t ChooseUnitIdxToMutate(Random &Rand) { 3205ffd83dbSDimitry Andric UpdateCorpusDistribution(Rand); 3210b57cec5SDimitry Andric size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 3220b57cec5SDimitry Andric assert(Idx < Inputs.size()); 3230b57cec5SDimitry Andric return Idx; 3240b57cec5SDimitry Andric } 3250b57cec5SDimitry Andric PrintStats()3260b57cec5SDimitry Andric void PrintStats() { 3270b57cec5SDimitry Andric for (size_t i = 0; i < Inputs.size(); i++) { 3280b57cec5SDimitry Andric const auto &II = *Inputs[i]; 3290b57cec5SDimitry Andric Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i, 3300b57cec5SDimitry Andric Sha1ToString(II.Sha1).c_str(), II.U.size(), 331349cc55cSDimitry Andric II.NumExecutedMutations, II.NumSuccessfullMutations, 332349cc55cSDimitry Andric II.HasFocusFunction); 3330b57cec5SDimitry Andric } 3340b57cec5SDimitry Andric } 3350b57cec5SDimitry Andric PrintFeatureSet()3360b57cec5SDimitry Andric void PrintFeatureSet() { 3370b57cec5SDimitry Andric for (size_t i = 0; i < kFeatureSetSize; i++) { 3380b57cec5SDimitry Andric if(size_t Sz = GetFeature(i)) 3390b57cec5SDimitry Andric Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 3400b57cec5SDimitry Andric } 3410b57cec5SDimitry Andric Printf("\n\t"); 3420b57cec5SDimitry Andric for (size_t i = 0; i < Inputs.size(); i++) 3430b57cec5SDimitry Andric if (size_t N = Inputs[i]->NumFeatures) 3440b57cec5SDimitry Andric Printf(" %zd=>%zd ", i, N); 3450b57cec5SDimitry Andric Printf("\n"); 3460b57cec5SDimitry Andric } 3470b57cec5SDimitry Andric DeleteFile(const InputInfo & II)3480b57cec5SDimitry Andric void DeleteFile(const InputInfo &II) { 3490b57cec5SDimitry Andric if (!OutputCorpus.empty() && II.MayDeleteFile) 3500b57cec5SDimitry Andric RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 3510b57cec5SDimitry Andric } 3520b57cec5SDimitry Andric DeleteInput(size_t Idx)3530b57cec5SDimitry Andric void DeleteInput(size_t Idx) { 3540b57cec5SDimitry Andric InputInfo &II = *Inputs[Idx]; 3550b57cec5SDimitry Andric DeleteFile(II); 3560b57cec5SDimitry Andric Unit().swap(II.U); 3575ffd83dbSDimitry Andric II.Energy = 0.0; 3585ffd83dbSDimitry Andric II.NeedsEnergyUpdate = false; 3595ffd83dbSDimitry Andric DistributionNeedsUpdate = true; 3600b57cec5SDimitry Andric if (FeatureDebug) 3610b57cec5SDimitry Andric Printf("EVICTED %zd\n", Idx); 3620b57cec5SDimitry Andric } 3630b57cec5SDimitry Andric AddRareFeature(uint32_t Idx)3645ffd83dbSDimitry Andric void AddRareFeature(uint32_t Idx) { 3655ffd83dbSDimitry Andric // Maintain *at least* TopXRarestFeatures many rare features 3665ffd83dbSDimitry Andric // and all features with a frequency below ConsideredRare. 3675ffd83dbSDimitry Andric // Remove all other features. 3685ffd83dbSDimitry Andric while (RareFeatures.size() > Entropic.NumberOfRarestFeatures && 3695ffd83dbSDimitry Andric FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) { 3705ffd83dbSDimitry Andric 3715ffd83dbSDimitry Andric // Find most and second most abbundant feature. 3725ffd83dbSDimitry Andric uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0], 3735ffd83dbSDimitry Andric RareFeatures[0]}; 3745ffd83dbSDimitry Andric size_t Delete = 0; 3755ffd83dbSDimitry Andric for (size_t i = 0; i < RareFeatures.size(); i++) { 3765ffd83dbSDimitry Andric uint32_t Idx2 = RareFeatures[i]; 3775ffd83dbSDimitry Andric if (GlobalFeatureFreqs[Idx2] >= 3785ffd83dbSDimitry Andric GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) { 3795ffd83dbSDimitry Andric MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0]; 3805ffd83dbSDimitry Andric MostAbundantRareFeatureIndices[0] = Idx2; 3815ffd83dbSDimitry Andric Delete = i; 3825ffd83dbSDimitry Andric } 3835ffd83dbSDimitry Andric } 3845ffd83dbSDimitry Andric 3855ffd83dbSDimitry Andric // Remove most abundant rare feature. 386*5f757f3fSDimitry Andric IsRareFeature[Delete] = false; 3875ffd83dbSDimitry Andric RareFeatures[Delete] = RareFeatures.back(); 3885ffd83dbSDimitry Andric RareFeatures.pop_back(); 3895ffd83dbSDimitry Andric 3905ffd83dbSDimitry Andric for (auto II : Inputs) { 3915ffd83dbSDimitry Andric if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0])) 3925ffd83dbSDimitry Andric II->NeedsEnergyUpdate = true; 3935ffd83dbSDimitry Andric } 3945ffd83dbSDimitry Andric 3955ffd83dbSDimitry Andric // Set 2nd most abundant as the new most abundant feature count. 3965ffd83dbSDimitry Andric FreqOfMostAbundantRareFeature = 3975ffd83dbSDimitry Andric GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]]; 3985ffd83dbSDimitry Andric } 3995ffd83dbSDimitry Andric 4005ffd83dbSDimitry Andric // Add rare feature, handle collisions, and update energy. 4015ffd83dbSDimitry Andric RareFeatures.push_back(Idx); 402*5f757f3fSDimitry Andric IsRareFeature[Idx] = true; 4035ffd83dbSDimitry Andric GlobalFeatureFreqs[Idx] = 0; 4045ffd83dbSDimitry Andric for (auto II : Inputs) { 4055ffd83dbSDimitry Andric II->DeleteFeatureFreq(Idx); 4065ffd83dbSDimitry Andric 4075ffd83dbSDimitry Andric // Apply add-one smoothing to this locally undiscovered feature. 4085ffd83dbSDimitry Andric // Zero energy seeds will never be fuzzed and remain zero energy. 4095ffd83dbSDimitry Andric if (II->Energy > 0.0) { 4105ffd83dbSDimitry Andric II->SumIncidence += 1; 411fe6060f1SDimitry Andric II->Energy += log(II->SumIncidence) / II->SumIncidence; 4125ffd83dbSDimitry Andric } 4135ffd83dbSDimitry Andric } 4145ffd83dbSDimitry Andric 4155ffd83dbSDimitry Andric DistributionNeedsUpdate = true; 4165ffd83dbSDimitry Andric } 4175ffd83dbSDimitry Andric AddFeature(size_t Idx,uint32_t NewSize,bool Shrink)4180b57cec5SDimitry Andric bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 4190b57cec5SDimitry Andric assert(NewSize); 4200b57cec5SDimitry Andric Idx = Idx % kFeatureSetSize; 4210b57cec5SDimitry Andric uint32_t OldSize = GetFeature(Idx); 4220b57cec5SDimitry Andric if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 4230b57cec5SDimitry Andric if (OldSize > 0) { 4240b57cec5SDimitry Andric size_t OldIdx = SmallestElementPerFeature[Idx]; 4250b57cec5SDimitry Andric InputInfo &II = *Inputs[OldIdx]; 4260b57cec5SDimitry Andric assert(II.NumFeatures > 0); 4270b57cec5SDimitry Andric II.NumFeatures--; 4280b57cec5SDimitry Andric if (II.NumFeatures == 0) 4290b57cec5SDimitry Andric DeleteInput(OldIdx); 4300b57cec5SDimitry Andric } else { 4310b57cec5SDimitry Andric NumAddedFeatures++; 4325ffd83dbSDimitry Andric if (Entropic.Enabled) 4335ffd83dbSDimitry Andric AddRareFeature((uint32_t)Idx); 4340b57cec5SDimitry Andric } 4350b57cec5SDimitry Andric NumUpdatedFeatures++; 4360b57cec5SDimitry Andric if (FeatureDebug) 4370b57cec5SDimitry Andric Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 438fe6060f1SDimitry Andric // Inputs.size() is guaranteed to be less than UINT32_MAX by AddToCorpus. 439fe6060f1SDimitry Andric SmallestElementPerFeature[Idx] = static_cast<uint32_t>(Inputs.size()); 4400b57cec5SDimitry Andric InputSizesPerFeature[Idx] = NewSize; 4410b57cec5SDimitry Andric return true; 4420b57cec5SDimitry Andric } 4430b57cec5SDimitry Andric return false; 4440b57cec5SDimitry Andric } 4450b57cec5SDimitry Andric 4465ffd83dbSDimitry Andric // Increment frequency of feature Idx globally and locally. UpdateFeatureFrequency(InputInfo * II,size_t Idx)4475ffd83dbSDimitry Andric void UpdateFeatureFrequency(InputInfo *II, size_t Idx) { 4485ffd83dbSDimitry Andric uint32_t Idx32 = Idx % kFeatureSetSize; 4495ffd83dbSDimitry Andric 4505ffd83dbSDimitry Andric // Saturated increment. 4515ffd83dbSDimitry Andric if (GlobalFeatureFreqs[Idx32] == 0xFFFF) 4525ffd83dbSDimitry Andric return; 4535ffd83dbSDimitry Andric uint16_t Freq = GlobalFeatureFreqs[Idx32]++; 4545ffd83dbSDimitry Andric 4555ffd83dbSDimitry Andric // Skip if abundant. 456*5f757f3fSDimitry Andric if (Freq > FreqOfMostAbundantRareFeature || !IsRareFeature[Idx32]) 4575ffd83dbSDimitry Andric return; 4585ffd83dbSDimitry Andric 4595ffd83dbSDimitry Andric // Update global frequencies. 4605ffd83dbSDimitry Andric if (Freq == FreqOfMostAbundantRareFeature) 4615ffd83dbSDimitry Andric FreqOfMostAbundantRareFeature++; 4625ffd83dbSDimitry Andric 4635ffd83dbSDimitry Andric // Update local frequencies. 4645ffd83dbSDimitry Andric if (II) 4655ffd83dbSDimitry Andric II->UpdateFeatureFrequency(Idx32); 4665ffd83dbSDimitry Andric } 4675ffd83dbSDimitry Andric NumFeatures()4680b57cec5SDimitry Andric size_t NumFeatures() const { return NumAddedFeatures; } NumFeatureUpdates()4690b57cec5SDimitry Andric size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 4700b57cec5SDimitry Andric 4710b57cec5SDimitry Andric private: 4720b57cec5SDimitry Andric 4730b57cec5SDimitry Andric static const bool FeatureDebug = false; 4740b57cec5SDimitry Andric GetFeature(size_t Idx)475fe6060f1SDimitry Andric uint32_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 4760b57cec5SDimitry Andric ValidateFeatureSet()4770b57cec5SDimitry Andric void ValidateFeatureSet() { 4780b57cec5SDimitry Andric if (FeatureDebug) 4790b57cec5SDimitry Andric PrintFeatureSet(); 4800b57cec5SDimitry Andric for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 4810b57cec5SDimitry Andric if (GetFeature(Idx)) 4820b57cec5SDimitry Andric Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 4830b57cec5SDimitry Andric for (auto II: Inputs) { 4840b57cec5SDimitry Andric if (II->Tmp != II->NumFeatures) 4850b57cec5SDimitry Andric Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 4860b57cec5SDimitry Andric assert(II->Tmp == II->NumFeatures); 4870b57cec5SDimitry Andric II->Tmp = 0; 4880b57cec5SDimitry Andric } 4890b57cec5SDimitry Andric } 4900b57cec5SDimitry Andric 4910b57cec5SDimitry Andric // Updates the probability distribution for the units in the corpus. 4920b57cec5SDimitry Andric // Must be called whenever the corpus or unit weights are changed. 4930b57cec5SDimitry Andric // 4945ffd83dbSDimitry Andric // Hypothesis: inputs that maximize information about globally rare features 4955ffd83dbSDimitry Andric // are interesting. UpdateCorpusDistribution(Random & Rand)4965ffd83dbSDimitry Andric void UpdateCorpusDistribution(Random &Rand) { 4975ffd83dbSDimitry Andric // Skip update if no seeds or rare features were added/deleted. 4985ffd83dbSDimitry Andric // Sparse updates for local change of feature frequencies, 4995ffd83dbSDimitry Andric // i.e., randomly do not skip. 5005ffd83dbSDimitry Andric if (!DistributionNeedsUpdate && 5015ffd83dbSDimitry Andric (!Entropic.Enabled || Rand(kSparseEnergyUpdates))) 5025ffd83dbSDimitry Andric return; 5035ffd83dbSDimitry Andric 5045ffd83dbSDimitry Andric DistributionNeedsUpdate = false; 5055ffd83dbSDimitry Andric 5060b57cec5SDimitry Andric size_t N = Inputs.size(); 5070b57cec5SDimitry Andric assert(N); 5080b57cec5SDimitry Andric Intervals.resize(N + 1); 5090b57cec5SDimitry Andric Weights.resize(N); 5100b57cec5SDimitry Andric std::iota(Intervals.begin(), Intervals.end(), 0); 5115ffd83dbSDimitry Andric 512e8d8bef9SDimitry Andric std::chrono::microseconds AverageUnitExecutionTime(0); 513e8d8bef9SDimitry Andric for (auto II : Inputs) { 514e8d8bef9SDimitry Andric AverageUnitExecutionTime += II->TimeOfUnit; 515e8d8bef9SDimitry Andric } 516e8d8bef9SDimitry Andric AverageUnitExecutionTime /= N; 517e8d8bef9SDimitry Andric 5185ffd83dbSDimitry Andric bool VanillaSchedule = true; 5195ffd83dbSDimitry Andric if (Entropic.Enabled) { 5205ffd83dbSDimitry Andric for (auto II : Inputs) { 5215ffd83dbSDimitry Andric if (II->NeedsEnergyUpdate && II->Energy != 0.0) { 5225ffd83dbSDimitry Andric II->NeedsEnergyUpdate = false; 523e8d8bef9SDimitry Andric II->UpdateEnergy(RareFeatures.size(), Entropic.ScalePerExecTime, 524e8d8bef9SDimitry Andric AverageUnitExecutionTime); 5255ffd83dbSDimitry Andric } 5265ffd83dbSDimitry Andric } 5275ffd83dbSDimitry Andric 5285ffd83dbSDimitry Andric for (size_t i = 0; i < N; i++) { 5295ffd83dbSDimitry Andric 5305ffd83dbSDimitry Andric if (Inputs[i]->NumFeatures == 0) { 5315ffd83dbSDimitry Andric // If the seed doesn't represent any features, assign zero energy. 5325ffd83dbSDimitry Andric Weights[i] = 0.; 5335ffd83dbSDimitry Andric } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor > 5345ffd83dbSDimitry Andric NumExecutedMutations / Inputs.size()) { 5355ffd83dbSDimitry Andric // If the seed was fuzzed a lot more than average, assign zero energy. 5365ffd83dbSDimitry Andric Weights[i] = 0.; 5375ffd83dbSDimitry Andric } else { 5385ffd83dbSDimitry Andric // Otherwise, simply assign the computed energy. 5395ffd83dbSDimitry Andric Weights[i] = Inputs[i]->Energy; 5405ffd83dbSDimitry Andric } 5415ffd83dbSDimitry Andric 5425ffd83dbSDimitry Andric // If energy for all seeds is zero, fall back to vanilla schedule. 5435ffd83dbSDimitry Andric if (Weights[i] > 0.0) 5445ffd83dbSDimitry Andric VanillaSchedule = false; 5455ffd83dbSDimitry Andric } 5465ffd83dbSDimitry Andric } 5475ffd83dbSDimitry Andric 5485ffd83dbSDimitry Andric if (VanillaSchedule) { 5490b57cec5SDimitry Andric for (size_t i = 0; i < N; i++) 550fe6060f1SDimitry Andric Weights[i] = 551fe6060f1SDimitry Andric Inputs[i]->NumFeatures 552fe6060f1SDimitry Andric ? static_cast<double>((i + 1) * 553fe6060f1SDimitry Andric (Inputs[i]->HasFocusFunction ? 1000 : 1)) 5540b57cec5SDimitry Andric : 0.; 5555ffd83dbSDimitry Andric } 5565ffd83dbSDimitry Andric 5570b57cec5SDimitry Andric if (FeatureDebug) { 5580b57cec5SDimitry Andric for (size_t i = 0; i < N; i++) 5590b57cec5SDimitry Andric Printf("%zd ", Inputs[i]->NumFeatures); 5600b57cec5SDimitry Andric Printf("SCORE\n"); 5610b57cec5SDimitry Andric for (size_t i = 0; i < N; i++) 5620b57cec5SDimitry Andric Printf("%f ", Weights[i]); 5630b57cec5SDimitry Andric Printf("Weights\n"); 5640b57cec5SDimitry Andric } 5650b57cec5SDimitry Andric CorpusDistribution = std::piecewise_constant_distribution<double>( 5660b57cec5SDimitry Andric Intervals.begin(), Intervals.end(), Weights.begin()); 5670b57cec5SDimitry Andric } 5680b57cec5SDimitry Andric std::piecewise_constant_distribution<double> CorpusDistribution; 5690b57cec5SDimitry Andric 570349cc55cSDimitry Andric std::vector<double> Intervals; 571349cc55cSDimitry Andric std::vector<double> Weights; 5720b57cec5SDimitry Andric 5730b57cec5SDimitry Andric std::unordered_set<std::string> Hashes; 574349cc55cSDimitry Andric std::vector<InputInfo *> Inputs; 5750b57cec5SDimitry Andric 5760b57cec5SDimitry Andric size_t NumAddedFeatures = 0; 5770b57cec5SDimitry Andric size_t NumUpdatedFeatures = 0; 5780b57cec5SDimitry Andric uint32_t InputSizesPerFeature[kFeatureSetSize]; 5790b57cec5SDimitry Andric uint32_t SmallestElementPerFeature[kFeatureSetSize]; 5800b57cec5SDimitry Andric 5815ffd83dbSDimitry Andric bool DistributionNeedsUpdate = true; 5825ffd83dbSDimitry Andric uint16_t FreqOfMostAbundantRareFeature = 0; 5835ffd83dbSDimitry Andric uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {}; 584349cc55cSDimitry Andric std::vector<uint32_t> RareFeatures; 585*5f757f3fSDimitry Andric std::bitset<kFeatureSetSize> IsRareFeature; 5865ffd83dbSDimitry Andric 5870b57cec5SDimitry Andric std::string OutputCorpus; 5880b57cec5SDimitry Andric }; 5890b57cec5SDimitry Andric 5900b57cec5SDimitry Andric } // namespace fuzzer 5910b57cec5SDimitry Andric 5920b57cec5SDimitry Andric #endif // LLVM_FUZZER_CORPUS 593