1//===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// fuzzer::InputCorpus
9//===----------------------------------------------------------------------===//
10
11#ifndef LLVM_FUZZER_CORPUS
12#define LLVM_FUZZER_CORPUS
13
14#include "FuzzerDataFlowTrace.h"
15#include "FuzzerDefs.h"
16#include "FuzzerIO.h"
17#include "FuzzerRandom.h"
18#include "FuzzerSHA1.h"
19#include "FuzzerTracePC.h"
20#include <algorithm>
21#include <numeric>
22#include <random>
23#include <unordered_set>
24
25namespace fuzzer {
26
27struct InputInfo {
28  Unit U;  // The actual input data.
29  uint8_t Sha1[kSHA1NumBytes];  // Checksum.
30  // Number of features that this input has and no smaller input has.
31  size_t NumFeatures = 0;
32  size_t Tmp = 0; // Used by ValidateFeatureSet.
33  // Stats.
34  size_t NumExecutedMutations = 0;
35  size_t NumSuccessfullMutations = 0;
36  bool MayDeleteFile = false;
37  bool Reduced = false;
38  bool HasFocusFunction = false;
39  Vector<uint32_t> UniqFeatureSet;
40  Vector<uint8_t> DataFlowTraceForFocusFunction;
41};
42
43class InputCorpus {
44  static const size_t kFeatureSetSize = 1 << 21;
45 public:
46  InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) {
47    memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
48    memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
49  }
50  ~InputCorpus() {
51    for (auto II : Inputs)
52      delete II;
53  }
54  size_t size() const { return Inputs.size(); }
55  size_t SizeInBytes() const {
56    size_t Res = 0;
57    for (auto II : Inputs)
58      Res += II->U.size();
59    return Res;
60  }
61  size_t NumActiveUnits() const {
62    size_t Res = 0;
63    for (auto II : Inputs)
64      Res += !II->U.empty();
65    return Res;
66  }
67  size_t MaxInputSize() const {
68    size_t Res = 0;
69    for (auto II : Inputs)
70        Res = std::max(Res, II->U.size());
71    return Res;
72  }
73
74  size_t NumInputsThatTouchFocusFunction() {
75    return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
76      return II->HasFocusFunction;
77    });
78  }
79
80  size_t NumInputsWithDataFlowTrace() {
81    return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
82      return !II->DataFlowTraceForFocusFunction.empty();
83    });
84  }
85
86  bool empty() const { return Inputs.empty(); }
87  const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
88  InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile,
89                         bool HasFocusFunction,
90                         const Vector<uint32_t> &FeatureSet,
91                         const DataFlowTrace &DFT, const InputInfo *BaseII) {
92    assert(!U.empty());
93    if (FeatureDebug)
94      Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures);
95    Inputs.push_back(new InputInfo());
96    InputInfo &II = *Inputs.back();
97    II.U = U;
98    II.NumFeatures = NumFeatures;
99    II.MayDeleteFile = MayDeleteFile;
100    II.UniqFeatureSet = FeatureSet;
101    II.HasFocusFunction = HasFocusFunction;
102    std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end());
103    ComputeSHA1(U.data(), U.size(), II.Sha1);
104    auto Sha1Str = Sha1ToString(II.Sha1);
105    Hashes.insert(Sha1Str);
106    if (HasFocusFunction)
107      if (auto V = DFT.Get(Sha1Str))
108        II.DataFlowTraceForFocusFunction = *V;
109    // This is a gross heuristic.
110    // Ideally, when we add an element to a corpus we need to know its DFT.
111    // But if we don't, we'll use the DFT of its base input.
112    if (II.DataFlowTraceForFocusFunction.empty() && BaseII)
113      II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction;
114    UpdateCorpusDistribution();
115    PrintCorpus();
116    // ValidateFeatureSet();
117    return &II;
118  }
119
120  // Debug-only
121  void PrintUnit(const Unit &U) {
122    if (!FeatureDebug) return;
123    for (uint8_t C : U) {
124      if (C != 'F' && C != 'U' && C != 'Z')
125        C = '.';
126      Printf("%c", C);
127    }
128  }
129
130  // Debug-only
131  void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) {
132    if (!FeatureDebug) return;
133    Printf("{");
134    for (uint32_t Feature: FeatureSet)
135      Printf("%u,", Feature);
136    Printf("}");
137  }
138
139  // Debug-only
140  void PrintCorpus() {
141    if (!FeatureDebug) return;
142    Printf("======= CORPUS:\n");
143    int i = 0;
144    for (auto II : Inputs) {
145      if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) {
146        Printf("[%2d] ", i);
147        Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size());
148        PrintUnit(II->U);
149        Printf(" ");
150        PrintFeatureSet(II->UniqFeatureSet);
151        Printf("\n");
152      }
153      i++;
154    }
155  }
156
157  void Replace(InputInfo *II, const Unit &U) {
158    assert(II->U.size() > U.size());
159    Hashes.erase(Sha1ToString(II->Sha1));
160    DeleteFile(*II);
161    ComputeSHA1(U.data(), U.size(), II->Sha1);
162    Hashes.insert(Sha1ToString(II->Sha1));
163    II->U = U;
164    II->Reduced = true;
165    UpdateCorpusDistribution();
166  }
167
168  bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
169  bool HasUnit(const std::string &H) { return Hashes.count(H); }
170  InputInfo &ChooseUnitToMutate(Random &Rand) {
171    InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)];
172    assert(!II.U.empty());
173    return II;
174  }
175
176  // Returns an index of random unit from the corpus to mutate.
177  size_t ChooseUnitIdxToMutate(Random &Rand) {
178    size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
179    assert(Idx < Inputs.size());
180    return Idx;
181  }
182
183  void PrintStats() {
184    for (size_t i = 0; i < Inputs.size(); i++) {
185      const auto &II = *Inputs[i];
186      Printf("  [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i,
187             Sha1ToString(II.Sha1).c_str(), II.U.size(),
188             II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction);
189    }
190  }
191
192  void PrintFeatureSet() {
193    for (size_t i = 0; i < kFeatureSetSize; i++) {
194      if(size_t Sz = GetFeature(i))
195        Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz);
196    }
197    Printf("\n\t");
198    for (size_t i = 0; i < Inputs.size(); i++)
199      if (size_t N = Inputs[i]->NumFeatures)
200        Printf(" %zd=>%zd ", i, N);
201    Printf("\n");
202  }
203
204  void DeleteFile(const InputInfo &II) {
205    if (!OutputCorpus.empty() && II.MayDeleteFile)
206      RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1)));
207  }
208
209  void DeleteInput(size_t Idx) {
210    InputInfo &II = *Inputs[Idx];
211    DeleteFile(II);
212    Unit().swap(II.U);
213    if (FeatureDebug)
214      Printf("EVICTED %zd\n", Idx);
215  }
216
217  bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
218    assert(NewSize);
219    Idx = Idx % kFeatureSetSize;
220    uint32_t OldSize = GetFeature(Idx);
221    if (OldSize == 0 || (Shrink && OldSize > NewSize)) {
222      if (OldSize > 0) {
223        size_t OldIdx = SmallestElementPerFeature[Idx];
224        InputInfo &II = *Inputs[OldIdx];
225        assert(II.NumFeatures > 0);
226        II.NumFeatures--;
227        if (II.NumFeatures == 0)
228          DeleteInput(OldIdx);
229      } else {
230        NumAddedFeatures++;
231      }
232      NumUpdatedFeatures++;
233      if (FeatureDebug)
234        Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
235      SmallestElementPerFeature[Idx] = Inputs.size();
236      InputSizesPerFeature[Idx] = NewSize;
237      return true;
238    }
239    return false;
240  }
241
242  size_t NumFeatures() const { return NumAddedFeatures; }
243  size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
244
245private:
246
247  static const bool FeatureDebug = false;
248
249  size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; }
250
251  void ValidateFeatureSet() {
252    if (FeatureDebug)
253      PrintFeatureSet();
254    for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++)
255      if (GetFeature(Idx))
256        Inputs[SmallestElementPerFeature[Idx]]->Tmp++;
257    for (auto II: Inputs) {
258      if (II->Tmp != II->NumFeatures)
259        Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures);
260      assert(II->Tmp == II->NumFeatures);
261      II->Tmp = 0;
262    }
263  }
264
265  // Updates the probability distribution for the units in the corpus.
266  // Must be called whenever the corpus or unit weights are changed.
267  //
268  // Hypothesis: units added to the corpus last are more interesting.
269  //
270  // Hypothesis: inputs with infrequent features are more interesting.
271  void UpdateCorpusDistribution() {
272    size_t N = Inputs.size();
273    assert(N);
274    Intervals.resize(N + 1);
275    Weights.resize(N);
276    std::iota(Intervals.begin(), Intervals.end(), 0);
277    for (size_t i = 0; i < N; i++)
278      Weights[i] = Inputs[i]->NumFeatures
279                       ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1)
280                       : 0.;
281    if (FeatureDebug) {
282      for (size_t i = 0; i < N; i++)
283        Printf("%zd ", Inputs[i]->NumFeatures);
284      Printf("SCORE\n");
285      for (size_t i = 0; i < N; i++)
286        Printf("%f ", Weights[i]);
287      Printf("Weights\n");
288    }
289    CorpusDistribution = std::piecewise_constant_distribution<double>(
290        Intervals.begin(), Intervals.end(), Weights.begin());
291  }
292  std::piecewise_constant_distribution<double> CorpusDistribution;
293
294  Vector<double> Intervals;
295  Vector<double> Weights;
296
297  std::unordered_set<std::string> Hashes;
298  Vector<InputInfo*> Inputs;
299
300  size_t NumAddedFeatures = 0;
301  size_t NumUpdatedFeatures = 0;
302  uint32_t InputSizesPerFeature[kFeatureSetSize];
303  uint32_t SmallestElementPerFeature[kFeatureSetSize];
304
305  std::string OutputCorpus;
306};
307
308}  // namespace fuzzer
309
310#endif  // LLVM_FUZZER_CORPUS
311