1//===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// fuzzer::InputCorpus
9//===----------------------------------------------------------------------===//
10
11#ifndef LLVM_FUZZER_CORPUS
12#define LLVM_FUZZER_CORPUS
13
14#include "FuzzerDataFlowTrace.h"
15#include "FuzzerDefs.h"
16#include "FuzzerIO.h"
17#include "FuzzerRandom.h"
18#include "FuzzerSHA1.h"
19#include "FuzzerTracePC.h"
20#include <algorithm>
21#include <numeric>
22#include <random>
23#include <unordered_set>
24
25namespace fuzzer {
26
27struct InputInfo {
28  Unit U;  // The actual input data.
29  uint8_t Sha1[kSHA1NumBytes];  // Checksum.
30  // Number of features that this input has and no smaller input has.
31  size_t NumFeatures = 0;
32  size_t Tmp = 0; // Used by ValidateFeatureSet.
33  // Stats.
34  size_t NumExecutedMutations = 0;
35  size_t NumSuccessfullMutations = 0;
36  bool MayDeleteFile = false;
37  bool Reduced = false;
38  bool HasFocusFunction = false;
39  Vector<uint32_t> UniqFeatureSet;
40  Vector<uint8_t> DataFlowTraceForFocusFunction;
41  // Power schedule.
42  bool NeedsEnergyUpdate = false;
43  double Energy = 0.0;
44  size_t SumIncidence = 0;
45  Vector<std::pair<uint32_t, uint16_t>> FeatureFreqs;
46
47  // Delete feature Idx and its frequency from FeatureFreqs.
48  bool DeleteFeatureFreq(uint32_t Idx) {
49    if (FeatureFreqs.empty())
50      return false;
51
52    // Binary search over local feature frequencies sorted by index.
53    auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),
54                                  std::pair<uint32_t, uint16_t>(Idx, 0));
55
56    if (Lower != FeatureFreqs.end() && Lower->first == Idx) {
57      FeatureFreqs.erase(Lower);
58      return true;
59    }
60    return false;
61  }
62
63  // Assign more energy to a high-entropy seed, i.e., that reveals more
64  // information about the globally rare features in the neighborhood
65  // of the seed. Since we do not know the entropy of a seed that has
66  // never been executed we assign fresh seeds maximum entropy and
67  // let II->Energy approach the true entropy from above.
68  void UpdateEnergy(size_t GlobalNumberOfFeatures) {
69    Energy = 0.0;
70    SumIncidence = 0;
71
72    // Apply add-one smoothing to locally discovered features.
73    for (auto F : FeatureFreqs) {
74      size_t LocalIncidence = F.second + 1;
75      Energy -= LocalIncidence * logl(LocalIncidence);
76      SumIncidence += LocalIncidence;
77    }
78
79    // Apply add-one smoothing to locally undiscovered features.
80    //   PreciseEnergy -= 0; // since logl(1.0) == 0)
81    SumIncidence += (GlobalNumberOfFeatures - FeatureFreqs.size());
82
83    // Add a single locally abundant feature apply add-one smoothing.
84    size_t AbdIncidence = NumExecutedMutations + 1;
85    Energy -= AbdIncidence * logl(AbdIncidence);
86    SumIncidence += AbdIncidence;
87
88    // Normalize.
89    if (SumIncidence != 0)
90      Energy = (Energy / SumIncidence) + logl(SumIncidence);
91  }
92
93  // Increment the frequency of the feature Idx.
94  void UpdateFeatureFrequency(uint32_t Idx) {
95    NeedsEnergyUpdate = true;
96
97    // The local feature frequencies is an ordered vector of pairs.
98    // If there are no local feature frequencies, push_back preserves order.
99    // Set the feature frequency for feature Idx32 to 1.
100    if (FeatureFreqs.empty()) {
101      FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1));
102      return;
103    }
104
105    // Binary search over local feature frequencies sorted by index.
106    auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),
107                                  std::pair<uint32_t, uint16_t>(Idx, 0));
108
109    // If feature Idx32 already exists, increment its frequency.
110    // Otherwise, insert a new pair right after the next lower index.
111    if (Lower != FeatureFreqs.end() && Lower->first == Idx) {
112      Lower->second++;
113    } else {
114      FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1));
115    }
116  }
117};
118
119struct EntropicOptions {
120  bool Enabled;
121  size_t NumberOfRarestFeatures;
122  size_t FeatureFrequencyThreshold;
123};
124
125class InputCorpus {
126  static const uint32_t kFeatureSetSize = 1 << 21;
127  static const uint8_t kMaxMutationFactor = 20;
128  static const size_t kSparseEnergyUpdates = 100;
129
130  size_t NumExecutedMutations = 0;
131
132  EntropicOptions Entropic;
133
134public:
135  InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic)
136      : Entropic(Entropic), OutputCorpus(OutputCorpus) {
137    memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
138    memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
139  }
140  ~InputCorpus() {
141    for (auto II : Inputs)
142      delete II;
143  }
144  size_t size() const { return Inputs.size(); }
145  size_t SizeInBytes() const {
146    size_t Res = 0;
147    for (auto II : Inputs)
148      Res += II->U.size();
149    return Res;
150  }
151  size_t NumActiveUnits() const {
152    size_t Res = 0;
153    for (auto II : Inputs)
154      Res += !II->U.empty();
155    return Res;
156  }
157  size_t MaxInputSize() const {
158    size_t Res = 0;
159    for (auto II : Inputs)
160        Res = std::max(Res, II->U.size());
161    return Res;
162  }
163  void IncrementNumExecutedMutations() { NumExecutedMutations++; }
164
165  size_t NumInputsThatTouchFocusFunction() {
166    return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
167      return II->HasFocusFunction;
168    });
169  }
170
171  size_t NumInputsWithDataFlowTrace() {
172    return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
173      return !II->DataFlowTraceForFocusFunction.empty();
174    });
175  }
176
177  bool empty() const { return Inputs.empty(); }
178  const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
179  InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile,
180                         bool HasFocusFunction,
181                         const Vector<uint32_t> &FeatureSet,
182                         const DataFlowTrace &DFT, const InputInfo *BaseII) {
183    assert(!U.empty());
184    if (FeatureDebug)
185      Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures);
186    Inputs.push_back(new InputInfo());
187    InputInfo &II = *Inputs.back();
188    II.U = U;
189    II.NumFeatures = NumFeatures;
190    II.MayDeleteFile = MayDeleteFile;
191    II.UniqFeatureSet = FeatureSet;
192    II.HasFocusFunction = HasFocusFunction;
193    // Assign maximal energy to the new seed.
194    II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size());
195    II.SumIncidence = RareFeatures.size();
196    II.NeedsEnergyUpdate = false;
197    std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end());
198    ComputeSHA1(U.data(), U.size(), II.Sha1);
199    auto Sha1Str = Sha1ToString(II.Sha1);
200    Hashes.insert(Sha1Str);
201    if (HasFocusFunction)
202      if (auto V = DFT.Get(Sha1Str))
203        II.DataFlowTraceForFocusFunction = *V;
204    // This is a gross heuristic.
205    // Ideally, when we add an element to a corpus we need to know its DFT.
206    // But if we don't, we'll use the DFT of its base input.
207    if (II.DataFlowTraceForFocusFunction.empty() && BaseII)
208      II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction;
209    DistributionNeedsUpdate = true;
210    PrintCorpus();
211    // ValidateFeatureSet();
212    return &II;
213  }
214
215  // Debug-only
216  void PrintUnit(const Unit &U) {
217    if (!FeatureDebug) return;
218    for (uint8_t C : U) {
219      if (C != 'F' && C != 'U' && C != 'Z')
220        C = '.';
221      Printf("%c", C);
222    }
223  }
224
225  // Debug-only
226  void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) {
227    if (!FeatureDebug) return;
228    Printf("{");
229    for (uint32_t Feature: FeatureSet)
230      Printf("%u,", Feature);
231    Printf("}");
232  }
233
234  // Debug-only
235  void PrintCorpus() {
236    if (!FeatureDebug) return;
237    Printf("======= CORPUS:\n");
238    int i = 0;
239    for (auto II : Inputs) {
240      if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) {
241        Printf("[%2d] ", i);
242        Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size());
243        PrintUnit(II->U);
244        Printf(" ");
245        PrintFeatureSet(II->UniqFeatureSet);
246        Printf("\n");
247      }
248      i++;
249    }
250  }
251
252  void Replace(InputInfo *II, const Unit &U) {
253    assert(II->U.size() > U.size());
254    Hashes.erase(Sha1ToString(II->Sha1));
255    DeleteFile(*II);
256    ComputeSHA1(U.data(), U.size(), II->Sha1);
257    Hashes.insert(Sha1ToString(II->Sha1));
258    II->U = U;
259    II->Reduced = true;
260    DistributionNeedsUpdate = true;
261  }
262
263  bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
264  bool HasUnit(const std::string &H) { return Hashes.count(H); }
265  InputInfo &ChooseUnitToMutate(Random &Rand) {
266    InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)];
267    assert(!II.U.empty());
268    return II;
269  }
270
271  // Returns an index of random unit from the corpus to mutate.
272  size_t ChooseUnitIdxToMutate(Random &Rand) {
273    UpdateCorpusDistribution(Rand);
274    size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
275    assert(Idx < Inputs.size());
276    return Idx;
277  }
278
279  void PrintStats() {
280    for (size_t i = 0; i < Inputs.size(); i++) {
281      const auto &II = *Inputs[i];
282      Printf("  [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i,
283             Sha1ToString(II.Sha1).c_str(), II.U.size(),
284             II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction);
285    }
286  }
287
288  void PrintFeatureSet() {
289    for (size_t i = 0; i < kFeatureSetSize; i++) {
290      if(size_t Sz = GetFeature(i))
291        Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz);
292    }
293    Printf("\n\t");
294    for (size_t i = 0; i < Inputs.size(); i++)
295      if (size_t N = Inputs[i]->NumFeatures)
296        Printf(" %zd=>%zd ", i, N);
297    Printf("\n");
298  }
299
300  void DeleteFile(const InputInfo &II) {
301    if (!OutputCorpus.empty() && II.MayDeleteFile)
302      RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1)));
303  }
304
305  void DeleteInput(size_t Idx) {
306    InputInfo &II = *Inputs[Idx];
307    DeleteFile(II);
308    Unit().swap(II.U);
309    II.Energy = 0.0;
310    II.NeedsEnergyUpdate = false;
311    DistributionNeedsUpdate = true;
312    if (FeatureDebug)
313      Printf("EVICTED %zd\n", Idx);
314  }
315
316  void AddRareFeature(uint32_t Idx) {
317    // Maintain *at least* TopXRarestFeatures many rare features
318    // and all features with a frequency below ConsideredRare.
319    // Remove all other features.
320    while (RareFeatures.size() > Entropic.NumberOfRarestFeatures &&
321           FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) {
322
323      // Find most and second most abbundant feature.
324      uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0],
325                                                    RareFeatures[0]};
326      size_t Delete = 0;
327      for (size_t i = 0; i < RareFeatures.size(); i++) {
328        uint32_t Idx2 = RareFeatures[i];
329        if (GlobalFeatureFreqs[Idx2] >=
330            GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) {
331          MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0];
332          MostAbundantRareFeatureIndices[0] = Idx2;
333          Delete = i;
334        }
335      }
336
337      // Remove most abundant rare feature.
338      RareFeatures[Delete] = RareFeatures.back();
339      RareFeatures.pop_back();
340
341      for (auto II : Inputs) {
342        if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0]))
343          II->NeedsEnergyUpdate = true;
344      }
345
346      // Set 2nd most abundant as the new most abundant feature count.
347      FreqOfMostAbundantRareFeature =
348          GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]];
349    }
350
351    // Add rare feature, handle collisions, and update energy.
352    RareFeatures.push_back(Idx);
353    GlobalFeatureFreqs[Idx] = 0;
354    for (auto II : Inputs) {
355      II->DeleteFeatureFreq(Idx);
356
357      // Apply add-one smoothing to this locally undiscovered feature.
358      // Zero energy seeds will never be fuzzed and remain zero energy.
359      if (II->Energy > 0.0) {
360        II->SumIncidence += 1;
361        II->Energy += logl(II->SumIncidence) / II->SumIncidence;
362      }
363    }
364
365    DistributionNeedsUpdate = true;
366  }
367
368  bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
369    assert(NewSize);
370    Idx = Idx % kFeatureSetSize;
371    uint32_t OldSize = GetFeature(Idx);
372    if (OldSize == 0 || (Shrink && OldSize > NewSize)) {
373      if (OldSize > 0) {
374        size_t OldIdx = SmallestElementPerFeature[Idx];
375        InputInfo &II = *Inputs[OldIdx];
376        assert(II.NumFeatures > 0);
377        II.NumFeatures--;
378        if (II.NumFeatures == 0)
379          DeleteInput(OldIdx);
380      } else {
381        NumAddedFeatures++;
382        if (Entropic.Enabled)
383          AddRareFeature((uint32_t)Idx);
384      }
385      NumUpdatedFeatures++;
386      if (FeatureDebug)
387        Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
388      SmallestElementPerFeature[Idx] = Inputs.size();
389      InputSizesPerFeature[Idx] = NewSize;
390      return true;
391    }
392    return false;
393  }
394
395  // Increment frequency of feature Idx globally and locally.
396  void UpdateFeatureFrequency(InputInfo *II, size_t Idx) {
397    uint32_t Idx32 = Idx % kFeatureSetSize;
398
399    // Saturated increment.
400    if (GlobalFeatureFreqs[Idx32] == 0xFFFF)
401      return;
402    uint16_t Freq = GlobalFeatureFreqs[Idx32]++;
403
404    // Skip if abundant.
405    if (Freq > FreqOfMostAbundantRareFeature ||
406        std::find(RareFeatures.begin(), RareFeatures.end(), Idx32) ==
407            RareFeatures.end())
408      return;
409
410    // Update global frequencies.
411    if (Freq == FreqOfMostAbundantRareFeature)
412      FreqOfMostAbundantRareFeature++;
413
414    // Update local frequencies.
415    if (II)
416      II->UpdateFeatureFrequency(Idx32);
417  }
418
419  size_t NumFeatures() const { return NumAddedFeatures; }
420  size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
421
422private:
423
424  static const bool FeatureDebug = false;
425
426  size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; }
427
428  void ValidateFeatureSet() {
429    if (FeatureDebug)
430      PrintFeatureSet();
431    for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++)
432      if (GetFeature(Idx))
433        Inputs[SmallestElementPerFeature[Idx]]->Tmp++;
434    for (auto II: Inputs) {
435      if (II->Tmp != II->NumFeatures)
436        Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures);
437      assert(II->Tmp == II->NumFeatures);
438      II->Tmp = 0;
439    }
440  }
441
442  // Updates the probability distribution for the units in the corpus.
443  // Must be called whenever the corpus or unit weights are changed.
444  //
445  // Hypothesis: inputs that maximize information about globally rare features
446  // are interesting.
447  void UpdateCorpusDistribution(Random &Rand) {
448    // Skip update if no seeds or rare features were added/deleted.
449    // Sparse updates for local change of feature frequencies,
450    // i.e., randomly do not skip.
451    if (!DistributionNeedsUpdate &&
452        (!Entropic.Enabled || Rand(kSparseEnergyUpdates)))
453      return;
454
455    DistributionNeedsUpdate = false;
456
457    size_t N = Inputs.size();
458    assert(N);
459    Intervals.resize(N + 1);
460    Weights.resize(N);
461    std::iota(Intervals.begin(), Intervals.end(), 0);
462
463    bool VanillaSchedule = true;
464    if (Entropic.Enabled) {
465      for (auto II : Inputs) {
466        if (II->NeedsEnergyUpdate && II->Energy != 0.0) {
467          II->NeedsEnergyUpdate = false;
468          II->UpdateEnergy(RareFeatures.size());
469        }
470      }
471
472      for (size_t i = 0; i < N; i++) {
473
474        if (Inputs[i]->NumFeatures == 0) {
475          // If the seed doesn't represent any features, assign zero energy.
476          Weights[i] = 0.;
477        } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor >
478                   NumExecutedMutations / Inputs.size()) {
479          // If the seed was fuzzed a lot more than average, assign zero energy.
480          Weights[i] = 0.;
481        } else {
482          // Otherwise, simply assign the computed energy.
483          Weights[i] = Inputs[i]->Energy;
484        }
485
486        // If energy for all seeds is zero, fall back to vanilla schedule.
487        if (Weights[i] > 0.0)
488          VanillaSchedule = false;
489      }
490    }
491
492    if (VanillaSchedule) {
493      for (size_t i = 0; i < N; i++)
494        Weights[i] = Inputs[i]->NumFeatures
495                         ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1)
496                         : 0.;
497    }
498
499    if (FeatureDebug) {
500      for (size_t i = 0; i < N; i++)
501        Printf("%zd ", Inputs[i]->NumFeatures);
502      Printf("SCORE\n");
503      for (size_t i = 0; i < N; i++)
504        Printf("%f ", Weights[i]);
505      Printf("Weights\n");
506    }
507    CorpusDistribution = std::piecewise_constant_distribution<double>(
508        Intervals.begin(), Intervals.end(), Weights.begin());
509  }
510  std::piecewise_constant_distribution<double> CorpusDistribution;
511
512  Vector<double> Intervals;
513  Vector<double> Weights;
514
515  std::unordered_set<std::string> Hashes;
516  Vector<InputInfo*> Inputs;
517
518  size_t NumAddedFeatures = 0;
519  size_t NumUpdatedFeatures = 0;
520  uint32_t InputSizesPerFeature[kFeatureSetSize];
521  uint32_t SmallestElementPerFeature[kFeatureSetSize];
522
523  bool DistributionNeedsUpdate = true;
524  uint16_t FreqOfMostAbundantRareFeature = 0;
525  uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {};
526  Vector<uint32_t> RareFeatures;
527
528  std::string OutputCorpus;
529};
530
531}  // namespace fuzzer
532
533#endif  // LLVM_FUZZER_CORPUS
534