1/*===- InstrProfilingValue.c - Support library for PGO instrumentation ----===*\
2|*
3|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4|* See https://llvm.org/LICENSE.txt for license information.
5|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6|*
7\*===----------------------------------------------------------------------===*/
8
9#include <limits.h>
10#include <stdio.h>
11#include <stdlib.h>
12#include <string.h>
13
14#include "InstrProfiling.h"
15#include "InstrProfilingInternal.h"
16#include "InstrProfilingUtil.h"
17
18#define INSTR_PROF_VALUE_PROF_DATA
19#define INSTR_PROF_COMMON_API_IMPL
20#include "profile/InstrProfData.inc"
21
22static int hasStaticCounters = 1;
23static int OutOfNodesWarnings = 0;
24static int hasNonDefaultValsPerSite = 0;
25#define INSTR_PROF_MAX_VP_WARNS 10
26#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 16
27#define INSTR_PROF_VNODE_POOL_SIZE 1024
28
29#ifndef _MSC_VER
30/* A shared static pool in addition to the vnodes statically
31 * allocated by the compiler.  */
32COMPILER_RT_VISIBILITY ValueProfNode
33    lprofValueProfNodes[INSTR_PROF_VNODE_POOL_SIZE] COMPILER_RT_SECTION(
34       COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME);
35#endif
36
37COMPILER_RT_VISIBILITY uint32_t VPMaxNumValsPerSite =
38    INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE;
39
40COMPILER_RT_VISIBILITY void lprofSetupValueProfiler() {
41  const char *Str = 0;
42  Str = getenv("LLVM_VP_MAX_NUM_VALS_PER_SITE");
43  if (Str && Str[0]) {
44    VPMaxNumValsPerSite = atoi(Str);
45    hasNonDefaultValsPerSite = 1;
46  }
47  if (VPMaxNumValsPerSite > INSTR_PROF_MAX_NUM_VAL_PER_SITE)
48    VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
49}
50
51COMPILER_RT_VISIBILITY void lprofSetMaxValsPerSite(uint32_t MaxVals) {
52  VPMaxNumValsPerSite = MaxVals;
53  hasNonDefaultValsPerSite = 1;
54}
55
56/* This method is only used in value profiler mock testing.  */
57COMPILER_RT_VISIBILITY void
58__llvm_profile_set_num_value_sites(__llvm_profile_data *Data,
59                                   uint32_t ValueKind, uint16_t NumValueSites) {
60  *((uint16_t *)&Data->NumValueSites[ValueKind]) = NumValueSites;
61}
62
63/* This method is only used in value profiler mock testing.  */
64COMPILER_RT_VISIBILITY const __llvm_profile_data *
65__llvm_profile_iterate_data(const __llvm_profile_data *Data) {
66  return Data + 1;
67}
68
69/* This method is only used in value profiler mock testing.  */
70COMPILER_RT_VISIBILITY void *
71__llvm_get_function_addr(const __llvm_profile_data *Data) {
72  return Data->FunctionPointer;
73}
74
75/* Allocate an array that holds the pointers to the linked lists of
76 * value profile counter nodes. The number of element of the array
77 * is the total number of value profile sites instrumented. Returns
78 * 0 if allocation fails.
79 */
80
81static int allocateValueProfileCounters(__llvm_profile_data *Data) {
82  uint64_t NumVSites = 0;
83  uint32_t VKI;
84
85  /* This function will never be called when value site array is allocated
86     statically at compile time.  */
87  hasStaticCounters = 0;
88  /* When dynamic allocation is enabled, allow tracking the max number of
89   * values allowd.  */
90  if (!hasNonDefaultValsPerSite)
91    VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
92
93  for (VKI = IPVK_First; VKI <= IPVK_Last; ++VKI)
94    NumVSites += Data->NumValueSites[VKI];
95
96  ValueProfNode **Mem =
97      (ValueProfNode **)calloc(NumVSites, sizeof(ValueProfNode *));
98  if (!Mem)
99    return 0;
100  if (!COMPILER_RT_BOOL_CMPXCHG(&Data->Values, 0, Mem)) {
101    free(Mem);
102    return 0;
103  }
104  return 1;
105}
106
107static ValueProfNode *allocateOneNode(void) {
108  ValueProfNode *Node;
109
110  if (!hasStaticCounters)
111    return (ValueProfNode *)calloc(1, sizeof(ValueProfNode));
112
113  /* Early check to avoid value wrapping around.  */
114  if (CurrentVNode + 1 > EndVNode) {
115    if (OutOfNodesWarnings++ < INSTR_PROF_MAX_VP_WARNS) {
116      PROF_WARN("Unable to track new values: %s. "
117                " Consider using option -mllvm -vp-counters-per-site=<n> to "
118                "allocate more"
119                " value profile counters at compile time. \n",
120                "Running out of static counters");
121    }
122    return 0;
123  }
124  Node = COMPILER_RT_PTR_FETCH_ADD(ValueProfNode, CurrentVNode, 1);
125  /* Due to section padding, EndVNode point to a byte which is one pass
126   * an incomplete VNode, so we need to skip the last incomplete node. */
127  if (Node + 1 > EndVNode)
128    return 0;
129
130  return Node;
131}
132
133static COMPILER_RT_ALWAYS_INLINE void
134instrumentTargetValueImpl(uint64_t TargetValue, void *Data,
135                          uint32_t CounterIndex, uint64_t CountValue) {
136  __llvm_profile_data *PData = (__llvm_profile_data *)Data;
137  if (!PData)
138    return;
139  if (!CountValue)
140    return;
141  if (!PData->Values) {
142    if (!allocateValueProfileCounters(PData))
143      return;
144  }
145
146  ValueProfNode **ValueCounters = (ValueProfNode **)PData->Values;
147  ValueProfNode *PrevVNode = NULL;
148  ValueProfNode *MinCountVNode = NULL;
149  ValueProfNode *CurVNode = ValueCounters[CounterIndex];
150  uint64_t MinCount = UINT64_MAX;
151
152  uint8_t VDataCount = 0;
153  while (CurVNode) {
154    if (TargetValue == CurVNode->Value) {
155      CurVNode->Count += CountValue;
156      return;
157    }
158    if (CurVNode->Count < MinCount) {
159      MinCount = CurVNode->Count;
160      MinCountVNode = CurVNode;
161    }
162    PrevVNode = CurVNode;
163    CurVNode = CurVNode->Next;
164    ++VDataCount;
165  }
166
167  if (VDataCount >= VPMaxNumValsPerSite) {
168    /* Bump down the min count node's count. If it reaches 0,
169     * evict it. This eviction/replacement policy makes hot
170     * targets more sticky while cold targets less so. In other
171     * words, it makes it less likely for the hot targets to be
172     * prematurally evicted during warmup/establishment period,
173     * when their counts are still low. In a special case when
174     * the number of values tracked is reduced to only one, this
175     * policy will guarantee that the dominating target with >50%
176     * total count will survive in the end. Note that this scheme
177     * allows the runtime to track the min count node in an adaptive
178     * manner. It can correct previous mistakes and eventually
179     * lock on a cold target that is alread in stable state.
180     *
181     * In very rare cases,  this replacement scheme may still lead
182     * to target loss. For instance, out of \c N value slots, \c N-1
183     * slots are occupied by luke warm targets during the warmup
184     * period and the remaining one slot is competed by two or more
185     * very hot targets. If those hot targets occur in an interleaved
186     * way, none of them will survive (gain enough weight to throw out
187     * other established entries) due to the ping-pong effect.
188     * To handle this situation, user can choose to increase the max
189     * number of tracked values per value site. Alternatively, a more
190     * expensive eviction mechanism can be implemented. It requires
191     * the runtime to track the total number of evictions per-site.
192     * When the total number of evictions reaches certain threshold,
193     * the runtime can wipe out more than one lowest count entries
194     * to give space for hot targets.
195     */
196    if (MinCountVNode->Count <= CountValue) {
197      CurVNode = MinCountVNode;
198      CurVNode->Value = TargetValue;
199      CurVNode->Count = CountValue;
200    } else
201      MinCountVNode->Count -= CountValue;
202
203    return;
204  }
205
206  CurVNode = allocateOneNode();
207  if (!CurVNode)
208    return;
209  CurVNode->Value = TargetValue;
210  CurVNode->Count += CountValue;
211
212  uint32_t Success = 0;
213  if (!ValueCounters[CounterIndex])
214    Success =
215        COMPILER_RT_BOOL_CMPXCHG(&ValueCounters[CounterIndex], 0, CurVNode);
216  else if (PrevVNode && !PrevVNode->Next)
217    Success = COMPILER_RT_BOOL_CMPXCHG(&(PrevVNode->Next), 0, CurVNode);
218
219  if (!Success && !hasStaticCounters) {
220    free(CurVNode);
221    return;
222  }
223}
224
225COMPILER_RT_VISIBILITY void
226__llvm_profile_instrument_target(uint64_t TargetValue, void *Data,
227                                 uint32_t CounterIndex) {
228  instrumentTargetValueImpl(TargetValue, Data, CounterIndex, 1);
229}
230COMPILER_RT_VISIBILITY void
231__llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data,
232                                       uint32_t CounterIndex,
233                                       uint64_t CountValue) {
234  instrumentTargetValueImpl(TargetValue, Data, CounterIndex, CountValue);
235}
236
237/*
238 * The target values are partitioned into multiple regions/ranges. There is one
239 * contiguous region which is precise -- every value in the range is tracked
240 * individually. A value outside the precise region will be collapsed into one
241 * value depending on the region it falls in.
242 *
243 * There are three regions:
244 * 1. (-inf, PreciseRangeStart) and (PreciseRangeLast, LargeRangeValue) belong
245 * to one region -- all values here should be mapped to one value of
246 * "PreciseRangeLast + 1".
247 * 2. [PreciseRangeStart, PreciseRangeLast]
248 * 3. Large values: [LargeValue, +inf) maps to one value of LargeValue.
249 *
250 * The range for large values is optional. The default value of INT64_MIN
251 * indicates it is not specified.
252 */
253COMPILER_RT_VISIBILITY void __llvm_profile_instrument_range(
254    uint64_t TargetValue, void *Data, uint32_t CounterIndex,
255    int64_t PreciseRangeStart, int64_t PreciseRangeLast, int64_t LargeValue) {
256
257  if (LargeValue != INT64_MIN && (int64_t)TargetValue >= LargeValue)
258    TargetValue = LargeValue;
259  else if ((int64_t)TargetValue < PreciseRangeStart ||
260           (int64_t)TargetValue > PreciseRangeLast)
261    TargetValue = PreciseRangeLast + 1;
262
263  __llvm_profile_instrument_target(TargetValue, Data, CounterIndex);
264}
265
266/*
267 * A wrapper struct that represents value profile runtime data.
268 * Like InstrProfRecord class which is used by profiling host tools,
269 * ValueProfRuntimeRecord also implements the abstract intefaces defined in
270 * ValueProfRecordClosure so that the runtime data can be serialized using
271 * shared C implementation.
272 */
273typedef struct ValueProfRuntimeRecord {
274  const __llvm_profile_data *Data;
275  ValueProfNode **NodesKind[IPVK_Last + 1];
276  uint8_t **SiteCountArray;
277} ValueProfRuntimeRecord;
278
279/* ValueProfRecordClosure Interface implementation. */
280
281static uint32_t getNumValueSitesRT(const void *R, uint32_t VK) {
282  return ((const ValueProfRuntimeRecord *)R)->Data->NumValueSites[VK];
283}
284
285static uint32_t getNumValueDataRT(const void *R, uint32_t VK) {
286  uint32_t S = 0, I;
287  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
288  if (Record->SiteCountArray[VK] == INSTR_PROF_NULLPTR)
289    return 0;
290  for (I = 0; I < Record->Data->NumValueSites[VK]; I++)
291    S += Record->SiteCountArray[VK][I];
292  return S;
293}
294
295static uint32_t getNumValueDataForSiteRT(const void *R, uint32_t VK,
296                                         uint32_t S) {
297  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
298  return Record->SiteCountArray[VK][S];
299}
300
301static ValueProfRuntimeRecord RTRecord;
302static ValueProfRecordClosure RTRecordClosure = {
303    &RTRecord,          INSTR_PROF_NULLPTR, /* GetNumValueKinds */
304    getNumValueSitesRT, getNumValueDataRT,  getNumValueDataForSiteRT,
305    INSTR_PROF_NULLPTR, /* RemapValueData */
306    INSTR_PROF_NULLPTR, /* GetValueForSite, */
307    INSTR_PROF_NULLPTR  /* AllocValueProfData */
308};
309
310static uint32_t
311initializeValueProfRuntimeRecord(const __llvm_profile_data *Data,
312                                 uint8_t *SiteCountArray[]) {
313  unsigned I, J, S = 0, NumValueKinds = 0;
314  ValueProfNode **Nodes = (ValueProfNode **)Data->Values;
315  RTRecord.Data = Data;
316  RTRecord.SiteCountArray = SiteCountArray;
317  for (I = 0; I <= IPVK_Last; I++) {
318    uint16_t N = Data->NumValueSites[I];
319    if (!N)
320      continue;
321
322    NumValueKinds++;
323
324    RTRecord.NodesKind[I] = Nodes ? &Nodes[S] : INSTR_PROF_NULLPTR;
325    for (J = 0; J < N; J++) {
326      /* Compute value count for each site. */
327      uint32_t C = 0;
328      ValueProfNode *Site =
329          Nodes ? RTRecord.NodesKind[I][J] : INSTR_PROF_NULLPTR;
330      while (Site) {
331        C++;
332        Site = Site->Next;
333      }
334      if (C > UCHAR_MAX)
335        C = UCHAR_MAX;
336      RTRecord.SiteCountArray[I][J] = C;
337    }
338    S += N;
339  }
340  return NumValueKinds;
341}
342
343static ValueProfNode *getNextNValueData(uint32_t VK, uint32_t Site,
344                                        InstrProfValueData *Dst,
345                                        ValueProfNode *StartNode, uint32_t N) {
346  unsigned I;
347  ValueProfNode *VNode = StartNode ? StartNode : RTRecord.NodesKind[VK][Site];
348  for (I = 0; I < N; I++) {
349    Dst[I].Value = VNode->Value;
350    Dst[I].Count = VNode->Count;
351    VNode = VNode->Next;
352  }
353  return VNode;
354}
355
356static uint32_t getValueProfDataSizeWrapper(void) {
357  return getValueProfDataSize(&RTRecordClosure);
358}
359
360static uint32_t getNumValueDataForSiteWrapper(uint32_t VK, uint32_t S) {
361  return getNumValueDataForSiteRT(&RTRecord, VK, S);
362}
363
364static VPDataReaderType TheVPDataReader = {
365    initializeValueProfRuntimeRecord, getValueProfRecordHeaderSize,
366    getFirstValueProfRecord,          getNumValueDataForSiteWrapper,
367    getValueProfDataSizeWrapper,      getNextNValueData};
368
369COMPILER_RT_VISIBILITY VPDataReaderType *lprofGetVPDataReader() {
370  return &TheVPDataReader;
371}
372