//===-- tsan_clock.cc -----------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of ThreadSanitizer (TSan), a race detector.
//
//===----------------------------------------------------------------------===//
#include "tsan_clock.h"
#include "tsan_rtl.h"
#include "sanitizer_common/sanitizer_placement_new.h"

// SyncClock and ThreadClock implement vector clocks for sync variables
// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
// ThreadClock contains a fixed-size vector clock for the maximum number of
// threads. SyncClock contains a growable vector clock for the currently
// necessary number of threads.
// Together they implement a very simple model of operations, namely:
//
//   void ThreadClock::acquire(const SyncClock *src) {
//     for (int i = 0; i < kMaxThreads; i++)
//       clock[i] = max(clock[i], src->clock[i]);
//   }
//
//   void ThreadClock::release(SyncClock *dst) const {
//     for (int i = 0; i < kMaxThreads; i++)
//       dst->clock[i] = max(dst->clock[i], clock[i]);
//   }
//
//   void ThreadClock::ReleaseStore(SyncClock *dst) const {
//     for (int i = 0; i < kMaxThreads; i++)
//       dst->clock[i] = clock[i];
//   }
//
//   void ThreadClock::acq_rel(SyncClock *dst) {
//     acquire(dst);
//     release(dst);
//   }
//
// Conformance to this model is extensively verified in tsan_clock_test.cc.
// However, the implementation is significantly more complex. The complexity
// allows us to implement important classes of use cases in O(1) instead of
// O(N).
//
// The use cases are:
// 1. Singleton/once atomic that has a single release-store operation followed
//    by zillions of acquire-loads (the acquire-load is O(1)).
// 2. Thread-local mutex (both lock and unlock can be O(1)).
// 3. Leaf mutex (unlock is O(1)).
// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
// 5. An atomic with a single writer (writes can be O(1)).
// The implementation dynamically adapts to the workload. So if an atomic is
// in a read-only phase, these reads will be O(1); if it later switches to a
// read/write phase, the implementation will correctly handle that by
// switching to O(N).
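//
// For example, a minimal usage sketch of use case 1 (modeled on
// tsan_clock_test.cc; the cache/thread objects here are illustrative):
//
//   ClockCache cache;                 // set up elsewhere in the runtime
//   ThreadClock thr(tid, /*reused=*/0);
//   SyncClock sync;
//   thr.ReleaseStore(&cache, &sync);  // O(N) once; the result gets cached
//   thr.acquire(&cache, &sync);       // repeated acquires are O(1)
//   sync.Reset(&cache);               // SyncClock requires Reset before dtor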
//
// Thread-safety note: all const operations on SyncClock's are conducted under
// a shared lock; all non-const operations on SyncClock's are conducted under
// an exclusive lock; ThreadClock's are private to respective threads and so
// do not need any protection.
//
// Description of SyncClock state:
// clk_ - variable size vector clock, low kClkBits hold timestamp,
//   the remaining bits hold "acquired" flag (the actual value is thread's
//   reused counter);
//   if acquired == thr->reused_, then the respective thread has already
//   acquired this clock (except possibly for dirty elements).
// dirty_ - holds up to two indices in the vector clock that other threads
//   need to acquire regardless of "acquired" flag value;
// release_store_tid_ - denotes that the clock state is a result of
//   release-store operation by the thread with release_store_tid_ index.
// release_store_reused_ - reuse count of release_store_tid_.
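//
// For example (an illustrative scenario): when a thread T release-stores into
// an empty SyncClock by attaching its cached (shared, hence immutable) clock
// table, T's current epoch cannot be written into the table itself and is
// staged as dirty_[0] = {T, clk_[T]} instead; a later acquire must apply such
// dirty elements even if its "acquired" flag is already up to date.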

// We don't have ThreadState in these methods, so this is an ugly hack that
// works only in C++.
#if !SANITIZER_GO
# define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
#else
# define CPP_STAT_INC(typ) (void)0
#endif

namespace __tsan {

static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
}

// Drop reference to the first level block idx.
static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  atomic_uint32_t *ref = ref_ptr(cb);
  u32 v = atomic_load(ref, memory_order_acquire);
  for (;;) {
    CHECK_GT(v, 0);
    if (v == 1)
      break;
    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
      return;
  }
  // The first level block owns the second level blocks, so free them as well.
  for (uptr i = 0; i < blocks; i++)
    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
  ctx->clock_alloc.Free(c, idx);
}

ThreadClock::ThreadClock(unsigned tid, unsigned reused)
    : tid_(tid)
    , reused_(reused + 1)  // 0 has special meaning
    , cached_idx_()
    , cached_size_()
    , cached_blocks_() {
  CHECK_LT(tid, kMaxTidInClock);
  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
  nclk_ = tid_ + 1;
  last_acquire_ = 0;
  internal_memset(clk_, 0, sizeof(clk_));
}

void ThreadClock::ResetCached(ClockCache *c) {
  if (cached_idx_) {
    UnrefClockBlock(c, cached_idx_, cached_blocks_);
    cached_idx_ = 0;
    cached_size_ = 0;
    cached_blocks_ = 0;
  }
}

void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(src->size_, kMaxTid);
  CPP_STAT_INC(StatClockAcquire);

  // Check if it's empty -> no need to do anything.
  const uptr nclk = src->size_;
  if (nclk == 0) {
    CPP_STAT_INC(StatClockAcquireEmpty);
    return;
  }

  bool acquired = false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    unsigned tid = dirty.tid;
    if (tid != kInvalidTid) {
      if (clk_[tid] < dirty.epoch) {
        clk_[tid] = dirty.epoch;
        acquired = true;
      }
    }
  }

  // Check if we've already acquired src after the last release operation on
  // src.
  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
    // O(N) acquire.
    CPP_STAT_INC(StatClockAcquireFull);
    nclk_ = max(nclk_, nclk);
    u64 *dst_pos = &clk_[0];
    for (ClockElem &src_elem : *src) {
      u64 epoch = src_elem.epoch;
      if (*dst_pos < epoch) {
        *dst_pos = epoch;
        acquired = true;
      }
      dst_pos++;
    }

    // Remember that this thread has acquired this clock.
    if (nclk > tid_)
      src->elem(tid_).reused = reused_;
  }

  if (acquired) {
    CPP_STAT_INC(StatClockAcquiredSomething);
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

void ThreadClock::release(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, dst);
    return;
  }

  CPP_STAT_INC(StatClockRelease);
  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  // Check if we had not acquired anything from other threads
  // since the last release on dst. If so, we need to update
  // only dst->elem(tid_).
  if (dst->elem(tid_).epoch > last_acquire_) {
    UpdateCurrentThread(c, dst);
    if (dst->release_store_tid_ != tid_ ||
        dst->release_store_reused_ != reused_)
      dst->release_store_tid_ = kInvalidTid;
    return;
  }

  // O(N) release.
  CPP_STAT_INC(StatClockReleaseFull);
  dst->Unshare(c);
  // First, remember whether we've acquired dst.
  bool acquired = IsAlreadyAcquired(dst);
  if (acquired)
    CPP_STAT_INC(StatClockReleaseAcquired);
  // Update dst->clk_.
  dst->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = max(ce.epoch, clk_[i]);
    ce.reused = 0;
    i++;
  }
  // Clear 'acquired' flag in the remaining elements.
  if (nclk_ < dst->size_)
    CPP_STAT_INC(StatClockReleaseClearTail);
  for (uptr i = nclk_; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->release_store_tid_ = kInvalidTid;
  dst->release_store_reused_ = 0;
  // If we've acquired dst, remember this fact,
  // so that we don't need to acquire it on next acquire.
  if (acquired)
    dst->elem(tid_).reused = reused_;
}

void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);
  CPP_STAT_INC(StatClockStore);

  if (dst->size_ == 0 && cached_idx_ != 0) {
    // Reuse the cached clock.
    // Note: we could reuse/cache the cached clock in more cases:
    // we could update the existing clock and cache it, or replace it with the
    // currently cached clock and release the old one. And for a shared
    // existing clock, we could replace it with the currently cached;
    // or unshare, update and cache. But, for simplicity, we currently reuse
    // the cached clock only when the target clock is empty.
    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
    dst->tab_idx_ = cached_idx_;
    dst->size_ = cached_size_;
    dst->blocks_ = cached_blocks_;
    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
    // The cached clock is shared (immutable),
    // so this is where we store the current clock.
    dst->dirty_[0].tid = tid_;
    dst->dirty_[0].epoch = clk_[tid_];
    dst->release_store_tid_ = tid_;
    dst->release_store_reused_ = reused_;
    // Remember that we don't need to acquire it in the future.
    dst->elem(tid_).reused = reused_;
    // Grab a reference.
    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    return;
  }


  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  if (dst->release_store_tid_ == tid_ &&
      dst->release_store_reused_ == reused_ &&
      dst->elem(tid_).epoch > last_acquire_) {
    CPP_STAT_INC(StatClockStoreFast);
    UpdateCurrentThread(c, dst);
    return;
  }

  // O(N) release-store.
  CPP_STAT_INC(StatClockStoreFull);
  dst->Unshare(c);
  // Note: dst can be larger than this ThreadClock.
  // This is fine since clk_ beyond size is all zeros.
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = clk_[i];
    ce.reused = 0;
    i++;
  }
  for (uptr i = 0; i < kDirtyTids; i++)
    dst->dirty_[i].tid = kInvalidTid;
  dst->release_store_tid_ = tid_;
  dst->release_store_reused_ = reused_;
  // Remember that we don't need to acquire it in the future.
  dst->elem(tid_).reused = reused_;

  // If the resulting clock is cachable, cache it for future release
  // operations. The clock is always cachable if we released to an empty
  // sync object.
  if (cached_idx_ == 0 && dst->Cachable()) {
    // Grab a reference to the ClockBlock.
    atomic_uint32_t *ref = ref_ptr(dst->tab_);
    if (atomic_load(ref, memory_order_acquire) == 1)
      atomic_store_relaxed(ref, 2);
    else
      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    cached_idx_ = dst->tab_idx_;
    cached_size_ = dst->size_;
    cached_blocks_ = dst->blocks_;
  }
}

void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
  CPP_STAT_INC(StatClockAcquireRelease);
  acquire(c, dst);
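  // Note: after acquire() this thread's clock dominates dst element-wise, so
  // the plain release from the model degenerates to a release-store here;
  // ReleaseStore also lets the result be cached for future releases.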
  ReleaseStore(c, dst);
}

// Updates only a single element related to the current thread in dst->clk_.
void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
  // Update the thread's time, but preserve 'acquired' flag.
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty *dirty = &dst->dirty_[i];
    const unsigned tid = dirty->tid;
    if (tid == tid_ || tid == kInvalidTid) {
      CPP_STAT_INC(StatClockReleaseFast);
      dirty->tid = tid_;
      dirty->epoch = clk_[tid_];
      return;
    }
  }
  // Reset all 'acquired' flags, O(N).
  // We are going to touch dst elements, so we need to unshare it.
  dst->Unshare(c);
  CPP_STAT_INC(StatClockReleaseSlow);
  dst->elem(tid_).epoch = clk_[tid_];
  for (uptr i = 0; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->FlushDirty();
}

// Checks whether the current thread has already acquired src.
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
  if (src->elem(tid_).reused != reused_)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    if (dirty.tid != kInvalidTid) {
      if (clk_[dirty.tid] < dirty.epoch)
        return false;
    }
  }
  return true;
}

// Sets a single element in the vector clock.
// This function is called only from weird places like AcquireGlobal.
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
  DCHECK_LT(tid, kMaxTid);
  DCHECK_GE(v, clk_[tid]);
  clk_[tid] = v;
  if (nclk_ <= tid)
    nclk_ = tid + 1;
  last_acquire_ = clk_[tid_];
  ResetCached(c);
}

void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < nclk_; i++)
    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
}

SyncClock::SyncClock() {
  ResetImpl();
}

SyncClock::~SyncClock() {
  // Reset must be called before dtor.
  CHECK_EQ(size_, 0);
  CHECK_EQ(blocks_, 0);
  CHECK_EQ(tab_, 0);
  CHECK_EQ(tab_idx_, 0);
}

void SyncClock::Reset(ClockCache *c) {
  if (size_)
    UnrefClockBlock(c, tab_idx_, blocks_);
  ResetImpl();
}

void SyncClock::ResetImpl() {
  tab_ = 0;
  tab_idx_ = 0;
  size_ = 0;
  blocks_ = 0;
  release_store_tid_ = kInvalidTid;
  release_store_reused_ = 0;
  for (uptr i = 0; i < kDirtyTids; i++)
    dirty_[i].tid = kInvalidTid;
}

void SyncClock::Resize(ClockCache *c, uptr nclk) {
  CPP_STAT_INC(StatClockReleaseResize);
  Unshare(c);
  if (nclk <= capacity()) {
    // Memory is already allocated, just increase the size.
    size_ = nclk;
    return;
  }
  if (size_ == 0) {
    // Grow from 0 to one-level table.
    CHECK_EQ(size_, 0);
    CHECK_EQ(blocks_, 0);
    CHECK_EQ(tab_, 0);
    CHECK_EQ(tab_idx_, 0);
    tab_idx_ = ctx->clock_alloc.Alloc(c);
    tab_ = ctx->clock_alloc.Map(tab_idx_);
    internal_memset(tab_, 0, sizeof(*tab_));
    atomic_store_relaxed(ref_ptr(tab_), 1);
    size_ = 1;
  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
    CHECK_LT(top, ClockBlock::kClockCount);
    const uptr move = top * sizeof(tab_->clock[0]);
    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
    internal_memset(tab_->clock, 0, move);
    append_block(idx);
  }
  // At this point we have the first level table allocated and all clock
  // elements evacuated from it into a second level block.
  // Add second level tables as necessary.
  while (nclk > capacity()) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    internal_memset(cb, 0, sizeof(*cb));
    append_block(idx);
  }
  size_ = nclk;
}

// Flushes all dirty elements into the main clock array.
void SyncClock::FlushDirty() {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty *dirty = &dirty_[i];
    if (dirty->tid != kInvalidTid) {
      CHECK_LT(dirty->tid, size_);
      elem(dirty->tid).epoch = dirty->epoch;
      dirty->tid = kInvalidTid;
    }
  }
}

bool SyncClock::IsShared() const {
  if (size_ == 0)
    return false;
  atomic_uint32_t *ref = ref_ptr(tab_);
  u32 v = atomic_load(ref, memory_order_acquire);
  CHECK_GT(v, 0);
  return v > 1;
}

// Unshares the current clock if it's shared.
// Shared clocks are immutable, so they need to be unshared before any updates.
// Note: this does not apply to dirty entries as they are not shared.
void SyncClock::Unshare(ClockCache *c) {
  if (!IsShared())
    return;
  // First, copy current state into old.
  SyncClock old;
  old.tab_ = tab_;
  old.tab_idx_ = tab_idx_;
  old.size_ = size_;
  old.blocks_ = blocks_;
  old.release_store_tid_ = release_store_tid_;
  old.release_store_reused_ = release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    old.dirty_[i] = dirty_[i];
  // Then, clear current object.
  ResetImpl();
  // Allocate brand new clock in the current object.
  Resize(c, old.size_);
  // Now copy state back into this object.
  Iter old_iter(&old);
  for (ClockElem &ce : *this) {
    ce = *old_iter;
    ++old_iter;
  }
  release_store_tid_ = old.release_store_tid_;
  release_store_reused_ = old.release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    dirty_[i] = old.dirty_[i];
  // Drop reference to old and delete if necessary.
  old.Reset(c);
}

// Can we cache this clock for future release operations?
ALWAYS_INLINE bool SyncClock::Cachable() const {
  if (size_ == 0)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    if (dirty_[i].tid != kInvalidTid)
      return false;
  }
  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
}

// elem linearizes the two-level structure into a linear array.
// Note: this is used only for one-time accesses, vector operations use
// the iterator as it is much faster.
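// Layout sketch (illustrative): with blocks_ == 2, elements with indices in
// [0, 2 * ClockBlock::kClockCount) live in the two second level blocks
// referenced from tab_->table[], and the remaining tail lives directly in
// tab_->clock[].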
ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
  DCHECK_LT(tid, size_);
  const uptr block = tid / ClockBlock::kClockCount;
  DCHECK_LE(block, blocks_);
  tid %= ClockBlock::kClockCount;
  if (block == blocks_)
    return tab_->clock[tid];
  u32 idx = get_block(block);
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  return cb->clock[tid];
}

ALWAYS_INLINE uptr SyncClock::capacity() const {
  if (size_ == 0)
    return 0;
  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
  // How many clock elements we can fit into the first level block.
  // +1 for ref counter.
  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
  return blocks_ * ClockBlock::kClockCount + top;
}
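// Worked example (assuming, for illustration, a 16-byte ClockElem and 4-byte
// table entries, i.e. ratio == 4): with blocks_ == 0, RoundUpTo(1, 4) / 4 == 1
// clock element slot at the end of the first level block is reserved for the
// reference counter, so capacity() returns ClockBlock::kClockCount - 1.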

ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
  DCHECK(size_);
  DCHECK_LT(bi, blocks_);
  return tab_->table[ClockBlock::kBlockIdx - bi];
}

ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
  uptr bi = blocks_++;
  CHECK_EQ(get_block(bi), 0);
  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
}

// Used only by tests.
u64 SyncClock::get(unsigned tid) const {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty dirty = dirty_[i];
    if (dirty.tid == tid)
      return dirty.epoch;
  }
  return elem(tid).epoch;
}

// Used only by Iter test.
u64 SyncClock::get_clean(unsigned tid) const {
  return elem(tid).epoch;
}

void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
  printf("] reused=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
      release_store_tid_, release_store_reused_,
      dirty_[0].tid, dirty_[0].epoch,
      dirty_[1].tid, dirty_[1].epoch);
}

void SyncClock::Iter::Next() {
  // Finished with the current block, move on to the next one.
  block_++;
  if (block_ < parent_->blocks_) {
    // Iterate over the next second level block.
    u32 idx = parent_->get_block(block_);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    pos_ = &cb->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  if (block_ == parent_->blocks_ &&
      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
    // Iterate over elements in the first level block.
    pos_ = &parent_->tab_->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  parent_ = nullptr;  // denotes end
}
}  // namespace __tsan