//===-- tsan_clock.cc -----------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of ThreadSanitizer (TSan), a race detector.
//
//===----------------------------------------------------------------------===//
#include "tsan_clock.h"
#include "tsan_rtl.h"
#include "sanitizer_common/sanitizer_placement_new.h"

// SyncClock and ThreadClock implement vector clocks for sync variables
// (mutexes, atomic variables, file descriptors, etc.) and threads,
// respectively.
// ThreadClock contains a fixed-size vector clock for the maximum number of
// threads. SyncClock contains a growable vector clock for the currently
// necessary number of threads.
// Together they implement a very simple model of operations, namely:
//
// void ThreadClock::acquire(const SyncClock *src) {
//   for (int i = 0; i < kMaxThreads; i++)
//     clock[i] = max(clock[i], src->clock[i]);
// }
//
// void ThreadClock::release(SyncClock *dst) const {
//   for (int i = 0; i < kMaxThreads; i++)
//     dst->clock[i] = max(dst->clock[i], clock[i]);
// }
//
// void ThreadClock::ReleaseStore(SyncClock *dst) const {
//   for (int i = 0; i < kMaxThreads; i++)
//     dst->clock[i] = clock[i];
// }
//
// void ThreadClock::acq_rel(SyncClock *dst) {
//   acquire(dst);
//   release(dst);
// }
//
// Conformance to this model is extensively verified in tsan_clock_test.cc.
// However, the implementation is significantly more complex. The complexity
// allows implementing important classes of use cases in O(1) instead of O(N).
//
// The use cases are:
// 1. Singleton/once atomic that has a single release-store operation followed
//    by zillions of acquire-loads (the acquire-load is O(1)).
// 2. Thread-local mutex (both lock and unlock can be O(1)).
// 3. Leaf mutex (unlock is O(1)).
// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
// 5. An atomic with a single writer (writes can be O(1)).
// The implementation dynamically adapts to the workload. So if an atomic is in
// a read-only phase, these reads will be O(1); if it later switches to a
// read/write phase, the implementation will correctly handle that by switching
// to O(N).
//
// Thread-safety note: all const operations on SyncClock's are conducted under
// a shared lock; all non-const operations on SyncClock's are conducted under
// an exclusive lock; ThreadClock's are private to respective threads and so
// do not need any protection.
//
// Description of SyncClock state:
// clk_ - variable size vector clock, low kClkBits hold timestamp,
//   the remaining bits hold the "acquired" flag (the actual value is the
//   thread's reused counter);
//   if acquired == thr->reused_, then the respective thread has already
//   acquired this clock (except possibly for dirty elements).
// dirty_ - holds up to two indices in the vector clock that other threads
//   need to acquire regardless of the "acquired" flag value;
// release_store_tid_ - denotes that the clock state is a result of a
//   release-store operation by the thread with release_store_tid_ index.
// release_store_reused_ - reuse count of release_store_tid_.

// We don't have ThreadState in these methods, so this is an ugly hack that
// works only in C++.
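// For example, SyncClock::Resize below has no ThreadState parameter, so
// CPP_STAT_INC(StatClockReleaseResize) expands to
// StatInc(cur_thread(), StatClockReleaseResize) in C++ builds,
// and to a no-op in Go builds.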
#if !SANITIZER_GO
# define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
#else
# define CPP_STAT_INC(typ) (void)0
#endif

namespace __tsan {

static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
}

// Drop reference to the first level block idx.
static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  atomic_uint32_t *ref = ref_ptr(cb);
  u32 v = atomic_load(ref, memory_order_acquire);
  for (;;) {
    CHECK_GT(v, 0);
    if (v == 1)
      break;
    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
      return;
  }
  // First level block owns second level blocks, so free them as well.
  for (uptr i = 0; i < blocks; i++)
    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
  ctx->clock_alloc.Free(c, idx);
}

ThreadClock::ThreadClock(unsigned tid, unsigned reused)
    : tid_(tid)
    , reused_(reused + 1)  // 0 has special meaning
    , cached_idx_()
    , cached_size_()
    , cached_blocks_() {
  CHECK_LT(tid, kMaxTidInClock);
  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
  nclk_ = tid_ + 1;
  last_acquire_ = 0;
  internal_memset(clk_, 0, sizeof(clk_));
}

void ThreadClock::ResetCached(ClockCache *c) {
  if (cached_idx_) {
    UnrefClockBlock(c, cached_idx_, cached_blocks_);
    cached_idx_ = 0;
    cached_size_ = 0;
    cached_blocks_ = 0;
  }
}

void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(src->size_, kMaxTid);
  CPP_STAT_INC(StatClockAcquire);

  // Check if it's empty -> no need to do anything.
  const uptr nclk = src->size_;
  if (nclk == 0) {
    CPP_STAT_INC(StatClockAcquireEmpty);
    return;
  }

  bool acquired = false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    unsigned tid = dirty.tid;
    if (tid != kInvalidTid) {
      if (clk_[tid] < dirty.epoch) {
        clk_[tid] = dirty.epoch;
        acquired = true;
      }
    }
  }

  // Check if we've already acquired src after the last release operation
  // on src.
  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
    // O(N) acquire.
    CPP_STAT_INC(StatClockAcquireFull);
    nclk_ = max(nclk_, nclk);
    u64 *dst_pos = &clk_[0];
    for (ClockElem &src_elem : *src) {
      u64 epoch = src_elem.epoch;
      if (*dst_pos < epoch) {
        *dst_pos = epoch;
        acquired = true;
      }
      dst_pos++;
    }

    // Remember that this thread has acquired this clock.
    if (nclk > tid_)
      src->elem(tid_).reused = reused_;
  }

  if (acquired) {
    CPP_STAT_INC(StatClockAcquiredSomething);
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

void ThreadClock::release(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, dst);
    return;
  }

  CPP_STAT_INC(StatClockRelease);
  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  // Check if we had not acquired anything from other threads
  // since the last release on dst. If so, we need to update
  // only dst->elem(tid_).
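  // dst->elem(tid_).epoch > last_acquire_ means that the last release to dst
  // was done by this thread after its last acquire, so dst already contains
  // everything this thread could transfer, except for our own current epoch.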
  if (dst->elem(tid_).epoch > last_acquire_) {
    UpdateCurrentThread(c, dst);
    if (dst->release_store_tid_ != tid_ ||
        dst->release_store_reused_ != reused_)
      dst->release_store_tid_ = kInvalidTid;
    return;
  }

  // O(N) release.
  CPP_STAT_INC(StatClockReleaseFull);
  dst->Unshare(c);
  // First, remember whether we've acquired dst.
  bool acquired = IsAlreadyAcquired(dst);
  if (acquired)
    CPP_STAT_INC(StatClockReleaseAcquired);
  // Update dst->clk_.
  dst->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = max(ce.epoch, clk_[i]);
    ce.reused = 0;
    i++;
  }
  // Clear 'acquired' flag in the remaining elements.
  if (nclk_ < dst->size_)
    CPP_STAT_INC(StatClockReleaseClearTail);
  for (uptr i = nclk_; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->release_store_tid_ = kInvalidTid;
  dst->release_store_reused_ = 0;
  // If we've acquired dst, remember this fact,
  // so that we don't need to acquire it on next acquire.
  if (acquired)
    dst->elem(tid_).reused = reused_;
}

void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);
  CPP_STAT_INC(StatClockStore);

  if (dst->size_ == 0 && cached_idx_ != 0) {
    // Reuse the cached clock.
    // Note: we could reuse/cache the cached clock in more cases:
    // we could update the existing clock and cache it, or replace it with the
    // currently cached clock and release the old one. And for a shared
    // existing clock, we could replace it with the currently cached;
    // or unshare, update and cache. But, for simplicity, we currently reuse
    // the cached clock only when the target clock is empty.
    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
    dst->tab_idx_ = cached_idx_;
    dst->size_ = cached_size_;
    dst->blocks_ = cached_blocks_;
    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
    // The cached clock is shared (immutable),
    // so this is where we store the current clock.
    dst->dirty_[0].tid = tid_;
    dst->dirty_[0].epoch = clk_[tid_];
    dst->release_store_tid_ = tid_;
    dst->release_store_reused_ = reused_;
    // Remember that we don't need to acquire it in the future.
    dst->elem(tid_).reused = reused_;
    // Grab a reference.
    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    return;
  }

  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  if (dst->release_store_tid_ == tid_ &&
      dst->release_store_reused_ == reused_ &&
      dst->elem(tid_).epoch > last_acquire_) {
    CPP_STAT_INC(StatClockStoreFast);
    UpdateCurrentThread(c, dst);
    return;
  }

  // O(N) release-store.
  CPP_STAT_INC(StatClockStoreFull);
  dst->Unshare(c);
  // Note: dst can be larger than this ThreadClock.
  // This is fine since clk_ beyond size is all zeros.
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = clk_[i];
    ce.reused = 0;
    i++;
  }
  for (uptr i = 0; i < kDirtyTids; i++)
    dst->dirty_[i].tid = kInvalidTid;
  dst->release_store_tid_ = tid_;
  dst->release_store_reused_ = reused_;
  // Remember that we don't need to acquire it in the future.
  dst->elem(tid_).reused = reused_;

  // If the resulting clock is cachable, cache it for future release operations.
  // The clock is always cachable if we released to an empty sync object.
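  // cached_idx_ == 0 means this thread does not hold a cached clock yet;
  // Cachable() additionally checks that dst has no pending dirty entries and
  // that its ClockBlock is not shared (see SyncClock::Cachable below).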
  if (cached_idx_ == 0 && dst->Cachable()) {
    // Grab a reference to the ClockBlock.
    atomic_uint32_t *ref = ref_ptr(dst->tab_);
    if (atomic_load(ref, memory_order_acquire) == 1)
      atomic_store_relaxed(ref, 2);
    else
      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    cached_idx_ = dst->tab_idx_;
    cached_size_ = dst->size_;
    cached_blocks_ = dst->blocks_;
  }
}

void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
  CPP_STAT_INC(StatClockAcquireRelease);
  acquire(c, dst);
  ReleaseStore(c, dst);
}

// Updates only a single element related to the current thread in dst->clk_.
void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
  // Update the thread's time, but preserve the 'acquired' flag.
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty *dirty = &dst->dirty_[i];
    const unsigned tid = dirty->tid;
    if (tid == tid_ || tid == kInvalidTid) {
      CPP_STAT_INC(StatClockReleaseFast);
      dirty->tid = tid_;
      dirty->epoch = clk_[tid_];
      return;
    }
  }
  // Reset all 'acquired' flags, O(N).
  // We are going to touch dst elements, so we need to unshare it.
  dst->Unshare(c);
  CPP_STAT_INC(StatClockReleaseSlow);
  dst->elem(tid_).epoch = clk_[tid_];
  for (uptr i = 0; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->FlushDirty();
}

// Checks whether the current thread has already acquired src.
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
  if (src->elem(tid_).reused != reused_)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    if (dirty.tid != kInvalidTid) {
      if (clk_[dirty.tid] < dirty.epoch)
        return false;
    }
  }
  return true;
}

// Sets a single element in the vector clock.
// This function is called only from weird places like AcquireGlobal.
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
  DCHECK_LT(tid, kMaxTid);
  DCHECK_GE(v, clk_[tid]);
  clk_[tid] = v;
  if (nclk_ <= tid)
    nclk_ = tid + 1;
  last_acquire_ = clk_[tid_];
  ResetCached(c);
}

void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < nclk_; i++)
    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
}

SyncClock::SyncClock() {
  ResetImpl();
}

SyncClock::~SyncClock() {
  // Reset must be called before dtor.
  CHECK_EQ(size_, 0);
  CHECK_EQ(blocks_, 0);
  CHECK_EQ(tab_, 0);
  CHECK_EQ(tab_idx_, 0);
}

void SyncClock::Reset(ClockCache *c) {
  if (size_)
    UnrefClockBlock(c, tab_idx_, blocks_);
  ResetImpl();
}

void SyncClock::ResetImpl() {
  tab_ = 0;
  tab_idx_ = 0;
  size_ = 0;
  blocks_ = 0;
  release_store_tid_ = kInvalidTid;
  release_store_reused_ = 0;
  for (uptr i = 0; i < kDirtyTids; i++)
    dirty_[i].tid = kInvalidTid;
}

void SyncClock::Resize(ClockCache *c, uptr nclk) {
  CPP_STAT_INC(StatClockReleaseResize);
  Unshare(c);
  if (nclk <= capacity()) {
    // Memory is already allocated, just increase the size.
    size_ = nclk;
    return;
  }
  if (size_ == 0) {
    // Grow from 0 to one-level table.
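    // In a one-level table clock elements are stored directly in tab_->clock;
    // the top of tab_->table is reserved for the reference counter and, once
    // the clock grows, for second-level block indices.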
    CHECK_EQ(size_, 0);
    CHECK_EQ(blocks_, 0);
    CHECK_EQ(tab_, 0);
    CHECK_EQ(tab_idx_, 0);
    tab_idx_ = ctx->clock_alloc.Alloc(c);
    tab_ = ctx->clock_alloc.Map(tab_idx_);
    internal_memset(tab_, 0, sizeof(*tab_));
    atomic_store_relaxed(ref_ptr(tab_), 1);
    size_ = 1;
  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
    CHECK_LT(top, ClockBlock::kClockCount);
    const uptr move = top * sizeof(tab_->clock[0]);
    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
    internal_memset(tab_->clock, 0, move);
    append_block(idx);
  }
  // At this point we have the first level table allocated and all clock
  // elements are evacuated from it to a second level block.
  // Add second level tables as necessary.
  while (nclk > capacity()) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    internal_memset(cb, 0, sizeof(*cb));
    append_block(idx);
  }
  size_ = nclk;
}

// Flushes all dirty elements into the main clock array.
void SyncClock::FlushDirty() {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty *dirty = &dirty_[i];
    if (dirty->tid != kInvalidTid) {
      CHECK_LT(dirty->tid, size_);
      elem(dirty->tid).epoch = dirty->epoch;
      dirty->tid = kInvalidTid;
    }
  }
}

bool SyncClock::IsShared() const {
  if (size_ == 0)
    return false;
  atomic_uint32_t *ref = ref_ptr(tab_);
  u32 v = atomic_load(ref, memory_order_acquire);
  CHECK_GT(v, 0);
  return v > 1;
}

// Unshares the current clock if it's shared.
// Shared clocks are immutable, so they need to be unshared before any updates.
// Note: this does not apply to dirty entries as they are not shared.
void SyncClock::Unshare(ClockCache *c) {
  if (!IsShared())
    return;
  // First, copy current state into old.
  SyncClock old;
  old.tab_ = tab_;
  old.tab_idx_ = tab_idx_;
  old.size_ = size_;
  old.blocks_ = blocks_;
  old.release_store_tid_ = release_store_tid_;
  old.release_store_reused_ = release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    old.dirty_[i] = dirty_[i];
  // Then, clear current object.
  ResetImpl();
  // Allocate brand new clock in the current object.
  Resize(c, old.size_);
  // Now copy state back into this object.
  Iter old_iter(&old);
  for (ClockElem &ce : *this) {
    ce = *old_iter;
    ++old_iter;
  }
  release_store_tid_ = old.release_store_tid_;
  release_store_reused_ = old.release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    dirty_[i] = old.dirty_[i];
  // Drop reference to old and delete if necessary.
  old.Reset(c);
}

// Can we cache this clock for future release operations?
ALWAYS_INLINE bool SyncClock::Cachable() const {
  if (size_ == 0)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    if (dirty_[i].tid != kInvalidTid)
      return false;
  }
  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
}

// elem linearizes the two-level structure into a linear array.
// Note: this is used only for one-time accesses; vector operations use
// the iterator as it is much faster.
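// tid maps to second-level block tid / ClockBlock::kClockCount at offset
// tid % ClockBlock::kClockCount; tids past the last second-level block
// (block == blocks_) live in the first-level block itself.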
ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
  DCHECK_LT(tid, size_);
  const uptr block = tid / ClockBlock::kClockCount;
  DCHECK_LE(block, blocks_);
  tid %= ClockBlock::kClockCount;
  if (block == blocks_)
    return tab_->clock[tid];
  u32 idx = get_block(block);
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  return cb->clock[tid];
}

ALWAYS_INLINE uptr SyncClock::capacity() const {
  if (size_ == 0)
    return 0;
  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
  // How many clock elements we can fit into the first level block.
  // +1 for ref counter.
  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
  return blocks_ * ClockBlock::kClockCount + top;
}

ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
  DCHECK(size_);
  DCHECK_LT(bi, blocks_);
  return tab_->table[ClockBlock::kBlockIdx - bi];
}

ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
  uptr bi = blocks_++;
  CHECK_EQ(get_block(bi), 0);
  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
}

// Used only by tests.
u64 SyncClock::get(unsigned tid) const {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty dirty = dirty_[i];
    if (dirty.tid == tid)
      return dirty.epoch;
  }
  return elem(tid).epoch;
}

// Used only by Iter test.
u64 SyncClock::get_clean(unsigned tid) const {
  return elem(tid).epoch;
}

void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
  printf("] reused=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
      release_store_tid_, release_store_reused_,
      dirty_[0].tid, dirty_[0].epoch,
      dirty_[1].tid, dirty_[1].epoch);
}

void SyncClock::Iter::Next() {
  // Finished with the current block, move on to the next one.
  block_++;
  if (block_ < parent_->blocks_) {
    // Iterate over the next second level block.
    u32 idx = parent_->get_block(block_);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    pos_ = &cb->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  if (block_ == parent_->blocks_ &&
      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
    // Iterate over elements in the first level block.
    pos_ = &parent_->tab_->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  parent_ = nullptr;  // denotes end
}

}  // namespace __tsan