1/*
2 * kmp_barrier.cpp
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_wait_release.h"
15#include "kmp_itt.h"
16#include "kmp_os.h"
17#include "kmp_stats.h"
18#include "ompt-specific.h"
19
20#if KMP_MIC
21#include <immintrin.h>
22#define USE_NGO_STORES 1
23#endif // KMP_MIC
24
25#include "tsan_annotations.h"
26
27#if KMP_MIC && USE_NGO_STORES
28// ICV copying
29#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33#else
34#define ngo_load(src) ((void)0)
35#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37#define ngo_sync() ((void)0)
38#endif /* KMP_MIC && USE_NGO_STORES */
39
40void __kmp_print_structure(void); // Forward declaration
41
42// ---------------------------- Barrier Algorithms ----------------------------
43
44// Linear Barrier
45template <bool cancellable = false>
46static bool __kmp_linear_barrier_gather_template(
47    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50  kmp_team_t *team = this_thr->th.th_team;
51  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52  kmp_info_t **other_threads = team->t.t_threads;
53
54  KA_TRACE(
55      20,
56      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57       gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60#if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64        __itt_get_timestamp();
65  }
66#endif
67  // We now perform a linear reduction to signal that all of the threads have
68  // arrived.
69  if (!KMP_MASTER_TID(tid)) {
70    KA_TRACE(20,
71             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72              "arrived(%p): %llu => %llu\n",
73              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76    // Mark arrival to master thread
77    /* After performing this write, a worker thread may not assume that the team
78       is valid any more - it could be deallocated by the master thread at any
79       time. */
80    ANNOTATE_BARRIER_BEGIN(this_thr);
81    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
82    flag.release();
83  } else {
84    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85    int nproc = this_thr->th.th_team_nproc;
86    int i;
87    // Don't have to worry about sleep bit here or atomic since team setting
88    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89
90    // Collect all the worker team member threads.
91    for (i = 1; i < nproc; ++i) {
92#if KMP_CACHE_MANAGE
93      // Prefetch next thread's arrived count
94      if (i + 1 < nproc)
95        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96#endif /* KMP_CACHE_MANAGE */
97      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98                    "arrived(%p) == %llu\n",
99                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100                    team->t.t_id, i,
101                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102
103      // Wait for worker thread to arrive
104      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
105                       new_state);
106      if (cancellable) {
107        bool cancelled = flag.wait_cancellable_nosleep(
108            this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109        if (cancelled)
110          return true;
111      } else {
112        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113      }
114      ANNOTATE_BARRIER_END(other_threads[i]);
115#if USE_ITT_BUILD && USE_ITT_NOTIFY
116      // Barrier imbalance - write min of the thread time and the other thread
117      // time to the thread.
118      if (__kmp_forkjoin_frames_mode == 2) {
119        this_thr->th.th_bar_min_time = KMP_MIN(
120            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121      }
122#endif
123      if (reduce) {
124        KA_TRACE(100,
125                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127                  team->t.t_id, i));
128        ANNOTATE_REDUCE_AFTER(reduce);
129        OMPT_REDUCTION_DECL(this_thr, gtid);
130        OMPT_REDUCTION_BEGIN;
131        (*reduce)(this_thr->th.th_local.reduce_data,
132                  other_threads[i]->th.th_local.reduce_data);
133        OMPT_REDUCTION_END;
134        ANNOTATE_REDUCE_BEFORE(reduce);
135        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136      }
137    }
138    // Don't have to worry about sleep bit here or atomic since team setting
139    team_bar->b_arrived = new_state;
140    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141                  "arrived(%p) = %llu\n",
142                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143                  new_state));
144  }
145  KA_TRACE(
146      20,
147      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148       gtid, team->t.t_id, tid, bt));
149  return false;
150}
151
152template <bool cancellable = false>
153static bool __kmp_linear_barrier_release_template(
154    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158  kmp_team_t *team;
159
160  if (KMP_MASTER_TID(tid)) {
161    unsigned int i;
162    kmp_uint32 nproc = this_thr->th.th_team_nproc;
163    kmp_info_t **other_threads;
164
165    team = __kmp_threads[gtid]->th.th_team;
166    KMP_DEBUG_ASSERT(team != NULL);
167    other_threads = team->t.t_threads;
168
169    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170                  "barrier type %d\n",
171                  gtid, team->t.t_id, tid, bt));
172
173    if (nproc > 1) {
174#if KMP_BARRIER_ICV_PUSH
175      {
176        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177        if (propagate_icvs) {
178          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179          for (i = 1; i < nproc; ++i) {
180            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181                                     team, i, FALSE);
182            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183                           &team->t.t_implicit_task_taskdata[0].td_icvs);
184          }
185          ngo_sync();
186        }
187      }
188#endif // KMP_BARRIER_ICV_PUSH
189
190      // Now, release all of the worker threads
191      for (i = 1; i < nproc; ++i) {
192#if KMP_CACHE_MANAGE
193        // Prefetch next thread's go flag
194        if (i + 1 < nproc)
195          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196#endif /* KMP_CACHE_MANAGE */
197        KA_TRACE(
198            20,
199            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200             "go(%p): %u => %u\n",
201             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203             other_threads[i]->th.th_bar[bt].bb.b_go,
204             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206        kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207                         other_threads[i]);
208        flag.release();
209      }
210    }
211  } else { // Wait for the MASTER thread to release us
212    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215    if (cancellable) {
216      bool cancelled = flag.wait_cancellable_nosleep(
217          this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
218      if (cancelled) {
219        return true;
220      }
221    } else {
222      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223    }
224    ANNOTATE_BARRIER_END(this_thr);
225#if USE_ITT_BUILD && USE_ITT_NOTIFY
226    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
228      // disabled)
229      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230      // Cancel wait on previous parallel region...
231      __kmp_itt_task_starting(itt_sync_obj);
232
233      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234        return false;
235
236      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
237      if (itt_sync_obj != NULL)
238        // Call prepare as early as possible for "new" barrier
239        __kmp_itt_task_finished(itt_sync_obj);
240    } else
241#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
242        // Early exit for reaping threads releasing forkjoin barrier
243        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
244      return false;
245// The worker thread may now assume that the team is valid.
246#ifdef KMP_DEBUG
247    tid = __kmp_tid_from_gtid(gtid);
248    team = __kmp_threads[gtid]->th.th_team;
249#endif
250    KMP_DEBUG_ASSERT(team != NULL);
251    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252    KA_TRACE(20,
253             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
254              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
255    KMP_MB(); // Flush all pending memory write invalidates.
256  }
257  KA_TRACE(
258      20,
259      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
260       gtid, team->t.t_id, tid, bt));
261  return false;
262}
263
264static void __kmp_linear_barrier_gather(
265    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267  __kmp_linear_barrier_gather_template<false>(
268      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269}
270
271static bool __kmp_linear_barrier_gather_cancellable(
272    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274  return __kmp_linear_barrier_gather_template<true>(
275      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276}
277
278static void __kmp_linear_barrier_release(
279    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281  __kmp_linear_barrier_release_template<false>(
282      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283}
284
285static bool __kmp_linear_barrier_release_cancellable(
286    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288  return __kmp_linear_barrier_release_template<true>(
289      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290}
291
292// Tree barrier
293static void
294__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
295                          int tid, void (*reduce)(void *, void *)
296                                       USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298  kmp_team_t *team = this_thr->th.th_team;
299  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300  kmp_info_t **other_threads = team->t.t_threads;
301  kmp_uint32 nproc = this_thr->th.th_team_nproc;
302  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303  kmp_uint32 branch_factor = 1 << branch_bits;
304  kmp_uint32 child;
305  kmp_uint32 child_tid;
306  kmp_uint64 new_state;
307
308  KA_TRACE(
309      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310           gtid, team->t.t_id, tid, bt));
311  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312
313#if USE_ITT_BUILD && USE_ITT_NOTIFY
314  // Barrier imbalance - save arrive time to the thread
315  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317        __itt_get_timestamp();
318  }
319#endif
320  // Perform tree gather to wait until all threads have arrived; reduce any
321  // required data as we go
322  child_tid = (tid << branch_bits) + 1;
323  if (child_tid < nproc) {
324    // Parent threads wait for all their children to arrive
325    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326    child = 1;
327    do {
328      kmp_info_t *child_thr = other_threads[child_tid];
329      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
330#if KMP_CACHE_MANAGE
331      // Prefetch next thread's arrived count
332      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333        KMP_CACHE_PREFETCH(
334            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335#endif /* KMP_CACHE_MANAGE */
336      KA_TRACE(20,
337               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338                "arrived(%p) == %llu\n",
339                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
341      // Wait for child to arrive
342      kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344      ANNOTATE_BARRIER_END(child_thr);
345#if USE_ITT_BUILD && USE_ITT_NOTIFY
346      // Barrier imbalance - write min of the thread time and a child time to
347      // the thread.
348      if (__kmp_forkjoin_frames_mode == 2) {
349        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350                                               child_thr->th.th_bar_min_time);
351      }
352#endif
353      if (reduce) {
354        KA_TRACE(100,
355                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357                  team->t.t_id, child_tid));
358        ANNOTATE_REDUCE_AFTER(reduce);
359        OMPT_REDUCTION_DECL(this_thr, gtid);
360        OMPT_REDUCTION_BEGIN;
361        (*reduce)(this_thr->th.th_local.reduce_data,
362                  child_thr->th.th_local.reduce_data);
363        OMPT_REDUCTION_END;
364        ANNOTATE_REDUCE_BEFORE(reduce);
365        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
366      }
367      child++;
368      child_tid++;
369    } while (child <= branch_factor && child_tid < nproc);
370  }
371
372  if (!KMP_MASTER_TID(tid)) { // Worker threads
373    kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374
375    KA_TRACE(20,
376             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377              "arrived(%p): %llu => %llu\n",
378              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
381
382    // Mark arrival to parent thread
383    /* After performing this write, a worker thread may not assume that the team
384       is valid any more - it could be deallocated by the master thread at any
385       time.  */
386    ANNOTATE_BARRIER_BEGIN(this_thr);
387    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
388    flag.release();
389  } else {
390    // Need to update the team arrived pointer if we are the master thread
391    if (nproc > 1) // New value was already computed above
392      team->t.t_bar[bt].b_arrived = new_state;
393    else
394      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396                  "arrived(%p) = %llu\n",
397                  gtid, team->t.t_id, tid, team->t.t_id,
398                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399  }
400  KA_TRACE(20,
401           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402            gtid, team->t.t_id, tid, bt));
403}
404
405static void __kmp_tree_barrier_release(
406    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
407    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
408  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
409  kmp_team_t *team;
410  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
411  kmp_uint32 nproc;
412  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
413  kmp_uint32 branch_factor = 1 << branch_bits;
414  kmp_uint32 child;
415  kmp_uint32 child_tid;
416
417  // Perform a tree release for all of the threads that have been gathered
418  if (!KMP_MASTER_TID(
419          tid)) { // Handle fork barrier workers who aren't part of a team yet
420    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
421                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
422    // Wait for parent thread to release us
423    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
424    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
425    ANNOTATE_BARRIER_END(this_thr);
426#if USE_ITT_BUILD && USE_ITT_NOTIFY
427    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428      // In fork barrier where we could not get the object reliably (or
429      // ITTNOTIFY is disabled)
430      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
431      // Cancel wait on previous parallel region...
432      __kmp_itt_task_starting(itt_sync_obj);
433
434      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435        return;
436
437      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438      if (itt_sync_obj != NULL)
439        // Call prepare as early as possible for "new" barrier
440        __kmp_itt_task_finished(itt_sync_obj);
441    } else
442#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
443        // Early exit for reaping threads releasing forkjoin barrier
444        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445      return;
446
447    // The worker thread may now assume that the team is valid.
448    team = __kmp_threads[gtid]->th.th_team;
449    KMP_DEBUG_ASSERT(team != NULL);
450    tid = __kmp_tid_from_gtid(gtid);
451
452    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
453    KA_TRACE(20,
454             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
456    KMP_MB(); // Flush all pending memory write invalidates.
457  } else {
458    team = __kmp_threads[gtid]->th.th_team;
459    KMP_DEBUG_ASSERT(team != NULL);
460    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
461                  "barrier type %d\n",
462                  gtid, team->t.t_id, tid, bt));
463  }
464  nproc = this_thr->th.th_team_nproc;
465  child_tid = (tid << branch_bits) + 1;
466
467  if (child_tid < nproc) {
468    kmp_info_t **other_threads = team->t.t_threads;
469    child = 1;
470    // Parent threads release all their children
471    do {
472      kmp_info_t *child_thr = other_threads[child_tid];
473      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474#if KMP_CACHE_MANAGE
475      // Prefetch next thread's go count
476      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
477        KMP_CACHE_PREFETCH(
478            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
479#endif /* KMP_CACHE_MANAGE */
480
481#if KMP_BARRIER_ICV_PUSH
482      {
483        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
484        if (propagate_icvs) {
485          __kmp_init_implicit_task(team->t.t_ident,
486                                   team->t.t_threads[child_tid], team,
487                                   child_tid, FALSE);
488          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489                    &team->t.t_implicit_task_taskdata[0].td_icvs);
490        }
491      }
492#endif // KMP_BARRIER_ICV_PUSH
493      KA_TRACE(20,
494               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495                "go(%p): %u => %u\n",
496                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
499      // Release child from barrier
500      ANNOTATE_BARRIER_BEGIN(child_thr);
501      kmp_flag_64 flag(&child_bar->b_go, child_thr);
502      flag.release();
503      child++;
504      child_tid++;
505    } while (child <= branch_factor && child_tid < nproc);
506  }
507  KA_TRACE(
508      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509           gtid, team->t.t_id, tid, bt));
510}
511
512// Hyper Barrier
513static void
514__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
515                           int tid, void (*reduce)(void *, void *)
516                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518  kmp_team_t *team = this_thr->th.th_team;
519  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520  kmp_info_t **other_threads = team->t.t_threads;
521  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524  kmp_uint32 branch_factor = 1 << branch_bits;
525  kmp_uint32 offset;
526  kmp_uint32 level;
527
528  KA_TRACE(
529      20,
530      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531       gtid, team->t.t_id, tid, bt));
532  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
533
534#if USE_ITT_BUILD && USE_ITT_NOTIFY
535  // Barrier imbalance - save arrive time to the thread
536  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538        __itt_get_timestamp();
539  }
540#endif
541  /* Perform a hypercube-embedded tree gather to wait until all of the threads
542     have arrived, and reduce any required data as we go.  */
543  kmp_flag_64 p_flag(&thr_bar->b_arrived);
544  for (level = 0, offset = 1; offset < num_threads;
545       level += branch_bits, offset <<= branch_bits) {
546    kmp_uint32 child;
547    kmp_uint32 child_tid;
548
549    if (((tid >> level) & (branch_factor - 1)) != 0) {
550      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
551
552      KMP_MB(); // Synchronize parent and child threads.
553      KA_TRACE(20,
554               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
555                "arrived(%p): %llu => %llu\n",
556                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
557                team->t.t_id, parent_tid, &thr_bar->b_arrived,
558                thr_bar->b_arrived,
559                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
560      // Mark arrival to parent thread
561      /* After performing this write (in the last iteration of the enclosing for
562         loop), a worker thread may not assume that the team is valid any more
563         - it could be deallocated by the master thread at any time.  */
564      ANNOTATE_BARRIER_BEGIN(this_thr);
565      p_flag.set_waiter(other_threads[parent_tid]);
566      p_flag.release();
567      break;
568    }
569
570    // Parent threads wait for children to arrive
571    if (new_state == KMP_BARRIER_UNUSED_STATE)
572      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
573    for (child = 1, child_tid = tid + (1 << level);
574         child < branch_factor && child_tid < num_threads;
575         child++, child_tid += (1 << level)) {
576      kmp_info_t *child_thr = other_threads[child_tid];
577      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
578#if KMP_CACHE_MANAGE
579      kmp_uint32 next_child_tid = child_tid + (1 << level);
580      // Prefetch next thread's arrived count
581      if (child + 1 < branch_factor && next_child_tid < num_threads)
582        KMP_CACHE_PREFETCH(
583            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
584#endif /* KMP_CACHE_MANAGE */
585      KA_TRACE(20,
586               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
587                "arrived(%p) == %llu\n",
588                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
589                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
590      // Wait for child to arrive
591      kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
592      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
593      ANNOTATE_BARRIER_END(child_thr);
594      KMP_MB(); // Synchronize parent and child threads.
595#if USE_ITT_BUILD && USE_ITT_NOTIFY
596      // Barrier imbalance - write min of the thread time and a child time to
597      // the thread.
598      if (__kmp_forkjoin_frames_mode == 2) {
599        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
600                                               child_thr->th.th_bar_min_time);
601      }
602#endif
603      if (reduce) {
604        KA_TRACE(100,
605                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
606                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
607                  team->t.t_id, child_tid));
608        ANNOTATE_REDUCE_AFTER(reduce);
609        OMPT_REDUCTION_DECL(this_thr, gtid);
610        OMPT_REDUCTION_BEGIN;
611        (*reduce)(this_thr->th.th_local.reduce_data,
612                  child_thr->th.th_local.reduce_data);
613        OMPT_REDUCTION_END;
614        ANNOTATE_REDUCE_BEFORE(reduce);
615        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
616      }
617    }
618  }
619
620  if (KMP_MASTER_TID(tid)) {
621    // Need to update the team arrived pointer if we are the master thread
622    if (new_state == KMP_BARRIER_UNUSED_STATE)
623      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
624    else
625      team->t.t_bar[bt].b_arrived = new_state;
626    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
627                  "arrived(%p) = %llu\n",
628                  gtid, team->t.t_id, tid, team->t.t_id,
629                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
630  }
631  KA_TRACE(
632      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
633           gtid, team->t.t_id, tid, bt));
634}
635
636// The reverse versions seem to beat the forward versions overall
637#define KMP_REVERSE_HYPER_BAR
638static void __kmp_hyper_barrier_release(
639    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
640    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
641  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
642  kmp_team_t *team;
643  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
644  kmp_info_t **other_threads;
645  kmp_uint32 num_threads;
646  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
647  kmp_uint32 branch_factor = 1 << branch_bits;
648  kmp_uint32 child;
649  kmp_uint32 child_tid;
650  kmp_uint32 offset;
651  kmp_uint32 level;
652
653  /* Perform a hypercube-embedded tree release for all of the threads that have
654     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
655     are released in the reverse order of the corresponding gather, otherwise
656     threads are released in the same order. */
657  if (KMP_MASTER_TID(tid)) { // master
658    team = __kmp_threads[gtid]->th.th_team;
659    KMP_DEBUG_ASSERT(team != NULL);
660    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
661                  "barrier type %d\n",
662                  gtid, team->t.t_id, tid, bt));
663#if KMP_BARRIER_ICV_PUSH
664    if (propagate_icvs) { // master already has ICVs in final destination; copy
665      copy_icvs(&thr_bar->th_fixed_icvs,
666                &team->t.t_implicit_task_taskdata[tid].td_icvs);
667    }
668#endif
669  } else { // Handle fork barrier workers who aren't part of a team yet
670    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
671                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
672    // Wait for parent thread to release us
673    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
674    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
675    ANNOTATE_BARRIER_END(this_thr);
676#if USE_ITT_BUILD && USE_ITT_NOTIFY
677    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
678      // In fork barrier where we could not get the object reliably
679      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
680      // Cancel wait on previous parallel region...
681      __kmp_itt_task_starting(itt_sync_obj);
682
683      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
684        return;
685
686      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
687      if (itt_sync_obj != NULL)
688        // Call prepare as early as possible for "new" barrier
689        __kmp_itt_task_finished(itt_sync_obj);
690    } else
691#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
692        // Early exit for reaping threads releasing forkjoin barrier
693        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
694      return;
695
696    // The worker thread may now assume that the team is valid.
697    team = __kmp_threads[gtid]->th.th_team;
698    KMP_DEBUG_ASSERT(team != NULL);
699    tid = __kmp_tid_from_gtid(gtid);
700
701    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
702    KA_TRACE(20,
703             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
704              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
705    KMP_MB(); // Flush all pending memory write invalidates.
706  }
707  num_threads = this_thr->th.th_team_nproc;
708  other_threads = team->t.t_threads;
709
710#ifdef KMP_REVERSE_HYPER_BAR
711  // Count up to correct level for parent
712  for (level = 0, offset = 1;
713       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
714       level += branch_bits, offset <<= branch_bits)
715    ;
716
717  // Now go down from there
718  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
719       level -= branch_bits, offset >>= branch_bits)
720#else
721  // Go down the tree, level by level
722  for (level = 0, offset = 1; offset < num_threads;
723       level += branch_bits, offset <<= branch_bits)
724#endif // KMP_REVERSE_HYPER_BAR
725  {
726#ifdef KMP_REVERSE_HYPER_BAR
727    /* Now go in reverse order through the children, highest to lowest.
728       Initial setting of child is conservative here. */
729    child = num_threads >> ((level == 0) ? level : level - 1);
730    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
731        child_tid = tid + (child << level);
732         child >= 1; child--, child_tid -= (1 << level))
733#else
734    if (((tid >> level) & (branch_factor - 1)) != 0)
735      // No need to go lower than this, since this is the level parent would be
736      // notified
737      break;
738    // Iterate through children on this level of the tree
739    for (child = 1, child_tid = tid + (1 << level);
740         child < branch_factor && child_tid < num_threads;
741         child++, child_tid += (1 << level))
742#endif // KMP_REVERSE_HYPER_BAR
743    {
744      if (child_tid >= num_threads)
745        continue; // Child doesn't exist so keep going
746      else {
747        kmp_info_t *child_thr = other_threads[child_tid];
748        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
749#if KMP_CACHE_MANAGE
750        kmp_uint32 next_child_tid = child_tid - (1 << level);
751// Prefetch next thread's go count
752#ifdef KMP_REVERSE_HYPER_BAR
753        if (child - 1 >= 1 && next_child_tid < num_threads)
754#else
755        if (child + 1 < branch_factor && next_child_tid < num_threads)
756#endif // KMP_REVERSE_HYPER_BAR
757          KMP_CACHE_PREFETCH(
758              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
759#endif /* KMP_CACHE_MANAGE */
760
761#if KMP_BARRIER_ICV_PUSH
762        if (propagate_icvs) // push my fixed ICVs to my child
763          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
764#endif // KMP_BARRIER_ICV_PUSH
765
766        KA_TRACE(
767            20,
768            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
769             "go(%p): %u => %u\n",
770             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
771             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
772             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
773        // Release child from barrier
774        ANNOTATE_BARRIER_BEGIN(child_thr);
775        kmp_flag_64 flag(&child_bar->b_go, child_thr);
776        flag.release();
777      }
778    }
779  }
780#if KMP_BARRIER_ICV_PUSH
781  if (propagate_icvs &&
782      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
783    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
784                             FALSE);
785    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
786              &thr_bar->th_fixed_icvs);
787  }
788#endif
789  KA_TRACE(
790      20,
791      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
792       gtid, team->t.t_id, tid, bt));
793}
794
795// Hierarchical Barrier
796
797// Initialize thread barrier data
798/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
799   Performs the minimum amount of initialization required based on how the team
800   has changed. Returns true if leaf children will require both on-core and
801   traditional wake-up mechanisms. For example, if the team size increases,
802   threads already in the team will respond to on-core wakeup on their parent
803   thread, but threads newly added to the team will only be listening on the
804   their local b_go. */
805static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
806                                                   kmp_bstate_t *thr_bar,
807                                                   kmp_uint32 nproc, int gtid,
808                                                   int tid, kmp_team_t *team) {
809  // Checks to determine if (re-)initialization is needed
810  bool uninitialized = thr_bar->team == NULL;
811  bool team_changed = team != thr_bar->team;
812  bool team_sz_changed = nproc != thr_bar->nproc;
813  bool tid_changed = tid != thr_bar->old_tid;
814  bool retval = false;
815
816  if (uninitialized || team_sz_changed) {
817    __kmp_get_hierarchy(nproc, thr_bar);
818  }
819
820  if (uninitialized || team_sz_changed || tid_changed) {
821    thr_bar->my_level = thr_bar->depth - 1; // default for master
822    thr_bar->parent_tid = -1; // default for master
823    if (!KMP_MASTER_TID(
824            tid)) { // if not master, find parent thread in hierarchy
825      kmp_uint32 d = 0;
826      while (d < thr_bar->depth) { // find parent based on level of thread in
827        // hierarchy, and note level
828        kmp_uint32 rem;
829        if (d == thr_bar->depth - 2) { // reached level right below the master
830          thr_bar->parent_tid = 0;
831          thr_bar->my_level = d;
832          break;
833        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
834                   0) { // TODO: can we make this op faster?
835          // thread is not a subtree root at next level, so this is max
836          thr_bar->parent_tid = tid - rem;
837          thr_bar->my_level = d;
838          break;
839        }
840        ++d;
841      }
842    }
843    thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
844    thr_bar->old_tid = tid;
845    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
846    thr_bar->team = team;
847    thr_bar->parent_bar =
848        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
849  }
850  if (uninitialized || team_changed || tid_changed) {
851    thr_bar->team = team;
852    thr_bar->parent_bar =
853        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854    retval = true;
855  }
856  if (uninitialized || team_sz_changed || tid_changed) {
857    thr_bar->nproc = nproc;
858    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
859    if (thr_bar->my_level == 0)
860      thr_bar->leaf_kids = 0;
861    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
862      thr_bar->leaf_kids = nproc - tid - 1;
863    thr_bar->leaf_state = 0;
864    for (int i = 0; i < thr_bar->leaf_kids; ++i)
865      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
866  }
867  return retval;
868}
869
870static void __kmp_hierarchical_barrier_gather(
871    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
873  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874  kmp_team_t *team = this_thr->th.th_team;
875  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876  kmp_uint32 nproc = this_thr->th.th_team_nproc;
877  kmp_info_t **other_threads = team->t.t_threads;
878  kmp_uint64 new_state;
879
880  int level = team->t.t_level;
881  if (other_threads[0]
882          ->th.th_teams_microtask) // are we inside the teams construct?
883    if (this_thr->th.th_teams_size.nteams > 1)
884      ++level; // level was not increased in teams construct for team_of_masters
885  if (level == 1)
886    thr_bar->use_oncore_barrier = 1;
887  else
888    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
889
890  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
891                "barrier type %d\n",
892                gtid, team->t.t_id, tid, bt));
893  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
894
895#if USE_ITT_BUILD && USE_ITT_NOTIFY
896  // Barrier imbalance - save arrive time to the thread
897  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
899  }
900#endif
901
902  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903                                               team);
904
905  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
906    kmp_int32 child_tid;
907    new_state =
908        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
909    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910        thr_bar->use_oncore_barrier) {
911      if (thr_bar->leaf_kids) {
912        // First, wait for leaf children to check-in on my b_arrived flag
913        kmp_uint64 leaf_state =
914            KMP_MASTER_TID(tid)
915                ? thr_bar->b_arrived | thr_bar->leaf_state
916                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
918                      "for leaf kids\n",
919                      gtid, team->t.t_id, tid));
920        kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
921        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
922        if (reduce) {
923          ANNOTATE_REDUCE_AFTER(reduce);
924          OMPT_REDUCTION_DECL(this_thr, gtid);
925          OMPT_REDUCTION_BEGIN;
926          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
927               ++child_tid) {
928            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
929                           "T#%d(%d:%d)\n",
930                           gtid, team->t.t_id, tid,
931                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
932                           child_tid));
933            ANNOTATE_BARRIER_END(other_threads[child_tid]);
934            (*reduce)(this_thr->th.th_local.reduce_data,
935                      other_threads[child_tid]->th.th_local.reduce_data);
936          }
937          OMPT_REDUCTION_END;
938          ANNOTATE_REDUCE_BEFORE(reduce);
939          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940        }
941        // clear leaf_state bits
942        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943      }
944      // Next, wait for higher level children on each child's b_arrived flag
945      for (kmp_uint32 d = 1; d < thr_bar->my_level;
946           ++d) { // gather lowest level threads first, but skip 0
947        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948                   skip = thr_bar->skip_per_level[d];
949        if (last > nproc)
950          last = nproc;
951        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952          kmp_info_t *child_thr = other_threads[child_tid];
953          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955                        "T#%d(%d:%d) "
956                        "arrived(%p) == %llu\n",
957                        gtid, team->t.t_id, tid,
958                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959                        child_tid, &child_bar->b_arrived, new_state));
960          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
961          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962          ANNOTATE_BARRIER_END(child_thr);
963          if (reduce) {
964            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
965                           "T#%d(%d:%d)\n",
966                           gtid, team->t.t_id, tid,
967                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
968                           child_tid));
969            ANNOTATE_REDUCE_AFTER(reduce);
970            (*reduce)(this_thr->th.th_local.reduce_data,
971                      child_thr->th.th_local.reduce_data);
972            ANNOTATE_REDUCE_BEFORE(reduce);
973            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
974          }
975        }
976      }
977    } else { // Blocktime is not infinite
978      for (kmp_uint32 d = 0; d < thr_bar->my_level;
979           ++d) { // Gather lowest level threads first
980        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981                   skip = thr_bar->skip_per_level[d];
982        if (last > nproc)
983          last = nproc;
984        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985          kmp_info_t *child_thr = other_threads[child_tid];
986          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
988                        "T#%d(%d:%d) "
989                        "arrived(%p) == %llu\n",
990                        gtid, team->t.t_id, tid,
991                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992                        child_tid, &child_bar->b_arrived, new_state));
993          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
994          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995          ANNOTATE_BARRIER_END(child_thr);
996          if (reduce) {
997            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
998                           "T#%d(%d:%d)\n",
999                           gtid, team->t.t_id, tid,
1000                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1001                           child_tid));
1002            ANNOTATE_REDUCE_AFTER(reduce);
1003            (*reduce)(this_thr->th.th_local.reduce_data,
1004                      child_thr->th.th_local.reduce_data);
1005            ANNOTATE_REDUCE_BEFORE(reduce);
1006            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1007          }
1008        }
1009      }
1010    }
1011  }
1012  // All subordinates are gathered; now release parent if not master thread
1013
1014  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1015    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017                  gtid, team->t.t_id, tid,
1018                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1021    /* Mark arrival to parent: After performing this write, a worker thread may
1022       not assume that the team is valid any more - it could be deallocated by
1023       the master thread at any time. */
1024    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1026      // flag; release it
1027      ANNOTATE_BARRIER_BEGIN(this_thr);
1028      kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1029      flag.release();
1030    } else {
1031      // Leaf does special release on "offset" bits of parent's b_arrived flag
1032      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1033      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1034      flag.set_waiter(other_threads[thr_bar->parent_tid]);
1035      flag.release();
1036    }
1037  } else { // Master thread needs to update the team's b_arrived value
1038    team->t.t_bar[bt].b_arrived = new_state;
1039    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1040                  "arrived(%p) = %llu\n",
1041                  gtid, team->t.t_id, tid, team->t.t_id,
1042                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1043  }
1044  // Is the team access below unsafe or just technically invalid?
1045  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1046                "barrier type %d\n",
1047                gtid, team->t.t_id, tid, bt));
1048}
1049
1050static void __kmp_hierarchical_barrier_release(
1051    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1052    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1053  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1054  kmp_team_t *team;
1055  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1056  kmp_uint32 nproc;
1057  bool team_change = false; // indicates on-core barrier shouldn't be used
1058
1059  if (KMP_MASTER_TID(tid)) {
1060    team = __kmp_threads[gtid]->th.th_team;
1061    KMP_DEBUG_ASSERT(team != NULL);
1062    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1063                  "entered barrier type %d\n",
1064                  gtid, team->t.t_id, tid, bt));
1065  } else { // Worker threads
1066    // Wait for parent thread to release me
1067    if (!thr_bar->use_oncore_barrier ||
1068        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1069        thr_bar->team == NULL) {
1070      // Use traditional method of waiting on my own b_go flag
1071      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1072      kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1073      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1074      ANNOTATE_BARRIER_END(this_thr);
1075      TCW_8(thr_bar->b_go,
1076            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1077    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1078      // infinite, not nested
1079      // Wait on my "offset" bits on parent's b_go flag
1080      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1081      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1082                           thr_bar->offset, bt,
1083                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1084      flag.wait(this_thr, TRUE);
1085      if (thr_bar->wait_flag ==
1086          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1087        TCW_8(thr_bar->b_go,
1088              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1089      } else { // Reset my bits on parent's b_go flag
1090        (RCAST(volatile char *,
1091               &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1092      }
1093    }
1094    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1095    // Early exit for reaping threads releasing forkjoin barrier
1096    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1097      return;
1098    // The worker thread may now assume that the team is valid.
1099    team = __kmp_threads[gtid]->th.th_team;
1100    KMP_DEBUG_ASSERT(team != NULL);
1101    tid = __kmp_tid_from_gtid(gtid);
1102
1103    KA_TRACE(
1104        20,
1105        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1106         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1107    KMP_MB(); // Flush all pending memory write invalidates.
1108  }
1109
1110  nproc = this_thr->th.th_team_nproc;
1111  int level = team->t.t_level;
1112  if (team->t.t_threads[0]
1113          ->th.th_teams_microtask) { // are we inside the teams construct?
1114    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1115        this_thr->th.th_teams_level == level)
1116      ++level; // level was not increased in teams construct for team_of_workers
1117    if (this_thr->th.th_teams_size.nteams > 1)
1118      ++level; // level was not increased in teams construct for team_of_masters
1119  }
1120  if (level == 1)
1121    thr_bar->use_oncore_barrier = 1;
1122  else
1123    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1124
1125  // If the team size has increased, we still communicate with old leaves via
1126  // oncore barrier.
1127  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1128  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1129  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1130                                                       tid, team);
1131  // But if the entire team changes, we won't use oncore barrier at all
1132  if (team_change)
1133    old_leaf_kids = 0;
1134
1135#if KMP_BARRIER_ICV_PUSH
1136  if (propagate_icvs) {
1137    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1138                             FALSE);
1139    if (KMP_MASTER_TID(
1140            tid)) { // master already has copy in final destination; copy
1141      copy_icvs(&thr_bar->th_fixed_icvs,
1142                &team->t.t_implicit_task_taskdata[tid].td_icvs);
1143    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1144               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1145      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1146        // leaves (on-core children) pull parent's fixed ICVs directly to local
1147        // ICV store
1148        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149                  &thr_bar->parent_bar->th_fixed_icvs);
1150      // non-leaves will get ICVs piggybacked with b_go via NGO store
1151    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1152      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1153        // access
1154        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1155      else // leaves copy parent's fixed ICVs directly to local ICV store
1156        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1157                  &thr_bar->parent_bar->th_fixed_icvs);
1158    }
1159  }
1160#endif // KMP_BARRIER_ICV_PUSH
1161
1162  // Now, release my children
1163  if (thr_bar->my_level) { // not a leaf
1164    kmp_int32 child_tid;
1165    kmp_uint32 last;
1166    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1167        thr_bar->use_oncore_barrier) {
1168      if (KMP_MASTER_TID(tid)) { // do a flat release
1169        // Set local b_go to bump children via NGO store of the cache line
1170        // containing IVCs and b_go.
1171        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1172        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1173        // the cache line
1174        ngo_load(&thr_bar->th_fixed_icvs);
1175        // This loops over all the threads skipping only the leaf nodes in the
1176        // hierarchy
1177        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1178             child_tid += thr_bar->skip_per_level[1]) {
1179          kmp_bstate_t *child_bar =
1180              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1181          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1182                        "releasing T#%d(%d:%d)"
1183                        " go(%p): %u => %u\n",
1184                        gtid, team->t.t_id, tid,
1185                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1186                        child_tid, &child_bar->b_go, child_bar->b_go,
1187                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1188          // Use ngo store (if available) to both store ICVs and release child
1189          // via child's b_go
1190          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1191        }
1192        ngo_sync();
1193      }
1194      TCW_8(thr_bar->b_go,
1195            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1196      // Now, release leaf children
1197      if (thr_bar->leaf_kids) { // if there are any
1198        // We test team_change on the off-chance that the level 1 team changed.
1199        if (team_change ||
1200            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1201          if (old_leaf_kids) { // release old leaf kids
1202            thr_bar->b_go |= old_leaf_state;
1203          }
1204          // Release new leaf kids
1205          last = tid + thr_bar->skip_per_level[1];
1206          if (last > nproc)
1207            last = nproc;
1208          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1209               ++child_tid) { // skip_per_level[0]=1
1210            kmp_info_t *child_thr = team->t.t_threads[child_tid];
1211            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1212            KA_TRACE(
1213                20,
1214                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1215                 " T#%d(%d:%d) go(%p): %u => %u\n",
1216                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1217                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1218                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1219            // Release child using child's b_go flag
1220            ANNOTATE_BARRIER_BEGIN(child_thr);
1221            kmp_flag_64 flag(&child_bar->b_go, child_thr);
1222            flag.release();
1223          }
1224        } else { // Release all children at once with leaf_state bits on my own
1225          // b_go flag
1226          thr_bar->b_go |= thr_bar->leaf_state;
1227        }
1228      }
1229    } else { // Blocktime is not infinite; do a simple hierarchical release
1230      for (int d = thr_bar->my_level - 1; d >= 0;
1231           --d) { // Release highest level threads first
1232        last = tid + thr_bar->skip_per_level[d + 1];
1233        kmp_uint32 skip = thr_bar->skip_per_level[d];
1234        if (last > nproc)
1235          last = nproc;
1236        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1237          kmp_info_t *child_thr = team->t.t_threads[child_tid];
1238          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1239          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1240                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1241                        gtid, team->t.t_id, tid,
1242                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1243                        child_tid, &child_bar->b_go, child_bar->b_go,
1244                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1245          // Release child using child's b_go flag
1246          ANNOTATE_BARRIER_BEGIN(child_thr);
1247          kmp_flag_64 flag(&child_bar->b_go, child_thr);
1248          flag.release();
1249        }
1250      }
1251    }
1252#if KMP_BARRIER_ICV_PUSH
1253    if (propagate_icvs && !KMP_MASTER_TID(tid))
1254      // non-leaves copy ICVs from fixed ICVs to local dest
1255      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1256                &thr_bar->th_fixed_icvs);
1257#endif // KMP_BARRIER_ICV_PUSH
1258  }
1259  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1260                "barrier type %d\n",
1261                gtid, team->t.t_id, tid, bt));
1262}
1263
1264// End of Barrier Algorithms
1265
1266// type traits for cancellable value
1267// if cancellable is true, then is_cancellable is a normal boolean variable
1268// if cancellable is false, then is_cancellable is a compile time constant
1269template <bool cancellable> struct is_cancellable {};
1270template <> struct is_cancellable<true> {
1271  bool value;
1272  is_cancellable() : value(false) {}
1273  is_cancellable(bool b) : value(b) {}
1274  is_cancellable &operator=(bool b) {
1275    value = b;
1276    return *this;
1277  }
1278  operator bool() const { return value; }
1279};
1280template <> struct is_cancellable<false> {
1281  is_cancellable &operator=(bool b) { return *this; }
1282  constexpr operator bool() const { return false; }
1283};
1284
1285// Internal function to do a barrier.
1286/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1287   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1288   barrier
1289   When cancellable = false,
1290     Returns 0 if master thread, 1 if worker thread.
1291   When cancellable = true
1292     Returns 0 if not cancelled, 1 if cancelled.  */
1293template <bool cancellable = false>
1294static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1295                                  size_t reduce_size, void *reduce_data,
1296                                  void (*reduce)(void *, void *)) {
1297  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299  int tid = __kmp_tid_from_gtid(gtid);
1300  kmp_info_t *this_thr = __kmp_threads[gtid];
1301  kmp_team_t *team = this_thr->th.th_team;
1302  int status = 0;
1303  is_cancellable<cancellable> cancelled;
1304#if OMPT_SUPPORT && OMPT_OPTIONAL
1305  ompt_data_t *my_task_data;
1306  ompt_data_t *my_parallel_data;
1307  void *return_address;
1308  ompt_sync_region_t barrier_kind;
1309#endif
1310
1311  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1313
1314  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1315#if OMPT_SUPPORT
1316  if (ompt_enabled.enabled) {
1317#if OMPT_OPTIONAL
1318    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322    if (ompt_enabled.ompt_callback_sync_region) {
1323      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325          return_address);
1326    }
1327    if (ompt_enabled.ompt_callback_sync_region_wait) {
1328      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1330          return_address);
1331    }
1332#endif
1333    // It is OK to report the barrier state after the barrier begin callback.
1334    // According to the OMPT specification, a compliant implementation may
1335    // even delay reporting this state until the barrier begins to wait.
1336    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1337  }
1338#endif
1339
1340  if (!team->t.t_serialized) {
1341#if USE_ITT_BUILD
1342    // This value will be used in itt notify events below.
1343    void *itt_sync_obj = NULL;
1344#if USE_ITT_NOTIFY
1345    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347#endif
1348#endif /* USE_ITT_BUILD */
1349    if (__kmp_tasking_mode == tskm_extra_barrier) {
1350      __kmp_tasking_barrier(team, this_thr, gtid);
1351      KA_TRACE(15,
1352               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1354    }
1355
1356    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1357       access it when the team struct is not guaranteed to exist. */
1358    // See note about the corresponding code in __kmp_join_barrier() being
1359    // performance-critical.
1360    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1361#if KMP_USE_MONITOR
1362      this_thr->th.th_team_bt_intervals =
1363          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364      this_thr->th.th_team_bt_set =
1365          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1366#else
1367      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1368#endif
1369    }
1370
1371#if USE_ITT_BUILD
1372    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1374#endif /* USE_ITT_BUILD */
1375#if USE_DEBUGGER
1376    // Let the debugger know: the thread arrived to the barrier and waiting.
1377    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1378      team->t.t_bar[bt].b_master_arrived += 1;
1379    } else {
1380      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381    } // if
1382#endif /* USE_DEBUGGER */
1383    if (reduce != NULL) {
1384      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1385      this_thr->th.th_local.reduce_data = reduce_data;
1386    }
1387
1388    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1389      // use 0 to only setup the current team if nthreads > 1
1390      __kmp_task_team_setup(this_thr, team, 0);
1391
1392    if (cancellable) {
1393      cancelled = __kmp_linear_barrier_gather_cancellable(
1394          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395    } else {
1396      switch (__kmp_barrier_gather_pattern[bt]) {
1397      case bp_hyper_bar: {
1398        // don't set branch bits to 0; use linear
1399        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402        break;
1403      }
1404      case bp_hierarchical_bar: {
1405        __kmp_hierarchical_barrier_gather(
1406            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407        break;
1408      }
1409      case bp_tree_bar: {
1410        // don't set branch bits to 0; use linear
1411        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1414        break;
1415      }
1416      default: {
1417        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419      }
1420      }
1421    }
1422
1423    KMP_MB();
1424
1425    if (KMP_MASTER_TID(tid)) {
1426      status = 0;
1427      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1429      }
1430#if USE_DEBUGGER
1431      // Let the debugger know: All threads are arrived and starting leaving the
1432      // barrier.
1433      team->t.t_bar[bt].b_team_arrived += 1;
1434#endif
1435
1436      if (__kmp_omp_cancellation) {
1437        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1438        // Reset cancellation flag for worksharing constructs
1439        if (cancel_request == cancel_loop ||
1440            cancel_request == cancel_sections) {
1441          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1442        }
1443      }
1444#if USE_ITT_BUILD
1445      /* TODO: In case of split reduction barrier, master thread may send
1446         acquired event early, before the final summation into the shared
1447         variable is done (final summation can be a long operation for array
1448         reductions).  */
1449      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1451#endif /* USE_ITT_BUILD */
1452#if USE_ITT_BUILD && USE_ITT_NOTIFY
1453      // Barrier - report frame end (only if active_level == 1)
1454      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455          __kmp_forkjoin_frames_mode &&
1456          this_thr->th.th_teams_microtask == NULL &&
1457          team->t.t_active_level == 1) {
1458        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1459        kmp_uint64 cur_time = __itt_get_timestamp();
1460        kmp_info_t **other_threads = team->t.t_threads;
1461        int nproc = this_thr->th.th_team_nproc;
1462        int i;
1463        switch (__kmp_forkjoin_frames_mode) {
1464        case 1:
1465          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1466                                 loc, nproc);
1467          this_thr->th.th_frame_time = cur_time;
1468          break;
1469        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1470          // be fixed)
1471          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1472                                 1, loc, nproc);
1473          break;
1474        case 3:
1475          if (__itt_metadata_add_ptr) {
1476            // Initialize with master's wait time
1477            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1478            // Set arrive time to zero to be able to check it in
1479            // __kmp_invoke_task(); the same is done inside the loop below
1480            this_thr->th.th_bar_arrive_time = 0;
1481            for (i = 1; i < nproc; ++i) {
1482              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1483              other_threads[i]->th.th_bar_arrive_time = 0;
1484            }
1485            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1486                                         cur_time, delta,
1487                                         (kmp_uint64)(reduce != NULL));
1488          }
1489          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1490                                 loc, nproc);
1491          this_thr->th.th_frame_time = cur_time;
1492          break;
1493        }
1494      }
1495#endif /* USE_ITT_BUILD */
1496    } else {
1497      status = 1;
1498#if USE_ITT_BUILD
1499      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1500        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1501#endif /* USE_ITT_BUILD */
1502    }
1503    if ((status == 1 || !is_split) && !cancelled) {
1504      if (cancellable) {
1505        cancelled = __kmp_linear_barrier_release_cancellable(
1506            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1507      } else {
1508        switch (__kmp_barrier_release_pattern[bt]) {
1509        case bp_hyper_bar: {
1510          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1511          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1512                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1513          break;
1514        }
1515        case bp_hierarchical_bar: {
1516          __kmp_hierarchical_barrier_release(
1517              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1518          break;
1519        }
1520        case bp_tree_bar: {
1521          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1522          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1523                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1524          break;
1525        }
1526        default: {
1527          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1528                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1529        }
1530        }
1531      }
1532      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1533        __kmp_task_team_sync(this_thr, team);
1534      }
1535    }
1536
1537#if USE_ITT_BUILD
1538    /* GEH: TODO: Move this under if-condition above and also include in
1539       __kmp_end_split_barrier(). This will more accurately represent the actual
1540       release time of the threads for split barriers.  */
1541    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1542      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1543#endif /* USE_ITT_BUILD */
1544  } else { // Team is serialized.
1545    status = 0;
1546    if (__kmp_tasking_mode != tskm_immediate_exec) {
1547      if (this_thr->th.th_task_team != NULL) {
1548#if USE_ITT_NOTIFY
1549        void *itt_sync_obj = NULL;
1550        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1551          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1552          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1553        }
1554#endif
1555
1556        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1557                         TRUE);
1558        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1559        __kmp_task_team_setup(this_thr, team, 0);
1560
1561#if USE_ITT_BUILD
1562        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1563          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1564#endif /* USE_ITT_BUILD */
1565      }
1566    }
1567  }
1568  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1569                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1570                __kmp_tid_from_gtid(gtid), status));
1571
1572#if OMPT_SUPPORT
1573  if (ompt_enabled.enabled) {
1574#if OMPT_OPTIONAL
1575    if (ompt_enabled.ompt_callback_sync_region_wait) {
1576      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1577          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1578          return_address);
1579    }
1580    if (ompt_enabled.ompt_callback_sync_region) {
1581      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1582          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1583          return_address);
1584    }
1585#endif
1586    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1587  }
1588#endif
1589  ANNOTATE_BARRIER_END(&team->t.t_bar);
1590
1591  if (cancellable)
1592    return (int)cancelled;
1593  return status;
1594}
1595
1596// Returns 0 if master thread, 1 if worker thread.
1597int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1598                  size_t reduce_size, void *reduce_data,
1599                  void (*reduce)(void *, void *)) {
1600  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1601                                  reduce);
1602}
1603
1604#if defined(KMP_GOMP_COMPAT)
1605// Returns 1 if cancelled, 0 otherwise
1606int __kmp_barrier_gomp_cancel(int gtid) {
1607  if (__kmp_omp_cancellation) {
1608    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1609                                                 0, NULL, NULL);
1610    if (cancelled) {
1611      int tid = __kmp_tid_from_gtid(gtid);
1612      kmp_info_t *this_thr = __kmp_threads[gtid];
1613      if (KMP_MASTER_TID(tid)) {
1614        // Master does not need to revert anything
1615      } else {
1616        // Workers need to revert their private b_arrived flag
1617        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1618            KMP_BARRIER_STATE_BUMP;
1619      }
1620    }
1621    return cancelled;
1622  }
1623  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1624  return FALSE;
1625}
1626#endif
1627
1628void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1629  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1630  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1631  int tid = __kmp_tid_from_gtid(gtid);
1632  kmp_info_t *this_thr = __kmp_threads[gtid];
1633  kmp_team_t *team = this_thr->th.th_team;
1634
1635  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1636  if (!team->t.t_serialized) {
1637    if (KMP_MASTER_GTID(gtid)) {
1638      switch (__kmp_barrier_release_pattern[bt]) {
1639      case bp_hyper_bar: {
1640        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1641        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1642                                    FALSE USE_ITT_BUILD_ARG(NULL));
1643        break;
1644      }
1645      case bp_hierarchical_bar: {
1646        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1647                                           FALSE USE_ITT_BUILD_ARG(NULL));
1648        break;
1649      }
1650      case bp_tree_bar: {
1651        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1652        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1653                                   FALSE USE_ITT_BUILD_ARG(NULL));
1654        break;
1655      }
1656      default: {
1657        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1658                                     FALSE USE_ITT_BUILD_ARG(NULL));
1659      }
1660      }
1661      if (__kmp_tasking_mode != tskm_immediate_exec) {
1662        __kmp_task_team_sync(this_thr, team);
1663      } // if
1664    }
1665  }
1666  ANNOTATE_BARRIER_END(&team->t.t_bar);
1667}
1668
1669void __kmp_join_barrier(int gtid) {
1670  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1671  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1672  kmp_info_t *this_thr = __kmp_threads[gtid];
1673  kmp_team_t *team;
1674  kmp_uint nproc;
1675  kmp_info_t *master_thread;
1676  int tid;
1677#ifdef KMP_DEBUG
1678  int team_id;
1679#endif /* KMP_DEBUG */
1680#if USE_ITT_BUILD
1681  void *itt_sync_obj = NULL;
1682#if USE_ITT_NOTIFY
1683  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1684    // Get object created at fork_barrier
1685    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1686#endif
1687#endif /* USE_ITT_BUILD */
1688  KMP_MB();
1689
1690  // Get current info
1691  team = this_thr->th.th_team;
1692  nproc = this_thr->th.th_team_nproc;
1693  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1694  tid = __kmp_tid_from_gtid(gtid);
1695#ifdef KMP_DEBUG
1696  team_id = team->t.t_id;
1697#endif /* KMP_DEBUG */
1698  master_thread = this_thr->th.th_team_master;
1699#ifdef KMP_DEBUG
1700  if (master_thread != team->t.t_threads[0]) {
1701    __kmp_print_structure();
1702  }
1703#endif /* KMP_DEBUG */
1704  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1705  KMP_MB();
1706
1707  // Verify state
1708  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1709  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1710  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1711  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1712  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1713                gtid, team_id, tid));
1714
1715  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1716#if OMPT_SUPPORT
1717  if (ompt_enabled.enabled) {
1718#if OMPT_OPTIONAL
1719    ompt_data_t *my_task_data;
1720    ompt_data_t *my_parallel_data;
1721    void *codeptr = NULL;
1722    int ds_tid = this_thr->th.th_info.ds.ds_tid;
1723    if (KMP_MASTER_TID(ds_tid) &&
1724        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1725         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1726      codeptr = team->t.ompt_team_info.master_return_address;
1727    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1728    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1729    if (ompt_enabled.ompt_callback_sync_region) {
1730      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1731          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1732          my_task_data, codeptr);
1733    }
1734    if (ompt_enabled.ompt_callback_sync_region_wait) {
1735      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1736          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1737          my_task_data, codeptr);
1738    }
1739    if (!KMP_MASTER_TID(ds_tid))
1740      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1741#endif
1742    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1743  }
1744#endif
1745
1746  if (__kmp_tasking_mode == tskm_extra_barrier) {
1747    __kmp_tasking_barrier(team, this_thr, gtid);
1748    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1749                  team_id, tid));
1750  }
1751#ifdef KMP_DEBUG
1752  if (__kmp_tasking_mode != tskm_immediate_exec) {
1753    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1754                  "%p, th_task_team = %p\n",
1755                  __kmp_gtid_from_thread(this_thr), team_id,
1756                  team->t.t_task_team[this_thr->th.th_task_state],
1757                  this_thr->th.th_task_team));
1758    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1759                     team->t.t_task_team[this_thr->th.th_task_state]);
1760  }
1761#endif /* KMP_DEBUG */
1762
1763  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1764     access it when the team struct is not guaranteed to exist. Doing these
1765     loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1766     we do not perform the copy if blocktime=infinite, since the values are not
1767     used by __kmp_wait_template() in that case. */
1768  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1769#if KMP_USE_MONITOR
1770    this_thr->th.th_team_bt_intervals =
1771        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1772    this_thr->th.th_team_bt_set =
1773        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1774#else
1775    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1776#endif
1777  }
1778
1779#if USE_ITT_BUILD
1780  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1781    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1782#endif /* USE_ITT_BUILD */
1783
1784  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1785  case bp_hyper_bar: {
1786    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1787    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1788                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1789    break;
1790  }
1791  case bp_hierarchical_bar: {
1792    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1793                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1794    break;
1795  }
1796  case bp_tree_bar: {
1797    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1798    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1799                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1800    break;
1801  }
1802  default: {
1803    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1804                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1805  }
1806  }
1807
1808  /* From this point on, the team data structure may be deallocated at any time
1809     by the master thread - it is unsafe to reference it in any of the worker
1810     threads. Any per-team data items that need to be referenced before the
1811     end of the barrier should be moved to the kmp_task_team_t structs.  */
1812  if (KMP_MASTER_TID(tid)) {
1813    if (__kmp_tasking_mode != tskm_immediate_exec) {
1814      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1815    }
1816    if (__kmp_display_affinity) {
1817      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1818    }
1819#if KMP_STATS_ENABLED
1820    // Have master thread flag the workers to indicate they are now waiting for
1821    // next parallel region, Also wake them up so they switch their timers to
1822    // idle.
1823    for (int i = 0; i < team->t.t_nproc; ++i) {
1824      kmp_info_t *team_thread = team->t.t_threads[i];
1825      if (team_thread == this_thr)
1826        continue;
1827      team_thread->th.th_stats->setIdleFlag();
1828      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1829          team_thread->th.th_sleep_loc != NULL)
1830        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1831                                  team_thread->th.th_sleep_loc);
1832    }
1833#endif
1834#if USE_ITT_BUILD
1835    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1836      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1837#endif /* USE_ITT_BUILD */
1838
1839#if USE_ITT_BUILD && USE_ITT_NOTIFY
1840    // Join barrier - report frame end
1841    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1842        __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1843        team->t.t_active_level == 1) {
1844      kmp_uint64 cur_time = __itt_get_timestamp();
1845      ident_t *loc = team->t.t_ident;
1846      kmp_info_t **other_threads = team->t.t_threads;
1847      int nproc = this_thr->th.th_team_nproc;
1848      int i;
1849      switch (__kmp_forkjoin_frames_mode) {
1850      case 1:
1851        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1852                               loc, nproc);
1853        break;
1854      case 2:
1855        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1856                               loc, nproc);
1857        break;
1858      case 3:
1859        if (__itt_metadata_add_ptr) {
1860          // Initialize with master's wait time
1861          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1862          // Set arrive time to zero to be able to check it in
1863          // __kmp_invoke_task(); the same is done inside the loop below
1864          this_thr->th.th_bar_arrive_time = 0;
1865          for (i = 1; i < nproc; ++i) {
1866            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1867            other_threads[i]->th.th_bar_arrive_time = 0;
1868          }
1869          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1870                                       cur_time, delta, 0);
1871        }
1872        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1873                               loc, nproc);
1874        this_thr->th.th_frame_time = cur_time;
1875        break;
1876      }
1877    }
1878#endif /* USE_ITT_BUILD */
1879  }
1880#if USE_ITT_BUILD
1881  else {
1882    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1883      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1884  }
1885#endif /* USE_ITT_BUILD */
1886
1887#if KMP_DEBUG
1888  if (KMP_MASTER_TID(tid)) {
1889    KA_TRACE(
1890        15,
1891        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1892         gtid, team_id, tid, nproc));
1893  }
1894#endif /* KMP_DEBUG */
1895
1896  // TODO now, mark worker threads as done so they may be disbanded
1897  KMP_MB(); // Flush all pending memory write invalidates.
1898  KA_TRACE(10,
1899           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1900
1901  ANNOTATE_BARRIER_END(&team->t.t_bar);
1902}
1903
1904// TODO release worker threads' fork barriers as we are ready instead of all at
1905// once
1906void __kmp_fork_barrier(int gtid, int tid) {
1907  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1908  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1909  kmp_info_t *this_thr = __kmp_threads[gtid];
1910  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1911#if USE_ITT_BUILD
1912  void *itt_sync_obj = NULL;
1913#endif /* USE_ITT_BUILD */
1914  if (team)
1915    ANNOTATE_BARRIER_END(&team->t.t_bar);
1916
1917  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1918                (team != NULL) ? team->t.t_id : -1, tid));
1919
1920  // th_team pointer only valid for master thread here
1921  if (KMP_MASTER_TID(tid)) {
1922#if USE_ITT_BUILD && USE_ITT_NOTIFY
1923    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1924      // Create itt barrier object
1925      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1926      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1927    }
1928#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1929
1930#ifdef KMP_DEBUG
1931    kmp_info_t **other_threads = team->t.t_threads;
1932    int i;
1933
1934    // Verify state
1935    KMP_MB();
1936
1937    for (i = 1; i < team->t.t_nproc; ++i) {
1938      KA_TRACE(500,
1939               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1940                "== %u.\n",
1941                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1942                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1943                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1944      KMP_DEBUG_ASSERT(
1945          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1946           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1947      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1948    }
1949#endif
1950
1951    if (__kmp_tasking_mode != tskm_immediate_exec) {
1952      // 0 indicates setup current task team if nthreads > 1
1953      __kmp_task_team_setup(this_thr, team, 0);
1954    }
1955
1956    /* The master thread may have changed its blocktime between the join barrier
1957       and the fork barrier. Copy the blocktime info to the thread, where
1958       __kmp_wait_template() can access it when the team struct is not
1959       guaranteed to exist. */
1960    // See note about the corresponding code in __kmp_join_barrier() being
1961    // performance-critical
1962    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1963#if KMP_USE_MONITOR
1964      this_thr->th.th_team_bt_intervals =
1965          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1966      this_thr->th.th_team_bt_set =
1967          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1968#else
1969      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1970#endif
1971    }
1972  } // master
1973
1974  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1975  case bp_hyper_bar: {
1976    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1977    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1978                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1979    break;
1980  }
1981  case bp_hierarchical_bar: {
1982    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1983                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1984    break;
1985  }
1986  case bp_tree_bar: {
1987    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1988    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1989                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1990    break;
1991  }
1992  default: {
1993    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1994                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1995  }
1996  }
1997
1998#if OMPT_SUPPORT
1999  if (ompt_enabled.enabled &&
2000      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2001    int ds_tid = this_thr->th.th_info.ds.ds_tid;
2002    ompt_data_t *task_data = (team)
2003                                 ? OMPT_CUR_TASK_DATA(this_thr)
2004                                 : &(this_thr->th.ompt_thread_info.task_data);
2005    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2006#if OMPT_OPTIONAL
2007    void *codeptr = NULL;
2008    if (KMP_MASTER_TID(ds_tid) &&
2009        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2010         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2011      codeptr = team->t.ompt_team_info.master_return_address;
2012    if (ompt_enabled.ompt_callback_sync_region_wait) {
2013      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2014          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2015          codeptr);
2016    }
2017    if (ompt_enabled.ompt_callback_sync_region) {
2018      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2019          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2020          codeptr);
2021    }
2022#endif
2023    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2024      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2025          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2026    }
2027  }
2028#endif
2029
2030  // Early exit for reaping threads releasing forkjoin barrier
2031  if (TCR_4(__kmp_global.g.g_done)) {
2032    this_thr->th.th_task_team = NULL;
2033
2034#if USE_ITT_BUILD && USE_ITT_NOTIFY
2035    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2036      if (!KMP_MASTER_TID(tid)) {
2037        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2038        if (itt_sync_obj)
2039          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2040      }
2041    }
2042#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2043    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2044    return;
2045  }
2046
2047  /* We can now assume that a valid team structure has been allocated by the
2048     master and propagated to all worker threads. The current thread, however,
2049     may not be part of the team, so we can't blindly assume that the team
2050     pointer is non-null.  */
2051  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2052  KMP_DEBUG_ASSERT(team != NULL);
2053  tid = __kmp_tid_from_gtid(gtid);
2054
2055#if KMP_BARRIER_ICV_PULL
2056  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2057     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2058     implicit task has this data before this function is called. We cannot
2059     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2060     struct, because it is not always the case that the threads arrays have
2061     been allocated when __kmp_fork_call() is executed. */
2062  {
2063    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2064    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2065      // Copy the initial ICVs from the master's thread struct to the implicit
2066      // task for this tid.
2067      KA_TRACE(10,
2068               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2069      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2070                               tid, FALSE);
2071      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2072                &team->t.t_threads[0]
2073                     ->th.th_bar[bs_forkjoin_barrier]
2074                     .bb.th_fixed_icvs);
2075    }
2076  }
2077#endif // KMP_BARRIER_ICV_PULL
2078
2079  if (__kmp_tasking_mode != tskm_immediate_exec) {
2080    __kmp_task_team_sync(this_thr, team);
2081  }
2082
2083#if KMP_AFFINITY_SUPPORTED
2084  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2085  if (proc_bind == proc_bind_intel) {
2086    // Call dynamic affinity settings
2087    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2088      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2089    }
2090  } else if (proc_bind != proc_bind_false) {
2091    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2092      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2093                     __kmp_gtid_from_thread(this_thr),
2094                     this_thr->th.th_current_place));
2095    } else {
2096      __kmp_affinity_set_place(gtid);
2097    }
2098  }
2099#endif // KMP_AFFINITY_SUPPORTED
2100  // Perform the display affinity functionality
2101  if (__kmp_display_affinity) {
2102    if (team->t.t_display_affinity
2103#if KMP_AFFINITY_SUPPORTED
2104        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2105#endif
2106            ) {
2107      // NULL means use the affinity-format-var ICV
2108      __kmp_aux_display_affinity(gtid, NULL);
2109      this_thr->th.th_prev_num_threads = team->t.t_nproc;
2110      this_thr->th.th_prev_level = team->t.t_level;
2111    }
2112  }
2113  if (!KMP_MASTER_TID(tid))
2114    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2115
2116#if USE_ITT_BUILD && USE_ITT_NOTIFY
2117  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2118    if (!KMP_MASTER_TID(tid)) {
2119      // Get correct barrier object
2120      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2121      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2122    } // (prepare called inside barrier_release)
2123  }
2124#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2125  ANNOTATE_BARRIER_END(&team->t.t_bar);
2126  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2127                team->t.t_id, tid));
2128}
2129
2130void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2131                          kmp_internal_control_t *new_icvs, ident_t *loc) {
2132  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2133
2134  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2135  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2136
2137/* Master thread's copy of the ICVs was set up on the implicit taskdata in
2138   __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2139   implicit task has this data before this function is called. */
2140#if KMP_BARRIER_ICV_PULL
2141  /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
2142     untouched), where all of the worker threads can access them and make their
2143     own copies after the barrier. */
2144  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2145  // allocated at this point
2146  copy_icvs(
2147      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2148      new_icvs);
2149  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2150                team->t.t_threads[0], team));
2151#elif KMP_BARRIER_ICV_PUSH
2152  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2153  // done here.
2154  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2155                team->t.t_threads[0], team));
2156#else
2157  // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
2158  // time.
2159  ngo_load(new_icvs);
2160  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2161  // allocated at this point
2162  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2163    // TODO: GEH - pass in better source location info since usually NULL here
2164    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2165                  f, team->t.t_threads[f], team));
2166    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2167    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2168    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2169                  f, team->t.t_threads[f], team));
2170  }
2171  ngo_sync();
2172#endif // KMP_BARRIER_ICV_PULL
2173}
2174