1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34
35/* these are temporary issues to be dealt with */
36#define KMP_USE_PRCTL 0
37
38#if KMP_OS_WINDOWS
39#include <process.h>
40#endif
41
42#include "tsan_annotations.h"
43
44#if defined(KMP_GOMP_COMPAT)
45char const __kmp_version_alt_comp[] =
46    KMP_VERSION_PREFIX "alternative compiler support: yes";
47#endif /* defined(KMP_GOMP_COMPAT) */
48
49char const __kmp_version_omp_api[] =
50    KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51
52#ifdef KMP_DEBUG
53char const __kmp_version_lock[] =
54    KMP_VERSION_PREFIX "lock type: run time selectable";
55#endif /* KMP_DEBUG */
56
57#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58
59/* ------------------------------------------------------------------------ */
60
61#if KMP_USE_MONITOR
62kmp_info_t __kmp_monitor;
63#endif
64
65/* Forward declarations */
66
67void __kmp_cleanup(void);
68
69static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70                                  int gtid);
71static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72                                  kmp_internal_control_t *new_icvs,
73                                  ident_t *loc);
74#if KMP_AFFINITY_SUPPORTED
75static void __kmp_partition_places(kmp_team_t *team,
76                                   int update_master_only = 0);
77#endif
78static void __kmp_do_serial_initialize(void);
79void __kmp_fork_barrier(int gtid, int tid);
80void __kmp_join_barrier(int gtid);
81void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82                          kmp_internal_control_t *new_icvs, ident_t *loc);
83
84#ifdef USE_LOAD_BALANCE
85static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86#endif
87
88static int __kmp_expand_threads(int nNeed);
89#if KMP_OS_WINDOWS
90static int __kmp_unregister_root_other_thread(int gtid);
91#endif
92static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95
/* Calculate the identifier of the current thread. */
/* Fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99int __kmp_get_global_thread_id() {
100  int i;
101  kmp_info_t **other_threads;
102  size_t stack_data;
103  char *stack_addr;
104  size_t stack_size;
105  char *stack_base;
106
107  KA_TRACE(
108      1000,
109      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
110       __kmp_nth, __kmp_all_nth));
111
112  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
113     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
114     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
115     __kmp_init_gtid for this to work. */
116
117  if (!TCR_4(__kmp_init_gtid))
118    return KMP_GTID_DNE;
119
120#ifdef KMP_TDATA_GTID
121  if (TCR_4(__kmp_gtid_mode) >= 3) {
122    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
123    return __kmp_gtid;
124  }
125#endif
126  if (TCR_4(__kmp_gtid_mode) >= 2) {
127    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128    return __kmp_gtid_get_specific();
129  }
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
131
132  stack_addr = (char *)&stack_data;
133  other_threads = __kmp_threads;
134
135  /* ATT: The code below is a source of potential bugs due to unsynchronized
136     access to __kmp_threads array. For example:
137     1. Current thread loads other_threads[i] to thr and checks it, it is
138        non-NULL.
139     2. Current thread is suspended by OS.
140     3. Another thread unregisters and finishes (debug versions of free()
141        may fill memory with something like 0xEF).
142     4. Current thread is resumed.
143     5. Current thread reads junk from *thr.
144     TODO: Fix it.  --ln  */
145
146  for (i = 0; i < __kmp_threads_capacity; i++) {
147
148    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
149    if (!thr)
150      continue;
151
152    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
154
155    /* stack grows down -- search through all of the active threads */
156
157    if (stack_addr <= stack_base) {
158      size_t stack_diff = stack_base - stack_addr;
159
160      if (stack_diff <= stack_size) {
161        /* The only way we can be closer than the allocated */
162        /* stack size is if we are running on this thread. */
163        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
164        return i;
165      }
166    }
167  }
168
169  /* get specific to try and determine our gtid */
170  KA_TRACE(1000,
171           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172            "thread, using TLS\n"));
173  i = __kmp_gtid_get_specific();
174
175  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
176
  /* if we haven't been assigned a gtid, then return the error code */
178  if (i < 0)
179    return i;
180
181  /* dynamically updated stack window for uber threads to avoid get_specific
182     call */
183  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184    KMP_FATAL(StackOverflow, i);
185  }
186
187  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188  if (stack_addr > stack_base) {
189    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
192                stack_base);
193  } else {
194    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195            stack_base - stack_addr);
196  }
197
198  /* Reprint stack bounds for ubermaster since they have been refined */
199  if (__kmp_storage_map) {
200    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203                                 other_threads[i]->th.th_info.ds.ds_stacksize,
204                                 "th_%d stack (refinement)", i);
205  }
206  return i;
207}
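
/* Illustrative sketch (not compiled): the stack-window heuristic used by
   __kmp_get_global_thread_id() above, in isolation. The record layout and
   helper name below are hypothetical; stacks are assumed to grow downward,
   so a thread "owns" the address range (base - size, base]. */
#if 0
struct stack_window { char *base; size_t size; };

static int find_stack_owner(const struct stack_window *w, int n,
                            const void *local_addr) {
  const char *addr = (const char *)local_addr;
  for (int i = 0; i < n; ++i) {
    if (addr <= w[i].base && (size_t)(w[i].base - addr) <= w[i].size)
      return i; // only the executing thread's own stack can contain local_addr
  }
  return -1; // no match; the runtime then falls back to the TLS lookup
}
#endif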
208
209int __kmp_get_global_thread_id_reg() {
210  int gtid;
211
212  if (!__kmp_init_serial) {
213    gtid = KMP_GTID_DNE;
214  } else
215#ifdef KMP_TDATA_GTID
216      if (TCR_4(__kmp_gtid_mode) >= 3) {
217    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218    gtid = __kmp_gtid;
219  } else
220#endif
221      if (TCR_4(__kmp_gtid_mode) >= 2) {
222    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223    gtid = __kmp_gtid_get_specific();
224  } else {
225    KA_TRACE(1000,
226             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227    gtid = __kmp_get_global_thread_id();
228  }
229
230  /* we must be a new uber master sibling thread */
231  if (gtid == KMP_GTID_DNE) {
232    KA_TRACE(10,
233             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234              "Registering a new gtid.\n"));
235    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236    if (!__kmp_init_serial) {
237      __kmp_do_serial_initialize();
238      gtid = __kmp_gtid_get_specific();
239    } else {
240      gtid = __kmp_register_root(FALSE);
241    }
242    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244  }
245
246  KMP_DEBUG_ASSERT(gtid >= 0);
247
248  return gtid;
249}
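
/* Illustrative sketch (not compiled): the lookup cascade shared by
   __kmp_get_global_thread_id() and __kmp_get_global_thread_id_reg(). Higher
   __kmp_gtid_mode values prefer cheaper storage: native thread-local data if
   the build supports it, then keyed OS TLS, then the stack-address search.
   The fallback calls are the real ones used above; the wrapper itself is
   hypothetical. */
#if 0
static int lookup_gtid_sketch(int gtid_mode) {
#ifdef KMP_TDATA_GTID
  if (gtid_mode >= 3)
    return __kmp_gtid;                 // native thread-local variable
#endif
  if (gtid_mode >= 2)
    return __kmp_gtid_get_specific();  // keyed TLS (e.g. pthread_getspecific)
  return __kmp_get_global_thread_id(); // stack-window search as a last resort
}
#endif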
250
251/* caller must hold forkjoin_lock */
252void __kmp_check_stack_overlap(kmp_info_t *th) {
253  int f;
254  char *stack_beg = NULL;
255  char *stack_end = NULL;
256  int gtid;
257
258  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259  if (__kmp_storage_map) {
260    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262
263    gtid = __kmp_gtid_from_thread(th);
264
265    if (gtid == KMP_GTID_MONITOR) {
266      __kmp_print_storage_map_gtid(
267          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268          "th_%s stack (%s)", "mon",
269          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270    } else {
271      __kmp_print_storage_map_gtid(
272          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273          "th_%d stack (%s)", gtid,
274          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275    }
276  }
277
278  /* No point in checking ubermaster threads since they use refinement and
279   * cannot overlap */
280  gtid = __kmp_gtid_from_thread(th);
281  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282    KA_TRACE(10,
283             ("__kmp_check_stack_overlap: performing extensive checking\n"));
284    if (stack_beg == NULL) {
285      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287    }
288
289    for (f = 0; f < __kmp_threads_capacity; f++) {
290      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291
292      if (f_th && f_th != th) {
293        char *other_stack_end =
294            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295        char *other_stack_beg =
296            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298            (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299
300          /* Print the other stack values before the abort */
301          if (__kmp_storage_map)
302            __kmp_print_storage_map_gtid(
303                -1, other_stack_beg, other_stack_end,
304                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306
307          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308                      __kmp_msg_null);
309        }
310      }
311    }
312  }
313  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314}
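
/* Illustrative sketch (not compiled): the overlap test applied by
   __kmp_check_stack_overlap() above, written out for two ranges. It flags an
   overlap when either boundary of the current stack falls strictly between
   another thread's stack bounds; the names below are hypothetical. */
#if 0
static int stacks_overlap(const char *beg, const char *end,
                          const char *other_beg, const char *other_end) {
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif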
315
316/* ------------------------------------------------------------------------ */
317
318void __kmp_infinite_loop(void) {
319  static int done = FALSE;
320
321  while (!done) {
322    KMP_YIELD(TRUE);
323  }
324}
325
326#define MAX_MESSAGE 512
327
328void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329                                  char const *format, ...) {
330  char buffer[MAX_MESSAGE];
331  va_list ap;
332
333  va_start(ap, format);
334  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335               p2, (unsigned long)size, format);
336  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337  __kmp_vprintf(kmp_err, buffer, ap);
338#if KMP_PRINT_DATA_PLACEMENT
339  int node;
340  if (gtid >= 0) {
341    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342      if (__kmp_storage_map_verbose) {
343        node = __kmp_get_host_node(p1);
344        if (node < 0) /* doesn't work, so don't try this next time */
345          __kmp_storage_map_verbose = FALSE;
346        else {
347          char *last;
348          int lastNode;
349          int localProc = __kmp_get_cpu_from_gtid(gtid);
350
351          const int page_size = KMP_GET_PAGE_SIZE();
352
353          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355          if (localProc >= 0)
356            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
357                                 localProc >> 1);
358          else
359            __kmp_printf_no_lock("  GTID %d\n", gtid);
360#if KMP_USE_PRCTL
361          /* The more elaborate format is disabled for now because of the prctl
362           * hanging bug. */
363          do {
364            last = p1;
365            lastNode = node;
366            /* This loop collates adjacent pages with the same host node. */
367            do {
              p1 = (char *)p1 + page_size;
369            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
371                                 lastNode);
372          } while (p1 <= p2);
373#else
374          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
375                               (char *)p1 + (page_size - 1),
376                               __kmp_get_host_node(p1));
377          if (p1 < p2) {
378            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                 (char *)p2 + (page_size - 1),
380                                 __kmp_get_host_node(p2));
381          }
382#endif
383        }
384      }
385    } else
386      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
387  }
388#endif /* KMP_PRINT_DATA_PLACEMENT */
389  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390}
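
/* Example (not compiled): how callers in this file use
   __kmp_print_storage_map_gtid(). The trailing printf-style format describes
   the object; the "%p %p <size>" prefix is added by the function itself. This
   mirrors the first call in __kmp_print_thread_storage_map() further below;
   the wrapper function here is hypothetical. */
#if 0
static void storage_map_usage_example(kmp_info_t *thr, int gtid) {
  // Prints one line of the form "OMP storage map: <beg> <end> <size> th_<gtid>"
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);
}
#endif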
391
392void __kmp_warn(char const *format, ...) {
393  char buffer[MAX_MESSAGE];
394  va_list ap;
395
396  if (__kmp_generate_warnings == kmp_warnings_off) {
397    return;
398  }
399
400  va_start(ap, format);
401
402  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404  __kmp_vprintf(kmp_err, buffer, ap);
405  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406
407  va_end(ap);
408}
409
410void __kmp_abort_process() {
411  // Later threads may stall here, but that's ok because abort() will kill them.
412  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413
414  if (__kmp_debug_buf) {
415    __kmp_dump_debug_buffer();
416  }
417
418  if (KMP_OS_WINDOWS) {
419    // Let other threads know of abnormal termination and prevent deadlock
420    // if abort happened during library initialization or shutdown
421    __kmp_global.g.g_abort = SIGABRT;
422
    /* On Windows* OS, abort() by default raises a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       The following sequence seems to be the only way to simulate abort() and
       avoid the pop-up error box. */
432    raise(SIGABRT);
433    _exit(3); // Just in case, if signal ignored, exit anyway.
434  } else {
435    abort();
436  }
437
438  __kmp_infinite_loop();
439  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440
441} // __kmp_abort_process
442
443void __kmp_abort_thread(void) {
444  // TODO: Eliminate g_abort global variable and this function.
445  // In case of abort just call abort(), it will kill all the threads.
446  __kmp_infinite_loop();
447} // __kmp_abort_thread
448
449/* Print out the storage map for the major kmp_info_t thread data structures
450   that are allocated together. */
451
452static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454                               gtid);
455
456  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458
459  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460                               sizeof(kmp_local_t), "th_%d.th_local", gtid);
461
462  __kmp_print_storage_map_gtid(
463      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465
466  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467                               &thr->th.th_bar[bs_plain_barrier + 1],
468                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469                               gtid);
470
471  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
473                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474                               gtid);
475
476#if KMP_FAST_REDUCTION_BARRIER
477  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478                               &thr->th.th_bar[bs_reduction_barrier + 1],
479                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480                               gtid);
481#endif // KMP_FAST_REDUCTION_BARRIER
482}
483
484/* Print out the storage map for the major kmp_team_t team data structures
485   that are allocated together. */
486
487static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488                                         int team_id, int num_thr) {
489  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491                               header, team_id);
492
493  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494                               &team->t.t_bar[bs_last_barrier],
495                               sizeof(kmp_balign_team_t) * bs_last_barrier,
496                               "%s_%d.t_bar", header, team_id);
497
498  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499                               &team->t.t_bar[bs_plain_barrier + 1],
500                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501                               header, team_id);
502
503  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504                               &team->t.t_bar[bs_forkjoin_barrier + 1],
505                               sizeof(kmp_balign_team_t),
506                               "%s_%d.t_bar[forkjoin]", header, team_id);
507
508#if KMP_FAST_REDUCTION_BARRIER
509  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510                               &team->t.t_bar[bs_reduction_barrier + 1],
511                               sizeof(kmp_balign_team_t),
512                               "%s_%d.t_bar[reduction]", header, team_id);
513#endif // KMP_FAST_REDUCTION_BARRIER
514
515  __kmp_print_storage_map_gtid(
516      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518
519  __kmp_print_storage_map_gtid(
520      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522
523  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524                               &team->t.t_disp_buffer[num_disp_buff],
525                               sizeof(dispatch_shared_info_t) * num_disp_buff,
526                               "%s_%d.t_disp_buffer", header, team_id);
527}
528
529static void __kmp_init_allocator() { __kmp_init_memkind(); }
530static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531
532/* ------------------------------------------------------------------------ */
533
534#if KMP_DYNAMIC_LIB
535#if KMP_OS_WINDOWS
536
537static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538  // TODO: Change to __kmp_break_bootstrap_lock().
539  __kmp_init_bootstrap_lock(lck); // make the lock released
540}
541
542static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543  int i;
544  int thread_count;
545
546  // PROCESS_DETACH is expected to be called by a thread that executes
547  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
548  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
  // __kmp_threads[] without taking the forkjoin_lock. In practice, however,
  // some threads may still be alive here, even though they are about to be
  // terminated. The entries in the array with ds_thread==0 are the most
  // suspicious. So it may actually not be safe to access __kmp_threads[].
553
554  // TODO: does it make sense to check __kmp_roots[] ?
555
556  // Let's check that there are no other alive threads registered with the OMP
557  // lib.
558  while (1) {
559    thread_count = 0;
560    for (i = 0; i < __kmp_threads_capacity; ++i) {
561      if (!__kmp_threads)
562        continue;
563      kmp_info_t *th = __kmp_threads[i];
564      if (th == NULL)
565        continue;
566      int gtid = th->th.th_info.ds.ds_gtid;
567      if (gtid == gtid_req)
568        continue;
569      if (gtid < 0)
570        continue;
571      DWORD exit_val;
572      int alive = __kmp_is_thread_alive(th, &exit_val);
573      if (alive) {
574        ++thread_count;
575      }
576    }
577    if (thread_count == 0)
578      break; // success
579  }
580
581  // Assume that I'm alone. Now it might be safe to check and reset locks.
582  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583  __kmp_reset_lock(&__kmp_forkjoin_lock);
584#ifdef KMP_DEBUG
585  __kmp_reset_lock(&__kmp_stdio_lock);
586#endif // KMP_DEBUG
587}
588
589BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591
592  switch (fdwReason) {
593
594  case DLL_PROCESS_ATTACH:
595    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596
597    return TRUE;
598
599  case DLL_PROCESS_DETACH:
600    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601
602    if (lpReserved != NULL) {
603      // lpReserved is used for telling the difference:
604      //   lpReserved == NULL when FreeLibrary() was called,
605      //   lpReserved != NULL when the process terminates.
606      // When FreeLibrary() is called, worker threads remain alive. So they will
607      // release the forkjoin lock by themselves. When the process terminates,
608      // worker threads disappear triggering the problem of unreleased forkjoin
609      // lock as described below.
610
      // A worker thread can take the forkjoin lock. The problem arises if that
      // worker thread dies before it releases the forkjoin lock. The forkjoin
      // lock then remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
      // to take the forkjoin lock and will always fail, so the application
      // will never finish [normally]. This scenario is possible if __kmpc_end()
      // has not been executed. This is not just a corner case; it happens in
      // common situations:
619      // - the main function was compiled by an alternative compiler;
620      // - the main function was compiled by icl but without /Qopenmp
621      //   (application with plugins);
622      // - application terminates by calling C exit(), Fortran CALL EXIT() or
623      //   Fortran STOP.
624      // - alive foreign thread prevented __kmpc_end from doing cleanup.
625      //
626      // This is a hack to work around the problem.
627      // TODO: !!! figure out something better.
628      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629    }
630
631    __kmp_internal_end_library(__kmp_gtid_get_specific());
632
633    return TRUE;
634
635  case DLL_THREAD_ATTACH:
636    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637
638    /* if we want to register new siblings all the time here call
639     * __kmp_get_gtid(); */
640    return TRUE;
641
642  case DLL_THREAD_DETACH:
643    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644
645    __kmp_internal_end_thread(__kmp_gtid_get_specific());
646    return TRUE;
647  }
648
649  return TRUE;
650}
651
652#endif /* KMP_OS_WINDOWS */
653#endif /* KMP_DYNAMIC_LIB */
654
655/* __kmp_parallel_deo -- Wait until it's our turn. */
656void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657  int gtid = *gtid_ref;
658#ifdef BUILD_PARALLEL_ORDERED
659  kmp_team_t *team = __kmp_team_from_gtid(gtid);
660#endif /* BUILD_PARALLEL_ORDERED */
661
662  if (__kmp_env_consistency_check) {
663    if (__kmp_threads[gtid]->th.th_root->r.r_active)
664#if KMP_USE_DYNAMIC_LOCK
665      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666#else
667      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668#endif
669  }
670#ifdef BUILD_PARALLEL_ORDERED
671  if (!team->t.t_serialized) {
672    KMP_MB();
673    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674             NULL);
675    KMP_MB();
676  }
677#endif /* BUILD_PARALLEL_ORDERED */
678}
679
680/* __kmp_parallel_dxo -- Signal the next task. */
681void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682  int gtid = *gtid_ref;
683#ifdef BUILD_PARALLEL_ORDERED
684  int tid = __kmp_tid_from_gtid(gtid);
685  kmp_team_t *team = __kmp_team_from_gtid(gtid);
686#endif /* BUILD_PARALLEL_ORDERED */
687
688  if (__kmp_env_consistency_check) {
689    if (__kmp_threads[gtid]->th.th_root->r.r_active)
690      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691  }
692#ifdef BUILD_PARALLEL_ORDERED
693  if (!team->t.t_serialized) {
694    KMP_MB(); /* Flush all pending memory write invalidates.  */
695
696    /* use the tid of the next thread in this team */
697    /* TODO replace with general release procedure */
698    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699
700    KMP_MB(); /* Flush all pending memory write invalidates.  */
701  }
702#endif /* BUILD_PARALLEL_ORDERED */
703}
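
/* Illustrative sketch (not compiled): the ticket-style handoff implemented by
   __kmp_parallel_deo()/__kmp_parallel_dxo() above when BUILD_PARALLEL_ORDERED
   is defined. A single shared counter holds the tid whose turn it is; each
   thread waits for its tid, does its ordered work, then passes the turn on.
   The names below are hypothetical and the spin is simplified. */
#if 0
static volatile int turn = 0; // plays the role of team->t.t_ordered.dt.t_value

static void ordered_enter(int tid) {
  while (turn != tid) { /* spin; the runtime uses KMP_WAIT with back-off */ }
}

static void ordered_exit(int tid, int nproc) {
  turn = (tid + 1) % nproc; // release the next thread in the team
}
#endif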
704
705/* ------------------------------------------------------------------------ */
706/* The BARRIER for a SINGLE process section is always explicit   */
707
708int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709  int status;
710  kmp_info_t *th;
711  kmp_team_t *team;
712
713  if (!TCR_4(__kmp_init_parallel))
714    __kmp_parallel_initialize();
715  __kmp_resume_if_soft_paused();
716
717  th = __kmp_threads[gtid];
718  team = th->th.th_team;
719  status = 0;
720
721  th->th.th_ident = id_ref;
722
723  if (team->t.t_serialized) {
724    status = 1;
725  } else {
726    kmp_int32 old_this = th->th.th_local.this_construct;
727
728    ++th->th.th_local.this_construct;
729    /* try to set team count to thread count--success means thread got the
730       single block */
731    /* TODO: Should this be acquire or release? */
732    if (team->t.t_construct == old_this) {
733      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734                                              th->th.th_local.this_construct);
735    }
736#if USE_ITT_BUILD
737    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739        team->t.t_active_level ==
740            1) { // Only report metadata by master of active team at level 1
741      __kmp_itt_metadata_single(id_ref);
742    }
743#endif /* USE_ITT_BUILD */
744  }
745
746  if (__kmp_env_consistency_check) {
747    if (status && push_ws) {
748      __kmp_push_workshare(gtid, ct_psingle, id_ref);
749    } else {
750      __kmp_check_workshare(gtid, ct_psingle, id_ref);
751    }
752  }
753#if USE_ITT_BUILD
754  if (status) {
755    __kmp_itt_single_start(gtid);
756  }
757#endif /* USE_ITT_BUILD */
758  return status;
759}
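
/* Illustrative sketch (not compiled): the "first thread wins" protocol used by
   __kmp_enter_single() above. Each thread keeps a private count of single
   constructs it has seen; the thread that successfully advances the shared
   team counter past the old value executes the single block. The helper below
   is hypothetical and uses a plain compare-exchange in place of
   __kmp_atomic_compare_store_acq(). */
#if 0
#include <atomic>

static bool try_claim_single(std::atomic<int> &team_count, int &my_count) {
  int old_val = my_count++;
  // succeeds for exactly one thread per single construct
  return team_count.compare_exchange_strong(old_val, my_count,
                                            std::memory_order_acquire);
}
#endif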
760
761void __kmp_exit_single(int gtid) {
762#if USE_ITT_BUILD
763  __kmp_itt_single_end(gtid);
764#endif /* USE_ITT_BUILD */
765  if (__kmp_env_consistency_check)
766    __kmp_pop_workshare(gtid, ct_psingle, NULL);
767}
768
/* Determine if we can go parallel or must use a serialized parallel region,
 * and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
775static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776                                 int master_tid, int set_nthreads,
777                                 int enter_teams) {
778  int capacity;
779  int new_nthreads;
780  KMP_DEBUG_ASSERT(__kmp_init_serial);
781  KMP_DEBUG_ASSERT(root && parent_team);
782  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783
784  // If dyn-var is set, dynamically adjust the number of desired threads,
785  // according to the method specified by dynamic_mode.
786  new_nthreads = set_nthreads;
787  if (!get__dynamic_2(parent_team, master_tid)) {
788    ;
789  }
790#ifdef USE_LOAD_BALANCE
791  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793    if (new_nthreads == 1) {
794      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795                    "reservation to 1 thread\n",
796                    master_tid));
797      return 1;
798    }
799    if (new_nthreads < set_nthreads) {
800      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801                    "reservation to %d threads\n",
802                    master_tid, new_nthreads));
803    }
804  }
805#endif /* USE_LOAD_BALANCE */
806  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807    new_nthreads = __kmp_avail_proc - __kmp_nth +
808                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809    if (new_nthreads <= 1) {
810      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811                    "reservation to 1 thread\n",
812                    master_tid));
813      return 1;
814    }
815    if (new_nthreads < set_nthreads) {
816      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817                    "reservation to %d threads\n",
818                    master_tid, new_nthreads));
819    } else {
820      new_nthreads = set_nthreads;
821    }
822  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823    if (set_nthreads > 2) {
824      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825      new_nthreads = (new_nthreads % set_nthreads) + 1;
826      if (new_nthreads == 1) {
827        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828                      "reservation to 1 thread\n",
829                      master_tid));
830        return 1;
831      }
832      if (new_nthreads < set_nthreads) {
833        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834                      "reservation to %d threads\n",
835                      master_tid, new_nthreads));
836      }
837    }
838  } else {
839    KMP_ASSERT(0);
840  }
841
842  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843  if (__kmp_nth + new_nthreads -
844          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845      __kmp_max_nth) {
846    int tl_nthreads = __kmp_max_nth - __kmp_nth +
847                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848    if (tl_nthreads <= 0) {
849      tl_nthreads = 1;
850    }
851
852    // If dyn-var is false, emit a 1-time warning.
853    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854      __kmp_reserve_warn = 1;
855      __kmp_msg(kmp_ms_warning,
856                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858    }
859    if (tl_nthreads == 1) {
860      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861                    "reduced reservation to 1 thread\n",
862                    master_tid));
863      return 1;
864    }
865    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866                  "reservation to %d threads\n",
867                  master_tid, tl_nthreads));
868    new_nthreads = tl_nthreads;
869  }
870
871  // Respect OMP_THREAD_LIMIT
872  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874  if (cg_nthreads + new_nthreads -
875          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876      max_cg_threads) {
877    int tl_nthreads = max_cg_threads - cg_nthreads +
878                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879    if (tl_nthreads <= 0) {
880      tl_nthreads = 1;
881    }
882
883    // If dyn-var is false, emit a 1-time warning.
884    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885      __kmp_reserve_warn = 1;
886      __kmp_msg(kmp_ms_warning,
887                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889    }
890    if (tl_nthreads == 1) {
891      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892                    "reduced reservation to 1 thread\n",
893                    master_tid));
894      return 1;
895    }
896    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897                  "reservation to %d threads\n",
898                  master_tid, tl_nthreads));
899    new_nthreads = tl_nthreads;
900  }
901
902  // Check if the threads array is large enough, or needs expanding.
903  // See comment in __kmp_register_root() about the adjustment if
904  // __kmp_threads[0] == NULL.
905  capacity = __kmp_threads_capacity;
906  if (TCR_PTR(__kmp_threads[0]) == NULL) {
907    --capacity;
908  }
909  if (__kmp_nth + new_nthreads -
910          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911      capacity) {
912    // Expand the threads array.
913    int slotsRequired = __kmp_nth + new_nthreads -
914                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915                        capacity;
916    int slotsAdded = __kmp_expand_threads(slotsRequired);
917    if (slotsAdded < slotsRequired) {
918      // The threads array was not expanded enough.
919      new_nthreads -= (slotsRequired - slotsAdded);
920      KMP_ASSERT(new_nthreads >= 1);
921
922      // If dyn-var is false, emit a 1-time warning.
923      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924        __kmp_reserve_warn = 1;
925        if (__kmp_tp_cached) {
926          __kmp_msg(kmp_ms_warning,
927                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930        } else {
931          __kmp_msg(kmp_ms_warning,
932                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934        }
935      }
936    }
937  }
938
939#ifdef KMP_DEBUG
940  if (new_nthreads == 1) {
941    KC_TRACE(10,
942             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943              "dead roots and rechecking; requested %d threads\n",
944              __kmp_get_gtid(), set_nthreads));
945  } else {
946    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947                  " %d threads\n",
948                  __kmp_get_gtid(), new_nthreads, set_nthreads));
949  }
950#endif // KMP_DEBUG
951  return new_nthreads;
952}
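
/* Illustrative sketch (not compiled): the clamp that __kmp_reserve_threads()
   applies against a global/device thread limit. Threads that the master's
   root already accounts for (1 if the root is active, otherwise the hot
   team's size) are credited back before comparing with the limit. For
   example, with max_nth = 8, nth = 6, an inactive root and a hot team of 4:
   tl = 8 - 6 + 4 = 6, so a request for 10 threads is trimmed to 6. The helper
   below is hypothetical. */
#if 0
static int clamp_to_thread_limit(int requested, int max_nth, int cur_nth,
                                 int root_active, int hot_team_nproc) {
  int credit = root_active ? 1 : hot_team_nproc;
  if (cur_nth + requested - credit > max_nth) {
    int tl = max_nth - cur_nth + credit;
    return tl <= 0 ? 1 : tl; // never report fewer than one thread
  }
  return requested;
}
#endif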
953
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because this was
   checked earlier, inside the forkjoin critical section. */
957static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958                                    kmp_info_t *master_th, int master_gtid) {
959  int i;
960  int use_hot_team;
961
962  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964  KMP_MB();
965
966  /* first, let's setup the master thread */
967  master_th->th.th_info.ds.ds_tid = 0;
968  master_th->th.th_team = team;
969  master_th->th.th_team_nproc = team->t.t_nproc;
970  master_th->th.th_team_master = master_th;
971  master_th->th.th_team_serialized = FALSE;
972  master_th->th.th_dispatch = &team->t.t_dispatch[0];
973
974/* make sure we are not the optimized hot team */
975#if KMP_NESTED_HOT_TEAMS
976  use_hot_team = 0;
977  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978  if (hot_teams) { // hot teams array is not allocated if
979    // KMP_HOT_TEAMS_MAX_LEVEL=0
980    int level = team->t.t_active_level - 1; // index in array of hot teams
981    if (master_th->th.th_teams_microtask) { // are we inside the teams?
982      if (master_th->th.th_teams_size.nteams > 1) {
983        ++level; // level was not increased in teams construct for
984        // team_of_masters
985      }
986      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987          master_th->th.th_teams_level == team->t.t_level) {
988        ++level; // level was not increased in teams construct for
989        // team_of_workers before the parallel
990      } // team->t.t_level will be increased inside parallel
991    }
992    if (level < __kmp_hot_teams_max_level) {
993      if (hot_teams[level].hot_team) {
994        // hot team has already been allocated for given level
995        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996        use_hot_team = 1; // the team is ready to use
997      } else {
998        use_hot_team = 0; // AC: threads are not allocated yet
999        hot_teams[level].hot_team = team; // remember new hot team
1000        hot_teams[level].hot_team_nth = team->t.t_nproc;
1001      }
1002    } else {
1003      use_hot_team = 0;
1004    }
1005  }
1006#else
1007  use_hot_team = team == root->r.r_hot_team;
1008#endif
1009  if (!use_hot_team) {
1010
1011    /* install the master thread */
1012    team->t.t_threads[0] = master_th;
1013    __kmp_initialize_info(master_th, team, 0, master_gtid);
1014
1015    /* now, install the worker threads */
1016    for (i = 1; i < team->t.t_nproc; i++) {
1017
1018      /* fork or reallocate a new thread and install it in team */
1019      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020      team->t.t_threads[i] = thr;
1021      KMP_DEBUG_ASSERT(thr);
1022      KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023      /* align team and thread arrived states */
1024      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029                    team->t.t_bar[bs_plain_barrier].b_arrived));
1030      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031      thr->th.th_teams_level = master_th->th.th_teams_level;
1032      thr->th.th_teams_size = master_th->th.th_teams_size;
1033      { // Initialize threads' barrier data.
1034        int b;
1035        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036        for (b = 0; b < bs_last_barrier; ++b) {
1037          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039#if USE_DEBUGGER
1040          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041#endif
1042        }
1043      }
1044    }
1045
1046#if KMP_AFFINITY_SUPPORTED
1047    __kmp_partition_places(team);
1048#endif
1049  }
1050
1051  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052    for (i = 0; i < team->t.t_nproc; i++) {
1053      kmp_info_t *thr = team->t.t_threads[i];
1054      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055          thr->th.th_prev_level != team->t.t_level) {
1056        team->t.t_display_affinity = 1;
1057        break;
1058      }
1059    }
1060  }
1061
1062  KMP_MB();
1063}
1064
1065#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1069inline static void propagateFPControl(kmp_team_t *team) {
1070  if (__kmp_inherit_fp_control) {
1071    kmp_int16 x87_fpu_control_word;
1072    kmp_uint32 mxcsr;
1073
1074    // Get master values of FPU control flags (both X87 and vector)
1075    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076    __kmp_store_mxcsr(&mxcsr);
1077    mxcsr &= KMP_X86_MXCSR_MASK;
1078
1079    // There is no point looking at t_fp_control_saved here.
1080    // If it is TRUE, we still have to update the values if they are different
1081    // from those we now have. If it is FALSE we didn't save anything yet, but
1082    // our objective is the same. We have to ensure that the values in the team
1083    // are the same as those we have.
1084    // So, this code achieves what we need whether or not t_fp_control_saved is
1085    // true. By checking whether the value needs updating we avoid unnecessary
1086    // writes that would put the cache-line into a written state, causing all
1087    // threads in the team to have to read it again.
1088    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090    // Although we don't use this value, other code in the runtime wants to know
1091    // whether it should restore them. So we must ensure it is correct.
1092    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093  } else {
1094    // Similarly here. Don't write to this cache-line in the team structure
1095    // unless we have to.
1096    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097  }
1098}
1099
1100// Do the opposite, setting the hardware registers to the updated values from
1101// the team.
1102inline static void updateHWFPControl(kmp_team_t *team) {
1103  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed during
    // the parallel region that we are exiting.
1106    kmp_int16 x87_fpu_control_word;
1107    kmp_uint32 mxcsr;
1108    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109    __kmp_store_mxcsr(&mxcsr);
1110    mxcsr &= KMP_X86_MXCSR_MASK;
1111
1112    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113      __kmp_clear_x87_fpu_status_word();
1114      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115    }
1116
1117    if (team->t.t_mxcsr != mxcsr) {
1118      __kmp_load_mxcsr(&team->t.t_mxcsr);
1119    }
1120  }
1121}
1122#else
1123#define propagateFPControl(x) ((void)0)
1124#define updateHWFPControl(x) ((void)0)
1125#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
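
/* Illustrative sketch (not compiled): the check-before-write idiom that
   propagateFPControl()/updateHWFPControl() rely on via KMP_CHECK_UPDATE.
   Writing the shared team cache line only when the value actually changes
   keeps the line in a shared state, so the other threads in the team do not
   have to re-read it. KMP_CHECK_UPDATE is assumed to expand to roughly the
   following. */
#if 0
#define CHECK_UPDATE_SKETCH(dst, src)                                          \
  do {                                                                         \
    if ((dst) != (src))                                                        \
      (dst) = (src); /* dirty the cache line only on a real change */          \
  } while (0)
#endif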
1126
1127static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128                                     int realloc); // forward declaration
1129
/* Run a parallel region that has been serialized, so it runs in a team of only
   the single master thread. */
1132void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133  kmp_info_t *this_thr;
1134  kmp_team_t *serial_team;
1135
1136  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137
1138  /* Skip all this code for autopar serialized loops since it results in
1139     unacceptable overhead */
1140  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141    return;
1142
1143  if (!TCR_4(__kmp_init_parallel))
1144    __kmp_parallel_initialize();
1145  __kmp_resume_if_soft_paused();
1146
1147  this_thr = __kmp_threads[global_tid];
1148  serial_team = this_thr->th.th_serial_team;
1149
1150  /* utilize the serialized team held by this thread */
1151  KMP_DEBUG_ASSERT(serial_team);
1152  KMP_MB();
1153
1154  if (__kmp_tasking_mode != tskm_immediate_exec) {
1155    KMP_DEBUG_ASSERT(
1156        this_thr->th.th_task_team ==
1157        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159                     NULL);
1160    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161                  "team %p, new task_team = NULL\n",
1162                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163    this_thr->th.th_task_team = NULL;
1164  }
1165
1166  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168    proc_bind = proc_bind_false;
1169  } else if (proc_bind == proc_bind_default) {
1170    // No proc_bind clause was specified, so use the current value
1171    // of proc-bind-var for this parallel region.
1172    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173  }
1174  // Reset for next parallel region
1175  this_thr->th.th_set_proc_bind = proc_bind_default;
1176
1177#if OMPT_SUPPORT
1178  ompt_data_t ompt_parallel_data = ompt_data_none;
1179  ompt_data_t *implicit_task_data;
1180  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181  if (ompt_enabled.enabled &&
1182      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183
1184    ompt_task_info_t *parent_task_info;
1185    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186
1187    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188    if (ompt_enabled.ompt_callback_parallel_begin) {
1189      int team_size = 1;
1190
1191      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192          &(parent_task_info->task_data), &(parent_task_info->frame),
1193          &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1194          codeptr);
1195    }
1196  }
1197#endif // OMPT_SUPPORT
1198
1199  if (this_thr->th.th_team != serial_team) {
1200    // Nested level will be an index in the nested nthreads array
1201    int level = this_thr->th.th_team->t.t_level;
1202
1203    if (serial_team->t.t_serialized) {
1204      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1206      kmp_team_t *new_team;
1207
1208      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209
1210      new_team =
1211          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212#if OMPT_SUPPORT
1213                              ompt_parallel_data,
1214#endif
1215                              proc_bind, &this_thr->th.th_current_task->td_icvs,
1216                              0 USE_NESTED_HOT_ARG(NULL));
1217      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218      KMP_ASSERT(new_team);
1219
1220      /* setup new serialized team and install it */
1221      new_team->t.t_threads[0] = this_thr;
1222      new_team->t.t_parent = this_thr->th.th_team;
1223      serial_team = new_team;
1224      this_thr->th.th_serial_team = serial_team;
1225
1226      KF_TRACE(
1227          10,
1228          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229           global_tid, serial_team));
1230
1231      /* TODO the above breaks the requirement that if we run out of resources,
1232         then we can still guarantee that serialized teams are ok, since we may
1233         need to allocate a new one */
1234    } else {
1235      KF_TRACE(
1236          10,
1237          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238           global_tid, serial_team));
1239    }
1240
1241    /* we have to initialize this serial team */
1242    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245    serial_team->t.t_ident = loc;
1246    serial_team->t.t_serialized = 1;
1247    serial_team->t.t_nproc = 1;
1248    serial_team->t.t_parent = this_thr->th.th_team;
1249    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250    this_thr->th.th_team = serial_team;
1251    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1254                  this_thr->th.th_current_task));
1255    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256    this_thr->th.th_current_task->td_flags.executing = 0;
1257
1258    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259
1260    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261       implicit task for each serialized task represented by
1262       team->t.t_serialized? */
1263    copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264              &this_thr->th.th_current_task->td_parent->td_icvs);
1265
1266    // Thread value exists in the nested nthreads array for the next nested
1267    // level
1268    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269      this_thr->th.th_current_task->td_icvs.nproc =
1270          __kmp_nested_nth.nth[level + 1];
1271    }
1272
1273    if (__kmp_nested_proc_bind.used &&
1274        (level + 1 < __kmp_nested_proc_bind.used)) {
1275      this_thr->th.th_current_task->td_icvs.proc_bind =
1276          __kmp_nested_proc_bind.bind_types[level + 1];
1277    }
1278
1279#if USE_DEBUGGER
1280    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281#endif
1282    this_thr->th.th_info.ds.ds_tid = 0;
1283
1284    /* set thread cache values */
1285    this_thr->th.th_team_nproc = 1;
1286    this_thr->th.th_team_master = this_thr;
1287    this_thr->th.th_team_serialized = 1;
1288
1289    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292
1293    propagateFPControl(serial_team);
1294
1295    /* check if we need to allocate dispatch buffers stack */
1296    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297    if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298      serial_team->t.t_dispatch->th_disp_buffer =
1299          (dispatch_private_info_t *)__kmp_allocate(
1300              sizeof(dispatch_private_info_t));
1301    }
1302    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303
1304    KMP_MB();
1305
1306  } else {
1307    /* this serialized team is already being used,
1308     * that's fine, just add another nested level */
1309    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312    ++serial_team->t.t_serialized;
1313    this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314
1315    // Nested level will be an index in the nested nthreads array
1316    int level = this_thr->th.th_team->t.t_level;
1317    // Thread value exists in the nested nthreads array for the next nested
1318    // level
1319    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320      this_thr->th.th_current_task->td_icvs.nproc =
1321          __kmp_nested_nth.nth[level + 1];
1322    }
1323    serial_team->t.t_level++;
1324    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325                  "of serial team %p to %d\n",
1326                  global_tid, serial_team, serial_team->t.t_level));
1327
1328    /* allocate/push dispatch buffers stack */
1329    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330    {
1331      dispatch_private_info_t *disp_buffer =
1332          (dispatch_private_info_t *)__kmp_allocate(
1333              sizeof(dispatch_private_info_t));
1334      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336    }
1337    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338
1339    KMP_MB();
1340  }
1341  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342
1343  // Perform the display affinity functionality for
1344  // serialized parallel regions
1345  if (__kmp_display_affinity) {
1346    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347        this_thr->th.th_prev_num_threads != 1) {
1348      // NULL means use the affinity-format-var ICV
1349      __kmp_aux_display_affinity(global_tid, NULL);
1350      this_thr->th.th_prev_level = serial_team->t.t_level;
1351      this_thr->th.th_prev_num_threads = 1;
1352    }
1353  }
1354
1355  if (__kmp_env_consistency_check)
1356    __kmp_push_parallel(global_tid, NULL);
1357#if OMPT_SUPPORT
1358  serial_team->t.ompt_team_info.master_return_address = codeptr;
1359  if (ompt_enabled.enabled &&
1360      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362
1363    ompt_lw_taskteam_t lw_taskteam;
1364    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365                            &ompt_parallel_data, codeptr);
1366
1367    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its content was swapped.
1369
1370    /* OMPT implicit task begin */
1371    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372    if (ompt_enabled.ompt_callback_implicit_task) {
1373      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
1378    }
1379
1380    /* OMPT state */
1381    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383  }
1384#endif
1385}
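
/* Illustrative sketch (not compiled): how nesting of serialized parallel
   regions is tracked above. The first serialized region installs (or
   allocates) the thread's serial team and sets t_serialized = 1; each further
   nested serialized region just bumps that counter, raises the level, and
   pushes another dispatch buffer, which the matching end of the region later
   pops. The structure below is a hypothetical reduction of that bookkeeping. */
#if 0
struct serial_nesting {
  int serialized_depth; // mirrors serial_team->t.t_serialized
  int level;            // mirrors serial_team->t.t_level
};

static void enter_serialized(struct serial_nesting *s) {
  if (s->serialized_depth == 0)
    s->serialized_depth = 1; // first (outermost) serialized region
  else
    ++s->serialized_depth;   // nested serialized region: just count it
  ++s->level;
}
#endif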
1386
1387/* most of the work for a fork */
1388/* return true if we really went parallel, false if serialized */
1389int __kmp_fork_call(ident_t *loc, int gtid,
1390                    enum fork_context_e call_context, // Intel, GNU, ...
1391                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392/* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394                    va_list *ap
1395#else
1396                    va_list ap
1397#endif
1398                    ) {
1399  void **argv;
1400  int i;
1401  int master_tid;
1402  int master_this_cons;
1403  kmp_team_t *team;
1404  kmp_team_t *parent_team;
1405  kmp_info_t *master_th;
1406  kmp_root_t *root;
1407  int nthreads;
1408  int master_active;
1409  int master_set_numthreads;
1410  int level;
1411  int active_level;
1412  int teams_level;
1413#if KMP_NESTED_HOT_TEAMS
1414  kmp_hot_team_ptr_t **p_hot_teams;
1415#endif
1416  { // KMP_TIME_BLOCK
1417    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1418    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1419
1420    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1421    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1422      /* Some systems prefer the stack for the root thread(s) to start with */
1423      /* some gap from the parent stack to prevent false sharing. */
1424      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1425      /* These 2 lines below are so this does not get optimized out */
1426      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1427        __kmp_stkpadding += (short)((kmp_int64)dummy);
1428    }
1429
1430    /* initialize if needed */
1431    KMP_DEBUG_ASSERT(
1432        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1433    if (!TCR_4(__kmp_init_parallel))
1434      __kmp_parallel_initialize();
1435    __kmp_resume_if_soft_paused();
1436
1437    /* setup current data */
1438    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1439    // shutdown
1440    parent_team = master_th->th.th_team;
1441    master_tid = master_th->th.th_info.ds.ds_tid;
1442    master_this_cons = master_th->th.th_local.this_construct;
1443    root = master_th->th.th_root;
1444    master_active = root->r.r_active;
1445    master_set_numthreads = master_th->th.th_set_nproc;
1446
1447#if OMPT_SUPPORT
1448    ompt_data_t ompt_parallel_data = ompt_data_none;
1449    ompt_data_t *parent_task_data;
1450    ompt_frame_t *ompt_frame;
1451    ompt_data_t *implicit_task_data;
1452    void *return_address = NULL;
1453
1454    if (ompt_enabled.enabled) {
1455      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1456                                    NULL, NULL);
1457      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1458    }
1459#endif
1460
1461    // Nested level will be an index in the nested nthreads array
1462    level = parent_team->t.t_level;
1463    // used to launch non-serial teams even if nesting is not allowed
1464    active_level = parent_team->t.t_active_level;
1465    // needed to check nesting inside the teams
1466    teams_level = master_th->th.th_teams_level;
1467#if KMP_NESTED_HOT_TEAMS
1468    p_hot_teams = &master_th->th.th_hot_teams;
1469    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1470      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1471          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473      // it is either the actual hot team or not needed (when active_level > 0)
1474      (*p_hot_teams)[0].hot_team_nth = 1;
1475    }
1476#endif
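    // A hot team is kept alive between parallel regions so its threads need not
    // be forked again. When nested hot teams are enabled, th_hot_teams is a
    // lazily allocated per-thread array with one slot per nesting level up to
    // __kmp_hot_teams_max_level; slot 0 records the root's hot team.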
1477
1478#if OMPT_SUPPORT
1479    if (ompt_enabled.enabled) {
1480      if (ompt_enabled.ompt_callback_parallel_begin) {
1481        int team_size = master_set_numthreads
1482                            ? master_set_numthreads
1483                            : get__nproc_2(parent_team, master_tid);
1484        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1485            parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1486            OMPT_INVOKER(call_context), return_address);
1487      }
1488      master_th->th.ompt_thread_info.state = ompt_state_overhead;
1489    }
1490#endif
1491
1492    master_th->th.th_ident = loc;
1493
1494    if (master_th->th.th_teams_microtask && ap &&
1495        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1496      // AC: Start of a parallel region nested inside a teams construct.
1497      // The team is actual (hot); all workers are waiting at the fork barrier.
1498      // No lock is needed to initialize the team a bit, then release the workers.
1499      parent_team->t.t_ident = loc;
1500      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1501      parent_team->t.t_argc = argc;
1502      argv = (void **)parent_team->t.t_argv;
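      // Copy the outlined-function arguments out of the va_list. On the Linux
      // targets listed in the workaround above the va_list is passed by pointer,
      // so arguments are read through *ap; elsewhere it is passed by value.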
1503      for (i = argc - 1; i >= 0; --i)
1504/* TODO: revert workaround for Intel(R) 64 tracker #96 */
1505#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1506        *argv++ = va_arg(*ap, void *);
1507#else
1508        *argv++ = va_arg(ap, void *);
1509#endif
1510      // Increment our nested depth levels, but do not increase serialization
1511      if (parent_team == master_th->th.th_serial_team) {
1512        // AC: we are in serialized parallel
1513        __kmpc_serialized_parallel(loc, gtid);
1514        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1515        // AC: needed so that the enquiry functions work
1516        // correctly; will be restored at join time
1517        parent_team->t.t_serialized--;
1518#if OMPT_SUPPORT
1519        void *dummy;
1520        void **exit_runtime_p;
1521
1522        ompt_lw_taskteam_t lw_taskteam;
1523
1524        if (ompt_enabled.enabled) {
1525          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1526                                  &ompt_parallel_data, return_address);
1527          exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1528
1529          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1530          // don't use lw_taskteam after linking; its content was swapped
1531
1532          /* OMPT implicit task begin */
1533          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1534          if (ompt_enabled.ompt_callback_implicit_task) {
1535            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1536                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1537                implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1538            OMPT_CUR_TASK_INFO(master_th)
1539                ->thread_num = __kmp_tid_from_gtid(gtid);
1540          }
1541
1542          /* OMPT state */
1543          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1544        } else {
1545          exit_runtime_p = &dummy;
1546        }
1547#endif
1548
1549        {
1550          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1551          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1552          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1553#if OMPT_SUPPORT
1554                                 ,
1555                                 exit_runtime_p
1556#endif
1557                                 );
1558        }
1559
1560#if OMPT_SUPPORT
1561        *exit_runtime_p = NULL;
1562        if (ompt_enabled.enabled) {
1563          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1564          if (ompt_enabled.ompt_callback_implicit_task) {
1565            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1566                ompt_scope_end, NULL, implicit_task_data, 1,
1567                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1568          }
1569          __ompt_lw_taskteam_unlink(master_th);
1570
1571          if (ompt_enabled.ompt_callback_parallel_end) {
1572            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1573                OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1574                OMPT_INVOKER(call_context), return_address);
1575          }
1576          master_th->th.ompt_thread_info.state = ompt_state_overhead;
1577        }
1578#endif
1579        return TRUE;
1580      }
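      // Not serialized: the parallel nested in teams reuses the parent (hot)
      // team directly. Point the parent team at the new microtask, bump its
      // nesting levels, optionally shrink t_nproc per the num_threads clause,
      // then release the waiting workers via __kmp_internal_fork and invoke the
      // microtask on the master.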
1581
1582      parent_team->t.t_pkfn = microtask;
1583      parent_team->t.t_invoke = invoker;
1584      KMP_ATOMIC_INC(&root->r.r_in_parallel);
1585      parent_team->t.t_active_level++;
1586      parent_team->t.t_level++;
1587      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1588
1589      /* Change number of threads in the team if requested */
1590      if (master_set_numthreads) { // The parallel has num_threads clause
1591        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1592          // AC: can only reduce the number of threads dynamically, cannot increase
1593          kmp_info_t **other_threads = parent_team->t.t_threads;
1594          parent_team->t.t_nproc = master_set_numthreads;
1595          for (i = 0; i < master_set_numthreads; ++i) {
1596            other_threads[i]->th.th_team_nproc = master_set_numthreads;
1597          }
1598          // Keep extra threads hot in the team for possible next parallels
1599        }
1600        master_th->th.th_set_nproc = 0;
1601      }
1602
1603#if USE_DEBUGGER
1604      if (__kmp_debugging) { // Let debugger override number of threads.
1605        int nth = __kmp_omp_num_threads(loc);
1606        if (nth > 0) { // 0 means debugger doesn't want to change num threads
1607          master_set_numthreads = nth;
1608        }
1609      }
1610#endif
1611
1612      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1613                    "master_th=%p, gtid=%d\n",
1614                    root, parent_team, master_th, gtid));
1615      __kmp_internal_fork(loc, gtid, parent_team);
1616      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1617                    "master_th=%p, gtid=%d\n",
1618                    root, parent_team, master_th, gtid));
1619
1620      /* Invoke microtask for MASTER thread */
1621      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1622                    parent_team->t.t_id, parent_team->t.t_pkfn));
1623
1624      if (!parent_team->t.t_invoke(gtid)) {
1625        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1626      }
1627      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1628                    parent_team->t.t_id, parent_team->t.t_pkfn));
1629      KMP_MB(); /* Flush all pending memory write invalidates.  */
1630
1631      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1632
1633      return TRUE;
1634    } // Parallel closely nested in teams construct
1635
1636#if KMP_DEBUG
1637    if (__kmp_tasking_mode != tskm_immediate_exec) {
1638      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1639                       parent_team->t.t_task_team[master_th->th.th_task_state]);
1640    }
1641#endif
1642
1643    if (parent_team->t.t_active_level >=
1644        master_th->th.th_current_task->td_icvs.max_active_levels) {
1645      nthreads = 1;
1646    } else {
1647      int enter_teams = ((ap == NULL && active_level == 0) ||
1648                         (ap && teams_level > 0 && teams_level == level));
1649      nthreads =
1650          master_set_numthreads
1651              ? master_set_numthreads
1652              : get__nproc_2(
1653                    parent_team,
1654                    master_tid); // TODO: get nproc directly from current task
1655
1656      // Check whether we need to take the forkjoin lock (no need for a
1657      // serialized parallel outside of a teams construct). This code was moved
1658      // here from __kmp_reserve_threads() to speed up nested serialized parallels.
1659      if (nthreads > 1) {
1660        if ((get__max_active_levels(master_th) == 1 &&
1661             (root->r.r_in_parallel && !enter_teams)) ||
1662            (__kmp_library == library_serial)) {
1663          KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1664                        " threads\n",
1665                        gtid, nthreads));
1666          nthreads = 1;
1667        }
1668      }
1669      if (nthreads > 1) {
1670        /* determine how many new threads we can use */
1671        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1672        /* AC: If we execute teams from a parallel region (on the host), then the
1673           teams should be created, but each can have only 1 thread if nesting is
1674           disabled. If teams is called from a serial region, then the teams and
1675           their threads should be created regardless of the nesting setting. */
1676        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1677                                         nthreads, enter_teams);
1678        if (nthreads == 1) {
1679          // Free the lock for single-thread execution here; for multi-thread
1680          // execution it will be freed later, after the team of threads has
1681          // been created and initialized
1682          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1683        }
1684      }
1685    }
1686    KMP_DEBUG_ASSERT(nthreads > 0);
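    // At this point nthreads is the team size that will actually be used: 1 if
    // the active-levels limit was hit, the library is serial, or
    // __kmp_reserve_threads could not grant more; for nthreads > 1 the forkjoin
    // lock is still held and is only released after the team is set up below.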
1687
1688    // If we temporarily changed the set number of threads then restore it now
1689    master_th->th.th_set_nproc = 0;
1690
1691    /* create a serialized parallel region? */
1692    if (nthreads == 1) {
1693/* josh todo: hypothetical question: what do we do for OS X*? */
1694#if KMP_OS_LINUX &&                                                            \
1695    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1696      void *args[argc];
1697#else
1698      void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1699#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1700          KMP_ARCH_AARCH64) */
1701
1702      KA_TRACE(20,
1703               ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1704
1705      __kmpc_serialized_parallel(loc, gtid);
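      // Serialized execution. For the Intel entry point there are three cases:
      // a parallel launched from the teams master (ap == NULL) takes its
      // arguments from the parent team, a serialized teams construct
      // (microtask == __kmp_teams_master) goes through the special invoker, and
      // any other parallel copies its varargs to a stack array and invokes the
      // microtask inline. A GNU caller returns FALSE so the caller runs the body.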
1706
1707      if (call_context == fork_context_intel) {
1708        /* TODO this sucks, use the compiler itself to pass args! :) */
1709        master_th->th.th_serial_team->t.t_ident = loc;
1710        if (!ap) {
1711          // revert change made in __kmpc_serialized_parallel()
1712          master_th->th.th_serial_team->t.t_level--;
1713// Get args from parent team for teams construct
1714
1715#if OMPT_SUPPORT
1716          void *dummy;
1717          void **exit_runtime_p;
1718          ompt_task_info_t *task_info;
1719
1720          ompt_lw_taskteam_t lw_taskteam;
1721
1722          if (ompt_enabled.enabled) {
1723            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1724                                    &ompt_parallel_data, return_address);
1725
1726            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1727            // don't use lw_taskteam after linking; its content was swapped
1728
1729            task_info = OMPT_CUR_TASK_INFO(master_th);
1730            exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1731            if (ompt_enabled.ompt_callback_implicit_task) {
1732              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1733                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1734                  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1735              OMPT_CUR_TASK_INFO(master_th)
1736                  ->thread_num = __kmp_tid_from_gtid(gtid);
1737            }
1738
1739            /* OMPT state */
1740            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1741          } else {
1742            exit_runtime_p = &dummy;
1743          }
1744#endif
1745
1746          {
1747            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1748            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1749            __kmp_invoke_microtask(microtask, gtid, 0, argc,
1750                                   parent_team->t.t_argv
1751#if OMPT_SUPPORT
1752                                   ,
1753                                   exit_runtime_p
1754#endif
1755                                   );
1756          }
1757
1758#if OMPT_SUPPORT
1759          if (ompt_enabled.enabled) {
1760            exit_runtime_p = NULL;
1761            if (ompt_enabled.ompt_callback_implicit_task) {
1762              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763                  ompt_scope_end, NULL, &(task_info->task_data), 1,
1764                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1765            }
1766
1767            __ompt_lw_taskteam_unlink(master_th);
1768            if (ompt_enabled.ompt_callback_parallel_end) {
1769              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1770                  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1771                  OMPT_INVOKER(call_context), return_address);
1772            }
1773            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774          }
1775#endif
1776        } else if (microtask == (microtask_t)__kmp_teams_master) {
1777          KMP_DEBUG_ASSERT(master_th->th.th_team ==
1778                           master_th->th.th_serial_team);
1779          team = master_th->th.th_team;
1780          // team->t.t_pkfn = microtask;
1781          team->t.t_invoke = invoker;
1782          __kmp_alloc_argv_entries(argc, team, TRUE);
1783          team->t.t_argc = argc;
1784          argv = (void **)team->t.t_argv;
1785          if (ap) {
1786            for (i = argc - 1; i >= 0; --i)
1787// TODO: revert workaround for Intel(R) 64 tracker #96
1788#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1789              *argv++ = va_arg(*ap, void *);
1790#else
1791              *argv++ = va_arg(ap, void *);
1792#endif
1793          } else {
1794            for (i = 0; i < argc; ++i)
1795              // Get args from parent team for teams construct
1796              argv[i] = parent_team->t.t_argv[i];
1797          }
1798          // AC: revert change made in __kmpc_serialized_parallel()
1799          //     because initial code in teams should have level=0
1800          team->t.t_level--;
1801          // AC: call special invoker for outer "parallel" of teams construct
1802          invoker(gtid);
1803        } else {
1804          argv = args;
1805          for (i = argc - 1; i >= 0; --i)
1806// TODO: revert workaround for Intel(R) 64 tracker #96
1807#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1808            *argv++ = va_arg(*ap, void *);
1809#else
1810            *argv++ = va_arg(ap, void *);
1811#endif
1812          KMP_MB();
1813
1814#if OMPT_SUPPORT
1815          void *dummy;
1816          void **exit_runtime_p;
1817          ompt_task_info_t *task_info;
1818
1819          ompt_lw_taskteam_t lw_taskteam;
1820
1821          if (ompt_enabled.enabled) {
1822            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1823                                    &ompt_parallel_data, return_address);
1824            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1825            // don't use lw_taskteam after linking; its content was swapped
1826            task_info = OMPT_CUR_TASK_INFO(master_th);
1827            exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1828
1829            /* OMPT implicit task begin */
1830            implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1831            if (ompt_enabled.ompt_callback_implicit_task) {
1832              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1833                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1834                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1835              OMPT_CUR_TASK_INFO(master_th)
1836                  ->thread_num = __kmp_tid_from_gtid(gtid);
1837            }
1838
1839            /* OMPT state */
1840            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1841          } else {
1842            exit_runtime_p = &dummy;
1843          }
1844#endif
1845
1846          {
1847            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1848            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1849            __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1850#if OMPT_SUPPORT
1851                                   ,
1852                                   exit_runtime_p
1853#endif
1854                                   );
1855          }
1856
1857#if OMPT_SUPPORT
1858          if (ompt_enabled.enabled) {
1859            *exit_runtime_p = NULL;
1860            if (ompt_enabled.ompt_callback_implicit_task) {
1861              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1862                  ompt_scope_end, NULL, &(task_info->task_data), 1,
1863                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1864            }
1865
1866            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1867            __ompt_lw_taskteam_unlink(master_th);
1868            if (ompt_enabled.ompt_callback_parallel_end) {
1869              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1870                  &ompt_parallel_data, parent_task_data,
1871                  OMPT_INVOKER(call_context), return_address);
1872            }
1873            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874          }
1875#endif
1876        }
1877      } else if (call_context == fork_context_gnu) {
1878#if OMPT_SUPPORT
1879        ompt_lw_taskteam_t lwt;
1880        __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1881                                return_address);
1882
1883        lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1884        __ompt_lw_taskteam_link(&lwt, master_th, 1);
1885// don't use lw_taskteam after linking; its content was swapped
1886#endif
1887
1888        // we were called from GNU native code
1889        KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1890        return FALSE;
1891      } else {
1892        KMP_ASSERT2(call_context < fork_context_last,
1893                    "__kmp_fork_call: unknown fork_context parameter");
1894      }
1895
1896      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1897      KMP_MB();
1898      return FALSE;
1899    } // if (nthreads == 1)
1900
1901    // GEH: only modify the executing flag in the case when not serialized;
1902    //      the serialized case is handled in __kmpc_serialized_parallel
1903    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1904                  "curtask=%p, curtask_max_aclevel=%d\n",
1905                  parent_team->t.t_active_level, master_th,
1906                  master_th->th.th_current_task,
1907                  master_th->th.th_current_task->td_icvs.max_active_levels));
1908    // TODO: GEH - cannot do this assertion because root thread not set up as
1909    // executing
1910    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1911    master_th->th.th_current_task->td_flags.executing = 0;
1912
1913    if (!master_th->th.th_teams_microtask || level > teams_level) {
1914      /* Increment our nested depth level */
1915      KMP_ATOMIC_INC(&root->r.r_in_parallel);
1916    }
1917
1918    // See if we need to make a copy of the ICVs.
1919    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1920    if ((level + 1 < __kmp_nested_nth.used) &&
1921        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1922      nthreads_icv = __kmp_nested_nth.nth[level + 1];
1923    } else {
1924      nthreads_icv = 0; // don't update
1925    }
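    // e.g. with OMP_NUM_THREADS="4,3" the nested-nth table holds {4, 3}; a
    // level-0 parallel then primes nproc=3 as the ICV its children inherit,
    // while nthreads_icv stays 0 (no update) if the next level's value already
    // matches the current nproc ICV.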
1926
1927    // Figure out the proc_bind_policy for the new team.
1928    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1929    kmp_proc_bind_t proc_bind_icv =
1930        proc_bind_default; // proc_bind_default means don't update
1931    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1932      proc_bind = proc_bind_false;
1933    } else {
1934      if (proc_bind == proc_bind_default) {
1935        // No proc_bind clause specified; use current proc-bind-var for this
1936        // parallel region
1937        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1938      }
1939      /* else: The proc_bind policy was specified explicitly on parallel clause.
1940         This overrides proc-bind-var for this parallel region, but does not
1941         change proc-bind-var. */
1942      // Figure the value of proc-bind-var for the child threads.
1943      if ((level + 1 < __kmp_nested_proc_bind.used) &&
1944          (__kmp_nested_proc_bind.bind_types[level + 1] !=
1945           master_th->th.th_current_task->td_icvs.proc_bind)) {
1946        proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1947      }
1948    }
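    // Net effect: proc-bind-var == false disables binding outright; otherwise an
    // explicit proc_bind clause overrides proc-bind-var for this region only, and
    // the nested OMP_PROC_BIND list (when it differs) supplies the proc-bind-var
    // that the child threads will inherit via proc_bind_icv.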
1949
1950    // Reset for next parallel region
1951    master_th->th.th_set_proc_bind = proc_bind_default;
1952
1953    if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1954      kmp_internal_control_t new_icvs;
1955      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1956      new_icvs.next = NULL;
1957      if (nthreads_icv > 0) {
1958        new_icvs.nproc = nthreads_icv;
1959      }
1960      if (proc_bind_icv != proc_bind_default) {
1961        new_icvs.proc_bind = proc_bind_icv;
1962      }
1963
1964      /* allocate a new parallel team */
1965      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1966      team = __kmp_allocate_team(root, nthreads, nthreads,
1967#if OMPT_SUPPORT
1968                                 ompt_parallel_data,
1969#endif
1970                                 proc_bind, &new_icvs,
1971                                 argc USE_NESTED_HOT_ARG(master_th));
1972    } else {
1973      /* allocate a new parallel team */
1974      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1975      team = __kmp_allocate_team(root, nthreads, nthreads,
1976#if OMPT_SUPPORT
1977                                 ompt_parallel_data,
1978#endif
1979                                 proc_bind,
1980                                 &master_th->th.th_current_task->td_icvs,
1981                                 argc USE_NESTED_HOT_ARG(master_th));
1982    }
1983    KF_TRACE(
1984        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
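    // KMP_CHECK_UPDATE only stores when the value actually changes, so reusing a
    // hot team does not dirty cache lines that the workers may still be reading.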
1985
1986    /* setup the new team */
1987    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1988    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1989    KMP_CHECK_UPDATE(team->t.t_ident, loc);
1990    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1991    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1992#if OMPT_SUPPORT
1993    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1994                          return_address);
1995#endif
1996    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
1997    // TODO: parent_team->t.t_level == INT_MAX ???
1998    if (!master_th->th.th_teams_microtask || level > teams_level) {
1999      int new_level = parent_team->t.t_level + 1;
2000      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2001      new_level = parent_team->t.t_active_level + 1;
2002      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2003    } else {
2004      // AC: Do not increase parallel level at start of the teams construct
2005      int new_level = parent_team->t.t_level;
2006      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2007      new_level = parent_team->t.t_active_level;
2008      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2009    }
2010    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2011    // set master's schedule as new run-time schedule
2012    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2013
2014    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2015    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2016
2017    // Update the floating point rounding in the team if required.
2018    propagateFPControl(team);
2019
2020    if (__kmp_tasking_mode != tskm_immediate_exec) {
2021      // Set master's task team to the new team's task team. Unless this is a
2022      // hot team, it should be NULL.
2023      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2024                       parent_team->t.t_task_team[master_th->th.th_task_state]);
2025      KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2026                    "%p, new task_team %p / team %p\n",
2027                    __kmp_gtid_from_thread(master_th),
2028                    master_th->th.th_task_team, parent_team,
2029                    team->t.t_task_team[master_th->th.th_task_state], team));
2030
2031      if (active_level || master_th->th.th_task_team) {
2032        // Save master's task_state on the memo stack
2033        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2034        if (master_th->th.th_task_state_top >=
2035            master_th->th.th_task_state_stack_sz) { // increase size
2036          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2037          kmp_uint8 *old_stack, *new_stack;
2038          kmp_uint32 i;
2039          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2040          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2041            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2042          }
2043          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2044               ++i) { // zero-init rest of stack
2045            new_stack[i] = 0;
2046          }
2047          old_stack = master_th->th.th_task_state_memo_stack;
2048          master_th->th.th_task_state_memo_stack = new_stack;
2049          master_th->th.th_task_state_stack_sz = new_size;
2050          __kmp_free(old_stack);
2051        }
2052        // Store master's task_state on stack
2053        master_th->th
2054            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2055            master_th->th.th_task_state;
2056        master_th->th.th_task_state_top++;
2057#if KMP_NESTED_HOT_TEAMS
2058        if (master_th->th.th_hot_teams &&
2059            active_level < __kmp_hot_teams_max_level &&
2060            team == master_th->th.th_hot_teams[active_level].hot_team) {
2061          // Restore master's nested state if nested hot team
2062          master_th->th.th_task_state =
2063              master_th->th
2064                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2065        } else {
2066#endif
2067          master_th->th.th_task_state = 0;
2068#if KMP_NESTED_HOT_TEAMS
2069        }
2070#endif
2071      }
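      // The master's th_task_state is saved on th_task_state_memo_stack (grown
      // geometrically above when full) before entering the new region; the
      // matching pop happens in __kmp_join_call when the region ends.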
2072#if !KMP_NESTED_HOT_TEAMS
2073      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2074                       (team == root->r.r_hot_team));
2075#endif
2076    }
2077
2078    KA_TRACE(
2079        20,
2080        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2081         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2082         team->t.t_nproc));
2083    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2084                     (team->t.t_master_tid == 0 &&
2085                      (team->t.t_parent == root->r.r_root_team ||
2086                       team->t.t_parent->t.t_serialized)));
2087    KMP_MB();
2088
2089    /* now, setup the arguments */
2090    argv = (void **)team->t.t_argv;
2091    if (ap) {
2092      for (i = argc - 1; i >= 0; --i) {
2093// TODO: revert workaround for Intel(R) 64 tracker #96
2094#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2095        void *new_argv = va_arg(*ap, void *);
2096#else
2097        void *new_argv = va_arg(ap, void *);
2098#endif
2099        KMP_CHECK_UPDATE(*argv, new_argv);
2100        argv++;
2101      }
2102    } else {
2103      for (i = 0; i < argc; ++i) {
2104        // Get args from parent team for teams construct
2105        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2106      }
2107    }
2108
2109    /* now actually fork the threads */
2110    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2111    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2112      root->r.r_active = TRUE;
2113
2114    __kmp_fork_team_threads(root, team, master_th, gtid);
2115    __kmp_setup_icv_copy(team, nthreads,
2116                         &master_th->th.th_current_task->td_icvs, loc);
2117
2118#if OMPT_SUPPORT
2119    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2120#endif
2121
2122    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2123
2124#if USE_ITT_BUILD
2125    if (team->t.t_active_level == 1 // only report frames at level 1
2126        && !master_th->th.th_teams_microtask) { // not in teams construct
2127#if USE_ITT_NOTIFY
2128      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2129          (__kmp_forkjoin_frames_mode == 3 ||
2130           __kmp_forkjoin_frames_mode == 1)) {
2131        kmp_uint64 tmp_time = 0;
2132        if (__itt_get_timestamp_ptr)
2133          tmp_time = __itt_get_timestamp();
2134        // Internal fork - report frame begin
2135        master_th->th.th_frame_time = tmp_time;
2136        if (__kmp_forkjoin_frames_mode == 3)
2137          team->t.t_region_time = tmp_time;
2138      } else
2139// only one notification scheme (either "submit" or "forking/joined", not both)
2140#endif /* USE_ITT_NOTIFY */
2141          if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2142              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2143        // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2144        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2145      }
2146    }
2147#endif /* USE_ITT_BUILD */
2148
2149    /* now go on and do the work */
2150    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2151    KMP_MB();
2152    KF_TRACE(10,
2153             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2154              root, team, master_th, gtid));
2155
2156#if USE_ITT_BUILD
2157    if (__itt_stack_caller_create_ptr) {
2158      team->t.t_stack_id =
2159          __kmp_itt_stack_caller_create(); // create new stack stitching id
2160      // before entering fork barrier
2161    }
2162#endif /* USE_ITT_BUILD */
2163
2164    // AC: skip __kmp_internal_fork for the teams construct; let only the
2165    // master threads execute
2166    if (ap) {
2167      __kmp_internal_fork(loc, gtid, team);
2168      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2169                    "master_th=%p, gtid=%d\n",
2170                    root, team, master_th, gtid));
2171    }
2172
2173    if (call_context == fork_context_gnu) {
2174      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2175      return TRUE;
2176    }
2177
2178    /* Invoke microtask for MASTER thread */
2179    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2180                  team->t.t_id, team->t.t_pkfn));
2181  } // END of timer KMP_fork_call block
2182
2183#if KMP_STATS_ENABLED
2184  // If beginning a teams construct, then change thread state
2185  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2186  if (!ap) {
2187    KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2188  }
2189#endif
2190
2191  if (!team->t.t_invoke(gtid)) {
2192    KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2193  }
2194
2195#if KMP_STATS_ENABLED
2196  // If was beginning of a teams construct, then reset thread state
2197  if (!ap) {
2198    KMP_SET_THREAD_STATE(previous_state);
2199  }
2200#endif
2201
2202  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2203                team->t.t_id, team->t.t_pkfn));
2204  KMP_MB(); /* Flush all pending memory write invalidates.  */
2205
2206  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2207
2208#if OMPT_SUPPORT
2209  if (ompt_enabled.enabled) {
2210    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2211  }
2212#endif
2213
2214  return TRUE;
2215}
2216
2217#if OMPT_SUPPORT
2218static inline void __kmp_join_restore_state(kmp_info_t *thread,
2219                                            kmp_team_t *team) {
2220  // restore state outside the region
2221  thread->th.ompt_thread_info.state =
2222      ((team->t.t_serialized) ? ompt_state_work_serial
2223                              : ompt_state_work_parallel);
2224}
2225
2226static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2227                                   kmp_team_t *team, ompt_data_t *parallel_data,
2228                                   fork_context_e fork_context, void *codeptr) {
2229  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2230  if (ompt_enabled.ompt_callback_parallel_end) {
2231    ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2232        parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2233        codeptr);
2234  }
2235
2236  task_info->frame.enter_frame = ompt_data_none;
2237  __kmp_join_restore_state(thread, team);
2238}
2239#endif
2240
2241void __kmp_join_call(ident_t *loc, int gtid
2242#if OMPT_SUPPORT
2243                     ,
2244                     enum fork_context_e fork_context
2245#endif
2246                     ,
2247                     int exit_teams) {
2248  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2249  kmp_team_t *team;
2250  kmp_team_t *parent_team;
2251  kmp_info_t *master_th;
2252  kmp_root_t *root;
2253  int master_active;
2254
2255  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2256
2257  /* setup current data */
2258  master_th = __kmp_threads[gtid];
2259  root = master_th->th.th_root;
2260  team = master_th->th.th_team;
2261  parent_team = team->t.t_parent;
2262
2263  master_th->th.th_ident = loc;
2264
2265#if OMPT_SUPPORT
2266  if (ompt_enabled.enabled) {
2267    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2268  }
2269#endif
2270
2271#if KMP_DEBUG
2272  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2273    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2274                  "th_task_team = %p\n",
2275                  __kmp_gtid_from_thread(master_th), team,
2276                  team->t.t_task_team[master_th->th.th_task_state],
2277                  master_th->th.th_task_team));
2278    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2279                     team->t.t_task_team[master_th->th.th_task_state]);
2280  }
2281#endif
2282
2283  if (team->t.t_serialized) {
2284    if (master_th->th.th_teams_microtask) {
2285      // We are in teams construct
2286      int level = team->t.t_level;
2287      int tlevel = master_th->th.th_teams_level;
2288      if (level == tlevel) {
2289        // AC: we haven't incremented it earlier at start of teams construct,
2290        //     so do it here - at the end of teams construct
2291        team->t.t_level++;
2292      } else if (level == tlevel + 1) {
2293        // AC: we are exiting parallel inside teams, need to increment
2294        // serialization in order to restore it in the next call to
2295        // __kmpc_end_serialized_parallel
2296        team->t.t_serialized++;
2297      }
2298    }
2299    __kmpc_end_serialized_parallel(loc, gtid);
2300
2301#if OMPT_SUPPORT
2302    if (ompt_enabled.enabled) {
2303      __kmp_join_restore_state(master_th, parent_team);
2304    }
2305#endif
2306
2307    return;
2308  }
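  // Not serialized: wait for the workers at the join barrier, close any ITT
  // frame, then restore the master thread's state (tid, dispatch buffer, task
  // team, affinity places) to that of the parent team; the finished team itself
  // is freed back to the pool while the forkjoin lock is held.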
2309
2310  master_active = team->t.t_master_active;
2311
2312  if (!exit_teams) {
2313    // AC: No barrier for internal teams at exit from the teams construct,
2314    //     but there is a barrier for the external team (the league).
2315    __kmp_internal_join(loc, gtid, team);
2316  } else {
2317    master_th->th.th_task_state =
2318        0; // AC: no tasking in teams (out of any parallel)
2319  }
2320
2321  KMP_MB();
2322
2323#if OMPT_SUPPORT
2324  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2325  void *codeptr = team->t.ompt_team_info.master_return_address;
2326#endif
2327
2328#if USE_ITT_BUILD
2329  if (__itt_stack_caller_create_ptr) {
2330    __kmp_itt_stack_caller_destroy(
2331        (__itt_caller)team->t
2332            .t_stack_id); // destroy the stack stitching id after join barrier
2333  }
2334
2335  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2336  if (team->t.t_active_level == 1 &&
2337      !master_th->th.th_teams_microtask) { /* not in teams construct */
2338    master_th->th.th_ident = loc;
2339    // only one notification scheme (either "submit" or "forking/joined", not
2340    // both)
2341    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2342        __kmp_forkjoin_frames_mode == 3)
2343      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2344                             master_th->th.th_frame_time, 0, loc,
2345                             master_th->th.th_team_nproc, 1);
2346    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2347             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2348      __kmp_itt_region_joined(gtid);
2349  } // active_level == 1
2350#endif /* USE_ITT_BUILD */
2351
2352  if (master_th->th.th_teams_microtask && !exit_teams &&
2353      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2354      team->t.t_level == master_th->th.th_teams_level + 1) {
2355    // AC: We need to leave the team structure intact at the end of a parallel
2356    // inside the teams construct, so that the same (hot) team works at the next
2357    // parallel; only adjust the nesting levels
2358
2359    /* Decrement our nested depth level */
2360    team->t.t_level--;
2361    team->t.t_active_level--;
2362    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2363
2364    // Restore number of threads in the team if needed. This code relies on
2365    // the proper adjustment of th_teams_size.nth after the fork in
2366    // __kmp_teams_master on each teams master in the case that
2367    // __kmp_reserve_threads reduced it.
2368    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2369      int old_num = master_th->th.th_team_nproc;
2370      int new_num = master_th->th.th_teams_size.nth;
2371      kmp_info_t **other_threads = team->t.t_threads;
2372      team->t.t_nproc = new_num;
2373      for (int i = 0; i < old_num; ++i) {
2374        other_threads[i]->th.th_team_nproc = new_num;
2375      }
2376      // Adjust states of non-used threads of the team
2377      for (int i = old_num; i < new_num; ++i) {
2378        // Re-initialize thread's barrier data.
2379        KMP_DEBUG_ASSERT(other_threads[i]);
2380        kmp_balign_t *balign = other_threads[i]->th.th_bar;
2381        for (int b = 0; b < bs_last_barrier; ++b) {
2382          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2383          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2384#if USE_DEBUGGER
2385          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2386#endif
2387        }
2388        if (__kmp_tasking_mode != tskm_immediate_exec) {
2389          // Synchronize thread's task state
2390          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2391        }
2392      }
2393    }
2394
2395#if OMPT_SUPPORT
2396    if (ompt_enabled.enabled) {
2397      __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2398                      codeptr);
2399    }
2400#endif
2401
2402    return;
2403  }
2404
2405  /* do cleanup and restore the parent team */
2406  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2407  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2408
2409  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2410
2411  /* jc: The following lock has instructions with REL and ACQ semantics,
2412     separating the parallel user code called in this parallel region
2413     from the serial user code called after this function returns. */
2414  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2415
2416  if (!master_th->th.th_teams_microtask ||
2417      team->t.t_level > master_th->th.th_teams_level) {
2418    /* Decrement our nested depth level */
2419    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2420  }
2421  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2422
2423#if OMPT_SUPPORT
2424  if (ompt_enabled.enabled) {
2425    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426    if (ompt_enabled.ompt_callback_implicit_task) {
2427      int ompt_team_size = team->t.t_nproc;
2428      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2429          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2430          OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2431    }
2432
2433    task_info->frame.exit_frame = ompt_data_none;
2434    task_info->task_data = ompt_data_none;
2435  }
2436#endif
2437
2438  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2439                master_th, team));
2440  __kmp_pop_current_task_from_thread(master_th);
2441
2442#if KMP_AFFINITY_SUPPORTED
2443  // Restore master thread's partition.
2444  master_th->th.th_first_place = team->t.t_first_place;
2445  master_th->th.th_last_place = team->t.t_last_place;
2446#endif // KMP_AFFINITY_SUPPORTED
2447  master_th->th.th_def_allocator = team->t.t_def_allocator;
2448
2449  updateHWFPControl(team);
2450
2451  if (root->r.r_active != master_active)
2452    root->r.r_active = master_active;
2453
2454  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2455                            master_th)); // this will free worker threads
2456
2457  /* This race was fun to find. Make sure the following is in the critical
2458     region, otherwise assertions may fail occasionally since the old team may be
2459     reallocated and the hierarchy appears inconsistent. It is actually safe to
2460     run and won't cause any bugs, but will cause those assertion failures. It's
2461     only one deref & assign, so we might as well put it in the critical region. */
2462  master_th->th.th_team = parent_team;
2463  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2464  master_th->th.th_team_master = parent_team->t.t_threads[0];
2465  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2466
2467  /* restore serialized team, if need be */
2468  if (parent_team->t.t_serialized &&
2469      parent_team != master_th->th.th_serial_team &&
2470      parent_team != root->r.r_root_team) {
2471    __kmp_free_team(root,
2472                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2473    master_th->th.th_serial_team = parent_team;
2474  }
2475
2476  if (__kmp_tasking_mode != tskm_immediate_exec) {
2477    if (master_th->th.th_task_state_top >
2478        0) { // Restore task state from memo stack
2479      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2480      // Remember master's state if we re-use this nested hot team
2481      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2482          master_th->th.th_task_state;
2483      --master_th->th.th_task_state_top; // pop
2484      // Now restore state at this level
2485      master_th->th.th_task_state =
2486          master_th->th
2487              .th_task_state_memo_stack[master_th->th.th_task_state_top];
2488    }
2489    // Copy the task team from the parent team to the master thread
2490    master_th->th.th_task_team =
2491        parent_team->t.t_task_team[master_th->th.th_task_state];
2492    KA_TRACE(20,
2493             ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2494              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2495              parent_team));
2496  }
2497
2498  // TODO: GEH - cannot do this assertion because root thread not set up as
2499  // executing
2500  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2501  master_th->th.th_current_task->td_flags.executing = 1;
2502
2503  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2504
2505#if OMPT_SUPPORT
2506  if (ompt_enabled.enabled) {
2507    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2508                    codeptr);
2509  }
2510#endif
2511
2512  KMP_MB();
2513  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2514}
2515
2516/* Check whether we should push an internal control record onto the
2517   serial team stack.  If so, do it.  */
2518void __kmp_save_internal_controls(kmp_info_t *thread) {
2519
2520  if (thread->th.th_team != thread->th.th_serial_team) {
2521    return;
2522  }
2523  if (thread->th.th_team->t.t_serialized > 1) {
2524    int push = 0;
2525
2526    if (thread->th.th_team->t.t_control_stack_top == NULL) {
2527      push = 1;
2528    } else {
2529      if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2530          thread->th.th_team->t.t_serialized) {
2531        push = 1;
2532      }
2533    }
2534    if (push) { /* push a record on the serial team's stack */
2535      kmp_internal_control_t *control =
2536          (kmp_internal_control_t *)__kmp_allocate(
2537              sizeof(kmp_internal_control_t));
2538
2539      copy_icvs(control, &thread->th.th_current_task->td_icvs);
2540
2541      control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2542
2543      control->next = thread->th.th_team->t.t_control_stack_top;
2544      thread->th.th_team->t.t_control_stack_top = control;
2545    }
2546  }
2547}
2548
2549/* Changes set_nproc */
2550void __kmp_set_num_threads(int new_nth, int gtid) {
2551  kmp_info_t *thread;
2552  kmp_root_t *root;
2553
2554  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2555  KMP_DEBUG_ASSERT(__kmp_init_serial);
2556
2557  if (new_nth < 1)
2558    new_nth = 1;
2559  else if (new_nth > __kmp_max_nth)
2560    new_nth = __kmp_max_nth;
2561
2562  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2563  thread = __kmp_threads[gtid];
2564  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2565    return; // nothing to do
2566
2567  __kmp_save_internal_controls(thread);
2568
2569  set__nproc(thread, new_nth);
2570
2571  // If this omp_set_num_threads() call will cause the hot team size to be
2572  // reduced (in the absence of a num_threads clause), then reduce it now,
2573  // rather than waiting for the next parallel region.
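  // e.g. if the hot team currently holds 8 threads and serial code calls
  // omp_set_num_threads(2), threads 2..7 are released back to the thread pool
  // right away instead of lingering until the next parallel region.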
2574  root = thread->th.th_root;
2575  if (__kmp_init_parallel && (!root->r.r_active) &&
2576      (root->r.r_hot_team->t.t_nproc > new_nth)
2577#if KMP_NESTED_HOT_TEAMS
2578      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2579#endif
2580      ) {
2581    kmp_team_t *hot_team = root->r.r_hot_team;
2582    int f;
2583
2584    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2585
2586    // Release the extra threads we don't need any more.
2587    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2588      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2589      if (__kmp_tasking_mode != tskm_immediate_exec) {
2590        // When decreasing team size, threads no longer in the team should unref
2591        // task team.
2592        hot_team->t.t_threads[f]->th.th_task_team = NULL;
2593      }
2594      __kmp_free_thread(hot_team->t.t_threads[f]);
2595      hot_team->t.t_threads[f] = NULL;
2596    }
2597    hot_team->t.t_nproc = new_nth;
2598#if KMP_NESTED_HOT_TEAMS
2599    if (thread->th.th_hot_teams) {
2600      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2601      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2602    }
2603#endif
2604
2605    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2606
2607    // Update the t_nproc field in the threads that are still active.
2608    for (f = 0; f < new_nth; f++) {
2609      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2610      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2611    }
2612    // Special flag to mark that omp_set_num_threads() was called
2613    hot_team->t.t_size_changed = -1;
2614  }
2615}
2616
2617/* Changes max_active_levels */
2618void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2619  kmp_info_t *thread;
2620
2621  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2622                "%d = (%d)\n",
2623                gtid, max_active_levels));
2624  KMP_DEBUG_ASSERT(__kmp_init_serial);
2625
2626  // validate max_active_levels
2627  if (max_active_levels < 0) {
2628    KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2629    // We ignore this call if the user has specified a negative value.
2630    // The current setting won't be changed. The last valid setting will be
2631    // used. A warning will be issued (if warnings are allowed as controlled by
2632    // the KMP_WARNINGS env var).
2633    KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2634                  "max_active_levels for thread %d = (%d)\n",
2635                  gtid, max_active_levels));
2636    return;
2637  }
2638  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2639    // it's OK, the max_active_levels is within the valid range: [ 0;
2640    // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2641    // We allow a zero value. (implementation defined behavior)
2642  } else {
2643    KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2644                KMP_MAX_ACTIVE_LEVELS_LIMIT);
2645    max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2646    // Current upper limit is MAX_INT. (implementation defined behavior)
2647    // If the input exceeds the upper limit, we correct the input to be the
2648    // upper limit. (implementation defined behavior)
2649    // Actually, the flow should never get here while the limit is MAX_INT.
2650  }
2651  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2652                "max_active_levels for thread %d = (%d)\n",
2653                gtid, max_active_levels));
2654
2655  thread = __kmp_threads[gtid];
2656
2657  __kmp_save_internal_controls(thread);
2658
2659  set__max_active_levels(thread, max_active_levels);
2660}
2661
2662/* Gets max_active_levels */
2663int __kmp_get_max_active_levels(int gtid) {
2664  kmp_info_t *thread;
2665
2666  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2667  KMP_DEBUG_ASSERT(__kmp_init_serial);
2668
2669  thread = __kmp_threads[gtid];
2670  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2671  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2672                "curtask_maxaclevel=%d\n",
2673                gtid, thread->th.th_current_task,
2674                thread->th.th_current_task->td_icvs.max_active_levels));
2675  return thread->th.th_current_task->td_icvs.max_active_levels;
2676}
2677
2678KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2679KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2680
2681/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2682void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2683  kmp_info_t *thread;
2684  kmp_sched_t orig_kind;
2685  //    kmp_team_t *team;
2686
2687  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2688                gtid, (int)kind, chunk));
2689  KMP_DEBUG_ASSERT(__kmp_init_serial);
2690
2691  // Check if the kind parameter is valid, correct if needed.
2692  // Valid parameters should fit in one of two intervals - standard or extended:
2693  //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2694  // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
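  // Standard kinds (static, dynamic, guided, auto) index __kmp_sch_map directly
  // via kind - kmp_sched_lower - 1; the extended kinds above kmp_sched_lower_ext
  // reuse the same table through the offset expression further below.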
2695  orig_kind = kind;
2696  kind = __kmp_sched_without_mods(kind);
2697
2698  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2699      (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2700    // TODO: Hint needs attention in case we change the default schedule.
2701    __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2702              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2703              __kmp_msg_null);
2704    kind = kmp_sched_default;
2705    chunk = 0; // ignore chunk value in case of bad kind
2706  }
2707
2708  thread = __kmp_threads[gtid];
2709
2710  __kmp_save_internal_controls(thread);
2711
2712  if (kind < kmp_sched_upper_std) {
2713    if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2714      // differentiate static chunked vs. unchunked: chunk should be invalid to
2715      // indicate an unchunked schedule (which is the default)
2716      thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2717    } else {
2718      thread->th.th_current_task->td_icvs.sched.r_sched_type =
2719          __kmp_sch_map[kind - kmp_sched_lower - 1];
2720    }
2721  } else {
2722    //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2723    //    kmp_sched_lower - 2 ];
2724    thread->th.th_current_task->td_icvs.sched.r_sched_type =
2725        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2726                      kmp_sched_lower - 2];
2727  }
2728  __kmp_sched_apply_mods_intkind(
2729      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2730  if (kind == kmp_sched_auto || chunk < 1) {
2731    // ignore parameter chunk for schedule auto
2732    thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2733  } else {
2734    thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2735  }
2736}
2737
2738/* Gets def_sched_var ICV values */
2739void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2740  kmp_info_t *thread;
2741  enum sched_type th_type;
2742
2743  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2744  KMP_DEBUG_ASSERT(__kmp_init_serial);
2745
2746  thread = __kmp_threads[gtid];
2747
2748  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2749  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2750  case kmp_sch_static:
2751  case kmp_sch_static_greedy:
2752  case kmp_sch_static_balanced:
2753    *kind = kmp_sched_static;
2754    __kmp_sched_apply_mods_stdkind(kind, th_type);
2755    *chunk = 0; // chunk was not set; report this fact via a zero value
2756    return;
2757  case kmp_sch_static_chunked:
2758    *kind = kmp_sched_static;
2759    break;
2760  case kmp_sch_dynamic_chunked:
2761    *kind = kmp_sched_dynamic;
2762    break;
2763  case kmp_sch_guided_chunked:
2764  case kmp_sch_guided_iterative_chunked:
2765  case kmp_sch_guided_analytical_chunked:
2766    *kind = kmp_sched_guided;
2767    break;
2768  case kmp_sch_auto:
2769    *kind = kmp_sched_auto;
2770    break;
2771  case kmp_sch_trapezoidal:
2772    *kind = kmp_sched_trapezoidal;
2773    break;
2774#if KMP_STATIC_STEAL_ENABLED
2775  case kmp_sch_static_steal:
2776    *kind = kmp_sched_static_steal;
2777    break;
2778#endif
2779  default:
2780    KMP_FATAL(UnknownSchedulingType, th_type);
2781  }
2782
2783  __kmp_sched_apply_mods_stdkind(kind, th_type);
2784  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2785}
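
// Illustrative sketch (assumption: omp_get_schedule() forwards here). An
// unchunked static schedule is reported back with a zero chunk:
//
//   __kmp_set_schedule(gtid, kmp_sched_static, 0);
//   __kmp_get_schedule(gtid, &kind, &chunk); // kind == kmp_sched_static,
//                                            // chunk == 0
//
// whereas kmp_sch_static_chunked and the other chunked kinds return the chunk
// actually stored in the ICV.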
2786
2787int __kmp_get_ancestor_thread_num(int gtid, int level) {
2788
2789  int ii, dd;
2790  kmp_team_t *team;
2791  kmp_info_t *thr;
2792
2793  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2794  KMP_DEBUG_ASSERT(__kmp_init_serial);
2795
2796  // validate level
2797  if (level == 0)
2798    return 0;
2799  if (level < 0)
2800    return -1;
2801  thr = __kmp_threads[gtid];
2802  team = thr->th.th_team;
2803  ii = team->t.t_level;
2804  if (level > ii)
2805    return -1;
2806
2807  if (thr->th.th_teams_microtask) {
2808    // AC: we are in a teams region; multiple nested teams have the same level
2809    int tlevel = thr->th.th_teams_level; // the level of the teams construct
2810    if (level <=
2811        tlevel) { // otherwise usual algorithm works (will not touch the teams)
2812      KMP_DEBUG_ASSERT(ii >= tlevel);
2813      // AC: As we need to pass by the teams league, we need to artificially
2814      // increase ii
2815      if (ii == tlevel) {
2816        ii += 2; // three teams have same level
2817      } else {
2818        ii++; // two teams have same level
2819      }
2820    }
2821  }
2822
2823  if (ii == level)
2824    return __kmp_tid_from_gtid(gtid);
2825
2826  dd = team->t.t_serialized;
2827  level++;
2828  while (ii > level) {
2829    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2830    }
2831    if ((team->t.t_serialized) && (!dd)) {
2832      team = team->t.t_parent;
2833      continue;
2834    }
2835    if (ii > level) {
2836      team = team->t.t_parent;
2837      dd = team->t.t_serialized;
2838      ii--;
2839    }
2840  }
2841
2842  return (dd > 1) ? (0) : (team->t.t_master_tid);
2843}
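
// Illustrative sketch, not part of the runtime (assumption: this routine backs
// omp_get_ancestor_thread_num()). With nested parallelism enabled:
//
//   #pragma omp parallel num_threads(2)   // level 1
//   #pragma omp parallel num_threads(3)   // level 2
//   {
//     omp_get_ancestor_thread_num(0);     // 0 (initial implicit task)
//     omp_get_ancestor_thread_num(1);     // tid of the level-1 ancestor, 0..1
//     omp_get_ancestor_thread_num(2);     // this thread's own tid, 0..2
//   }
//
// The loop above walks t_parent links, using each team's t_serialized count to
// account for nested regions that were serialized and so did not create a new
// team.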
2844
2845int __kmp_get_team_size(int gtid, int level) {
2846
2847  int ii, dd;
2848  kmp_team_t *team;
2849  kmp_info_t *thr;
2850
2851  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2852  KMP_DEBUG_ASSERT(__kmp_init_serial);
2853
2854  // validate level
2855  if (level == 0)
2856    return 1;
2857  if (level < 0)
2858    return -1;
2859  thr = __kmp_threads[gtid];
2860  team = thr->th.th_team;
2861  ii = team->t.t_level;
2862  if (level > ii)
2863    return -1;
2864
2865  if (thr->th.th_teams_microtask) {
2866    // AC: we are in a teams region; multiple nested teams have the same level
2867    int tlevel = thr->th.th_teams_level; // the level of the teams construct
2868    if (level <=
2869        tlevel) { // otherwise usual algorithm works (will not touch the teams)
2870      KMP_DEBUG_ASSERT(ii >= tlevel);
2871      // AC: As we need to pass by the teams league, we need to artificially
2872      // increase ii
2873      if (ii == tlevel) {
2874        ii += 2; // three teams have same level
2875      } else {
2876        ii++; // two teams have same level
2877      }
2878    }
2879  }
2880
2881  while (ii > level) {
2882    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2883    }
2884    if (team->t.t_serialized && (!dd)) {
2885      team = team->t.t_parent;
2886      continue;
2887    }
2888    if (ii > level) {
2889      team = team->t.t_parent;
2890      ii--;
2891    }
2892  }
2893
2894  return team->t.t_nproc;
2895}
2896
2897kmp_r_sched_t __kmp_get_schedule_global() {
2898  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2899  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2900  // independently, so the updated schedule can be obtained here.
2901
2902  kmp_r_sched_t r_sched;
2903
2904  // create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2905  // __kmp_guided. __kmp_sched should keep its original value, so that the user
2906  // can set KMP_SCHEDULE multiple times and thus have different run-time
2907  // schedules in different roots (even in OMP 2.5)
2908  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2909  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2910  if (s == kmp_sch_static) {
2911    // replace STATIC with more detailed schedule (balanced or greedy)
2912    r_sched.r_sched_type = __kmp_static;
2913  } else if (s == kmp_sch_guided_chunked) {
2914    // replace GUIDED with more detailed schedule (iterative or analytical)
2915    r_sched.r_sched_type = __kmp_guided;
2916  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2917    r_sched.r_sched_type = __kmp_sched;
2918  }
2919  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2920
2921  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2922    // __kmp_chunk may be wrong here (if it was never set)
2923    r_sched.chunk = KMP_DEFAULT_CHUNK;
2924  } else {
2925    r_sched.chunk = __kmp_chunk;
2926  }
2927
2928  return r_sched;
2929}
2930
2931/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2932   at least argc *t_argv entries for the requested team. */
2933static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2934
2935  KMP_DEBUG_ASSERT(team);
2936  if (!realloc || argc > team->t.t_max_argc) {
2937
2938    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2939                   "current entries=%d\n",
2940                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2941    /* if previously allocated heap space for args, free them */
2942    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2943      __kmp_free((void *)team->t.t_argv);
2944
2945    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2946      /* use unused space in the cache line for arguments */
2947      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2948      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2949                     "argv entries\n",
2950                     team->t.t_id, team->t.t_max_argc));
2951      team->t.t_argv = &team->t.t_inline_argv[0];
2952      if (__kmp_storage_map) {
2953        __kmp_print_storage_map_gtid(
2954            -1, &team->t.t_inline_argv[0],
2955            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2956            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2957            team->t.t_id);
2958      }
2959    } else {
2960      /* allocate space for arguments in the heap */
2961      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2962                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
2963                               : 2 * argc;
2964      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
2965                     "argv entries\n",
2966                     team->t.t_id, team->t.t_max_argc));
2967      team->t.t_argv =
2968          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
2969      if (__kmp_storage_map) {
2970        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
2971                                     &team->t.t_argv[team->t.t_max_argc],
2972                                     sizeof(void *) * team->t.t_max_argc,
2973                                     "team_%d.t_argv", team->t.t_id);
2974      }
2975    }
2976  }
2977}
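
// Sizing rule used above, shown in isolation (illustrative only):
//
//   argc <= KMP_INLINE_ARGV_ENTRIES       -> t_inline_argv, no heap allocation
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES/2 -> heap, KMP_MIN_MALLOC_ARGV_ENTRIES
//   larger argc                           -> heap, 2 * argc entries
//
// Over-allocating by 2x means a team whose argument count creeps up does not
// reallocate (and free the old block) on every parallel region.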
2978
2979static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
2980  int i;
2981  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2982  team->t.t_threads =
2983      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
2984  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
2985      sizeof(dispatch_shared_info_t) * num_disp_buff);
2986  team->t.t_dispatch =
2987      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
2988  team->t.t_implicit_task_taskdata =
2989      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
2990  team->t.t_max_nproc = max_nth;
2991
2992  /* setup dispatch buffers */
2993  for (i = 0; i < num_disp_buff; ++i) {
2994    team->t.t_disp_buffer[i].buffer_index = i;
2995    team->t.t_disp_buffer[i].doacross_buf_idx = i;
2996  }
2997}
2998
2999static void __kmp_free_team_arrays(kmp_team_t *team) {
3000  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3001  int i;
3002  for (i = 0; i < team->t.t_max_nproc; ++i) {
3003    if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3004      __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3005      team->t.t_dispatch[i].th_disp_buffer = NULL;
3006    }
3007  }
3008#if KMP_USE_HIER_SCHED
3009  __kmp_dispatch_free_hierarchies(team);
3010#endif
3011  __kmp_free(team->t.t_threads);
3012  __kmp_free(team->t.t_disp_buffer);
3013  __kmp_free(team->t.t_dispatch);
3014  __kmp_free(team->t.t_implicit_task_taskdata);
3015  team->t.t_threads = NULL;
3016  team->t.t_disp_buffer = NULL;
3017  team->t.t_dispatch = NULL;
3018  team->t.t_implicit_task_taskdata = 0;
3019}
3020
3021static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3022  kmp_info_t **oldThreads = team->t.t_threads;
3023
3024  __kmp_free(team->t.t_disp_buffer);
3025  __kmp_free(team->t.t_dispatch);
3026  __kmp_free(team->t.t_implicit_task_taskdata);
3027  __kmp_allocate_team_arrays(team, max_nth);
3028
3029  KMP_MEMCPY(team->t.t_threads, oldThreads,
3030             team->t.t_nproc * sizeof(kmp_info_t *));
3031
3032  __kmp_free(oldThreads);
3033}
3034
3035static kmp_internal_control_t __kmp_get_global_icvs(void) {
3036
3037  kmp_r_sched_t r_sched =
3038      __kmp_get_schedule_global(); // get current state of scheduling globals
3039
3040  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3041
3042  kmp_internal_control_t g_icvs = {
3043    0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3044    (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3045    // adjustment of threads (per thread)
3046    (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3047    // whether blocktime is explicitly set
3048    __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3049#if KMP_USE_MONITOR
3050    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3051// intervals
3052#endif
3053    __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3054    // next parallel region (per thread)
3055    // (use a max ub on value if __kmp_parallel_initialize not called yet)
3056    __kmp_cg_max_nth, // int thread_limit;
3057    __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3058    // for max_active_levels
3059    r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3060    // {sched,chunk} pair
3061    __kmp_nested_proc_bind.bind_types[0],
3062    __kmp_default_device,
3063    NULL // struct kmp_internal_control *next;
3064  };
3065
3066  return g_icvs;
3067}
3068
3069static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3070
3071  kmp_internal_control_t gx_icvs;
3072  gx_icvs.serial_nesting_level =
3073      0; // probably =team->t.t_serial like in save_inter_controls
3074  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3075  gx_icvs.next = NULL;
3076
3077  return gx_icvs;
3078}
3079
3080static void __kmp_initialize_root(kmp_root_t *root) {
3081  int f;
3082  kmp_team_t *root_team;
3083  kmp_team_t *hot_team;
3084  int hot_team_max_nth;
3085  kmp_r_sched_t r_sched =
3086      __kmp_get_schedule_global(); // get current state of scheduling globals
3087  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3088  KMP_DEBUG_ASSERT(root);
3089  KMP_ASSERT(!root->r.r_begin);
3090
3091  /* setup the root state structure */
3092  __kmp_init_lock(&root->r.r_begin_lock);
3093  root->r.r_begin = FALSE;
3094  root->r.r_active = FALSE;
3095  root->r.r_in_parallel = 0;
3096  root->r.r_blocktime = __kmp_dflt_blocktime;
3097
3098  /* setup the root team for this task */
3099  /* allocate the root team structure */
3100  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3101
3102  root_team =
3103      __kmp_allocate_team(root,
3104                          1, // new_nproc
3105                          1, // max_nproc
3106#if OMPT_SUPPORT
3107                          ompt_data_none, // root parallel id
3108#endif
3109                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3110                          0 // argc
3111                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3112                          );
3113#if USE_DEBUGGER
3114  // Non-NULL value should be assigned to make the debugger display the root
3115  // team.
3116  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3117#endif
3118
3119  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3120
3121  root->r.r_root_team = root_team;
3122  root_team->t.t_control_stack_top = NULL;
3123
3124  /* initialize root team */
3125  root_team->t.t_threads[0] = NULL;
3126  root_team->t.t_nproc = 1;
3127  root_team->t.t_serialized = 1;
3128  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3129  root_team->t.t_sched.sched = r_sched.sched;
3130  KA_TRACE(
3131      20,
3132      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3133       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3134
3135  /* setup the hot team for this task */
3136  /* allocate the hot team structure */
3137  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3138
3139  hot_team =
3140      __kmp_allocate_team(root,
3141                          1, // new_nproc
3142                          __kmp_dflt_team_nth_ub * 2, // max_nproc
3143#if OMPT_SUPPORT
3144                          ompt_data_none, // root parallel id
3145#endif
3146                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3147                          0 // argc
3148                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3149                          );
3150  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3151
3152  root->r.r_hot_team = hot_team;
3153  root_team->t.t_control_stack_top = NULL;
3154
3155  /* first-time initialization */
3156  hot_team->t.t_parent = root_team;
3157
3158  /* initialize hot team */
3159  hot_team_max_nth = hot_team->t.t_max_nproc;
3160  for (f = 0; f < hot_team_max_nth; ++f) {
3161    hot_team->t.t_threads[f] = NULL;
3162  }
3163  hot_team->t.t_nproc = 1;
3164  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3165  hot_team->t.t_sched.sched = r_sched.sched;
3166  hot_team->t.t_size_changed = 0;
3167}
3168
3169#ifdef KMP_DEBUG
3170
3171typedef struct kmp_team_list_item {
3172  kmp_team_p const *entry;
3173  struct kmp_team_list_item *next;
3174} kmp_team_list_item_t;
3175typedef kmp_team_list_item_t *kmp_team_list_t;
3176
3177static void __kmp_print_structure_team_accum( // Add team to list of teams.
3178    kmp_team_list_t list, // List of teams.
3179    kmp_team_p const *team // Team to add.
3180    ) {
3181
3182  // List must terminate with item where both entry and next are NULL.
3183  // Team is added to the list only once.
3184  // List is sorted in ascending order by team id.
3185  // Team id is *not* a key.
3186
3187  kmp_team_list_t l;
3188
3189  KMP_DEBUG_ASSERT(list != NULL);
3190  if (team == NULL) {
3191    return;
3192  }
3193
3194  __kmp_print_structure_team_accum(list, team->t.t_parent);
3195  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3196
3197  // Search list for the team.
3198  l = list;
3199  while (l->next != NULL && l->entry != team) {
3200    l = l->next;
3201  }
3202  if (l->next != NULL) {
3203    return; // Team has been added before, exit.
3204  }
3205
3206  // Team is not found. Search list again for insertion point.
3207  l = list;
3208  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3209    l = l->next;
3210  }
3211
3212  // Insert team.
3213  {
3214    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3215        sizeof(kmp_team_list_item_t));
3216    *item = *l;
3217    l->entry = team;
3218    l->next = item;
3219  }
3220}
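
// Note on the insertion above (illustrative): the list has a trailing sentinel
// whose entry and next are both NULL, so a new team can be inserted *before*
// node l without tracking a predecessor:
//
//   *item = *l;       // item takes over l's old (entry, next) pair
//   l->entry = team;  // l itself now holds the new team
//   l->next = item;   // l's old contents follow it
//
// Appending at the tail is the same operation; the copied sentinel becomes the
// new terminator.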
3221
3222static void __kmp_print_structure_team(char const *title,
3223                                       kmp_team_p const *team) {
3224
3225  __kmp_printf("%s", title);
3226  if (team != NULL) {
3227    __kmp_printf("%2x %p\n", team->t.t_id, team);
3228  } else {
3229    __kmp_printf(" - (nil)\n");
3230  }
3231}
3232
3233static void __kmp_print_structure_thread(char const *title,
3234                                         kmp_info_p const *thread) {
3235  __kmp_printf("%s", title);
3236  if (thread != NULL) {
3237    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3238  } else {
3239    __kmp_printf(" - (nil)\n");
3240  }
3241}
3242
3243void __kmp_print_structure(void) {
3244
3245  kmp_team_list_t list;
3246
3247  // Initialize list of teams.
3248  list =
3249      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3250  list->entry = NULL;
3251  list->next = NULL;
3252
3253  __kmp_printf("\n------------------------------\nGlobal Thread "
3254               "Table\n------------------------------\n");
3255  {
3256    int gtid;
3257    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3258      __kmp_printf("%2d", gtid);
3259      if (__kmp_threads != NULL) {
3260        __kmp_printf(" %p", __kmp_threads[gtid]);
3261      }
3262      if (__kmp_root != NULL) {
3263        __kmp_printf(" %p", __kmp_root[gtid]);
3264      }
3265      __kmp_printf("\n");
3266    }
3267  }
3268
3269  // Print out __kmp_threads array.
3270  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3271               "----------\n");
3272  if (__kmp_threads != NULL) {
3273    int gtid;
3274    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3275      kmp_info_t const *thread = __kmp_threads[gtid];
3276      if (thread != NULL) {
3277        __kmp_printf("GTID %2d %p:\n", gtid, thread);
3278        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3279        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3280        __kmp_print_structure_team("    Serial Team:  ",
3281                                   thread->th.th_serial_team);
3282        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3283        __kmp_print_structure_thread("    Master:       ",
3284                                     thread->th.th_team_master);
3285        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3286        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3287        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3288        __kmp_print_structure_thread("    Next in pool: ",
3289                                     thread->th.th_next_pool);
3290        __kmp_printf("\n");
3291        __kmp_print_structure_team_accum(list, thread->th.th_team);
3292        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3293      }
3294    }
3295  } else {
3296    __kmp_printf("Threads array is not allocated.\n");
3297  }
3298
3299  // Print out __kmp_root array.
3300  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3301               "--------\n");
3302  if (__kmp_root != NULL) {
3303    int gtid;
3304    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3305      kmp_root_t const *root = __kmp_root[gtid];
3306      if (root != NULL) {
3307        __kmp_printf("GTID %2d %p:\n", gtid, root);
3308        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3309        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3310        __kmp_print_structure_thread("    Uber Thread:  ",
3311                                     root->r.r_uber_thread);
3312        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3313        __kmp_printf("    In Parallel:  %2d\n",
3314                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3315        __kmp_printf("\n");
3316        __kmp_print_structure_team_accum(list, root->r.r_root_team);
3317        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3318      }
3319    }
3320  } else {
3321    __kmp_printf("Ubers array is not allocated.\n");
3322  }
3323
3324  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3325               "--------\n");
3326  while (list->next != NULL) {
3327    kmp_team_p const *team = list->entry;
3328    int i;
3329    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3330    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3331    __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3332    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3333    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3334    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3335    for (i = 0; i < team->t.t_nproc; ++i) {
3336      __kmp_printf("    Thread %2d:      ", i);
3337      __kmp_print_structure_thread("", team->t.t_threads[i]);
3338    }
3339    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3340    __kmp_printf("\n");
3341    list = list->next;
3342  }
3343
3344  // Print out __kmp_thread_pool and __kmp_team_pool.
3345  __kmp_printf("\n------------------------------\nPools\n----------------------"
3346               "--------\n");
3347  __kmp_print_structure_thread("Thread pool:          ",
3348                               CCAST(kmp_info_t *, __kmp_thread_pool));
3349  __kmp_print_structure_team("Team pool:            ",
3350                             CCAST(kmp_team_t *, __kmp_team_pool));
3351  __kmp_printf("\n");
3352
3353  // Free team list.
3354  while (list != NULL) {
3355    kmp_team_list_item_t *item = list;
3356    list = list->next;
3357    KMP_INTERNAL_FREE(item);
3358  }
3359}
3360
3361#endif
3362
3363//---------------------------------------------------------------------------
3364//  Stuff for per-thread fast random number generator
3365//  Table of primes
3366static const unsigned __kmp_primes[] = {
3367    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3368    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3369    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3370    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3371    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3372    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3373    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3374    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3375    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3376    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3377    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3378
3379//---------------------------------------------------------------------------
3380//  __kmp_get_random: Get a random number using a linear congruential method.
3381unsigned short __kmp_get_random(kmp_info_t *thread) {
3382  unsigned x = thread->th.th_x;
3383  unsigned short r = x >> 16;
3384
3385  thread->th.th_x = x * thread->th.th_a + 1;
3386
3387  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3388                thread->th.th_info.ds.ds_tid, r));
3389
3390  return r;
3391}
3392//--------------------------------------------------------
3393// __kmp_init_random: Initialize a random number generator
3394void __kmp_init_random(kmp_info_t *thread) {
3395  unsigned seed = thread->th.th_info.ds.ds_tid;
3396
3397  thread->th.th_a =
3398      __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3399  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3400  KA_TRACE(30,
3401           ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3402}
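
// Minimal standalone sketch of the per-thread generator (illustrative; the
// variable names below are invented). Each thread runs a linear congruential
// generator x_{n+1} = a * x_n + 1 (mod 2^32), with the multiplier a picked
// from __kmp_primes by thread id, and only the upper 16 bits of the current
// state are handed out, since the low-order bits of a power-of-two-modulus
// LCG have short periods:
//
//   unsigned x = initial_state;      // (seed + 1) * a + 1, as above
//   unsigned a = chosen_prime;       // e.g. 0x9e3779b1
//   unsigned short next() {
//     unsigned short r = (unsigned short)(x >> 16);
//     x = x * a + 1;                 // unsigned wrap-around gives the modulus
//     return r;
//   }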
3403
3404#if KMP_OS_WINDOWS
3405/* reclaim array entries for root threads that are already dead, returns number
3406 * reclaimed */
3407static int __kmp_reclaim_dead_roots(void) {
3408  int i, r = 0;
3409
3410  for (i = 0; i < __kmp_threads_capacity; ++i) {
3411    if (KMP_UBER_GTID(i) &&
3412        !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3413        !__kmp_root[i]
3414             ->r.r_active) { // AC: reclaim only roots died in non-active state
3415      r += __kmp_unregister_root_other_thread(i);
3416    }
3417  }
3418  return r;
3419}
3420#endif
3421
3422/* This function attempts to create free entries in __kmp_threads and
3423   __kmp_root, and returns the number of free entries generated.
3424
3425   For Windows* OS static library, the first mechanism used is to reclaim array
3426   entries for root threads that are already dead.
3427
3428   On all platforms, expansion is attempted on the arrays __kmp_threads and
3429   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3430   capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3431   threadprivate cache array has been created. Synchronization with
3432   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3433
3434   After any dead root reclamation, if the clipping value allows the expansion
3435   to yield a total of nNeed free slots, the function performs that expansion.
3436   Otherwise, nothing is done beyond the possible initial root thread
3437   reclamation.
3438
3439   If any argument is negative, the behavior is undefined. */
3440static int __kmp_expand_threads(int nNeed) {
3441  int added = 0;
3442  int minimumRequiredCapacity;
3443  int newCapacity;
3444  kmp_info_t **newThreads;
3445  kmp_root_t **newRoot;
3446
3447// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3448// resizing __kmp_threads does not need additional protection if foreign
3449// threads are present
3450
3451#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3452  /* only for Windows static library */
3453  /* reclaim array entries for root threads that are already dead */
3454  added = __kmp_reclaim_dead_roots();
3455
3456  if (nNeed) {
3457    nNeed -= added;
3458    if (nNeed < 0)
3459      nNeed = 0;
3460  }
3461#endif
3462  if (nNeed <= 0)
3463    return added;
3464
3465  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3466  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3467  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3468  // > __kmp_max_nth in one of two ways:
3469  //
3470  // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3471  //    may not be reused by another thread, so we may need to increase
3472  //    __kmp_threads_capacity to __kmp_max_nth + 1.
3473  //
3474  // 2) New foreign root(s) are encountered.  We always register new foreign
3475  //    roots. This may cause a smaller # of threads to be allocated at
3476  //    subsequent parallel regions, but the worker threads hang around (and
3477  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3478  //
3479  // Anyway, that is the reason for moving the check to see if
3480  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3481  // instead of having it performed here. -BB
3482
3483  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3484
3485  /* compute expansion headroom to check if we can expand */
3486  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3487    /* possible expansion too small -- give up */
3488    return added;
3489  }
3490  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3491
3492  newCapacity = __kmp_threads_capacity;
3493  do {
3494    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3495                                                          : __kmp_sys_max_nth;
3496  } while (newCapacity < minimumRequiredCapacity);
3497  newThreads = (kmp_info_t **)__kmp_allocate(
3498      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3499  newRoot =
3500      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3501  KMP_MEMCPY(newThreads, __kmp_threads,
3502             __kmp_threads_capacity * sizeof(kmp_info_t *));
3503  KMP_MEMCPY(newRoot, __kmp_root,
3504             __kmp_threads_capacity * sizeof(kmp_root_t *));
3505
3506  kmp_info_t **temp_threads = __kmp_threads;
3507  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3508  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3509  __kmp_free(temp_threads);
3510  added += newCapacity - __kmp_threads_capacity;
3511  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3512
3513  if (newCapacity > __kmp_tp_capacity) {
3514    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3515    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3516      __kmp_threadprivate_resize_cache(newCapacity);
3517    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3518      *(volatile int *)&__kmp_tp_capacity = newCapacity;
3519    }
3520    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3521  }
3522
3523  return added;
3524}
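
// Worked example of the growth policy above (numbers are illustrative only):
// with __kmp_threads_capacity == 64 and nNeed == 200, the minimum required
// capacity is 264, so the doubling loop produces 128, 256, 512 and the arrays
// are reallocated at 512 entries, assuming __kmp_sys_max_nth is large enough
// that no clipping occurs.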
3525
3526/* Register the current thread as a root thread and obtain our gtid. We must
3527   hold the __kmp_initz_lock at this point. The argument is TRUE only if we are
3528   the thread that calls from __kmp_do_serial_initialize() */
3529int __kmp_register_root(int initial_thread) {
3530  kmp_info_t *root_thread;
3531  kmp_root_t *root;
3532  int gtid;
3533  int capacity;
3534  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3535  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3536  KMP_MB();
3537
3538  /* 2007-03-02:
3539     If the initial thread has not invoked the OpenMP RTL yet, and this thread
3540     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3541     condition does not work as expected -- it may return false (meaning there
3542     is at least one empty slot in the __kmp_threads array), but it is possible
3543     that the only free slot is #0, which is reserved for the initial thread and
3544     so cannot be used for this one. The following code works around this bug.
3545
3546     However, the right solution seems to be not to reserve slot #0 for the
3547     initial thread, because:
3548     (1) there is no magic in slot #0,
3549     (2) we cannot detect the initial thread reliably (the first thread that
3550        performs serial initialization may not be the real initial thread).
3551  */
3552  capacity = __kmp_threads_capacity;
3553  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3554    --capacity;
3555  }
3556
3557  /* see if there are too many threads */
3558  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3559    if (__kmp_tp_cached) {
3560      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3561                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3562                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3563    } else {
3564      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3565                  __kmp_msg_null);
3566    }
3567  }
3568
3569  /* find an available thread slot */
3570  /* Don't reassign the zero slot since we need it to be used only by the
3571     initial thread */
3572  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3573       gtid++)
3574    ;
3575  KA_TRACE(1,
3576           ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3577  KMP_ASSERT(gtid < __kmp_threads_capacity);
3578
3579  /* update global accounting */
3580  __kmp_all_nth++;
3581  TCW_4(__kmp_nth, __kmp_nth + 1);
3582
3583  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3584  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3585  if (__kmp_adjust_gtid_mode) {
3586    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3587      if (TCR_4(__kmp_gtid_mode) != 2) {
3588        TCW_4(__kmp_gtid_mode, 2);
3589      }
3590    } else {
3591      if (TCR_4(__kmp_gtid_mode) != 1) {
3592        TCW_4(__kmp_gtid_mode, 1);
3593      }
3594    }
3595  }
3596
3597#ifdef KMP_ADJUST_BLOCKTIME
3598  /* Adjust blocktime to zero if necessary            */
3599  /* Middle initialization might not have occurred yet */
3600  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3601    if (__kmp_nth > __kmp_avail_proc) {
3602      __kmp_zero_bt = TRUE;
3603    }
3604  }
3605#endif /* KMP_ADJUST_BLOCKTIME */
3606
3607  /* setup this new hierarchy */
3608  if (!(root = __kmp_root[gtid])) {
3609    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3610    KMP_DEBUG_ASSERT(!root->r.r_root_team);
3611  }
3612
3613#if KMP_STATS_ENABLED
3614  // Initialize stats as soon as possible (right after gtid assignment).
3615  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3616  __kmp_stats_thread_ptr->startLife();
3617  KMP_SET_THREAD_STATE(SERIAL_REGION);
3618  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3619#endif
3620  __kmp_initialize_root(root);
3621
3622  /* setup new root thread structure */
3623  if (root->r.r_uber_thread) {
3624    root_thread = root->r.r_uber_thread;
3625  } else {
3626    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3627    if (__kmp_storage_map) {
3628      __kmp_print_thread_storage_map(root_thread, gtid);
3629    }
3630    root_thread->th.th_info.ds.ds_gtid = gtid;
3631#if OMPT_SUPPORT
3632    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3633#endif
3634    root_thread->th.th_root = root;
3635    if (__kmp_env_consistency_check) {
3636      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3637    }
3638#if USE_FAST_MEMORY
3639    __kmp_initialize_fast_memory(root_thread);
3640#endif /* USE_FAST_MEMORY */
3641
3642#if KMP_USE_BGET
3643    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3644    __kmp_initialize_bget(root_thread);
3645#endif
3646    __kmp_init_random(root_thread); // Initialize random number generator
3647  }
3648
3649  /* setup the serial team held in reserve by the root thread */
3650  if (!root_thread->th.th_serial_team) {
3651    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3652    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3653    root_thread->th.th_serial_team = __kmp_allocate_team(
3654        root, 1, 1,
3655#if OMPT_SUPPORT
3656        ompt_data_none, // root parallel id
3657#endif
3658        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3659  }
3660  KMP_ASSERT(root_thread->th.th_serial_team);
3661  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3662                root_thread->th.th_serial_team));
3663
3664  /* drop root_thread into place */
3665  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3666
3667  root->r.r_root_team->t.t_threads[0] = root_thread;
3668  root->r.r_hot_team->t.t_threads[0] = root_thread;
3669  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3670  // AC: the team created in reserve, not for execution (it is unused for now).
3671  root_thread->th.th_serial_team->t.t_serialized = 0;
3672  root->r.r_uber_thread = root_thread;
3673
3674  /* initialize the thread, get it ready to go */
3675  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3676  TCW_4(__kmp_init_gtid, TRUE);
3677
3678  /* prepare the master thread for get_gtid() */
3679  __kmp_gtid_set_specific(gtid);
3680
3681#if USE_ITT_BUILD
3682  __kmp_itt_thread_name(gtid);
3683#endif /* USE_ITT_BUILD */
3684
3685#ifdef KMP_TDATA_GTID
3686  __kmp_gtid = gtid;
3687#endif
3688  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3689  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3690
3691  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3692                "plain=%u\n",
3693                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3694                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3695                KMP_INIT_BARRIER_STATE));
3696  { // Initialize barrier data.
3697    int b;
3698    for (b = 0; b < bs_last_barrier; ++b) {
3699      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3700#if USE_DEBUGGER
3701      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3702#endif
3703    }
3704  }
3705  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3706                   KMP_INIT_BARRIER_STATE);
3707
3708#if KMP_AFFINITY_SUPPORTED
3709  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3710  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3711  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3712  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3713  if (TCR_4(__kmp_init_middle)) {
3714    __kmp_affinity_set_init_mask(gtid, TRUE);
3715  }
3716#endif /* KMP_AFFINITY_SUPPORTED */
3717  root_thread->th.th_def_allocator = __kmp_def_allocator;
3718  root_thread->th.th_prev_level = 0;
3719  root_thread->th.th_prev_num_threads = 1;
3720
3721  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3722  tmp->cg_root = root_thread;
3723  tmp->cg_thread_limit = __kmp_cg_max_nth;
3724  tmp->cg_nthreads = 1;
3725  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3726                 " cg_nthreads init to 1\n",
3727                 root_thread, tmp));
3728  tmp->up = NULL;
3729  root_thread->th.th_cg_roots = tmp;
3730
3731  __kmp_root_counter++;
3732
3733#if OMPT_SUPPORT
3734  if (!initial_thread && ompt_enabled.enabled) {
3735
3736    kmp_info_t *root_thread = ompt_get_thread();
3737
3738    ompt_set_thread_state(root_thread, ompt_state_overhead);
3739
3740    if (ompt_enabled.ompt_callback_thread_begin) {
3741      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3742          ompt_thread_initial, __ompt_get_thread_data_internal());
3743    }
3744    ompt_data_t *task_data;
3745    ompt_data_t *parallel_data;
3746    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3747    if (ompt_enabled.ompt_callback_implicit_task) {
3748      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3749          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3750    }
3751
3752    ompt_set_thread_state(root_thread, ompt_state_work_serial);
3753  }
3754#endif
3755
3756  KMP_MB();
3757  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3758
3759  return gtid;
3760}
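
// Illustrative call path (assumption, for orientation only): a foreign thread
// that enters the runtime without a gtid -- e.g. a plain pthread calling an
// OpenMP API routine -- is expected to reach this function roughly as
//
//   omp_get_thread_num()
//     -> __kmp_entry_gtid()
//       -> __kmp_get_global_thread_id_reg()
//         -> __kmp_register_root(FALSE)  // allocates a slot, root, hot team
//
// while the initial thread registers with initial_thread == TRUE from
// __kmp_do_serial_initialize().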
3761
3762#if KMP_NESTED_HOT_TEAMS
3763static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3764                                const int max_level) {
3765  int i, n, nth;
3766  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3767  if (!hot_teams || !hot_teams[level].hot_team) {
3768    return 0;
3769  }
3770  KMP_DEBUG_ASSERT(level < max_level);
3771  kmp_team_t *team = hot_teams[level].hot_team;
3772  nth = hot_teams[level].hot_team_nth;
3773  n = nth - 1; // master is not freed
3774  if (level < max_level - 1) {
3775    for (i = 0; i < nth; ++i) {
3776      kmp_info_t *th = team->t.t_threads[i];
3777      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3778      if (i > 0 && th->th.th_hot_teams) {
3779        __kmp_free(th->th.th_hot_teams);
3780        th->th.th_hot_teams = NULL;
3781      }
3782    }
3783  }
3784  __kmp_free_team(root, team, NULL);
3785  return n;
3786}
3787#endif
3788
3789// Resets a root thread and clears its root and hot teams.
3790// Returns the number of __kmp_threads entries directly and indirectly freed.
3791static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3792  kmp_team_t *root_team = root->r.r_root_team;
3793  kmp_team_t *hot_team = root->r.r_hot_team;
3794  int n = hot_team->t.t_nproc;
3795  int i;
3796
3797  KMP_DEBUG_ASSERT(!root->r.r_active);
3798
3799  root->r.r_root_team = NULL;
3800  root->r.r_hot_team = NULL;
3801  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3802  // before call to __kmp_free_team().
3803  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3804#if KMP_NESTED_HOT_TEAMS
3805  if (__kmp_hot_teams_max_level >
3806      0) { // need to free nested hot teams and their threads if any
3807    for (i = 0; i < hot_team->t.t_nproc; ++i) {
3808      kmp_info_t *th = hot_team->t.t_threads[i];
3809      if (__kmp_hot_teams_max_level > 1) {
3810        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3811      }
3812      if (th->th.th_hot_teams) {
3813        __kmp_free(th->th.th_hot_teams);
3814        th->th.th_hot_teams = NULL;
3815      }
3816    }
3817  }
3818#endif
3819  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3820
3821  // Before we can reap the thread, we need to make certain that all other
3822  // threads in the teams that had this root as ancestor have stopped trying to
3823  // steal tasks.
3824  if (__kmp_tasking_mode != tskm_immediate_exec) {
3825    __kmp_wait_to_unref_task_teams();
3826  }
3827
3828#if KMP_OS_WINDOWS
3829  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3830  KA_TRACE(
3831      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3832           "\n",
3833           (LPVOID) & (root->r.r_uber_thread->th),
3834           root->r.r_uber_thread->th.th_info.ds.ds_thread));
3835  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3836#endif /* KMP_OS_WINDOWS */
3837
3838#if OMPT_SUPPORT
3839  ompt_data_t *task_data;
3840  ompt_data_t *parallel_data;
3841  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3842  if (ompt_enabled.ompt_callback_implicit_task) {
3843    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3844        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3845  }
3846  if (ompt_enabled.ompt_callback_thread_end) {
3847    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3848        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3849  }
3850#endif
3851
3852  TCW_4(__kmp_nth,
3853        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3854  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3855  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3856                 " to %d\n",
3857                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3858                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3859  if (i == 1) {
3860    // need to free contention group structure
3861    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3862                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
3863    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3864    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3865    root->r.r_uber_thread->th.th_cg_roots = NULL;
3866  }
3867  __kmp_reap_thread(root->r.r_uber_thread, 1);
3868
3869  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3870  // it instead of freeing it.
3871  root->r.r_uber_thread = NULL;
3872  /* mark root as no longer in use */
3873  root->r.r_begin = FALSE;
3874
3875  return n;
3876}
3877
3878void __kmp_unregister_root_current_thread(int gtid) {
3879  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3880  /* this lock should be ok, since unregister_root_current_thread is never
3881     called during an abort, only during a normal close. furthermore, if you
3882     have the forkjoin lock, you should never try to get the initz lock */
3883  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3884  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3885    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3886                  "exiting T#%d\n",
3887                  gtid));
3888    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3889    return;
3890  }
3891  kmp_root_t *root = __kmp_root[gtid];
3892
3893  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3894  KMP_ASSERT(KMP_UBER_GTID(gtid));
3895  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3896  KMP_ASSERT(root->r.r_active == FALSE);
3897
3898  KMP_MB();
3899
3900  kmp_info_t *thread = __kmp_threads[gtid];
3901  kmp_team_t *team = thread->th.th_team;
3902  kmp_task_team_t *task_team = thread->th.th_task_team;
3903
3904  // we need to wait for the proxy tasks before finishing the thread
3905  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3906#if OMPT_SUPPORT
3907    // the runtime is shutting down so we won't report any events
3908    thread->th.ompt_thread_info.state = ompt_state_undefined;
3909#endif
3910    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3911  }
3912
3913  __kmp_reset_root(gtid, root);
3914
3915  /* free up this thread slot */
3916  __kmp_gtid_set_specific(KMP_GTID_DNE);
3917#ifdef KMP_TDATA_GTID
3918  __kmp_gtid = KMP_GTID_DNE;
3919#endif
3920
3921  KMP_MB();
3922  KC_TRACE(10,
3923           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3924
3925  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3926}
3927
3928#if KMP_OS_WINDOWS
3929/* __kmp_forkjoin_lock must already be held.
3930   Unregisters a root thread that is not the current thread. Returns the number
3931   of __kmp_threads entries freed as a result. */
3932static int __kmp_unregister_root_other_thread(int gtid) {
3933  kmp_root_t *root = __kmp_root[gtid];
3934  int r;
3935
3936  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3937  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3938  KMP_ASSERT(KMP_UBER_GTID(gtid));
3939  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3940  KMP_ASSERT(root->r.r_active == FALSE);
3941
3942  r = __kmp_reset_root(gtid, root);
3943  KC_TRACE(10,
3944           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3945  return r;
3946}
3947#endif
3948
3949#if KMP_DEBUG
3950void __kmp_task_info() {
3951
3952  kmp_int32 gtid = __kmp_entry_gtid();
3953  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3954  kmp_info_t *this_thr = __kmp_threads[gtid];
3955  kmp_team_t *steam = this_thr->th.th_serial_team;
3956  kmp_team_t *team = this_thr->th.th_team;
3957
3958  __kmp_printf(
3959      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
3960      "ptask=%p\n",
3961      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
3962      team->t.t_implicit_task_taskdata[tid].td_parent);
3963}
3964#endif // KMP_DEBUG
3965
3966/* TODO optimize with one big memclr, take out what isn't needed, split
3967   responsibility to workers as much as possible, and delay initialization of
3968   features as much as possible  */
3969static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
3970                                  int tid, int gtid) {
3971  /* this_thr->th.th_info.ds.ds_gtid is set up in
3972     kmp_allocate_thread/create_worker.
3973     this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
3974  kmp_info_t *master = team->t.t_threads[0];
3975  KMP_DEBUG_ASSERT(this_thr != NULL);
3976  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
3977  KMP_DEBUG_ASSERT(team);
3978  KMP_DEBUG_ASSERT(team->t.t_threads);
3979  KMP_DEBUG_ASSERT(team->t.t_dispatch);
3980  KMP_DEBUG_ASSERT(master);
3981  KMP_DEBUG_ASSERT(master->th.th_root);
3982
3983  KMP_MB();
3984
3985  TCW_SYNC_PTR(this_thr->th.th_team, team);
3986
3987  this_thr->th.th_info.ds.ds_tid = tid;
3988  this_thr->th.th_set_nproc = 0;
3989  if (__kmp_tasking_mode != tskm_immediate_exec)
3990    // When tasking is possible, threads are not safe to reap until they are
3991    // done tasking; this will be set when tasking code is exited in wait
3992    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3993  else // no tasking --> always safe to reap
3994    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
3995  this_thr->th.th_set_proc_bind = proc_bind_default;
3996#if KMP_AFFINITY_SUPPORTED
3997  this_thr->th.th_new_place = this_thr->th.th_current_place;
3998#endif
3999  this_thr->th.th_root = master->th.th_root;
4000
4001  /* setup the thread's cache of the team structure */
4002  this_thr->th.th_team_nproc = team->t.t_nproc;
4003  this_thr->th.th_team_master = master;
4004  this_thr->th.th_team_serialized = team->t.t_serialized;
4005  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4006
4007  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4008
4009  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4010                tid, gtid, this_thr, this_thr->th.th_current_task));
4011
4012  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4013                           team, tid, TRUE);
4014
4015  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4016                tid, gtid, this_thr, this_thr->th.th_current_task));
4017  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4018  // __kmp_initialize_team()?
4019
4020  /* TODO no worksharing in speculative threads */
4021  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4022
4023  this_thr->th.th_local.this_construct = 0;
4024
4025  if (!this_thr->th.th_pri_common) {
4026    this_thr->th.th_pri_common =
4027        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4028    if (__kmp_storage_map) {
4029      __kmp_print_storage_map_gtid(
4030          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4031          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4032    }
4033    this_thr->th.th_pri_head = NULL;
4034  }
4035
4036  if (this_thr != master && // Master's CG root is initialized elsewhere
4037      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4038    // Make new thread's CG root same as master's
4039    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4040    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4041    if (tmp) {
4042      // worker changes CG, need to check if old CG should be freed
4043      int i = tmp->cg_nthreads--;
4044      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4045                     " on node %p of thread %p to %d\n",
4046                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4047      if (i == 1) {
4048        __kmp_free(tmp); // last thread left CG --> free it
4049      }
4050    }
4051    this_thr->th.th_cg_roots = master->th.th_cg_roots;
4052    // Increment new thread's CG root's counter to add the new thread
4053    this_thr->th.th_cg_roots->cg_nthreads++;
4054    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4055                   " node %p of thread %p to %d\n",
4056                   this_thr, this_thr->th.th_cg_roots,
4057                   this_thr->th.th_cg_roots->cg_root,
4058                   this_thr->th.th_cg_roots->cg_nthreads));
4059    this_thr->th.th_current_task->td_icvs.thread_limit =
4060        this_thr->th.th_cg_roots->cg_thread_limit;
4061  }
4062
4063  /* Initialize dynamic dispatch */
4064  {
4065    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4066    // Use team max_nproc since this will never change for the team.
4067    size_t disp_size =
4068        sizeof(dispatch_private_info_t) *
4069        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4070    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4071                  team->t.t_max_nproc));
4072    KMP_ASSERT(dispatch);
4073    KMP_DEBUG_ASSERT(team->t.t_dispatch);
4074    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4075
4076    dispatch->th_disp_index = 0;
4077    dispatch->th_doacross_buf_idx = 0;
4078    if (!dispatch->th_disp_buffer) {
4079      dispatch->th_disp_buffer =
4080          (dispatch_private_info_t *)__kmp_allocate(disp_size);
4081
4082      if (__kmp_storage_map) {
4083        __kmp_print_storage_map_gtid(
4084            gtid, &dispatch->th_disp_buffer[0],
4085            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4086                                          ? 1
4087                                          : __kmp_dispatch_num_buffers],
4088            disp_size, "th_%d.th_dispatch.th_disp_buffer "
4089                       "(team_%d.t_dispatch[%d].th_disp_buffer)",
4090            gtid, team->t.t_id, gtid);
4091      }
4092    } else {
4093      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4094    }
4095
4096    dispatch->th_dispatch_pr_current = 0;
4097    dispatch->th_dispatch_sh_current = 0;
4098
4099    dispatch->th_deo_fcn = 0; /* ORDERED     */
4100    dispatch->th_dxo_fcn = 0; /* END ORDERED */
4101  }
4102
4103  this_thr->th.th_next_pool = NULL;
4104
4105  if (!this_thr->th.th_task_state_memo_stack) {
4106    size_t i;
4107    this_thr->th.th_task_state_memo_stack =
4108        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4109    this_thr->th.th_task_state_top = 0;
4110    this_thr->th.th_task_state_stack_sz = 4;
4111    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4112         ++i) // zero init the stack
4113      this_thr->th.th_task_state_memo_stack[i] = 0;
4114  }
4115
4116  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4117  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4118
4119  KMP_MB();
4120}
4121
4122/* Allocate a new thread for the requesting team. This is only called from
4123   within a forkjoin critical section. We will first try to get an available
4124   thread from the thread pool; if none is available, we will fork a new one,
4125   assuming we are able to create one. This should be assured, as the caller
4126   should have checked for that first. */
4127kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4128                                  int new_tid) {
4129  kmp_team_t *serial_team;
4130  kmp_info_t *new_thr;
4131  int new_gtid;
4132
4133  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4134  KMP_DEBUG_ASSERT(root && team);
4135#if !KMP_NESTED_HOT_TEAMS
4136  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4137#endif
4138  KMP_MB();
4139
4140  /* first, try to get one from the thread pool */
4141  if (__kmp_thread_pool) {
4142    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4143    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4144    if (new_thr == __kmp_thread_pool_insert_pt) {
4145      __kmp_thread_pool_insert_pt = NULL;
4146    }
4147    TCW_4(new_thr->th.th_in_pool, FALSE);
4148    __kmp_suspend_initialize_thread(new_thr);
4149    __kmp_lock_suspend_mx(new_thr);
4150    if (new_thr->th.th_active_in_pool == TRUE) {
4151      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4152      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4153      new_thr->th.th_active_in_pool = FALSE;
4154    }
4155    __kmp_unlock_suspend_mx(new_thr);
4156
4157    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4158                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4159    KMP_ASSERT(!new_thr->th.th_team);
4160    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4161
4162    /* setup the thread structure */
4163    __kmp_initialize_info(new_thr, team, new_tid,
4164                          new_thr->th.th_info.ds.ds_gtid);
4165    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4166
4167    TCW_4(__kmp_nth, __kmp_nth + 1);
4168
4169    new_thr->th.th_task_state = 0;
4170    new_thr->th.th_task_state_top = 0;
4171    new_thr->th.th_task_state_stack_sz = 4;
4172
4173#ifdef KMP_ADJUST_BLOCKTIME
4174    /* Adjust blocktime back to zero if necessary */
4175    /* Middle initialization might not have occurred yet */
4176    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4177      if (__kmp_nth > __kmp_avail_proc) {
4178        __kmp_zero_bt = TRUE;
4179      }
4180    }
4181#endif /* KMP_ADJUST_BLOCKTIME */
4182
4183#if KMP_DEBUG
4184    // If the thread entered the pool via __kmp_free_thread, wait_flag should
4185    // not equal KMP_BARRIER_PARENT_FLAG.
4186    int b;
4187    kmp_balign_t *balign = new_thr->th.th_bar;
4188    for (b = 0; b < bs_last_barrier; ++b)
4189      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4190#endif
4191
4192    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4193                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4194
4195    KMP_MB();
4196    return new_thr;
4197  }
4198
4199  /* none available in the pool, so fork a new one */
4200  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4201  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4202
4203#if KMP_USE_MONITOR
4204  // If this is the first worker thread the RTL is creating, then also
4205  // launch the monitor thread.  We try to do this as early as possible.
4206  if (!TCR_4(__kmp_init_monitor)) {
4207    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4208    if (!TCR_4(__kmp_init_monitor)) {
4209      KF_TRACE(10, ("before __kmp_create_monitor\n"));
4210      TCW_4(__kmp_init_monitor, 1);
4211      __kmp_create_monitor(&__kmp_monitor);
4212      KF_TRACE(10, ("after __kmp_create_monitor\n"));
4213#if KMP_OS_WINDOWS
4214      // AC: wait until monitor has started. This is a fix for CQ232808.
4215      // The reason is that if the library is loaded/unloaded in a loop with
4216      // small (parallel) work in between, then there is a high probability that
4217      // the monitor thread starts after the library shutdown. At shutdown it is
4218      // too late to cope with the problem, because when the master is in
4219      // DllMain (process detach) the monitor has no chance to start (it is
4220      // blocked), and the master has no means to inform the monitor that the
4221      // library has gone, because all the memory the monitor can access
4222      // is going to be released/reset.
4223      while (TCR_4(__kmp_init_monitor) < 2) {
4224        KMP_YIELD(TRUE);
4225      }
4226      KF_TRACE(10, ("after monitor thread has started\n"));
4227#endif
4228    }
4229    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4230  }
4231#endif
4232
4233  KMP_MB();
4234  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4235    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4236  }
4237
4238  /* allocate space for it. */
4239  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4240
4241  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4242
4243  if (__kmp_storage_map) {
4244    __kmp_print_thread_storage_map(new_thr, new_gtid);
4245  }
4246
4247  // add the reserve serialized team, initialized from the team's master thread
4248  {
4249    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4250    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4251    new_thr->th.th_serial_team = serial_team =
4252        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4253#if OMPT_SUPPORT
4254                                          ompt_data_none, // root parallel id
4255#endif
4256                                          proc_bind_default, &r_icvs,
4257                                          0 USE_NESTED_HOT_ARG(NULL));
4258  }
4259  KMP_ASSERT(serial_team);
4260  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4261  // for execution (it is unused for now).
4262  serial_team->t.t_threads[0] = new_thr;
4263  KF_TRACE(10,
4264           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4265            new_thr));
4266
4267  /* setup the thread structures */
4268  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4269
4270#if USE_FAST_MEMORY
4271  __kmp_initialize_fast_memory(new_thr);
4272#endif /* USE_FAST_MEMORY */
4273
4274#if KMP_USE_BGET
4275  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4276  __kmp_initialize_bget(new_thr);
4277#endif
4278
4279  __kmp_init_random(new_thr); // Initialize random number generator
4280
4281  /* Initialize these only once when thread is grabbed for a team allocation */
4282  KA_TRACE(20,
4283           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4284            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4285
4286  int b;
4287  kmp_balign_t *balign = new_thr->th.th_bar;
4288  for (b = 0; b < bs_last_barrier; ++b) {
4289    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4290    balign[b].bb.team = NULL;
4291    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4292    balign[b].bb.use_oncore_barrier = 0;
4293  }
4294
4295  new_thr->th.th_spin_here = FALSE;
4296  new_thr->th.th_next_waiting = 0;
4297#if KMP_OS_UNIX
4298  new_thr->th.th_blocking = false;
4299#endif
4300
4301#if KMP_AFFINITY_SUPPORTED
4302  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4303  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4304  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4305  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4306#endif
4307  new_thr->th.th_def_allocator = __kmp_def_allocator;
4308  new_thr->th.th_prev_level = 0;
4309  new_thr->th.th_prev_num_threads = 1;
4310
4311  TCW_4(new_thr->th.th_in_pool, FALSE);
4312  new_thr->th.th_active_in_pool = FALSE;
4313  TCW_4(new_thr->th.th_active, TRUE);
4314
4315  /* adjust the global counters */
4316  __kmp_all_nth++;
4317  __kmp_nth++;
4318
4319  // If __kmp_adjust_gtid_mode is set, we use method #1 (stack pointer search)
4320  // for low numbers of procs, and method #2 (keyed API call) for higher numbers.
4321  if (__kmp_adjust_gtid_mode) {
4322    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4323      if (TCR_4(__kmp_gtid_mode) != 2) {
4324        TCW_4(__kmp_gtid_mode, 2);
4325      }
4326    } else {
4327      if (TCR_4(__kmp_gtid_mode) != 1) {
4328        TCW_4(__kmp_gtid_mode, 1);
4329      }
4330    }
4331  }
4332
4333#ifdef KMP_ADJUST_BLOCKTIME
4334  /* Adjust blocktime back to zero if necessary       */
4335  /* Middle initialization might not have occurred yet */
4336  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4337    if (__kmp_nth > __kmp_avail_proc) {
4338      __kmp_zero_bt = TRUE;
4339    }
4340  }
4341#endif /* KMP_ADJUST_BLOCKTIME */
4342
4343  /* actually fork it and create the new worker thread */
4344  KF_TRACE(
4345      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4346  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4347  KF_TRACE(10,
4348           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4349
4350  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4351                new_gtid));
4352  KMP_MB();
4353  return new_thr;
4354}
4355
4356/* Reinitialize team for reuse.
4357   The hot team code calls this case at every fork barrier, so EPCC barrier
4358   tests are extremely sensitive to changes in it, esp. writes to the team
4359   struct, which cause a cache invalidation in all threads.
4360   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4361static void __kmp_reinitialize_team(kmp_team_t *team,
4362                                    kmp_internal_control_t *new_icvs,
4363                                    ident_t *loc) {
4364  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4365                team->t.t_threads[0], team));
4366  KMP_DEBUG_ASSERT(team && new_icvs);
4367  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4368  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4369
4370  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4371  // Copy ICVs to the master thread's implicit taskdata
4372  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4373  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4374
4375  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4376                team->t.t_threads[0], team));
4377}
4378
4379/* Initialize the team data structure.
4380   This assumes the t_threads and t_max_nproc are already set.
4381   Also, we don't touch the arguments */
4382static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4383                                  kmp_internal_control_t *new_icvs,
4384                                  ident_t *loc) {
4385  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4386
4387  /* verify */
4388  KMP_DEBUG_ASSERT(team);
4389  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4390  KMP_DEBUG_ASSERT(team->t.t_threads);
4391  KMP_MB();
4392
4393  team->t.t_master_tid = 0; /* not needed */
4394  /* team->t.t_master_bar;        not needed */
4395  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4396  team->t.t_nproc = new_nproc;
4397
4398  /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4399  team->t.t_next_pool = NULL;
4400  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4401   * up hot team */
4402
4403  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4404  team->t.t_invoke = NULL; /* not needed */
4405
4406  // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4407  team->t.t_sched.sched = new_icvs->sched.sched;
4408
4409#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4410  team->t.t_fp_control_saved = FALSE; /* not needed */
4411  team->t.t_x87_fpu_control_word = 0; /* not needed */
4412  team->t.t_mxcsr = 0; /* not needed */
4413#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4414
4415  team->t.t_construct = 0;
4416
4417  team->t.t_ordered.dt.t_value = 0;
4418  team->t.t_master_active = FALSE;
4419
4420#ifdef KMP_DEBUG
4421  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4422#endif
4423#if KMP_OS_WINDOWS
4424  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4425#endif
4426
4427  team->t.t_control_stack_top = NULL;
4428
4429  __kmp_reinitialize_team(team, new_icvs, loc);
4430
4431  KMP_MB();
4432  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4433}
4434
4435#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4436/* Sets full mask for thread and returns old mask, no changes to structures. */
4437static void
4438__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4439  if (KMP_AFFINITY_CAPABLE()) {
4440    int status;
4441    if (old_mask != NULL) {
4442      status = __kmp_get_system_affinity(old_mask, TRUE);
4443      int error = errno;
4444      if (status != 0) {
4445        __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4446                    __kmp_msg_null);
4447      }
4448    }
4449    __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4450  }
4451}
4452#endif
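/* A minimal usage sketch of the save / widen / restore pattern this helper
   supports, mirroring how __kmp_allocate_team uses it further below when
   growing a hot team. It is kept out of the build with #if 0; the function
   name example_grow_team_with_full_mask is a hypothetical stand-in, while
   KMP_AFFINITY_CAPABLE, KMP_CPU_ALLOC/KMP_CPU_FREE and
   __kmp_set_system_affinity are the helpers used elsewhere in this file. */
#if 0
static void example_grow_team_with_full_mask(void) {
  kmp_affin_mask_t *old_mask = NULL;
  if (KMP_AFFINITY_CAPABLE())
    KMP_CPU_ALLOC(old_mask); // space for the saved mask

  // Save the caller's mask and temporarily bind to the full mask so that
  // newly created workers do not all inherit a narrow (e.g. single-core)
  // binding from the master.
  __kmp_set_thread_affinity_mask_full_tmp(old_mask);

  /* ... create the new worker threads here ... */

  if (KMP_AFFINITY_CAPABLE()) {
    __kmp_set_system_affinity(old_mask, TRUE); // restore the saved mask
    KMP_CPU_FREE(old_mask);
  }
}
#endif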
4453
4454#if KMP_AFFINITY_SUPPORTED
4455
4456// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4457// It calculates the worker + master thread's partition based upon the parent
4458// thread's partition, and binds each worker to a place in its partition.
4459// The master thread's partition should already include its current binding.
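// An illustrative, self-contained helper (kept out of the build with #if 0)
// showing the circular "advance to the next place" step that the cases below
// repeat inline. A partition [first_place,last_place] may wrap around the end
// of the place list, so both the partition boundary and the last mask index
// (num_masks - 1) are wrap points. The name next_place is hypothetical; the
// runtime open-codes this logic.
#if 0
static int next_place(int place, int first_place, int last_place,
                      int num_masks) {
  if (place == last_place)
    return first_place; // wrapped around the partition
  if (place == num_masks - 1)
    return 0; // wrapped around the whole place list
  return place + 1; // ordinary step to the adjacent place
}
#endif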
4460static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4461  // Copy the master thread's place partition to the team struct
4462  kmp_info_t *master_th = team->t.t_threads[0];
4463  KMP_DEBUG_ASSERT(master_th != NULL);
4464  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4465  int first_place = master_th->th.th_first_place;
4466  int last_place = master_th->th.th_last_place;
4467  int masters_place = master_th->th.th_current_place;
4468  team->t.t_first_place = first_place;
4469  team->t.t_last_place = last_place;
4470
4471  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4472                "bound to place %d partition = [%d,%d]\n",
4473                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4474                team->t.t_id, masters_place, first_place, last_place));
4475
4476  switch (proc_bind) {
4477
4478  case proc_bind_default:
4479    // Serial teams might have the proc_bind policy set to proc_bind_default. It
4480    // doesn't matter, as we don't rebind the master thread for any proc_bind policy.
4481    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4482    break;
4483
4484  case proc_bind_master: {
4485    int f;
4486    int n_th = team->t.t_nproc;
4487    for (f = 1; f < n_th; f++) {
4488      kmp_info_t *th = team->t.t_threads[f];
4489      KMP_DEBUG_ASSERT(th != NULL);
4490      th->th.th_first_place = first_place;
4491      th->th.th_last_place = last_place;
4492      th->th.th_new_place = masters_place;
4493      if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4494          team->t.t_display_affinity != 1) {
4495        team->t.t_display_affinity = 1;
4496      }
4497
4498      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4499                     "partition = [%d,%d]\n",
4500                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4501                     f, masters_place, first_place, last_place));
4502    }
4503  } break;
4504
4505  case proc_bind_close: {
4506    int f;
4507    int n_th = team->t.t_nproc;
4508    int n_places;
4509    if (first_place <= last_place) {
4510      n_places = last_place - first_place + 1;
4511    } else {
4512      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4513    }
4514    if (n_th <= n_places) {
4515      int place = masters_place;
4516      for (f = 1; f < n_th; f++) {
4517        kmp_info_t *th = team->t.t_threads[f];
4518        KMP_DEBUG_ASSERT(th != NULL);
4519
4520        if (place == last_place) {
4521          place = first_place;
4522        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4523          place = 0;
4524        } else {
4525          place++;
4526        }
4527        th->th.th_first_place = first_place;
4528        th->th.th_last_place = last_place;
4529        th->th.th_new_place = place;
4530        if (__kmp_display_affinity && place != th->th.th_current_place &&
4531            team->t.t_display_affinity != 1) {
4532          team->t.t_display_affinity = 1;
4533        }
4534
4535        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4536                       "partition = [%d,%d]\n",
4537                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4538                       team->t.t_id, f, place, first_place, last_place));
4539      }
4540    } else {
4541      int S, rem, gap, s_count;
4542      S = n_th / n_places;
4543      s_count = 0;
4544      rem = n_th - (S * n_places);
4545      gap = rem > 0 ? n_places / rem : n_places;
4546      int place = masters_place;
4547      int gap_ct = gap;
4548      for (f = 0; f < n_th; f++) {
4549        kmp_info_t *th = team->t.t_threads[f];
4550        KMP_DEBUG_ASSERT(th != NULL);
4551
4552        th->th.th_first_place = first_place;
4553        th->th.th_last_place = last_place;
4554        th->th.th_new_place = place;
4555        if (__kmp_display_affinity && place != th->th.th_current_place &&
4556            team->t.t_display_affinity != 1) {
4557          team->t.t_display_affinity = 1;
4558        }
4559        s_count++;
4560
4561        if ((s_count == S) && rem && (gap_ct == gap)) {
4562          // do nothing, add an extra thread to place on next iteration
4563        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4564          // we added an extra thread to this place; move to next place
4565          if (place == last_place) {
4566            place = first_place;
4567          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4568            place = 0;
4569          } else {
4570            place++;
4571          }
4572          s_count = 0;
4573          gap_ct = 1;
4574          rem--;
4575        } else if (s_count == S) { // place full; don't add extra
4576          if (place == last_place) {
4577            place = first_place;
4578          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4579            place = 0;
4580          } else {
4581            place++;
4582          }
4583          gap_ct++;
4584          s_count = 0;
4585        }
4586
4587        KA_TRACE(100,
4588                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4589                  "partition = [%d,%d]\n",
4590                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4591                  th->th.th_new_place, first_place, last_place));
4592      }
4593      KMP_DEBUG_ASSERT(place == masters_place);
4594    }
4595  } break;
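  /* A worked example of the oversubscribed branch above (n_th > n_places),
     assuming n_th = 10 threads, n_places = 4 places, and the master on the
     first place of the partition:
       S = 10 / 4 = 2,  rem = 10 - 2 * 4 = 2,  gap = 4 / 2 = 2
     Walking the loop, the four places receive 3, 2, 3, 2 threads respectively
     (the two "remainder" threads land gap places apart), and `place` finishes
     back on the master's place, which the closing assert checks. */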
4596
4597  case proc_bind_spread: {
4598    int f;
4599    int n_th = team->t.t_nproc;
4600    int n_places;
4601    int thidx;
4602    if (first_place <= last_place) {
4603      n_places = last_place - first_place + 1;
4604    } else {
4605      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4606    }
4607    if (n_th <= n_places) {
4608      int place = -1;
4609
4610      if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4611        int S = n_places / n_th;
4612        int s_count, rem, gap, gap_ct;
4613
4614        place = masters_place;
4615        rem = n_places - n_th * S;
4616        gap = rem ? n_th / rem : 1;
4617        gap_ct = gap;
4618        thidx = n_th;
4619        if (update_master_only == 1)
4620          thidx = 1;
4621        for (f = 0; f < thidx; f++) {
4622          kmp_info_t *th = team->t.t_threads[f];
4623          KMP_DEBUG_ASSERT(th != NULL);
4624
4625          th->th.th_first_place = place;
4626          th->th.th_new_place = place;
4627          if (__kmp_display_affinity && place != th->th.th_current_place &&
4628              team->t.t_display_affinity != 1) {
4629            team->t.t_display_affinity = 1;
4630          }
4631          s_count = 1;
4632          while (s_count < S) {
4633            if (place == last_place) {
4634              place = first_place;
4635            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4636              place = 0;
4637            } else {
4638              place++;
4639            }
4640            s_count++;
4641          }
4642          if (rem && (gap_ct == gap)) {
4643            if (place == last_place) {
4644              place = first_place;
4645            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4646              place = 0;
4647            } else {
4648              place++;
4649            }
4650            rem--;
4651            gap_ct = 0;
4652          }
4653          th->th.th_last_place = place;
4654          gap_ct++;
4655
4656          if (place == last_place) {
4657            place = first_place;
4658          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4659            place = 0;
4660          } else {
4661            place++;
4662          }
4663
4664          KA_TRACE(100,
4665                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4666                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4667                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4668                    f, th->th.th_new_place, th->th.th_first_place,
4669                    th->th.th_last_place, __kmp_affinity_num_masks));
4670        }
4671      } else {
4672        /* Having a uniform space of available computation places, we can
4673           create T partitions of round(P/T) size and put threads into the
4674           first place of each partition. */
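        /* A worked example of this branch, assuming n_places = 8 (the full,
           uniform set of places), n_th = 3 and masters_place = 0:
             spacing = (8 + 1) / 3 = 3.0
             f = 0: first = 0, last = 2  -> partition [0,2], bound to place 0
             f = 1: first = 3, last = 5  -> partition [3,5], bound to place 3
             f = 2: first = 6, last = 8  -> clamped to [6,7], bound to place 6
           Each thread is bound to the first place of its partition. */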
4675        double current = static_cast<double>(masters_place);
4676        double spacing =
4677            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4678        int first, last;
4679        kmp_info_t *th;
4680
4681        thidx = n_th + 1;
4682        if (update_master_only == 1)
4683          thidx = 1;
4684        for (f = 0; f < thidx; f++) {
4685          first = static_cast<int>(current);
4686          last = static_cast<int>(current + spacing) - 1;
4687          KMP_DEBUG_ASSERT(last >= first);
4688          if (first >= n_places) {
4689            if (masters_place) {
4690              first -= n_places;
4691              last -= n_places;
4692              if (first == (masters_place + 1)) {
4693                KMP_DEBUG_ASSERT(f == n_th);
4694                first--;
4695              }
4696              if (last == masters_place) {
4697                KMP_DEBUG_ASSERT(f == (n_th - 1));
4698                last--;
4699              }
4700            } else {
4701              KMP_DEBUG_ASSERT(f == n_th);
4702              first = 0;
4703              last = 0;
4704            }
4705          }
4706          if (last >= n_places) {
4707            last = (n_places - 1);
4708          }
4709          place = first;
4710          current += spacing;
4711          if (f < n_th) {
4712            KMP_DEBUG_ASSERT(0 <= first);
4713            KMP_DEBUG_ASSERT(n_places > first);
4714            KMP_DEBUG_ASSERT(0 <= last);
4715            KMP_DEBUG_ASSERT(n_places > last);
4716            KMP_DEBUG_ASSERT(last_place >= first_place);
4717            th = team->t.t_threads[f];
4718            KMP_DEBUG_ASSERT(th);
4719            th->th.th_first_place = first;
4720            th->th.th_new_place = place;
4721            th->th.th_last_place = last;
4722            if (__kmp_display_affinity && place != th->th.th_current_place &&
4723                team->t.t_display_affinity != 1) {
4724              team->t.t_display_affinity = 1;
4725            }
4726            KA_TRACE(100,
4727                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4728                      "partition = [%d,%d], spacing = %.4f\n",
4729                      __kmp_gtid_from_thread(team->t.t_threads[f]),
4730                      team->t.t_id, f, th->th.th_new_place,
4731                      th->th.th_first_place, th->th.th_last_place, spacing));
4732          }
4733        }
4734      }
4735      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4736    } else {
4737      int S, rem, gap, s_count;
4738      S = n_th / n_places;
4739      s_count = 0;
4740      rem = n_th - (S * n_places);
4741      gap = rem > 0 ? n_places / rem : n_places;
4742      int place = masters_place;
4743      int gap_ct = gap;
4744      thidx = n_th;
4745      if (update_master_only == 1)
4746        thidx = 1;
4747      for (f = 0; f < thidx; f++) {
4748        kmp_info_t *th = team->t.t_threads[f];
4749        KMP_DEBUG_ASSERT(th != NULL);
4750
4751        th->th.th_first_place = place;
4752        th->th.th_last_place = place;
4753        th->th.th_new_place = place;
4754        if (__kmp_display_affinity && place != th->th.th_current_place &&
4755            team->t.t_display_affinity != 1) {
4756          team->t.t_display_affinity = 1;
4757        }
4758        s_count++;
4759
4760        if ((s_count == S) && rem && (gap_ct == gap)) {
4761          // do nothing, add an extra thread to place on next iteration
4762        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4763          // we added an extra thread to this place; move on to next place
4764          if (place == last_place) {
4765            place = first_place;
4766          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4767            place = 0;
4768          } else {
4769            place++;
4770          }
4771          s_count = 0;
4772          gap_ct = 1;
4773          rem--;
4774        } else if (s_count == S) { // place is full; don't add extra thread
4775          if (place == last_place) {
4776            place = first_place;
4777          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4778            place = 0;
4779          } else {
4780            place++;
4781          }
4782          gap_ct++;
4783          s_count = 0;
4784        }
4785
4786        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4787                       "partition = [%d,%d]\n",
4788                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4789                       team->t.t_id, f, th->th.th_new_place,
4790                       th->th.th_first_place, th->th.th_last_place));
4791      }
4792      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4793    }
4794  } break;
4795
4796  default:
4797    break;
4798  }
4799
4800  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4801}
4802
4803#endif // KMP_AFFINITY_SUPPORTED
4804
4805/* allocate a new team data structure to use.  take one off of the free pool if
4806   available */
4807kmp_team_t *
4808__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4809#if OMPT_SUPPORT
4810                    ompt_data_t ompt_parallel_data,
4811#endif
4812                    kmp_proc_bind_t new_proc_bind,
4813                    kmp_internal_control_t *new_icvs,
4814                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4815  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4816  int f;
4817  kmp_team_t *team;
4818  int use_hot_team = !root->r.r_active;
4819  int level = 0;
4820
4821  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4822  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4823  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4824  KMP_MB();
4825
4826#if KMP_NESTED_HOT_TEAMS
4827  kmp_hot_team_ptr_t *hot_teams;
4828  if (master) {
4829    team = master->th.th_team;
4830    level = team->t.t_active_level;
4831    if (master->th.th_teams_microtask) { // in teams construct?
4832      if (master->th.th_teams_size.nteams > 1 &&
4833          ( // #teams > 1
4834              team->t.t_pkfn ==
4835                  (microtask_t)__kmp_teams_master || // inner fork of the teams
4836              master->th.th_teams_level <
4837                  team->t.t_level)) { // or nested parallel inside the teams
4838        ++level; // don't increment if #teams==1, or for outer fork of the teams;
4839        // increment otherwise
4840      }
4841    }
4842    hot_teams = master->th.th_hot_teams;
4843    if (level < __kmp_hot_teams_max_level && hot_teams &&
4844        hot_teams[level]
4845            .hot_team) { // hot team has already been allocated for given level
4846      use_hot_team = 1;
4847    } else {
4848      use_hot_team = 0;
4849    }
4850  }
4851#endif
4852  // Optimization to use a "hot" team
4853  if (use_hot_team && new_nproc > 1) {
4854    KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4855#if KMP_NESTED_HOT_TEAMS
4856    team = hot_teams[level].hot_team;
4857#else
4858    team = root->r.r_hot_team;
4859#endif
4860#if KMP_DEBUG
4861    if (__kmp_tasking_mode != tskm_immediate_exec) {
4862      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4863                    "task_team[1] = %p before reinit\n",
4864                    team->t.t_task_team[0], team->t.t_task_team[1]));
4865    }
4866#endif
4867
4868    // Has the number of threads changed?
4869    /* Let's assume the most common case is that the number of threads is
4870       unchanged, and put that case first. */
4871    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4872      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4873      // This case can mean that omp_set_num_threads() was called and the hot
4874      // team size was already reduced, so we check the special flag
4875      if (team->t.t_size_changed == -1) {
4876        team->t.t_size_changed = 1;
4877      } else {
4878        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4879      }
4880
4881      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4882      kmp_r_sched_t new_sched = new_icvs->sched;
4883      // set master's schedule as new run-time schedule
4884      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4885
4886      __kmp_reinitialize_team(team, new_icvs,
4887                              root->r.r_uber_thread->th.th_ident);
4888
4889      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4890                    team->t.t_threads[0], team));
4891      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4892
4893#if KMP_AFFINITY_SUPPORTED
4894      if ((team->t.t_size_changed == 0) &&
4895          (team->t.t_proc_bind == new_proc_bind)) {
4896        if (new_proc_bind == proc_bind_spread) {
4897          __kmp_partition_places(
4898              team, 1); // add flag to update only master for spread
4899        }
4900        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4901                       "proc_bind = %d, partition = [%d,%d]\n",
4902                       team->t.t_id, new_proc_bind, team->t.t_first_place,
4903                       team->t.t_last_place));
4904      } else {
4905        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4906        __kmp_partition_places(team);
4907      }
4908#else
4909      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4910#endif /* KMP_AFFINITY_SUPPORTED */
4911    } else if (team->t.t_nproc > new_nproc) {
4912      KA_TRACE(20,
4913               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4914                new_nproc));
4915
4916      team->t.t_size_changed = 1;
4917#if KMP_NESTED_HOT_TEAMS
4918      if (__kmp_hot_teams_mode == 0) {
4919        // AC: saved number of threads should correspond to team's value in this
4920        // mode; it can be bigger in mode 1, when the hot team has threads in reserve
4921        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4922        hot_teams[level].hot_team_nth = new_nproc;
4923#endif // KMP_NESTED_HOT_TEAMS
4924        /* release the extra threads we don't need any more */
4925        for (f = new_nproc; f < team->t.t_nproc; f++) {
4926          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4927          if (__kmp_tasking_mode != tskm_immediate_exec) {
4928            // When decreasing team size, threads no longer in the team should
4929            // unref task team.
4930            team->t.t_threads[f]->th.th_task_team = NULL;
4931          }
4932          __kmp_free_thread(team->t.t_threads[f]);
4933          team->t.t_threads[f] = NULL;
4934        }
4935#if KMP_NESTED_HOT_TEAMS
4936      } // (__kmp_hot_teams_mode == 0)
4937      else {
4938        // When keeping extra threads in team, switch threads to wait on own
4939        // b_go flag
4940        for (f = new_nproc; f < team->t.t_nproc; ++f) {
4941          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4942          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4943          for (int b = 0; b < bs_last_barrier; ++b) {
4944            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4945              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4946            }
4947            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4948          }
4949        }
4950      }
4951#endif // KMP_NESTED_HOT_TEAMS
4952      team->t.t_nproc = new_nproc;
4953      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4954      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4955      __kmp_reinitialize_team(team, new_icvs,
4956                              root->r.r_uber_thread->th.th_ident);
4957
4958      // Update remaining threads
4959      for (f = 0; f < new_nproc; ++f) {
4960        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4961      }
4962
4963      // restore the current task state of the master thread: should be the
4964      // implicit task
4965      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4966                    team->t.t_threads[0], team));
4967
4968      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4969
4970#ifdef KMP_DEBUG
4971      for (f = 0; f < team->t.t_nproc; f++) {
4972        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4973                         team->t.t_threads[f]->th.th_team_nproc ==
4974                             team->t.t_nproc);
4975      }
4976#endif
4977
4978      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4979#if KMP_AFFINITY_SUPPORTED
4980      __kmp_partition_places(team);
4981#endif
4982    } else { // team->t.t_nproc < new_nproc
4983#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4984      kmp_affin_mask_t *old_mask;
4985      if (KMP_AFFINITY_CAPABLE()) {
4986        KMP_CPU_ALLOC(old_mask);
4987      }
4988#endif
4989
4990      KA_TRACE(20,
4991               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4992                new_nproc));
4993
4994      team->t.t_size_changed = 1;
4995
4996#if KMP_NESTED_HOT_TEAMS
4997      int avail_threads = hot_teams[level].hot_team_nth;
4998      if (new_nproc < avail_threads)
4999        avail_threads = new_nproc;
5000      kmp_info_t **other_threads = team->t.t_threads;
5001      for (f = team->t.t_nproc; f < avail_threads; ++f) {
5002        // Adjust barrier data of reserved threads (if any) of the team
5003        // Other data will be set in __kmp_initialize_info() below.
5004        int b;
5005        kmp_balign_t *balign = other_threads[f]->th.th_bar;
5006        for (b = 0; b < bs_last_barrier; ++b) {
5007          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5008          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5009#if USE_DEBUGGER
5010          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5011#endif
5012        }
5013      }
5014      if (hot_teams[level].hot_team_nth >= new_nproc) {
5015        // we have all needed threads in reserve, no need to allocate any
5016        // this is only possible in mode 1; we cannot have reserved threads in mode 0
5017        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5018        team->t.t_nproc = new_nproc; // just get reserved threads involved
5019      } else {
5020        // we may have some threads in reserve, but not enough
5021        team->t.t_nproc =
5022            hot_teams[level]
5023                .hot_team_nth; // get reserved threads involved if any
5024        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5025#endif // KMP_NESTED_HOT_TEAMS
5026        if (team->t.t_max_nproc < new_nproc) {
5027          /* reallocate larger arrays */
5028          __kmp_reallocate_team_arrays(team, new_nproc);
5029          __kmp_reinitialize_team(team, new_icvs, NULL);
5030        }
5031
5032#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5033        /* Temporarily set full mask for master thread before creation of
5034           workers. The reason is that workers inherit the affinity from master,
5035           so if a lot of workers are created on a single core quickly, they
5036           don't get a chance to set their own affinity for a long time. */
5037        __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5038#endif
5039
5040        /* allocate new threads for the hot team */
5041        for (f = team->t.t_nproc; f < new_nproc; f++) {
5042          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5043          KMP_DEBUG_ASSERT(new_worker);
5044          team->t.t_threads[f] = new_worker;
5045
5046          KA_TRACE(20,
5047                   ("__kmp_allocate_team: team %d init T#%d arrived: "
5048                    "join=%llu, plain=%llu\n",
5049                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5050                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5051                    team->t.t_bar[bs_plain_barrier].b_arrived));
5052
5053          { // Initialize barrier data for new threads.
5054            int b;
5055            kmp_balign_t *balign = new_worker->th.th_bar;
5056            for (b = 0; b < bs_last_barrier; ++b) {
5057              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5058              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5059                               KMP_BARRIER_PARENT_FLAG);
5060#if USE_DEBUGGER
5061              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5062#endif
5063            }
5064          }
5065        }
5066
5067#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5068        if (KMP_AFFINITY_CAPABLE()) {
5069          /* Restore initial master thread's affinity mask */
5070          __kmp_set_system_affinity(old_mask, TRUE);
5071          KMP_CPU_FREE(old_mask);
5072        }
5073#endif
5074#if KMP_NESTED_HOT_TEAMS
5075      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5076#endif // KMP_NESTED_HOT_TEAMS
5077      /* make sure everyone is synchronized */
5078      int old_nproc = team->t.t_nproc; // save old value and use to update only
5079      // new threads below
5080      __kmp_initialize_team(team, new_nproc, new_icvs,
5081                            root->r.r_uber_thread->th.th_ident);
5082
5083      /* reinitialize the threads */
5084      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5085      for (f = 0; f < team->t.t_nproc; ++f)
5086        __kmp_initialize_info(team->t.t_threads[f], team, f,
5087                              __kmp_gtid_from_tid(f, team));
5088
5089      if (level) { // set th_task_state for new threads in nested hot team
5090        // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5091        // only need to set the th_task_state for the new threads. th_task_state
5092        // for master thread will not be accurate until after this in
5093        // __kmp_fork_call(), so we look to the master's memo_stack to get the
5094        // correct value.
5095        for (f = old_nproc; f < team->t.t_nproc; ++f)
5096          team->t.t_threads[f]->th.th_task_state =
5097              team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5098      } else { // set th_task_state for new threads in non-nested hot team
5099        int old_state =
5100            team->t.t_threads[0]->th.th_task_state; // copy master's state
5101        for (f = old_nproc; f < team->t.t_nproc; ++f)
5102          team->t.t_threads[f]->th.th_task_state = old_state;
5103      }
5104
5105#ifdef KMP_DEBUG
5106      for (f = 0; f < team->t.t_nproc; ++f) {
5107        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5108                         team->t.t_threads[f]->th.th_team_nproc ==
5109                             team->t.t_nproc);
5110      }
5111#endif
5112
5113      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5114#if KMP_AFFINITY_SUPPORTED
5115      __kmp_partition_places(team);
5116#endif
5117    } // Check changes in number of threads
5118
5119    kmp_info_t *master = team->t.t_threads[0];
5120    if (master->th.th_teams_microtask) {
5121      for (f = 1; f < new_nproc; ++f) {
5122        // propagate teams construct specific info to workers
5123        kmp_info_t *thr = team->t.t_threads[f];
5124        thr->th.th_teams_microtask = master->th.th_teams_microtask;
5125        thr->th.th_teams_level = master->th.th_teams_level;
5126        thr->th.th_teams_size = master->th.th_teams_size;
5127      }
5128    }
5129#if KMP_NESTED_HOT_TEAMS
5130    if (level) {
5131      // Sync barrier state for nested hot teams, not needed for outermost hot
5132      // team.
5133      for (f = 1; f < new_nproc; ++f) {
5134        kmp_info_t *thr = team->t.t_threads[f];
5135        int b;
5136        kmp_balign_t *balign = thr->th.th_bar;
5137        for (b = 0; b < bs_last_barrier; ++b) {
5138          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5139          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5140#if USE_DEBUGGER
5141          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5142#endif
5143        }
5144      }
5145    }
5146#endif // KMP_NESTED_HOT_TEAMS
5147
5148    /* reallocate space for arguments if necessary */
5149    __kmp_alloc_argv_entries(argc, team, TRUE);
5150    KMP_CHECK_UPDATE(team->t.t_argc, argc);
5151    // The hot team re-uses the previous task team,
5152    // if untouched during the previous release->gather phase.
5153
5154    KF_TRACE(10, (" hot_team = %p\n", team));
5155
5156#if KMP_DEBUG
5157    if (__kmp_tasking_mode != tskm_immediate_exec) {
5158      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5159                    "task_team[1] = %p after reinit\n",
5160                    team->t.t_task_team[0], team->t.t_task_team[1]));
5161    }
5162#endif
5163
5164#if OMPT_SUPPORT
5165    __ompt_team_assign_id(team, ompt_parallel_data);
5166#endif
5167
5168    KMP_MB();
5169
5170    return team;
5171  }
5172
5173  /* next, let's try to take one from the team pool */
5174  KMP_MB();
5175  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5176    /* TODO: consider resizing undersized teams instead of reaping them, now
5177       that we have a resizing mechanism */
5178    if (team->t.t_max_nproc >= max_nproc) {
5179      /* take this team from the team pool */
5180      __kmp_team_pool = team->t.t_next_pool;
5181
5182      /* setup the team for fresh use */
5183      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5184
5185      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5186                    "task_team[1] %p to NULL\n",
5187                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
5188      team->t.t_task_team[0] = NULL;
5189      team->t.t_task_team[1] = NULL;
5190
5191      /* reallocate space for arguments if necessary */
5192      __kmp_alloc_argv_entries(argc, team, TRUE);
5193      KMP_CHECK_UPDATE(team->t.t_argc, argc);
5194
5195      KA_TRACE(
5196          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5197               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5198      { // Initialize barrier data.
5199        int b;
5200        for (b = 0; b < bs_last_barrier; ++b) {
5201          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5202#if USE_DEBUGGER
5203          team->t.t_bar[b].b_master_arrived = 0;
5204          team->t.t_bar[b].b_team_arrived = 0;
5205#endif
5206        }
5207      }
5208
5209      team->t.t_proc_bind = new_proc_bind;
5210
5211      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5212                    team->t.t_id));
5213
5214#if OMPT_SUPPORT
5215      __ompt_team_assign_id(team, ompt_parallel_data);
5216#endif
5217
5218      KMP_MB();
5219
5220      return team;
5221    }
5222
5223    /* reap team if it is too small, then loop back and check the next one */
5224    // not sure if this is wise, but it will be redone during the hot-teams
5225    // rewrite.
5226    /* TODO: Use technique to find the right size hot-team, don't reap them */
5227    team = __kmp_reap_team(team);
5228    __kmp_team_pool = team;
5229  }
5230
5231  /* nothing available in the pool, no matter, make a new team! */
5232  KMP_MB();
5233  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5234
5235  /* and set it up */
5236  team->t.t_max_nproc = max_nproc;
5237  /* Note well: for some reason, allocating one big buffer and dividing it up
5238     seems to really hurt performance a lot on the P4, so let's not use this. */
5239  __kmp_allocate_team_arrays(team, max_nproc);
5240
5241  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5242  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5243
5244  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5245                "%p to NULL\n",
5246                &team->t.t_task_team[0], &team->t.t_task_team[1]));
5247  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5248  // memory, no need to duplicate
5249  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5250  // memory, no need to duplicate
5251
5252  if (__kmp_storage_map) {
5253    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5254  }
5255
5256  /* allocate space for arguments */
5257  __kmp_alloc_argv_entries(argc, team, FALSE);
5258  team->t.t_argc = argc;
5259
5260  KA_TRACE(20,
5261           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5262            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5263  { // Initialize barrier data.
5264    int b;
5265    for (b = 0; b < bs_last_barrier; ++b) {
5266      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5267#if USE_DEBUGGER
5268      team->t.t_bar[b].b_master_arrived = 0;
5269      team->t.t_bar[b].b_team_arrived = 0;
5270#endif
5271    }
5272  }
5273
5274  team->t.t_proc_bind = new_proc_bind;
5275
5276#if OMPT_SUPPORT
5277  __ompt_team_assign_id(team, ompt_parallel_data);
5278  team->t.ompt_serialized_team_info = NULL;
5279#endif
5280
5281  KMP_MB();
5282
5283  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5284                team->t.t_id));
5285
5286  return team;
5287}
5288
5289/* TODO implement hot-teams at all levels */
5290/* TODO implement lazy thread release on demand (disband request) */
5291
5292/* free the team.  return it to the team pool.  release all the threads
5293 * associated with it */
5294void __kmp_free_team(kmp_root_t *root,
5295                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5296  int f;
5297  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5298                team->t.t_id));
5299
5300  /* verify state */
5301  KMP_DEBUG_ASSERT(root);
5302  KMP_DEBUG_ASSERT(team);
5303  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5304  KMP_DEBUG_ASSERT(team->t.t_threads);
5305
5306  int use_hot_team = team == root->r.r_hot_team;
5307#if KMP_NESTED_HOT_TEAMS
5308  int level;
5309  kmp_hot_team_ptr_t *hot_teams;
5310  if (master) {
5311    level = team->t.t_active_level - 1;
5312    if (master->th.th_teams_microtask) { // in teams construct?
5313      if (master->th.th_teams_size.nteams > 1) {
5314        ++level; // level was not increased in teams construct for
5315        // team_of_masters
5316      }
5317      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5318          master->th.th_teams_level == team->t.t_level) {
5319        ++level; // level was not increased in teams construct for
5320        // team_of_workers before the parallel
5321      } // team->t.t_level will be increased inside parallel
5322    }
5323    hot_teams = master->th.th_hot_teams;
5324    if (level < __kmp_hot_teams_max_level) {
5325      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5326      use_hot_team = 1;
5327    }
5328  }
5329#endif // KMP_NESTED_HOT_TEAMS
5330
5331  /* team is done working */
5332  TCW_SYNC_PTR(team->t.t_pkfn,
5333               NULL); // Important for Debugging Support Library.
5334#if KMP_OS_WINDOWS
5335  team->t.t_copyin_counter = 0; // init counter for possible reuse
5336#endif
5337  // Do not reset pointer to parent team to NULL for hot teams.
5338
5339  /* if we are non-hot team, release our threads */
5340  if (!use_hot_team) {
5341    if (__kmp_tasking_mode != tskm_immediate_exec) {
5342      // Wait for threads to reach reapable state
5343      for (f = 1; f < team->t.t_nproc; ++f) {
5344        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5345        kmp_info_t *th = team->t.t_threads[f];
5346        volatile kmp_uint32 *state = &th->th.th_reap_state;
5347        while (*state != KMP_SAFE_TO_REAP) {
5348#if KMP_OS_WINDOWS
5349          // On Windows a thread can be killed at any time, check this
5350          DWORD ecode;
5351          if (!__kmp_is_thread_alive(th, &ecode)) {
5352            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5353            break;
5354          }
5355#endif
5356          // first check if thread is sleeping
5357          kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5358          if (fl.is_sleeping())
5359            fl.resume(__kmp_gtid_from_thread(th));
5360          KMP_CPU_PAUSE();
5361        }
5362      }
5363
5364      // Delete task teams
5365      int tt_idx;
5366      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5367        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5368        if (task_team != NULL) {
5369          for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5370            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5371            team->t.t_threads[f]->th.th_task_team = NULL;
5372          }
5373          KA_TRACE(
5374              20,
5375              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5376               __kmp_get_gtid(), task_team, team->t.t_id));
5377#if KMP_NESTED_HOT_TEAMS
5378          __kmp_free_task_team(master, task_team);
5379#endif
5380          team->t.t_task_team[tt_idx] = NULL;
5381        }
5382      }
5383    }
5384
5385    // Reset pointer to parent team only for non-hot teams.
5386    team->t.t_parent = NULL;
5387    team->t.t_level = 0;
5388    team->t.t_active_level = 0;
5389
5390    /* free the worker threads */
5391    for (f = 1; f < team->t.t_nproc; ++f) {
5392      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5393      __kmp_free_thread(team->t.t_threads[f]);
5394      team->t.t_threads[f] = NULL;
5395    }
5396
5397    /* put the team back in the team pool */
5398    /* TODO limit size of team pool, call reap_team if pool too large */
5399    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5400    __kmp_team_pool = (volatile kmp_team_t *)team;
5401  } else { // Check if team was created for the masters in a teams construct
5402    // See if first worker is a CG root
5403    KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5404                     team->t.t_threads[1]->th.th_cg_roots);
5405    if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5406      // Clean up the CG root nodes on workers so that this team can be re-used
5407      for (f = 1; f < team->t.t_nproc; ++f) {
5408        kmp_info_t *thr = team->t.t_threads[f];
5409        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5410                         thr->th.th_cg_roots->cg_root == thr);
5411        // Pop current CG root off list
5412        kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5413        thr->th.th_cg_roots = tmp->up;
5414        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5415                       " up to node %p. cg_nthreads was %d\n",
5416                       thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5417        int i = tmp->cg_nthreads--;
5418        if (i == 1) {
5419          __kmp_free(tmp); // free CG if we are the last thread in it
5420        }
5421        // Restore current task's thread_limit from CG root
5422        if (thr->th.th_cg_roots)
5423          thr->th.th_current_task->td_icvs.thread_limit =
5424              thr->th.th_cg_roots->cg_thread_limit;
5425      }
5426    }
5427  }
5428
5429  KMP_MB();
5430}
5431
5432/* reap the team.  destroy it, reclaim all its resources and free its memory */
5433kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5434  kmp_team_t *next_pool = team->t.t_next_pool;
5435
5436  KMP_DEBUG_ASSERT(team);
5437  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5438  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5439  KMP_DEBUG_ASSERT(team->t.t_threads);
5440  KMP_DEBUG_ASSERT(team->t.t_argv);
5441
5442  /* TODO clean the threads that are a part of this? */
5443
5444  /* free stuff */
5445  __kmp_free_team_arrays(team);
5446  if (team->t.t_argv != &team->t.t_inline_argv[0])
5447    __kmp_free((void *)team->t.t_argv);
5448  __kmp_free(team);
5449
5450  KMP_MB();
5451  return next_pool;
5452}
5453
5454// Free the thread.  Don't reap it, just place it on the pool of available
5455// threads.
5456//
5457// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5458// binding for the affinity mechanism to be useful.
5459//
5460// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5461// However, we want to avoid a potential performance problem by always
5462// scanning through the list to find the correct point at which to insert
5463// the thread (potential N**2 behavior).  To do this we keep track of the
5464// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5465// With single-level parallelism, threads will always be added to the tail
5466// of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5467// parallelism, all bets are off and we may need to scan through the entire
5468// free list.
5469//
5470// This change also has a potentially large performance benefit, for some
5471// applications.  Previously, as threads were freed from the hot team, they
5472// would be placed back on the free list in inverse order.  If the hot team
5473// grew back to its original size, then the freed thread would be placed
5474// back on the hot team in reverse order.  This could cause bad cache
5475// locality problems on programs where the size of the hot team regularly
5476// grew and shrank.
5477//
5478// Now, for single-level parallelism, the OMP tid is always == gtid.
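// An illustrative, self-contained sketch (kept out of the build with #if 0)
// of the sorted-insert-with-cached-insertion-point scheme described above.
// The names node_t, pool_head, insert_pt and pool_insert_sorted are
// hypothetical stand-ins for kmp_info_t, __kmp_thread_pool and
// __kmp_thread_pool_insert_pt as handled by __kmp_free_thread below.
#if 0
struct node_t {
  int gtid;     // key the pool is kept sorted by
  node_t *next; // next node in the free pool
};

static node_t *pool_head = nullptr; // free pool, kept sorted by gtid
static node_t *insert_pt = nullptr; // last insertion point (the cache)

static void pool_insert_sorted(node_t *n) {
  // If the cached insertion point is already past the new key, it cannot be
  // reused; fall back to scanning from the head of the list.
  if (insert_pt != nullptr && insert_pt->gtid > n->gtid)
    insert_pt = nullptr;

  node_t **scan = insert_pt ? &insert_pt->next : &pool_head;
  // With single-level parallelism this loop does no work: threads are freed
  // in gtid order, so the cached point already marks the right place.
  while (*scan != nullptr && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;

  n->next = *scan; // splice the node in, keeping the list sorted
  *scan = n;
  insert_pt = n; // remember where we inserted for next time
}
#endif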
5479void __kmp_free_thread(kmp_info_t *this_th) {
5480  int gtid;
5481  kmp_info_t **scan;
5482
5483  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5484                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5485
5486  KMP_DEBUG_ASSERT(this_th);
5487
5488  // When moving the thread to the pool, switch it to wait on its own b_go flag,
5489  // and mark its team as uninitialized (NULL team).
5490  int b;
5491  kmp_balign_t *balign = this_th->th.th_bar;
5492  for (b = 0; b < bs_last_barrier; ++b) {
5493    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5494      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5495    balign[b].bb.team = NULL;
5496    balign[b].bb.leaf_kids = 0;
5497  }
5498  this_th->th.th_task_state = 0;
5499  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5500
5501  /* put thread back on the free pool */
5502  TCW_PTR(this_th->th.th_team, NULL);
5503  TCW_PTR(this_th->th.th_root, NULL);
5504  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5505
5506  while (this_th->th.th_cg_roots) {
5507    this_th->th.th_cg_roots->cg_nthreads--;
5508    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5509                   " %p of thread  %p to %d\n",
5510                   this_th, this_th->th.th_cg_roots,
5511                   this_th->th.th_cg_roots->cg_root,
5512                   this_th->th.th_cg_roots->cg_nthreads));
5513    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5514    if (tmp->cg_root == this_th) { // Thread is a cg_root
5515      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5516      KA_TRACE(
5517          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5518      this_th->th.th_cg_roots = tmp->up;
5519      __kmp_free(tmp);
5520    } else { // Worker thread
5521      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5522        __kmp_free(tmp);
5523      }
5524      this_th->th.th_cg_roots = NULL;
5525      break;
5526    }
5527  }
5528
5529  /* If the implicit task assigned to this thread can be used by other threads,
5530   * multiple threads can share the data and try to free the task at
5531   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5532   * with higher probability when the hot team is disabled, but can occur even
5533   * when the hot team is enabled. */
5534  __kmp_free_implicit_task(this_th);
5535  this_th->th.th_current_task = NULL;
5536
5537  // If the __kmp_thread_pool_insert_pt is already past the new insert
5538  // point, then we need to re-scan the entire list.
5539  gtid = this_th->th.th_info.ds.ds_gtid;
5540  if (__kmp_thread_pool_insert_pt != NULL) {
5541    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5542    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5543      __kmp_thread_pool_insert_pt = NULL;
5544    }
5545  }
5546
5547  // Scan down the list to find the place to insert the thread.
5548  // scan is the address of a link in the list, possibly the address of
5549  // __kmp_thread_pool itself.
5550  //
5551  // In the absence of nested parallelism, the for loop will have 0 iterations.
5552  if (__kmp_thread_pool_insert_pt != NULL) {
5553    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5554  } else {
5555    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5556  }
5557  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5558       scan = &((*scan)->th.th_next_pool))
5559    ;
5560
5561  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5562  // to its address.
5563  TCW_PTR(this_th->th.th_next_pool, *scan);
5564  __kmp_thread_pool_insert_pt = *scan = this_th;
5565  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5566                   (this_th->th.th_info.ds.ds_gtid <
5567                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5568  TCW_4(this_th->th.th_in_pool, TRUE);
5569  __kmp_suspend_initialize_thread(this_th);
5570  __kmp_lock_suspend_mx(this_th);
5571  if (this_th->th.th_active == TRUE) {
5572    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5573    this_th->th.th_active_in_pool = TRUE;
5574  }
5575#if KMP_DEBUG
5576  else {
5577    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5578  }
5579#endif
5580  __kmp_unlock_suspend_mx(this_th);
5581
5582  TCW_4(__kmp_nth, __kmp_nth - 1);
5583
5584#ifdef KMP_ADJUST_BLOCKTIME
5585  /* Adjust blocktime back to user setting or default if necessary */
5586  /* Middle initialization might never have occurred                */
5587  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5588    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5589    if (__kmp_nth <= __kmp_avail_proc) {
5590      __kmp_zero_bt = FALSE;
5591    }
5592  }
5593#endif /* KMP_ADJUST_BLOCKTIME */
5594
5595  KMP_MB();
5596}
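// The pointer-to-pointer scan at the end of __kmp_free_thread above keeps the
// thread pool sorted by ascending gtid. A minimal standalone sketch of the
// same idiom (hypothetical Node/insert_sorted names, not part of the runtime):
#if 0
struct Node {
  int gtid;
  Node *next;
};

// "scan" is always the address of a link -- possibly the address of the list
// head itself -- so inserting at the front needs no special case.
static void insert_sorted(Node **head, Node *n) {
  Node **scan = head;
  while (*scan != nullptr && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;
  n->next = *scan;
  *scan = n;
}
#endif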
5597
5598/* ------------------------------------------------------------------------ */
5599
5600void *__kmp_launch_thread(kmp_info_t *this_thr) {
5601  int gtid = this_thr->th.th_info.ds.ds_gtid;
5602  /*    void                 *stack_data;*/
5603  kmp_team_t *(*volatile pteam);
5604
5605  KMP_MB();
5606  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5607
5608  if (__kmp_env_consistency_check) {
5609    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5610  }
5611
5612#if OMPT_SUPPORT
5613  ompt_data_t *thread_data;
5614  if (ompt_enabled.enabled) {
5615    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5616    *thread_data = ompt_data_none;
5617
5618    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5619    this_thr->th.ompt_thread_info.wait_id = 0;
5620    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5621    if (ompt_enabled.ompt_callback_thread_begin) {
5622      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5623          ompt_thread_worker, thread_data);
5624    }
5625  }
5626#endif
5627
5628#if OMPT_SUPPORT
5629  if (ompt_enabled.enabled) {
5630    this_thr->th.ompt_thread_info.state = ompt_state_idle;
5631  }
5632#endif
5633  /* This is the place where threads wait for work */
5634  while (!TCR_4(__kmp_global.g.g_done)) {
5635    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5636    KMP_MB();
5637
5638    /* wait for work to do */
5639    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5640
5641    /* No tid yet since not part of a team */
5642    __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5643
5644#if OMPT_SUPPORT
5645    if (ompt_enabled.enabled) {
5646      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5647    }
5648#endif
5649
5650    pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5651
5652    /* have we been allocated? */
5653    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5654      /* we were just woken up, so run our new task */
5655      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5656        int rc;
5657        KA_TRACE(20,
5658                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5659                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5660                  (*pteam)->t.t_pkfn));
5661
5662        updateHWFPControl(*pteam);
5663
5664#if OMPT_SUPPORT
5665        if (ompt_enabled.enabled) {
5666          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5667        }
5668#endif
5669
5670        rc = (*pteam)->t.t_invoke(gtid);
5671        KMP_ASSERT(rc);
5672
5673        KMP_MB();
5674        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5675                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5676                      (*pteam)->t.t_pkfn));
5677      }
5678#if OMPT_SUPPORT
5679      if (ompt_enabled.enabled) {
5680        /* no frame set while outside task */
5681        __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5682
5683        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5684      }
5685#endif
5686      /* join barrier after parallel region */
5687      __kmp_join_barrier(gtid);
5688    }
5689  }
5690  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5691
5692#if OMPT_SUPPORT
5693  if (ompt_enabled.ompt_callback_thread_end) {
5694    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5695  }
5696#endif
5697
5698  this_thr->th.th_task_team = NULL;
5699  /* run the destructors for the threadprivate data for this thread */
5700  __kmp_common_destroy_gtid(gtid);
5701
5702  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5703  KMP_MB();
5704  return this_thr;
5705}
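// __kmp_launch_thread above is the usual worker shape: park at the fork
// barrier, run whatever microtask the master published, then meet at the join
// barrier. A standalone sketch of that shape with C++11 primitives follows
// (hypothetical names; the real runtime sleeps or spins inside
// __kmp_fork_barrier rather than polling an atomic):
#if 0
#include <atomic>

typedef void (*microtask_fn_t)(void);

struct WorkerState {
  std::atomic<bool> done{false};
  std::atomic<microtask_fn_t> pkfn{nullptr};
};

static void worker_loop(WorkerState *ws) {
  while (!ws->done.load(std::memory_order_acquire)) { // __kmp_global.g.g_done
    // "Fork barrier": pick up work if the master published any.
    microtask_fn_t fn = ws->pkfn.exchange(nullptr, std::memory_order_acq_rel);
    if (fn != nullptr) {
      fn(); // corresponds to (*pteam)->t.t_invoke(gtid)
      // The join barrier with the rest of the team would go here.
    }
  }
}
#endif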
5706
5707/* ------------------------------------------------------------------------ */
5708
5709void __kmp_internal_end_dest(void *specific_gtid) {
5710#if KMP_COMPILER_ICC
5711#pragma warning(push)
5712#pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5713// significant bits
5714#endif
5715  // Make sure no significant bits are lost
5716  int gtid = (kmp_intptr_t)specific_gtid - 1;
5717#if KMP_COMPILER_ICC
5718#pragma warning(pop)
5719#endif
5720
5721  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5722  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage;
5723   * this is because 0 is reserved for the nothing-stored case */
5724
5725  /* josh: One reason for setting the gtid specific data even when it is being
5726     destroyed by pthread is to allow gtid lookup through thread specific data
5727     (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5728     that gets executed in the call to __kmp_internal_end_thread, actually
5729     gets the gtid through the thread specific data.  Setting it here seems
5730     rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5731     to run smoothly.
5732     todo: get rid of this after we remove the dependence on
5733     __kmp_gtid_get_specific  */
5734  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5735    __kmp_gtid_set_specific(gtid);
5736#ifdef KMP_TDATA_GTID
5737  __kmp_gtid = gtid;
5738#endif
5739  __kmp_internal_end_thread(gtid);
5740}
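// The +1 bias mentioned above exists because pthread_getspecific() returns
// NULL (i.e. 0) when nothing has been stored, so 0 could not otherwise be
// distinguished from gtid 0. A standalone sketch of the convention
// (hypothetical store_gtid/load_gtid names):
#if 0
#include <pthread.h>
#include <stdint.h>

static pthread_key_t gtid_key; // assume pthread_key_create() already ran

static void store_gtid(int gtid) {
  pthread_setspecific(gtid_key, (void *)(intptr_t)(gtid + 1));
}

static int load_gtid(void) {
  intptr_t v = (intptr_t)pthread_getspecific(gtid_key);
  return (int)v - 1; // -1 means "nothing stored yet"
}
#endif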
5741
5742#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5743
5744// 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases
5745// destructors work perfectly, but in real libomp.so I have no evidence it is
5746// ever called. However, -fini linker option in makefile.mk works fine.
5747
5748__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5749  __kmp_internal_end_atexit();
5750}
5751
5752void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5753
5754#endif
5755
5756/* [Windows] josh: when the atexit handler is called, there may still be more
5757   than one thread alive */
5758void __kmp_internal_end_atexit(void) {
5759  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5760  /* [Windows]
5761     josh: ideally, we want to completely shutdown the library in this atexit
5762     handler, but stat code that depends on thread specific data for gtid fails
5763     because that data becomes unavailable at some point during the shutdown, so
5764     we call __kmp_internal_end_thread instead. We should eventually remove the
5765     dependency on __kmp_get_specific_gtid in the stat code and use
5766     __kmp_internal_end_library to cleanly shutdown the library.
5767
5768     // TODO: Can some of this comment about GVS be removed?
5769     I suspect that the offending stat code is executed when the calling thread
5770     tries to clean up a dead root thread's data structures, resulting in GVS
5771     code trying to close the GVS structures for that thread, but since the stat
5772     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5773     the calling thread is cleaning up itself instead of another thread, it gets
5774     confused. This happens because allowing a thread to unregister and cleanup
5775     another thread is a recent modification for addressing an issue.
5776     Based on the current design (20050722), a thread may end up
5777     trying to unregister another thread only if thread death does not trigger
5778     the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5779     thread specific data destructor function to detect thread death. For
5780     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5781     is nothing.  Thus, the workaround is applicable only for Windows static
5782     stat library. */
5783  __kmp_internal_end_library(-1);
5784#if KMP_OS_WINDOWS
5785  __kmp_close_console();
5786#endif
5787}
5788
5789static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5790  // It is assumed __kmp_forkjoin_lock is acquired.
5791
5792  int gtid;
5793
5794  KMP_DEBUG_ASSERT(thread != NULL);
5795
5796  gtid = thread->th.th_info.ds.ds_gtid;
5797
5798  if (!is_root) {
5799    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5800      /* Assume the threads are at the fork barrier here */
5801      KA_TRACE(
5802          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5803               gtid));
5804      /* Need release fence here to prevent seg faults for tree forkjoin barrier
5805       * (GEH) */
5806      ANNOTATE_HAPPENS_BEFORE(thread);
5807      kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5808      __kmp_release_64(&flag);
5809    }
5810
5811    // Terminate OS thread.
5812    __kmp_reap_worker(thread);
5813
5814    // The thread was killed asynchronously.  If it was actively
5815    // spinning in the thread pool, decrement the global count.
5816    //
5817    // There is a small timing hole here - if the worker thread was just waking
5818    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5819    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5820    // the global counter might not get updated.
5821    //
5822    // Currently, this can only happen as the library is unloaded,
5823    // so there are no harmful side effects.
5824    if (thread->th.th_active_in_pool) {
5825      thread->th.th_active_in_pool = FALSE;
5826      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5827      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5828    }
5829  }
5830
5831  __kmp_free_implicit_task(thread);
5832
5833// Free the fast memory for tasking
5834#if USE_FAST_MEMORY
5835  __kmp_free_fast_memory(thread);
5836#endif /* USE_FAST_MEMORY */
5837
5838  __kmp_suspend_uninitialize_thread(thread);
5839
5840  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5841  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5842
5843  --__kmp_all_nth;
5844// __kmp_nth was decremented when thread is added to the pool.
5845
5846#ifdef KMP_ADJUST_BLOCKTIME
5847  /* Adjust blocktime back to user setting or default if necessary */
5848  /* Middle initialization might never have occurred                */
5849  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5850    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5851    if (__kmp_nth <= __kmp_avail_proc) {
5852      __kmp_zero_bt = FALSE;
5853    }
5854  }
5855#endif /* KMP_ADJUST_BLOCKTIME */
5856
5857  /* free the memory being used */
5858  if (__kmp_env_consistency_check) {
5859    if (thread->th.th_cons) {
5860      __kmp_free_cons_stack(thread->th.th_cons);
5861      thread->th.th_cons = NULL;
5862    }
5863  }
5864
5865  if (thread->th.th_pri_common != NULL) {
5866    __kmp_free(thread->th.th_pri_common);
5867    thread->th.th_pri_common = NULL;
5868  }
5869
5870  if (thread->th.th_task_state_memo_stack != NULL) {
5871    __kmp_free(thread->th.th_task_state_memo_stack);
5872    thread->th.th_task_state_memo_stack = NULL;
5873  }
5874
5875#if KMP_USE_BGET
5876  if (thread->th.th_local.bget_data != NULL) {
5877    __kmp_finalize_bget(thread);
5878  }
5879#endif
5880
5881#if KMP_AFFINITY_SUPPORTED
5882  if (thread->th.th_affin_mask != NULL) {
5883    KMP_CPU_FREE(thread->th.th_affin_mask);
5884    thread->th.th_affin_mask = NULL;
5885  }
5886#endif /* KMP_AFFINITY_SUPPORTED */
5887
5888#if KMP_USE_HIER_SCHED
5889  if (thread->th.th_hier_bar_data != NULL) {
5890    __kmp_free(thread->th.th_hier_bar_data);
5891    thread->th.th_hier_bar_data = NULL;
5892  }
5893#endif
5894
5895  __kmp_reap_team(thread->th.th_serial_team);
5896  thread->th.th_serial_team = NULL;
5897  __kmp_free(thread);
5898
5899  KMP_MB();
5900
5901} // __kmp_reap_thread
5902
5903static void __kmp_internal_end(void) {
5904  int i;
5905
5906  /* First, unregister the library */
5907  __kmp_unregister_library();
5908
5909#if KMP_OS_WINDOWS
5910  /* In Win static library, we can't tell when a root actually dies, so we
5911     reclaim the data structures for any root threads that have died but not
5912     unregistered themselves, in order to shut down cleanly.
5913     In Win dynamic library we also can't tell when a thread dies.  */
5914  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5915// dead roots
5916#endif
5917
5918  for (i = 0; i < __kmp_threads_capacity; i++)
5919    if (__kmp_root[i])
5920      if (__kmp_root[i]->r.r_active)
5921        break;
5922  KMP_MB(); /* Flush all pending memory write invalidates.  */
5923  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5924
5925  if (i < __kmp_threads_capacity) {
5926#if KMP_USE_MONITOR
5927    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5928    KMP_MB(); /* Flush all pending memory write invalidates.  */
5929
5930    // Need to check that monitor was initialized before reaping it. If we are
5931    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5932    // __kmp_monitor will appear to contain valid data, but it is only valid in
5933    // the parent process, not the child.
5934    // New behavior (201008): instead of keying off of the flag
5935    // __kmp_init_parallel, the monitor thread creation is keyed off
5936    // of the new flag __kmp_init_monitor.
5937    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5938    if (TCR_4(__kmp_init_monitor)) {
5939      __kmp_reap_monitor(&__kmp_monitor);
5940      TCW_4(__kmp_init_monitor, 0);
5941    }
5942    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5943    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5944#endif // KMP_USE_MONITOR
5945  } else {
5946/* TODO move this to cleanup code */
5947#ifdef KMP_DEBUG
5948    /* make sure that everything has properly ended */
5949    for (i = 0; i < __kmp_threads_capacity; i++) {
5950      if (__kmp_root[i]) {
5951        //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5952        //                    there can be uber threads alive here
5953        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5954      }
5955    }
5956#endif
5957
5958    KMP_MB();
5959
5960    // Reap the worker threads.
5961    // This is valid for now, but be careful if threads are reaped sooner.
5962    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
5963      // Get the next thread from the pool.
5964      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5965      __kmp_thread_pool = thread->th.th_next_pool;
5966      // Reap it.
5967      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5968      thread->th.th_next_pool = NULL;
5969      thread->th.th_in_pool = FALSE;
5970      __kmp_reap_thread(thread, 0);
5971    }
5972    __kmp_thread_pool_insert_pt = NULL;
5973
5974    // Reap teams.
5975    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
5976      // Get the next team from the pool.
5977      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5978      __kmp_team_pool = team->t.t_next_pool;
5979      // Reap it.
5980      team->t.t_next_pool = NULL;
5981      __kmp_reap_team(team);
5982    }
5983
5984    __kmp_reap_task_teams();
5985
5986#if KMP_OS_UNIX
5987    // Threads that are not reaped should not access any resources since they
5988    // are going to be deallocated soon, so the shutdown sequence should wait
5989    // until all threads either exit the final spin-waiting loop or begin
5990    // sleeping after the given blocktime.
5991    for (i = 0; i < __kmp_threads_capacity; i++) {
5992      kmp_info_t *thr = __kmp_threads[i];
5993      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
5994        KMP_CPU_PAUSE();
5995    }
5996#endif
5997
5998    for (i = 0; i < __kmp_threads_capacity; ++i) {
5999      // TBD: Add some checking...
6000      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6001    }
6002
6003    /* Make sure all threadprivate destructors get run by joining with all
6004       worker threads before resetting this flag */
6005    TCW_SYNC_4(__kmp_init_common, FALSE);
6006
6007    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6008    KMP_MB();
6009
6010#if KMP_USE_MONITOR
6011    // See note above: One of the possible fixes for CQ138434 / CQ140126
6012    //
6013    // FIXME: push both code fragments down and CSE them?
6014    // push them into __kmp_cleanup() ?
6015    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6016    if (TCR_4(__kmp_init_monitor)) {
6017      __kmp_reap_monitor(&__kmp_monitor);
6018      TCW_4(__kmp_init_monitor, 0);
6019    }
6020    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6021    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6022#endif
6023  } /* else !__kmp_global.t_active */
6024  TCW_4(__kmp_init_gtid, FALSE);
6025  KMP_MB(); /* Flush all pending memory write invalidates.  */
6026
6027  __kmp_cleanup();
6028#if OMPT_SUPPORT
6029  ompt_fini();
6030#endif
6031}
6032
6033void __kmp_internal_end_library(int gtid_req) {
6034  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6035  /* this shouldn't be a race condition because __kmp_internal_end() is the
6036     only place to clear __kmp_init_serial */
6037  /* we'll check this later too, after we get the lock */
6038  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6039  // redundant, because the next check will work in any case.
6040  if (__kmp_global.g.g_abort) {
6041    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6042    /* TODO abort? */
6043    return;
6044  }
6045  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6046    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6047    return;
6048  }
6049
6050  KMP_MB(); /* Flush all pending memory write invalidates.  */
6051
6052  /* find out who we are and what we should do */
6053  {
6054    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6055    KA_TRACE(
6056        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6057    if (gtid == KMP_GTID_SHUTDOWN) {
6058      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6059                    "already shutdown\n"));
6060      return;
6061    } else if (gtid == KMP_GTID_MONITOR) {
6062      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6063                    "registered, or system shutdown\n"));
6064      return;
6065    } else if (gtid == KMP_GTID_DNE) {
6066      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6067                    "shutdown\n"));
6068      /* we don't know who we are, but we may still shutdown the library */
6069    } else if (KMP_UBER_GTID(gtid)) {
6070      /* unregister ourselves as an uber thread.  gtid is no longer valid */
6071      if (__kmp_root[gtid]->r.r_active) {
6072        __kmp_global.g.g_abort = -1;
6073        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6074        KA_TRACE(10,
6075                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6076                  gtid));
6077        return;
6078      } else {
6079        KA_TRACE(
6080            10,
6081            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6082        __kmp_unregister_root_current_thread(gtid);
6083      }
6084    } else {
6085/* worker threads may call this function through the atexit handler, if they
6086 * call exit() */
6087/* For now, skip the usual subsequent processing and just dump the debug buffer.
6088   TODO: do a thorough shutdown instead */
6089#ifdef DUMP_DEBUG_ON_EXIT
6090      if (__kmp_debug_buf)
6091        __kmp_dump_debug_buffer();
6092#endif
6093      return;
6094    }
6095  }
6096  /* synchronize the termination process */
6097  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6098
6099  /* have we already finished */
6100  if (__kmp_global.g.g_abort) {
6101    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6102    /* TODO abort? */
6103    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6104    return;
6105  }
6106  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6107    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6108    return;
6109  }
6110
6111  /* We need this lock to enforce mutex between this reading of
6112     __kmp_threads_capacity and the writing by __kmp_register_root.
6113     Alternatively, we can use a counter of roots that is atomically updated by
6114     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6115     __kmp_internal_end_*.  */
6116  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6117
6118  /* now we can safely conduct the actual termination */
6119  __kmp_internal_end();
6120
6121  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6122  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6123
6124  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6125
6126#ifdef DUMP_DEBUG_ON_EXIT
6127  if (__kmp_debug_buf)
6128    __kmp_dump_debug_buffer();
6129#endif
6130
6131#if KMP_OS_WINDOWS
6132  __kmp_close_console();
6133#endif
6134
6135  __kmp_fini_allocator();
6136
6137} // __kmp_internal_end_library
6138
6139void __kmp_internal_end_thread(int gtid_req) {
6140  int i;
6141
6142  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6143  /* this shouldn't be a race condition because __kmp_internal_end() is the
6144   * only place to clear __kmp_init_serial */
6145  /* we'll check this later too, after we get the lock */
6146  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6147  // redundant, because the next check will work in any case.
6148  if (__kmp_global.g.g_abort) {
6149    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6150    /* TODO abort? */
6151    return;
6152  }
6153  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6154    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6155    return;
6156  }
6157
6158  KMP_MB(); /* Flush all pending memory write invalidates.  */
6159
6160  /* find out who we are and what we should do */
6161  {
6162    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6163    KA_TRACE(10,
6164             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6165    if (gtid == KMP_GTID_SHUTDOWN) {
6166      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6167                    "already shutdown\n"));
6168      return;
6169    } else if (gtid == KMP_GTID_MONITOR) {
6170      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6171                    "registered, or system shutdown\n"));
6172      return;
6173    } else if (gtid == KMP_GTID_DNE) {
6174      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6175                    "shutdown\n"));
6176      return;
6177      /* we don't know who we are */
6178    } else if (KMP_UBER_GTID(gtid)) {
6179      /* unregister ourselves as an uber thread.  gtid is no longer valid */
6180      if (__kmp_root[gtid]->r.r_active) {
6181        __kmp_global.g.g_abort = -1;
6182        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6183        KA_TRACE(10,
6184                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6185                  gtid));
6186        return;
6187      } else {
6188        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6189                      gtid));
6190        __kmp_unregister_root_current_thread(gtid);
6191      }
6192    } else {
6193      /* just a worker thread, let's leave */
6194      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6195
6196      if (gtid >= 0) {
6197        __kmp_threads[gtid]->th.th_task_team = NULL;
6198      }
6199
6200      KA_TRACE(10,
6201               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6202                gtid));
6203      return;
6204    }
6205  }
6206#if KMP_DYNAMIC_LIB
6207  if (__kmp_pause_status != kmp_hard_paused)
6208  // AC: let's not shut down the dynamic library at the exit of an uber thread;
6209  // it is better to shut down later, in the library destructor.
6210  {
6211    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6212    return;
6213  }
6214#endif
6215  /* synchronize the termination process */
6216  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6217
6218  /* have we already finished */
6219  if (__kmp_global.g.g_abort) {
6220    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6221    /* TODO abort? */
6222    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6223    return;
6224  }
6225  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6226    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6227    return;
6228  }
6229
6230  /* We need this lock to enforce mutex between this reading of
6231     __kmp_threads_capacity and the writing by __kmp_register_root.
6232     Alternatively, we can use a counter of roots that is atomically updated by
6233     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6234     __kmp_internal_end_*.  */
6235
6236  /* should we finish the run-time?  are all siblings done? */
6237  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6238
6239  for (i = 0; i < __kmp_threads_capacity; ++i) {
6240    if (KMP_UBER_GTID(i)) {
6241      KA_TRACE(
6242          10,
6243          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6244      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6245      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6246      return;
6247    }
6248  }
6249
6250  /* now we can safely conduct the actual termination */
6251
6252  __kmp_internal_end();
6253
6254  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6255  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6256
6257  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6258
6259#ifdef DUMP_DEBUG_ON_EXIT
6260  if (__kmp_debug_buf)
6261    __kmp_dump_debug_buffer();
6262#endif
6263} // __kmp_internal_end_thread
6264
6265// -----------------------------------------------------------------------------
6266// Library registration stuff.
6267
6268static long __kmp_registration_flag = 0;
6269// Random value used to indicate library initialization.
6270static char *__kmp_registration_str = NULL;
6271// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6272
6273static inline char *__kmp_reg_status_name() {
6274  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6275     each thread. If registration and unregistration go in different threads
6276     (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6277     env var cannot be found, because the name will contain a different pid. */
6278  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6279} // __kmp_reg_status_name
6280
6281void __kmp_register_library_startup(void) {
6282
6283  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6284  int done = 0;
6285  union {
6286    double dtime;
6287    long ltime;
6288  } time;
6289#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6290  __kmp_initialize_system_tick();
6291#endif
6292  __kmp_read_system_time(&time.dtime);
6293  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6294  __kmp_registration_str =
6295      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6296                       __kmp_registration_flag, KMP_LIBRARY_FILE);
6297
6298  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6299                __kmp_registration_str));
6300
6301  while (!done) {
6302
6303    char *value = NULL; // Actual value of the environment variable.
6304
6305    // Set the environment variable, but do not overwrite it if it already exists.
6306    __kmp_env_set(name, __kmp_registration_str, 0);
6307    // Check that the variable was written.
6308    value = __kmp_env_get(name);
6309    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6310
6311      done = 1; // Ok, environment variable set successfully, exit the loop.
6312
6313    } else {
6314
6315      // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6316      // Check whether it is alive or dead.
6317      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6318      char *tail = value;
6319      char *flag_addr_str = NULL;
6320      char *flag_val_str = NULL;
6321      char const *file_name = NULL;
6322      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6323      __kmp_str_split(tail, '-', &flag_val_str, &tail);
6324      file_name = tail;
6325      if (tail != NULL) {
6326        long *flag_addr = 0;
6327        long flag_val = 0;
6328        KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6329        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6330        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6331          // First, check whether environment-encoded address is mapped into
6332          // addr space.
6333          // If so, dereference it to see if it still has the right value.
6334          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6335            neighbor = 1;
6336          } else {
6337            // If not, then we know the other copy of the library is no longer
6338            // running.
6339            neighbor = 2;
6340          }
6341        }
6342      }
6343      switch (neighbor) {
6344      case 0: // Cannot parse environment variable -- neighbor status unknown.
6345        // Assume it is the incompatible format of a future version of the
6346        // library. Assume the other library is alive.
6347        // WARN( ... ); // TODO: Issue a warning.
6348        file_name = "unknown library";
6349        KMP_FALLTHROUGH();
6350      // Attention! Falling through to the next case. That's intentional.
6351      case 1: { // Neighbor is alive.
6352        // Check it is allowed.
6353        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6354        if (!__kmp_str_match_true(duplicate_ok)) {
6355          // That's not allowed. Issue fatal error.
6356          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6357                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6358        }
6359        KMP_INTERNAL_FREE(duplicate_ok);
6360        __kmp_duplicate_library_ok = 1;
6361        done = 1; // Exit the loop.
6362      } break;
6363      case 2: { // Neighbor is dead.
6364        // Clear the variable and try to register library again.
6365        __kmp_env_unset(name);
6366      } break;
6367      default: { KMP_DEBUG_ASSERT(0); } break;
6368      }
6369    }
6370    KMP_INTERNAL_FREE((void *)value);
6371  }
6372  KMP_INTERNAL_FREE((void *)name);
6373
6374} // func __kmp_register_library_startup
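// The value written above has the form
// "<address of __kmp_registration_flag>-<flag value>-<library file>", and a
// neighbor is treated as alive only if the encoded address is still mapped
// and still holds the encoded value. A standalone sketch of producing and
// parsing that format with the standard C library (hypothetical names,
// simplified parsing that ignores '-' inside the file name):
#if 0
#include <cstdio>

static long live_flag = 0xCAFE1234L;

static bool neighbor_alive(const char *value) {
  void *addr = nullptr;
  long flag = 0;
  char file[256] = "";
  if (std::sscanf(value, "%p-%lx-%255s", &addr, &flag, file) != 3)
    return true; // can't parse: conservatively assume alive (case 0 above)
  // The real code also checks __kmp_is_address_mapped(addr) before reading.
  return addr == (void *)&live_flag && *(long *)addr == flag;
}

int main() {
  char value[512];
  std::snprintf(value, sizeof(value), "%p-%lx-%s", (void *)&live_flag,
                live_flag, "libomp.so");
  return neighbor_alive(value) ? 0 : 1;
}
#endif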
6375
6376void __kmp_unregister_library(void) {
6377
6378  char *name = __kmp_reg_status_name();
6379  char *value = __kmp_env_get(name);
6380
6381  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6382  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6383  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6384    // Ok, this is our variable. Delete it.
6385    __kmp_env_unset(name);
6386  }
6387
6388  KMP_INTERNAL_FREE(__kmp_registration_str);
6389  KMP_INTERNAL_FREE(value);
6390  KMP_INTERNAL_FREE(name);
6391
6392  __kmp_registration_flag = 0;
6393  __kmp_registration_str = NULL;
6394
6395} // __kmp_unregister_library
6396
6397// End of Library registration stuff.
6398// -----------------------------------------------------------------------------
6399
6400#if KMP_MIC_SUPPORTED
6401
6402static void __kmp_check_mic_type() {
6403  kmp_cpuid_t cpuid_state = {0};
6404  kmp_cpuid_t *cs_p = &cpuid_state;
6405  __kmp_x86_cpuid(1, 0, cs_p);
6406  // We don't support mic1 at the moment
6407  if ((cs_p->eax & 0xff0) == 0xB10) {
6408    __kmp_mic_type = mic2;
6409  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6410    __kmp_mic_type = mic3;
6411  } else {
6412    __kmp_mic_type = non_mic;
6413  }
6414}
6415
6416#endif /* KMP_MIC_SUPPORTED */
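// The family/model signatures tested above come from CPUID leaf 1 (EAX). A
// standalone sketch using the GCC/Clang <cpuid.h> intrinsic instead of the
// runtime's __kmp_x86_cpuid wrapper (x86 only; hypothetical function name):
#if 0
#include <cpuid.h>

static bool looks_like_knl(void) {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return false; // CPUID leaf 1 not supported
  return (eax & 0xf0ff0) == 0x50670; // same mask/value as the mic3 test above
}
#endif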
6417
6418static void __kmp_do_serial_initialize(void) {
6419  int i, gtid;
6420  int size;
6421
6422  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6423
6424  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6425  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6426  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6427  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6428  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6429
6430#if OMPT_SUPPORT
6431  ompt_pre_init();
6432#endif
6433
6434  __kmp_validate_locks();
6435
6436  /* Initialize internal memory allocator */
6437  __kmp_init_allocator();
6438
6439  /* Register the library startup via an environment variable and check to see
6440     whether another copy of the library is already registered. */
6441
6442  __kmp_register_library_startup();
6443
6444  /* TODO reinitialization of library */
6445  if (TCR_4(__kmp_global.g.g_done)) {
6446    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6447  }
6448
6449  __kmp_global.g.g_abort = 0;
6450  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6451
6452/* initialize the locks */
6453#if KMP_USE_ADAPTIVE_LOCKS
6454#if KMP_DEBUG_ADAPTIVE_LOCKS
6455  __kmp_init_speculative_stats();
6456#endif
6457#endif
6458#if KMP_STATS_ENABLED
6459  __kmp_stats_init();
6460#endif
6461  __kmp_init_lock(&__kmp_global_lock);
6462  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6463  __kmp_init_lock(&__kmp_debug_lock);
6464  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6465  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6466  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6467  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6468  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6469  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6470  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6471  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6472  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6473  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6474  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6475  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6476  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6477  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6478  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6479#if KMP_USE_MONITOR
6480  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6481#endif
6482  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6483
6484  /* conduct initialization and initial setup of configuration */
6485
6486  __kmp_runtime_initialize();
6487
6488#if KMP_MIC_SUPPORTED
6489  __kmp_check_mic_type();
6490#endif
6491
6492// Some global variable initialization moved here from kmp_env_initialize()
6493#ifdef KMP_DEBUG
6494  kmp_diag = 0;
6495#endif
6496  __kmp_abort_delay = 0;
6497
6498  // From __kmp_init_dflt_team_nth()
6499  /* assume the entire machine will be used */
6500  __kmp_dflt_team_nth_ub = __kmp_xproc;
6501  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6502    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6503  }
6504  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6505    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6506  }
6507  __kmp_max_nth = __kmp_sys_max_nth;
6508  __kmp_cg_max_nth = __kmp_sys_max_nth;
6509  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6510  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6511    __kmp_teams_max_nth = __kmp_sys_max_nth;
6512  }
6513
6514  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6515  // part
6516  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6517#if KMP_USE_MONITOR
6518  __kmp_monitor_wakeups =
6519      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6520  __kmp_bt_intervals =
6521      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6522#endif
6523  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6524  __kmp_library = library_throughput;
6525  // From KMP_SCHEDULE initialization
6526  __kmp_static = kmp_sch_static_balanced;
6527// AC: do not use analytical here, because it is non-monotonous
6528//__kmp_guided = kmp_sch_guided_iterative_chunked;
6529//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6530// need to repeat assignment
6531// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6532// bit control and barrier method control parts
6533#if KMP_FAST_REDUCTION_BARRIER
6534#define kmp_reduction_barrier_gather_bb ((int)1)
6535#define kmp_reduction_barrier_release_bb ((int)1)
6536#define kmp_reduction_barrier_gather_pat bp_hyper_bar
6537#define kmp_reduction_barrier_release_pat bp_hyper_bar
6538#endif // KMP_FAST_REDUCTION_BARRIER
6539  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6540    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6541    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6542    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6543    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6544#if KMP_FAST_REDUCTION_BARRIER
6545    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6546      // lin_64 ): hyper,1
6547      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6548      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6549      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6550      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6551    }
6552#endif // KMP_FAST_REDUCTION_BARRIER
6553  }
6554#if KMP_FAST_REDUCTION_BARRIER
6555#undef kmp_reduction_barrier_release_pat
6556#undef kmp_reduction_barrier_gather_pat
6557#undef kmp_reduction_barrier_release_bb
6558#undef kmp_reduction_barrier_gather_bb
6559#endif // KMP_FAST_REDUCTION_BARRIER
6560#if KMP_MIC_SUPPORTED
6561  if (__kmp_mic_type == mic2) { // KNC
6562    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6563    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6564    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6565        1; // forkjoin release
6566    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6567    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6568  }
6569#if KMP_FAST_REDUCTION_BARRIER
6570  if (__kmp_mic_type == mic2) { // KNC
6571    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6572    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6573  }
6574#endif // KMP_FAST_REDUCTION_BARRIER
6575#endif // KMP_MIC_SUPPORTED
6576
6577// From KMP_CHECKS initialization
6578#ifdef KMP_DEBUG
6579  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6580#else
6581  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6582#endif
6583
6584  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6585  __kmp_foreign_tp = TRUE;
6586
6587  __kmp_global.g.g_dynamic = FALSE;
6588  __kmp_global.g.g_dynamic_mode = dynamic_default;
6589
6590  __kmp_env_initialize(NULL);
6591
6592// Print all messages in message catalog for testing purposes.
6593#ifdef KMP_DEBUG
6594  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6595  if (__kmp_str_match_true(val)) {
6596    kmp_str_buf_t buffer;
6597    __kmp_str_buf_init(&buffer);
6598    __kmp_i18n_dump_catalog(&buffer);
6599    __kmp_printf("%s", buffer.str);
6600    __kmp_str_buf_free(&buffer);
6601  }
6602  __kmp_env_free(&val);
6603#endif
6604
6605  __kmp_threads_capacity =
6606      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6607  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6608  __kmp_tp_capacity = __kmp_default_tp_capacity(
6609      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6610
6611  // If the library is shut down properly, both pools must be NULL. Just in
6612  // case, set them to NULL -- some memory may leak, but subsequent code will
6613  // work even if pools are not freed.
6614  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6615  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6616  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6617  __kmp_thread_pool = NULL;
6618  __kmp_thread_pool_insert_pt = NULL;
6619  __kmp_team_pool = NULL;
6620
6621  /* Allocate all of the variable sized records */
6622  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6623   * expandable */
6624  /* Since allocation is cache-aligned, just add extra padding at the end */
6625  size =
6626      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6627      CACHE_LINE;
6628  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6629  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6630                               sizeof(kmp_info_t *) * __kmp_threads_capacity);
6631
6632  /* init thread counts */
6633  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6634                   0); // Asserts fail if the library is reinitializing and
6635  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6636  __kmp_all_nth = 0;
6637  __kmp_nth = 0;
6638
6639  /* setup the uber master thread and hierarchy */
6640  gtid = __kmp_register_root(TRUE);
6641  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6642  KMP_ASSERT(KMP_UBER_GTID(gtid));
6643  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6644
6645  KMP_MB(); /* Flush all pending memory write invalidates.  */
6646
6647  __kmp_common_initialize();
6648
6649#if KMP_OS_UNIX
6650  /* invoke the child fork handler */
6651  __kmp_register_atfork();
6652#endif
6653
6654#if !KMP_DYNAMIC_LIB
6655  {
6656    /* Invoke the exit handler when the program finishes, only for static
6657       library. For dynamic library, we already have _fini and DllMain. */
6658    int rc = atexit(__kmp_internal_end_atexit);
6659    if (rc != 0) {
6660      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6661                  __kmp_msg_null);
6662    }
6663  }
6664#endif
6665
6666#if KMP_HANDLE_SIGNALS
6667#if KMP_OS_UNIX
6668  /* NOTE: make sure that this is called before the user installs their own
6669     signal handlers so that the user handlers are called first. this way they
6670     can return false, not call our handler, avoid terminating the library, and
6671     continue execution where they left off. */
6672  __kmp_install_signals(FALSE);
6673#endif /* KMP_OS_UNIX */
6674#if KMP_OS_WINDOWS
6675  __kmp_install_signals(TRUE);
6676#endif /* KMP_OS_WINDOWS */
6677#endif
6678
6679  /* we have finished the serial initialization */
6680  __kmp_init_counter++;
6681
6682  __kmp_init_serial = TRUE;
6683
6684  if (__kmp_settings) {
6685    __kmp_env_print();
6686  }
6687
6688  if (__kmp_display_env || __kmp_display_env_verbose) {
6689    __kmp_env_print_2();
6690  }
6691
6692#if OMPT_SUPPORT
6693  ompt_post_init();
6694#endif
6695
6696  KMP_MB();
6697
6698  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6699}
6700
6701void __kmp_serial_initialize(void) {
6702  if (__kmp_init_serial) {
6703    return;
6704  }
6705  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6706  if (__kmp_init_serial) {
6707    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6708    return;
6709  }
6710  __kmp_do_serial_initialize();
6711  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6712}
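// __kmp_serial_initialize above (and the middle/parallel variants below) use
// the check / lock / re-check idiom so concurrent callers initialize exactly
// once without taking the bootstrap lock on the already-initialized fast
// path. A standalone sketch of the same idiom with standard C++ primitives
// (hypothetical names; the runtime itself uses its TCR_4/TCW_SYNC_4 macros
// and bootstrap locks):
#if 0
#include <atomic>
#include <mutex>

static std::atomic<bool> init_done{false};
static std::mutex init_lock;

static void do_initialize(void) { /* expensive one-time setup */ }

static void ensure_initialized(void) {
  if (init_done.load(std::memory_order_acquire))
    return; // fast path: no lock taken
  std::lock_guard<std::mutex> guard(init_lock);
  if (init_done.load(std::memory_order_relaxed))
    return; // another caller won the race while we waited for the lock
  do_initialize();
  init_done.store(true, std::memory_order_release);
}
#endif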
6713
6714static void __kmp_do_middle_initialize(void) {
6715  int i, j;
6716  int prev_dflt_team_nth;
6717
6718  if (!__kmp_init_serial) {
6719    __kmp_do_serial_initialize();
6720  }
6721
6722  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6723
6724  // Save the previous value for the __kmp_dflt_team_nth so that
6725  // we can avoid some reinitialization if it hasn't changed.
6726  prev_dflt_team_nth = __kmp_dflt_team_nth;
6727
6728#if KMP_AFFINITY_SUPPORTED
6729  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6730  // number of cores on the machine.
6731  __kmp_affinity_initialize();
6732
6733  // Run through the __kmp_threads array and set the affinity mask
6734  // for each root thread that is currently registered with the RTL.
6735  for (i = 0; i < __kmp_threads_capacity; i++) {
6736    if (TCR_PTR(__kmp_threads[i]) != NULL) {
6737      __kmp_affinity_set_init_mask(i, TRUE);
6738    }
6739  }
6740#endif /* KMP_AFFINITY_SUPPORTED */
6741
6742  KMP_ASSERT(__kmp_xproc > 0);
6743  if (__kmp_avail_proc == 0) {
6744    __kmp_avail_proc = __kmp_xproc;
6745  }
6746
6747  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6748  // correct them now
6749  j = 0;
6750  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6751    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6752        __kmp_avail_proc;
6753    j++;
6754  }
6755
6756  if (__kmp_dflt_team_nth == 0) {
6757#ifdef KMP_DFLT_NTH_CORES
6758    // Default #threads = #cores
6759    __kmp_dflt_team_nth = __kmp_ncores;
6760    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6761                  "__kmp_ncores (%d)\n",
6762                  __kmp_dflt_team_nth));
6763#else
6764    // Default #threads = #available OS procs
6765    __kmp_dflt_team_nth = __kmp_avail_proc;
6766    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6767                  "__kmp_avail_proc(%d)\n",
6768                  __kmp_dflt_team_nth));
6769#endif /* KMP_DFLT_NTH_CORES */
6770  }
6771
6772  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6773    __kmp_dflt_team_nth = KMP_MIN_NTH;
6774  }
6775  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6776    __kmp_dflt_team_nth = __kmp_sys_max_nth;
6777  }
6778
6779  // There's no harm in continuing if the following check fails,
6780  // but it indicates an error in the previous logic.
6781  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6782
6783  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6784    // Run through the __kmp_threads array and set the num threads icv for each
6785    // root thread that is currently registered with the RTL (which has not
6786    // already explicitly set its nthreads-var with a call to
6787    // omp_set_num_threads()).
6788    for (i = 0; i < __kmp_threads_capacity; i++) {
6789      kmp_info_t *thread = __kmp_threads[i];
6790      if (thread == NULL)
6791        continue;
6792      if (thread->th.th_current_task->td_icvs.nproc != 0)
6793        continue;
6794
6795      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6796    }
6797  }
6798  KA_TRACE(
6799      20,
6800      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6801       __kmp_dflt_team_nth));
6802
6803#ifdef KMP_ADJUST_BLOCKTIME
6804  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6805  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6806    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6807    if (__kmp_nth > __kmp_avail_proc) {
6808      __kmp_zero_bt = TRUE;
6809    }
6810  }
6811#endif /* KMP_ADJUST_BLOCKTIME */
6812
6813  /* we have finished middle initialization */
6814  TCW_SYNC_4(__kmp_init_middle, TRUE);
6815
6816  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6817}
6818
6819void __kmp_middle_initialize(void) {
6820  if (__kmp_init_middle) {
6821    return;
6822  }
6823  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6824  if (__kmp_init_middle) {
6825    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6826    return;
6827  }
6828  __kmp_do_middle_initialize();
6829  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6830}
6831
6832void __kmp_parallel_initialize(void) {
6833  int gtid = __kmp_entry_gtid(); // this might be a new root
6834
6835  /* synchronize parallel initialization (for sibling) */
6836  if (TCR_4(__kmp_init_parallel))
6837    return;
6838  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6839  if (TCR_4(__kmp_init_parallel)) {
6840    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6841    return;
6842  }
6843
6844  /* TODO reinitialization after we have already shut down */
6845  if (TCR_4(__kmp_global.g.g_done)) {
6846    KA_TRACE(
6847        10,
6848        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6849    __kmp_infinite_loop();
6850  }
6851
6852  /* jc: The lock __kmp_initz_lock is already held, so calling
6853     __kmp_serial_initialize would cause a deadlock.  So we call
6854     __kmp_do_serial_initialize directly. */
6855  if (!__kmp_init_middle) {
6856    __kmp_do_middle_initialize();
6857  }
6858  __kmp_resume_if_hard_paused();
6859
6860  /* begin initialization */
6861  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6862  KMP_ASSERT(KMP_UBER_GTID(gtid));
6863
6864#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6865  // Save the FP control regs.
6866  // Worker threads will set theirs to these values at thread startup.
6867  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6868  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6869  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6870#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6871
6872#if KMP_OS_UNIX
6873#if KMP_HANDLE_SIGNALS
6874  /*  must be after __kmp_serial_initialize  */
6875  __kmp_install_signals(TRUE);
6876#endif
6877#endif
6878
6879  __kmp_suspend_initialize();
6880
6881#if defined(USE_LOAD_BALANCE)
6882  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6883    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6884  }
6885#else
6886  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6887    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6888  }
6889#endif
6890
6891  if (__kmp_version) {
6892    __kmp_print_version_2();
6893  }
6894
6895  /* we have finished parallel initialization */
6896  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6897
6898  KMP_MB();
6899  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6900
6901  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6902}
6903
6904/* ------------------------------------------------------------------------ */
6905
6906void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6907                                   kmp_team_t *team) {
6908  kmp_disp_t *dispatch;
6909
6910  KMP_MB();
6911
6912  /* none of the threads have encountered any constructs, yet. */
6913  this_thr->th.th_local.this_construct = 0;
6914#if KMP_CACHE_MANAGE
6915  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6916#endif /* KMP_CACHE_MANAGE */
6917  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6918  KMP_DEBUG_ASSERT(dispatch);
6919  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6920  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6921  // this_thr->th.th_info.ds.ds_tid ] );
6922
6923  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6924  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
6925  if (__kmp_env_consistency_check)
6926    __kmp_push_parallel(gtid, team->t.t_ident);
6927
6928  KMP_MB(); /* Flush all pending memory write invalidates.  */
6929}
6930
6931void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6932                                  kmp_team_t *team) {
6933  if (__kmp_env_consistency_check)
6934    __kmp_pop_parallel(gtid, team->t.t_ident);
6935
6936  __kmp_finish_implicit_task(this_thr);
6937}
6938
6939int __kmp_invoke_task_func(int gtid) {
6940  int rc;
6941  int tid = __kmp_tid_from_gtid(gtid);
6942  kmp_info_t *this_thr = __kmp_threads[gtid];
6943  kmp_team_t *team = this_thr->th.th_team;
6944
6945  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6946#if USE_ITT_BUILD
6947  if (__itt_stack_caller_create_ptr) {
6948    __kmp_itt_stack_callee_enter(
6949        (__itt_caller)
6950            team->t.t_stack_id); // inform ittnotify about entering user's code
6951  }
6952#endif /* USE_ITT_BUILD */
6953#if INCLUDE_SSC_MARKS
6954  SSC_MARK_INVOKING();
6955#endif
6956
6957#if OMPT_SUPPORT
6958  void *dummy;
6959  void **exit_runtime_p;
6960  ompt_data_t *my_task_data;
6961  ompt_data_t *my_parallel_data;
6962  int ompt_team_size;
6963
6964  if (ompt_enabled.enabled) {
6965    exit_runtime_p = &(
6966        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
6967  } else {
6968    exit_runtime_p = &dummy;
6969  }
6970
6971  my_task_data =
6972      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6973  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6974  if (ompt_enabled.ompt_callback_implicit_task) {
6975    ompt_team_size = team->t.t_nproc;
6976    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6977        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6978        __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
6979    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6980  }
6981#endif
6982
6983#if KMP_STATS_ENABLED
6984  stats_state_e previous_state = KMP_GET_THREAD_STATE();
6985  if (previous_state == stats_state_e::TEAMS_REGION) {
6986    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
6987  } else {
6988    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
6989  }
6990  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
6991#endif
6992
6993  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6994                              tid, (int)team->t.t_argc, (void **)team->t.t_argv
6995#if OMPT_SUPPORT
6996                              ,
6997                              exit_runtime_p
6998#endif
6999                              );
7000#if OMPT_SUPPORT
7001  *exit_runtime_p = NULL;
7002#endif
7003
7004#if KMP_STATS_ENABLED
7005  if (previous_state == stats_state_e::TEAMS_REGION) {
7006    KMP_SET_THREAD_STATE(previous_state);
7007  }
7008  KMP_POP_PARTITIONED_TIMER();
7009#endif
7010
7011#if USE_ITT_BUILD
7012  if (__itt_stack_caller_create_ptr) {
7013    __kmp_itt_stack_callee_leave(
7014        (__itt_caller)
7015            team->t.t_stack_id); // inform ittnotify about leaving user's code
7016  }
7017#endif /* USE_ITT_BUILD */
7018  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7019
7020  return rc;
7021}
7022
7023void __kmp_teams_master(int gtid) {
7024  // This routine is called by all master threads in teams construct
7025  kmp_info_t *thr = __kmp_threads[gtid];
7026  kmp_team_t *team = thr->th.th_team;
7027  ident_t *loc = team->t.t_ident;
7028  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7029  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7030  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7031  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7032                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7033
7034  // This thread is a new CG root.  Set up the proper variables.
7035  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7036  tmp->cg_root = thr; // Make thr the CG root
7037  // Init to thread limit that was stored when league masters were forked
7038  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7039  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7040  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7041                 " cg_nthreads to 1\n",
7042                 thr, tmp));
7043  tmp->up = thr->th.th_cg_roots;
7044  thr->th.th_cg_roots = tmp;
7045
7046// Launch the league of teams now, but do not let workers execute
7047// (they hang on fork barrier until next parallel)
7048#if INCLUDE_SSC_MARKS
7049  SSC_MARK_FORKING();
7050#endif
7051  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7052                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7053                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7054#if INCLUDE_SSC_MARKS
7055  SSC_MARK_JOINING();
7056#endif
7057  // If the team size was reduced from the limit, set it to the new size
7058  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7059    thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7060  // AC: last parameter "1" eliminates join barrier which won't work because
7061  // worker threads are in a fork barrier waiting for more parallel regions
7062  __kmp_join_call(loc, gtid
7063#if OMPT_SUPPORT
7064                  ,
7065                  fork_context_intel
7066#endif
7067                  ,
7068                  1);
7069}
7070
7071int __kmp_invoke_teams_master(int gtid) {
7072  kmp_info_t *this_thr = __kmp_threads[gtid];
7073  kmp_team_t *team = this_thr->th.th_team;
7074#if KMP_DEBUG
7075  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7076    KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7077                     (void *)__kmp_teams_master);
7078#endif
7079  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7080  __kmp_teams_master(gtid);
7081  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7082  return 1;
7083}
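
/* Editorial illustration (not part of the runtime): user code of the kind
   whose league masters end up in __kmp_invoke_teams_master() and
   __kmp_teams_master(). Clause values are examples only.

     #include <stdio.h>

     int main(void) {
     #pragma omp teams num_teams(4) thread_limit(8)
       {
         // each league master reaches __kmp_teams_master() here
     #pragma omp parallel
         printf("hello from a team\n");
       }
       return 0;
     }
*/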
7084
7085/* This sets the requested number of threads for the next parallel region
7086   encountered by this team. Since this should be enclosed in the fork/join
7087   critical section, it should avoid race conditions with asymmetrical
7088   nested parallelism. */
7089
7090void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7091  kmp_info_t *thr = __kmp_threads[gtid];
7092
7093  if (num_threads > 0)
7094    thr->th.th_set_nproc = num_threads;
7095}
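
/* Editorial sketch of the usual path into __kmp_push_num_threads() (assumed
   compiler lowering; names such as "outlined_fn" are illustrative): for
   "#pragma omp parallel num_threads(4)" the compiler emits a call to the
   __kmpc_push_num_threads() entry point immediately before forking:

     kmp_int32 gtid = __kmpc_global_thread_num(&loc);
     __kmpc_push_num_threads(&loc, gtid, 4); // stores th_set_nproc = 4
     __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_fn);

   The stored value is consumed by the next fork and applies only to that
   parallel region. */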
7096
7097/* This sets the requested number of teams for the teams region and/or
7098   the number of threads for the next parallel region encountered. */
7099void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7100                          int num_threads) {
7101  kmp_info_t *thr = __kmp_threads[gtid];
7102  KMP_DEBUG_ASSERT(num_teams >= 0);
7103  KMP_DEBUG_ASSERT(num_threads >= 0);
7104
7105  if (num_teams == 0)
7106    num_teams = 1; // default number of teams is 1.
7107  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7108    if (!__kmp_reserve_warn) {
7109      __kmp_reserve_warn = 1;
7110      __kmp_msg(kmp_ms_warning,
7111                KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7112                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7113    }
7114    num_teams = __kmp_teams_max_nth;
7115  }
7116  // Set number of teams (number of threads in the outer "parallel" of the
7117  // teams)
7118  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7119
7120  // Remember the number of threads for inner parallel regions
7121  if (num_threads == 0) {
7122    if (!TCR_4(__kmp_init_middle))
7123      __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7124    num_threads = __kmp_avail_proc / num_teams;
7125    if (num_teams * num_threads > __kmp_teams_max_nth) {
7126      // adjust num_threads w/o warning as it is not user setting
7127      num_threads = __kmp_teams_max_nth / num_teams;
7128    }
7129  } else {
7130    // This thread will be the master of the league masters
7131    // Store new thread limit; old limit is saved in th_cg_roots list
7132    thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7133
7134    if (num_teams * num_threads > __kmp_teams_max_nth) {
7135      int new_threads = __kmp_teams_max_nth / num_teams;
7136      if (!__kmp_reserve_warn) { // user asked for too many threads
7137        __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7138        __kmp_msg(kmp_ms_warning,
7139                  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7140                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7141      }
7142      num_threads = new_threads;
7143    }
7144  }
7145  thr->th.th_teams_size.nth = num_threads;
7146}
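
/* Worked example of the sizing logic above (editorial; the numeric values are
   hypothetical): with __kmp_avail_proc == 16 and __kmp_teams_max_nth == 32,
   a request of num_teams == 4, num_threads == 0 gives
       num_threads = 16 / 4 = 4   (4 * 4 = 16 <= 32, so no clamping)
   while a request of num_teams == 8, num_threads == 8 gives
       8 * 8 = 64 > 32  ->  num_threads = 32 / 8 = 4
   and, because the thread count was user-specified, a one-time warning. */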
7147
7148// Set the proc_bind var to use in the following parallel region.
7149void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7150  kmp_info_t *thr = __kmp_threads[gtid];
7151  thr->th.th_set_proc_bind = proc_bind;
7152}
7153
7154/* Launch the worker threads into the microtask. */
7155
7156void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7157  kmp_info_t *this_thr = __kmp_threads[gtid];
7158
7159#ifdef KMP_DEBUG
7160  int f;
7161#endif /* KMP_DEBUG */
7162
7163  KMP_DEBUG_ASSERT(team);
7164  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7165  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7166  KMP_MB(); /* Flush all pending memory write invalidates.  */
7167
7168  team->t.t_construct = 0; /* no single directives seen yet */
7169  team->t.t_ordered.dt.t_value =
7170      0; /* thread 0 enters the ordered section first */
7171
7172  /* Reset the identifiers on the dispatch buffer */
7173  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7174  if (team->t.t_max_nproc > 1) {
7175    int i;
7176    for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7177      team->t.t_disp_buffer[i].buffer_index = i;
7178      team->t.t_disp_buffer[i].doacross_buf_idx = i;
7179    }
7180  } else {
7181    team->t.t_disp_buffer[0].buffer_index = 0;
7182    team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7183  }
7184
7185  KMP_MB(); /* Flush all pending memory write invalidates.  */
7186  KMP_ASSERT(this_thr->th.th_team == team);
7187
7188#ifdef KMP_DEBUG
7189  for (f = 0; f < team->t.t_nproc; f++) {
7190    KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7191                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7192  }
7193#endif /* KMP_DEBUG */
7194
7195  /* release the worker threads so they may begin working */
7196  __kmp_fork_barrier(gtid, 0);
7197}
7198
7199void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7200  kmp_info_t *this_thr = __kmp_threads[gtid];
7201
7202  KMP_DEBUG_ASSERT(team);
7203  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7204  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7205  KMP_MB(); /* Flush all pending memory write invalidates.  */
7206
7207/* Join barrier after fork */
7208
7209#ifdef KMP_DEBUG
7210  if (__kmp_threads[gtid] &&
7211      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7212    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7213                 __kmp_threads[gtid]);
7214    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7215                 "team->t.t_nproc=%d\n",
7216                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7217                 team->t.t_nproc);
7218    __kmp_print_structure();
7219  }
7220  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7221                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7222#endif /* KMP_DEBUG */
7223
7224  __kmp_join_barrier(gtid); /* wait for everyone */
7225#if OMPT_SUPPORT
7226  if (ompt_enabled.enabled &&
7227      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7228    int ds_tid = this_thr->th.th_info.ds.ds_tid;
7229    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7230    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7231#if OMPT_OPTIONAL
7232    void *codeptr = NULL;
7233    if (KMP_MASTER_TID(ds_tid) &&
7234        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7235         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7236      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7237
7238    if (ompt_enabled.ompt_callback_sync_region_wait) {
7239      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7240          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7241          codeptr);
7242    }
7243    if (ompt_enabled.ompt_callback_sync_region) {
7244      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7245          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7246          codeptr);
7247    }
7248#endif
7249    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7250      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7251          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7252    }
7253  }
7254#endif
7255
7256  KMP_MB(); /* Flush all pending memory write invalidates.  */
7257  KMP_ASSERT(this_thr->th.th_team == team);
7258}
7259
7260/* ------------------------------------------------------------------------ */
7261
7262#ifdef USE_LOAD_BALANCE
7263
7264// Return the number of worker threads actively spinning in the hot team, if
7265// we are at the outermost level of parallelism.  Otherwise, return 0.
7266static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7267  int i;
7268  int retval;
7269  kmp_team_t *hot_team;
7270
7271  if (root->r.r_active) {
7272    return 0;
7273  }
7274  hot_team = root->r.r_hot_team;
7275  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7276    return hot_team->t.t_nproc - 1; // Don't count master thread
7277  }
7278
7279  // Skip the master thread - it is accounted for elsewhere.
7280  retval = 0;
7281  for (i = 1; i < hot_team->t.t_nproc; i++) {
7282    if (hot_team->t.t_threads[i]->th.th_active) {
7283      retval++;
7284    }
7285  }
7286  return retval;
7287}
7288
7289// Perform an automatic adjustment to the number of
7290// threads used by the next parallel region.
7291static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7292  int retval;
7293  int pool_active;
7294  int hot_team_active;
7295  int team_curr_active;
7296  int system_active;
7297
7298  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7299                set_nproc));
7300  KMP_DEBUG_ASSERT(root);
7301  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7302                       ->th.th_current_task->td_icvs.dynamic == TRUE);
7303  KMP_DEBUG_ASSERT(set_nproc > 1);
7304
7305  if (set_nproc == 1) {
7306    KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7307    return 1;
7308  }
7309
7310  // Threads that are active in the thread pool, active in the hot team for this
7311  // particular root (if we are at the outer par level), and the currently
7312  // executing thread (to become the master) are available to add to the new
7313  // team, but are currently contributing to the system load, and must be
7314  // accounted for.
7315  pool_active = __kmp_thread_pool_active_nth;
7316  hot_team_active = __kmp_active_hot_team_nproc(root);
7317  team_curr_active = pool_active + hot_team_active + 1;
7318
7319  // Check the system load.
7320  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7321  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7322                "hot team active = %d\n",
7323                system_active, pool_active, hot_team_active));
7324
7325  if (system_active < 0) {
7326    // There was an error reading the necessary info from /proc, so use the
7327    // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7328    // = dynamic_thread_limit, we shouldn't wind up getting back here.
7329    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7330    KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7331
7332    // Make this call behave like the thread limit algorithm.
7333    retval = __kmp_avail_proc - __kmp_nth +
7334             (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7335    if (retval > set_nproc) {
7336      retval = set_nproc;
7337    }
7338    if (retval < KMP_MIN_NTH) {
7339      retval = KMP_MIN_NTH;
7340    }
7341
7342    KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7343                  retval));
7344    return retval;
7345  }
7346
7347  // There is a slight delay in the load balance algorithm in detecting new
7348  // running procs. The real system load at this instant should be at least
7349  // as large as the number of active OMP threads available to add to the team.
7350  if (system_active < team_curr_active) {
7351    system_active = team_curr_active;
7352  }
7353  retval = __kmp_avail_proc - system_active + team_curr_active;
7354  if (retval > set_nproc) {
7355    retval = set_nproc;
7356  }
7357  if (retval < KMP_MIN_NTH) {
7358    retval = KMP_MIN_NTH;
7359  }
7360
7361  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7362  return retval;
7363} // __kmp_load_balance_nproc()
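
/* Worked example of the calculation above (editorial; numbers hypothetical):
   with __kmp_avail_proc == 8, pool_active == 1, hot_team_active == 2 the
   currently committed threads are team_curr_active = 1 + 2 + 1 = 4.  If
   __kmp_get_load_balance() reports system_active == 9, then
       retval = 8 - 9 + 4 = 3
   i.e. three threads can run without oversubscribing the machine; the result
   is then clamped to the [KMP_MIN_NTH, set_nproc] range. */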
7364
7365#endif /* USE_LOAD_BALANCE */
7366
7367/* ------------------------------------------------------------------------ */
7368
7369/* NOTE: this is called with the __kmp_init_lock held */
7370void __kmp_cleanup(void) {
7371  int f;
7372
7373  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7374
7375  if (TCR_4(__kmp_init_parallel)) {
7376#if KMP_HANDLE_SIGNALS
7377    __kmp_remove_signals();
7378#endif
7379    TCW_4(__kmp_init_parallel, FALSE);
7380  }
7381
7382  if (TCR_4(__kmp_init_middle)) {
7383#if KMP_AFFINITY_SUPPORTED
7384    __kmp_affinity_uninitialize();
7385#endif /* KMP_AFFINITY_SUPPORTED */
7386    __kmp_cleanup_hierarchy();
7387    TCW_4(__kmp_init_middle, FALSE);
7388  }
7389
7390  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7391
7392  if (__kmp_init_serial) {
7393    __kmp_runtime_destroy();
7394    __kmp_init_serial = FALSE;
7395  }
7396
7397  __kmp_cleanup_threadprivate_caches();
7398
7399  for (f = 0; f < __kmp_threads_capacity; f++) {
7400    if (__kmp_root[f] != NULL) {
7401      __kmp_free(__kmp_root[f]);
7402      __kmp_root[f] = NULL;
7403    }
7404  }
7405  __kmp_free(__kmp_threads);
7406  // __kmp_threads and __kmp_root were allocated at once, as a single block,
7407  // so there is no need to free __kmp_root separately.
7408  __kmp_threads = NULL;
7409  __kmp_root = NULL;
7410  __kmp_threads_capacity = 0;
7411
7412#if KMP_USE_DYNAMIC_LOCK
7413  __kmp_cleanup_indirect_user_locks();
7414#else
7415  __kmp_cleanup_user_locks();
7416#endif
7417
7418#if KMP_AFFINITY_SUPPORTED
7419  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7420  __kmp_cpuinfo_file = NULL;
7421#endif /* KMP_AFFINITY_SUPPORTED */
7422
7423#if KMP_USE_ADAPTIVE_LOCKS
7424#if KMP_DEBUG_ADAPTIVE_LOCKS
7425  __kmp_print_speculative_stats();
7426#endif
7427#endif
7428  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7429  __kmp_nested_nth.nth = NULL;
7430  __kmp_nested_nth.size = 0;
7431  __kmp_nested_nth.used = 0;
7432  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7433  __kmp_nested_proc_bind.bind_types = NULL;
7434  __kmp_nested_proc_bind.size = 0;
7435  __kmp_nested_proc_bind.used = 0;
7436  if (__kmp_affinity_format) {
7437    KMP_INTERNAL_FREE(__kmp_affinity_format);
7438    __kmp_affinity_format = NULL;
7439  }
7440
7441  __kmp_i18n_catclose();
7442
7443#if KMP_USE_HIER_SCHED
7444  __kmp_hier_scheds.deallocate();
7445#endif
7446
7447#if KMP_STATS_ENABLED
7448  __kmp_stats_fini();
7449#endif
7450
7451  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7452}
7453
7454/* ------------------------------------------------------------------------ */
7455
7456int __kmp_ignore_mppbeg(void) {
7457  char *env;
7458
7459  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7460    if (__kmp_str_match_false(env))
7461      return FALSE;
7462  }
7463  // By default __kmpc_begin() is a no-op.
7464  return TRUE;
7465}
7466
7467int __kmp_ignore_mppend(void) {
7468  char *env;
7469
7470  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7471    if (__kmp_str_match_false(env))
7472      return FALSE;
7473  }
7474  // By default __kmpc_end() is a no-op.
7475  return TRUE;
7476}
7477
7478void __kmp_internal_begin(void) {
7479  int gtid;
7480  kmp_root_t *root;
7481
7482  /* This is a very important step, as it will register new sibling threads
7483     and assign these new uber threads a new gtid. */
7484  gtid = __kmp_entry_gtid();
7485  root = __kmp_threads[gtid]->th.th_root;
7486  KMP_ASSERT(KMP_UBER_GTID(gtid));
7487
7488  if (root->r.r_begin)
7489    return;
7490  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7491  if (root->r.r_begin) {
7492    __kmp_release_lock(&root->r.r_begin_lock, gtid);
7493    return;
7494  }
7495
7496  root->r.r_begin = TRUE;
7497
7498  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7499}
7500
7501/* ------------------------------------------------------------------------ */
7502
7503void __kmp_user_set_library(enum library_type arg) {
7504  int gtid;
7505  kmp_root_t *root;
7506  kmp_info_t *thread;
7507
7508  /* first, make sure we are initialized so we can get our gtid */
7509
7510  gtid = __kmp_entry_gtid();
7511  thread = __kmp_threads[gtid];
7512
7513  root = thread->th.th_root;
7514
7515  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7516                library_serial));
7517  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7518                                  thread */
7519    KMP_WARNING(SetLibraryIncorrectCall);
7520    return;
7521  }
7522
7523  switch (arg) {
7524  case library_serial:
7525    thread->th.th_set_nproc = 0;
7526    set__nproc(thread, 1);
7527    break;
7528  case library_turnaround:
7529    thread->th.th_set_nproc = 0;
7530    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7531                                           : __kmp_dflt_team_nth_ub);
7532    break;
7533  case library_throughput:
7534    thread->th.th_set_nproc = 0;
7535    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7536                                           : __kmp_dflt_team_nth_ub);
7537    break;
7538  default:
7539    KMP_FATAL(UnknownLibraryType, arg);
7540  }
7541
7542  __kmp_aux_set_library(arg);
7543}
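
/* Editorial usage sketch (vendor extension API; availability assumed from the
   Intel/LLVM omp.h extensions): the same setting is normally reached through
   the KMP_LIBRARY environment variable (serial|turnaround|throughput) or the
   kmp_set_library* entry points, e.g.:

     #include <omp.h>

     int main(void) {
       kmp_set_library_throughput(); // same effect as KMP_LIBRARY=throughput
     #pragma omp parallel
       { } // parallel work
       return 0;
     }
*/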
7544
7545void __kmp_aux_set_stacksize(size_t arg) {
7546  if (!__kmp_init_serial)
7547    __kmp_serial_initialize();
7548
7549#if KMP_OS_DARWIN
7550  if (arg & (0x1000 - 1)) {
7551    arg &= ~(0x1000 - 1);
7552    if (arg + 0x1000) /* check for overflow if we round up */
7553      arg += 0x1000;
7554  }
7555#endif
7556  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7557
7558  /* only change the default stacksize before the first parallel region */
7559  if (!TCR_4(__kmp_init_parallel)) {
7560    size_t value = arg; /* argument is in bytes */
7561
7562    if (value < __kmp_sys_min_stksize)
7563      value = __kmp_sys_min_stksize;
7564    else if (value > KMP_MAX_STKSIZE)
7565      value = KMP_MAX_STKSIZE;
7566
7567    __kmp_stksize = value;
7568
7569    __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7570  }
7571
7572  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7573}
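
/* Editorial usage sketch (value illustrative; kmp_set_stacksize_s() is the
   omp.h extension assumed here): the default worker stack size handled above
   is normally set before the first parallel region, e.g. via
   KMP_STACKSIZE/OMP_STACKSIZE or:

     #include <omp.h>

     int main(void) {
       kmp_set_stacksize_s((size_t)8 * 1024 * 1024); // 8 MB per worker
     #pragma omp parallel
       { } // workers are created with the new stack size
       return 0;
     }
*/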
7574
7575/* set the behaviour of the runtime library */
7576/* TODO this can cause some odd behaviour with sibling parallelism... */
7577void __kmp_aux_set_library(enum library_type arg) {
7578  __kmp_library = arg;
7579
7580  switch (__kmp_library) {
7581  case library_serial: {
7582    KMP_INFORM(LibraryIsSerial);
7583  } break;
7584  case library_turnaround:
7585    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7586      __kmp_use_yield = 2; // only yield when oversubscribed
7587    break;
7588  case library_throughput:
7589    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7590      __kmp_dflt_blocktime = 200;
7591    break;
7592  default:
7593    KMP_FATAL(UnknownLibraryType, arg);
7594  }
7595}
7596
7597/* Getting team information common for all team API */
7598// Returns NULL if not in teams construct
7599static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7600  kmp_info_t *thr = __kmp_entry_thread();
7601  teams_serialized = 0;
7602  if (thr->th.th_teams_microtask) {
7603    kmp_team_t *team = thr->th.th_team;
7604    int tlevel = thr->th.th_teams_level; // the level of the teams construct
7605    int ii = team->t.t_level;
7606    teams_serialized = team->t.t_serialized;
7607    int level = tlevel + 1;
7608    KMP_DEBUG_ASSERT(ii >= tlevel);
7609    while (ii > level) {
7610      for (teams_serialized = team->t.t_serialized;
7611           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7612      }
7613      if (team->t.t_serialized && (!teams_serialized)) {
7614        team = team->t.t_parent;
7615        continue;
7616      }
7617      if (ii > level) {
7618        team = team->t.t_parent;
7619        ii--;
7620      }
7621    }
7622    return team;
7623  }
7624  return NULL;
7625}
7626
7627int __kmp_aux_get_team_num() {
7628  int serialized;
7629  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7630  if (team) {
7631    if (serialized > 1) {
7632      return 0; // teams region is serialized ( 1 team of 1 thread ).
7633    } else {
7634      return team->t.t_master_tid;
7635    }
7636  }
7637  return 0;
7638}
7639
7640int __kmp_aux_get_num_teams() {
7641  int serialized;
7642  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7643  if (team) {
7644    if (serialized > 1) {
7645      return 1;
7646    } else {
7647      return team->t.t_parent->t.t_nproc;
7648    }
7649  }
7650  return 1;
7651}
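
/* Editorial note: these helpers back the user-level team queries (mapping
   through the C/Fortran entry points is assumed); a minimal illustration,
   with the clause value chosen arbitrarily:

     #include <omp.h>
     #include <stdio.h>

     void league_info(void) { // hypothetical helper, call it from main()
     #pragma omp teams num_teams(2)
       printf("team %d of %d\n",
              omp_get_team_num(),   // -> __kmp_aux_get_team_num()
              omp_get_num_teams()); // -> __kmp_aux_get_num_teams()
     }
*/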
7652
7653/* ------------------------------------------------------------------------ */
7654
7655/*
7656 * Affinity Format Parser
7657 *
7658 * Field is in form of: %[[[0].]size]type
7659 * % and type are required (%% means print a literal '%')
7660 * type is either single char or long name surrounded by {},
7661 * e.g., N or {num_threads}
7662 * 0 => leading zeros
7663 * . => right justified when size is specified
7664 * by default output is left justified
7665 * size is the *minimum* field length
7666 * All other characters are printed as is
7667 *
7668 * Available field types:
7669 * t {team_num}          - omp_get_team_num()
7670 * T {num_teams}         - omp_get_num_teams()
7671 * L {nesting_level}     - omp_get_level()
7672 * n {thread_num}        - omp_get_thread_num()
7673 * N {num_threads}       - omp_get_num_threads()
7674 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
7675 * H {host}              - name of host machine
7676 * P {process_id}        - process id (integer)
7677 * i {native_thread_id}  - native thread identifier (integer)
7678 * A {thread_affinity}   - comma separated list of ints/ranges (affinity mask)
7679 * Implementation-specific field types can be added
7680 * If a type is unknown, print "undefined"
7681*/
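
/* Editorial example of the format in action (host name, thread number and
   affinity values are hypothetical): with
     OMP_AFFINITY_FORMAT="host=%H tid=%0.4n aff=%{thread_affinity}"
   thread 2 of a team bound to cores 4 and 5 on host "node01" would print
     host=node01 tid=0002 aff=4,5
   since "%0.4n" requests a zero-padded, right-justified field of minimum
   width 4. */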
7682
7683// Structure holding the short name, long name, and corresponding data type
7684// for snprintf.  A table of these will represent the entire valid keyword
7685// field types.
7686typedef struct kmp_affinity_format_field_t {
7687  char short_name; // from spec e.g., L -> thread level
7688  const char *long_name; // from spec thread_level -> thread level
7689  char field_format; // data type for snprintf (typically 'd' or 's'
7690  // for integer or string)
7691} kmp_affinity_format_field_t;
7692
7693static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7694#if KMP_AFFINITY_SUPPORTED
7695    {'A', "thread_affinity", 's'},
7696#endif
7697    {'t', "team_num", 'd'},
7698    {'T', "num_teams", 'd'},
7699    {'L', "nesting_level", 'd'},
7700    {'n', "thread_num", 'd'},
7701    {'N', "num_threads", 'd'},
7702    {'a', "ancestor_tnum", 'd'},
7703    {'H', "host", 's'},
7704    {'P', "process_id", 'd'},
7705    {'i', "native_thread_id", 'd'}};
7706
7707// Return the number of characters it takes to hold the field
7708static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7709                                            const char **ptr,
7710                                            kmp_str_buf_t *field_buffer) {
7711  int rc, format_index, field_value;
7712  const char *width_left, *width_right;
7713  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7714  static const int FORMAT_SIZE = 20;
7715  char format[FORMAT_SIZE] = {0};
7716  char absolute_short_name = 0;
7717
7718  KMP_DEBUG_ASSERT(gtid >= 0);
7719  KMP_DEBUG_ASSERT(th);
7720  KMP_DEBUG_ASSERT(**ptr == '%');
7721  KMP_DEBUG_ASSERT(field_buffer);
7722
7723  __kmp_str_buf_clear(field_buffer);
7724
7725  // Skip the initial %
7726  (*ptr)++;
7727
7728  // Check for %% first
7729  if (**ptr == '%') {
7730    __kmp_str_buf_cat(field_buffer, "%", 1);
7731    (*ptr)++; // skip over the second %
7732    return 1;
7733  }
7734
7735  // Parse field modifiers if they are present
7736  pad_zeros = false;
7737  if (**ptr == '0') {
7738    pad_zeros = true;
7739    (*ptr)++; // skip over 0
7740  }
7741  right_justify = false;
7742  if (**ptr == '.') {
7743    right_justify = true;
7744    (*ptr)++; // skip over .
7745  }
7746  // Parse width of field: [width_left, width_right)
7747  width_left = width_right = NULL;
7748  if (**ptr >= '0' && **ptr <= '9') {
7749    width_left = *ptr;
7750    SKIP_DIGITS(*ptr);
7751    width_right = *ptr;
7752  }
7753
7754  // Create the format for KMP_SNPRINTF based on flags parsed above
7755  format_index = 0;
7756  format[format_index++] = '%';
7757  if (!right_justify)
7758    format[format_index++] = '-';
7759  if (pad_zeros)
7760    format[format_index++] = '0';
7761  if (width_left && width_right) {
7762    int i = 0;
7763    // Only allow 8 digit number widths.
7764    // This also prevents overflowing the format variable.
7765    while (i < 8 && width_left < width_right) {
7766      format[format_index++] = *width_left;
7767      width_left++;
7768      i++;
7769    }
7770  }
7771
7772  // Parse a name (long or short)
7773  // Canonicalize the name into absolute_short_name
7774  found_valid_name = false;
7775  parse_long_name = (**ptr == '{');
7776  if (parse_long_name)
7777    (*ptr)++; // skip initial left brace
7778  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7779                             sizeof(__kmp_affinity_format_table[0]);
7780       ++i) {
7781    char short_name = __kmp_affinity_format_table[i].short_name;
7782    const char *long_name = __kmp_affinity_format_table[i].long_name;
7783    char field_format = __kmp_affinity_format_table[i].field_format;
7784    if (parse_long_name) {
7785      int length = KMP_STRLEN(long_name);
7786      if (strncmp(*ptr, long_name, length) == 0) {
7787        found_valid_name = true;
7788        (*ptr) += length; // skip the long name
7789      }
7790    } else if (**ptr == short_name) {
7791      found_valid_name = true;
7792      (*ptr)++; // skip the short name
7793    }
7794    if (found_valid_name) {
7795      format[format_index++] = field_format;
7796      format[format_index++] = '\0';
7797      absolute_short_name = short_name;
7798      break;
7799    }
7800  }
7801  if (parse_long_name) {
7802    if (**ptr != '}') {
7803      absolute_short_name = 0;
7804    } else {
7805      (*ptr)++; // skip over the right brace
7806    }
7807  }
7808
7809  // Attempt to fill the buffer with the requested
7810  // value using snprintf within __kmp_str_buf_print()
7811  switch (absolute_short_name) {
7812  case 't':
7813    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7814    break;
7815  case 'T':
7816    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7817    break;
7818  case 'L':
7819    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7820    break;
7821  case 'n':
7822    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7823    break;
7824  case 'H': {
7825    static const int BUFFER_SIZE = 256;
7826    char buf[BUFFER_SIZE];
7827    __kmp_expand_host_name(buf, BUFFER_SIZE);
7828    rc = __kmp_str_buf_print(field_buffer, format, buf);
7829  } break;
7830  case 'P':
7831    rc = __kmp_str_buf_print(field_buffer, format, getpid());
7832    break;
7833  case 'i':
7834    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7835    break;
7836  case 'N':
7837    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7838    break;
7839  case 'a':
7840    field_value =
7841        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7842    rc = __kmp_str_buf_print(field_buffer, format, field_value);
7843    break;
7844#if KMP_AFFINITY_SUPPORTED
7845  case 'A': {
7846    kmp_str_buf_t buf;
7847    __kmp_str_buf_init(&buf);
7848    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7849    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7850    __kmp_str_buf_free(&buf);
7851  } break;
7852#endif
7853  default:
7854    // According to the spec, if an implementation does not have info for a
7855    // field type, then "undefined" is printed
7856    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7857    // Skip the field
7858    if (parse_long_name) {
7859      SKIP_TOKEN(*ptr);
7860      if (**ptr == '}')
7861        (*ptr)++;
7862    } else {
7863      (*ptr)++;
7864    }
7865  }
7866
7867  KMP_ASSERT(format_index <= FORMAT_SIZE);
7868  return rc;
7869}
7870
7871/*
7872 * Return the number of characters needed to hold the affinity string
7873 * (not including the terminating null byte).
7874 * The resulting string is printed to buffer, which the caller can then
7875 * handle afterwards.
7876*/
7877size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7878                                  kmp_str_buf_t *buffer) {
7879  const char *parse_ptr;
7880  size_t retval;
7881  const kmp_info_t *th;
7882  kmp_str_buf_t field;
7883
7884  KMP_DEBUG_ASSERT(buffer);
7885  KMP_DEBUG_ASSERT(gtid >= 0);
7886
7887  __kmp_str_buf_init(&field);
7888  __kmp_str_buf_clear(buffer);
7889
7890  th = __kmp_threads[gtid];
7891  retval = 0;
7892
7893  // If format is NULL or zero-length string, then we use
7894  // affinity-format-var ICV
7895  parse_ptr = format;
7896  if (parse_ptr == NULL || *parse_ptr == '\0') {
7897    parse_ptr = __kmp_affinity_format;
7898  }
7899  KMP_DEBUG_ASSERT(parse_ptr);
7900
7901  while (*parse_ptr != '\0') {
7902    // Parse a field
7903    if (*parse_ptr == '%') {
7904      // Put field in the buffer
7905      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7906      __kmp_str_buf_catbuf(buffer, &field);
7907      retval += rc;
7908    } else {
7909      // Put literal character in buffer
7910      __kmp_str_buf_cat(buffer, parse_ptr, 1);
7911      retval++;
7912      parse_ptr++;
7913    }
7914  }
7915  __kmp_str_buf_free(&field);
7916  return retval;
7917}
7918
7919// Displays the affinity string to stdout
7920void __kmp_aux_display_affinity(int gtid, const char *format) {
7921  kmp_str_buf_t buf;
7922  __kmp_str_buf_init(&buf);
7923  __kmp_aux_capture_affinity(gtid, format, &buf);
7924  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7925  __kmp_str_buf_free(&buf);
7926}
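
/* Editorial usage sketch (OpenMP 5.0 display-affinity API; the mapping from
   omp_capture_affinity()/omp_display_affinity() onto the two routines above
   is assumed, and the format string is illustrative):

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
     #pragma omp parallel
       {
         char buf[256];
         size_t n = omp_capture_affinity(buf, sizeof(buf),
                                         "tid=%n host=%H aff=%A");
         printf("needed %zu chars: %s\n", n, buf);
         omp_display_affinity(NULL); // NULL -> use the affinity-format-var ICV
       }
       return 0;
     }
*/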
7927
7928/* ------------------------------------------------------------------------ */
7929
7930void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7931  int blocktime = arg; /* argument is in milliseconds */
7932#if KMP_USE_MONITOR
7933  int bt_intervals;
7934#endif
7935  int bt_set;
7936
7937  __kmp_save_internal_controls(thread);
7938
7939  /* Normalize and set blocktime for the teams */
7940  if (blocktime < KMP_MIN_BLOCKTIME)
7941    blocktime = KMP_MIN_BLOCKTIME;
7942  else if (blocktime > KMP_MAX_BLOCKTIME)
7943    blocktime = KMP_MAX_BLOCKTIME;
7944
7945  set__blocktime_team(thread->th.th_team, tid, blocktime);
7946  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7947
7948#if KMP_USE_MONITOR
7949  /* Calculate and set blocktime intervals for the teams */
7950  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7951
7952  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7953  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7954#endif
7955
7956  /* Set whether blocktime has been set to "TRUE" */
7957  bt_set = TRUE;
7958
7959  set__bt_set_team(thread->th.th_team, tid, bt_set);
7960  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7961#if KMP_USE_MONITOR
7962  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7963                "bt_intervals=%d, monitor_updates=%d\n",
7964                __kmp_gtid_from_tid(tid, thread->th.th_team),
7965                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7966                __kmp_monitor_wakeups));
7967#else
7968  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7969                __kmp_gtid_from_tid(tid, thread->th.th_team),
7970                thread->th.th_team->t.t_id, tid, blocktime));
7971#endif
7972}
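
/* Editorial usage sketch (vendor extension; the value is illustrative): the
   user-level path into __kmp_aux_set_blocktime() is kmp_set_blocktime() from
   omp.h (the KMP_BLOCKTIME environment variable sets the initial default at
   startup instead):

     #include <omp.h>

     int main(void) {
       kmp_set_blocktime(0); // workers sleep immediately after each region
     #pragma omp parallel
       { } // after this region the workers go to sleep right away
       return 0;
     }
*/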
7973
7974void __kmp_aux_set_defaults(char const *str, int len) {
7975  if (!__kmp_init_serial) {
7976    __kmp_serial_initialize();
7977  }
7978  __kmp_env_initialize(str);
7979
7980  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
7981    __kmp_env_print();
7982  }
7983} // __kmp_aux_set_defaults
7984
7985/* ------------------------------------------------------------------------ */
7986/* internal fast reduction routines */
7987
7988PACKED_REDUCTION_METHOD_T
7989__kmp_determine_reduction_method(
7990    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7991    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7992    kmp_critical_name *lck) {
7993
7994  // Default reduction method: critical construct ( lck != NULL, like in current
7995  // PAROPT )
7996  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7997  // can be selected by RTL
7998  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7999  // can be selected by RTL
8000  // Finally, it's up to the OpenMP RTL to make a decision on which method to
8001  // select among those generated by PAROPT.
8002
8003  PACKED_REDUCTION_METHOD_T retval;
8004
8005  int team_size;
8006
8007  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8008  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8009
8010#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8011  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8012#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8013
8014  retval = critical_reduce_block;
8015
8016  // another way of getting the team size (with 1 dynamic dereference) is slower
8017  team_size = __kmp_get_team_num_threads(global_tid);
8018  if (team_size == 1) {
8019
8020    retval = empty_reduce_block;
8021
8022  } else {
8023
8024    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8025
8026#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8027
8028#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8029    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8030
8031    int teamsize_cutoff = 4;
8032
8033#if KMP_MIC_SUPPORTED
8034    if (__kmp_mic_type != non_mic) {
8035      teamsize_cutoff = 8;
8036    }
8037#endif
8038    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8039    if (tree_available) {
8040      if (team_size <= teamsize_cutoff) {
8041        if (atomic_available) {
8042          retval = atomic_reduce_block;
8043        }
8044      } else {
8045        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8046      }
8047    } else if (atomic_available) {
8048      retval = atomic_reduce_block;
8049    }
8050#else
8051#error "Unknown or unsupported OS"
8052#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8053       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8054
8055#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8056
8057#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8058
8059    // basic tuning
8060
8061    if (atomic_available) {
8062      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8063        retval = atomic_reduce_block;
8064      }
8065    } // otherwise: use critical section
8066
8067#elif KMP_OS_DARWIN
8068
8069    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8070    if (atomic_available && (num_vars <= 3)) {
8071      retval = atomic_reduce_block;
8072    } else if (tree_available) {
8073      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8074          (reduce_size < (2000 * sizeof(kmp_real64)))) {
8075        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8076      }
8077    } // otherwise: use critical section
8078
8079#else
8080#error "Unknown or unsupported OS"
8081#endif
8082
8083#else
8084#error "Unknown or unsupported architecture"
8085#endif
8086  }
8087
8088  // KMP_FORCE_REDUCTION
8089
8090  // If the team is serialized (team_size == 1), ignore the forced reduction
8091  // method and stay with the unsynchronized method (empty_reduce_block)
8092  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8093      team_size != 1) {
8094
8095    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8096
8097    int atomic_available, tree_available;
8098
8099    switch ((forced_retval = __kmp_force_reduction_method)) {
8100    case critical_reduce_block:
8101      KMP_ASSERT(lck); // lck should be != 0
8102      break;
8103
8104    case atomic_reduce_block:
8105      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8106      if (!atomic_available) {
8107        KMP_WARNING(RedMethodNotSupported, "atomic");
8108        forced_retval = critical_reduce_block;
8109      }
8110      break;
8111
8112    case tree_reduce_block:
8113      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8114      if (!tree_available) {
8115        KMP_WARNING(RedMethodNotSupported, "tree");
8116        forced_retval = critical_reduce_block;
8117      } else {
8118#if KMP_FAST_REDUCTION_BARRIER
8119        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8120#endif
8121      }
8122      break;
8123
8124    default:
8125      KMP_ASSERT(0); // "unsupported method specified"
8126    }
8127
8128    retval = forced_retval;
8129  }
8130
8131  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8132
8133#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8134#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8135
8136  return (retval);
8137}
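
/* Editorial sketch of the calling context (assumed compiler lowering; the
   names loc, priv_sum, combiner_fn and crit_name are illustrative): for
   "#pragma omp parallel for reduction(+:sum)" the compiler emits something
   like

     kmp_int32 red = __kmpc_reduce_nowait(&loc, gtid, 1, sizeof(double),
                                          &priv_sum, combiner_fn, &crit_name);

   A non-NULL reduce_data/reduce_func pair is what makes the tree method
   selectable above (FAST_REDUCTION_TREE_METHOD_GENERATED), the
   KMP_IDENT_ATOMIC_REDUCE flag in loc enables the atomic method, and
   KMP_FORCE_REDUCTION can override the automatic choice. */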
8138
8139// this function is for testing set/get/determine reduce method
8140kmp_int32 __kmp_get_reduce_method(void) {
8141  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8142}
8143
8144// Soft pause sets up threads to ignore blocktime and just go to sleep.
8145// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8146void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8147
8148// Hard pause shuts down the runtime completely.  Resume happens naturally when
8149// OpenMP is used subsequently.
8150void __kmp_hard_pause() {
8151  __kmp_pause_status = kmp_hard_paused;
8152  __kmp_internal_end_thread(-1);
8153}
8154
8155// Soft resume sets __kmp_pause_status, and wakes up all threads.
8156void __kmp_resume_if_soft_paused() {
8157  if (__kmp_pause_status == kmp_soft_paused) {
8158    __kmp_pause_status = kmp_not_paused;
8159
8160    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8161      kmp_info_t *thread = __kmp_threads[gtid];
8162      if (thread) { // Wake it if sleeping
8163        kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8164        if (fl.is_sleeping())
8165          fl.resume(gtid);
8166        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8167          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8168        } else { // thread holds the lock and may sleep soon
8169          do { // until either the thread sleeps, or we can get the lock
8170            if (fl.is_sleeping()) {
8171              fl.resume(gtid);
8172              break;
8173            } else if (__kmp_try_suspend_mx(thread)) {
8174              __kmp_unlock_suspend_mx(thread);
8175              break;
8176            }
8177          } while (1);
8178        }
8179      }
8180    }
8181  }
8182}
8183
8184// This function is called via __kmpc_pause_resource. Returns 0 if successful.
8185// TODO: add warning messages
8186int __kmp_pause_resource(kmp_pause_status_t level) {
8187  if (level == kmp_not_paused) { // requesting resume
8188    if (__kmp_pause_status == kmp_not_paused) {
8189      // error message about runtime not being paused, so can't resume
8190      return 1;
8191    } else {
8192      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8193                       __kmp_pause_status == kmp_hard_paused);
8194      __kmp_pause_status = kmp_not_paused;
8195      return 0;
8196    }
8197  } else if (level == kmp_soft_paused) { // requesting soft pause
8198    if (__kmp_pause_status != kmp_not_paused) {
8199      // error message about already being paused
8200      return 1;
8201    } else {
8202      __kmp_soft_pause();
8203      return 0;
8204    }
8205  } else if (level == kmp_hard_paused) { // requesting hard pause
8206    if (__kmp_pause_status != kmp_not_paused) {
8207      // error message about already being paused
8208      return 1;
8209    } else {
8210      __kmp_hard_pause();
8211      return 0;
8212    }
8213  } else {
8214    // error message about invalid level
8215    return 1;
8216  }
8217}
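
/* Editorial usage sketch (OpenMP 5.0 pause API; the mapping from
   omp_pause_resource_all() through __kmpc_pause_resource() to this function
   is assumed): a host program can release runtime resources between phases:

     #include <omp.h>

     int main(void) {
     #pragma omp parallel
       { } // first phase
       omp_pause_resource_all(omp_pause_soft); // returns 0 on success
     #pragma omp parallel
       { } // the runtime resumes automatically on the next parallel region
       return 0;
     }
*/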
8218