1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34
35/* these are temporary issues to be dealt with */
36#define KMP_USE_PRCTL 0
37
38#if KMP_OS_WINDOWS
39#include <process.h>
40#endif
41
42#include "tsan_annotations.h"
43
44#if defined(KMP_GOMP_COMPAT)
45char const __kmp_version_alt_comp[] =
46    KMP_VERSION_PREFIX "alternative compiler support: yes";
47#endif /* defined(KMP_GOMP_COMPAT) */
48
49char const __kmp_version_omp_api[] =
50    KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51
52#ifdef KMP_DEBUG
53char const __kmp_version_lock[] =
54    KMP_VERSION_PREFIX "lock type: run time selectable";
55#endif /* KMP_DEBUG */
56
57#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58
59/* ------------------------------------------------------------------------ */
60
61#if KMP_USE_MONITOR
62kmp_info_t __kmp_monitor;
63#endif
64
65/* Forward declarations */
66
67void __kmp_cleanup(void);
68
69static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70                                  int gtid);
71static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72                                  kmp_internal_control_t *new_icvs,
73                                  ident_t *loc);
74#if KMP_AFFINITY_SUPPORTED
75static void __kmp_partition_places(kmp_team_t *team,
76                                   int update_master_only = 0);
77#endif
78static void __kmp_do_serial_initialize(void);
79void __kmp_fork_barrier(int gtid, int tid);
80void __kmp_join_barrier(int gtid);
81void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82                          kmp_internal_control_t *new_icvs, ident_t *loc);
83
84#ifdef USE_LOAD_BALANCE
85static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86#endif
87
88static int __kmp_expand_threads(int nNeed);
89#if KMP_OS_WINDOWS
90static int __kmp_unregister_root_other_thread(int gtid);
91#endif
92static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95
96/* Calculate the identifier of the current thread */
97/* fast (and somewhat portable) way to get unique identifier of executing
98   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99int __kmp_get_global_thread_id() {
100  int i;
101  kmp_info_t **other_threads;
102  size_t stack_data;
103  char *stack_addr;
104  size_t stack_size;
105  char *stack_base;
106
107  KA_TRACE(
108      1000,
109      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
110       __kmp_nth, __kmp_all_nth));
111
112  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
113     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
114     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
115     __kmp_init_gtid for this to work. */
116
117  if (!TCR_4(__kmp_init_gtid))
118    return KMP_GTID_DNE;
119
120#ifdef KMP_TDATA_GTID
121  if (TCR_4(__kmp_gtid_mode) >= 3) {
122    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
123    return __kmp_gtid;
124  }
125#endif
126  if (TCR_4(__kmp_gtid_mode) >= 2) {
127    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128    return __kmp_gtid_get_specific();
129  }
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
131
132  stack_addr = (char *)&stack_data;
133  other_threads = __kmp_threads;
134
135  /* ATT: The code below is a source of potential bugs due to unsynchronized
136     access to __kmp_threads array. For example:
137     1. Current thread loads other_threads[i] to thr and checks it, it is
138        non-NULL.
139     2. Current thread is suspended by OS.
140     3. Another thread unregisters and finishes (debug versions of free()
141        may fill memory with something like 0xEF).
142     4. Current thread is resumed.
143     5. Current thread reads junk from *thr.
144     TODO: Fix it.  --ln  */
145
146  for (i = 0; i < __kmp_threads_capacity; i++) {
147
148    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
149    if (!thr)
150      continue;
151
152    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
154
155    /* stack grows down -- search through all of the active threads */
156
157    if (stack_addr <= stack_base) {
158      size_t stack_diff = stack_base - stack_addr;
159
160      if (stack_diff <= stack_size) {
161        /* The only way we can be closer than the allocated */
162        /* stack size is if we are running on this thread. */
163        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
164        return i;
165      }
166    }
167  }
168
169  /* get specific to try and determine our gtid */
170  KA_TRACE(1000,
171           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172            "thread, using TLS\n"));
173  i = __kmp_gtid_get_specific();
174
175  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
176
177  /* if we havn't been assigned a gtid, then return code */
178  if (i < 0)
179    return i;
180
181  /* dynamically updated stack window for uber threads to avoid get_specific
182     call */
183  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184    KMP_FATAL(StackOverflow, i);
185  }
186
187  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188  if (stack_addr > stack_base) {
189    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
192                stack_base);
193  } else {
194    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195            stack_base - stack_addr);
196  }
197
198  /* Reprint stack bounds for ubermaster since they have been refined */
199  if (__kmp_storage_map) {
200    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203                                 other_threads[i]->th.th_info.ds.ds_stacksize,
204                                 "th_%d stack (refinement)", i);
205  }
206  return i;
207}
208
209int __kmp_get_global_thread_id_reg() {
210  int gtid;
211
212  if (!__kmp_init_serial) {
213    gtid = KMP_GTID_DNE;
214  } else
215#ifdef KMP_TDATA_GTID
216      if (TCR_4(__kmp_gtid_mode) >= 3) {
217    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218    gtid = __kmp_gtid;
219  } else
220#endif
221      if (TCR_4(__kmp_gtid_mode) >= 2) {
222    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223    gtid = __kmp_gtid_get_specific();
224  } else {
225    KA_TRACE(1000,
226             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227    gtid = __kmp_get_global_thread_id();
228  }
229
230  /* we must be a new uber master sibling thread */
231  if (gtid == KMP_GTID_DNE) {
232    KA_TRACE(10,
233             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234              "Registering a new gtid.\n"));
235    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236    if (!__kmp_init_serial) {
237      __kmp_do_serial_initialize();
238      gtid = __kmp_gtid_get_specific();
239    } else {
240      gtid = __kmp_register_root(FALSE);
241    }
242    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244  }
245
246  KMP_DEBUG_ASSERT(gtid >= 0);
247
248  return gtid;
249}
250
251/* caller must hold forkjoin_lock */
252void __kmp_check_stack_overlap(kmp_info_t *th) {
253  int f;
254  char *stack_beg = NULL;
255  char *stack_end = NULL;
256  int gtid;
257
258  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259  if (__kmp_storage_map) {
260    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262
263    gtid = __kmp_gtid_from_thread(th);
264
265    if (gtid == KMP_GTID_MONITOR) {
266      __kmp_print_storage_map_gtid(
267          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268          "th_%s stack (%s)", "mon",
269          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270    } else {
271      __kmp_print_storage_map_gtid(
272          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273          "th_%d stack (%s)", gtid,
274          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275    }
276  }
277
278  /* No point in checking ubermaster threads since they use refinement and
279   * cannot overlap */
280  gtid = __kmp_gtid_from_thread(th);
281  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282    KA_TRACE(10,
283             ("__kmp_check_stack_overlap: performing extensive checking\n"));
284    if (stack_beg == NULL) {
285      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287    }
288
289    for (f = 0; f < __kmp_threads_capacity; f++) {
290      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291
292      if (f_th && f_th != th) {
293        char *other_stack_end =
294            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295        char *other_stack_beg =
296            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298            (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299
300          /* Print the other stack values before the abort */
301          if (__kmp_storage_map)
302            __kmp_print_storage_map_gtid(
303                -1, other_stack_beg, other_stack_end,
304                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306
307          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308                      __kmp_msg_null);
309        }
310      }
311    }
312  }
313  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314}
315
316/* ------------------------------------------------------------------------ */
317
318void __kmp_infinite_loop(void) {
319  static int done = FALSE;
320
321  while (!done) {
322    KMP_YIELD(TRUE);
323  }
324}
325
326#define MAX_MESSAGE 512
327
328void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329                                  char const *format, ...) {
330  char buffer[MAX_MESSAGE];
331  va_list ap;
332
333  va_start(ap, format);
334  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335               p2, (unsigned long)size, format);
336  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337  __kmp_vprintf(kmp_err, buffer, ap);
338#if KMP_PRINT_DATA_PLACEMENT
339  int node;
340  if (gtid >= 0) {
341    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342      if (__kmp_storage_map_verbose) {
343        node = __kmp_get_host_node(p1);
344        if (node < 0) /* doesn't work, so don't try this next time */
345          __kmp_storage_map_verbose = FALSE;
346        else {
347          char *last;
348          int lastNode;
349          int localProc = __kmp_get_cpu_from_gtid(gtid);
350
351          const int page_size = KMP_GET_PAGE_SIZE();
352
353          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355          if (localProc >= 0)
356            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
357                                 localProc >> 1);
358          else
359            __kmp_printf_no_lock("  GTID %d\n", gtid);
360#if KMP_USE_PRCTL
361          /* The more elaborate format is disabled for now because of the prctl
362           * hanging bug. */
363          do {
364            last = p1;
365            lastNode = node;
366            /* This loop collates adjacent pages with the same host node. */
367            do {
368              (char *)p1 += page_size;
369            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
371                                 lastNode);
372          } while (p1 <= p2);
373#else
374          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
375                               (char *)p1 + (page_size - 1),
376                               __kmp_get_host_node(p1));
377          if (p1 < p2) {
378            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                 (char *)p2 + (page_size - 1),
380                                 __kmp_get_host_node(p2));
381          }
382#endif
383        }
384      }
385    } else
386      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
387  }
388#endif /* KMP_PRINT_DATA_PLACEMENT */
389  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390}
391
392void __kmp_warn(char const *format, ...) {
393  char buffer[MAX_MESSAGE];
394  va_list ap;
395
396  if (__kmp_generate_warnings == kmp_warnings_off) {
397    return;
398  }
399
400  va_start(ap, format);
401
402  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404  __kmp_vprintf(kmp_err, buffer, ap);
405  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406
407  va_end(ap);
408}
409
410void __kmp_abort_process() {
411  // Later threads may stall here, but that's ok because abort() will kill them.
412  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413
414  if (__kmp_debug_buf) {
415    __kmp_dump_debug_buffer();
416  }
417
418  if (KMP_OS_WINDOWS) {
419    // Let other threads know of abnormal termination and prevent deadlock
420    // if abort happened during library initialization or shutdown
421    __kmp_global.g.g_abort = SIGABRT;
422
423    /* On Windows* OS by default abort() causes pop-up error box, which stalls
424       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
425       boxes. _set_abort_behavior() works well, but this function is not
426       available in VS7 (this is not problem for DLL, but it is a problem for
427       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
428       help, at least in some versions of MS C RTL.
429
430       It seems following sequence is the only way to simulate abort() and
431       avoid pop-up error box. */
432    raise(SIGABRT);
433    _exit(3); // Just in case, if signal ignored, exit anyway.
434  } else {
435    abort();
436  }
437
438  __kmp_infinite_loop();
439  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440
441} // __kmp_abort_process
442
443void __kmp_abort_thread(void) {
444  // TODO: Eliminate g_abort global variable and this function.
445  // In case of abort just call abort(), it will kill all the threads.
446  __kmp_infinite_loop();
447} // __kmp_abort_thread
448
449/* Print out the storage map for the major kmp_info_t thread data structures
450   that are allocated together. */
451
452static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454                               gtid);
455
456  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458
459  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460                               sizeof(kmp_local_t), "th_%d.th_local", gtid);
461
462  __kmp_print_storage_map_gtid(
463      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465
466  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467                               &thr->th.th_bar[bs_plain_barrier + 1],
468                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469                               gtid);
470
471  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
473                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474                               gtid);
475
476#if KMP_FAST_REDUCTION_BARRIER
477  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478                               &thr->th.th_bar[bs_reduction_barrier + 1],
479                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480                               gtid);
481#endif // KMP_FAST_REDUCTION_BARRIER
482}
483
484/* Print out the storage map for the major kmp_team_t team data structures
485   that are allocated together. */
486
487static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488                                         int team_id, int num_thr) {
489  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491                               header, team_id);
492
493  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494                               &team->t.t_bar[bs_last_barrier],
495                               sizeof(kmp_balign_team_t) * bs_last_barrier,
496                               "%s_%d.t_bar", header, team_id);
497
498  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499                               &team->t.t_bar[bs_plain_barrier + 1],
500                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501                               header, team_id);
502
503  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504                               &team->t.t_bar[bs_forkjoin_barrier + 1],
505                               sizeof(kmp_balign_team_t),
506                               "%s_%d.t_bar[forkjoin]", header, team_id);
507
508#if KMP_FAST_REDUCTION_BARRIER
509  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510                               &team->t.t_bar[bs_reduction_barrier + 1],
511                               sizeof(kmp_balign_team_t),
512                               "%s_%d.t_bar[reduction]", header, team_id);
513#endif // KMP_FAST_REDUCTION_BARRIER
514
515  __kmp_print_storage_map_gtid(
516      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518
519  __kmp_print_storage_map_gtid(
520      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522
523  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524                               &team->t.t_disp_buffer[num_disp_buff],
525                               sizeof(dispatch_shared_info_t) * num_disp_buff,
526                               "%s_%d.t_disp_buffer", header, team_id);
527}
528
529static void __kmp_init_allocator() { __kmp_init_memkind(); }
530static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531
532/* ------------------------------------------------------------------------ */
533
534#if KMP_DYNAMIC_LIB
535#if KMP_OS_WINDOWS
536
537static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538  // TODO: Change to __kmp_break_bootstrap_lock().
539  __kmp_init_bootstrap_lock(lck); // make the lock released
540}
541
542static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543  int i;
544  int thread_count;
545
546  // PROCESS_DETACH is expected to be called by a thread that executes
547  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
548  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
549  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
550  // threads can be still alive here, although being about to be terminated. The
551  // threads in the array with ds_thread==0 are most suspicious. Actually, it
552  // can be not safe to access the __kmp_threads[].
553
554  // TODO: does it make sense to check __kmp_roots[] ?
555
556  // Let's check that there are no other alive threads registered with the OMP
557  // lib.
558  while (1) {
559    thread_count = 0;
560    for (i = 0; i < __kmp_threads_capacity; ++i) {
561      if (!__kmp_threads)
562        continue;
563      kmp_info_t *th = __kmp_threads[i];
564      if (th == NULL)
565        continue;
566      int gtid = th->th.th_info.ds.ds_gtid;
567      if (gtid == gtid_req)
568        continue;
569      if (gtid < 0)
570        continue;
571      DWORD exit_val;
572      int alive = __kmp_is_thread_alive(th, &exit_val);
573      if (alive) {
574        ++thread_count;
575      }
576    }
577    if (thread_count == 0)
578      break; // success
579  }
580
581  // Assume that I'm alone. Now it might be safe to check and reset locks.
582  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583  __kmp_reset_lock(&__kmp_forkjoin_lock);
584#ifdef KMP_DEBUG
585  __kmp_reset_lock(&__kmp_stdio_lock);
586#endif // KMP_DEBUG
587}
588
589BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591
592  switch (fdwReason) {
593
594  case DLL_PROCESS_ATTACH:
595    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596
597    return TRUE;
598
599  case DLL_PROCESS_DETACH:
600    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601
602    if (lpReserved != NULL) {
603      // lpReserved is used for telling the difference:
604      //   lpReserved == NULL when FreeLibrary() was called,
605      //   lpReserved != NULL when the process terminates.
606      // When FreeLibrary() is called, worker threads remain alive. So they will
607      // release the forkjoin lock by themselves. When the process terminates,
608      // worker threads disappear triggering the problem of unreleased forkjoin
609      // lock as described below.
610
611      // A worker thread can take the forkjoin lock. The problem comes up if
612      // that worker thread becomes dead before it releases the forkjoin lock.
613      // The forkjoin lock remains taken, while the thread executing
614      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615      // to take the forkjoin lock and will always fail, so that the application
616      // will never finish [normally]. This scenario is possible if
617      // __kmpc_end() has not been executed. It looks like it's not a corner
618      // case, but common cases:
619      // - the main function was compiled by an alternative compiler;
620      // - the main function was compiled by icl but without /Qopenmp
621      //   (application with plugins);
622      // - application terminates by calling C exit(), Fortran CALL EXIT() or
623      //   Fortran STOP.
624      // - alive foreign thread prevented __kmpc_end from doing cleanup.
625      //
626      // This is a hack to work around the problem.
627      // TODO: !!! figure out something better.
628      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629    }
630
631    __kmp_internal_end_library(__kmp_gtid_get_specific());
632
633    return TRUE;
634
635  case DLL_THREAD_ATTACH:
636    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637
638    /* if we want to register new siblings all the time here call
639     * __kmp_get_gtid(); */
640    return TRUE;
641
642  case DLL_THREAD_DETACH:
643    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644
645    __kmp_internal_end_thread(__kmp_gtid_get_specific());
646    return TRUE;
647  }
648
649  return TRUE;
650}
651
652#endif /* KMP_OS_WINDOWS */
653#endif /* KMP_DYNAMIC_LIB */
654
655/* __kmp_parallel_deo -- Wait until it's our turn. */
656void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657  int gtid = *gtid_ref;
658#ifdef BUILD_PARALLEL_ORDERED
659  kmp_team_t *team = __kmp_team_from_gtid(gtid);
660#endif /* BUILD_PARALLEL_ORDERED */
661
662  if (__kmp_env_consistency_check) {
663    if (__kmp_threads[gtid]->th.th_root->r.r_active)
664#if KMP_USE_DYNAMIC_LOCK
665      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666#else
667      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668#endif
669  }
670#ifdef BUILD_PARALLEL_ORDERED
671  if (!team->t.t_serialized) {
672    KMP_MB();
673    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674             NULL);
675    KMP_MB();
676  }
677#endif /* BUILD_PARALLEL_ORDERED */
678}
679
680/* __kmp_parallel_dxo -- Signal the next task. */
681void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682  int gtid = *gtid_ref;
683#ifdef BUILD_PARALLEL_ORDERED
684  int tid = __kmp_tid_from_gtid(gtid);
685  kmp_team_t *team = __kmp_team_from_gtid(gtid);
686#endif /* BUILD_PARALLEL_ORDERED */
687
688  if (__kmp_env_consistency_check) {
689    if (__kmp_threads[gtid]->th.th_root->r.r_active)
690      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691  }
692#ifdef BUILD_PARALLEL_ORDERED
693  if (!team->t.t_serialized) {
694    KMP_MB(); /* Flush all pending memory write invalidates.  */
695
696    /* use the tid of the next thread in this team */
697    /* TODO replace with general release procedure */
698    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699
700    KMP_MB(); /* Flush all pending memory write invalidates.  */
701  }
702#endif /* BUILD_PARALLEL_ORDERED */
703}
704
705/* ------------------------------------------------------------------------ */
706/* The BARRIER for a SINGLE process section is always explicit   */
707
708int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709  int status;
710  kmp_info_t *th;
711  kmp_team_t *team;
712
713  if (!TCR_4(__kmp_init_parallel))
714    __kmp_parallel_initialize();
715  __kmp_resume_if_soft_paused();
716
717  th = __kmp_threads[gtid];
718  team = th->th.th_team;
719  status = 0;
720
721  th->th.th_ident = id_ref;
722
723  if (team->t.t_serialized) {
724    status = 1;
725  } else {
726    kmp_int32 old_this = th->th.th_local.this_construct;
727
728    ++th->th.th_local.this_construct;
729    /* try to set team count to thread count--success means thread got the
730       single block */
731    /* TODO: Should this be acquire or release? */
732    if (team->t.t_construct == old_this) {
733      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734                                              th->th.th_local.this_construct);
735    }
736#if USE_ITT_BUILD
737    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739        team->t.t_active_level ==
740            1) { // Only report metadata by master of active team at level 1
741      __kmp_itt_metadata_single(id_ref);
742    }
743#endif /* USE_ITT_BUILD */
744  }
745
746  if (__kmp_env_consistency_check) {
747    if (status && push_ws) {
748      __kmp_push_workshare(gtid, ct_psingle, id_ref);
749    } else {
750      __kmp_check_workshare(gtid, ct_psingle, id_ref);
751    }
752  }
753#if USE_ITT_BUILD
754  if (status) {
755    __kmp_itt_single_start(gtid);
756  }
757#endif /* USE_ITT_BUILD */
758  return status;
759}
760
761void __kmp_exit_single(int gtid) {
762#if USE_ITT_BUILD
763  __kmp_itt_single_end(gtid);
764#endif /* USE_ITT_BUILD */
765  if (__kmp_env_consistency_check)
766    __kmp_pop_workshare(gtid, ct_psingle, NULL);
767}
768
769/* determine if we can go parallel or must use a serialized parallel region and
770 * how many threads we can use
771 * set_nproc is the number of threads requested for the team
772 * returns 0 if we should serialize or only use one thread,
773 * otherwise the number of threads to use
774 * The forkjoin lock is held by the caller. */
775static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776                                 int master_tid, int set_nthreads,
777                                 int enter_teams) {
778  int capacity;
779  int new_nthreads;
780  KMP_DEBUG_ASSERT(__kmp_init_serial);
781  KMP_DEBUG_ASSERT(root && parent_team);
782  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783
784  // If dyn-var is set, dynamically adjust the number of desired threads,
785  // according to the method specified by dynamic_mode.
786  new_nthreads = set_nthreads;
787  if (!get__dynamic_2(parent_team, master_tid)) {
788    ;
789  }
790#ifdef USE_LOAD_BALANCE
791  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793    if (new_nthreads == 1) {
794      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795                    "reservation to 1 thread\n",
796                    master_tid));
797      return 1;
798    }
799    if (new_nthreads < set_nthreads) {
800      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801                    "reservation to %d threads\n",
802                    master_tid, new_nthreads));
803    }
804  }
805#endif /* USE_LOAD_BALANCE */
806  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807    new_nthreads = __kmp_avail_proc - __kmp_nth +
808                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809    if (new_nthreads <= 1) {
810      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811                    "reservation to 1 thread\n",
812                    master_tid));
813      return 1;
814    }
815    if (new_nthreads < set_nthreads) {
816      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817                    "reservation to %d threads\n",
818                    master_tid, new_nthreads));
819    } else {
820      new_nthreads = set_nthreads;
821    }
822  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823    if (set_nthreads > 2) {
824      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825      new_nthreads = (new_nthreads % set_nthreads) + 1;
826      if (new_nthreads == 1) {
827        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828                      "reservation to 1 thread\n",
829                      master_tid));
830        return 1;
831      }
832      if (new_nthreads < set_nthreads) {
833        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834                      "reservation to %d threads\n",
835                      master_tid, new_nthreads));
836      }
837    }
838  } else {
839    KMP_ASSERT(0);
840  }
841
842  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843  if (__kmp_nth + new_nthreads -
844          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845      __kmp_max_nth) {
846    int tl_nthreads = __kmp_max_nth - __kmp_nth +
847                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848    if (tl_nthreads <= 0) {
849      tl_nthreads = 1;
850    }
851
852    // If dyn-var is false, emit a 1-time warning.
853    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854      __kmp_reserve_warn = 1;
855      __kmp_msg(kmp_ms_warning,
856                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858    }
859    if (tl_nthreads == 1) {
860      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861                    "reduced reservation to 1 thread\n",
862                    master_tid));
863      return 1;
864    }
865    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866                  "reservation to %d threads\n",
867                  master_tid, tl_nthreads));
868    new_nthreads = tl_nthreads;
869  }
870
871  // Respect OMP_THREAD_LIMIT
872  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874  if (cg_nthreads + new_nthreads -
875          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876      max_cg_threads) {
877    int tl_nthreads = max_cg_threads - cg_nthreads +
878                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879    if (tl_nthreads <= 0) {
880      tl_nthreads = 1;
881    }
882
883    // If dyn-var is false, emit a 1-time warning.
884    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885      __kmp_reserve_warn = 1;
886      __kmp_msg(kmp_ms_warning,
887                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889    }
890    if (tl_nthreads == 1) {
891      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892                    "reduced reservation to 1 thread\n",
893                    master_tid));
894      return 1;
895    }
896    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897                  "reservation to %d threads\n",
898                  master_tid, tl_nthreads));
899    new_nthreads = tl_nthreads;
900  }
901
902  // Check if the threads array is large enough, or needs expanding.
903  // See comment in __kmp_register_root() about the adjustment if
904  // __kmp_threads[0] == NULL.
905  capacity = __kmp_threads_capacity;
906  if (TCR_PTR(__kmp_threads[0]) == NULL) {
907    --capacity;
908  }
909  if (__kmp_nth + new_nthreads -
910          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911      capacity) {
912    // Expand the threads array.
913    int slotsRequired = __kmp_nth + new_nthreads -
914                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915                        capacity;
916    int slotsAdded = __kmp_expand_threads(slotsRequired);
917    if (slotsAdded < slotsRequired) {
918      // The threads array was not expanded enough.
919      new_nthreads -= (slotsRequired - slotsAdded);
920      KMP_ASSERT(new_nthreads >= 1);
921
922      // If dyn-var is false, emit a 1-time warning.
923      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924        __kmp_reserve_warn = 1;
925        if (__kmp_tp_cached) {
926          __kmp_msg(kmp_ms_warning,
927                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930        } else {
931          __kmp_msg(kmp_ms_warning,
932                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934        }
935      }
936    }
937  }
938
939#ifdef KMP_DEBUG
940  if (new_nthreads == 1) {
941    KC_TRACE(10,
942             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943              "dead roots and rechecking; requested %d threads\n",
944              __kmp_get_gtid(), set_nthreads));
945  } else {
946    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947                  " %d threads\n",
948                  __kmp_get_gtid(), new_nthreads, set_nthreads));
949  }
950#endif // KMP_DEBUG
951  return new_nthreads;
952}
953
954/* Allocate threads from the thread pool and assign them to the new team. We are
955   assured that there are enough threads available, because we checked on that
956   earlier within critical section forkjoin */
957static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958                                    kmp_info_t *master_th, int master_gtid) {
959  int i;
960  int use_hot_team;
961
962  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964  KMP_MB();
965
966  /* first, let's setup the master thread */
967  master_th->th.th_info.ds.ds_tid = 0;
968  master_th->th.th_team = team;
969  master_th->th.th_team_nproc = team->t.t_nproc;
970  master_th->th.th_team_master = master_th;
971  master_th->th.th_team_serialized = FALSE;
972  master_th->th.th_dispatch = &team->t.t_dispatch[0];
973
974/* make sure we are not the optimized hot team */
975#if KMP_NESTED_HOT_TEAMS
976  use_hot_team = 0;
977  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978  if (hot_teams) { // hot teams array is not allocated if
979    // KMP_HOT_TEAMS_MAX_LEVEL=0
980    int level = team->t.t_active_level - 1; // index in array of hot teams
981    if (master_th->th.th_teams_microtask) { // are we inside the teams?
982      if (master_th->th.th_teams_size.nteams > 1) {
983        ++level; // level was not increased in teams construct for
984        // team_of_masters
985      }
986      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987          master_th->th.th_teams_level == team->t.t_level) {
988        ++level; // level was not increased in teams construct for
989        // team_of_workers before the parallel
990      } // team->t.t_level will be increased inside parallel
991    }
992    if (level < __kmp_hot_teams_max_level) {
993      if (hot_teams[level].hot_team) {
994        // hot team has already been allocated for given level
995        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996        use_hot_team = 1; // the team is ready to use
997      } else {
998        use_hot_team = 0; // AC: threads are not allocated yet
999        hot_teams[level].hot_team = team; // remember new hot team
1000        hot_teams[level].hot_team_nth = team->t.t_nproc;
1001      }
1002    } else {
1003      use_hot_team = 0;
1004    }
1005  }
1006#else
1007  use_hot_team = team == root->r.r_hot_team;
1008#endif
1009  if (!use_hot_team) {
1010
1011    /* install the master thread */
1012    team->t.t_threads[0] = master_th;
1013    __kmp_initialize_info(master_th, team, 0, master_gtid);
1014
1015    /* now, install the worker threads */
1016    for (i = 1; i < team->t.t_nproc; i++) {
1017
1018      /* fork or reallocate a new thread and install it in team */
1019      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020      team->t.t_threads[i] = thr;
1021      KMP_DEBUG_ASSERT(thr);
1022      KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023      /* align team and thread arrived states */
1024      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029                    team->t.t_bar[bs_plain_barrier].b_arrived));
1030      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031      thr->th.th_teams_level = master_th->th.th_teams_level;
1032      thr->th.th_teams_size = master_th->th.th_teams_size;
1033      { // Initialize threads' barrier data.
1034        int b;
1035        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036        for (b = 0; b < bs_last_barrier; ++b) {
1037          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039#if USE_DEBUGGER
1040          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041#endif
1042        }
1043      }
1044    }
1045
1046#if KMP_AFFINITY_SUPPORTED
1047    __kmp_partition_places(team);
1048#endif
1049  }
1050
1051  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052    for (i = 0; i < team->t.t_nproc; i++) {
1053      kmp_info_t *thr = team->t.t_threads[i];
1054      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055          thr->th.th_prev_level != team->t.t_level) {
1056        team->t.t_display_affinity = 1;
1057        break;
1058      }
1059    }
1060  }
1061
1062  KMP_MB();
1063}
1064
1065#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066// Propagate any changes to the floating point control registers out to the team
1067// We try to avoid unnecessary writes to the relevant cache line in the team
1068// structure, so we don't make changes unless they are needed.
1069inline static void propagateFPControl(kmp_team_t *team) {
1070  if (__kmp_inherit_fp_control) {
1071    kmp_int16 x87_fpu_control_word;
1072    kmp_uint32 mxcsr;
1073
1074    // Get master values of FPU control flags (both X87 and vector)
1075    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076    __kmp_store_mxcsr(&mxcsr);
1077    mxcsr &= KMP_X86_MXCSR_MASK;
1078
1079    // There is no point looking at t_fp_control_saved here.
1080    // If it is TRUE, we still have to update the values if they are different
1081    // from those we now have. If it is FALSE we didn't save anything yet, but
1082    // our objective is the same. We have to ensure that the values in the team
1083    // are the same as those we have.
1084    // So, this code achieves what we need whether or not t_fp_control_saved is
1085    // true. By checking whether the value needs updating we avoid unnecessary
1086    // writes that would put the cache-line into a written state, causing all
1087    // threads in the team to have to read it again.
1088    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090    // Although we don't use this value, other code in the runtime wants to know
1091    // whether it should restore them. So we must ensure it is correct.
1092    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093  } else {
1094    // Similarly here. Don't write to this cache-line in the team structure
1095    // unless we have to.
1096    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097  }
1098}
1099
1100// Do the opposite, setting the hardware registers to the updated values from
1101// the team.
1102inline static void updateHWFPControl(kmp_team_t *team) {
1103  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104    // Only reset the fp control regs if they have been changed in the team.
1105    // the parallel region that we are exiting.
1106    kmp_int16 x87_fpu_control_word;
1107    kmp_uint32 mxcsr;
1108    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109    __kmp_store_mxcsr(&mxcsr);
1110    mxcsr &= KMP_X86_MXCSR_MASK;
1111
1112    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113      __kmp_clear_x87_fpu_status_word();
1114      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115    }
1116
1117    if (team->t.t_mxcsr != mxcsr) {
1118      __kmp_load_mxcsr(&team->t.t_mxcsr);
1119    }
1120  }
1121}
1122#else
1123#define propagateFPControl(x) ((void)0)
1124#define updateHWFPControl(x) ((void)0)
1125#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1126
1127static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128                                     int realloc); // forward declaration
1129
1130/* Run a parallel region that has been serialized, so runs only in a team of the
1131   single master thread. */
1132void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133  kmp_info_t *this_thr;
1134  kmp_team_t *serial_team;
1135
1136  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137
1138  /* Skip all this code for autopar serialized loops since it results in
1139     unacceptable overhead */
1140  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141    return;
1142
1143  if (!TCR_4(__kmp_init_parallel))
1144    __kmp_parallel_initialize();
1145  __kmp_resume_if_soft_paused();
1146
1147  this_thr = __kmp_threads[global_tid];
1148  serial_team = this_thr->th.th_serial_team;
1149
1150  /* utilize the serialized team held by this thread */
1151  KMP_DEBUG_ASSERT(serial_team);
1152  KMP_MB();
1153
1154  if (__kmp_tasking_mode != tskm_immediate_exec) {
1155    KMP_DEBUG_ASSERT(
1156        this_thr->th.th_task_team ==
1157        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159                     NULL);
1160    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161                  "team %p, new task_team = NULL\n",
1162                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163    this_thr->th.th_task_team = NULL;
1164  }
1165
1166  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168    proc_bind = proc_bind_false;
1169  } else if (proc_bind == proc_bind_default) {
1170    // No proc_bind clause was specified, so use the current value
1171    // of proc-bind-var for this parallel region.
1172    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173  }
1174  // Reset for next parallel region
1175  this_thr->th.th_set_proc_bind = proc_bind_default;
1176
1177#if OMPT_SUPPORT
1178  ompt_data_t ompt_parallel_data = ompt_data_none;
1179  ompt_data_t *implicit_task_data;
1180  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181  if (ompt_enabled.enabled &&
1182      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183
1184    ompt_task_info_t *parent_task_info;
1185    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186
1187    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188    if (ompt_enabled.ompt_callback_parallel_begin) {
1189      int team_size = 1;
1190
1191      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192          &(parent_task_info->task_data), &(parent_task_info->frame),
1193          &ompt_parallel_data, team_size,
1194          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1195    }
1196  }
1197#endif // OMPT_SUPPORT
1198
1199  if (this_thr->th.th_team != serial_team) {
1200    // Nested level will be an index in the nested nthreads array
1201    int level = this_thr->th.th_team->t.t_level;
1202
1203    if (serial_team->t.t_serialized) {
1204      /* this serial team was already used
1205         TODO increase performance by making this locks more specific */
1206      kmp_team_t *new_team;
1207
1208      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209
1210      new_team =
1211          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212#if OMPT_SUPPORT
1213                              ompt_parallel_data,
1214#endif
1215                              proc_bind, &this_thr->th.th_current_task->td_icvs,
1216                              0 USE_NESTED_HOT_ARG(NULL));
1217      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218      KMP_ASSERT(new_team);
1219
1220      /* setup new serialized team and install it */
1221      new_team->t.t_threads[0] = this_thr;
1222      new_team->t.t_parent = this_thr->th.th_team;
1223      serial_team = new_team;
1224      this_thr->th.th_serial_team = serial_team;
1225
1226      KF_TRACE(
1227          10,
1228          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229           global_tid, serial_team));
1230
1231      /* TODO the above breaks the requirement that if we run out of resources,
1232         then we can still guarantee that serialized teams are ok, since we may
1233         need to allocate a new one */
1234    } else {
1235      KF_TRACE(
1236          10,
1237          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238           global_tid, serial_team));
1239    }
1240
1241    /* we have to initialize this serial team */
1242    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245    serial_team->t.t_ident = loc;
1246    serial_team->t.t_serialized = 1;
1247    serial_team->t.t_nproc = 1;
1248    serial_team->t.t_parent = this_thr->th.th_team;
1249    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250    this_thr->th.th_team = serial_team;
1251    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252
1253    KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1254                  this_thr->th.th_current_task));
1255    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256    this_thr->th.th_current_task->td_flags.executing = 0;
1257
1258    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259
1260    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261       implicit task for each serialized task represented by
1262       team->t.t_serialized? */
1263    copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264              &this_thr->th.th_current_task->td_parent->td_icvs);
1265
1266    // Thread value exists in the nested nthreads array for the next nested
1267    // level
1268    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269      this_thr->th.th_current_task->td_icvs.nproc =
1270          __kmp_nested_nth.nth[level + 1];
1271    }
1272
1273    if (__kmp_nested_proc_bind.used &&
1274        (level + 1 < __kmp_nested_proc_bind.used)) {
1275      this_thr->th.th_current_task->td_icvs.proc_bind =
1276          __kmp_nested_proc_bind.bind_types[level + 1];
1277    }
1278
1279#if USE_DEBUGGER
1280    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281#endif
1282    this_thr->th.th_info.ds.ds_tid = 0;
1283
1284    /* set thread cache values */
1285    this_thr->th.th_team_nproc = 1;
1286    this_thr->th.th_team_master = this_thr;
1287    this_thr->th.th_team_serialized = 1;
1288
1289    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292
1293    propagateFPControl(serial_team);
1294
1295    /* check if we need to allocate dispatch buffers stack */
1296    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297    if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298      serial_team->t.t_dispatch->th_disp_buffer =
1299          (dispatch_private_info_t *)__kmp_allocate(
1300              sizeof(dispatch_private_info_t));
1301    }
1302    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303
1304    KMP_MB();
1305
1306  } else {
1307    /* this serialized team is already being used,
1308     * that's fine, just add another nested level */
1309    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312    ++serial_team->t.t_serialized;
1313    this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314
1315    // Nested level will be an index in the nested nthreads array
1316    int level = this_thr->th.th_team->t.t_level;
1317    // Thread value exists in the nested nthreads array for the next nested
1318    // level
1319    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320      this_thr->th.th_current_task->td_icvs.nproc =
1321          __kmp_nested_nth.nth[level + 1];
1322    }
1323    serial_team->t.t_level++;
1324    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325                  "of serial team %p to %d\n",
1326                  global_tid, serial_team, serial_team->t.t_level));
1327
1328    /* allocate/push dispatch buffers stack */
1329    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330    {
1331      dispatch_private_info_t *disp_buffer =
1332          (dispatch_private_info_t *)__kmp_allocate(
1333              sizeof(dispatch_private_info_t));
1334      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336    }
1337    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338
1339    KMP_MB();
1340  }
1341  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342
1343  // Perform the display affinity functionality for
1344  // serialized parallel regions
1345  if (__kmp_display_affinity) {
1346    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347        this_thr->th.th_prev_num_threads != 1) {
1348      // NULL means use the affinity-format-var ICV
1349      __kmp_aux_display_affinity(global_tid, NULL);
1350      this_thr->th.th_prev_level = serial_team->t.t_level;
1351      this_thr->th.th_prev_num_threads = 1;
1352    }
1353  }
1354
1355  if (__kmp_env_consistency_check)
1356    __kmp_push_parallel(global_tid, NULL);
1357#if OMPT_SUPPORT
1358  serial_team->t.ompt_team_info.master_return_address = codeptr;
1359  if (ompt_enabled.enabled &&
1360      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362
1363    ompt_lw_taskteam_t lw_taskteam;
1364    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365                            &ompt_parallel_data, codeptr);
1366
1367    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368    // don't use lw_taskteam after linking. content was swaped
1369
1370    /* OMPT implicit task begin */
1371    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372    if (ompt_enabled.ompt_callback_implicit_task) {
1373      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376      OMPT_CUR_TASK_INFO(this_thr)
1377          ->thread_num = __kmp_tid_from_gtid(global_tid);
1378    }
1379
1380    /* OMPT state */
1381    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383  }
1384#endif
1385}
1386
1387/* most of the work for a fork */
1388/* return true if we really went parallel, false if serialized */
1389int __kmp_fork_call(ident_t *loc, int gtid,
1390                    enum fork_context_e call_context, // Intel, GNU, ...
1391                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392                    kmp_va_list ap) {
1393  void **argv;
1394  int i;
1395  int master_tid;
1396  int master_this_cons;
1397  kmp_team_t *team;
1398  kmp_team_t *parent_team;
1399  kmp_info_t *master_th;
1400  kmp_root_t *root;
1401  int nthreads;
1402  int master_active;
1403  int master_set_numthreads;
1404  int level;
1405  int active_level;
1406  int teams_level;
1407#if KMP_NESTED_HOT_TEAMS
1408  kmp_hot_team_ptr_t **p_hot_teams;
1409#endif
1410  { // KMP_TIME_BLOCK
1411    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1412    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1413
1414    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1415    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1416      /* Some systems prefer the stack for the root thread(s) to start with */
1417      /* some gap from the parent stack to prevent false sharing. */
1418      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1419      /* These 2 lines below are so this does not get optimized out */
1420      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1421        __kmp_stkpadding += (short)((kmp_int64)dummy);
1422    }
1423
1424    /* initialize if needed */
1425    KMP_DEBUG_ASSERT(
1426        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1427    if (!TCR_4(__kmp_init_parallel))
1428      __kmp_parallel_initialize();
1429    __kmp_resume_if_soft_paused();
1430
1431    /* setup current data */
    // AC: potentially unsafe, not in sync with shutdown
    master_th = __kmp_threads[gtid];
1434    parent_team = master_th->th.th_team;
1435    master_tid = master_th->th.th_info.ds.ds_tid;
1436    master_this_cons = master_th->th.th_local.this_construct;
1437    root = master_th->th.th_root;
1438    master_active = root->r.r_active;
1439    master_set_numthreads = master_th->th.th_set_nproc;
1440
1441#if OMPT_SUPPORT
1442    ompt_data_t ompt_parallel_data = ompt_data_none;
1443    ompt_data_t *parent_task_data;
1444    ompt_frame_t *ompt_frame;
1445    ompt_data_t *implicit_task_data;
1446    void *return_address = NULL;
1447
1448    if (ompt_enabled.enabled) {
1449      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1450                                    NULL, NULL);
1451      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1452    }
1453#endif
1454
1455    // Nested level will be an index in the nested nthreads array
1456    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1458    active_level = parent_team->t.t_active_level;
1459    // needed to check nesting inside the teams
1460    teams_level = master_th->th.th_teams_level;
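    // With nested hot teams (KMP_NESTED_HOT_TEAMS), each thread keeps an array
    // of per-level hot teams, up to __kmp_hot_teams_max_level, so that nested
    // parallel regions can reuse worker threads; the array is allocated lazily
    // on the first fork that needs it (below).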
1461#if KMP_NESTED_HOT_TEAMS
1462    p_hot_teams = &master_th->th.th_hot_teams;
1463    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1464      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1465          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1466      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team or not needed (when active_level > 0)
1468      (*p_hot_teams)[0].hot_team_nth = 1;
1469    }
1470#endif
1471
1472#if OMPT_SUPPORT
1473    if (ompt_enabled.enabled) {
1474      if (ompt_enabled.ompt_callback_parallel_begin) {
1475        int team_size = master_set_numthreads
1476                            ? master_set_numthreads
1477                            : get__nproc_2(parent_team, master_tid);
1478        int flags = OMPT_INVOKER(call_context) |
1479                    ((microtask == (microtask_t)__kmp_teams_master)
1480                         ? ompt_parallel_league
1481                         : ompt_parallel_team);
1482        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1483            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1484            return_address);
1485      }
1486      master_th->th.ompt_thread_info.state = ompt_state_overhead;
1487    }
1488#endif
1489
1490    master_th->th.th_ident = loc;
1491
1492    if (master_th->th.th_teams_microtask && ap &&
1493        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1494      // AC: This is start of parallel that is nested inside teams construct.
1495      // The team is actual (hot), all workers are ready at the fork barrier.
1496      // No lock needed to initialize the team a bit, then free workers.
1497      parent_team->t.t_ident = loc;
1498      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1499      parent_team->t.t_argc = argc;
1500      argv = (void **)parent_team->t.t_argv;
1501      for (i = argc - 1; i >= 0; --i)
1502        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase the serialization
1504      if (parent_team == master_th->th.th_serial_team) {
1505        // AC: we are in serialized parallel
1506        __kmpc_serialized_parallel(loc, gtid);
1507        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1508
1509#if OMPT_SUPPORT
1510        void *dummy;
1511        void **exit_frame_p;
1512
1513        ompt_lw_taskteam_t lw_taskteam;
1514
1515        if (ompt_enabled.enabled) {
1516          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1517                                  &ompt_parallel_data, return_address);
1518          exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1519
1520          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1522
1523          /* OMPT implicit task begin */
1524          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1525          if (ompt_enabled.ompt_callback_implicit_task) {
1526            OMPT_CUR_TASK_INFO(master_th)
1527                ->thread_num = __kmp_tid_from_gtid(gtid);
1528            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1529                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1530                implicit_task_data, 1,
1531                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1532          }
1533
1534          /* OMPT state */
1535          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1536        } else {
1537          exit_frame_p = &dummy;
1538        }
1539#endif
1540        // AC: need to decrement t_serialized for enquiry functions to work
1541        // correctly, will restore at join time
1542        parent_team->t.t_serialized--;
1543
1544        {
1545          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1546          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1547          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1548#if OMPT_SUPPORT
1549                                 ,
1550                                 exit_frame_p
1551#endif
1552                                 );
1553        }
1554
1555#if OMPT_SUPPORT
1556        if (ompt_enabled.enabled) {
1557          *exit_frame_p = NULL;
1558          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1559          if (ompt_enabled.ompt_callback_implicit_task) {
1560            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1561                ompt_scope_end, NULL, implicit_task_data, 1,
1562                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1563          }
1564          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1565          __ompt_lw_taskteam_unlink(master_th);
1566          if (ompt_enabled.ompt_callback_parallel_end) {
1567            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1568                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1569                OMPT_INVOKER(call_context) | ompt_parallel_team,
1570                return_address);
1571          }
1572          master_th->th.ompt_thread_info.state = ompt_state_overhead;
1573        }
1574#endif
1575        return TRUE;
1576      }
1577
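      // Not serialized: the parent team is the (hot) team created for the
      // teams construct and its workers are already waiting at the fork
      // barrier, so reuse it directly; just refresh the outlined function,
      // the invoker, and the nesting counters before releasing the workers.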
1578      parent_team->t.t_pkfn = microtask;
1579      parent_team->t.t_invoke = invoker;
1580      KMP_ATOMIC_INC(&root->r.r_in_parallel);
1581      parent_team->t.t_active_level++;
1582      parent_team->t.t_level++;
1583      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1584
1585#if OMPT_SUPPORT
1586      if (ompt_enabled.enabled) {
1587        ompt_lw_taskteam_t lw_taskteam;
1588        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1589                                &ompt_parallel_data, return_address);
1590        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1591      }
1592#endif
1593
1594      /* Change number of threads in the team if requested */
1595      if (master_set_numthreads) { // The parallel has num_threads clause
1596        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically;
          // we cannot increase it
1598          kmp_info_t **other_threads = parent_team->t.t_threads;
1599          parent_team->t.t_nproc = master_set_numthreads;
1600          for (i = 0; i < master_set_numthreads; ++i) {
1601            other_threads[i]->th.th_team_nproc = master_set_numthreads;
1602          }
1603          // Keep extra threads hot in the team for possible next parallels
1604        }
1605        master_th->th.th_set_nproc = 0;
1606      }
1607
1608#if USE_DEBUGGER
1609      if (__kmp_debugging) { // Let debugger override number of threads.
1610        int nth = __kmp_omp_num_threads(loc);
1611        if (nth > 0) { // 0 means debugger doesn't want to change num threads
1612          master_set_numthreads = nth;
1613        }
1614      }
1615#endif
1616
1617      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1618                    "master_th=%p, gtid=%d\n",
1619                    root, parent_team, master_th, gtid));
1620      __kmp_internal_fork(loc, gtid, parent_team);
1621      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1622                    "master_th=%p, gtid=%d\n",
1623                    root, parent_team, master_th, gtid));
1624
1625      /* Invoke microtask for MASTER thread */
1626      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1627                    parent_team->t.t_id, parent_team->t.t_pkfn));
1628
1629      if (!parent_team->t.t_invoke(gtid)) {
1630        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1631      }
1632      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1633                    parent_team->t.t_id, parent_team->t.t_pkfn));
1634      KMP_MB(); /* Flush all pending memory write invalidates.  */
1635
1636      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1637
1638      return TRUE;
1639    } // Parallel closely nested in teams construct
1640
1641#if KMP_DEBUG
1642    if (__kmp_tasking_mode != tskm_immediate_exec) {
1643      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1644                       parent_team->t.t_task_team[master_th->th.th_task_state]);
1645    }
1646#endif
1647
1648    if (parent_team->t.t_active_level >=
1649        master_th->th.th_current_task->td_icvs.max_active_levels) {
1650      nthreads = 1;
1651    } else {
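      // enter_teams is set when this fork either creates the league of teams
      // (ap == NULL and no parallel region is active yet) or is a parallel
      // region nested directly inside a teams construct (teams_level == level).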
1652      int enter_teams = ((ap == NULL && active_level == 0) ||
1653                         (ap && teams_level > 0 && teams_level == level));
1654      nthreads =
1655          master_set_numthreads
1656              ? master_set_numthreads
1657              : get__nproc_2(
1658                    parent_team,
1659                    master_tid); // TODO: get nproc directly from current task
1660
      // Check whether we need to take the forkjoin lock (no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1664      if (nthreads > 1) {
1665        if ((get__max_active_levels(master_th) == 1 &&
1666             (root->r.r_in_parallel && !enter_teams)) ||
1667            (__kmp_library == library_serial)) {
1668          KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1669                        " threads\n",
1670                        gtid, nthreads));
1671          nthreads = 1;
1672        }
1673      }
1674      if (nthreads > 1) {
1675        /* determine how many new threads we can use */
1676        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1677        /* AC: If we execute teams from parallel region (on host), then teams
1678           should be created but each can only have 1 thread if nesting is
1679           disabled. If teams called from serial region, then teams and their
1680           threads should be created regardless of the nesting setting. */
1681        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1682                                         nthreads, enter_teams);
1683        if (nthreads == 1) {
1684          // Free lock for single thread execution here; for multi-thread
1685          // execution it will be freed later after team of threads created
1686          // and initialized
1687          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1688        }
1689      }
1690    }
1691    KMP_DEBUG_ASSERT(nthreads > 0);
1692
1693    // If we temporarily changed the set number of threads then restore it now
1694    master_th->th.th_set_nproc = 0;
1695
1696    /* create a serialized parallel region? */
1697    if (nthreads == 1) {
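      // Serialized execution: this thread runs the region itself.  Copy the
      // outlined function's arguments into a local buffer and invoke the
      // microtask directly (or take the teams-master / GNU paths below).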
1698/* josh todo: hypothetical question: what do we do for OS X*? */
1699#if KMP_OS_LINUX &&                                                            \
1700    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1701      void *args[argc];
1702#else
1703      void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1704#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1705          KMP_ARCH_AARCH64) */
1706
1707      KA_TRACE(20,
1708               ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1709
1710      __kmpc_serialized_parallel(loc, gtid);
1711
1712      if (call_context == fork_context_intel) {
1713        /* TODO this sucks, use the compiler itself to pass args! :) */
1714        master_th->th.th_serial_team->t.t_ident = loc;
1715        if (!ap) {
1716          // revert change made in __kmpc_serialized_parallel()
1717          master_th->th.th_serial_team->t.t_level--;
1718// Get args from parent team for teams construct
1719
1720#if OMPT_SUPPORT
1721          void *dummy;
1722          void **exit_frame_p;
1723          ompt_task_info_t *task_info;
1724
1725          ompt_lw_taskteam_t lw_taskteam;
1726
1727          if (ompt_enabled.enabled) {
1728            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1729                                    &ompt_parallel_data, return_address);
1730
1731            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1733
1734            task_info = OMPT_CUR_TASK_INFO(master_th);
1735            exit_frame_p = &(task_info->frame.exit_frame.ptr);
1736            if (ompt_enabled.ompt_callback_implicit_task) {
1737              OMPT_CUR_TASK_INFO(master_th)
1738                  ->thread_num = __kmp_tid_from_gtid(gtid);
1739              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1740                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1741                  &(task_info->task_data), 1,
1742                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1743                  ompt_task_implicit);
1744            }
1745
1746            /* OMPT state */
1747            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1748          } else {
1749            exit_frame_p = &dummy;
1750          }
1751#endif
1752
1753          {
1754            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1755            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1756            __kmp_invoke_microtask(microtask, gtid, 0, argc,
1757                                   parent_team->t.t_argv
1758#if OMPT_SUPPORT
1759                                   ,
1760                                   exit_frame_p
1761#endif
1762                                   );
1763          }
1764
1765#if OMPT_SUPPORT
1766          if (ompt_enabled.enabled) {
1767            *exit_frame_p = NULL;
1768            if (ompt_enabled.ompt_callback_implicit_task) {
1769              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1770                  ompt_scope_end, NULL, &(task_info->task_data), 1,
1771                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1772                  ompt_task_implicit);
1773            }
1774            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1775            __ompt_lw_taskteam_unlink(master_th);
1776            if (ompt_enabled.ompt_callback_parallel_end) {
1777              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1778                  &ompt_parallel_data, parent_task_data,
1779                  OMPT_INVOKER(call_context) | ompt_parallel_team,
1780                  return_address);
1781            }
1782            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1783          }
1784#endif
1785        } else if (microtask == (microtask_t)__kmp_teams_master) {
1786          KMP_DEBUG_ASSERT(master_th->th.th_team ==
1787                           master_th->th.th_serial_team);
1788          team = master_th->th.th_team;
1789          // team->t.t_pkfn = microtask;
1790          team->t.t_invoke = invoker;
1791          __kmp_alloc_argv_entries(argc, team, TRUE);
1792          team->t.t_argc = argc;
1793          argv = (void **)team->t.t_argv;
1794          if (ap) {
1795            for (i = argc - 1; i >= 0; --i)
1796              *argv++ = va_arg(kmp_va_deref(ap), void *);
1797          } else {
1798            for (i = 0; i < argc; ++i)
1799              // Get args from parent team for teams construct
1800              argv[i] = parent_team->t.t_argv[i];
1801          }
1802          // AC: revert change made in __kmpc_serialized_parallel()
1803          //     because initial code in teams should have level=0
1804          team->t.t_level--;
1805          // AC: call special invoker for outer "parallel" of teams construct
1806          invoker(gtid);
1807#if OMPT_SUPPORT
1808          if (ompt_enabled.enabled) {
1809            ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1810            if (ompt_enabled.ompt_callback_implicit_task) {
1811              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1812                  ompt_scope_end, NULL, &(task_info->task_data), 0,
1813                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1814            }
1815            if (ompt_enabled.ompt_callback_parallel_end) {
1816              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1817                  &ompt_parallel_data, parent_task_data,
1818                  OMPT_INVOKER(call_context) | ompt_parallel_league,
1819                  return_address);
1820            }
1821            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1822          }
1823#endif
1824        } else {
1825          argv = args;
1826          for (i = argc - 1; i >= 0; --i)
1827            *argv++ = va_arg(kmp_va_deref(ap), void *);
1828          KMP_MB();
1829
1830#if OMPT_SUPPORT
1831          void *dummy;
1832          void **exit_frame_p;
1833          ompt_task_info_t *task_info;
1834
1835          ompt_lw_taskteam_t lw_taskteam;
1836
1837          if (ompt_enabled.enabled) {
1838            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1839                                    &ompt_parallel_data, return_address);
1840            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1842            task_info = OMPT_CUR_TASK_INFO(master_th);
1843            exit_frame_p = &(task_info->frame.exit_frame.ptr);
1844
1845            /* OMPT implicit task begin */
1846            implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1847            if (ompt_enabled.ompt_callback_implicit_task) {
1848              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1849                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1850                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1851                  ompt_task_implicit);
1852              OMPT_CUR_TASK_INFO(master_th)
1853                  ->thread_num = __kmp_tid_from_gtid(gtid);
1854            }
1855
1856            /* OMPT state */
1857            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1858          } else {
1859            exit_frame_p = &dummy;
1860          }
1861#endif
1862
1863          {
1864            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1865            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1866            __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1867#if OMPT_SUPPORT
1868                                   ,
1869                                   exit_frame_p
1870#endif
1871                                   );
1872          }
1873
1874#if OMPT_SUPPORT
1875          if (ompt_enabled.enabled) {
1876            *exit_frame_p = NULL;
1877            if (ompt_enabled.ompt_callback_implicit_task) {
1878              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1879                  ompt_scope_end, NULL, &(task_info->task_data), 1,
1880                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1881                  ompt_task_implicit);
1882            }
1883
1884            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1885            __ompt_lw_taskteam_unlink(master_th);
1886            if (ompt_enabled.ompt_callback_parallel_end) {
1887              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1888                  &ompt_parallel_data, parent_task_data,
1889                  OMPT_INVOKER(call_context) | ompt_parallel_team,
1890                  return_address);
1891            }
1892            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1893          }
1894#endif
1895        }
1896      } else if (call_context == fork_context_gnu) {
1897#if OMPT_SUPPORT
1898        ompt_lw_taskteam_t lwt;
1899        __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1900                                return_address);
1901
1902        lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1903        __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1905#endif
1906
1907        // we were called from GNU native code
1908        KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1909        return FALSE;
1910      } else {
1911        KMP_ASSERT2(call_context < fork_context_last,
1912                    "__kmp_fork_call: unknown fork_context parameter");
1913      }
1914
1915      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1916      KMP_MB();
1917      return FALSE;
1918    } // if (nthreads == 1)
1919
    // GEH: only modify the executing flag in the non-serialized case;
    //      the serialized case is handled in __kmpc_serialized_parallel
1922    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1923                  "curtask=%p, curtask_max_aclevel=%d\n",
1924                  parent_team->t.t_active_level, master_th,
1925                  master_th->th.th_current_task,
1926                  master_th->th.th_current_task->td_icvs.max_active_levels));
1927    // TODO: GEH - cannot do this assertion because root thread not set up as
1928    // executing
1929    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1930    master_th->th.th_current_task->td_flags.executing = 0;
1931
1932    if (!master_th->th.th_teams_microtask || level > teams_level) {
1933      /* Increment our nested depth level */
1934      KMP_ATOMIC_INC(&root->r.r_in_parallel);
1935    }
1936
1937    // See if we need to make a copy of the ICVs.
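    // The nproc ICV for the new team is taken from the per-level list set via
    // OMP_NUM_THREADS (__kmp_nested_nth) when an entry exists for level + 1
    // and differs from the current value; nthreads_icv == 0 means "leave the
    // inherited value unchanged".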
1938    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1939    if ((level + 1 < __kmp_nested_nth.used) &&
1940        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1941      nthreads_icv = __kmp_nested_nth.nth[level + 1];
1942    } else {
1943      nthreads_icv = 0; // don't update
1944    }
1945
1946    // Figure out the proc_bind_policy for the new team.
1947    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1948    kmp_proc_bind_t proc_bind_icv =
1949        proc_bind_default; // proc_bind_default means don't update
1950    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1951      proc_bind = proc_bind_false;
1952    } else {
1953      if (proc_bind == proc_bind_default) {
1954        // No proc_bind clause specified; use current proc-bind-var for this
1955        // parallel region
1956        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1957      }
1958      /* else: The proc_bind policy was specified explicitly on parallel clause.
1959         This overrides proc-bind-var for this parallel region, but does not
1960         change proc-bind-var. */
1961      // Figure the value of proc-bind-var for the child threads.
1962      if ((level + 1 < __kmp_nested_proc_bind.used) &&
1963          (__kmp_nested_proc_bind.bind_types[level + 1] !=
1964           master_th->th.th_current_task->td_icvs.proc_bind)) {
1965        proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1966      }
1967    }
1968
1969    // Reset for next parallel region
1970    master_th->th.th_set_proc_bind = proc_bind_default;
1971
1972    if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1973      kmp_internal_control_t new_icvs;
1974      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1975      new_icvs.next = NULL;
1976      if (nthreads_icv > 0) {
1977        new_icvs.nproc = nthreads_icv;
1978      }
1979      if (proc_bind_icv != proc_bind_default) {
1980        new_icvs.proc_bind = proc_bind_icv;
1981      }
1982
1983      /* allocate a new parallel team */
1984      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1985      team = __kmp_allocate_team(root, nthreads, nthreads,
1986#if OMPT_SUPPORT
1987                                 ompt_parallel_data,
1988#endif
1989                                 proc_bind, &new_icvs,
1990                                 argc USE_NESTED_HOT_ARG(master_th));
1991    } else {
1992      /* allocate a new parallel team */
1993      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1994      team = __kmp_allocate_team(root, nthreads, nthreads,
1995#if OMPT_SUPPORT
1996                                 ompt_parallel_data,
1997#endif
1998                                 proc_bind,
1999                                 &master_th->th.th_current_task->td_icvs,
2000                                 argc USE_NESTED_HOT_ARG(master_th));
2001    }
2002    KF_TRACE(
2003        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2004
2005    /* setup the new team */
2006    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2007    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2008    KMP_CHECK_UPDATE(team->t.t_ident, loc);
2009    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2010    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2011#if OMPT_SUPPORT
2012    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2013                          return_address);
2014#endif
2015    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2016    // TODO: parent_team->t.t_level == INT_MAX ???
2017    if (!master_th->th.th_teams_microtask || level > teams_level) {
2018      int new_level = parent_team->t.t_level + 1;
2019      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2020      new_level = parent_team->t.t_active_level + 1;
2021      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2022    } else {
2023      // AC: Do not increase parallel level at start of the teams construct
2024      int new_level = parent_team->t.t_level;
2025      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2026      new_level = parent_team->t.t_active_level;
2027      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2028    }
2029    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2030    // set master's schedule as new run-time schedule
2031    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2032
2033    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2034    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2035
2036    // Update the floating point rounding in the team if required.
2037    propagateFPControl(team);
2038
2039    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the master's task team to the team's task team. Unless this is a
      // hot team, it should be NULL.
2042      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2043                       parent_team->t.t_task_team[master_th->th.th_task_state]);
2044      KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2045                    "%p, new task_team %p / team %p\n",
2046                    __kmp_gtid_from_thread(master_th),
2047                    master_th->th.th_task_team, parent_team,
2048                    team->t.t_task_team[master_th->th.th_task_state], team));
2049
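      // Save the master's current task_state on its memo stack so it can be
      // restored at join time; the stack grows by doubling when full.  For a
      // nested hot team the previously memorized state is reinstated,
      // otherwise the master starts the new region with task_state 0.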
2050      if (active_level || master_th->th.th_task_team) {
2051        // Take a memo of master's task_state
2052        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2053        if (master_th->th.th_task_state_top >=
2054            master_th->th.th_task_state_stack_sz) { // increase size
2055          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2056          kmp_uint8 *old_stack, *new_stack;
2057          kmp_uint32 i;
2058          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2059          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2060            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2061          }
2062          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2063               ++i) { // zero-init rest of stack
2064            new_stack[i] = 0;
2065          }
2066          old_stack = master_th->th.th_task_state_memo_stack;
2067          master_th->th.th_task_state_memo_stack = new_stack;
2068          master_th->th.th_task_state_stack_sz = new_size;
2069          __kmp_free(old_stack);
2070        }
2071        // Store master's task_state on stack
2072        master_th->th
2073            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2074            master_th->th.th_task_state;
2075        master_th->th.th_task_state_top++;
2076#if KMP_NESTED_HOT_TEAMS
2077        if (master_th->th.th_hot_teams &&
2078            active_level < __kmp_hot_teams_max_level &&
2079            team == master_th->th.th_hot_teams[active_level].hot_team) {
2080          // Restore master's nested state if nested hot team
2081          master_th->th.th_task_state =
2082              master_th->th
2083                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2084        } else {
2085#endif
2086          master_th->th.th_task_state = 0;
2087#if KMP_NESTED_HOT_TEAMS
2088        }
2089#endif
2090      }
2091#if !KMP_NESTED_HOT_TEAMS
2092      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2093                       (team == root->r.r_hot_team));
2094#endif
2095    }
2096
2097    KA_TRACE(
2098        20,
2099        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2100         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2101         team->t.t_nproc));
2102    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2103                     (team->t.t_master_tid == 0 &&
2104                      (team->t.t_parent == root->r.r_root_team ||
2105                       team->t.t_parent->t.t_serialized)));
2106    KMP_MB();
2107
2108    /* now, setup the arguments */
2109    argv = (void **)team->t.t_argv;
2110    if (ap) {
2111      for (i = argc - 1; i >= 0; --i) {
2112        void *new_argv = va_arg(kmp_va_deref(ap), void *);
2113        KMP_CHECK_UPDATE(*argv, new_argv);
2114        argv++;
2115      }
2116    } else {
2117      for (i = 0; i < argc; ++i) {
2118        // Get args from parent team for teams construct
2119        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2120      }
2121    }
2122
2123    /* now actually fork the threads */
2124    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2125    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2126      root->r.r_active = TRUE;
2127
2128    __kmp_fork_team_threads(root, team, master_th, gtid);
2129    __kmp_setup_icv_copy(team, nthreads,
2130                         &master_th->th.th_current_task->td_icvs, loc);
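    // At this point the worker threads have been assigned to the new team and
    // are waiting at the fork barrier; for a regular parallel region they are
    // released in __kmp_internal_fork() below, while for a teams construct
    // (ap == NULL) only the team masters proceed from here.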
2131
2132#if OMPT_SUPPORT
2133    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2134#endif
2135
2136    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2137
2138#if USE_ITT_BUILD
2139    if (team->t.t_active_level == 1 // only report frames at level 1
2140        && !master_th->th.th_teams_microtask) { // not in teams construct
2141#if USE_ITT_NOTIFY
2142      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2143          (__kmp_forkjoin_frames_mode == 3 ||
2144           __kmp_forkjoin_frames_mode == 1)) {
2145        kmp_uint64 tmp_time = 0;
2146        if (__itt_get_timestamp_ptr)
2147          tmp_time = __itt_get_timestamp();
2148        // Internal fork - report frame begin
2149        master_th->th.th_frame_time = tmp_time;
2150        if (__kmp_forkjoin_frames_mode == 3)
2151          team->t.t_region_time = tmp_time;
2152      } else
2153// only one notification scheme (either "submit" or "forking/joined", not both)
2154#endif /* USE_ITT_NOTIFY */
2155          if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2156              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2157        // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2158        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2159      }
2160    }
2161#endif /* USE_ITT_BUILD */
2162
2163    /* now go on and do the work */
2164    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2165    KMP_MB();
2166    KF_TRACE(10,
2167             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2168              root, team, master_th, gtid));
2169
2170#if USE_ITT_BUILD
2171    if (__itt_stack_caller_create_ptr) {
2172      team->t.t_stack_id =
2173          __kmp_itt_stack_caller_create(); // create new stack stitching id
2174      // before entering fork barrier
2175    }
2176#endif /* USE_ITT_BUILD */
2177
    // AC: skip __kmp_internal_fork for the teams construct; let only the
    // master threads execute
2180    if (ap) {
2181      __kmp_internal_fork(loc, gtid, team);
2182      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2183                    "master_th=%p, gtid=%d\n",
2184                    root, team, master_th, gtid));
2185    }
2186
2187    if (call_context == fork_context_gnu) {
2188      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2189      return TRUE;
2190    }
2191
2192    /* Invoke microtask for MASTER thread */
2193    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2194                  team->t.t_id, team->t.t_pkfn));
2195  } // END of timer KMP_fork_call block
2196
2197#if KMP_STATS_ENABLED
2198  // If beginning a teams construct, then change thread state
2199  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2200  if (!ap) {
2201    KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2202  }
2203#endif
2204
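  // The master thread executes the outlined microtask itself through the
  // team's invoker; it returns here only after the parallel region's work
  // (for this thread) is complete.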
2205  if (!team->t.t_invoke(gtid)) {
2206    KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2207  }
2208
2209#if KMP_STATS_ENABLED
2210  // If was beginning of a teams construct, then reset thread state
2211  if (!ap) {
2212    KMP_SET_THREAD_STATE(previous_state);
2213  }
2214#endif
2215
2216  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2217                team->t.t_id, team->t.t_pkfn));
2218  KMP_MB(); /* Flush all pending memory write invalidates.  */
2219
2220  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2221
2222#if OMPT_SUPPORT
2223  if (ompt_enabled.enabled) {
2224    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2225  }
2226#endif
2227
2228  return TRUE;
2229}
2230
2231#if OMPT_SUPPORT
2232static inline void __kmp_join_restore_state(kmp_info_t *thread,
2233                                            kmp_team_t *team) {
2234  // restore state outside the region
2235  thread->th.ompt_thread_info.state =
2236      ((team->t.t_serialized) ? ompt_state_work_serial
2237                              : ompt_state_work_parallel);
2238}
2239
2240static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2241                                   kmp_team_t *team, ompt_data_t *parallel_data,
2242                                   int flags, void *codeptr) {
2243  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2244  if (ompt_enabled.ompt_callback_parallel_end) {
2245    ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2246        parallel_data, &(task_info->task_data), flags, codeptr);
2247  }
2248
2249  task_info->frame.enter_frame = ompt_data_none;
2250  __kmp_join_restore_state(thread, team);
2251}
2252#endif
2253
2254void __kmp_join_call(ident_t *loc, int gtid
2255#if OMPT_SUPPORT
2256                     ,
2257                     enum fork_context_e fork_context
2258#endif
2259                     ,
2260                     int exit_teams) {
2261  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2262  kmp_team_t *team;
2263  kmp_team_t *parent_team;
2264  kmp_info_t *master_th;
2265  kmp_root_t *root;
2266  int master_active;
2267
2268  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2269
2270  /* setup current data */
2271  master_th = __kmp_threads[gtid];
2272  root = master_th->th.th_root;
2273  team = master_th->th.th_team;
2274  parent_team = team->t.t_parent;
2275
2276  master_th->th.th_ident = loc;
2277
2278#if OMPT_SUPPORT
2279  void *team_microtask = (void *)team->t.t_pkfn;
2280  if (ompt_enabled.enabled) {
2281    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2282  }
2283#endif
2284
2285#if KMP_DEBUG
2286  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2287    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2288                  "th_task_team = %p\n",
2289                  __kmp_gtid_from_thread(master_th), team,
2290                  team->t.t_task_team[master_th->th.th_task_state],
2291                  master_th->th.th_task_team));
2292    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2293                     team->t.t_task_team[master_th->th.th_task_state]);
2294  }
2295#endif
2296
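  // A serialized region (the fork ran with a single thread) is unwound via
  // __kmpc_end_serialized_parallel(); there is no join barrier in that case.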
2297  if (team->t.t_serialized) {
2298    if (master_th->th.th_teams_microtask) {
2299      // We are in teams construct
2300      int level = team->t.t_level;
2301      int tlevel = master_th->th.th_teams_level;
2302      if (level == tlevel) {
2303        // AC: we haven't incremented it earlier at start of teams construct,
2304        //     so do it here - at the end of teams construct
2305        team->t.t_level++;
2306      } else if (level == tlevel + 1) {
2307        // AC: we are exiting parallel inside teams, need to increment
2308        // serialization in order to restore it in the next call to
2309        // __kmpc_end_serialized_parallel
2310        team->t.t_serialized++;
2311      }
2312    }
2313    __kmpc_end_serialized_parallel(loc, gtid);
2314
2315#if OMPT_SUPPORT
2316    if (ompt_enabled.enabled) {
2317      __kmp_join_restore_state(master_th, parent_team);
2318    }
2319#endif
2320
2321    return;
2322  }
2323
2324  master_active = team->t.t_master_active;
2325
2326  if (!exit_teams) {
2327    // AC: No barrier for internal teams at exit from teams construct.
2328    //     But there is barrier for external team (league).
2329    __kmp_internal_join(loc, gtid, team);
2330  } else {
2331    master_th->th.th_task_state =
2332        0; // AC: no tasking in teams (out of any parallel)
2333  }
2334
2335  KMP_MB();
2336
2337#if OMPT_SUPPORT
2338  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2339  void *codeptr = team->t.ompt_team_info.master_return_address;
2340#endif
2341
2342#if USE_ITT_BUILD
2343  if (__itt_stack_caller_create_ptr) {
2344    __kmp_itt_stack_caller_destroy(
2345        (__itt_caller)team->t
2346            .t_stack_id); // destroy the stack stitching id after join barrier
2347  }
2348
2349  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2350  if (team->t.t_active_level == 1 &&
2351      !master_th->th.th_teams_microtask) { /* not in teams construct */
2352    master_th->th.th_ident = loc;
2353    // only one notification scheme (either "submit" or "forking/joined", not
2354    // both)
2355    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2356        __kmp_forkjoin_frames_mode == 3)
2357      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2358                             master_th->th.th_frame_time, 0, loc,
2359                             master_th->th.th_team_nproc, 1);
2360    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2361             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2362      __kmp_itt_region_joined(gtid);
2363  } // active_level == 1
2364#endif /* USE_ITT_BUILD */
2365
2366  if (master_th->th.th_teams_microtask && !exit_teams &&
2367      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2368      team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// inside the teams construct, so that the next parallel reuses the same (hot)
// team; only adjust the nesting levels.
2372#if OMPT_SUPPORT
2373    ompt_data_t ompt_parallel_data = ompt_data_none;
2374    if (ompt_enabled.enabled) {
2375      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2376      if (ompt_enabled.ompt_callback_implicit_task) {
2377        int ompt_team_size = team->t.t_nproc;
2378        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2379            ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2380            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2381      }
2382      task_info->frame.exit_frame = ompt_data_none;
2383      task_info->task_data = ompt_data_none;
2384      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2385      __ompt_lw_taskteam_unlink(master_th);
2386    }
2387#endif
2388    /* Decrement our nested depth level */
2389    team->t.t_level--;
2390    team->t.t_active_level--;
2391    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2392
2393    // Restore number of threads in the team if needed. This code relies on
2394    // the proper adjustment of th_teams_size.nth after the fork in
2395    // __kmp_teams_master on each teams master in the case that
2396    // __kmp_reserve_threads reduced it.
2397    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2398      int old_num = master_th->th.th_team_nproc;
2399      int new_num = master_th->th.th_teams_size.nth;
2400      kmp_info_t **other_threads = team->t.t_threads;
2401      team->t.t_nproc = new_num;
2402      for (int i = 0; i < old_num; ++i) {
2403        other_threads[i]->th.th_team_nproc = new_num;
2404      }
      // Adjust the state of the team's previously unused threads
2406      for (int i = old_num; i < new_num; ++i) {
2407        // Re-initialize thread's barrier data.
2408        KMP_DEBUG_ASSERT(other_threads[i]);
2409        kmp_balign_t *balign = other_threads[i]->th.th_bar;
2410        for (int b = 0; b < bs_last_barrier; ++b) {
2411          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2412          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2413#if USE_DEBUGGER
2414          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2415#endif
2416        }
2417        if (__kmp_tasking_mode != tskm_immediate_exec) {
2418          // Synchronize thread's task state
2419          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2420        }
2421      }
2422    }
2423
2424#if OMPT_SUPPORT
2425    if (ompt_enabled.enabled) {
2426      __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2427                      OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2428    }
2429#endif
2430
2431    return;
2432  }
2433
2434  /* do cleanup and restore the parent team */
2435  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2436  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2437
2438  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2439
2440  /* jc: The following lock has instructions with REL and ACQ semantics,
2441     separating the parallel user code called in this parallel region
2442     from the serial user code called after this function returns. */
2443  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2444
2445  if (!master_th->th.th_teams_microtask ||
2446      team->t.t_level > master_th->th.th_teams_level) {
2447    /* Decrement our nested depth level */
2448    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2449  }
2450  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2451
2452#if OMPT_SUPPORT
2453  if (ompt_enabled.enabled) {
2454    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2455    if (ompt_enabled.ompt_callback_implicit_task) {
2456      int flags = (team_microtask == (void *)__kmp_teams_master)
2457                      ? ompt_task_initial
2458                      : ompt_task_implicit;
2459      int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2460      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2461          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2462          OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2463    }
2464    task_info->frame.exit_frame = ompt_data_none;
2465    task_info->task_data = ompt_data_none;
2466  }
2467#endif
2468
2469  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2470                master_th, team));
2471  __kmp_pop_current_task_from_thread(master_th);
2472
2473#if KMP_AFFINITY_SUPPORTED
2474  // Restore master thread's partition.
2475  master_th->th.th_first_place = team->t.t_first_place;
2476  master_th->th.th_last_place = team->t.t_last_place;
2477#endif // KMP_AFFINITY_SUPPORTED
2478  master_th->th.th_def_allocator = team->t.t_def_allocator;
2479
2480  updateHWFPControl(team);
2481
2482  if (root->r.r_active != master_active)
2483    root->r.r_active = master_active;
2484
2485  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2486                            master_th)); // this will free worker threads
2487
2488  /* this race was fun to find. make sure the following is in the critical
2489     region otherwise assertions may fail occasionally since the old team may be
2490     reallocated and the hierarchy appears inconsistent. it is actually safe to
2491     run and won't cause any bugs, but will cause those assertion failures. it's
2492     only one deref&assign so might as well put this in the critical region */
2493  master_th->th.th_team = parent_team;
2494  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2495  master_th->th.th_team_master = parent_team->t.t_threads[0];
2496  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2497
2498  /* restore serialized team, if need be */
2499  if (parent_team->t.t_serialized &&
2500      parent_team != master_th->th.th_serial_team &&
2501      parent_team != root->r.r_root_team) {
2502    __kmp_free_team(root,
2503                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2504    master_th->th.th_serial_team = parent_team;
2505  }
2506
2507  if (__kmp_tasking_mode != tskm_immediate_exec) {
2508    if (master_th->th.th_task_state_top >
2509        0) { // Restore task state from memo stack
2510      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2511      // Remember master's state if we re-use this nested hot team
2512      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2513          master_th->th.th_task_state;
2514      --master_th->th.th_task_state_top; // pop
2515      // Now restore state at this level
2516      master_th->th.th_task_state =
2517          master_th->th
2518              .th_task_state_memo_stack[master_th->th.th_task_state_top];
2519    }
2520    // Copy the task team from the parent team to the master thread
2521    master_th->th.th_task_team =
2522        parent_team->t.t_task_team[master_th->th.th_task_state];
2523    KA_TRACE(20,
2524             ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2525              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2526              parent_team));
2527  }
2528
2529  // TODO: GEH - cannot do this assertion because root thread not set up as
2530  // executing
2531  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2532  master_th->th.th_current_task->td_flags.executing = 1;
2533
2534  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2535
2536#if OMPT_SUPPORT
2537  int flags =
2538      OMPT_INVOKER(fork_context) |
2539      ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2540                                                      : ompt_parallel_team);
2541  if (ompt_enabled.enabled) {
2542    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2543                    codeptr);
2544  }
2545#endif
2546
2547  KMP_MB();
2548  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2549}
2550
2551/* Check whether we should push an internal control record onto the
2552   serial team stack.  If so, do it.  */
2553void __kmp_save_internal_controls(kmp_info_t *thread) {
2554
2555  if (thread->th.th_team != thread->th.th_serial_team) {
2556    return;
2557  }
2558  if (thread->th.th_team->t.t_serialized > 1) {
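    // A record is pushed only inside nested serialized regions (depth > 1),
    // one per serial nesting level, so the ICVs can be restored when
    // __kmpc_end_serialized_parallel() unwinds that level.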
2559    int push = 0;
2560
2561    if (thread->th.th_team->t.t_control_stack_top == NULL) {
2562      push = 1;
2563    } else {
2564      if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2565          thread->th.th_team->t.t_serialized) {
2566        push = 1;
2567      }
2568    }
2569    if (push) { /* push a record on the serial team's stack */
2570      kmp_internal_control_t *control =
2571          (kmp_internal_control_t *)__kmp_allocate(
2572              sizeof(kmp_internal_control_t));
2573
2574      copy_icvs(control, &thread->th.th_current_task->td_icvs);
2575
2576      control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2577
2578      control->next = thread->th.th_team->t.t_control_stack_top;
2579      thread->th.th_team->t.t_control_stack_top = control;
2580    }
2581  }
2582}
2583
2584/* Changes set_nproc */
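/* Backs omp_set_num_threads(): clamps the request to [1, __kmp_max_nth] and
   updates the calling thread's nproc ICV.  For example, a user call to
   omp_set_num_threads(4) reaches this routine with new_nth == 4 through the
   library's public entry point. */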
2585void __kmp_set_num_threads(int new_nth, int gtid) {
2586  kmp_info_t *thread;
2587  kmp_root_t *root;
2588
2589  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2590  KMP_DEBUG_ASSERT(__kmp_init_serial);
2591
2592  if (new_nth < 1)
2593    new_nth = 1;
2594  else if (new_nth > __kmp_max_nth)
2595    new_nth = __kmp_max_nth;
2596
2597  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2598  thread = __kmp_threads[gtid];
2599  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2600    return; // nothing to do
2601
2602  __kmp_save_internal_controls(thread);
2603
2604  set__nproc(thread, new_nth);
2605
2606  // If this omp_set_num_threads() call will cause the hot team size to be
2607  // reduced (in the absence of a num_threads clause), then reduce it now,
2608  // rather than waiting for the next parallel region.
2609  root = thread->th.th_root;
2610  if (__kmp_init_parallel && (!root->r.r_active) &&
2611      (root->r.r_hot_team->t.t_nproc > new_nth)
2612#if KMP_NESTED_HOT_TEAMS
2613      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2614#endif
2615      ) {
2616    kmp_team_t *hot_team = root->r.r_hot_team;
2617    int f;
2618
2619    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2620
2621    // Release the extra threads we don't need any more.
2622    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2623      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2624      if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads no longer in the team should
        // unreference the task team.
2627        hot_team->t.t_threads[f]->th.th_task_team = NULL;
2628      }
2629      __kmp_free_thread(hot_team->t.t_threads[f]);
2630      hot_team->t.t_threads[f] = NULL;
2631    }
2632    hot_team->t.t_nproc = new_nth;
2633#if KMP_NESTED_HOT_TEAMS
2634    if (thread->th.th_hot_teams) {
2635      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2636      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2637    }
2638#endif
2639
2640    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2641
2642    // Update the t_nproc field in the threads that are still active.
2643    for (f = 0; f < new_nth; f++) {
2644      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2645      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2646    }
    // Special flag to mark that the hot team size was changed by an
    // omp_set_num_threads() call
2648    hot_team->t.t_size_changed = -1;
2649  }
2650}
2651
2652/* Changes max_active_levels */
2653void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2654  kmp_info_t *thread;
2655
2656  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2657                "%d = (%d)\n",
2658                gtid, max_active_levels));
2659  KMP_DEBUG_ASSERT(__kmp_init_serial);
2660
2661  // validate max_active_levels
2662  if (max_active_levels < 0) {
2663    KMP_WARNING(ActiveLevelsNegative, max_active_levels);
    // We ignore this call if the user has specified a negative value; the
    // current setting is left unchanged and the last valid setting continues
    // to be used. A warning is issued if warnings are enabled (controlled by
    // the KMP_WARNINGS env var).
2668    KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2669                  "max_active_levels for thread %d = (%d)\n",
2670                  gtid, max_active_levels));
2671    return;
2672  }
2673  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0, KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed (implementation-defined behavior).
2677  } else {
2678    KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2679                KMP_MAX_ACTIVE_LEVELS_LIMIT);
2680    max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds the upper limit, we clamp it to the upper limit
    // (implementation-defined behavior). In practice this branch is not
    // reached while the limit remains MAX_INT.
2685  }
2686  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2687                "max_active_levels for thread %d = (%d)\n",
2688                gtid, max_active_levels));
2689
2690  thread = __kmp_threads[gtid];
2691
2692  __kmp_save_internal_controls(thread);
2693
2694  set__max_active_levels(thread, max_active_levels);
2695}
2696
2697/* Gets max_active_levels */
2698int __kmp_get_max_active_levels(int gtid) {
2699  kmp_info_t *thread;
2700
2701  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2702  KMP_DEBUG_ASSERT(__kmp_init_serial);
2703
2704  thread = __kmp_threads[gtid];
2705  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2706  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2707                "curtask_maxaclevel=%d\n",
2708                gtid, thread->th.th_current_task,
2709                thread->th.th_current_task->td_icvs.max_active_levels));
2710  return thread->th.th_current_task->td_icvs.max_active_levels;
2711}
2712
2713KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2714KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2715
2716/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
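/* Backs omp_set_schedule(): the public kind (kmp_sched_t, mirroring
   omp_sched_t) is mapped onto the internal enum sched_type through
   __kmp_sch_map, and any monotonic/nonmonotonic modifiers are re-applied
   afterwards.  For example, omp_set_schedule(omp_sched_dynamic, 4) ends up
   storing a dynamic schedule with chunk 4 in the calling thread's ICVs. */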
2717void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2718  kmp_info_t *thread;
2719  kmp_sched_t orig_kind;
2720  //    kmp_team_t *team;
2721
2722  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2723                gtid, (int)kind, chunk));
2724  KMP_DEBUG_ASSERT(__kmp_init_serial);
2725
2726  // Check if the kind parameter is valid, correct if needed.
2727  // Valid parameters should fit in one of two intervals - standard or extended:
2728  //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2729  // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2730  orig_kind = kind;
2731  kind = __kmp_sched_without_mods(kind);
2732
2733  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2734      (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2735    // TODO: Hint needs attention in case we change the default schedule.
2736    __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2737              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2738              __kmp_msg_null);
2739    kind = kmp_sched_default;
2740    chunk = 0; // ignore chunk value in case of bad kind
2741  }
2742
2743  thread = __kmp_threads[gtid];
2744
2745  __kmp_save_internal_controls(thread);
2746
2747  if (kind < kmp_sched_upper_std) {
2748    if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2749      // differentiate static chunked vs. unchunked: chunk should be invalid
2750      // to indicate the unchunked schedule (which is the default)
2751      thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2752    } else {
2753      thread->th.th_current_task->td_icvs.sched.r_sched_type =
2754          __kmp_sch_map[kind - kmp_sched_lower - 1];
2755    }
2756  } else {
2757    //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2758    //    kmp_sched_lower - 2 ];
2759    thread->th.th_current_task->td_icvs.sched.r_sched_type =
2760        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2761                      kmp_sched_lower - 2];
2762  }
2763  __kmp_sched_apply_mods_intkind(
2764      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2765  if (kind == kmp_sched_auto || chunk < 1) {
2766    // ignore parameter chunk for schedule auto
2767    thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2768  } else {
2769    thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2770  }
2771}
2772
2773/* Gets def_sched_var ICV values */
2774void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2775  kmp_info_t *thread;
2776  enum sched_type th_type;
2777
2778  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2779  KMP_DEBUG_ASSERT(__kmp_init_serial);
2780
2781  thread = __kmp_threads[gtid];
2782
2783  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2784  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2785  case kmp_sch_static:
2786  case kmp_sch_static_greedy:
2787  case kmp_sch_static_balanced:
2788    *kind = kmp_sched_static;
2789    __kmp_sched_apply_mods_stdkind(kind, th_type);
2790    *chunk = 0; // chunk was not set, try to show this fact via zero value
2791    return;
2792  case kmp_sch_static_chunked:
2793    *kind = kmp_sched_static;
2794    break;
2795  case kmp_sch_dynamic_chunked:
2796    *kind = kmp_sched_dynamic;
2797    break;
2798  case kmp_sch_guided_chunked:
2799  case kmp_sch_guided_iterative_chunked:
2800  case kmp_sch_guided_analytical_chunked:
2801    *kind = kmp_sched_guided;
2802    break;
2803  case kmp_sch_auto:
2804    *kind = kmp_sched_auto;
2805    break;
2806  case kmp_sch_trapezoidal:
2807    *kind = kmp_sched_trapezoidal;
2808    break;
2809#if KMP_STATIC_STEAL_ENABLED
2810  case kmp_sch_static_steal:
2811    *kind = kmp_sched_static_steal;
2812    break;
2813#endif
2814  default:
2815    KMP_FATAL(UnknownSchedulingType, th_type);
2816  }
2817
2818  __kmp_sched_apply_mods_stdkind(kind, th_type);
2819  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2820}
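
// Editor's note: illustrative sketch (never compiled) of the standard OpenMP
// API that drives __kmp_set_schedule/__kmp_get_schedule above; kind/chunk are
// validated and mapped onto the internal kmp_sch_* encoding, a chunk < 1
// selects the default chunk, and schedule 'auto' ignores the chunk entirely.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_sched_t kind;
  int chunk;
  omp_set_schedule(omp_sched_dynamic, 4); // run-time schedule ICV := (dynamic, 4)
  omp_get_schedule(&kind, &chunk);
  printf("kind=%d chunk=%d\n", (int)kind, chunk);
  return 0;
}
#endif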
2821
2822int __kmp_get_ancestor_thread_num(int gtid, int level) {
2823
2824  int ii, dd;
2825  kmp_team_t *team;
2826  kmp_info_t *thr;
2827
2828  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2829  KMP_DEBUG_ASSERT(__kmp_init_serial);
2830
2831  // validate level
2832  if (level == 0)
2833    return 0;
2834  if (level < 0)
2835    return -1;
2836  thr = __kmp_threads[gtid];
2837  team = thr->th.th_team;
2838  ii = team->t.t_level;
2839  if (level > ii)
2840    return -1;
2841
2842  if (thr->th.th_teams_microtask) {
2843    // AC: we are in teams region where multiple nested teams have same level
2844    int tlevel = thr->th.th_teams_level; // the level of the teams construct
2845    if (level <=
2846        tlevel) { // otherwise usual algorithm works (will not touch the teams)
2847      KMP_DEBUG_ASSERT(ii >= tlevel);
2848      // AC: Since we must step over the teams construct (league) here, we
2849      // artificially increase ii
2850      if (ii == tlevel) {
2851        ii += 2; // three teams have same level
2852      } else {
2853        ii++; // two teams have same level
2854      }
2855    }
2856  }
2857
2858  if (ii == level)
2859    return __kmp_tid_from_gtid(gtid);
2860
2861  dd = team->t.t_serialized;
2862  level++;
2863  while (ii > level) {
2864    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2865    }
2866    if ((team->t.t_serialized) && (!dd)) {
2867      team = team->t.t_parent;
2868      continue;
2869    }
2870    if (ii > level) {
2871      team = team->t.t_parent;
2872      dd = team->t.t_serialized;
2873      ii--;
2874    }
2875  }
2876
2877  return (dd > 1) ? (0) : (team->t.t_master_tid);
2878}
2879
2880int __kmp_get_team_size(int gtid, int level) {
2881
2882  int ii, dd;
2883  kmp_team_t *team;
2884  kmp_info_t *thr;
2885
2886  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2887  KMP_DEBUG_ASSERT(__kmp_init_serial);
2888
2889  // validate level
2890  if (level == 0)
2891    return 1;
2892  if (level < 0)
2893    return -1;
2894  thr = __kmp_threads[gtid];
2895  team = thr->th.th_team;
2896  ii = team->t.t_level;
2897  if (level > ii)
2898    return -1;
2899
2900  if (thr->th.th_teams_microtask) {
2901    // AC: we are in teams region where multiple nested teams have same level
2902    int tlevel = thr->th.th_teams_level; // the level of the teams construct
2903    if (level <=
2904        tlevel) { // otherwise usual algorithm works (will not touch the teams)
2905      KMP_DEBUG_ASSERT(ii >= tlevel);
2906      // AC: Since we must step over the teams construct (league) here, we
2907      // artificially increase ii
2908      if (ii == tlevel) {
2909        ii += 2; // three teams have same level
2910      } else {
2911        ii++; // two teams have same level
2912      }
2913    }
2914  }
2915
2916  while (ii > level) {
2917    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2918    }
2919    if (team->t.t_serialized && (!dd)) {
2920      team = team->t.t_parent;
2921      continue;
2922    }
2923    if (ii > level) {
2924      team = team->t.t_parent;
2925      ii--;
2926    }
2927  }
2928
2929  return team->t.t_nproc;
2930}
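
// Editor's note: illustrative sketch (never compiled) of the level semantics
// implemented by __kmp_get_ancestor_thread_num/__kmp_get_team_size, using the
// standard OpenMP API. Level 0 is the implicit initial team (ancestor 0,
// size 1); each enclosing parallel region adds one level.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_max_active_levels(2); // allow two active nested levels
#pragma omp parallel num_threads(2)
#pragma omp parallel num_threads(3)
  {
    printf("L1 ancestor=%d size=%d  L2 ancestor=%d size=%d\n",
           omp_get_ancestor_thread_num(1), omp_get_team_size(1),
           omp_get_ancestor_thread_num(2), omp_get_team_size(2));
  }
  return 0;
}
#endif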
2931
2932kmp_r_sched_t __kmp_get_schedule_global() {
2933  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
2934  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2935  // independently, so the up-to-date schedule can be obtained here.
2936
2937  kmp_r_sched_t r_sched;
2938
2939  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2940  // __kmp_guided. __kmp_sched should keep its original value, so that the user
2941  // can set KMP_SCHEDULE multiple times and thus have different run-time
2942  // schedules in different roots (even in OMP 2.5).
2943  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2944  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2945  if (s == kmp_sch_static) {
2946    // replace STATIC with more detailed schedule (balanced or greedy)
2947    r_sched.r_sched_type = __kmp_static;
2948  } else if (s == kmp_sch_guided_chunked) {
2949    // replace GUIDED with more detailed schedule (iterative or analytical)
2950    r_sched.r_sched_type = __kmp_guided;
2951  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2952    r_sched.r_sched_type = __kmp_sched;
2953  }
2954  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2955
2956  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2957    // __kmp_chunk may be wrong here (if it was never set)
2958    r_sched.chunk = KMP_DEFAULT_CHUNK;
2959  } else {
2960    r_sched.chunk = __kmp_chunk;
2961  }
2962
2963  return r_sched;
2964}
2965
2966/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2967   at least argc *t_argv entries for the requested team. */
2968static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2969
2970  KMP_DEBUG_ASSERT(team);
2971  if (!realloc || argc > team->t.t_max_argc) {
2972
2973    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2974                   "current entries=%d\n",
2975                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2976    /* if previously allocated heap space for args, free them */
2977    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2978      __kmp_free((void *)team->t.t_argv);
2979
2980    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2981      /* use unused space in the cache line for arguments */
2982      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2983      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2984                     "argv entries\n",
2985                     team->t.t_id, team->t.t_max_argc));
2986      team->t.t_argv = &team->t.t_inline_argv[0];
2987      if (__kmp_storage_map) {
2988        __kmp_print_storage_map_gtid(
2989            -1, &team->t.t_inline_argv[0],
2990            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2991            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2992            team->t.t_id);
2993      }
2994    } else {
2995      /* allocate space for arguments in the heap */
2996      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2997                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
2998                               : 2 * argc;
2999      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3000                     "argv entries\n",
3001                     team->t.t_id, team->t.t_max_argc));
3002      team->t.t_argv =
3003          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3004      if (__kmp_storage_map) {
3005        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3006                                     &team->t.t_argv[team->t.t_max_argc],
3007                                     sizeof(void *) * team->t.t_max_argc,
3008                                     "team_%d.t_argv", team->t.t_id);
3009      }
3010    }
3011  }
3012}
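
// Editor's note: the sizing policy above, written out as a stand-alone helper
// (illustrative only, never compiled): small argument lists reuse the inline
// cache-line space, larger ones get a heap block of at least
// KMP_MIN_MALLOC_ARGV_ENTRIES entries, or 2 * argc if that is larger.
#if 0
static int example_argv_capacity(int argc) {
  if (argc <= KMP_INLINE_ARGV_ENTRIES)
    return KMP_INLINE_ARGV_ENTRIES; // no heap allocation needed
  return (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) ? KMP_MIN_MALLOC_ARGV_ENTRIES
                                                      : 2 * argc;
}
#endif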
3013
3014static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3015  int i;
3016  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3017  team->t.t_threads =
3018      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3019  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3020      sizeof(dispatch_shared_info_t) * num_disp_buff);
3021  team->t.t_dispatch =
3022      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3023  team->t.t_implicit_task_taskdata =
3024      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3025  team->t.t_max_nproc = max_nth;
3026
3027  /* setup dispatch buffers */
3028  for (i = 0; i < num_disp_buff; ++i) {
3029    team->t.t_disp_buffer[i].buffer_index = i;
3030    team->t.t_disp_buffer[i].doacross_buf_idx = i;
3031  }
3032}
3033
3034static void __kmp_free_team_arrays(kmp_team_t *team) {
3035  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3036  int i;
3037  for (i = 0; i < team->t.t_max_nproc; ++i) {
3038    if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3039      __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3040      team->t.t_dispatch[i].th_disp_buffer = NULL;
3041    }
3042  }
3043#if KMP_USE_HIER_SCHED
3044  __kmp_dispatch_free_hierarchies(team);
3045#endif
3046  __kmp_free(team->t.t_threads);
3047  __kmp_free(team->t.t_disp_buffer);
3048  __kmp_free(team->t.t_dispatch);
3049  __kmp_free(team->t.t_implicit_task_taskdata);
3050  team->t.t_threads = NULL;
3051  team->t.t_disp_buffer = NULL;
3052  team->t.t_dispatch = NULL;
3053  team->t.t_implicit_task_taskdata = 0;
3054}
3055
3056static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3057  kmp_info_t **oldThreads = team->t.t_threads;
3058
3059  __kmp_free(team->t.t_disp_buffer);
3060  __kmp_free(team->t.t_dispatch);
3061  __kmp_free(team->t.t_implicit_task_taskdata);
3062  __kmp_allocate_team_arrays(team, max_nth);
3063
3064  KMP_MEMCPY(team->t.t_threads, oldThreads,
3065             team->t.t_nproc * sizeof(kmp_info_t *));
3066
3067  __kmp_free(oldThreads);
3068}
3069
3070static kmp_internal_control_t __kmp_get_global_icvs(void) {
3071
3072  kmp_r_sched_t r_sched =
3073      __kmp_get_schedule_global(); // get current state of scheduling globals
3074
3075  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3076
3077  kmp_internal_control_t g_icvs = {
3078    0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3079    (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3080    // adjustment of threads (per thread)
3081    (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3082    // whether blocktime is explicitly set
3083    __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3084#if KMP_USE_MONITOR
3085    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3086// intervals
3087#endif
3088    __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3089    // next parallel region (per thread)
3090    // (use a max ub on value if __kmp_parallel_initialize not called yet)
3091    __kmp_cg_max_nth, // int thread_limit;
3092    __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3093    // for max_active_levels
3094    r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3095    // {sched,chunk} pair
3096    __kmp_nested_proc_bind.bind_types[0],
3097    __kmp_default_device,
3098    NULL // struct kmp_internal_control *next;
3099  };
3100
3101  return g_icvs;
3102}
3103
3104static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3105
3106  kmp_internal_control_t gx_icvs;
3107  gx_icvs.serial_nesting_level =
3108      0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3109  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3110  gx_icvs.next = NULL;
3111
3112  return gx_icvs;
3113}
3114
3115static void __kmp_initialize_root(kmp_root_t *root) {
3116  int f;
3117  kmp_team_t *root_team;
3118  kmp_team_t *hot_team;
3119  int hot_team_max_nth;
3120  kmp_r_sched_t r_sched =
3121      __kmp_get_schedule_global(); // get current state of scheduling globals
3122  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3123  KMP_DEBUG_ASSERT(root);
3124  KMP_ASSERT(!root->r.r_begin);
3125
3126  /* setup the root state structure */
3127  __kmp_init_lock(&root->r.r_begin_lock);
3128  root->r.r_begin = FALSE;
3129  root->r.r_active = FALSE;
3130  root->r.r_in_parallel = 0;
3131  root->r.r_blocktime = __kmp_dflt_blocktime;
3132
3133  /* setup the root team for this task */
3134  /* allocate the root team structure */
3135  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3136
3137  root_team =
3138      __kmp_allocate_team(root,
3139                          1, // new_nproc
3140                          1, // max_nproc
3141#if OMPT_SUPPORT
3142                          ompt_data_none, // root parallel id
3143#endif
3144                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3145                          0 // argc
3146                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3147                          );
3148#if USE_DEBUGGER
3149  // Non-NULL value should be assigned to make the debugger display the root
3150  // team.
3151  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3152#endif
3153
3154  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3155
3156  root->r.r_root_team = root_team;
3157  root_team->t.t_control_stack_top = NULL;
3158
3159  /* initialize root team */
3160  root_team->t.t_threads[0] = NULL;
3161  root_team->t.t_nproc = 1;
3162  root_team->t.t_serialized = 1;
3163  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3164  root_team->t.t_sched.sched = r_sched.sched;
3165  KA_TRACE(
3166      20,
3167      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3168       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3169
3170  /* setup the hot team for this task */
3171  /* allocate the hot team structure */
3172  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3173
3174  hot_team =
3175      __kmp_allocate_team(root,
3176                          1, // new_nproc
3177                          __kmp_dflt_team_nth_ub * 2, // max_nproc
3178#if OMPT_SUPPORT
3179                          ompt_data_none, // root parallel id
3180#endif
3181                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3182                          0 // argc
3183                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3184                          );
3185  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3186
3187  root->r.r_hot_team = hot_team;
3188  root_team->t.t_control_stack_top = NULL;
3189
3190  /* first-time initialization */
3191  hot_team->t.t_parent = root_team;
3192
3193  /* initialize hot team */
3194  hot_team_max_nth = hot_team->t.t_max_nproc;
3195  for (f = 0; f < hot_team_max_nth; ++f) {
3196    hot_team->t.t_threads[f] = NULL;
3197  }
3198  hot_team->t.t_nproc = 1;
3199  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3200  hot_team->t.t_sched.sched = r_sched.sched;
3201  hot_team->t.t_size_changed = 0;
3202}
3203
3204#ifdef KMP_DEBUG
3205
3206typedef struct kmp_team_list_item {
3207  kmp_team_p const *entry;
3208  struct kmp_team_list_item *next;
3209} kmp_team_list_item_t;
3210typedef kmp_team_list_item_t *kmp_team_list_t;
3211
3212static void __kmp_print_structure_team_accum( // Add team to list of teams.
3213    kmp_team_list_t list, // List of teams.
3214    kmp_team_p const *team // Team to add.
3215    ) {
3216
3217  // List must terminate with item where both entry and next are NULL.
3218  // Team is added to the list only once.
3219  // List is sorted in ascending order by team id.
3220  // Team id is *not* a key.
3221
3222  kmp_team_list_t l;
3223
3224  KMP_DEBUG_ASSERT(list != NULL);
3225  if (team == NULL) {
3226    return;
3227  }
3228
3229  __kmp_print_structure_team_accum(list, team->t.t_parent);
3230  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3231
3232  // Search list for the team.
3233  l = list;
3234  while (l->next != NULL && l->entry != team) {
3235    l = l->next;
3236  }
3237  if (l->next != NULL) {
3238    return; // Team has been added before, exit.
3239  }
3240
3241  // Team is not found. Search list again for insertion point.
3242  l = list;
3243  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3244    l = l->next;
3245  }
3246
3247  // Insert team.
3248  {
3249    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3250        sizeof(kmp_team_list_item_t));
3251    *item = *l;
3252    l->entry = team;
3253    l->next = item;
3254  }
3255}
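
// Editor's note: the accumulator above keeps the list sorted by t_id and
// terminated by a sentinel item whose entry and next are both NULL; insertion
// copies the current node into a fresh item so that the current node can take
// the new entry. A generic stand-alone version of that trick, with the
// duplicate check omitted (illustrative only, never compiled):
#if 0
#include <stdlib.h>

typedef struct example_item {
  int key;
  struct example_item *next; // the last item acts as the sentinel
} example_item_t;

static void example_sentinel_insert(example_item_t *list, int key) {
  example_item_t *l = list;
  while (l->next != NULL && l->key <= key)
    l = l->next; // stop at the sentinel or at the first larger key
  example_item_t *copy = (example_item_t *)malloc(sizeof(*copy));
  *copy = *l;   // push the old contents (possibly the sentinel) down one slot
  l->key = key; // the current node now holds the new entry
  l->next = copy;
}
#endif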
3256
3257static void __kmp_print_structure_team(char const *title,
3258                                       kmp_team_p const *team) {
3260  __kmp_printf("%s", title);
3261  if (team != NULL) {
3262    __kmp_printf("%2x %p\n", team->t.t_id, team);
3263  } else {
3264    __kmp_printf(" - (nil)\n");
3265  }
3266}
3267
3268static void __kmp_print_structure_thread(char const *title,
3269                                         kmp_info_p const *thread) {
3270  __kmp_printf("%s", title);
3271  if (thread != NULL) {
3272    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3273  } else {
3274    __kmp_printf(" - (nil)\n");
3275  }
3276}
3277
3278void __kmp_print_structure(void) {
3279
3280  kmp_team_list_t list;
3281
3282  // Initialize list of teams.
3283  list =
3284      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3285  list->entry = NULL;
3286  list->next = NULL;
3287
3288  __kmp_printf("\n------------------------------\nGlobal Thread "
3289               "Table\n------------------------------\n");
3290  {
3291    int gtid;
3292    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3293      __kmp_printf("%2d", gtid);
3294      if (__kmp_threads != NULL) {
3295        __kmp_printf(" %p", __kmp_threads[gtid]);
3296      }
3297      if (__kmp_root != NULL) {
3298        __kmp_printf(" %p", __kmp_root[gtid]);
3299      }
3300      __kmp_printf("\n");
3301    }
3302  }
3303
3304  // Print out __kmp_threads array.
3305  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3306               "----------\n");
3307  if (__kmp_threads != NULL) {
3308    int gtid;
3309    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3310      kmp_info_t const *thread = __kmp_threads[gtid];
3311      if (thread != NULL) {
3312        __kmp_printf("GTID %2d %p:\n", gtid, thread);
3313        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3314        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3315        __kmp_print_structure_team("    Serial Team:  ",
3316                                   thread->th.th_serial_team);
3317        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3318        __kmp_print_structure_thread("    Master:       ",
3319                                     thread->th.th_team_master);
3320        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3321        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3322        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3323        __kmp_print_structure_thread("    Next in pool: ",
3324                                     thread->th.th_next_pool);
3325        __kmp_printf("\n");
3326        __kmp_print_structure_team_accum(list, thread->th.th_team);
3327        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3328      }
3329    }
3330  } else {
3331    __kmp_printf("Threads array is not allocated.\n");
3332  }
3333
3334  // Print out __kmp_root array.
3335  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3336               "--------\n");
3337  if (__kmp_root != NULL) {
3338    int gtid;
3339    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3340      kmp_root_t const *root = __kmp_root[gtid];
3341      if (root != NULL) {
3342        __kmp_printf("GTID %2d %p:\n", gtid, root);
3343        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3344        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3345        __kmp_print_structure_thread("    Uber Thread:  ",
3346                                     root->r.r_uber_thread);
3347        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3348        __kmp_printf("    In Parallel:  %2d\n",
3349                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3350        __kmp_printf("\n");
3351        __kmp_print_structure_team_accum(list, root->r.r_root_team);
3352        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3353      }
3354    }
3355  } else {
3356    __kmp_printf("Ubers array is not allocated.\n");
3357  }
3358
3359  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3360               "--------\n");
3361  while (list->next != NULL) {
3362    kmp_team_p const *team = list->entry;
3363    int i;
3364    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3365    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3366    __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3367    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3368    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3369    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3370    for (i = 0; i < team->t.t_nproc; ++i) {
3371      __kmp_printf("    Thread %2d:      ", i);
3372      __kmp_print_structure_thread("", team->t.t_threads[i]);
3373    }
3374    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3375    __kmp_printf("\n");
3376    list = list->next;
3377  }
3378
3379  // Print out __kmp_thread_pool and __kmp_team_pool.
3380  __kmp_printf("\n------------------------------\nPools\n----------------------"
3381               "--------\n");
3382  __kmp_print_structure_thread("Thread pool:          ",
3383                               CCAST(kmp_info_t *, __kmp_thread_pool));
3384  __kmp_print_structure_team("Team pool:            ",
3385                             CCAST(kmp_team_t *, __kmp_team_pool));
3386  __kmp_printf("\n");
3387
3388  // Free team list.
3389  while (list != NULL) {
3390    kmp_team_list_item_t *item = list;
3391    list = list->next;
3392    KMP_INTERNAL_FREE(item);
3393  }
3394}
3395
3396#endif
3397
3398//---------------------------------------------------------------------------
3399//  Stuff for per-thread fast random number generator
3400//  Table of primes
3401static const unsigned __kmp_primes[] = {
3402    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3403    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3404    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3405    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3406    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3407    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3408    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3409    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3410    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3411    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3412    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3413
3414//---------------------------------------------------------------------------
3415//  __kmp_get_random: Get a random number using a linear congruential method.
3416unsigned short __kmp_get_random(kmp_info_t *thread) {
3417  unsigned x = thread->th.th_x;
3418  unsigned short r = x >> 16;
3419
3420  thread->th.th_x = x * thread->th.th_a + 1;
3421
3422  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3423                thread->th.th_info.ds.ds_tid, r));
3424
3425  return r;
3426}
3427//--------------------------------------------------------
3428// __kmp_init_random: Initialize a random number generator
3429void __kmp_init_random(kmp_info_t *thread) {
3430  unsigned seed = thread->th.th_info.ds.ds_tid;
3431
3432  thread->th.th_a =
3433      __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3434  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3435  KA_TRACE(30,
3436           ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3437}
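
// Editor's note: stand-alone version of the per-thread generator above
// (illustrative only, never compiled). It is a plain linear congruential
// recurrence x_{n+1} = a * x_n + 1 (mod 2^32) whose multiplier 'a' is picked
// from __kmp_primes per thread; it returns the high 16 bits of the state,
// which have the longest period in a power-of-two-modulus LCG.
#if 0
#include <stdio.h>

static unsigned example_a, example_x;

static void example_seed(unsigned tid) {
  example_a = 0x9e3779b1u;               // any entry of __kmp_primes
  example_x = (tid + 1) * example_a + 1; // mirrors __kmp_init_random
}

static unsigned short example_next(void) {
  unsigned short r = (unsigned short)(example_x >> 16);
  example_x = example_x * example_a + 1; // mirrors __kmp_get_random
  return r;
}

int main(void) {
  example_seed(0);
  for (int i = 0; i < 4; ++i)
    printf("%u\n", (unsigned)example_next());
  return 0;
}
#endif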
3438
3439#if KMP_OS_WINDOWS
3440/* reclaim array entries for root threads that are already dead, returns number
3441 * reclaimed */
3442static int __kmp_reclaim_dead_roots(void) {
3443  int i, r = 0;
3444
3445  for (i = 0; i < __kmp_threads_capacity; ++i) {
3446    if (KMP_UBER_GTID(i) &&
3447        !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3448        !__kmp_root[i]
3449             ->r.r_active) { // AC: reclaim only roots died in non-active state
3450      r += __kmp_unregister_root_other_thread(i);
3451    }
3452  }
3453  return r;
3454}
3455#endif
3456
3457/* This function attempts to create free entries in __kmp_threads and
3458   __kmp_root, and returns the number of free entries generated.
3459
3460   For Windows* OS static library, the first mechanism used is to reclaim array
3461   entries for root threads that are already dead.
3462
3463   On all platforms, expansion is attempted on the arrays __kmp_threads and
3464   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3465   capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3466   threadprivate cache array has been created. Synchronization with
3467   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3468
3469   After any dead root reclamation, if the clipping value allows array expansion
3470   to result in the generation of a total of nNeed free slots, the function does
3471   that expansion. If not, nothing is done beyond the possible initial root
3472   thread reclamation.
3473
3474   If any argument is negative, the behavior is undefined. */
3475static int __kmp_expand_threads(int nNeed) {
3476  int added = 0;
3477  int minimumRequiredCapacity;
3478  int newCapacity;
3479  kmp_info_t **newThreads;
3480  kmp_root_t **newRoot;
3481
3482// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3483// resizing __kmp_threads does not need additional protection if foreign
3484// threads are present
3485
3486#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3487  /* only for Windows static library */
3488  /* reclaim array entries for root threads that are already dead */
3489  added = __kmp_reclaim_dead_roots();
3490
3491  if (nNeed) {
3492    nNeed -= added;
3493    if (nNeed < 0)
3494      nNeed = 0;
3495  }
3496#endif
3497  if (nNeed <= 0)
3498    return added;
3499
3500  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3501  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3502  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3503  // > __kmp_max_nth in one of two ways:
3504  //
3505  // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3506  //    may not be reused by another thread, so we may need to increase
3507  //    __kmp_threads_capacity to __kmp_max_nth + 1.
3508  //
3509  // 2) New foreign root(s) are encountered.  We always register new foreign
3510  //    roots. This may cause a smaller # of threads to be allocated at
3511  //    subsequent parallel regions, but the worker threads hang around (and
3512  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3513  //
3514  // Anyway, that is the reason for moving the check to see if
3515  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3516  // instead of having it performed here. -BB
3517
3518  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3519
3520  /* compute expansion headroom to check if we can expand */
3521  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3522    /* possible expansion too small -- give up */
3523    return added;
3524  }
3525  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3526
3527  newCapacity = __kmp_threads_capacity;
3528  do {
3529    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3530                                                          : __kmp_sys_max_nth;
3531  } while (newCapacity < minimumRequiredCapacity);
3532  newThreads = (kmp_info_t **)__kmp_allocate(
3533      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3534  newRoot =
3535      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3536  KMP_MEMCPY(newThreads, __kmp_threads,
3537             __kmp_threads_capacity * sizeof(kmp_info_t *));
3538  KMP_MEMCPY(newRoot, __kmp_root,
3539             __kmp_threads_capacity * sizeof(kmp_root_t *));
3540
3541  kmp_info_t **temp_threads = __kmp_threads;
3542  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3543  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3544  __kmp_free(temp_threads);
3545  added += newCapacity - __kmp_threads_capacity;
3546  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3547
3548  if (newCapacity > __kmp_tp_capacity) {
3549    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3550    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3551      __kmp_threadprivate_resize_cache(newCapacity);
3552    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3553      *(volatile int *)&__kmp_tp_capacity = newCapacity;
3554    }
3555    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3556  }
3557
3558  return added;
3559}
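
// Editor's note: the capacity growth rule used above, isolated as a helper
// (illustrative only, never compiled): keep doubling, but clip to the system
// maximum, until at least 'required' slots fit.
#if 0
static int example_new_capacity(int current, int required, int sys_max) {
  int cap = current;
  do {
    cap = (cap <= (sys_max >> 1)) ? (cap << 1) : sys_max;
  } while (cap < required);
  return cap;
}
#endif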
3560
3561/* Register the current thread as a root thread and obtain our gtid. We must
3562   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3563   the thread that calls from __kmp_do_serial_initialize() */
3564int __kmp_register_root(int initial_thread) {
3565  kmp_info_t *root_thread;
3566  kmp_root_t *root;
3567  int gtid;
3568  int capacity;
3569  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3570  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3571  KMP_MB();
3572
3573  /* 2007-03-02:
3574     If the initial thread has not invoked the OpenMP RTL yet, and this thread
3575     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3576     condition does not work as expected -- it may return false (meaning there
3577     is at least one empty slot in the __kmp_threads array), but it is possible
3578     that the only free slot is #0, which is reserved for the initial thread and
3579     so cannot be used for this one. The following code works around this bug.
3580
3581     However, the right solution seems to be not to reserve slot #0 for the
3582     initial thread, because:
3583     (1) there is no magic in slot #0,
3584     (2) we cannot detect the initial thread reliably (the first thread that
3585        performs serial initialization may not be the real initial thread).
3586  */
3587  capacity = __kmp_threads_capacity;
3588  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3589    --capacity;
3590  }
3591
3592  /* see if there are too many threads */
3593  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3594    if (__kmp_tp_cached) {
3595      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3596                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3597                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3598    } else {
3599      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3600                  __kmp_msg_null);
3601    }
3602  }
3603
3604  /* find an available thread slot */
3605  /* Don't reassign the zero slot since we need that to only be used by initial
3606     thread */
3607  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3608       gtid++)
3609    ;
3610  KA_TRACE(1,
3611           ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3612  KMP_ASSERT(gtid < __kmp_threads_capacity);
3613
3614  /* update global accounting */
3615  __kmp_all_nth++;
3616  TCW_4(__kmp_nth, __kmp_nth + 1);
3617
3618  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3619  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3620  if (__kmp_adjust_gtid_mode) {
3621    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3622      if (TCR_4(__kmp_gtid_mode) != 2) {
3623        TCW_4(__kmp_gtid_mode, 2);
3624      }
3625    } else {
3626      if (TCR_4(__kmp_gtid_mode) != 1) {
3627        TCW_4(__kmp_gtid_mode, 1);
3628      }
3629    }
3630  }
3631
3632#ifdef KMP_ADJUST_BLOCKTIME
3633  /* Adjust blocktime to zero if necessary            */
3634  /* Middle initialization might not have occurred yet */
3635  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3636    if (__kmp_nth > __kmp_avail_proc) {
3637      __kmp_zero_bt = TRUE;
3638    }
3639  }
3640#endif /* KMP_ADJUST_BLOCKTIME */
3641
3642  /* setup this new hierarchy */
3643  if (!(root = __kmp_root[gtid])) {
3644    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3645    KMP_DEBUG_ASSERT(!root->r.r_root_team);
3646  }
3647
3648#if KMP_STATS_ENABLED
3649  // Initialize stats as soon as possible (right after gtid assignment).
3650  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3651  __kmp_stats_thread_ptr->startLife();
3652  KMP_SET_THREAD_STATE(SERIAL_REGION);
3653  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3654#endif
3655  __kmp_initialize_root(root);
3656
3657  /* setup new root thread structure */
3658  if (root->r.r_uber_thread) {
3659    root_thread = root->r.r_uber_thread;
3660  } else {
3661    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3662    if (__kmp_storage_map) {
3663      __kmp_print_thread_storage_map(root_thread, gtid);
3664    }
3665    root_thread->th.th_info.ds.ds_gtid = gtid;
3666#if OMPT_SUPPORT
3667    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3668#endif
3669    root_thread->th.th_root = root;
3670    if (__kmp_env_consistency_check) {
3671      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3672    }
3673#if USE_FAST_MEMORY
3674    __kmp_initialize_fast_memory(root_thread);
3675#endif /* USE_FAST_MEMORY */
3676
3677#if KMP_USE_BGET
3678    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3679    __kmp_initialize_bget(root_thread);
3680#endif
3681    __kmp_init_random(root_thread); // Initialize random number generator
3682  }
3683
3684  /* setup the serial team held in reserve by the root thread */
3685  if (!root_thread->th.th_serial_team) {
3686    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3687    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3688    root_thread->th.th_serial_team = __kmp_allocate_team(
3689        root, 1, 1,
3690#if OMPT_SUPPORT
3691        ompt_data_none, // root parallel id
3692#endif
3693        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3694  }
3695  KMP_ASSERT(root_thread->th.th_serial_team);
3696  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3697                root_thread->th.th_serial_team));
3698
3699  /* drop root_thread into place */
3700  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3701
3702  root->r.r_root_team->t.t_threads[0] = root_thread;
3703  root->r.r_hot_team->t.t_threads[0] = root_thread;
3704  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3705  // AC: the team is created in reserve, not for execution (it is unused now).
3706  root_thread->th.th_serial_team->t.t_serialized = 0;
3707  root->r.r_uber_thread = root_thread;
3708
3709  /* initialize the thread, get it ready to go */
3710  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3711  TCW_4(__kmp_init_gtid, TRUE);
3712
3713  /* prepare the master thread for get_gtid() */
3714  __kmp_gtid_set_specific(gtid);
3715
3716#if USE_ITT_BUILD
3717  __kmp_itt_thread_name(gtid);
3718#endif /* USE_ITT_BUILD */
3719
3720#ifdef KMP_TDATA_GTID
3721  __kmp_gtid = gtid;
3722#endif
3723  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3724  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3725
3726  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3727                "plain=%u\n",
3728                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3729                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3730                KMP_INIT_BARRIER_STATE));
3731  { // Initialize barrier data.
3732    int b;
3733    for (b = 0; b < bs_last_barrier; ++b) {
3734      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3735#if USE_DEBUGGER
3736      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3737#endif
3738    }
3739  }
3740  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3741                   KMP_INIT_BARRIER_STATE);
3742
3743#if KMP_AFFINITY_SUPPORTED
3744  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3745  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3746  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3747  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3748  if (TCR_4(__kmp_init_middle)) {
3749    __kmp_affinity_set_init_mask(gtid, TRUE);
3750  }
3751#endif /* KMP_AFFINITY_SUPPORTED */
3752  root_thread->th.th_def_allocator = __kmp_def_allocator;
3753  root_thread->th.th_prev_level = 0;
3754  root_thread->th.th_prev_num_threads = 1;
3755
3756  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3757  tmp->cg_root = root_thread;
3758  tmp->cg_thread_limit = __kmp_cg_max_nth;
3759  tmp->cg_nthreads = 1;
3760  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3761                 " cg_nthreads init to 1\n",
3762                 root_thread, tmp));
3763  tmp->up = NULL;
3764  root_thread->th.th_cg_roots = tmp;
3765
3766  __kmp_root_counter++;
3767
3768#if OMPT_SUPPORT
3769  if (!initial_thread && ompt_enabled.enabled) {
3770
3771    kmp_info_t *root_thread = ompt_get_thread();
3772
3773    ompt_set_thread_state(root_thread, ompt_state_overhead);
3774
3775    if (ompt_enabled.ompt_callback_thread_begin) {
3776      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3777          ompt_thread_initial, __ompt_get_thread_data_internal());
3778    }
3779    ompt_data_t *task_data;
3780    ompt_data_t *parallel_data;
3781    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3782    if (ompt_enabled.ompt_callback_implicit_task) {
3783      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3784          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3785    }
3786
3787    ompt_set_thread_state(root_thread, ompt_state_work_serial);
3788  }
3789#endif
3790
3791  KMP_MB();
3792  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3793
3794  return gtid;
3795}
3796
3797#if KMP_NESTED_HOT_TEAMS
3798static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3799                                const int max_level) {
3800  int i, n, nth;
3801  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3802  if (!hot_teams || !hot_teams[level].hot_team) {
3803    return 0;
3804  }
3805  KMP_DEBUG_ASSERT(level < max_level);
3806  kmp_team_t *team = hot_teams[level].hot_team;
3807  nth = hot_teams[level].hot_team_nth;
3808  n = nth - 1; // master is not freed
3809  if (level < max_level - 1) {
3810    for (i = 0; i < nth; ++i) {
3811      kmp_info_t *th = team->t.t_threads[i];
3812      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3813      if (i > 0 && th->th.th_hot_teams) {
3814        __kmp_free(th->th.th_hot_teams);
3815        th->th.th_hot_teams = NULL;
3816      }
3817    }
3818  }
3819  __kmp_free_team(root, team, NULL);
3820  return n;
3821}
3822#endif
3823
3824// Resets a root thread and clears its root and hot teams.
3825// Returns the number of __kmp_threads entries directly and indirectly freed.
3826static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3827  kmp_team_t *root_team = root->r.r_root_team;
3828  kmp_team_t *hot_team = root->r.r_hot_team;
3829  int n = hot_team->t.t_nproc;
3830  int i;
3831
3832  KMP_DEBUG_ASSERT(!root->r.r_active);
3833
3834  root->r.r_root_team = NULL;
3835  root->r.r_hot_team = NULL;
3836  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3837  // before the call to __kmp_free_team().
3838  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3839#if KMP_NESTED_HOT_TEAMS
3840  if (__kmp_hot_teams_max_level >
3841      0) { // need to free nested hot teams and their threads if any
3842    for (i = 0; i < hot_team->t.t_nproc; ++i) {
3843      kmp_info_t *th = hot_team->t.t_threads[i];
3844      if (__kmp_hot_teams_max_level > 1) {
3845        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3846      }
3847      if (th->th.th_hot_teams) {
3848        __kmp_free(th->th.th_hot_teams);
3849        th->th.th_hot_teams = NULL;
3850      }
3851    }
3852  }
3853#endif
3854  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3855
3856  // Before we can reap the thread, we need to make certain that all other
3857  // threads in the teams that had this root as ancestor have stopped trying to
3858  // steal tasks.
3859  if (__kmp_tasking_mode != tskm_immediate_exec) {
3860    __kmp_wait_to_unref_task_teams();
3861  }
3862
3863#if KMP_OS_WINDOWS
3864  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3865  KA_TRACE(
3866      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3867           "\n",
3868           (LPVOID) & (root->r.r_uber_thread->th),
3869           root->r.r_uber_thread->th.th_info.ds.ds_thread));
3870  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3871#endif /* KMP_OS_WINDOWS */
3872
3873#if OMPT_SUPPORT
3874  ompt_data_t *task_data;
3875  ompt_data_t *parallel_data;
3876  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3877  if (ompt_enabled.ompt_callback_implicit_task) {
3878    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3879        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3880  }
3881  if (ompt_enabled.ompt_callback_thread_end) {
3882    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3883        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3884  }
3885#endif
3886
3887  TCW_4(__kmp_nth,
3888        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3889  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3890  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3891                 " to %d\n",
3892                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3893                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3894  if (i == 1) {
3895    // need to free contention group structure
3896    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3897                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
3898    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3899    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3900    root->r.r_uber_thread->th.th_cg_roots = NULL;
3901  }
3902  __kmp_reap_thread(root->r.r_uber_thread, 1);
3903
3904  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3905  // it instead of freeing it.
3906  root->r.r_uber_thread = NULL;
3907  /* mark root as no longer in use */
3908  root->r.r_begin = FALSE;
3909
3910  return n;
3911}
3912
3913void __kmp_unregister_root_current_thread(int gtid) {
3914  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3915  /* This lock should be OK, since unregister_root_current_thread is never
3916     called during an abort, only during a normal close. Furthermore, if you
3917     have the forkjoin lock, you should never try to get the initz lock. */
3918  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3919  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3920    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3921                  "exiting T#%d\n",
3922                  gtid));
3923    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3924    return;
3925  }
3926  kmp_root_t *root = __kmp_root[gtid];
3927
3928  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3929  KMP_ASSERT(KMP_UBER_GTID(gtid));
3930  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3931  KMP_ASSERT(root->r.r_active == FALSE);
3932
3933  KMP_MB();
3934
3935  kmp_info_t *thread = __kmp_threads[gtid];
3936  kmp_team_t *team = thread->th.th_team;
3937  kmp_task_team_t *task_team = thread->th.th_task_team;
3938
3939  // we need to wait for the proxy tasks before finishing the thread
3940  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3941#if OMPT_SUPPORT
3942    // the runtime is shutting down so we won't report any events
3943    thread->th.ompt_thread_info.state = ompt_state_undefined;
3944#endif
3945    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3946  }
3947
3948  __kmp_reset_root(gtid, root);
3949
3950  /* free up this thread slot */
3951  __kmp_gtid_set_specific(KMP_GTID_DNE);
3952#ifdef KMP_TDATA_GTID
3953  __kmp_gtid = KMP_GTID_DNE;
3954#endif
3955
3956  KMP_MB();
3957  KC_TRACE(10,
3958           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3959
3960  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3961}
3962
3963#if KMP_OS_WINDOWS
3964/* __kmp_forkjoin_lock must already be held.
3965   Unregisters a root thread that is not the current thread.  Returns the number
3966   of __kmp_threads entries freed as a result. */
3967static int __kmp_unregister_root_other_thread(int gtid) {
3968  kmp_root_t *root = __kmp_root[gtid];
3969  int r;
3970
3971  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3972  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3973  KMP_ASSERT(KMP_UBER_GTID(gtid));
3974  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3975  KMP_ASSERT(root->r.r_active == FALSE);
3976
3977  r = __kmp_reset_root(gtid, root);
3978  KC_TRACE(10,
3979           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3980  return r;
3981}
3982#endif
3983
3984#if KMP_DEBUG
3985void __kmp_task_info() {
3986
3987  kmp_int32 gtid = __kmp_entry_gtid();
3988  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3989  kmp_info_t *this_thr = __kmp_threads[gtid];
3990  kmp_team_t *steam = this_thr->th.th_serial_team;
3991  kmp_team_t *team = this_thr->th.th_team;
3992
3993  __kmp_printf(
3994      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
3995      "ptask=%p\n",
3996      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
3997      team->t.t_implicit_task_taskdata[tid].td_parent);
3998}
3999#endif // KMP_DEBUG
4000
4001/* TODO optimize with one big memclr, take out what isn't needed, split
4002   responsibility to workers as much as possible, and delay initialization of
4003   features as much as possible  */
4004static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4005                                  int tid, int gtid) {
4006  /* this_thr->th.th_info.ds.ds_gtid is setup in
4007     kmp_allocate_thread/create_worker.
4008     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4009  kmp_info_t *master = team->t.t_threads[0];
4010  KMP_DEBUG_ASSERT(this_thr != NULL);
4011  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4012  KMP_DEBUG_ASSERT(team);
4013  KMP_DEBUG_ASSERT(team->t.t_threads);
4014  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4015  KMP_DEBUG_ASSERT(master);
4016  KMP_DEBUG_ASSERT(master->th.th_root);
4017
4018  KMP_MB();
4019
4020  TCW_SYNC_PTR(this_thr->th.th_team, team);
4021
4022  this_thr->th.th_info.ds.ds_tid = tid;
4023  this_thr->th.th_set_nproc = 0;
4024  if (__kmp_tasking_mode != tskm_immediate_exec)
4025    // When tasking is possible, threads are not safe to reap until they are
4026    // done tasking; this will be set when tasking code is exited in wait
4027    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4028  else // no tasking --> always safe to reap
4029    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4030  this_thr->th.th_set_proc_bind = proc_bind_default;
4031#if KMP_AFFINITY_SUPPORTED
4032  this_thr->th.th_new_place = this_thr->th.th_current_place;
4033#endif
4034  this_thr->th.th_root = master->th.th_root;
4035
4036  /* setup the thread's cache of the team structure */
4037  this_thr->th.th_team_nproc = team->t.t_nproc;
4038  this_thr->th.th_team_master = master;
4039  this_thr->th.th_team_serialized = team->t.t_serialized;
4040  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4041
4042  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4043
4044  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4045                tid, gtid, this_thr, this_thr->th.th_current_task));
4046
4047  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4048                           team, tid, TRUE);
4049
4050  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4051                tid, gtid, this_thr, this_thr->th.th_current_task));
4052  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4053  // __kmp_initialize_team()?
4054
4055  /* TODO no worksharing in speculative threads */
4056  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4057
4058  this_thr->th.th_local.this_construct = 0;
4059
4060  if (!this_thr->th.th_pri_common) {
4061    this_thr->th.th_pri_common =
4062        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4063    if (__kmp_storage_map) {
4064      __kmp_print_storage_map_gtid(
4065          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4066          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4067    }
4068    this_thr->th.th_pri_head = NULL;
4069  }
4070
4071  if (this_thr != master && // Master's CG root is initialized elsewhere
4072      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4073    // Make new thread's CG root same as master's
4074    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4075    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4076    if (tmp) {
4077      // worker changes CG, need to check if old CG should be freed
4078      int i = tmp->cg_nthreads--;
4079      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4080                     " on node %p of thread %p to %d\n",
4081                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4082      if (i == 1) {
4083        __kmp_free(tmp); // last thread left CG --> free it
4084      }
4085    }
4086    this_thr->th.th_cg_roots = master->th.th_cg_roots;
4087    // Increment new thread's CG root's counter to add the new thread
4088    this_thr->th.th_cg_roots->cg_nthreads++;
4089    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4090                   " node %p of thread %p to %d\n",
4091                   this_thr, this_thr->th.th_cg_roots,
4092                   this_thr->th.th_cg_roots->cg_root,
4093                   this_thr->th.th_cg_roots->cg_nthreads));
4094    this_thr->th.th_current_task->td_icvs.thread_limit =
4095        this_thr->th.th_cg_roots->cg_thread_limit;
4096  }
4097
4098  /* Initialize dynamic dispatch */
4099  {
4100    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4101    // Use team max_nproc since this will never change for the team.
4102    size_t disp_size =
4103        sizeof(dispatch_private_info_t) *
4104        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4105    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4106                  team->t.t_max_nproc));
4107    KMP_ASSERT(dispatch);
4108    KMP_DEBUG_ASSERT(team->t.t_dispatch);
4109    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4110
4111    dispatch->th_disp_index = 0;
4112    dispatch->th_doacross_buf_idx = 0;
4113    if (!dispatch->th_disp_buffer) {
4114      dispatch->th_disp_buffer =
4115          (dispatch_private_info_t *)__kmp_allocate(disp_size);
4116
4117      if (__kmp_storage_map) {
4118        __kmp_print_storage_map_gtid(
4119            gtid, &dispatch->th_disp_buffer[0],
4120            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4121                                          ? 1
4122                                          : __kmp_dispatch_num_buffers],
4123            disp_size, "th_%d.th_dispatch.th_disp_buffer "
4124                       "(team_%d.t_dispatch[%d].th_disp_buffer)",
4125            gtid, team->t.t_id, gtid);
4126      }
4127    } else {
4128      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4129    }
4130
4131    dispatch->th_dispatch_pr_current = 0;
4132    dispatch->th_dispatch_sh_current = 0;
4133
4134    dispatch->th_deo_fcn = 0; /* ORDERED     */
4135    dispatch->th_dxo_fcn = 0; /* END ORDERED */
4136  }
4137
4138  this_thr->th.th_next_pool = NULL;
4139
4140  if (!this_thr->th.th_task_state_memo_stack) {
4141    size_t i;
4142    this_thr->th.th_task_state_memo_stack =
4143        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4144    this_thr->th.th_task_state_top = 0;
4145    this_thr->th.th_task_state_stack_sz = 4;
4146    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4147         ++i) // zero init the stack
4148      this_thr->th.th_task_state_memo_stack[i] = 0;
4149  }
4150
4151  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4152  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4153
4154  KMP_MB();
4155}
4156
4157/* Allocate a new thread for the requesting team. This is only called from
4158   within a forkjoin critical section. We will first try to get an available
4159   thread from the thread pool; if none is available, we will fork a new one,
4160   assuming we are able to create one. This should be assured, as the caller
4161   should have checked for that first. */
4162kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4163                                  int new_tid) {
4164  kmp_team_t *serial_team;
4165  kmp_info_t *new_thr;
4166  int new_gtid;
4167
4168  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4169  KMP_DEBUG_ASSERT(root && team);
4170#if !KMP_NESTED_HOT_TEAMS
4171  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4172#endif
4173  KMP_MB();
4174
4175  /* first, try to get one from the thread pool */
4176  if (__kmp_thread_pool) {
4177    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4178    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4179    if (new_thr == __kmp_thread_pool_insert_pt) {
4180      __kmp_thread_pool_insert_pt = NULL;
4181    }
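    // The cached insertion point referred to the thread just removed from the
    // pool head, so it is no longer valid.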
4182    TCW_4(new_thr->th.th_in_pool, FALSE);
4183    __kmp_suspend_initialize_thread(new_thr);
4184    __kmp_lock_suspend_mx(new_thr);
4185    if (new_thr->th.th_active_in_pool == TRUE) {
4186      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4187      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4188      new_thr->th.th_active_in_pool = FALSE;
4189    }
4190    __kmp_unlock_suspend_mx(new_thr);
4191
4192    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4193                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4194    KMP_ASSERT(!new_thr->th.th_team);
4195    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4196
4197    /* setup the thread structure */
4198    __kmp_initialize_info(new_thr, team, new_tid,
4199                          new_thr->th.th_info.ds.ds_gtid);
4200    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4201
4202    TCW_4(__kmp_nth, __kmp_nth + 1);
4203
4204    new_thr->th.th_task_state = 0;
4205    new_thr->th.th_task_state_top = 0;
4206    new_thr->th.th_task_state_stack_sz = 4;
4207
4208#ifdef KMP_ADJUST_BLOCKTIME
4209    /* Adjust blocktime back to zero if necessary */
4210    /* Middle initialization might not have occurred yet */
4211    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4212      if (__kmp_nth > __kmp_avail_proc) {
4213        __kmp_zero_bt = TRUE;
4214      }
4215    }
4216#endif /* KMP_ADJUST_BLOCKTIME */
4217
4218#if KMP_DEBUG
4219    // If the thread entered the pool via __kmp_free_thread, its wait_flag
4220    // should not be KMP_BARRIER_PARENT_FLAG.
4221    int b;
4222    kmp_balign_t *balign = new_thr->th.th_bar;
4223    for (b = 0; b < bs_last_barrier; ++b)
4224      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4225#endif
4226
4227    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4228                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4229
4230    KMP_MB();
4231    return new_thr;
4232  }
4233
4234  /* none available, so we'll fork a new one */
4235  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4236  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4237
4238#if KMP_USE_MONITOR
4239  // If this is the first worker thread the RTL is creating, then also
4240  // launch the monitor thread.  We try to do this as early as possible.
4241  if (!TCR_4(__kmp_init_monitor)) {
4242    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4243    if (!TCR_4(__kmp_init_monitor)) {
4244      KF_TRACE(10, ("before __kmp_create_monitor\n"));
4245      TCW_4(__kmp_init_monitor, 1);
4246      __kmp_create_monitor(&__kmp_monitor);
4247      KF_TRACE(10, ("after __kmp_create_monitor\n"));
4248#if KMP_OS_WINDOWS
4249      // AC: wait until the monitor has started. This is a fix for CQ232808.
4250      // The reason: if the library is loaded/unloaded in a loop with small
4251      // (parallel) work in between, there is a high probability that the
4252      // monitor thread only starts after the library has shut down. At
4253      // shutdown it is too late to cope with the problem, because while the
4254      // master is in DllMain (process detach) the monitor has no chance to
4255      // start (it is blocked), and the master has no means to inform the
4256      // monitor that the library is gone, because all the memory the monitor
4257      // could access is about to be released/reset.
4258      while (TCR_4(__kmp_init_monitor) < 2) {
4259        KMP_YIELD(TRUE);
4260      }
4261      KF_TRACE(10, ("after monitor thread has started\n"));
4262#endif
4263    }
4264    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4265  }
4266#endif
4267
4268  KMP_MB();
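  /* Scan for the lowest unused gtid slot; the scan starts at 1 because gtid 0
     belongs to the initial thread, and the capacity check above guarantees a
     free entry exists. */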
4269  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4270    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4271  }
4272
4273  /* allocate space for it. */
4274  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4275
4276  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4277
4278  if (__kmp_storage_map) {
4279    __kmp_print_thread_storage_map(new_thr, new_gtid);
4280  }
4281
4282  // add the reserve serialized team, initialized from the team's master thread
4283  {
4284    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4285    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4286    new_thr->th.th_serial_team = serial_team =
4287        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4288#if OMPT_SUPPORT
4289                                          ompt_data_none, // root parallel id
4290#endif
4291                                          proc_bind_default, &r_icvs,
4292                                          0 USE_NESTED_HOT_ARG(NULL));
4293  }
4294  KMP_ASSERT(serial_team);
4295  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4296  // for execution (it is unused for now).
4297  serial_team->t.t_threads[0] = new_thr;
4298  KF_TRACE(10,
4299           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4300            new_thr));
4301
4302  /* setup the thread structures */
4303  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4304
4305#if USE_FAST_MEMORY
4306  __kmp_initialize_fast_memory(new_thr);
4307#endif /* USE_FAST_MEMORY */
4308
4309#if KMP_USE_BGET
4310  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4311  __kmp_initialize_bget(new_thr);
4312#endif
4313
4314  __kmp_init_random(new_thr); // Initialize random number generator
4315
4316  /* Initialize these only once when thread is grabbed for a team allocation */
4317  KA_TRACE(20,
4318           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4319            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4320
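  // Initialize barrier data: b_go starts at the KMP_INIT_BARRIER_STATE sentinel
  // so the new worker sleeps in its fork barrier until its team releases it.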
4321  int b;
4322  kmp_balign_t *balign = new_thr->th.th_bar;
4323  for (b = 0; b < bs_last_barrier; ++b) {
4324    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4325    balign[b].bb.team = NULL;
4326    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4327    balign[b].bb.use_oncore_barrier = 0;
4328  }
4329
4330  new_thr->th.th_spin_here = FALSE;
4331  new_thr->th.th_next_waiting = 0;
4332#if KMP_OS_UNIX
4333  new_thr->th.th_blocking = false;
4334#endif
4335
4336#if KMP_AFFINITY_SUPPORTED
4337  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4338  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4339  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4340  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4341#endif
4342  new_thr->th.th_def_allocator = __kmp_def_allocator;
4343  new_thr->th.th_prev_level = 0;
4344  new_thr->th.th_prev_num_threads = 1;
4345
4346  TCW_4(new_thr->th.th_in_pool, FALSE);
4347  new_thr->th.th_active_in_pool = FALSE;
4348  TCW_4(new_thr->th.th_active, TRUE);
4349
4350  /* adjust the global counters */
4351  __kmp_all_nth++;
4352  __kmp_nth++;
4353
4354  // If __kmp_adjust_gtid_mode is set, use method #1 (stack address search)
4355  // below __kmp_tls_gtid_min total threads, and method #2 (keyed TLS) above.
4356  if (__kmp_adjust_gtid_mode) {
4357    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4358      if (TCR_4(__kmp_gtid_mode) != 2) {
4359        TCW_4(__kmp_gtid_mode, 2);
4360      }
4361    } else {
4362      if (TCR_4(__kmp_gtid_mode) != 1) {
4363        TCW_4(__kmp_gtid_mode, 1);
4364      }
4365    }
4366  }
4367
4368#ifdef KMP_ADJUST_BLOCKTIME
4369  /* Adjust blocktime back to zero if necessary       */
4370  /* Middle initialization might not have occurred yet */
4371  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4372    if (__kmp_nth > __kmp_avail_proc) {
4373      __kmp_zero_bt = TRUE;
4374    }
4375  }
4376#endif /* KMP_ADJUST_BLOCKTIME */
4377
4378  /* actually fork it and create the new worker thread */
4379  KF_TRACE(
4380      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4381  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4382  KF_TRACE(10,
4383           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4384
4385  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4386                new_gtid));
4387  KMP_MB();
4388  return new_thr;
4389}
4390
4391/* Reinitialize team for reuse.
4392   The hot team code calls this routine at every fork barrier, so the EPCC
4393   barrier tests are extremely sensitive to changes in it, esp. writes to the
4394   team struct, which cause a cache invalidation in all threads.
4395   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4396static void __kmp_reinitialize_team(kmp_team_t *team,
4397                                    kmp_internal_control_t *new_icvs,
4398                                    ident_t *loc) {
4399  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4400                team->t.t_threads[0], team));
4401  KMP_DEBUG_ASSERT(team && new_icvs);
4402  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4403  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4404
4405  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4406  // Copy ICVs to the master thread's implicit taskdata
4407  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4408  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4409
4410  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4411                team->t.t_threads[0], team));
4412}
4413
4414/* Initialize the team data structure.
4415   This assumes the t_threads and t_max_nproc are already set.
4416   Also, we don't touch the arguments */
4417static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4418                                  kmp_internal_control_t *new_icvs,
4419                                  ident_t *loc) {
4420  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4421
4422  /* verify */
4423  KMP_DEBUG_ASSERT(team);
4424  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4425  KMP_DEBUG_ASSERT(team->t.t_threads);
4426  KMP_MB();
4427
4428  team->t.t_master_tid = 0; /* not needed */
4429  /* team->t.t_master_bar;        not needed */
4430  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4431  team->t.t_nproc = new_nproc;
4432
4433  /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4434  team->t.t_next_pool = NULL;
4435  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4436   * up hot team */
4437
4438  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4439  team->t.t_invoke = NULL; /* not needed */
4440
4441  // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4442  team->t.t_sched.sched = new_icvs->sched.sched;
4443
4444#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4445  team->t.t_fp_control_saved = FALSE; /* not needed */
4446  team->t.t_x87_fpu_control_word = 0; /* not needed */
4447  team->t.t_mxcsr = 0; /* not needed */
4448#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4449
4450  team->t.t_construct = 0;
4451
4452  team->t.t_ordered.dt.t_value = 0;
4453  team->t.t_master_active = FALSE;
4454
4455#ifdef KMP_DEBUG
4456  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4457#endif
4458#if KMP_OS_WINDOWS
4459  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4460#endif
4461
4462  team->t.t_control_stack_top = NULL;
4463
4464  __kmp_reinitialize_team(team, new_icvs, loc);
4465
4466  KMP_MB();
4467  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4468}
4469
4470#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4471/* Sets full mask for thread and returns old mask, no changes to structures. */
4472static void
4473__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4474  if (KMP_AFFINITY_CAPABLE()) {
4475    int status;
4476    if (old_mask != NULL) {
4477      status = __kmp_get_system_affinity(old_mask, TRUE);
4478      int error = errno;
4479      if (status != 0) {
4480        __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4481                    __kmp_msg_null);
4482      }
4483    }
4484    __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4485  }
4486}
4487#endif
4488
4489#if KMP_AFFINITY_SUPPORTED
4490
4491// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4492// It calculates the worker + master thread's partition based upon the parent
4493// thread's partition, and binds each worker to a place in its partition.
4494// The master thread's partition should already include its current binding.
4495static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4496  // Copy the master thread's place partition to the team struct
4497  kmp_info_t *master_th = team->t.t_threads[0];
4498  KMP_DEBUG_ASSERT(master_th != NULL);
4499  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4500  int first_place = master_th->th.th_first_place;
4501  int last_place = master_th->th.th_last_place;
4502  int masters_place = master_th->th.th_current_place;
4503  team->t.t_first_place = first_place;
4504  team->t.t_last_place = last_place;
4505
4506  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4507                "bound to place %d partition = [%d,%d]\n",
4508                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4509                team->t.t_id, masters_place, first_place, last_place));
4510
4511  switch (proc_bind) {
4512
4513  case proc_bind_default:
4514    // serial teams might have the proc_bind policy set to proc_bind_default; it
4515    // doesn't matter, as we don't rebind the master thread for any proc_bind policy
4516    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4517    break;
4518
4519  case proc_bind_master: {
4520    int f;
4521    int n_th = team->t.t_nproc;
4522    for (f = 1; f < n_th; f++) {
4523      kmp_info_t *th = team->t.t_threads[f];
4524      KMP_DEBUG_ASSERT(th != NULL);
4525      th->th.th_first_place = first_place;
4526      th->th.th_last_place = last_place;
4527      th->th.th_new_place = masters_place;
4528      if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4529          team->t.t_display_affinity != 1) {
4530        team->t.t_display_affinity = 1;
4531      }
4532
4533      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4534                     "partition = [%d,%d]\n",
4535                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4536                     f, masters_place, first_place, last_place));
4537    }
4538  } break;
4539
4540  case proc_bind_close: {
4541    int f;
4542    int n_th = team->t.t_nproc;
4543    int n_places;
4544    if (first_place <= last_place) {
4545      n_places = last_place - first_place + 1;
4546    } else {
4547      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4548    }
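    // The place partition may wrap around the end of the place list, which is
    // why every "advance to the next place" step below is done modulo
    // __kmp_affinity_num_masks.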
4549    if (n_th <= n_places) {
4550      int place = masters_place;
4551      for (f = 1; f < n_th; f++) {
4552        kmp_info_t *th = team->t.t_threads[f];
4553        KMP_DEBUG_ASSERT(th != NULL);
4554
4555        if (place == last_place) {
4556          place = first_place;
4557        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4558          place = 0;
4559        } else {
4560          place++;
4561        }
4562        th->th.th_first_place = first_place;
4563        th->th.th_last_place = last_place;
4564        th->th.th_new_place = place;
4565        if (__kmp_display_affinity && place != th->th.th_current_place &&
4566            team->t.t_display_affinity != 1) {
4567          team->t.t_display_affinity = 1;
4568        }
4569
4570        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4571                       "partition = [%d,%d]\n",
4572                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4573                       team->t.t_id, f, place, first_place, last_place));
4574      }
4575    } else {
4576      int S, rem, gap, s_count;
4577      S = n_th / n_places;
4578      s_count = 0;
4579      rem = n_th - (S * n_places);
4580      gap = rem > 0 ? n_places / rem : n_places;
4581      int place = masters_place;
4582      int gap_ct = gap;
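      // More threads than places: each place gets S threads, and rem places get
      // one extra, spaced every gap places. For illustration (not from the
      // source), n_th=10 over n_places=4 gives S=2, rem=2, gap=2, i.e. 3,2,3,2
      // threads per place starting at the master's place.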
4583      for (f = 0; f < n_th; f++) {
4584        kmp_info_t *th = team->t.t_threads[f];
4585        KMP_DEBUG_ASSERT(th != NULL);
4586
4587        th->th.th_first_place = first_place;
4588        th->th.th_last_place = last_place;
4589        th->th.th_new_place = place;
4590        if (__kmp_display_affinity && place != th->th.th_current_place &&
4591            team->t.t_display_affinity != 1) {
4592          team->t.t_display_affinity = 1;
4593        }
4594        s_count++;
4595
4596        if ((s_count == S) && rem && (gap_ct == gap)) {
4597          // do nothing, add an extra thread to place on next iteration
4598        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4599          // we added an extra thread to this place; move to next place
4600          if (place == last_place) {
4601            place = first_place;
4602          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4603            place = 0;
4604          } else {
4605            place++;
4606          }
4607          s_count = 0;
4608          gap_ct = 1;
4609          rem--;
4610        } else if (s_count == S) { // place full; don't add extra
4611          if (place == last_place) {
4612            place = first_place;
4613          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4614            place = 0;
4615          } else {
4616            place++;
4617          }
4618          gap_ct++;
4619          s_count = 0;
4620        }
4621
4622        KA_TRACE(100,
4623                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4624                  "partition = [%d,%d]\n",
4625                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4626                  th->th.th_new_place, first_place, last_place));
4627      }
4628      KMP_DEBUG_ASSERT(place == masters_place);
4629    }
4630  } break;
4631
4632  case proc_bind_spread: {
4633    int f;
4634    int n_th = team->t.t_nproc;
4635    int n_places;
4636    int thidx;
4637    if (first_place <= last_place) {
4638      n_places = last_place - first_place + 1;
4639    } else {
4640      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4641    }
4642    if (n_th <= n_places) {
4643      int place = -1;
4644
4645      if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4646        int S = n_places / n_th;
4647        int s_count, rem, gap, gap_ct;
4648
4649        place = masters_place;
4650        rem = n_places - n_th * S;
4651        gap = rem ? n_th / rem : 1;
4652        gap_ct = gap;
4653        thidx = n_th;
4654        if (update_master_only == 1)
4655          thidx = 1;
4656        for (f = 0; f < thidx; f++) {
4657          kmp_info_t *th = team->t.t_threads[f];
4658          KMP_DEBUG_ASSERT(th != NULL);
4659
4660          th->th.th_first_place = place;
4661          th->th.th_new_place = place;
4662          if (__kmp_display_affinity && place != th->th.th_current_place &&
4663              team->t.t_display_affinity != 1) {
4664            team->t.t_display_affinity = 1;
4665          }
4666          s_count = 1;
4667          while (s_count < S) {
4668            if (place == last_place) {
4669              place = first_place;
4670            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4671              place = 0;
4672            } else {
4673              place++;
4674            }
4675            s_count++;
4676          }
4677          if (rem && (gap_ct == gap)) {
4678            if (place == last_place) {
4679              place = first_place;
4680            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4681              place = 0;
4682            } else {
4683              place++;
4684            }
4685            rem--;
4686            gap_ct = 0;
4687          }
4688          th->th.th_last_place = place;
4689          gap_ct++;
4690
4691          if (place == last_place) {
4692            place = first_place;
4693          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4694            place = 0;
4695          } else {
4696            place++;
4697          }
4698
4699          KA_TRACE(100,
4700                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4701                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4702                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4703                    f, th->th.th_new_place, th->th.th_first_place,
4704                    th->th.th_last_place, __kmp_affinity_num_masks));
4705        }
4706      } else {
4707        /* Having a uniform space of available computation places, we can
4708           create T partitions of roughly P/T places each and put a thread
4709           into the first place of each partition. */
4710        double current = static_cast<double>(masters_place);
4711        double spacing =
4712            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
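        // Each thread gets a sub-partition of about (n_places + 1) / n_th
        // places and is bound to the first place of its sub-partition. As an
        // illustration (not from the source): masters_place=0, n_places=8,
        // n_th=3 gives spacing=3.0 and partitions [0,2], [3,5], [6,7].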
4713        int first, last;
4714        kmp_info_t *th;
4715
4716        thidx = n_th + 1;
4717        if (update_master_only == 1)
4718          thidx = 1;
4719        for (f = 0; f < thidx; f++) {
4720          first = static_cast<int>(current);
4721          last = static_cast<int>(current + spacing) - 1;
4722          KMP_DEBUG_ASSERT(last >= first);
4723          if (first >= n_places) {
4724            if (masters_place) {
4725              first -= n_places;
4726              last -= n_places;
4727              if (first == (masters_place + 1)) {
4728                KMP_DEBUG_ASSERT(f == n_th);
4729                first--;
4730              }
4731              if (last == masters_place) {
4732                KMP_DEBUG_ASSERT(f == (n_th - 1));
4733                last--;
4734              }
4735            } else {
4736              KMP_DEBUG_ASSERT(f == n_th);
4737              first = 0;
4738              last = 0;
4739            }
4740          }
4741          if (last >= n_places) {
4742            last = (n_places - 1);
4743          }
4744          place = first;
4745          current += spacing;
4746          if (f < n_th) {
4747            KMP_DEBUG_ASSERT(0 <= first);
4748            KMP_DEBUG_ASSERT(n_places > first);
4749            KMP_DEBUG_ASSERT(0 <= last);
4750            KMP_DEBUG_ASSERT(n_places > last);
4751            KMP_DEBUG_ASSERT(last_place >= first_place);
4752            th = team->t.t_threads[f];
4753            KMP_DEBUG_ASSERT(th);
4754            th->th.th_first_place = first;
4755            th->th.th_new_place = place;
4756            th->th.th_last_place = last;
4757            if (__kmp_display_affinity && place != th->th.th_current_place &&
4758                team->t.t_display_affinity != 1) {
4759              team->t.t_display_affinity = 1;
4760            }
4761            KA_TRACE(100,
4762                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4763                      "partition = [%d,%d], spacing = %.4f\n",
4764                      __kmp_gtid_from_thread(team->t.t_threads[f]),
4765                      team->t.t_id, f, th->th.th_new_place,
4766                      th->th.th_first_place, th->th.th_last_place, spacing));
4767          }
4768        }
4769      }
4770      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4771    } else {
4772      int S, rem, gap, s_count;
4773      S = n_th / n_places;
4774      s_count = 0;
4775      rem = n_th - (S * n_places);
4776      gap = rem > 0 ? n_places / rem : n_places;
4777      int place = masters_place;
4778      int gap_ct = gap;
4779      thidx = n_th;
4780      if (update_master_only == 1)
4781        thidx = 1;
4782      for (f = 0; f < thidx; f++) {
4783        kmp_info_t *th = team->t.t_threads[f];
4784        KMP_DEBUG_ASSERT(th != NULL);
4785
4786        th->th.th_first_place = place;
4787        th->th.th_last_place = place;
4788        th->th.th_new_place = place;
4789        if (__kmp_display_affinity && place != th->th.th_current_place &&
4790            team->t.t_display_affinity != 1) {
4791          team->t.t_display_affinity = 1;
4792        }
4793        s_count++;
4794
4795        if ((s_count == S) && rem && (gap_ct == gap)) {
4796          // do nothing, add an extra thread to place on next iteration
4797        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4798          // we added an extra thread to this place; move on to next place
4799          if (place == last_place) {
4800            place = first_place;
4801          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4802            place = 0;
4803          } else {
4804            place++;
4805          }
4806          s_count = 0;
4807          gap_ct = 1;
4808          rem--;
4809        } else if (s_count == S) { // place is full; don't add extra thread
4810          if (place == last_place) {
4811            place = first_place;
4812          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4813            place = 0;
4814          } else {
4815            place++;
4816          }
4817          gap_ct++;
4818          s_count = 0;
4819        }
4820
4821        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4822                       "partition = [%d,%d]\n",
4823                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4824                       team->t.t_id, f, th->th.th_new_place,
4825                       th->th.th_first_place, th->th.th_last_place));
4826      }
4827      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4828    }
4829  } break;
4830
4831  default:
4832    break;
4833  }
4834
4835  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4836}
4837
4838#endif // KMP_AFFINITY_SUPPORTED
4839
4840/* allocate a new team data structure to use.  take one off of the free pool if
4841   available */
4842kmp_team_t *
4843__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4844#if OMPT_SUPPORT
4845                    ompt_data_t ompt_parallel_data,
4846#endif
4847                    kmp_proc_bind_t new_proc_bind,
4848                    kmp_internal_control_t *new_icvs,
4849                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4850  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4851  int f;
4852  kmp_team_t *team;
4853  int use_hot_team = !root->r.r_active;
4854  int level = 0;
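  // The "hot" team is the team kept alive at the root between parallel regions
  // so that its threads and data structures can be reused without reallocation.
  // It is used here only when the root is not already active in a parallel
  // region (i.e., this is not a nested fork).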
4855
4856  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4857  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4858  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4859  KMP_MB();
4860
4861#if KMP_NESTED_HOT_TEAMS
4862  kmp_hot_team_ptr_t *hot_teams;
4863  if (master) {
4864    team = master->th.th_team;
4865    level = team->t.t_active_level;
4866    if (master->th.th_teams_microtask) { // in teams construct?
4867      if (master->th.th_teams_size.nteams > 1 &&
4868          ( // #teams > 1
4869              team->t.t_pkfn ==
4870                  (microtask_t)__kmp_teams_master || // inner fork of the teams
4871              master->th.th_teams_level <
4872                  team->t.t_level)) { // or nested parallel inside the teams
4873        ++level; // don't increment if #teams==1 or for the outer fork of the
4874        // teams construct; increment otherwise
4875      }
4876    }
4877    hot_teams = master->th.th_hot_teams;
4878    if (level < __kmp_hot_teams_max_level && hot_teams &&
4879        hot_teams[level]
4880            .hot_team) { // hot team has already been allocated for given level
4881      use_hot_team = 1;
4882    } else {
4883      use_hot_team = 0;
4884    }
4885  }
4886#endif
4887  // Optimization to use a "hot" team
4888  if (use_hot_team && new_nproc > 1) {
4889    KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4890#if KMP_NESTED_HOT_TEAMS
4891    team = hot_teams[level].hot_team;
4892#else
4893    team = root->r.r_hot_team;
4894#endif
4895#if KMP_DEBUG
4896    if (__kmp_tasking_mode != tskm_immediate_exec) {
4897      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4898                    "task_team[1] = %p before reinit\n",
4899                    team->t.t_task_team[0], team->t.t_task_team[1]));
4900    }
4901#endif
4902
4903    // Has the number of threads changed?
4904    /* Let's assume the most common case is that the number of threads is
4905       unchanged, and put that case first. */
4906    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4907      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4908      // This case can mean that omp_set_num_threads() was called and the hot
4909      // team size was already reduced, so we check the special flag
4910      if (team->t.t_size_changed == -1) {
4911        team->t.t_size_changed = 1;
4912      } else {
4913        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4914      }
4915
4916      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4917      kmp_r_sched_t new_sched = new_icvs->sched;
4918      // set master's schedule as new run-time schedule
4919      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4920
4921      __kmp_reinitialize_team(team, new_icvs,
4922                              root->r.r_uber_thread->th.th_ident);
4923
4924      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4925                    team->t.t_threads[0], team));
4926      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4927
4928#if KMP_AFFINITY_SUPPORTED
4929      if ((team->t.t_size_changed == 0) &&
4930          (team->t.t_proc_bind == new_proc_bind)) {
4931        if (new_proc_bind == proc_bind_spread) {
4932          __kmp_partition_places(
4933              team, 1); // add flag to update only master for spread
4934        }
4935        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4936                       "proc_bind = %d, partition = [%d,%d]\n",
4937                       team->t.t_id, new_proc_bind, team->t.t_first_place,
4938                       team->t.t_last_place));
4939      } else {
4940        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4941        __kmp_partition_places(team);
4942      }
4943#else
4944      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4945#endif /* KMP_AFFINITY_SUPPORTED */
4946    } else if (team->t.t_nproc > new_nproc) {
4947      KA_TRACE(20,
4948               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4949                new_nproc));
4950
4951      team->t.t_size_changed = 1;
4952#if KMP_NESTED_HOT_TEAMS
4953      if (__kmp_hot_teams_mode == 0) {
4954        // AC: saved number of threads should correspond to team's value in this
4955        // mode, can be bigger in mode 1, when hot team has threads in reserve
4956        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4957        hot_teams[level].hot_team_nth = new_nproc;
4958#endif // KMP_NESTED_HOT_TEAMS
4959        /* release the extra threads we don't need any more */
4960        for (f = new_nproc; f < team->t.t_nproc; f++) {
4961          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4962          if (__kmp_tasking_mode != tskm_immediate_exec) {
4963            // When decreasing team size, threads no longer in the team should
4964            // unref task team.
4965            team->t.t_threads[f]->th.th_task_team = NULL;
4966          }
4967          __kmp_free_thread(team->t.t_threads[f]);
4968          team->t.t_threads[f] = NULL;
4969        }
4970#if KMP_NESTED_HOT_TEAMS
4971      } // (__kmp_hot_teams_mode == 0)
4972      else {
4973        // When keeping extra threads in team, switch threads to wait on own
4974        // b_go flag
4975        for (f = new_nproc; f < team->t.t_nproc; ++f) {
4976          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4977          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4978          for (int b = 0; b < bs_last_barrier; ++b) {
4979            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4980              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4981            }
4982            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4983          }
4984        }
4985      }
4986#endif // KMP_NESTED_HOT_TEAMS
4987      team->t.t_nproc = new_nproc;
4988      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4989      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4990      __kmp_reinitialize_team(team, new_icvs,
4991                              root->r.r_uber_thread->th.th_ident);
4992
4993      // Update remaining threads
4994      for (f = 0; f < new_nproc; ++f) {
4995        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4996      }
4997
4998      // restore the current task state of the master thread: should be the
4999      // implicit task
5000      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5001                    team->t.t_threads[0], team));
5002
5003      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5004
5005#ifdef KMP_DEBUG
5006      for (f = 0; f < team->t.t_nproc; f++) {
5007        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5008                         team->t.t_threads[f]->th.th_team_nproc ==
5009                             team->t.t_nproc);
5010      }
5011#endif
5012
5013      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5014#if KMP_AFFINITY_SUPPORTED
5015      __kmp_partition_places(team);
5016#endif
5017    } else { // team->t.t_nproc < new_nproc
5018#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5019      kmp_affin_mask_t *old_mask;
5020      if (KMP_AFFINITY_CAPABLE()) {
5021        KMP_CPU_ALLOC(old_mask);
5022      }
5023#endif
5024
5025      KA_TRACE(20,
5026               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5027                new_nproc));
5028
5029      team->t.t_size_changed = 1;
5030
5031#if KMP_NESTED_HOT_TEAMS
5032      int avail_threads = hot_teams[level].hot_team_nth;
5033      if (new_nproc < avail_threads)
5034        avail_threads = new_nproc;
5035      kmp_info_t **other_threads = team->t.t_threads;
5036      for (f = team->t.t_nproc; f < avail_threads; ++f) {
5037        // Adjust barrier data of reserved threads (if any) of the team
5038        // Other data will be set in __kmp_initialize_info() below.
5039        int b;
5040        kmp_balign_t *balign = other_threads[f]->th.th_bar;
5041        for (b = 0; b < bs_last_barrier; ++b) {
5042          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5043          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5044#if USE_DEBUGGER
5045          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5046#endif
5047        }
5048      }
5049      if (hot_teams[level].hot_team_nth >= new_nproc) {
5050        // we have all needed threads in reserve, no need to allocate any
5051        // this is only possible in mode 1; mode 0 cannot have reserved threads
5052        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5053        team->t.t_nproc = new_nproc; // just get reserved threads involved
5054      } else {
5055        // we may have some threads in reserve, but not enough
5056        team->t.t_nproc =
5057            hot_teams[level]
5058                .hot_team_nth; // get reserved threads involved if any
5059        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5060#endif // KMP_NESTED_HOT_TEAMS
5061        if (team->t.t_max_nproc < new_nproc) {
5062          /* reallocate larger arrays */
5063          __kmp_reallocate_team_arrays(team, new_nproc);
5064          __kmp_reinitialize_team(team, new_icvs, NULL);
5065        }
5066
5067#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5068        /* Temporarily set the full mask for the master thread before creating
5069           the workers. The reason is that workers inherit their affinity from
5070           the master, so if many workers are created on a single core quickly,
5071           they don't get a chance to set their own affinity for a long time. */
5072        __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5073#endif
5074
5075        /* allocate new threads for the hot team */
5076        for (f = team->t.t_nproc; f < new_nproc; f++) {
5077          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5078          KMP_DEBUG_ASSERT(new_worker);
5079          team->t.t_threads[f] = new_worker;
5080
5081          KA_TRACE(20,
5082                   ("__kmp_allocate_team: team %d init T#%d arrived: "
5083                    "join=%llu, plain=%llu\n",
5084                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5085                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5086                    team->t.t_bar[bs_plain_barrier].b_arrived));
5087
5088          { // Initialize barrier data for new threads.
5089            int b;
5090            kmp_balign_t *balign = new_worker->th.th_bar;
5091            for (b = 0; b < bs_last_barrier; ++b) {
5092              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5093              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5094                               KMP_BARRIER_PARENT_FLAG);
5095#if USE_DEBUGGER
5096              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5097#endif
5098            }
5099          }
5100        }
5101
5102#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5103        if (KMP_AFFINITY_CAPABLE()) {
5104          /* Restore initial master thread's affinity mask */
5105          __kmp_set_system_affinity(old_mask, TRUE);
5106          KMP_CPU_FREE(old_mask);
5107        }
5108#endif
5109#if KMP_NESTED_HOT_TEAMS
5110      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5111#endif // KMP_NESTED_HOT_TEAMS
5112      /* make sure everyone is synchronized */
5113      int old_nproc = team->t.t_nproc; // save old value and use to update only
5114      // new threads below
5115      __kmp_initialize_team(team, new_nproc, new_icvs,
5116                            root->r.r_uber_thread->th.th_ident);
5117
5118      /* reinitialize the threads */
5119      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5120      for (f = 0; f < team->t.t_nproc; ++f)
5121        __kmp_initialize_info(team->t.t_threads[f], team, f,
5122                              __kmp_gtid_from_tid(f, team));
5123
5124      if (level) { // set th_task_state for new threads in nested hot team
5125        // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5126        // only need to set the th_task_state for the new threads. th_task_state
5127        // for master thread will not be accurate until after this in
5128        // __kmp_fork_call(), so we look to the master's memo_stack to get the
5129        // correct value.
5130        for (f = old_nproc; f < team->t.t_nproc; ++f)
5131          team->t.t_threads[f]->th.th_task_state =
5132              team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5133      } else { // set th_task_state for new threads in non-nested hot team
5134        int old_state =
5135            team->t.t_threads[0]->th.th_task_state; // copy master's state
5136        for (f = old_nproc; f < team->t.t_nproc; ++f)
5137          team->t.t_threads[f]->th.th_task_state = old_state;
5138      }
5139
5140#ifdef KMP_DEBUG
5141      for (f = 0; f < team->t.t_nproc; ++f) {
5142        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5143                         team->t.t_threads[f]->th.th_team_nproc ==
5144                             team->t.t_nproc);
5145      }
5146#endif
5147
5148      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5149#if KMP_AFFINITY_SUPPORTED
5150      __kmp_partition_places(team);
5151#endif
5152    } // Check changes in number of threads
5153
5154    kmp_info_t *master = team->t.t_threads[0];
5155    if (master->th.th_teams_microtask) {
5156      for (f = 1; f < new_nproc; ++f) {
5157        // propagate teams construct specific info to workers
5158        kmp_info_t *thr = team->t.t_threads[f];
5159        thr->th.th_teams_microtask = master->th.th_teams_microtask;
5160        thr->th.th_teams_level = master->th.th_teams_level;
5161        thr->th.th_teams_size = master->th.th_teams_size;
5162      }
5163    }
5164#if KMP_NESTED_HOT_TEAMS
5165    if (level) {
5166      // Sync barrier state for nested hot teams, not needed for outermost hot
5167      // team.
5168      for (f = 1; f < new_nproc; ++f) {
5169        kmp_info_t *thr = team->t.t_threads[f];
5170        int b;
5171        kmp_balign_t *balign = thr->th.th_bar;
5172        for (b = 0; b < bs_last_barrier; ++b) {
5173          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5174          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5175#if USE_DEBUGGER
5176          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5177#endif
5178        }
5179      }
5180    }
5181#endif // KMP_NESTED_HOT_TEAMS
5182
5183    /* reallocate space for arguments if necessary */
5184    __kmp_alloc_argv_entries(argc, team, TRUE);
5185    KMP_CHECK_UPDATE(team->t.t_argc, argc);
5186    // The hot team re-uses the previous task team,
5187    // if untouched during the previous release->gather phase.
5188
5189    KF_TRACE(10, (" hot_team = %p\n", team));
5190
5191#if KMP_DEBUG
5192    if (__kmp_tasking_mode != tskm_immediate_exec) {
5193      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5194                    "task_team[1] = %p after reinit\n",
5195                    team->t.t_task_team[0], team->t.t_task_team[1]));
5196    }
5197#endif
5198
5199#if OMPT_SUPPORT
5200    __ompt_team_assign_id(team, ompt_parallel_data);
5201#endif
5202
5203    KMP_MB();
5204
5205    return team;
5206  }
5207
5208  /* next, let's try to take one from the team pool */
5209  KMP_MB();
5210  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5211    /* TODO: consider resizing undersized teams instead of reaping them, now
5212       that we have a resizing mechanism */
5213    if (team->t.t_max_nproc >= max_nproc) {
5214      /* take this team from the team pool */
5215      __kmp_team_pool = team->t.t_next_pool;
5216
5217      /* setup the team for fresh use */
5218      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5219
5220      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5221                    "task_team[1] %p to NULL\n",
5222                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
5223      team->t.t_task_team[0] = NULL;
5224      team->t.t_task_team[1] = NULL;
5225
5226      /* reallocate space for arguments if necessary */
5227      __kmp_alloc_argv_entries(argc, team, TRUE);
5228      KMP_CHECK_UPDATE(team->t.t_argc, argc);
5229
5230      KA_TRACE(
5231          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5232               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5233      { // Initialize barrier data.
5234        int b;
5235        for (b = 0; b < bs_last_barrier; ++b) {
5236          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5237#if USE_DEBUGGER
5238          team->t.t_bar[b].b_master_arrived = 0;
5239          team->t.t_bar[b].b_team_arrived = 0;
5240#endif
5241        }
5242      }
5243
5244      team->t.t_proc_bind = new_proc_bind;
5245
5246      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5247                    team->t.t_id));
5248
5249#if OMPT_SUPPORT
5250      __ompt_team_assign_id(team, ompt_parallel_data);
5251#endif
5252
5253      KMP_MB();
5254
5255      return team;
5256    }
5257
5258    /* reap team if it is too small, then loop back and check the next one */
5259    // not sure if this is wise, but it will be redone during the hot-teams
5260    // rewrite.
5261    /* TODO: Use technique to find the right size hot-team, don't reap them */
5262    team = __kmp_reap_team(team);
5263    __kmp_team_pool = team;
5264  }
5265
5266  /* nothing available in the pool, no matter, make a new team! */
5267  KMP_MB();
5268  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5269
5270  /* and set it up */
5271  team->t.t_max_nproc = max_nproc;
5272  /* NOTE well, for some reason allocating one big buffer and dividing it up
5273     seems to really hurt performance a lot on the P4, so let's not use this */
5274  __kmp_allocate_team_arrays(team, max_nproc);
5275
5276  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5277  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5278
5279  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5280                "%p to NULL\n",
5281                &team->t.t_task_team[0], &team->t.t_task_team[1]));
5282  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5283  // memory, no need to duplicate
5284  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5285  // memory, no need to duplicate
5286
5287  if (__kmp_storage_map) {
5288    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5289  }
5290
5291  /* allocate space for arguments */
5292  __kmp_alloc_argv_entries(argc, team, FALSE);
5293  team->t.t_argc = argc;
5294
5295  KA_TRACE(20,
5296           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5297            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5298  { // Initialize barrier data.
5299    int b;
5300    for (b = 0; b < bs_last_barrier; ++b) {
5301      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5302#if USE_DEBUGGER
5303      team->t.t_bar[b].b_master_arrived = 0;
5304      team->t.t_bar[b].b_team_arrived = 0;
5305#endif
5306    }
5307  }
5308
5309  team->t.t_proc_bind = new_proc_bind;
5310
5311#if OMPT_SUPPORT
5312  __ompt_team_assign_id(team, ompt_parallel_data);
5313  team->t.ompt_serialized_team_info = NULL;
5314#endif
5315
5316  KMP_MB();
5317
5318  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5319                team->t.t_id));
5320
5321  return team;
5322}
5323
5324/* TODO implement hot-teams at all levels */
5325/* TODO implement lazy thread release on demand (disband request) */
5326
5327/* free the team.  return it to the team pool.  release all the threads
5328 * associated with it */
5329void __kmp_free_team(kmp_root_t *root,
5330                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5331  int f;
5332  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5333                team->t.t_id));
5334
5335  /* verify state */
5336  KMP_DEBUG_ASSERT(root);
5337  KMP_DEBUG_ASSERT(team);
5338  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5339  KMP_DEBUG_ASSERT(team->t.t_threads);
5340
5341  int use_hot_team = team == root->r.r_hot_team;
5342#if KMP_NESTED_HOT_TEAMS
5343  int level;
5344  kmp_hot_team_ptr_t *hot_teams;
5345  if (master) {
5346    level = team->t.t_active_level - 1;
5347    if (master->th.th_teams_microtask) { // in teams construct?
5348      if (master->th.th_teams_size.nteams > 1) {
5349        ++level; // level was not increased in teams construct for
5350        // team_of_masters
5351      }
5352      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5353          master->th.th_teams_level == team->t.t_level) {
5354        ++level; // level was not increased in teams construct for
5355        // team_of_workers before the parallel
5356      } // team->t.t_level will be increased inside parallel
5357    }
5358    hot_teams = master->th.th_hot_teams;
5359    if (level < __kmp_hot_teams_max_level) {
5360      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5361      use_hot_team = 1;
5362    }
5363  }
5364#endif // KMP_NESTED_HOT_TEAMS
5365
5366  /* team is done working */
5367  TCW_SYNC_PTR(team->t.t_pkfn,
5368               NULL); // Important for Debugging Support Library.
5369#if KMP_OS_WINDOWS
5370  team->t.t_copyin_counter = 0; // init counter for possible reuse
5371#endif
5372  // Do not reset pointer to parent team to NULL for hot teams.
5373
5374  /* if we are non-hot team, release our threads */
5375  if (!use_hot_team) {
5376    if (__kmp_tasking_mode != tskm_immediate_exec) {
5377      // Wait for threads to reach reapable state
5378      for (f = 1; f < team->t.t_nproc; ++f) {
5379        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5380        kmp_info_t *th = team->t.t_threads[f];
5381        volatile kmp_uint32 *state = &th->th.th_reap_state;
5382        while (*state != KMP_SAFE_TO_REAP) {
5383#if KMP_OS_WINDOWS
5384          // On Windows a thread can be killed at any time, check this
5385          DWORD ecode;
5386          if (!__kmp_is_thread_alive(th, &ecode)) {
5387            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5388            break;
5389          }
5390#endif
5391          // first check if thread is sleeping
5392          kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5393          if (fl.is_sleeping())
5394            fl.resume(__kmp_gtid_from_thread(th));
5395          KMP_CPU_PAUSE();
5396        }
5397      }
5398
5399      // Delete task teams
5400      int tt_idx;
5401      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5402        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5403        if (task_team != NULL) {
5404          for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5405            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5406            team->t.t_threads[f]->th.th_task_team = NULL;
5407          }
5408          KA_TRACE(
5409              20,
5410              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5411               __kmp_get_gtid(), task_team, team->t.t_id));
5412#if KMP_NESTED_HOT_TEAMS
5413          __kmp_free_task_team(master, task_team);
5414#endif
5415          team->t.t_task_team[tt_idx] = NULL;
5416        }
5417      }
5418    }
5419
5420    // Reset pointer to parent team only for non-hot teams.
5421    team->t.t_parent = NULL;
5422    team->t.t_level = 0;
5423    team->t.t_active_level = 0;
5424
5425    /* free the worker threads */
5426    for (f = 1; f < team->t.t_nproc; ++f) {
5427      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5428      __kmp_free_thread(team->t.t_threads[f]);
5429      team->t.t_threads[f] = NULL;
5430    }
5431
5432    /* put the team back in the team pool */
5433    /* TODO limit size of team pool, call reap_team if pool too large */
5434    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5435    __kmp_team_pool = (volatile kmp_team_t *)team;
5436  } else { // Check if team was created for the masters in a teams construct
5437    // See if first worker is a CG root
5438    KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5439                     team->t.t_threads[1]->th.th_cg_roots);
5440    if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5441      // Clean up the CG root nodes on workers so that this team can be re-used
5442      for (f = 1; f < team->t.t_nproc; ++f) {
5443        kmp_info_t *thr = team->t.t_threads[f];
5444        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5445                         thr->th.th_cg_roots->cg_root == thr);
5446        // Pop current CG root off list
5447        kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5448        thr->th.th_cg_roots = tmp->up;
5449        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5450                       " up to node %p. cg_nthreads was %d\n",
5451                       thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5452        int i = tmp->cg_nthreads--;
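        // Post-decrement: i holds the count before the decrement, so i == 1
        // means this worker was the last member of the contention group.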
5453        if (i == 1) {
5454          __kmp_free(tmp); // free CG if we are the last thread in it
5455        }
5456        // Restore current task's thread_limit from CG root
5457        if (thr->th.th_cg_roots)
5458          thr->th.th_current_task->td_icvs.thread_limit =
5459              thr->th.th_cg_roots->cg_thread_limit;
5460      }
5461    }
5462  }
5463
5464  KMP_MB();
5465}
5466
5467/* reap the team.  destroy it, reclaim all its resources and free its memory */
5468kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5469  kmp_team_t *next_pool = team->t.t_next_pool;
5470
5471  KMP_DEBUG_ASSERT(team);
5472  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5473  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5474  KMP_DEBUG_ASSERT(team->t.t_threads);
5475  KMP_DEBUG_ASSERT(team->t.t_argv);
5476
5477  /* TODO clean the threads that are a part of this? */
5478
5479  /* free stuff */
5480  __kmp_free_team_arrays(team);
5481  if (team->t.t_argv != &team->t.t_inline_argv[0])
5482    __kmp_free((void *)team->t.t_argv);
5483  __kmp_free(team);
5484
5485  KMP_MB();
5486  return next_pool;
5487}
5488
5489// Free the thread.  Don't reap it, just place it on the pool of available
5490// threads.
5491//
5492// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5493// binding for the affinity mechanism to be useful.
5494//
5495// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5496// However, we want to avoid a potential performance problem by always
5497// scanning through the list to find the correct point at which to insert
5498// the thread (potential N**2 behavior).  To do this we keep track of the
5499// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5500// With single-level parallelism, threads will always be added to the tail
5501// of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5502// parallelism, all bets are off and we may need to scan through the entire
5503// free list.
5504//
5505// This change also has a potentially large performance benefit, for some
5506// applications.  Previously, as threads were freed from the hot team, they
5507// would be placed back on the free list in inverse order.  If the hot team
5508// grew back to its original size, then the freed threads would be placed
5509// back on the hot team in reverse order.  This could cause bad cache
5510// locality problems on programs where the size of the hot team regularly
5511// grew and shrunk.
5512//
5513// Now, for single-level parallelism, the OMP tid is always == gtid.
5514void __kmp_free_thread(kmp_info_t *this_th) {
5515  int gtid;
5516  kmp_info_t **scan;
5517
5518  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5519                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5520
5521  KMP_DEBUG_ASSERT(this_th);
5522
5523  // When moving the thread to the pool, switch it to wait on its own b_go
5524  // flag, and leave its team pointer uninitialized (NULL).
5525  int b;
5526  kmp_balign_t *balign = this_th->th.th_bar;
5527  for (b = 0; b < bs_last_barrier; ++b) {
5528    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5529      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5530    balign[b].bb.team = NULL;
5531    balign[b].bb.leaf_kids = 0;
5532  }
5533  this_th->th.th_task_state = 0;
5534  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
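  // Mark the thread as safe to reap; __kmp_free_team waits for this state
  // before releasing a team's workers.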
5535
5536  /* put thread back on the free pool */
5537  TCW_PTR(this_th->th.th_team, NULL);
5538  TCW_PTR(this_th->th.th_root, NULL);
5539  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5540
5541  while (this_th->th.th_cg_roots) {
5542    this_th->th.th_cg_roots->cg_nthreads--;
5543    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5544                   " %p of thread  %p to %d\n",
5545                   this_th, this_th->th.th_cg_roots,
5546                   this_th->th.th_cg_roots->cg_root,
5547                   this_th->th.th_cg_roots->cg_nthreads));
5548    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5549    if (tmp->cg_root == this_th) { // Thread is a cg_root
5550      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5551      KA_TRACE(
5552          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5553      this_th->th.th_cg_roots = tmp->up;
5554      __kmp_free(tmp);
5555    } else { // Worker thread
5556      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5557        __kmp_free(tmp);
5558      }
5559      this_th->th.th_cg_roots = NULL;
5560      break;
5561    }
5562  }
5563
5564  /* The implicit task assigned to this thread may be shared with other
5565   * threads; in that case multiple threads could try to free the same task
5566   * data in __kmp_reap_thread at exit. This duplicate use of the task data is
5567   * more likely when the hot team is disabled, but it can occur even when the
5568   * hot team is enabled */
5569  __kmp_free_implicit_task(this_th);
5570  this_th->th.th_current_task = NULL;
5571
5572  // If the __kmp_thread_pool_insert_pt is already past the new insert
5573  // point, then we need to re-scan the entire list.
5574  gtid = this_th->th.th_info.ds.ds_gtid;
5575  if (__kmp_thread_pool_insert_pt != NULL) {
5576    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5577    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5578      __kmp_thread_pool_insert_pt = NULL;
5579    }
5580  }
5581
5582  // Scan down the list to find the place to insert the thread.
5583  // scan is the address of a link in the list, possibly the address of
5584  // __kmp_thread_pool itself.
5585  //
5586  // In the absence of nested parallelism, the for loop will have 0 iterations.
5587  if (__kmp_thread_pool_insert_pt != NULL) {
5588    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5589  } else {
5590    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5591  }
5592  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5593       scan = &((*scan)->th.th_next_pool))
5594    ;
5595
5596  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5597  // to its address.
5598  TCW_PTR(this_th->th.th_next_pool, *scan);
5599  __kmp_thread_pool_insert_pt = *scan = this_th;
5600  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5601                   (this_th->th.th_info.ds.ds_gtid <
5602                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5603  TCW_4(this_th->th.th_in_pool, TRUE);
5604  __kmp_suspend_initialize_thread(this_th);
5605  __kmp_lock_suspend_mx(this_th);
5606  if (this_th->th.th_active == TRUE) {
5607    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5608    this_th->th.th_active_in_pool = TRUE;
5609  }
5610#if KMP_DEBUG
5611  else {
5612    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5613  }
5614#endif
5615  __kmp_unlock_suspend_mx(this_th);
5616
5617  TCW_4(__kmp_nth, __kmp_nth - 1);
5618
5619#ifdef KMP_ADJUST_BLOCKTIME
5620  /* Adjust blocktime back to user setting or default if necessary */
5621  /* Middle initialization might never have occurred                */
5622  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5623    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5624    if (__kmp_nth <= __kmp_avail_proc) {
5625      __kmp_zero_bt = FALSE;
5626    }
5627  }
5628#endif /* KMP_ADJUST_BLOCKTIME */
5629
5630  KMP_MB();
5631}
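
// The insertion scheme described in the comment above __kmp_free_thread can be
// shown in isolation.  The sketch below is illustrative only (guarded by
// "#if 0", never compiled) and uses a simplified node type instead of
// kmp_info_t; the demo_* names are hypothetical.
#if 0
struct demo_node {
  int gtid;
  demo_node *next;
};
static demo_node *demo_pool = NULL; // sorted by gtid, ascending
static demo_node *demo_insert_pt = NULL; // last insertion point

static void demo_pool_insert(demo_node *n) {
  // If the cached insertion point is already past the new node's gtid,
  // fall back to scanning from the head of the list.
  if (demo_insert_pt != NULL && demo_insert_pt->gtid > n->gtid)
    demo_insert_pt = NULL;
  demo_node **scan =
      (demo_insert_pt != NULL) ? &(demo_insert_pt->next) : &demo_pool;
  // With single-level parallelism threads are always appended at the tail, so
  // this loop performs zero iterations; with nested parallelism it may scan.
  for (; *scan != NULL && (*scan)->gtid < n->gtid; scan = &((*scan)->next))
    ;
  n->next = *scan;
  demo_insert_pt = *scan = n;
}
#endif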
5632
5633/* ------------------------------------------------------------------------ */
5634
5635void *__kmp_launch_thread(kmp_info_t *this_thr) {
5636  int gtid = this_thr->th.th_info.ds.ds_gtid;
5637  /*    void                 *stack_data;*/
5638  kmp_team_t **volatile pteam;
5639
5640  KMP_MB();
5641  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5642
5643  if (__kmp_env_consistency_check) {
5644    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5645  }
5646
5647#if OMPT_SUPPORT
5648  ompt_data_t *thread_data;
5649  if (ompt_enabled.enabled) {
5650    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5651    *thread_data = ompt_data_none;
5652
5653    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5654    this_thr->th.ompt_thread_info.wait_id = 0;
5655    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5656    this_thr->th.ompt_thread_info.parallel_flags = 0;
5657    if (ompt_enabled.ompt_callback_thread_begin) {
5658      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5659          ompt_thread_worker, thread_data);
5660    }
5661    this_thr->th.ompt_thread_info.state = ompt_state_idle;
5662  }
5663#endif
5664
5665  /* This is the place where threads wait for work */
5666  while (!TCR_4(__kmp_global.g.g_done)) {
5667    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5668    KMP_MB();
5669
5670    /* wait for work to do */
5671    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5672
5673    /* No tid yet since not part of a team */
5674    __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5675
5676#if OMPT_SUPPORT
5677    if (ompt_enabled.enabled) {
5678      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5679    }
5680#endif
5681
5682    pteam = &this_thr->th.th_team;
5683
5684    /* have we been allocated? */
5685    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5686      /* we were just woken up, so run our new task */
5687      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5688        int rc;
5689        KA_TRACE(20,
5690                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5691                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5692                  (*pteam)->t.t_pkfn));
5693
5694        updateHWFPControl(*pteam);
5695
5696#if OMPT_SUPPORT
5697        if (ompt_enabled.enabled) {
5698          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5699        }
5700#endif
5701
5702        rc = (*pteam)->t.t_invoke(gtid);
5703        KMP_ASSERT(rc);
5704
5705        KMP_MB();
5706        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5707                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5708                      (*pteam)->t.t_pkfn));
5709      }
5710#if OMPT_SUPPORT
5711      if (ompt_enabled.enabled) {
5712        /* no frame set while outside task */
5713        __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5714
5715        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5716      }
5717#endif
5718      /* join barrier after parallel region */
5719      __kmp_join_barrier(gtid);
5720    }
5721  }
5722  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5723
5724#if OMPT_SUPPORT
5725  if (ompt_enabled.ompt_callback_thread_end) {
5726    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5727  }
5728#endif
5729
5730  this_thr->th.th_task_team = NULL;
5731  /* run the destructors for the threadprivate data for this thread */
5732  __kmp_common_destroy_gtid(gtid);
5733
5734  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5735  KMP_MB();
5736  return this_thr;
5737}
5738
5739/* ------------------------------------------------------------------------ */
5740
5741void __kmp_internal_end_dest(void *specific_gtid) {
5742#if KMP_COMPILER_ICC
5743#pragma warning(push)
5744#pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5745// significant bits
5746#endif
5747  // Make sure no significant bits are lost
5748  int gtid = (kmp_intptr_t)specific_gtid - 1;
5749#if KMP_COMPILER_ICC
5750#pragma warning(pop)
5751#endif
5752
5753  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5754  /* NOTE: the gtid is stored as gtid+1 in thread-local storage because 0 is
5755   * reserved for the nothing-stored case (see the sketch after this function) */
5756
5757  /* josh: One reason for setting the gtid specific data even when it is being
5758     destroyed by pthread is to allow gtid lookup through thread specific data
5759     (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5760     that gets executed in the call to __kmp_internal_end_thread, actually
5761     gets the gtid through the thread specific data.  Setting it here seems
5762     rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5763     to run smoothly.
5764     todo: get rid of this after we remove the dependence on
5765     __kmp_gtid_get_specific  */
5766  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5767    __kmp_gtid_set_specific(gtid);
5768#ifdef KMP_TDATA_GTID
5769  __kmp_gtid = gtid;
5770#endif
5771  __kmp_internal_end_thread(gtid);
5772}
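
// A minimal sketch of the gtid+1 encoding mentioned above (illustrative only,
// not compiled; real storage goes through __kmp_gtid_set_specific /
// __kmp_gtid_get_specific, and the demo_* names are hypothetical).
#if 0
// 0 means "nothing stored", so gtid g is stored as g + 1.
static void demo_store_gtid(void **slot, int gtid) {
  *slot = (void *)(kmp_intptr_t)(gtid + 1);
}
static int demo_load_gtid(void *stored) {
  // A NULL slot decodes to -1, i.e. no gtid has been assigned yet.
  return (int)(kmp_intptr_t)stored - 1;
}
#endif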
5773
5774#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5775
5776__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5777  __kmp_internal_end_atexit();
5778}
5779
5780#endif
5781
5782/* [Windows] josh: when the atexit handler is called, there may still be more
5783   than one thread alive */
5784void __kmp_internal_end_atexit(void) {
5785  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5786  /* [Windows]
5787     josh: ideally, we want to completely shutdown the library in this atexit
5788     handler, but stat code that depends on thread specific data for gtid fails
5789     because that data becomes unavailable at some point during the shutdown, so
5790     we call __kmp_internal_end_thread instead. We should eventually remove the
5791     dependency on __kmp_get_specific_gtid in the stat code and use
5792     __kmp_internal_end_library to cleanly shutdown the library.
5793
5794     // TODO: Can some of this comment about GVS be removed?
5795     I suspect that the offending stat code is executed when the calling thread
5796     tries to clean up a dead root thread's data structures, resulting in GVS
5797     code trying to close the GVS structures for that thread, but since the stat
5798     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5799     the calling thread is cleaning up itself instead of another thread, it gets
5800     confused. This happens because allowing a thread to unregister and cleanup
5801     another thread is a recent modification for addressing an issue.
5802     Based on the current design (20050722), a thread may end up
5803     trying to unregister another thread only if thread death does not trigger
5804     the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5805     thread specific data destructor function to detect thread death. For
5806     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5807     is nothing.  Thus, the workaround is applicable only for Windows static
5808     stat library. */
5809  __kmp_internal_end_library(-1);
5810#if KMP_OS_WINDOWS
5811  __kmp_close_console();
5812#endif
5813}
5814
5815static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5816  // It is assumed __kmp_forkjoin_lock is acquired.
5817
5818  int gtid;
5819
5820  KMP_DEBUG_ASSERT(thread != NULL);
5821
5822  gtid = thread->th.th_info.ds.ds_gtid;
5823
5824  if (!is_root) {
5825    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5826      /* Assume the threads are at the fork barrier here */
5827      KA_TRACE(
5828          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5829               gtid));
5830      /* Need release fence here to prevent seg faults for tree forkjoin barrier
5831       * (GEH) */
5832      ANNOTATE_HAPPENS_BEFORE(thread);
5833      kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5834      __kmp_release_64(&flag);
5835    }
5836
5837    // Terminate OS thread.
5838    __kmp_reap_worker(thread);
5839
5840    // The thread was killed asynchronously.  If it was actively
5841    // spinning in the thread pool, decrement the global count.
5842    //
5843    // There is a small timing hole here - if the worker thread was just waking
5844    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5845    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5846    // the global counter might not get updated.
5847    //
5848    // Currently, this can only happen as the library is unloaded,
5849    // so there are no harmful side effects.
5850    if (thread->th.th_active_in_pool) {
5851      thread->th.th_active_in_pool = FALSE;
5852      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5853      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5854    }
5855  }
5856
5857  __kmp_free_implicit_task(thread);
5858
5859// Free the fast memory for tasking
5860#if USE_FAST_MEMORY
5861  __kmp_free_fast_memory(thread);
5862#endif /* USE_FAST_MEMORY */
5863
5864  __kmp_suspend_uninitialize_thread(thread);
5865
5866  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5867  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5868
5869  --__kmp_all_nth;
5870// __kmp_nth was decremented when the thread was added to the pool.
5871
5872#ifdef KMP_ADJUST_BLOCKTIME
5873  /* Adjust blocktime back to user setting or default if necessary */
5874  /* Middle initialization might never have occurred                */
5875  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5876    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5877    if (__kmp_nth <= __kmp_avail_proc) {
5878      __kmp_zero_bt = FALSE;
5879    }
5880  }
5881#endif /* KMP_ADJUST_BLOCKTIME */
5882
5883  /* free the memory being used */
5884  if (__kmp_env_consistency_check) {
5885    if (thread->th.th_cons) {
5886      __kmp_free_cons_stack(thread->th.th_cons);
5887      thread->th.th_cons = NULL;
5888    }
5889  }
5890
5891  if (thread->th.th_pri_common != NULL) {
5892    __kmp_free(thread->th.th_pri_common);
5893    thread->th.th_pri_common = NULL;
5894  }
5895
5896  if (thread->th.th_task_state_memo_stack != NULL) {
5897    __kmp_free(thread->th.th_task_state_memo_stack);
5898    thread->th.th_task_state_memo_stack = NULL;
5899  }
5900
5901#if KMP_USE_BGET
5902  if (thread->th.th_local.bget_data != NULL) {
5903    __kmp_finalize_bget(thread);
5904  }
5905#endif
5906
5907#if KMP_AFFINITY_SUPPORTED
5908  if (thread->th.th_affin_mask != NULL) {
5909    KMP_CPU_FREE(thread->th.th_affin_mask);
5910    thread->th.th_affin_mask = NULL;
5911  }
5912#endif /* KMP_AFFINITY_SUPPORTED */
5913
5914#if KMP_USE_HIER_SCHED
5915  if (thread->th.th_hier_bar_data != NULL) {
5916    __kmp_free(thread->th.th_hier_bar_data);
5917    thread->th.th_hier_bar_data = NULL;
5918  }
5919#endif
5920
5921  __kmp_reap_team(thread->th.th_serial_team);
5922  thread->th.th_serial_team = NULL;
5923  __kmp_free(thread);
5924
5925  KMP_MB();
5926
5927} // __kmp_reap_thread
5928
5929static void __kmp_internal_end(void) {
5930  int i;
5931
5932  /* First, unregister the library */
5933  __kmp_unregister_library();
5934
5935#if KMP_OS_WINDOWS
5936  /* In Win static library, we can't tell when a root actually dies, so we
5937     reclaim the data structures for any root threads that have died but not
5938     unregistered themselves, in order to shut down cleanly.
5939     In Win dynamic library we also can't tell when a thread dies.  */
5940  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5941// dead roots
5942#endif
5943
5944  for (i = 0; i < __kmp_threads_capacity; i++)
5945    if (__kmp_root[i])
5946      if (__kmp_root[i]->r.r_active)
5947        break;
5948  KMP_MB(); /* Flush all pending memory write invalidates.  */
5949  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5950
5951  if (i < __kmp_threads_capacity) {
5952#if KMP_USE_MONITOR
5953    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5954    KMP_MB(); /* Flush all pending memory write invalidates.  */
5955
5956    // Need to check that monitor was initialized before reaping it. If we are
5957    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5958    // __kmp_monitor will appear to contain valid data, but it is only valid in
5959    // the parent process, not the child.
5960    // New behavior (201008): instead of keying off of the flag
5961    // __kmp_init_parallel, the monitor thread creation is keyed off
5962    // of the new flag __kmp_init_monitor.
5963    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5964    if (TCR_4(__kmp_init_monitor)) {
5965      __kmp_reap_monitor(&__kmp_monitor);
5966      TCW_4(__kmp_init_monitor, 0);
5967    }
5968    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5969    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5970#endif // KMP_USE_MONITOR
5971  } else {
5972/* TODO move this to cleanup code */
5973#ifdef KMP_DEBUG
5974    /* make sure that everything has properly ended */
5975    for (i = 0; i < __kmp_threads_capacity; i++) {
5976      if (__kmp_root[i]) {
5977        //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5978        //                    there can be uber threads alive here
5979        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5980      }
5981    }
5982#endif
5983
5984    KMP_MB();
5985
5986    // Reap the worker threads.
5987    // This is valid for now, but be careful if threads are reaped sooner.
5988    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
5989      // Get the next thread from the pool.
5990      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5991      __kmp_thread_pool = thread->th.th_next_pool;
5992      // Reap it.
5993      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5994      thread->th.th_next_pool = NULL;
5995      thread->th.th_in_pool = FALSE;
5996      __kmp_reap_thread(thread, 0);
5997    }
5998    __kmp_thread_pool_insert_pt = NULL;
5999
6000    // Reap teams.
6001    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6002      // Get the next team from the pool.
6003      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6004      __kmp_team_pool = team->t.t_next_pool;
6005      // Reap it.
6006      team->t.t_next_pool = NULL;
6007      __kmp_reap_team(team);
6008    }
6009
6010    __kmp_reap_task_teams();
6011
6012#if KMP_OS_UNIX
6013    // Threads that are not reaped should not access any resources since they
6014    // are going to be deallocated soon, so the shutdown sequence should wait
6015    // until all threads either exit the final spin-waiting loop or begin
6016    // sleeping after the given blocktime.
6017    for (i = 0; i < __kmp_threads_capacity; i++) {
6018      kmp_info_t *thr = __kmp_threads[i];
6019      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6020        KMP_CPU_PAUSE();
6021    }
6022#endif
6023
6024    for (i = 0; i < __kmp_threads_capacity; ++i) {
6025      // TBD: Add some checking...
6026      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6027    }
6028
6029    /* Make sure all threadprivate destructors get run by joining with all
6030       worker threads before resetting this flag */
6031    TCW_SYNC_4(__kmp_init_common, FALSE);
6032
6033    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6034    KMP_MB();
6035
6036#if KMP_USE_MONITOR
6037    // See note above: One of the possible fixes for CQ138434 / CQ140126
6038    //
6039    // FIXME: push both code fragments down and CSE them?
6040    // push them into __kmp_cleanup() ?
6041    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6042    if (TCR_4(__kmp_init_monitor)) {
6043      __kmp_reap_monitor(&__kmp_monitor);
6044      TCW_4(__kmp_init_monitor, 0);
6045    }
6046    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6047    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6048#endif
6049  } /* else !__kmp_global.t_active */
6050  TCW_4(__kmp_init_gtid, FALSE);
6051  KMP_MB(); /* Flush all pending memory write invalidates.  */
6052
6053  __kmp_cleanup();
6054#if OMPT_SUPPORT
6055  ompt_fini();
6056#endif
6057}
6058
6059void __kmp_internal_end_library(int gtid_req) {
6060  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6061  /* this shouldn't be a race condition because __kmp_internal_end() is the
6062     only place to clear __kmp_serial_init */
6063  /* we'll check this later too, after we get the lock */
6064  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6065  // redundant, because the next check will work in any case.
6066  if (__kmp_global.g.g_abort) {
6067    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6068    /* TODO abort? */
6069    return;
6070  }
6071  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6072    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6073    return;
6074  }
6075
6076  KMP_MB(); /* Flush all pending memory write invalidates.  */
6077
6078  /* find out who we are and what we should do */
6079  {
6080    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6081    KA_TRACE(
6082        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6083    if (gtid == KMP_GTID_SHUTDOWN) {
6084      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6085                    "already shutdown\n"));
6086      return;
6087    } else if (gtid == KMP_GTID_MONITOR) {
6088      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6089                    "registered, or system shutdown\n"));
6090      return;
6091    } else if (gtid == KMP_GTID_DNE) {
6092      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6093                    "shutdown\n"));
6094      /* we don't know who we are, but we may still shut down the library */
6095    } else if (KMP_UBER_GTID(gtid)) {
6096      /* unregister ourselves as an uber thread.  gtid is no longer valid */
6097      if (__kmp_root[gtid]->r.r_active) {
6098        __kmp_global.g.g_abort = -1;
6099        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6100        KA_TRACE(10,
6101                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6102                  gtid));
6103        return;
6104      } else {
6105        KA_TRACE(
6106            10,
6107            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6108        __kmp_unregister_root_current_thread(gtid);
6109      }
6110    } else {
6111/* worker threads may call this function through the atexit handler, if they
6112 * call exit() */
6113/* For now, skip the usual subsequent processing and just dump the debug buffer.
6114   TODO: do a thorough shutdown instead */
6115#ifdef DUMP_DEBUG_ON_EXIT
6116      if (__kmp_debug_buf)
6117        __kmp_dump_debug_buffer();
6118#endif
6119      return;
6120    }
6121  }
6122  /* synchronize the termination process */
6123  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6124
6125  /* have we already finished */
6126  if (__kmp_global.g.g_abort) {
6127    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6128    /* TODO abort? */
6129    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6130    return;
6131  }
6132  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6133    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6134    return;
6135  }
6136
6137  /* We need this lock to enforce mutex between this reading of
6138     __kmp_threads_capacity and the writing by __kmp_register_root.
6139     Alternatively, we can use a counter of roots that is atomically updated by
6140     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6141     __kmp_internal_end_*.  */
6142  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6143
6144  /* now we can safely conduct the actual termination */
6145  __kmp_internal_end();
6146
6147  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6148  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6149
6150  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6151
6152#ifdef DUMP_DEBUG_ON_EXIT
6153  if (__kmp_debug_buf)
6154    __kmp_dump_debug_buffer();
6155#endif
6156
6157#if KMP_OS_WINDOWS
6158  __kmp_close_console();
6159#endif
6160
6161  __kmp_fini_allocator();
6162
6163} // __kmp_internal_end_library
6164
6165void __kmp_internal_end_thread(int gtid_req) {
6166  int i;
6167
6168  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6169  /* this shouldn't be a race condition because __kmp_internal_end() is the
6170   * only place to clear __kmp_serial_init */
6171  /* we'll check this later too, after we get the lock */
6172  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6173  // redundant, because the next check will work in any case.
6174  if (__kmp_global.g.g_abort) {
6175    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6176    /* TODO abort? */
6177    return;
6178  }
6179  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6180    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6181    return;
6182  }
6183
6184  KMP_MB(); /* Flush all pending memory write invalidates.  */
6185
6186  /* find out who we are and what we should do */
6187  {
6188    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6189    KA_TRACE(10,
6190             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6191    if (gtid == KMP_GTID_SHUTDOWN) {
6192      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6193                    "already shutdown\n"));
6194      return;
6195    } else if (gtid == KMP_GTID_MONITOR) {
6196      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6197                    "registered, or system shutdown\n"));
6198      return;
6199    } else if (gtid == KMP_GTID_DNE) {
6200      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6201                    "shutdown\n"));
6202      return;
6203      /* we don't know who we are */
6204    } else if (KMP_UBER_GTID(gtid)) {
6205      /* unregister ourselves as an uber thread.  gtid is no longer valid */
6206      if (__kmp_root[gtid]->r.r_active) {
6207        __kmp_global.g.g_abort = -1;
6208        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6209        KA_TRACE(10,
6210                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6211                  gtid));
6212        return;
6213      } else {
6214        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6215                      gtid));
6216        __kmp_unregister_root_current_thread(gtid);
6217      }
6218    } else {
6219      /* just a worker thread, let's leave */
6220      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6221
6222      if (gtid >= 0) {
6223        __kmp_threads[gtid]->th.th_task_team = NULL;
6224      }
6225
6226      KA_TRACE(10,
6227               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6228                gtid));
6229      return;
6230    }
6231  }
6232#if KMP_DYNAMIC_LIB
6233  if (__kmp_pause_status != kmp_hard_paused)
6234  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6235  // because it is better to shut down later in the library destructor.
6236  {
6237    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6238    return;
6239  }
6240#endif
6241  /* synchronize the termination process */
6242  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6243
6244  /* have we already finished */
6245  if (__kmp_global.g.g_abort) {
6246    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6247    /* TODO abort? */
6248    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6249    return;
6250  }
6251  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6252    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6253    return;
6254  }
6255
6256  /* We need this lock to enforce mutex between this reading of
6257     __kmp_threads_capacity and the writing by __kmp_register_root.
6258     Alternatively, we can use a counter of roots that is atomically updated by
6259     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6260     __kmp_internal_end_*.  */
6261
6262  /* should we finish the run-time?  are all siblings done? */
6263  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6264
6265  for (i = 0; i < __kmp_threads_capacity; ++i) {
6266    if (KMP_UBER_GTID(i)) {
6267      KA_TRACE(
6268          10,
6269          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6270      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6271      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6272      return;
6273    }
6274  }
6275
6276  /* now we can safely conduct the actual termination */
6277
6278  __kmp_internal_end();
6279
6280  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6281  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6282
6283  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6284
6285#ifdef DUMP_DEBUG_ON_EXIT
6286  if (__kmp_debug_buf)
6287    __kmp_dump_debug_buffer();
6288#endif
6289} // __kmp_internal_end_thread
6290
6291// -----------------------------------------------------------------------------
6292// Library registration stuff.
6293
6294static long __kmp_registration_flag = 0;
6295// Random value used to indicate library initialization.
6296static char *__kmp_registration_str = NULL;
6297// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6298
6299static inline char *__kmp_reg_status_name() {
6300  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6301     each thread. If registration and unregistration go in different threads
6302     (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
6303     env var cannot be found, because the name will contain a different pid. */
6304  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6305} // __kmp_reg_status_name
6306
6307void __kmp_register_library_startup(void) {
6308
6309  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6310  int done = 0;
6311  union {
6312    double dtime;
6313    long ltime;
6314  } time;
6315#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6316  __kmp_initialize_system_tick();
6317#endif
6318  __kmp_read_system_time(&time.dtime);
6319  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6320  __kmp_registration_str =
6321      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6322                       __kmp_registration_flag, KMP_LIBRARY_FILE);
6323
6324  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6325                __kmp_registration_str));
6326
6327  while (!done) {
6328
6329    char *value = NULL; // Actual value of the environment variable.
6330
6331    // Set the environment variable, but do not overwrite an existing value.
6332    __kmp_env_set(name, __kmp_registration_str, 0);
6333    // Check that the variable was actually set.
6334    value = __kmp_env_get(name);
6335    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6336
6337      done = 1; // Ok, environment variable set successfully, exit the loop.
6338
6339    } else {
6340
6341      // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6342      // Check whether it is alive or dead (see the sketch after this function).
6343      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6344      char *tail = value;
6345      char *flag_addr_str = NULL;
6346      char *flag_val_str = NULL;
6347      char const *file_name = NULL;
6348      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6349      __kmp_str_split(tail, '-', &flag_val_str, &tail);
6350      file_name = tail;
6351      if (tail != NULL) {
6352        long *flag_addr = 0;
6353        long flag_val = 0;
6354        KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6355        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6356        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6357          // First, check whether environment-encoded address is mapped into
6358          // addr space.
6359          // If so, dereference it to see if it still has the right value.
6360          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6361            neighbor = 1;
6362          } else {
6363            // If not, then we know the other copy of the library is no longer
6364            // running.
6365            neighbor = 2;
6366          }
6367        }
6368      }
6369      switch (neighbor) {
6370      case 0: // Cannot parse environment variable -- neighbor status unknown.
6371        // Assume it is the incompatible format of a future version of the
6372        // library. Assume the other library is alive.
6373        // WARN( ... ); // TODO: Issue a warning.
6374        file_name = "unknown library";
6375        KMP_FALLTHROUGH();
6376      // Attention! Falling through to the next case. That's intentional.
6377      case 1: { // Neighbor is alive.
6378        // Check it is allowed.
6379        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6380        if (!__kmp_str_match_true(duplicate_ok)) {
6381          // That's not allowed. Issue fatal error.
6382          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6383                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6384        }
6385        KMP_INTERNAL_FREE(duplicate_ok);
6386        __kmp_duplicate_library_ok = 1;
6387        done = 1; // Exit the loop.
6388      } break;
6389      case 2: { // Neighbor is dead.
6390        // Clear the variable and try to register library again.
6391        __kmp_env_unset(name);
6392      } break;
6393      default: { KMP_DEBUG_ASSERT(0); } break;
6394      }
6395    }
6396    KMP_INTERNAL_FREE((void *)value);
6397  }
6398  KMP_INTERNAL_FREE((void *)name);
6399
6400} // func __kmp_register_library_startup
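
// The registration value set above has the form "<flag address>-<flag
// value>-<library file>", e.g. "0x7f12a4c01234-cafe1a2b-libomp.so" (example
// values only) under the pid-specific name "__KMP_REGISTERED_LIB_<pid>".  A
// second copy of the runtime that finds the variable already set parses it
// back and probes the encoded address, as sketched below (illustrative only,
// not compiled; the real logic lives in the loop above).
#if 0
static int demo_neighbor_is_alive(char *value) {
  char *tail = value;
  char *addr_str = NULL, *val_str = NULL;
  __kmp_str_split(tail, '-', &addr_str, &tail);
  __kmp_str_split(tail, '-', &val_str, &tail); // tail now holds the file name
  long *flag_addr = 0;
  long flag_val = 0;
  KMP_SSCANF(addr_str, "%p", RCAST(void **, &flag_addr));
  KMP_SSCANF(val_str, "%lx", &flag_val);
  // Alive only if the encoded address is still mapped in this process and
  // still holds the encoded value, i.e. the other instance is still loaded.
  return flag_addr != 0 && flag_val != 0 &&
         __kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val;
}
#endif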
6401
6402void __kmp_unregister_library(void) {
6403
6404  char *name = __kmp_reg_status_name();
6405  char *value = __kmp_env_get(name);
6406
6407  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6408  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6409  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6410    // Ok, this is our variable. Delete it.
6411    __kmp_env_unset(name);
6412  }
6413
6414  KMP_INTERNAL_FREE(__kmp_registration_str);
6415  KMP_INTERNAL_FREE(value);
6416  KMP_INTERNAL_FREE(name);
6417
6418  __kmp_registration_flag = 0;
6419  __kmp_registration_str = NULL;
6420
6421} // __kmp_unregister_library
6422
6423// End of Library registration stuff.
6424// -----------------------------------------------------------------------------
6425
6426#if KMP_MIC_SUPPORTED
6427
6428static void __kmp_check_mic_type() {
6429  kmp_cpuid_t cpuid_state = {0};
6430  kmp_cpuid_t *cs_p = &cpuid_state;
6431  __kmp_x86_cpuid(1, 0, cs_p);
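  // CPUID leaf 1 returns the processor signature in EAX: stepping in bits
  // 3:0, model in 7:4, family in 11:8 and extended model in 19:16.  The masks
  // below select family/model (0xff0) or extended model/family/model
  // (0xf0ff0) to tell the two supported MIC generations apart.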
6432  // We don't support mic1 at the moment
6433  if ((cs_p->eax & 0xff0) == 0xB10) {
6434    __kmp_mic_type = mic2;
6435  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6436    __kmp_mic_type = mic3;
6437  } else {
6438    __kmp_mic_type = non_mic;
6439  }
6440}
6441
6442#endif /* KMP_MIC_SUPPORTED */
6443
6444static void __kmp_do_serial_initialize(void) {
6445  int i, gtid;
6446  int size;
6447
6448  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6449
6450  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6451  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6452  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6453  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6454  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6455
6456#if OMPT_SUPPORT
6457  ompt_pre_init();
6458#endif
6459
6460  __kmp_validate_locks();
6461
6462  /* Initialize internal memory allocator */
6463  __kmp_init_allocator();
6464
6465  /* Register the library startup via an environment variable and check to see
6466     whether another copy of the library is already registered. */
6467
6468  __kmp_register_library_startup();
6469
6470  /* TODO reinitialization of library */
6471  if (TCR_4(__kmp_global.g.g_done)) {
6472    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6473  }
6474
6475  __kmp_global.g.g_abort = 0;
6476  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6477
6478/* initialize the locks */
6479#if KMP_USE_ADAPTIVE_LOCKS
6480#if KMP_DEBUG_ADAPTIVE_LOCKS
6481  __kmp_init_speculative_stats();
6482#endif
6483#endif
6484#if KMP_STATS_ENABLED
6485  __kmp_stats_init();
6486#endif
6487  __kmp_init_lock(&__kmp_global_lock);
6488  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6489  __kmp_init_lock(&__kmp_debug_lock);
6490  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6491  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6492  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6493  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6494  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6495  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6496  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6497  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6498  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6499  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6500  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6501  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6502  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6503  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6504  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6505#if KMP_USE_MONITOR
6506  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6507#endif
6508  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6509
6510  /* conduct initialization and initial setup of configuration */
6511
6512  __kmp_runtime_initialize();
6513
6514#if KMP_MIC_SUPPORTED
6515  __kmp_check_mic_type();
6516#endif
6517
6518// Some global variable initialization moved here from kmp_env_initialize()
6519#ifdef KMP_DEBUG
6520  kmp_diag = 0;
6521#endif
6522  __kmp_abort_delay = 0;
6523
6524  // From __kmp_init_dflt_team_nth()
6525  /* assume the entire machine will be used */
6526  __kmp_dflt_team_nth_ub = __kmp_xproc;
6527  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6528    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6529  }
6530  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6531    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6532  }
6533  __kmp_max_nth = __kmp_sys_max_nth;
6534  __kmp_cg_max_nth = __kmp_sys_max_nth;
6535  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6536  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6537    __kmp_teams_max_nth = __kmp_sys_max_nth;
6538  }
6539
6540  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6541  // part
6542  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6543#if KMP_USE_MONITOR
6544  __kmp_monitor_wakeups =
6545      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6546  __kmp_bt_intervals =
6547      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6548#endif
6549  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6550  __kmp_library = library_throughput;
6551  // From KMP_SCHEDULE initialization
6552  __kmp_static = kmp_sch_static_balanced;
6553// AC: do not use analytical here, because it is non-monotonous
6554//__kmp_guided = kmp_sch_guided_iterative_chunked;
6555//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6556// need to repeat assignment
6557// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6558// bit control and barrier method control parts
6559#if KMP_FAST_REDUCTION_BARRIER
6560#define kmp_reduction_barrier_gather_bb ((int)1)
6561#define kmp_reduction_barrier_release_bb ((int)1)
6562#define kmp_reduction_barrier_gather_pat bp_hyper_bar
6563#define kmp_reduction_barrier_release_pat bp_hyper_bar
6564#endif // KMP_FAST_REDUCTION_BARRIER
6565  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6566    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6567    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6568    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6569    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6570#if KMP_FAST_REDUCTION_BARRIER
6571    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6572      // lin_64 ): hyper,1
6573      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6574      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6575      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6576      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6577    }
6578#endif // KMP_FAST_REDUCTION_BARRIER
6579  }
6580#if KMP_FAST_REDUCTION_BARRIER
6581#undef kmp_reduction_barrier_release_pat
6582#undef kmp_reduction_barrier_gather_pat
6583#undef kmp_reduction_barrier_release_bb
6584#undef kmp_reduction_barrier_gather_bb
6585#endif // KMP_FAST_REDUCTION_BARRIER
6586#if KMP_MIC_SUPPORTED
6587  if (__kmp_mic_type == mic2) { // KNC
6588    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6589    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6590    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6591        1; // forkjoin release
6592    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6593    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6594  }
6595#if KMP_FAST_REDUCTION_BARRIER
6596  if (__kmp_mic_type == mic2) { // KNC
6597    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6598    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6599  }
6600#endif // KMP_FAST_REDUCTION_BARRIER
6601#endif // KMP_MIC_SUPPORTED
6602
6603// From KMP_CHECKS initialization
6604#ifdef KMP_DEBUG
6605  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6606#else
6607  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6608#endif
6609
6610  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6611  __kmp_foreign_tp = TRUE;
6612
6613  __kmp_global.g.g_dynamic = FALSE;
6614  __kmp_global.g.g_dynamic_mode = dynamic_default;
6615
6616  __kmp_env_initialize(NULL);
6617
6618// Print all messages in message catalog for testing purposes.
6619#ifdef KMP_DEBUG
6620  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6621  if (__kmp_str_match_true(val)) {
6622    kmp_str_buf_t buffer;
6623    __kmp_str_buf_init(&buffer);
6624    __kmp_i18n_dump_catalog(&buffer);
6625    __kmp_printf("%s", buffer.str);
6626    __kmp_str_buf_free(&buffer);
6627  }
6628  __kmp_env_free(&val);
6629#endif
6630
6631  __kmp_threads_capacity =
6632      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6633  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6634  __kmp_tp_capacity = __kmp_default_tp_capacity(
6635      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6636
6637  // If the library is shut down properly, both pools must be NULL. Just in
6638  // case, set them to NULL -- some memory may leak, but subsequent code will
6639  // work even if pools are not freed.
6640  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6641  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6642  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6643  __kmp_thread_pool = NULL;
6644  __kmp_thread_pool_insert_pt = NULL;
6645  __kmp_team_pool = NULL;
6646
6647  /* Allocate all of the variable sized records */
6648  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6649   * expandable */
6650  /* Since allocation is cache-aligned, just add extra padding at the end */
6651  size =
6652      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6653      CACHE_LINE;
6654  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6655  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6656                               sizeof(kmp_info_t *) * __kmp_threads_capacity);
6657
6658  /* init thread counts */
6659  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6660                   0); // Asserts fail if the library is reinitializing and
6661  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6662  __kmp_all_nth = 0;
6663  __kmp_nth = 0;
6664
6665  /* setup the uber master thread and hierarchy */
6666  gtid = __kmp_register_root(TRUE);
6667  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6668  KMP_ASSERT(KMP_UBER_GTID(gtid));
6669  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6670
6671  KMP_MB(); /* Flush all pending memory write invalidates.  */
6672
6673  __kmp_common_initialize();
6674
6675#if KMP_OS_UNIX
6676  /* invoke the child fork handler */
6677  __kmp_register_atfork();
6678#endif
6679
6680#if !KMP_DYNAMIC_LIB
6681  {
6682    /* Invoke the exit handler when the program finishes, only for static
6683       library. For dynamic library, we already have _fini and DllMain. */
6684    int rc = atexit(__kmp_internal_end_atexit);
6685    if (rc != 0) {
6686      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6687                  __kmp_msg_null);
6688    }
6689  }
6690#endif
6691
6692#if KMP_HANDLE_SIGNALS
6693#if KMP_OS_UNIX
6694  /* NOTE: make sure that this is called before the user installs their own
6695     signal handlers so that the user handlers are called first. this way they
6696     can return false, not call our handler, avoid terminating the library, and
6697     continue execution where they left off. */
6698  __kmp_install_signals(FALSE);
6699#endif /* KMP_OS_UNIX */
6700#if KMP_OS_WINDOWS
6701  __kmp_install_signals(TRUE);
6702#endif /* KMP_OS_WINDOWS */
6703#endif
6704
6705  /* we have finished the serial initialization */
6706  __kmp_init_counter++;
6707
6708  __kmp_init_serial = TRUE;
6709
6710  if (__kmp_settings) {
6711    __kmp_env_print();
6712  }
6713
6714  if (__kmp_display_env || __kmp_display_env_verbose) {
6715    __kmp_env_print_2();
6716  }
6717
6718#if OMPT_SUPPORT
6719  ompt_post_init();
6720#endif
6721
6722  KMP_MB();
6723
6724  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6725}
6726
6727void __kmp_serial_initialize(void) {
6728  if (__kmp_init_serial) {
6729    return;
6730  }
6731  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6732  if (__kmp_init_serial) {
6733    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6734    return;
6735  }
6736  __kmp_do_serial_initialize();
6737  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6738}
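
// __kmp_serial_initialize above (and __kmp_middle_initialize /
// __kmp_parallel_initialize below) all use the same double-checked pattern:
// a lock-free fast-path check, then a re-check under __kmp_initz_lock so the
// initialization runs exactly once.  A generic sketch of the idiom
// (illustrative only, not compiled; demo_lazy_initialize is hypothetical):
#if 0
static void demo_lazy_initialize(volatile int *init_flag,
                                 kmp_bootstrap_lock_t *lock,
                                 void (*do_init)(void)) {
  if (*init_flag) // fast path: already initialized
    return;
  __kmp_acquire_bootstrap_lock(lock);
  if (*init_flag) { // another thread finished the init while we waited
    __kmp_release_bootstrap_lock(lock);
    return;
  }
  do_init(); // sets *init_flag as its last step
  __kmp_release_bootstrap_lock(lock);
}
#endif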
6739
6740static void __kmp_do_middle_initialize(void) {
6741  int i, j;
6742  int prev_dflt_team_nth;
6743
6744  if (!__kmp_init_serial) {
6745    __kmp_do_serial_initialize();
6746  }
6747
6748  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6749
6750  // Save the previous value for the __kmp_dflt_team_nth so that
6751  // we can avoid some reinitialization if it hasn't changed.
6752  prev_dflt_team_nth = __kmp_dflt_team_nth;
6753
6754#if KMP_AFFINITY_SUPPORTED
6755  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6756  // number of cores on the machine.
6757  __kmp_affinity_initialize();
6758
6759  // Run through the __kmp_threads array and set the affinity mask
6760  // for each root thread that is currently registered with the RTL.
6761  for (i = 0; i < __kmp_threads_capacity; i++) {
6762    if (TCR_PTR(__kmp_threads[i]) != NULL) {
6763      __kmp_affinity_set_init_mask(i, TRUE);
6764    }
6765  }
6766#endif /* KMP_AFFINITY_SUPPORTED */
6767
6768  KMP_ASSERT(__kmp_xproc > 0);
6769  if (__kmp_avail_proc == 0) {
6770    __kmp_avail_proc = __kmp_xproc;
6771  }
6772
6773  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6774  // correct them now
6775  j = 0;
6776  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6777    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6778        __kmp_avail_proc;
6779    j++;
6780  }
6781
6782  if (__kmp_dflt_team_nth == 0) {
6783#ifdef KMP_DFLT_NTH_CORES
6784    // Default #threads = #cores
6785    __kmp_dflt_team_nth = __kmp_ncores;
6786    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6787                  "__kmp_ncores (%d)\n",
6788                  __kmp_dflt_team_nth));
6789#else
6790    // Default #threads = #available OS procs
6791    __kmp_dflt_team_nth = __kmp_avail_proc;
6792    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6793                  "__kmp_avail_proc(%d)\n",
6794                  __kmp_dflt_team_nth));
6795#endif /* KMP_DFLT_NTH_CORES */
6796  }
6797
6798  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6799    __kmp_dflt_team_nth = KMP_MIN_NTH;
6800  }
6801  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6802    __kmp_dflt_team_nth = __kmp_sys_max_nth;
6803  }
6804
6805  // There's no harm in continuing if the following check fails,
6806  // but it indicates an error in the previous logic.
6807  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6808
6809  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6810    // Run through the __kmp_threads array and set the num threads icv for each
6811    // root thread that is currently registered with the RTL (which has not
6812    // already explicitly set its nthreads-var with a call to
6813    // omp_set_num_threads()).
6814    for (i = 0; i < __kmp_threads_capacity; i++) {
6815      kmp_info_t *thread = __kmp_threads[i];
6816      if (thread == NULL)
6817        continue;
6818      if (thread->th.th_current_task->td_icvs.nproc != 0)
6819        continue;
6820
6821      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6822    }
6823  }
6824  KA_TRACE(
6825      20,
6826      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6827       __kmp_dflt_team_nth));
6828
6829#ifdef KMP_ADJUST_BLOCKTIME
6830  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6831  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6832    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6833    if (__kmp_nth > __kmp_avail_proc) {
6834      __kmp_zero_bt = TRUE;
6835    }
6836  }
6837#endif /* KMP_ADJUST_BLOCKTIME */
6838
6839  /* we have finished middle initialization */
6840  TCW_SYNC_4(__kmp_init_middle, TRUE);
6841
6842  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6843}
6844
6845void __kmp_middle_initialize(void) {
6846  if (__kmp_init_middle) {
6847    return;
6848  }
6849  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6850  if (__kmp_init_middle) {
6851    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6852    return;
6853  }
6854  __kmp_do_middle_initialize();
6855  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6856}
6857
6858void __kmp_parallel_initialize(void) {
6859  int gtid = __kmp_entry_gtid(); // this might be a new root
6860
6861  /* synchronize parallel initialization (for sibling) */
6862  if (TCR_4(__kmp_init_parallel))
6863    return;
6864  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6865  if (TCR_4(__kmp_init_parallel)) {
6866    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6867    return;
6868  }
6869
6870  /* TODO reinitialization after we have already shut down */
6871  if (TCR_4(__kmp_global.g.g_done)) {
6872    KA_TRACE(
6873        10,
6874        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6875    __kmp_infinite_loop();
6876  }
6877
6878  /* jc: The lock __kmp_initz_lock is already held, so calling
6879     __kmp_serial_initialize would cause a deadlock.  So we call
6880     __kmp_do_serial_initialize directly. */
6881  if (!__kmp_init_middle) {
6882    __kmp_do_middle_initialize();
6883  }
6884  __kmp_resume_if_hard_paused();
6885
6886  /* begin initialization */
6887  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6888  KMP_ASSERT(KMP_UBER_GTID(gtid));
6889
6890#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6891  // Save the FP control regs.
6892  // Worker threads will set theirs to these values at thread startup.
6893  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6894  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6895  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6896#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6897
6898#if KMP_OS_UNIX
6899#if KMP_HANDLE_SIGNALS
6900  /*  must be after __kmp_serial_initialize  */
6901  __kmp_install_signals(TRUE);
6902#endif
6903#endif
6904
6905  __kmp_suspend_initialize();
6906
6907#if defined(USE_LOAD_BALANCE)
6908  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6909    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6910  }
6911#else
6912  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6913    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6914  }
6915#endif
6916
6917  if (__kmp_version) {
6918    __kmp_print_version_2();
6919  }
6920
6921  /* we have finished parallel initialization */
6922  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6923
6924  KMP_MB();
6925  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6926
6927  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6928}
6929
6930/* ------------------------------------------------------------------------ */
6931
6932void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6933                                   kmp_team_t *team) {
6934  kmp_disp_t *dispatch;
6935
6936  KMP_MB();
6937
6938  /* none of the threads have encountered any constructs, yet. */
6939  this_thr->th.th_local.this_construct = 0;
6940#if KMP_CACHE_MANAGE
6941  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6942#endif /* KMP_CACHE_MANAGE */
6943  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6944  KMP_DEBUG_ASSERT(dispatch);
6945  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6946  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6947  // this_thr->th.th_info.ds.ds_tid ] );
6948
6949  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6950  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
6951  if (__kmp_env_consistency_check)
6952    __kmp_push_parallel(gtid, team->t.t_ident);
6953
6954  KMP_MB(); /* Flush all pending memory write invalidates.  */
6955}
6956
6957void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6958                                  kmp_team_t *team) {
6959  if (__kmp_env_consistency_check)
6960    __kmp_pop_parallel(gtid, team->t.t_ident);
6961
6962  __kmp_finish_implicit_task(this_thr);
6963}
6964
6965int __kmp_invoke_task_func(int gtid) {
6966  int rc;
6967  int tid = __kmp_tid_from_gtid(gtid);
6968  kmp_info_t *this_thr = __kmp_threads[gtid];
6969  kmp_team_t *team = this_thr->th.th_team;
6970
6971  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6972#if USE_ITT_BUILD
6973  if (__itt_stack_caller_create_ptr) {
6974    __kmp_itt_stack_callee_enter(
6975        (__itt_caller)
6976            team->t.t_stack_id); // inform ittnotify about entering user's code
6977  }
6978#endif /* USE_ITT_BUILD */
6979#if INCLUDE_SSC_MARKS
6980  SSC_MARK_INVOKING();
6981#endif
6982
6983#if OMPT_SUPPORT
6984  void *dummy;
6985  void **exit_frame_p;
6986  ompt_data_t *my_task_data;
6987  ompt_data_t *my_parallel_data;
6988  int ompt_team_size;
6989
6990  if (ompt_enabled.enabled) {
6991    exit_frame_p = &(
6992        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
6993  } else {
6994    exit_frame_p = &dummy;
6995  }
6996
6997  my_task_data =
6998      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6999  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7000  if (ompt_enabled.ompt_callback_implicit_task) {
7001    ompt_team_size = team->t.t_nproc;
7002    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7003        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7004        __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7005    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7006  }
7007#endif
7008
7009#if KMP_STATS_ENABLED
7010  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7011  if (previous_state == stats_state_e::TEAMS_REGION) {
7012    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7013  } else {
7014    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7015  }
7016  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7017#endif
7018
7019  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7020                              tid, (int)team->t.t_argc, (void **)team->t.t_argv
7021#if OMPT_SUPPORT
7022                              ,
7023                              exit_frame_p
7024#endif
7025                              );
7026#if OMPT_SUPPORT
7027  *exit_frame_p = NULL;
7028  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7029#endif
7030
7031#if KMP_STATS_ENABLED
7032  if (previous_state == stats_state_e::TEAMS_REGION) {
7033    KMP_SET_THREAD_STATE(previous_state);
7034  }
7035  KMP_POP_PARTITIONED_TIMER();
7036#endif
7037
7038#if USE_ITT_BUILD
7039  if (__itt_stack_caller_create_ptr) {
7040    __kmp_itt_stack_callee_leave(
7041        (__itt_caller)
7042            team->t.t_stack_id); // inform ittnotify about leaving user's code
7043  }
7044#endif /* USE_ITT_BUILD */
7045  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7046
7047  return rc;
7048}
7049
7050void __kmp_teams_master(int gtid) {
7051  // This routine is called by all master threads in the teams construct
7052  kmp_info_t *thr = __kmp_threads[gtid];
7053  kmp_team_t *team = thr->th.th_team;
7054  ident_t *loc = team->t.t_ident;
7055  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7056  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7057  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7058  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7059                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7060
7061  // This thread is a new CG root.  Set up the proper variables.
7062  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7063  tmp->cg_root = thr; // Make thr the CG root
7064  // Init to thread limit that was stored when league masters were forked
7065  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7066  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7067  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7068                 " cg_nthreads to 1\n",
7069                 thr, tmp));
7070  tmp->up = thr->th.th_cg_roots;
7071  thr->th.th_cg_roots = tmp;
7072
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7075#if INCLUDE_SSC_MARKS
7076  SSC_MARK_FORKING();
7077#endif
7078  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7079                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7080                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7081#if INCLUDE_SSC_MARKS
7082  SSC_MARK_JOINING();
7083#endif
7084  // If the team size was reduced from the limit, set it to the new size
7085  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7086    thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: the last parameter "1" eliminates the join barrier; it would not work
  // because worker threads wait in a fork barrier for more parallel regions
7089  __kmp_join_call(loc, gtid
7090#if OMPT_SUPPORT
7091                  ,
7092                  fork_context_intel
7093#endif
7094                  ,
7095                  1);
7096}
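
/* Illustrative sketch (hedged): the user-level construct that reaches this
   routine is a "teams" region, e.g.

       #pragma omp teams num_teams(4) thread_limit(8)
       { ... }

   With a typical lowering, each of the 4 league masters ends up here; the
   __kmp_fork_call() above then forks that master's team (up to 8 threads,
   per th_teams_size.nth) to run the outlined teams body, and the join skips
   the join barrier as noted.
*/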
7097
7098int __kmp_invoke_teams_master(int gtid) {
7099  kmp_info_t *this_thr = __kmp_threads[gtid];
7100  kmp_team_t *team = this_thr->th.th_team;
7101#if KMP_DEBUG
7102  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7103    KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7104                     (void *)__kmp_teams_master);
7105#endif
7106  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7107#if OMPT_SUPPORT
7108  int tid = __kmp_tid_from_gtid(gtid);
7109  ompt_data_t *task_data =
7110      &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7111  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7112  if (ompt_enabled.ompt_callback_implicit_task) {
7113    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7114        ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7115        ompt_task_initial);
7116    OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7117  }
7118#endif
7119  __kmp_teams_master(gtid);
7120#if OMPT_SUPPORT
7121  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7122#endif
7123  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7124  return 1;
7125}
7126
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this call should be enclosed in the
   fork/join critical section, it should avoid race conditions with
   asymmetrical nested parallelism. */
7131
7132void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7133  kmp_info_t *thr = __kmp_threads[gtid];
7134
7135  if (num_threads > 0)
7136    thr->th.th_set_nproc = num_threads;
7137}
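
/* Hedged sketch of how this is reached: for

       #pragma omp parallel num_threads(4)
       { ... }

   a compiler typically emits a call to the __kmpc_push_num_threads() entry
   point right before the fork, which forwards the value here; th_set_nproc
   is then consumed (and reset) by the very next parallel region of this
   thread only.
*/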
7138
7139/* this sets the requested number of teams for the teams region and/or
7140   the number of threads for the next parallel region encountered  */
7141void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7142                          int num_threads) {
7143  kmp_info_t *thr = __kmp_threads[gtid];
7144  KMP_DEBUG_ASSERT(num_teams >= 0);
7145  KMP_DEBUG_ASSERT(num_threads >= 0);
7146
7147  if (num_teams == 0)
7148    num_teams = 1; // default number of teams is 1.
7149  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7150    if (!__kmp_reserve_warn) {
7151      __kmp_reserve_warn = 1;
7152      __kmp_msg(kmp_ms_warning,
7153                KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7154                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7155    }
7156    num_teams = __kmp_teams_max_nth;
7157  }
7158  // Set number of teams (number of threads in the outer "parallel" of the
7159  // teams)
7160  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7161
7162  // Remember the number of threads for inner parallel regions
7163  if (!TCR_4(__kmp_init_middle))
7164    __kmp_middle_initialize(); // get internal globals calculated
7165  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7166  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7167  if (num_threads == 0) {
7168    num_threads = __kmp_avail_proc / num_teams;
    // adjust num_threads without a warning as it is not a user setting
7170    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7171    // no thread_limit clause specified -  do not change thread-limit-var ICV
7172    if (num_threads > __kmp_dflt_team_nth) {
7173      num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7174    }
7175    if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7176      num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7178    if (num_teams * num_threads > __kmp_teams_max_nth) {
7179      num_threads = __kmp_teams_max_nth / num_teams;
7180    }
7181  } else {
7182    // This thread will be the master of the league masters
7183    // Store new thread limit; old limit is saved in th_cg_roots list
7184    thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7185    // num_threads = min(num_threads, nthreads-var)
7186    if (num_threads > __kmp_dflt_team_nth) {
7187      num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7188    }
7189    if (num_teams * num_threads > __kmp_teams_max_nth) {
7190      int new_threads = __kmp_teams_max_nth / num_teams;
7191      if (!__kmp_reserve_warn) { // user asked for too many threads
7192        __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7193        __kmp_msg(kmp_ms_warning,
7194                  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7195                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7196      }
7197      num_threads = new_threads;
7198    }
7199  }
7200  thr->th.th_teams_size.nth = num_threads;
7201}
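
/* Worked example of the sizing logic above (illustrative values; the limits
   assumed here are __kmp_avail_proc = 16, nthreads-var = 16,
   thread-limit-var = 16 and __kmp_teams_max_nth = 16):

       num_teams(4) without thread_limit:
           num_threads = 16 / 4 = 4; no clamp applies  -> nteams = 4, nth = 4
       num_teams(4) thread_limit(8):
           4 * 8 = 32 > __kmp_teams_max_nth            -> nth clamped to 16 / 4 = 4
                                                          (with a one-time warning)
*/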
7202
7203// Set the proc_bind var to use in the following parallel region.
7204void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7205  kmp_info_t *thr = __kmp_threads[gtid];
7206  thr->th.th_set_proc_bind = proc_bind;
7207}
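
/* Hedged sketch: for "#pragma omp parallel proc_bind(close)" a compiler
   typically emits

       __kmpc_push_proc_bind(&loc, gtid, proc_bind_close);

   before the fork call; that entry point forwards the value here, and it is
   consumed by the next parallel region only.
*/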
7208
7209/* Launch the worker threads into the microtask. */
7210
7211void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7212  kmp_info_t *this_thr = __kmp_threads[gtid];
7213
7214#ifdef KMP_DEBUG
7215  int f;
7216#endif /* KMP_DEBUG */
7217
7218  KMP_DEBUG_ASSERT(team);
7219  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7220  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7221  KMP_MB(); /* Flush all pending memory write invalidates.  */
7222
7223  team->t.t_construct = 0; /* no single directives seen yet */
7224  team->t.t_ordered.dt.t_value =
7225      0; /* thread 0 enters the ordered section first */
7226
7227  /* Reset the identifiers on the dispatch buffer */
7228  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7229  if (team->t.t_max_nproc > 1) {
7230    int i;
7231    for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7232      team->t.t_disp_buffer[i].buffer_index = i;
7233      team->t.t_disp_buffer[i].doacross_buf_idx = i;
7234    }
7235  } else {
7236    team->t.t_disp_buffer[0].buffer_index = 0;
7237    team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7238  }
7239
7240  KMP_MB(); /* Flush all pending memory write invalidates.  */
7241  KMP_ASSERT(this_thr->th.th_team == team);
7242
7243#ifdef KMP_DEBUG
7244  for (f = 0; f < team->t.t_nproc; f++) {
7245    KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7246                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7247  }
7248#endif /* KMP_DEBUG */
7249
7250  /* release the worker threads so they may begin working */
7251  __kmp_fork_barrier(gtid, 0);
7252}
7253
7254void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7255  kmp_info_t *this_thr = __kmp_threads[gtid];
7256
7257  KMP_DEBUG_ASSERT(team);
7258  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7259  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7260  KMP_MB(); /* Flush all pending memory write invalidates.  */
7261
7262/* Join barrier after fork */
7263
7264#ifdef KMP_DEBUG
7265  if (__kmp_threads[gtid] &&
7266      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7267    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7268                 __kmp_threads[gtid]);
7269    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7270                 "team->t.t_nproc=%d\n",
7271                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7272                 team->t.t_nproc);
7273    __kmp_print_structure();
7274  }
7275  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7276                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7277#endif /* KMP_DEBUG */
7278
7279  __kmp_join_barrier(gtid); /* wait for everyone */
7280#if OMPT_SUPPORT
7281  if (ompt_enabled.enabled &&
7282      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7283    int ds_tid = this_thr->th.th_info.ds.ds_tid;
7284    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7285    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7286#if OMPT_OPTIONAL
7287    void *codeptr = NULL;
7288    if (KMP_MASTER_TID(ds_tid) &&
7289        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7290         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7291      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7292
7293    if (ompt_enabled.ompt_callback_sync_region_wait) {
7294      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7295          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7296          codeptr);
7297    }
7298    if (ompt_enabled.ompt_callback_sync_region) {
7299      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7300          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7301          codeptr);
7302    }
7303#endif
7304    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7305      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7307    }
7308  }
7309#endif
7310
7311  KMP_MB(); /* Flush all pending memory write invalidates.  */
7312  KMP_ASSERT(this_thr->th.th_team == team);
7313}
7314
7315/* ------------------------------------------------------------------------ */
7316
7317#ifdef USE_LOAD_BALANCE
7318
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7321static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7322  int i;
7323  int retval;
7324  kmp_team_t *hot_team;
7325
7326  if (root->r.r_active) {
7327    return 0;
7328  }
7329  hot_team = root->r.r_hot_team;
7330  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7331    return hot_team->t.t_nproc - 1; // Don't count master thread
7332  }
7333
7334  // Skip the master thread - it is accounted for elsewhere.
7335  retval = 0;
7336  for (i = 1; i < hot_team->t.t_nproc; i++) {
7337    if (hot_team->t.t_threads[i]->th.th_active) {
7338      retval++;
7339    }
7340  }
7341  return retval;
7342}
7343
7344// Perform an automatic adjustment to the number of
7345// threads used by the next parallel region.
7346static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7347  int retval;
7348  int pool_active;
7349  int hot_team_active;
7350  int team_curr_active;
7351  int system_active;
7352
7353  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7354                set_nproc));
7355  KMP_DEBUG_ASSERT(root);
7356  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7357                       ->th.th_current_task->td_icvs.dynamic == TRUE);
7358  KMP_DEBUG_ASSERT(set_nproc > 1);
7359
7360  if (set_nproc == 1) {
7361    KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7362    return 1;
7363  }
7364
7365  // Threads that are active in the thread pool, active in the hot team for this
7366  // particular root (if we are at the outer par level), and the currently
7367  // executing thread (to become the master) are available to add to the new
7368  // team, but are currently contributing to the system load, and must be
7369  // accounted for.
7370  pool_active = __kmp_thread_pool_active_nth;
7371  hot_team_active = __kmp_active_hot_team_nproc(root);
7372  team_curr_active = pool_active + hot_team_active + 1;
7373
7374  // Check the system load.
7375  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7376  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7377                "hot team active = %d\n",
7378                system_active, pool_active, hot_team_active));
7379
7380  if (system_active < 0) {
7381    // There was an error reading the necessary info from /proc, so use the
7382    // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7383    // = dynamic_thread_limit, we shouldn't wind up getting back here.
7384    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7385    KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7386
7387    // Make this call behave like the thread limit algorithm.
7388    retval = __kmp_avail_proc - __kmp_nth +
7389             (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7390    if (retval > set_nproc) {
7391      retval = set_nproc;
7392    }
7393    if (retval < KMP_MIN_NTH) {
7394      retval = KMP_MIN_NTH;
7395    }
7396
7397    KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7398                  retval));
7399    return retval;
7400  }
7401
7402  // There is a slight delay in the load balance algorithm in detecting new
7403  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7405  if (system_active < team_curr_active) {
7406    system_active = team_curr_active;
7407  }
7408  retval = __kmp_avail_proc - system_active + team_curr_active;
7409  if (retval > set_nproc) {
7410    retval = set_nproc;
7411  }
7412  if (retval < KMP_MIN_NTH) {
7413    retval = KMP_MIN_NTH;
7414  }
7415
7416  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7417  return retval;
7418} // __kmp_load_balance_nproc()
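
/* Worked example of the formula above (illustrative numbers): with
   __kmp_avail_proc = 8, two active pool threads, one spinning hot-team worker
   and the forking thread itself, team_curr_active = 2 + 1 + 1 = 4.  If
   __kmp_get_load_balance() reports system_active = 6, then

       retval = 8 - 6 + 4 = 6

   i.e. six threads can be used without oversubscribing the machine; the
   result is then clamped to the [KMP_MIN_NTH, set_nproc] range.
*/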
7419
7420#endif /* USE_LOAD_BALANCE */
7421
7422/* ------------------------------------------------------------------------ */
7423
7424/* NOTE: this is called with the __kmp_init_lock held */
7425void __kmp_cleanup(void) {
7426  int f;
7427
7428  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7429
7430  if (TCR_4(__kmp_init_parallel)) {
7431#if KMP_HANDLE_SIGNALS
7432    __kmp_remove_signals();
7433#endif
7434    TCW_4(__kmp_init_parallel, FALSE);
7435  }
7436
7437  if (TCR_4(__kmp_init_middle)) {
7438#if KMP_AFFINITY_SUPPORTED
7439    __kmp_affinity_uninitialize();
7440#endif /* KMP_AFFINITY_SUPPORTED */
7441    __kmp_cleanup_hierarchy();
7442    TCW_4(__kmp_init_middle, FALSE);
7443  }
7444
7445  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7446
7447  if (__kmp_init_serial) {
7448    __kmp_runtime_destroy();
7449    __kmp_init_serial = FALSE;
7450  }
7451
7452  __kmp_cleanup_threadprivate_caches();
7453
7454  for (f = 0; f < __kmp_threads_capacity; f++) {
7455    if (__kmp_root[f] != NULL) {
7456      __kmp_free(__kmp_root[f]);
7457      __kmp_root[f] = NULL;
7458    }
7459  }
7460  __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7463  __kmp_threads = NULL;
7464  __kmp_root = NULL;
7465  __kmp_threads_capacity = 0;
7466
7467#if KMP_USE_DYNAMIC_LOCK
7468  __kmp_cleanup_indirect_user_locks();
7469#else
7470  __kmp_cleanup_user_locks();
7471#endif
7472
7473#if KMP_AFFINITY_SUPPORTED
7474  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7475  __kmp_cpuinfo_file = NULL;
7476#endif /* KMP_AFFINITY_SUPPORTED */
7477
7478#if KMP_USE_ADAPTIVE_LOCKS
7479#if KMP_DEBUG_ADAPTIVE_LOCKS
7480  __kmp_print_speculative_stats();
7481#endif
7482#endif
7483  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7484  __kmp_nested_nth.nth = NULL;
7485  __kmp_nested_nth.size = 0;
7486  __kmp_nested_nth.used = 0;
7487  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7488  __kmp_nested_proc_bind.bind_types = NULL;
7489  __kmp_nested_proc_bind.size = 0;
7490  __kmp_nested_proc_bind.used = 0;
7491  if (__kmp_affinity_format) {
7492    KMP_INTERNAL_FREE(__kmp_affinity_format);
7493    __kmp_affinity_format = NULL;
7494  }
7495
7496  __kmp_i18n_catclose();
7497
7498#if KMP_USE_HIER_SCHED
7499  __kmp_hier_scheds.deallocate();
7500#endif
7501
7502#if KMP_STATS_ENABLED
7503  __kmp_stats_fini();
7504#endif
7505
7506  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7507}
7508
7509/* ------------------------------------------------------------------------ */
7510
7511int __kmp_ignore_mppbeg(void) {
7512  char *env;
7513
7514  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7515    if (__kmp_str_match_false(env))
7516      return FALSE;
7517  }
  // By default __kmpc_begin() is a no-op.
7519  return TRUE;
7520}
7521
7522int __kmp_ignore_mppend(void) {
7523  char *env;
7524
7525  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7526    if (__kmp_str_match_false(env))
7527      return FALSE;
7528  }
  // By default __kmpc_end() is a no-op.
7530  return TRUE;
7531}
7532
7533void __kmp_internal_begin(void) {
7534  int gtid;
7535  kmp_root_t *root;
7536
7537  /* this is a very important step as it will register new sibling threads
7538     and assign these new uber threads a new gtid */
7539  gtid = __kmp_entry_gtid();
7540  root = __kmp_threads[gtid]->th.th_root;
7541  KMP_ASSERT(KMP_UBER_GTID(gtid));
7542
7543  if (root->r.r_begin)
7544    return;
7545  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7546  if (root->r.r_begin) {
7547    __kmp_release_lock(&root->r.r_begin_lock, gtid);
7548    return;
7549  }
7550
7551  root->r.r_begin = TRUE;
7552
7553  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7554}
7555
7556/* ------------------------------------------------------------------------ */
7557
7558void __kmp_user_set_library(enum library_type arg) {
7559  int gtid;
7560  kmp_root_t *root;
7561  kmp_info_t *thread;
7562
7563  /* first, make sure we are initialized so we can get our gtid */
7564
7565  gtid = __kmp_entry_gtid();
7566  thread = __kmp_threads[gtid];
7567
7568  root = thread->th.th_root;
7569
7570  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7571                library_serial));
7572  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7573                                  thread */
7574    KMP_WARNING(SetLibraryIncorrectCall);
7575    return;
7576  }
7577
7578  switch (arg) {
7579  case library_serial:
7580    thread->th.th_set_nproc = 0;
7581    set__nproc(thread, 1);
7582    break;
7583  case library_turnaround:
7584    thread->th.th_set_nproc = 0;
7585    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7586                                           : __kmp_dflt_team_nth_ub);
7587    break;
7588  case library_throughput:
7589    thread->th.th_set_nproc = 0;
7590    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7591                                           : __kmp_dflt_team_nth_ub);
7592    break;
7593  default:
7594    KMP_FATAL(UnknownLibraryType, arg);
7595  }
7596
7597  __kmp_aux_set_library(arg);
7598}
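
/* Hedged note: this routine backs the kmp_set_library*() extension entry
   points and the KMP_LIBRARY environment variable, e.g.

       kmp_set_library_throughput();

   and, as checked above, it must be called from the serial part of the
   program.
*/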
7599
7600void __kmp_aux_set_stacksize(size_t arg) {
7601  if (!__kmp_init_serial)
7602    __kmp_serial_initialize();
7603
7604#if KMP_OS_DARWIN
7605  if (arg & (0x1000 - 1)) {
7606    arg &= ~(0x1000 - 1);
7607    if (arg + 0x1000) /* check for overflow if we round up */
7608      arg += 0x1000;
7609  }
7610#endif
7611  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7612
7613  /* only change the default stacksize before the first parallel region */
7614  if (!TCR_4(__kmp_init_parallel)) {
7615    size_t value = arg; /* argument is in bytes */
7616
7617    if (value < __kmp_sys_min_stksize)
7618      value = __kmp_sys_min_stksize;
7619    else if (value > KMP_MAX_STKSIZE)
7620      value = KMP_MAX_STKSIZE;
7621
7622    __kmp_stksize = value;
7623
7624    __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7625  }
7626
7627  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7628}
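
/* Hedged usage sketch: reached from the kmp_set_stacksize()/
   kmp_set_stacksize_s() extension entry points and from KMP_STACKSIZE, e.g.

       kmp_set_stacksize_s(4 * 1024 * 1024); // request 4 MB worker stacks

   The request is clamped to [__kmp_sys_min_stksize, KMP_MAX_STKSIZE] and has
   no effect once the first parallel region has been started.
*/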
7629
7630/* set the behaviour of the runtime library */
7631/* TODO this can cause some odd behaviour with sibling parallelism... */
7632void __kmp_aux_set_library(enum library_type arg) {
7633  __kmp_library = arg;
7634
7635  switch (__kmp_library) {
7636  case library_serial: {
7637    KMP_INFORM(LibraryIsSerial);
7638  } break;
7639  case library_turnaround:
7640    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7641      __kmp_use_yield = 2; // only yield when oversubscribed
7642    break;
7643  case library_throughput:
7644    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7645      __kmp_dflt_blocktime = 200;
7646    break;
7647  default:
7648    KMP_FATAL(UnknownLibraryType, arg);
7649  }
7650}
7651
/* Get team information common to all teams API routines */
// Returns NULL if not in a teams construct
7654static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7655  kmp_info_t *thr = __kmp_entry_thread();
7656  teams_serialized = 0;
7657  if (thr->th.th_teams_microtask) {
7658    kmp_team_t *team = thr->th.th_team;
7659    int tlevel = thr->th.th_teams_level; // the level of the teams construct
7660    int ii = team->t.t_level;
7661    teams_serialized = team->t.t_serialized;
7662    int level = tlevel + 1;
7663    KMP_DEBUG_ASSERT(ii >= tlevel);
7664    while (ii > level) {
7665      for (teams_serialized = team->t.t_serialized;
7666           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7667      }
7668      if (team->t.t_serialized && (!teams_serialized)) {
7669        team = team->t.t_parent;
7670        continue;
7671      }
7672      if (ii > level) {
7673        team = team->t.t_parent;
7674        ii--;
7675      }
7676    }
7677    return team;
7678  }
7679  return NULL;
7680}
7681
7682int __kmp_aux_get_team_num() {
7683  int serialized;
7684  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7685  if (team) {
7686    if (serialized > 1) {
7687      return 0; // teams region is serialized ( 1 team of 1 thread ).
7688    } else {
7689      return team->t.t_master_tid;
7690    }
7691  }
7692  return 0;
7693}
7694
7695int __kmp_aux_get_num_teams() {
7696  int serialized;
7697  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7698  if (team) {
7699    if (serialized > 1) {
7700      return 1;
7701    } else {
7702      return team->t.t_parent->t.t_nproc;
7703    }
7704  }
7705  return 1;
7706}
7707
7708/* ------------------------------------------------------------------------ */
7709
7710/*
7711 * Affinity Format Parser
7712 *
7713 * Field is in form of: %[[[0].]size]type
7714 * % and type are required (%% means print a literal '%')
7715 * type is either single char or long name surrounded by {},
7716 * e.g., N or {num_threads}
7717 * 0 => leading zeros
7718 * . => right justified when size is specified
7719 * by default output is left justified
7720 * size is the *minimum* field length
7721 * All other characters are printed as is
7722 *
7723 * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma-separated list of integers or integer ranges
 *                         (values of affinity mask)
7733 *
7734 * Implementation-specific field types can be added
7735 * If a type is unknown, print "undefined"
7736*/
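
/* Hedged example of a format string accepted by the parser below (the host
   name, pid and mask shown are made up):

       "OMP: host=%H pid=%P thread %0.3n/%N bound to %{thread_affinity}"

   could expand, for thread 2 of an 8-thread team, to something like

       "OMP: host=node01 pid=4242 thread 002/8 bound to 4-5"
*/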
7737
7738// Structure holding the short name, long name, and corresponding data type
7739// for snprintf.  A table of these will represent the entire valid keyword
7740// field types.
7741typedef struct kmp_affinity_format_field_t {
7742  char short_name; // from spec e.g., L -> thread level
7743  const char *long_name; // from spec thread_level -> thread level
7744  char field_format; // data type for snprintf (typically 'd' or 's'
7745  // for integer or string)
7746} kmp_affinity_format_field_t;
7747
7748static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7749#if KMP_AFFINITY_SUPPORTED
7750    {'A', "thread_affinity", 's'},
7751#endif
7752    {'t', "team_num", 'd'},
7753    {'T', "num_teams", 'd'},
7754    {'L', "nesting_level", 'd'},
7755    {'n', "thread_num", 'd'},
7756    {'N', "num_threads", 'd'},
7757    {'a', "ancestor_tnum", 'd'},
7758    {'H', "host", 's'},
7759    {'P', "process_id", 'd'},
7760    {'i', "native_thread_id", 'd'}};
7761
7762// Return the number of characters it takes to hold field
7763static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7764                                            const char **ptr,
7765                                            kmp_str_buf_t *field_buffer) {
7766  int rc, format_index, field_value;
7767  const char *width_left, *width_right;
7768  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7769  static const int FORMAT_SIZE = 20;
7770  char format[FORMAT_SIZE] = {0};
7771  char absolute_short_name = 0;
7772
7773  KMP_DEBUG_ASSERT(gtid >= 0);
7774  KMP_DEBUG_ASSERT(th);
7775  KMP_DEBUG_ASSERT(**ptr == '%');
7776  KMP_DEBUG_ASSERT(field_buffer);
7777
7778  __kmp_str_buf_clear(field_buffer);
7779
7780  // Skip the initial %
7781  (*ptr)++;
7782
7783  // Check for %% first
7784  if (**ptr == '%') {
7785    __kmp_str_buf_cat(field_buffer, "%", 1);
7786    (*ptr)++; // skip over the second %
7787    return 1;
7788  }
7789
7790  // Parse field modifiers if they are present
7791  pad_zeros = false;
7792  if (**ptr == '0') {
7793    pad_zeros = true;
7794    (*ptr)++; // skip over 0
7795  }
7796  right_justify = false;
7797  if (**ptr == '.') {
7798    right_justify = true;
7799    (*ptr)++; // skip over .
7800  }
7801  // Parse width of field: [width_left, width_right)
7802  width_left = width_right = NULL;
7803  if (**ptr >= '0' && **ptr <= '9') {
7804    width_left = *ptr;
7805    SKIP_DIGITS(*ptr);
7806    width_right = *ptr;
7807  }
7808
7809  // Create the format for KMP_SNPRINTF based on flags parsed above
7810  format_index = 0;
7811  format[format_index++] = '%';
7812  if (!right_justify)
7813    format[format_index++] = '-';
7814  if (pad_zeros)
7815    format[format_index++] = '0';
7816  if (width_left && width_right) {
7817    int i = 0;
7818    // Only allow 8 digit number widths.
7819    // This also prevents overflowing format variable
7820    while (i < 8 && width_left < width_right) {
7821      format[format_index++] = *width_left;
7822      width_left++;
7823      i++;
7824    }
7825  }
7826
7827  // Parse a name (long or short)
7828  // Canonicalize the name into absolute_short_name
7829  found_valid_name = false;
7830  parse_long_name = (**ptr == '{');
7831  if (parse_long_name)
7832    (*ptr)++; // skip initial left brace
7833  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7834                             sizeof(__kmp_affinity_format_table[0]);
7835       ++i) {
7836    char short_name = __kmp_affinity_format_table[i].short_name;
7837    const char *long_name = __kmp_affinity_format_table[i].long_name;
7838    char field_format = __kmp_affinity_format_table[i].field_format;
7839    if (parse_long_name) {
7840      int length = KMP_STRLEN(long_name);
7841      if (strncmp(*ptr, long_name, length) == 0) {
7842        found_valid_name = true;
7843        (*ptr) += length; // skip the long name
7844      }
7845    } else if (**ptr == short_name) {
7846      found_valid_name = true;
7847      (*ptr)++; // skip the short name
7848    }
7849    if (found_valid_name) {
7850      format[format_index++] = field_format;
7851      format[format_index++] = '\0';
7852      absolute_short_name = short_name;
7853      break;
7854    }
7855  }
7856  if (parse_long_name) {
7857    if (**ptr != '}') {
7858      absolute_short_name = 0;
7859    } else {
7860      (*ptr)++; // skip over the right brace
7861    }
7862  }
7863
7864  // Attempt to fill the buffer with the requested
7865  // value using snprintf within __kmp_str_buf_print()
7866  switch (absolute_short_name) {
7867  case 't':
7868    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7869    break;
7870  case 'T':
7871    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7872    break;
7873  case 'L':
7874    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7875    break;
7876  case 'n':
7877    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7878    break;
7879  case 'H': {
7880    static const int BUFFER_SIZE = 256;
7881    char buf[BUFFER_SIZE];
7882    __kmp_expand_host_name(buf, BUFFER_SIZE);
7883    rc = __kmp_str_buf_print(field_buffer, format, buf);
7884  } break;
7885  case 'P':
7886    rc = __kmp_str_buf_print(field_buffer, format, getpid());
7887    break;
7888  case 'i':
7889    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7890    break;
7891  case 'N':
7892    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7893    break;
7894  case 'a':
7895    field_value =
7896        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7897    rc = __kmp_str_buf_print(field_buffer, format, field_value);
7898    break;
7899#if KMP_AFFINITY_SUPPORTED
7900  case 'A': {
7901    kmp_str_buf_t buf;
7902    __kmp_str_buf_init(&buf);
7903    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7904    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7905    __kmp_str_buf_free(&buf);
7906  } break;
7907#endif
7908  default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
7911    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7912    // Skip the field
7913    if (parse_long_name) {
7914      SKIP_TOKEN(*ptr);
7915      if (**ptr == '}')
7916        (*ptr)++;
7917    } else {
7918      (*ptr)++;
7919    }
7920  }
7921
7922  KMP_ASSERT(format_index <= FORMAT_SIZE);
7923  return rc;
7924}
7925
7926/*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resulting string is printed to buffer, which the caller can then
 * handle afterwards.
7931*/
7932size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7933                                  kmp_str_buf_t *buffer) {
7934  const char *parse_ptr;
7935  size_t retval;
7936  const kmp_info_t *th;
7937  kmp_str_buf_t field;
7938
7939  KMP_DEBUG_ASSERT(buffer);
7940  KMP_DEBUG_ASSERT(gtid >= 0);
7941
7942  __kmp_str_buf_init(&field);
7943  __kmp_str_buf_clear(buffer);
7944
7945  th = __kmp_threads[gtid];
7946  retval = 0;
7947
7948  // If format is NULL or zero-length string, then we use
7949  // affinity-format-var ICV
7950  parse_ptr = format;
7951  if (parse_ptr == NULL || *parse_ptr == '\0') {
7952    parse_ptr = __kmp_affinity_format;
7953  }
7954  KMP_DEBUG_ASSERT(parse_ptr);
7955
7956  while (*parse_ptr != '\0') {
7957    // Parse a field
7958    if (*parse_ptr == '%') {
7959      // Put field in the buffer
7960      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7961      __kmp_str_buf_catbuf(buffer, &field);
7962      retval += rc;
7963    } else {
7964      // Put literal character in buffer
7965      __kmp_str_buf_cat(buffer, parse_ptr, 1);
7966      retval++;
7967      parse_ptr++;
7968    }
7969  }
7970  __kmp_str_buf_free(&field);
7971  return retval;
7972}
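
/* Hedged usage sketch: this is the engine behind the OpenMP 5.0 routine
   omp_capture_affinity(), e.g.

       char buf[256];
       size_t n = omp_capture_affinity(buf, sizeof(buf), NULL);
       // NULL/empty format => affinity-format-var (OMP_AFFINITY_FORMAT) is
       // used; n is the number of characters needed, as computed above.
*/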
7973
7974// Displays the affinity string to stdout
7975void __kmp_aux_display_affinity(int gtid, const char *format) {
7976  kmp_str_buf_t buf;
7977  __kmp_str_buf_init(&buf);
7978  __kmp_aux_capture_affinity(gtid, format, &buf);
7979  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7980  __kmp_str_buf_free(&buf);
7981}
7982
7983/* ------------------------------------------------------------------------ */
7984
7985void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7986  int blocktime = arg; /* argument is in milliseconds */
7987#if KMP_USE_MONITOR
7988  int bt_intervals;
7989#endif
7990  int bt_set;
7991
7992  __kmp_save_internal_controls(thread);
7993
7994  /* Normalize and set blocktime for the teams */
7995  if (blocktime < KMP_MIN_BLOCKTIME)
7996    blocktime = KMP_MIN_BLOCKTIME;
7997  else if (blocktime > KMP_MAX_BLOCKTIME)
7998    blocktime = KMP_MAX_BLOCKTIME;
7999
8000  set__blocktime_team(thread->th.th_team, tid, blocktime);
8001  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8002
8003#if KMP_USE_MONITOR
8004  /* Calculate and set blocktime intervals for the teams */
8005  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8006
8007  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8008  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8009#endif
8010
  /* Record that blocktime was explicitly set */
8012  bt_set = TRUE;
8013
8014  set__bt_set_team(thread->th.th_team, tid, bt_set);
8015  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8016#if KMP_USE_MONITOR
8017  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8018                "bt_intervals=%d, monitor_updates=%d\n",
8019                __kmp_gtid_from_tid(tid, thread->th.th_team),
8020                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8021                __kmp_monitor_wakeups));
8022#else
8023  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8024                __kmp_gtid_from_tid(tid, thread->th.th_team),
8025                thread->th.th_team->t.t_id, tid, blocktime));
8026#endif
8027}
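
/* Hedged usage sketch: reached from kmp_set_blocktime() and KMP_BLOCKTIME,
   e.g.

       kmp_set_blocktime(0); // idle workers go to sleep immediately

   The value is clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] and applied
   to both the current team and the serial team of the calling thread.
*/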
8028
8029void __kmp_aux_set_defaults(char const *str, int len) {
8030  if (!__kmp_init_serial) {
8031    __kmp_serial_initialize();
8032  }
8033  __kmp_env_initialize(str);
8034
8035  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8036    __kmp_env_print();
8037  }
8038} // __kmp_aux_set_defaults
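
/* Hedged usage sketch: backs the kmp_set_defaults() extension, which lets a
   program seed an environment-style setting before the runtime is fully
   initialized, e.g.

       kmp_set_defaults("KMP_BLOCKTIME=0");
*/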
8039
8040/* ------------------------------------------------------------------------ */
8041/* internal fast reduction routines */
8042
8043PACKED_REDUCTION_METHOD_T
8044__kmp_determine_reduction_method(
8045    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8046    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8047    kmp_critical_name *lck) {
8048
  // Default reduction method: critical construct (lck != NULL, as in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to select.
8057
8058  PACKED_REDUCTION_METHOD_T retval;
8059
8060  int team_size;
8061
8062  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8063  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8064
8065#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8066  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8067#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8068
8069  retval = critical_reduce_block;
8070
  // another choice of getting a team size (with 1 dynamic dereference) is slower
8072  team_size = __kmp_get_team_num_threads(global_tid);
8073  if (team_size == 1) {
8074
8075    retval = empty_reduce_block;
8076
8077  } else {
8078
8079    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8080
8081#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8082    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8083
8084#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8085    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8086
8087    int teamsize_cutoff = 4;
8088
8089#if KMP_MIC_SUPPORTED
8090    if (__kmp_mic_type != non_mic) {
8091      teamsize_cutoff = 8;
8092    }
8093#endif
8094    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8095    if (tree_available) {
8096      if (team_size <= teamsize_cutoff) {
8097        if (atomic_available) {
8098          retval = atomic_reduce_block;
8099        }
8100      } else {
8101        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8102      }
8103    } else if (atomic_available) {
8104      retval = atomic_reduce_block;
8105    }
8106#else
8107#error "Unknown or unsupported OS"
8108#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8109       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8110
8111#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8112
8113#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8114
8115    // basic tuning
8116
8117    if (atomic_available) {
8118      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8119        retval = atomic_reduce_block;
8120      }
8121    } // otherwise: use critical section
8122
8123#elif KMP_OS_DARWIN
8124
8125    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8126    if (atomic_available && (num_vars <= 3)) {
8127      retval = atomic_reduce_block;
8128    } else if (tree_available) {
8129      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8130          (reduce_size < (2000 * sizeof(kmp_real64)))) {
8131        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8132      }
8133    } // otherwise: use critical section
8134
8135#else
8136#error "Unknown or unsupported OS"
8137#endif
8138
8139#else
8140#error "Unknown or unsupported architecture"
8141#endif
8142  }
8143
8144  // KMP_FORCE_REDUCTION
8145
8146  // If the team is serialized (team_size == 1), ignore the forced reduction
8147  // method and stay with the unsynchronized method (empty_reduce_block)
8148  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8149      team_size != 1) {
8150
8151    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8152
8153    int atomic_available, tree_available;
8154
8155    switch ((forced_retval = __kmp_force_reduction_method)) {
8156    case critical_reduce_block:
8157      KMP_ASSERT(lck); // lck should be != 0
8158      break;
8159
8160    case atomic_reduce_block:
8161      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8162      if (!atomic_available) {
8163        KMP_WARNING(RedMethodNotSupported, "atomic");
8164        forced_retval = critical_reduce_block;
8165      }
8166      break;
8167
8168    case tree_reduce_block:
8169      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8170      if (!tree_available) {
8171        KMP_WARNING(RedMethodNotSupported, "tree");
8172        forced_retval = critical_reduce_block;
8173      } else {
8174#if KMP_FAST_REDUCTION_BARRIER
8175        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8176#endif
8177      }
8178      break;
8179
8180    default:
8181      KMP_ASSERT(0); // "unsupported method specified"
8182    }
8183
8184    retval = forced_retval;
8185  }
8186
8187  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8188
8189#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8190#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8191
8192  return (retval);
8193}
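
/* Hedged worked example of the selection above: on an x86_64 Linux host with
   no KMP_FORCE_REDUCTION override and with both the atomic and tree variants
   generated by the compiler, the choice is roughly

       team_size == 1                   -> empty_reduce_block
       team_size <= 4 (8 on Intel MIC)  -> atomic_reduce_block
       team_size >  4 (8 on Intel MIC)  -> TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER

   with critical_reduce_block remaining the fallback when neither fast variant
   is available.
*/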
8194
8195// this function is for testing set/get/determine reduce method
8196kmp_int32 __kmp_get_reduce_method(void) {
8197  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8198}
8199
8200// Soft pause sets up threads to ignore blocktime and just go to sleep.
8201// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8202void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8203
8204// Hard pause shuts down the runtime completely.  Resume happens naturally when
8205// OpenMP is used subsequently.
8206void __kmp_hard_pause() {
8207  __kmp_pause_status = kmp_hard_paused;
8208  __kmp_internal_end_thread(-1);
8209}
8210
// Soft resume resets __kmp_pause_status to kmp_not_paused and wakes all threads.
8212void __kmp_resume_if_soft_paused() {
8213  if (__kmp_pause_status == kmp_soft_paused) {
8214    __kmp_pause_status = kmp_not_paused;
8215
8216    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8217      kmp_info_t *thread = __kmp_threads[gtid];
8218      if (thread) { // Wake it if sleeping
8219        kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8220        if (fl.is_sleeping())
8221          fl.resume(gtid);
8222        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8223          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8224        } else { // thread holds the lock and may sleep soon
8225          do { // until either the thread sleeps, or we can get the lock
8226            if (fl.is_sleeping()) {
8227              fl.resume(gtid);
8228              break;
8229            } else if (__kmp_try_suspend_mx(thread)) {
8230              __kmp_unlock_suspend_mx(thread);
8231              break;
8232            }
8233          } while (1);
8234        }
8235      }
8236    }
8237  }
8238}
8239
8240// This function is called via __kmpc_pause_resource. Returns 0 if successful.
8241// TODO: add warning messages
8242int __kmp_pause_resource(kmp_pause_status_t level) {
8243  if (level == kmp_not_paused) { // requesting resume
8244    if (__kmp_pause_status == kmp_not_paused) {
8245      // error message about runtime not being paused, so can't resume
8246      return 1;
8247    } else {
8248      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8249                       __kmp_pause_status == kmp_hard_paused);
8250      __kmp_pause_status = kmp_not_paused;
8251      return 0;
8252    }
8253  } else if (level == kmp_soft_paused) { // requesting soft pause
8254    if (__kmp_pause_status != kmp_not_paused) {
8255      // error message about already being paused
8256      return 1;
8257    } else {
8258      __kmp_soft_pause();
8259      return 0;
8260    }
8261  } else if (level == kmp_hard_paused) { // requesting hard pause
8262    if (__kmp_pause_status != kmp_not_paused) {
8263      // error message about already being paused
8264      return 1;
8265    } else {
8266      __kmp_hard_pause();
8267      return 0;
8268    }
8269  } else {
8270    // error message about invalid level
8271    return 1;
8272  }
8273}
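
/* Hedged usage sketch: reached via __kmpc_pause_resource(), which implements
   the OpenMP 5.0 pause API, e.g.

       omp_pause_resource_all(omp_pause_soft); // quiesce: workers go to sleep
       // ... non-OpenMP phase of the application ...
       #pragma omp parallel                    // next use of OpenMP resumes it
       { ... }

   Requesting a pause while already paused (or a resume while not paused)
   returns nonzero, as coded above.
*/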
8274