kmp_runtime.cpp revision 345153
1228753Smm/*
2228753Smm * kmp_runtime.cpp -- KPTS runtime support library
3228753Smm */
4228753Smm
5228753Smm//===----------------------------------------------------------------------===//
6228753Smm//
7228753Smm//                     The LLVM Compiler Infrastructure
8228753Smm//
9228753Smm// This file is dual licensed under the MIT and the University of Illinois Open
10228753Smm// Source Licenses. See LICENSE.txt for details.
11228753Smm//
12228753Smm//===----------------------------------------------------------------------===//
13228753Smm
14228753Smm#include "kmp.h"
15228753Smm#include "kmp_affinity.h"
16228753Smm#include "kmp_atomic.h"
17228753Smm#include "kmp_environment.h"
18228753Smm#include "kmp_error.h"
19228753Smm#include "kmp_i18n.h"
20228753Smm#include "kmp_io.h"
21228753Smm#include "kmp_itt.h"
22228753Smm#include "kmp_settings.h"
23228753Smm#include "kmp_stats.h"
24228753Smm#include "kmp_str.h"
25228753Smm#include "kmp_wait_release.h"
26228753Smm#include "kmp_wrapper_getpid.h"
27228753Smm#include "kmp_dispatch.h"
28228753Smm#if KMP_USE_HIER_SCHED
29228753Smm#include "kmp_dispatch_hier.h"
30228753Smm#endif
31228753Smm
32228753Smm#if OMPT_SUPPORT
33228753Smm#include "ompt-specific.h"
34228753Smm#endif
35228753Smm
36228753Smm/* these are temporary issues to be dealt with */
37228753Smm#define KMP_USE_PRCTL 0
38228753Smm
39#if KMP_OS_WINDOWS
40#include <process.h>
41#endif
42
43#include "tsan_annotations.h"
44
45#if defined(KMP_GOMP_COMPAT)
46char const __kmp_version_alt_comp[] =
47    KMP_VERSION_PREFIX "alternative compiler support: yes";
48#endif /* defined(KMP_GOMP_COMPAT) */
49
50char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
51#if OMP_50_ENABLED
52                                                        "5.0 (201611)";
53#elif OMP_45_ENABLED
54                                                        "4.5 (201511)";
55#elif OMP_40_ENABLED
56                                                        "4.0 (201307)";
57#else
58                                                        "3.1 (201107)";
59#endif
60
61#ifdef KMP_DEBUG
62char const __kmp_version_lock[] =
63    KMP_VERSION_PREFIX "lock type: run time selectable";
64#endif /* KMP_DEBUG */
65
66#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67
68/* ------------------------------------------------------------------------ */
69
70#if KMP_USE_MONITOR
71kmp_info_t __kmp_monitor;
72#endif
73
74/* Forward declarations */
75
76void __kmp_cleanup(void);
77
78static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                  int gtid);
80static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                  kmp_internal_control_t *new_icvs,
82                                  ident_t *loc);
83#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
84static void __kmp_partition_places(kmp_team_t *team,
85                                   int update_master_only = 0);
86#endif
87static void __kmp_do_serial_initialize(void);
88void __kmp_fork_barrier(int gtid, int tid);
89void __kmp_join_barrier(int gtid);
90void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                          kmp_internal_control_t *new_icvs, ident_t *loc);
92
93#ifdef USE_LOAD_BALANCE
94static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95#endif
96
97static int __kmp_expand_threads(int nNeed);
98#if KMP_OS_WINDOWS
99static int __kmp_unregister_root_other_thread(int gtid);
100#endif
101static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104
105/* Calculate the identifier of the current thread */
106/* fast (and somewhat portable) way to get unique identifier of executing
107   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108int __kmp_get_global_thread_id() {
109  int i;
110  kmp_info_t **other_threads;
111  size_t stack_data;
112  char *stack_addr;
113  size_t stack_size;
114  char *stack_base;
115
116  KA_TRACE(
117      1000,
118      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
119       __kmp_nth, __kmp_all_nth));
120
121  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
122     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
123     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
124     __kmp_init_gtid for this to work. */
125
126  if (!TCR_4(__kmp_init_gtid))
127    return KMP_GTID_DNE;
128
129#ifdef KMP_TDATA_GTID
130  if (TCR_4(__kmp_gtid_mode) >= 3) {
131    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132    return __kmp_gtid;
133  }
134#endif
135  if (TCR_4(__kmp_gtid_mode) >= 2) {
136    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137    return __kmp_gtid_get_specific();
138  }
139  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140
141  stack_addr = (char *)&stack_data;
142  other_threads = __kmp_threads;
143
144  /* ATT: The code below is a source of potential bugs due to unsynchronized
145     access to __kmp_threads array. For example:
146     1. Current thread loads other_threads[i] to thr and checks it, it is
147        non-NULL.
148     2. Current thread is suspended by OS.
149     3. Another thread unregisters and finishes (debug versions of free()
150        may fill memory with something like 0xEF).
151     4. Current thread is resumed.
152     5. Current thread reads junk from *thr.
153     TODO: Fix it.  --ln  */
154
155  for (i = 0; i < __kmp_threads_capacity; i++) {
156
157    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158    if (!thr)
159      continue;
160
161    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163
164    /* stack grows down -- search through all of the active threads */
165
166    if (stack_addr <= stack_base) {
167      size_t stack_diff = stack_base - stack_addr;
168
169      if (stack_diff <= stack_size) {
170        /* The only way we can be closer than the allocated */
171        /* stack size is if we are running on this thread. */
172        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173        return i;
174      }
175    }
176  }
177
178  /* get specific to try and determine our gtid */
179  KA_TRACE(1000,
180           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181            "thread, using TLS\n"));
182  i = __kmp_gtid_get_specific();
183
184  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
185
186  /* if we havn't been assigned a gtid, then return code */
187  if (i < 0)
188    return i;
189
190  /* dynamically updated stack window for uber threads to avoid get_specific
191     call */
192  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193    KMP_FATAL(StackOverflow, i);
194  }
195
196  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197  if (stack_addr > stack_base) {
198    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201                stack_base);
202  } else {
203    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204            stack_base - stack_addr);
205  }
206
207  /* Reprint stack bounds for ubermaster since they have been refined */
208  if (__kmp_storage_map) {
209    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212                                 other_threads[i]->th.th_info.ds.ds_stacksize,
213                                 "th_%d stack (refinement)", i);
214  }
215  return i;
216}
217
218int __kmp_get_global_thread_id_reg() {
219  int gtid;
220
221  if (!__kmp_init_serial) {
222    gtid = KMP_GTID_DNE;
223  } else
224#ifdef KMP_TDATA_GTID
225      if (TCR_4(__kmp_gtid_mode) >= 3) {
226    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227    gtid = __kmp_gtid;
228  } else
229#endif
230      if (TCR_4(__kmp_gtid_mode) >= 2) {
231    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232    gtid = __kmp_gtid_get_specific();
233  } else {
234    KA_TRACE(1000,
235             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236    gtid = __kmp_get_global_thread_id();
237  }
238
239  /* we must be a new uber master sibling thread */
240  if (gtid == KMP_GTID_DNE) {
241    KA_TRACE(10,
242             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243              "Registering a new gtid.\n"));
244    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245    if (!__kmp_init_serial) {
246      __kmp_do_serial_initialize();
247      gtid = __kmp_gtid_get_specific();
248    } else {
249      gtid = __kmp_register_root(FALSE);
250    }
251    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253  }
254
255  KMP_DEBUG_ASSERT(gtid >= 0);
256
257  return gtid;
258}
259
260/* caller must hold forkjoin_lock */
261void __kmp_check_stack_overlap(kmp_info_t *th) {
262  int f;
263  char *stack_beg = NULL;
264  char *stack_end = NULL;
265  int gtid;
266
267  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268  if (__kmp_storage_map) {
269    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271
272    gtid = __kmp_gtid_from_thread(th);
273
274    if (gtid == KMP_GTID_MONITOR) {
275      __kmp_print_storage_map_gtid(
276          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277          "th_%s stack (%s)", "mon",
278          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279    } else {
280      __kmp_print_storage_map_gtid(
281          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282          "th_%d stack (%s)", gtid,
283          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284    }
285  }
286
287  /* No point in checking ubermaster threads since they use refinement and
288   * cannot overlap */
289  gtid = __kmp_gtid_from_thread(th);
290  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291    KA_TRACE(10,
292             ("__kmp_check_stack_overlap: performing extensive checking\n"));
293    if (stack_beg == NULL) {
294      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296    }
297
298    for (f = 0; f < __kmp_threads_capacity; f++) {
299      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300
301      if (f_th && f_th != th) {
302        char *other_stack_end =
303            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304        char *other_stack_beg =
305            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307            (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308
309          /* Print the other stack values before the abort */
310          if (__kmp_storage_map)
311            __kmp_print_storage_map_gtid(
312                -1, other_stack_beg, other_stack_end,
313                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315
316          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317                      __kmp_msg_null);
318        }
319      }
320    }
321  }
322  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323}
324
325/* ------------------------------------------------------------------------ */
326
327void __kmp_infinite_loop(void) {
328  static int done = FALSE;
329
330  while (!done) {
331    KMP_YIELD(1);
332  }
333}
334
335#define MAX_MESSAGE 512
336
337void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338                                  char const *format, ...) {
339  char buffer[MAX_MESSAGE];
340  va_list ap;
341
342  va_start(ap, format);
343  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344               p2, (unsigned long)size, format);
345  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346  __kmp_vprintf(kmp_err, buffer, ap);
347#if KMP_PRINT_DATA_PLACEMENT
348  int node;
349  if (gtid >= 0) {
350    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351      if (__kmp_storage_map_verbose) {
352        node = __kmp_get_host_node(p1);
353        if (node < 0) /* doesn't work, so don't try this next time */
354          __kmp_storage_map_verbose = FALSE;
355        else {
356          char *last;
357          int lastNode;
358          int localProc = __kmp_get_cpu_from_gtid(gtid);
359
360          const int page_size = KMP_GET_PAGE_SIZE();
361
362          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364          if (localProc >= 0)
365            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
366                                 localProc >> 1);
367          else
368            __kmp_printf_no_lock("  GTID %d\n", gtid);
369#if KMP_USE_PRCTL
370          /* The more elaborate format is disabled for now because of the prctl
371           * hanging bug. */
372          do {
373            last = p1;
374            lastNode = node;
375            /* This loop collates adjacent pages with the same host node. */
376            do {
377              (char *)p1 += page_size;
378            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
380                                 lastNode);
381          } while (p1 <= p2);
382#else
383          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
384                               (char *)p1 + (page_size - 1),
385                               __kmp_get_host_node(p1));
386          if (p1 < p2) {
387            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
388                                 (char *)p2 + (page_size - 1),
389                                 __kmp_get_host_node(p2));
390          }
391#endif
392        }
393      }
394    } else
395      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
396  }
397#endif /* KMP_PRINT_DATA_PLACEMENT */
398  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399}
400
401void __kmp_warn(char const *format, ...) {
402  char buffer[MAX_MESSAGE];
403  va_list ap;
404
405  if (__kmp_generate_warnings == kmp_warnings_off) {
406    return;
407  }
408
409  va_start(ap, format);
410
411  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413  __kmp_vprintf(kmp_err, buffer, ap);
414  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415
416  va_end(ap);
417}
418
419void __kmp_abort_process() {
420  // Later threads may stall here, but that's ok because abort() will kill them.
421  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422
423  if (__kmp_debug_buf) {
424    __kmp_dump_debug_buffer();
425  }
426
427  if (KMP_OS_WINDOWS) {
428    // Let other threads know of abnormal termination and prevent deadlock
429    // if abort happened during library initialization or shutdown
430    __kmp_global.g.g_abort = SIGABRT;
431
432    /* On Windows* OS by default abort() causes pop-up error box, which stalls
433       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
434       boxes. _set_abort_behavior() works well, but this function is not
435       available in VS7 (this is not problem for DLL, but it is a problem for
436       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
437       help, at least in some versions of MS C RTL.
438
439       It seems following sequence is the only way to simulate abort() and
440       avoid pop-up error box. */
441    raise(SIGABRT);
442    _exit(3); // Just in case, if signal ignored, exit anyway.
443  } else {
444    abort();
445  }
446
447  __kmp_infinite_loop();
448  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449
450} // __kmp_abort_process
451
452void __kmp_abort_thread(void) {
453  // TODO: Eliminate g_abort global variable and this function.
454  // In case of abort just call abort(), it will kill all the threads.
455  __kmp_infinite_loop();
456} // __kmp_abort_thread
457
458/* Print out the storage map for the major kmp_info_t thread data structures
459   that are allocated together. */
460
461static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463                               gtid);
464
465  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467
468  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469                               sizeof(kmp_local_t), "th_%d.th_local", gtid);
470
471  __kmp_print_storage_map_gtid(
472      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476                               &thr->th.th_bar[bs_plain_barrier + 1],
477                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478                               gtid);
479
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
482                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483                               gtid);
484
485#if KMP_FAST_REDUCTION_BARRIER
486  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487                               &thr->th.th_bar[bs_reduction_barrier + 1],
488                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489                               gtid);
490#endif // KMP_FAST_REDUCTION_BARRIER
491}
492
493/* Print out the storage map for the major kmp_team_t team data structures
494   that are allocated together. */
495
496static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497                                         int team_id, int num_thr) {
498  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                               header, team_id);
501
502  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503                               &team->t.t_bar[bs_last_barrier],
504                               sizeof(kmp_balign_team_t) * bs_last_barrier,
505                               "%s_%d.t_bar", header, team_id);
506
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508                               &team->t.t_bar[bs_plain_barrier + 1],
509                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510                               header, team_id);
511
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513                               &team->t.t_bar[bs_forkjoin_barrier + 1],
514                               sizeof(kmp_balign_team_t),
515                               "%s_%d.t_bar[forkjoin]", header, team_id);
516
517#if KMP_FAST_REDUCTION_BARRIER
518  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519                               &team->t.t_bar[bs_reduction_barrier + 1],
520                               sizeof(kmp_balign_team_t),
521                               "%s_%d.t_bar[reduction]", header, team_id);
522#endif // KMP_FAST_REDUCTION_BARRIER
523
524  __kmp_print_storage_map_gtid(
525      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527
528  __kmp_print_storage_map_gtid(
529      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531
532  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533                               &team->t.t_disp_buffer[num_disp_buff],
534                               sizeof(dispatch_shared_info_t) * num_disp_buff,
535                               "%s_%d.t_disp_buffer", header, team_id);
536
537  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
538                               sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
539                               team_id);
540}
541
542static void __kmp_init_allocator() {
543#if OMP_50_ENABLED
544  __kmp_init_memkind();
545#endif
546}
547static void __kmp_fini_allocator() {
548#if OMP_50_ENABLED
549  __kmp_fini_memkind();
550#endif
551}
552
553/* ------------------------------------------------------------------------ */
554
555#if KMP_DYNAMIC_LIB
556#if KMP_OS_WINDOWS
557
558static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
559  // TODO: Change to __kmp_break_bootstrap_lock().
560  __kmp_init_bootstrap_lock(lck); // make the lock released
561}
562
563static void __kmp_reset_locks_on_process_detach(int gtid_req) {
564  int i;
565  int thread_count;
566
567  // PROCESS_DETACH is expected to be called by a thread that executes
568  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
569  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
570  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
571  // threads can be still alive here, although being about to be terminated. The
572  // threads in the array with ds_thread==0 are most suspicious. Actually, it
573  // can be not safe to access the __kmp_threads[].
574
575  // TODO: does it make sense to check __kmp_roots[] ?
576
577  // Let's check that there are no other alive threads registered with the OMP
578  // lib.
579  while (1) {
580    thread_count = 0;
581    for (i = 0; i < __kmp_threads_capacity; ++i) {
582      if (!__kmp_threads)
583        continue;
584      kmp_info_t *th = __kmp_threads[i];
585      if (th == NULL)
586        continue;
587      int gtid = th->th.th_info.ds.ds_gtid;
588      if (gtid == gtid_req)
589        continue;
590      if (gtid < 0)
591        continue;
592      DWORD exit_val;
593      int alive = __kmp_is_thread_alive(th, &exit_val);
594      if (alive) {
595        ++thread_count;
596      }
597    }
598    if (thread_count == 0)
599      break; // success
600  }
601
602  // Assume that I'm alone. Now it might be safe to check and reset locks.
603  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
604  __kmp_reset_lock(&__kmp_forkjoin_lock);
605#ifdef KMP_DEBUG
606  __kmp_reset_lock(&__kmp_stdio_lock);
607#endif // KMP_DEBUG
608}
609
610BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
611  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
612
613  switch (fdwReason) {
614
615  case DLL_PROCESS_ATTACH:
616    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
617
618    return TRUE;
619
620  case DLL_PROCESS_DETACH:
621    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
622
623    if (lpReserved != NULL) {
624      // lpReserved is used for telling the difference:
625      //   lpReserved == NULL when FreeLibrary() was called,
626      //   lpReserved != NULL when the process terminates.
627      // When FreeLibrary() is called, worker threads remain alive. So they will
628      // release the forkjoin lock by themselves. When the process terminates,
629      // worker threads disappear triggering the problem of unreleased forkjoin
630      // lock as described below.
631
632      // A worker thread can take the forkjoin lock. The problem comes up if
633      // that worker thread becomes dead before it releases the forkjoin lock.
634      // The forkjoin lock remains taken, while the thread executing
635      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
636      // to take the forkjoin lock and will always fail, so that the application
637      // will never finish [normally]. This scenario is possible if
638      // __kmpc_end() has not been executed. It looks like it's not a corner
639      // case, but common cases:
640      // - the main function was compiled by an alternative compiler;
641      // - the main function was compiled by icl but without /Qopenmp
642      //   (application with plugins);
643      // - application terminates by calling C exit(), Fortran CALL EXIT() or
644      //   Fortran STOP.
645      // - alive foreign thread prevented __kmpc_end from doing cleanup.
646      //
647      // This is a hack to work around the problem.
648      // TODO: !!! figure out something better.
649      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
650    }
651
652    __kmp_internal_end_library(__kmp_gtid_get_specific());
653
654    return TRUE;
655
656  case DLL_THREAD_ATTACH:
657    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
658
659    /* if we want to register new siblings all the time here call
660     * __kmp_get_gtid(); */
661    return TRUE;
662
663  case DLL_THREAD_DETACH:
664    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
665
666    __kmp_internal_end_thread(__kmp_gtid_get_specific());
667    return TRUE;
668  }
669
670  return TRUE;
671}
672
673#endif /* KMP_OS_WINDOWS */
674#endif /* KMP_DYNAMIC_LIB */
675
676/* Change the library type to "status" and return the old type */
677/* called from within initialization routines where __kmp_initz_lock is held */
678int __kmp_change_library(int status) {
679  int old_status;
680
681  old_status = __kmp_yield_init &
682               1; // check whether KMP_LIBRARY=throughput (even init count)
683
684  if (status) {
685    __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
686  } else {
687    __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
688  }
689
690  return old_status; // return previous setting of whether
691  // KMP_LIBRARY=throughput
692}
693
694/* __kmp_parallel_deo -- Wait until it's our turn. */
695void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696  int gtid = *gtid_ref;
697#ifdef BUILD_PARALLEL_ORDERED
698  kmp_team_t *team = __kmp_team_from_gtid(gtid);
699#endif /* BUILD_PARALLEL_ORDERED */
700
701  if (__kmp_env_consistency_check) {
702    if (__kmp_threads[gtid]->th.th_root->r.r_active)
703#if KMP_USE_DYNAMIC_LOCK
704      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
705#else
706      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
707#endif
708  }
709#ifdef BUILD_PARALLEL_ORDERED
710  if (!team->t.t_serialized) {
711    KMP_MB();
712    KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
713                   KMP_EQ, NULL);
714    KMP_MB();
715  }
716#endif /* BUILD_PARALLEL_ORDERED */
717}
718
719/* __kmp_parallel_dxo -- Signal the next task. */
720void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
721  int gtid = *gtid_ref;
722#ifdef BUILD_PARALLEL_ORDERED
723  int tid = __kmp_tid_from_gtid(gtid);
724  kmp_team_t *team = __kmp_team_from_gtid(gtid);
725#endif /* BUILD_PARALLEL_ORDERED */
726
727  if (__kmp_env_consistency_check) {
728    if (__kmp_threads[gtid]->th.th_root->r.r_active)
729      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
730  }
731#ifdef BUILD_PARALLEL_ORDERED
732  if (!team->t.t_serialized) {
733    KMP_MB(); /* Flush all pending memory write invalidates.  */
734
735    /* use the tid of the next thread in this team */
736    /* TODO replace with general release procedure */
737    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
738
739    KMP_MB(); /* Flush all pending memory write invalidates.  */
740  }
741#endif /* BUILD_PARALLEL_ORDERED */
742}
743
744/* ------------------------------------------------------------------------ */
745/* The BARRIER for a SINGLE process section is always explicit   */
746
747int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
748  int status;
749  kmp_info_t *th;
750  kmp_team_t *team;
751
752  if (!TCR_4(__kmp_init_parallel))
753    __kmp_parallel_initialize();
754
755  th = __kmp_threads[gtid];
756  team = th->th.th_team;
757  status = 0;
758
759  th->th.th_ident = id_ref;
760
761  if (team->t.t_serialized) {
762    status = 1;
763  } else {
764    kmp_int32 old_this = th->th.th_local.this_construct;
765
766    ++th->th.th_local.this_construct;
767    /* try to set team count to thread count--success means thread got the
768       single block */
769    /* TODO: Should this be acquire or release? */
770    if (team->t.t_construct == old_this) {
771      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
772                                              th->th.th_local.this_construct);
773    }
774#if USE_ITT_BUILD
775    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
776        KMP_MASTER_GTID(gtid) &&
777#if OMP_40_ENABLED
778        th->th.th_teams_microtask == NULL &&
779#endif
780        team->t.t_active_level ==
781            1) { // Only report metadata by master of active team at level 1
782      __kmp_itt_metadata_single(id_ref);
783    }
784#endif /* USE_ITT_BUILD */
785  }
786
787  if (__kmp_env_consistency_check) {
788    if (status && push_ws) {
789      __kmp_push_workshare(gtid, ct_psingle, id_ref);
790    } else {
791      __kmp_check_workshare(gtid, ct_psingle, id_ref);
792    }
793  }
794#if USE_ITT_BUILD
795  if (status) {
796    __kmp_itt_single_start(gtid);
797  }
798#endif /* USE_ITT_BUILD */
799  return status;
800}
801
802void __kmp_exit_single(int gtid) {
803#if USE_ITT_BUILD
804  __kmp_itt_single_end(gtid);
805#endif /* USE_ITT_BUILD */
806  if (__kmp_env_consistency_check)
807    __kmp_pop_workshare(gtid, ct_psingle, NULL);
808}
809
810/* determine if we can go parallel or must use a serialized parallel region and
811 * how many threads we can use
812 * set_nproc is the number of threads requested for the team
813 * returns 0 if we should serialize or only use one thread,
814 * otherwise the number of threads to use
815 * The forkjoin lock is held by the caller. */
816static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
817                                 int master_tid, int set_nthreads
818#if OMP_40_ENABLED
819                                 ,
820                                 int enter_teams
821#endif /* OMP_40_ENABLED */
822                                 ) {
823  int capacity;
824  int new_nthreads;
825  KMP_DEBUG_ASSERT(__kmp_init_serial);
826  KMP_DEBUG_ASSERT(root && parent_team);
827
828  // If dyn-var is set, dynamically adjust the number of desired threads,
829  // according to the method specified by dynamic_mode.
830  new_nthreads = set_nthreads;
831  if (!get__dynamic_2(parent_team, master_tid)) {
832    ;
833  }
834#ifdef USE_LOAD_BALANCE
835  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
836    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
837    if (new_nthreads == 1) {
838      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
839                    "reservation to 1 thread\n",
840                    master_tid));
841      return 1;
842    }
843    if (new_nthreads < set_nthreads) {
844      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
845                    "reservation to %d threads\n",
846                    master_tid, new_nthreads));
847    }
848  }
849#endif /* USE_LOAD_BALANCE */
850  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
851    new_nthreads = __kmp_avail_proc - __kmp_nth +
852                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
853    if (new_nthreads <= 1) {
854      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
855                    "reservation to 1 thread\n",
856                    master_tid));
857      return 1;
858    }
859    if (new_nthreads < set_nthreads) {
860      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
861                    "reservation to %d threads\n",
862                    master_tid, new_nthreads));
863    } else {
864      new_nthreads = set_nthreads;
865    }
866  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
867    if (set_nthreads > 2) {
868      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
869      new_nthreads = (new_nthreads % set_nthreads) + 1;
870      if (new_nthreads == 1) {
871        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
872                      "reservation to 1 thread\n",
873                      master_tid));
874        return 1;
875      }
876      if (new_nthreads < set_nthreads) {
877        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
878                      "reservation to %d threads\n",
879                      master_tid, new_nthreads));
880      }
881    }
882  } else {
883    KMP_ASSERT(0);
884  }
885
886  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
887  if (__kmp_nth + new_nthreads -
888          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
889      __kmp_max_nth) {
890    int tl_nthreads = __kmp_max_nth - __kmp_nth +
891                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
892    if (tl_nthreads <= 0) {
893      tl_nthreads = 1;
894    }
895
896    // If dyn-var is false, emit a 1-time warning.
897    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
898      __kmp_reserve_warn = 1;
899      __kmp_msg(kmp_ms_warning,
900                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
901                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
902    }
903    if (tl_nthreads == 1) {
904      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
905                    "reduced reservation to 1 thread\n",
906                    master_tid));
907      return 1;
908    }
909    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
910                  "reservation to %d threads\n",
911                  master_tid, tl_nthreads));
912    new_nthreads = tl_nthreads;
913  }
914
915  // Respect OMP_THREAD_LIMIT
916  if (root->r.r_cg_nthreads + new_nthreads -
917          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
918      __kmp_cg_max_nth) {
919    int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
920                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
921    if (tl_nthreads <= 0) {
922      tl_nthreads = 1;
923    }
924
925    // If dyn-var is false, emit a 1-time warning.
926    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
927      __kmp_reserve_warn = 1;
928      __kmp_msg(kmp_ms_warning,
929                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
930                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
931    }
932    if (tl_nthreads == 1) {
933      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
934                    "reduced reservation to 1 thread\n",
935                    master_tid));
936      return 1;
937    }
938    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
939                  "reservation to %d threads\n",
940                  master_tid, tl_nthreads));
941    new_nthreads = tl_nthreads;
942  }
943
944  // Check if the threads array is large enough, or needs expanding.
945  // See comment in __kmp_register_root() about the adjustment if
946  // __kmp_threads[0] == NULL.
947  capacity = __kmp_threads_capacity;
948  if (TCR_PTR(__kmp_threads[0]) == NULL) {
949    --capacity;
950  }
951  if (__kmp_nth + new_nthreads -
952          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
953      capacity) {
954    // Expand the threads array.
955    int slotsRequired = __kmp_nth + new_nthreads -
956                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
957                        capacity;
958    int slotsAdded = __kmp_expand_threads(slotsRequired);
959    if (slotsAdded < slotsRequired) {
960      // The threads array was not expanded enough.
961      new_nthreads -= (slotsRequired - slotsAdded);
962      KMP_ASSERT(new_nthreads >= 1);
963
964      // If dyn-var is false, emit a 1-time warning.
965      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
966        __kmp_reserve_warn = 1;
967        if (__kmp_tp_cached) {
968          __kmp_msg(kmp_ms_warning,
969                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
970                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
971                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
972        } else {
973          __kmp_msg(kmp_ms_warning,
974                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
975                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
976        }
977      }
978    }
979  }
980
981#ifdef KMP_DEBUG
982  if (new_nthreads == 1) {
983    KC_TRACE(10,
984             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
985              "dead roots and rechecking; requested %d threads\n",
986              __kmp_get_gtid(), set_nthreads));
987  } else {
988    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
989                  " %d threads\n",
990                  __kmp_get_gtid(), new_nthreads, set_nthreads));
991  }
992#endif // KMP_DEBUG
993  return new_nthreads;
994}
995
996/* Allocate threads from the thread pool and assign them to the new team. We are
997   assured that there are enough threads available, because we checked on that
998   earlier within critical section forkjoin */
999static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
1000                                    kmp_info_t *master_th, int master_gtid) {
1001  int i;
1002  int use_hot_team;
1003
1004  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1005  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1006  KMP_MB();
1007
1008  /* first, let's setup the master thread */
1009  master_th->th.th_info.ds.ds_tid = 0;
1010  master_th->th.th_team = team;
1011  master_th->th.th_team_nproc = team->t.t_nproc;
1012  master_th->th.th_team_master = master_th;
1013  master_th->th.th_team_serialized = FALSE;
1014  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1015
1016/* make sure we are not the optimized hot team */
1017#if KMP_NESTED_HOT_TEAMS
1018  use_hot_team = 0;
1019  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1020  if (hot_teams) { // hot teams array is not allocated if
1021    // KMP_HOT_TEAMS_MAX_LEVEL=0
1022    int level = team->t.t_active_level - 1; // index in array of hot teams
1023    if (master_th->th.th_teams_microtask) { // are we inside the teams?
1024      if (master_th->th.th_teams_size.nteams > 1) {
1025        ++level; // level was not increased in teams construct for
1026        // team_of_masters
1027      }
1028      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1029          master_th->th.th_teams_level == team->t.t_level) {
1030        ++level; // level was not increased in teams construct for
1031        // team_of_workers before the parallel
1032      } // team->t.t_level will be increased inside parallel
1033    }
1034    if (level < __kmp_hot_teams_max_level) {
1035      if (hot_teams[level].hot_team) {
1036        // hot team has already been allocated for given level
1037        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1038        use_hot_team = 1; // the team is ready to use
1039      } else {
1040        use_hot_team = 0; // AC: threads are not allocated yet
1041        hot_teams[level].hot_team = team; // remember new hot team
1042        hot_teams[level].hot_team_nth = team->t.t_nproc;
1043      }
1044    } else {
1045      use_hot_team = 0;
1046    }
1047  }
1048#else
1049  use_hot_team = team == root->r.r_hot_team;
1050#endif
1051  if (!use_hot_team) {
1052
1053    /* install the master thread */
1054    team->t.t_threads[0] = master_th;
1055    __kmp_initialize_info(master_th, team, 0, master_gtid);
1056
1057    /* now, install the worker threads */
1058    for (i = 1; i < team->t.t_nproc; i++) {
1059
1060      /* fork or reallocate a new thread and install it in team */
1061      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1062      team->t.t_threads[i] = thr;
1063      KMP_DEBUG_ASSERT(thr);
1064      KMP_DEBUG_ASSERT(thr->th.th_team == team);
1065      /* align team and thread arrived states */
1066      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1067                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
1068                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1069                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1070                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1071                    team->t.t_bar[bs_plain_barrier].b_arrived));
1072#if OMP_40_ENABLED
1073      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1074      thr->th.th_teams_level = master_th->th.th_teams_level;
1075      thr->th.th_teams_size = master_th->th.th_teams_size;
1076#endif
1077      { // Initialize threads' barrier data.
1078        int b;
1079        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1080        for (b = 0; b < bs_last_barrier; ++b) {
1081          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1082          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1083#if USE_DEBUGGER
1084          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1085#endif
1086        }
1087      }
1088    }
1089
1090#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1091    __kmp_partition_places(team);
1092#endif
1093  }
1094
1095#if OMP_50_ENABLED
1096  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1097    for (i = 0; i < team->t.t_nproc; i++) {
1098      kmp_info_t *thr = team->t.t_threads[i];
1099      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1100          thr->th.th_prev_level != team->t.t_level) {
1101        team->t.t_display_affinity = 1;
1102        break;
1103      }
1104    }
1105  }
1106#endif
1107
1108  KMP_MB();
1109}
1110
1111#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1112// Propagate any changes to the floating point control registers out to the team
1113// We try to avoid unnecessary writes to the relevant cache line in the team
1114// structure, so we don't make changes unless they are needed.
1115inline static void propagateFPControl(kmp_team_t *team) {
1116  if (__kmp_inherit_fp_control) {
1117    kmp_int16 x87_fpu_control_word;
1118    kmp_uint32 mxcsr;
1119
1120    // Get master values of FPU control flags (both X87 and vector)
1121    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1122    __kmp_store_mxcsr(&mxcsr);
1123    mxcsr &= KMP_X86_MXCSR_MASK;
1124
1125    // There is no point looking at t_fp_control_saved here.
1126    // If it is TRUE, we still have to update the values if they are different
1127    // from those we now have. If it is FALSE we didn't save anything yet, but
1128    // our objective is the same. We have to ensure that the values in the team
1129    // are the same as those we have.
1130    // So, this code achieves what we need whether or not t_fp_control_saved is
1131    // true. By checking whether the value needs updating we avoid unnecessary
1132    // writes that would put the cache-line into a written state, causing all
1133    // threads in the team to have to read it again.
1134    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1135    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1136    // Although we don't use this value, other code in the runtime wants to know
1137    // whether it should restore them. So we must ensure it is correct.
1138    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1139  } else {
1140    // Similarly here. Don't write to this cache-line in the team structure
1141    // unless we have to.
1142    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1143  }
1144}
1145
1146// Do the opposite, setting the hardware registers to the updated values from
1147// the team.
1148inline static void updateHWFPControl(kmp_team_t *team) {
1149  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1150    // Only reset the fp control regs if they have been changed in the team.
1151    // the parallel region that we are exiting.
1152    kmp_int16 x87_fpu_control_word;
1153    kmp_uint32 mxcsr;
1154    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1155    __kmp_store_mxcsr(&mxcsr);
1156    mxcsr &= KMP_X86_MXCSR_MASK;
1157
1158    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1159      __kmp_clear_x87_fpu_status_word();
1160      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1161    }
1162
1163    if (team->t.t_mxcsr != mxcsr) {
1164      __kmp_load_mxcsr(&team->t.t_mxcsr);
1165    }
1166  }
1167}
1168#else
1169#define propagateFPControl(x) ((void)0)
1170#define updateHWFPControl(x) ((void)0)
1171#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1172
1173static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1174                                     int realloc); // forward declaration
1175
1176/* Run a parallel region that has been serialized, so runs only in a team of the
1177   single master thread. */
1178void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1179  kmp_info_t *this_thr;
1180  kmp_team_t *serial_team;
1181
1182  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1183
1184  /* Skip all this code for autopar serialized loops since it results in
1185     unacceptable overhead */
1186  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1187    return;
1188
1189  if (!TCR_4(__kmp_init_parallel))
1190    __kmp_parallel_initialize();
1191
1192  this_thr = __kmp_threads[global_tid];
1193  serial_team = this_thr->th.th_serial_team;
1194
1195  /* utilize the serialized team held by this thread */
1196  KMP_DEBUG_ASSERT(serial_team);
1197  KMP_MB();
1198
1199  if (__kmp_tasking_mode != tskm_immediate_exec) {
1200    KMP_DEBUG_ASSERT(
1201        this_thr->th.th_task_team ==
1202        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1203    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1204                     NULL);
1205    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1206                  "team %p, new task_team = NULL\n",
1207                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1208    this_thr->th.th_task_team = NULL;
1209  }
1210
1211#if OMP_40_ENABLED
1212  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1213  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1214    proc_bind = proc_bind_false;
1215  } else if (proc_bind == proc_bind_default) {
1216    // No proc_bind clause was specified, so use the current value
1217    // of proc-bind-var for this parallel region.
1218    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1219  }
1220  // Reset for next parallel region
1221  this_thr->th.th_set_proc_bind = proc_bind_default;
1222#endif /* OMP_40_ENABLED */
1223
1224#if OMPT_SUPPORT
1225  ompt_data_t ompt_parallel_data = ompt_data_none;
1226  ompt_data_t *implicit_task_data;
1227  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1228  if (ompt_enabled.enabled &&
1229      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1230
1231    ompt_task_info_t *parent_task_info;
1232    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1233
1234    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1235    if (ompt_enabled.ompt_callback_parallel_begin) {
1236      int team_size = 1;
1237
1238      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1239          &(parent_task_info->task_data), &(parent_task_info->frame),
1240          &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1241          codeptr);
1242    }
1243  }
1244#endif // OMPT_SUPPORT
1245
1246  if (this_thr->th.th_team != serial_team) {
1247    // Nested level will be an index in the nested nthreads array
1248    int level = this_thr->th.th_team->t.t_level;
1249
1250    if (serial_team->t.t_serialized) {
1251      /* this serial team was already used
1252         TODO increase performance by making this locks more specific */
1253      kmp_team_t *new_team;
1254
1255      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1256
1257      new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1258#if OMPT_SUPPORT
1259                                     ompt_parallel_data,
1260#endif
1261#if OMP_40_ENABLED
1262                                     proc_bind,
1263#endif
1264                                     &this_thr->th.th_current_task->td_icvs,
1265                                     0 USE_NESTED_HOT_ARG(NULL));
1266      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1267      KMP_ASSERT(new_team);
1268
1269      /* setup new serialized team and install it */
1270      new_team->t.t_threads[0] = this_thr;
1271      new_team->t.t_parent = this_thr->th.th_team;
1272      serial_team = new_team;
1273      this_thr->th.th_serial_team = serial_team;
1274
1275      KF_TRACE(
1276          10,
1277          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1278           global_tid, serial_team));
1279
1280      /* TODO the above breaks the requirement that if we run out of resources,
1281         then we can still guarantee that serialized teams are ok, since we may
1282         need to allocate a new one */
1283    } else {
1284      KF_TRACE(
1285          10,
1286          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1287           global_tid, serial_team));
1288    }
1289
1290    /* we have to initialize this serial team */
1291    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1292    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1293    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1294    serial_team->t.t_ident = loc;
1295    serial_team->t.t_serialized = 1;
1296    serial_team->t.t_nproc = 1;
1297    serial_team->t.t_parent = this_thr->th.th_team;
1298    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1299    this_thr->th.th_team = serial_team;
1300    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1301
1302    KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1303                  this_thr->th.th_current_task));
1304    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1305    this_thr->th.th_current_task->td_flags.executing = 0;
1306
1307    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1308
1309    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1310       implicit task for each serialized task represented by
1311       team->t.t_serialized? */
1312    copy_icvs(&this_thr->th.th_current_task->td_icvs,
1313              &this_thr->th.th_current_task->td_parent->td_icvs);
1314
1315    // Thread value exists in the nested nthreads array for the next nested
1316    // level
1317    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1318      this_thr->th.th_current_task->td_icvs.nproc =
1319          __kmp_nested_nth.nth[level + 1];
1320    }
1321
1322#if OMP_40_ENABLED
1323    if (__kmp_nested_proc_bind.used &&
1324        (level + 1 < __kmp_nested_proc_bind.used)) {
1325      this_thr->th.th_current_task->td_icvs.proc_bind =
1326          __kmp_nested_proc_bind.bind_types[level + 1];
1327    }
1328#endif /* OMP_40_ENABLED */
1329
1330#if USE_DEBUGGER
1331    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1332#endif
1333    this_thr->th.th_info.ds.ds_tid = 0;
1334
1335    /* set thread cache values */
1336    this_thr->th.th_team_nproc = 1;
1337    this_thr->th.th_team_master = this_thr;
1338    this_thr->th.th_team_serialized = 1;
1339
1340    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1341    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1342#if OMP_50_ENABLED
1343    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1344#endif
1345
1346    propagateFPControl(serial_team);
1347
1348    /* check if we need to allocate dispatch buffers stack */
1349    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1350    if (!serial_team->t.t_dispatch->th_disp_buffer) {
1351      serial_team->t.t_dispatch->th_disp_buffer =
1352          (dispatch_private_info_t *)__kmp_allocate(
1353              sizeof(dispatch_private_info_t));
1354    }
1355    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1356
1357    KMP_MB();
1358
1359  } else {
1360    /* this serialized team is already being used,
1361     * that's fine, just add another nested level */
1362    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1363    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1364    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1365    ++serial_team->t.t_serialized;
1366    this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1367
1368    // Nested level will be an index in the nested nthreads array
1369    int level = this_thr->th.th_team->t.t_level;
1370    // Thread value exists in the nested nthreads array for the next nested
1371    // level
1372    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1373      this_thr->th.th_current_task->td_icvs.nproc =
1374          __kmp_nested_nth.nth[level + 1];
1375    }
1376    serial_team->t.t_level++;
1377    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1378                  "of serial team %p to %d\n",
1379                  global_tid, serial_team, serial_team->t.t_level));
1380
1381    /* allocate/push dispatch buffers stack */
1382    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1383    {
1384      dispatch_private_info_t *disp_buffer =
1385          (dispatch_private_info_t *)__kmp_allocate(
1386              sizeof(dispatch_private_info_t));
1387      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1388      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1389    }
1390    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1391
1392    KMP_MB();
1393  }
1394#if OMP_40_ENABLED
1395  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1396#endif
1397
1398#if OMP_50_ENABLED
1399  // Perform the display affinity functionality for
1400  // serialized parallel regions
1401  if (__kmp_display_affinity) {
1402    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1403        this_thr->th.th_prev_num_threads != 1) {
1404      // NULL means use the affinity-format-var ICV
1405      __kmp_aux_display_affinity(global_tid, NULL);
1406      this_thr->th.th_prev_level = serial_team->t.t_level;
1407      this_thr->th.th_prev_num_threads = 1;
1408    }
1409  }
1410#endif
1411
1412  if (__kmp_env_consistency_check)
1413    __kmp_push_parallel(global_tid, NULL);
1414#if OMPT_SUPPORT
1415  serial_team->t.ompt_team_info.master_return_address = codeptr;
1416  if (ompt_enabled.enabled &&
1417      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1418    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1419
1420    ompt_lw_taskteam_t lw_taskteam;
1421    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1422                            &ompt_parallel_data, codeptr);
1423
1424    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1425    // don't use lw_taskteam after linking. content was swaped
1426
1427    /* OMPT implicit task begin */
1428    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1429    if (ompt_enabled.ompt_callback_implicit_task) {
1430      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1431          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1432          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1433      OMPT_CUR_TASK_INFO(this_thr)
1434          ->thread_num = __kmp_tid_from_gtid(global_tid);
1435    }
1436
1437    /* OMPT state */
1438    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1439    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1440  }
1441#endif
1442}
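/* Illustrative usage sketch (not part of the runtime): __kmpc_serialized_parallel
   above is the path a parallel region takes when it runs on a single thread;
   each additional nested level just bumps t_serialized, pushes a dispatch
   buffer and, if a nested nthreads list is set, refreshes the nproc ICV.
   Assuming a compiler that targets this library (e.g. clang -fopenmp), a
   program like the following is expected to take that path for the inner
   region, since nesting is disabled:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_nested(0); // nested regions run serialized (teams of one)
       #pragma omp parallel num_threads(2)
       {
         #pragma omp parallel num_threads(4) // serialized: one thread runs it
         {
           printf("outer thread %d, inner team size %d, level %d\n",
                  omp_get_ancestor_thread_num(1), omp_get_num_threads(),
                  omp_get_level());
         }
       }
       return 0;
     }

   Each of the two outer threads is expected to report an inner team size of 1
   at nesting level 2. */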
1443
1444/* most of the work for a fork */
1445/* return true if we really went parallel, false if serialized */
1446int __kmp_fork_call(ident_t *loc, int gtid,
1447                    enum fork_context_e call_context, // Intel, GNU, ...
1448                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
1449/* TODO: revert workaround for Intel(R) 64 tracker #96 */
1450#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1451                    va_list *ap
1452#else
1453                    va_list ap
1454#endif
1455                    ) {
1456  void **argv;
1457  int i;
1458  int master_tid;
1459  int master_this_cons;
1460  kmp_team_t *team;
1461  kmp_team_t *parent_team;
1462  kmp_info_t *master_th;
1463  kmp_root_t *root;
1464  int nthreads;
1465  int master_active;
1466  int master_set_numthreads;
1467  int level;
1468#if OMP_40_ENABLED
1469  int active_level;
1470  int teams_level;
1471#endif
1472#if KMP_NESTED_HOT_TEAMS
1473  kmp_hot_team_ptr_t **p_hot_teams;
1474#endif
1475  { // KMP_TIME_BLOCK
1476    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1477    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1478
1479    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1480    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1481      /* Some systems prefer the stack for the root thread(s) to start with */
1482      /* some gap from the parent stack to prevent false sharing. */
1483      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1484      /* These 2 lines below are so this does not get optimized out */
1485      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1486        __kmp_stkpadding += (short)((kmp_int64)dummy);
1487    }
1488
1489    /* initialize if needed */
1490    KMP_DEBUG_ASSERT(
1491        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1492    if (!TCR_4(__kmp_init_parallel))
1493      __kmp_parallel_initialize();
1494
1495    /* setup current data */
1496    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1497    // shutdown
1498    parent_team = master_th->th.th_team;
1499    master_tid = master_th->th.th_info.ds.ds_tid;
1500    master_this_cons = master_th->th.th_local.this_construct;
1501    root = master_th->th.th_root;
1502    master_active = root->r.r_active;
1503    master_set_numthreads = master_th->th.th_set_nproc;
1504
1505#if OMPT_SUPPORT
1506    ompt_data_t ompt_parallel_data = ompt_data_none;
1507    ompt_data_t *parent_task_data;
1508    ompt_frame_t *ompt_frame;
1509    ompt_data_t *implicit_task_data;
1510    void *return_address = NULL;
1511
1512    if (ompt_enabled.enabled) {
1513      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1514                                    NULL, NULL);
1515      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1516    }
1517#endif
1518
1519    // Nested level will be an index in the nested nthreads array
1520    level = parent_team->t.t_level;
1521    // used to launch non-serial teams even if nested is not allowed
1522    active_level = parent_team->t.t_active_level;
1523#if OMP_40_ENABLED
1524    // needed to check nesting inside the teams
1525    teams_level = master_th->th.th_teams_level;
1526#endif
1527#if KMP_NESTED_HOT_TEAMS
1528    p_hot_teams = &master_th->th.th_hot_teams;
1529    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1530      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1531          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1532      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1533      // this is either the actual hot team, or it is not needed (when active_level > 0)
1534      (*p_hot_teams)[0].hot_team_nth = 1;
1535    }
1536#endif
1537
1538#if OMPT_SUPPORT
1539    if (ompt_enabled.enabled) {
1540      if (ompt_enabled.ompt_callback_parallel_begin) {
1541        int team_size = master_set_numthreads
1542                            ? master_set_numthreads
1543                            : get__nproc_2(parent_team, master_tid);
1544        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1545            parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1546            OMPT_INVOKER(call_context), return_address);
1547      }
1548      master_th->th.ompt_thread_info.state = ompt_state_overhead;
1549    }
1550#endif
1551
1552    master_th->th.th_ident = loc;
1553
1554#if OMP_40_ENABLED
1555    if (master_th->th.th_teams_microtask && ap &&
1556        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1557      // AC: This is the start of a parallel region nested inside a teams construct.
1558      // The team is actual (hot); all workers are ready at the fork barrier.
1559      // No lock is needed to do a bit of team initialization, then release the workers.
1560      parent_team->t.t_ident = loc;
1561      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1562      parent_team->t.t_argc = argc;
1563      argv = (void **)parent_team->t.t_argv;
1564      for (i = argc - 1; i >= 0; --i)
1565/* TODO: revert workaround for Intel(R) 64 tracker #96 */
1566#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1567        *argv++ = va_arg(*ap, void *);
1568#else
1569        *argv++ = va_arg(ap, void *);
1570#endif
1571      // Increment our nested depth level, but do not increase serialization
1572      if (parent_team == master_th->th.th_serial_team) {
1573        // AC: we are in serialized parallel
1574        __kmpc_serialized_parallel(loc, gtid);
1575        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1576        // AC: needed so that enquiry functions work correctly;
1577        // will be restored at join time
1578        parent_team->t.t_serialized--;
1579#if OMPT_SUPPORT
1580        void *dummy;
1581        void **exit_runtime_p;
1582
1583        ompt_lw_taskteam_t lw_taskteam;
1584
1585        if (ompt_enabled.enabled) {
1586          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1587                                  &ompt_parallel_data, return_address);
1588          exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1589
1590          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1591          // don't use lw_taskteam after linking. content was swaped
1592
1593          /* OMPT implicit task begin */
1594          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1595          if (ompt_enabled.ompt_callback_implicit_task) {
1596            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1597                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1598                implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1599            OMPT_CUR_TASK_INFO(master_th)
1600                ->thread_num = __kmp_tid_from_gtid(gtid);
1601          }
1602
1603          /* OMPT state */
1604          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1605        } else {
1606          exit_runtime_p = &dummy;
1607        }
1608#endif
1609
1610        {
1611          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1612          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1613          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1614#if OMPT_SUPPORT
1615                                 ,
1616                                 exit_runtime_p
1617#endif
1618                                 );
1619        }
1620
1621#if OMPT_SUPPORT
1622        *exit_runtime_p = NULL;
1623        if (ompt_enabled.enabled) {
1624          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1625          if (ompt_enabled.ompt_callback_implicit_task) {
1626            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1627                ompt_scope_end, NULL, implicit_task_data, 1,
1628                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1629          }
1630          __ompt_lw_taskteam_unlink(master_th);
1631
1632          if (ompt_enabled.ompt_callback_parallel_end) {
1633            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1634                OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1635                OMPT_INVOKER(call_context), return_address);
1636          }
1637          master_th->th.ompt_thread_info.state = ompt_state_overhead;
1638        }
1639#endif
1640        return TRUE;
1641      }
1642
1643      parent_team->t.t_pkfn = microtask;
1644      parent_team->t.t_invoke = invoker;
1645      KMP_ATOMIC_INC(&root->r.r_in_parallel);
1646      parent_team->t.t_active_level++;
1647      parent_team->t.t_level++;
1648#if OMP_50_ENABLED
1649      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1650#endif
1651
1652      /* Change number of threads in the team if requested */
1653      if (master_set_numthreads) { // The parallel has num_threads clause
1654        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1655          // AC: can only reduce the number of threads dynamically, cannot increase
1656          kmp_info_t **other_threads = parent_team->t.t_threads;
1657          parent_team->t.t_nproc = master_set_numthreads;
1658          for (i = 0; i < master_set_numthreads; ++i) {
1659            other_threads[i]->th.th_team_nproc = master_set_numthreads;
1660          }
1661          // Keep extra threads hot in the team for possible next parallels
1662        }
1663        master_th->th.th_set_nproc = 0;
1664      }
1665
1666#if USE_DEBUGGER
1667      if (__kmp_debugging) { // Let debugger override number of threads.
1668        int nth = __kmp_omp_num_threads(loc);
1669        if (nth > 0) { // 0 means debugger doesn't want to change num threads
1670          master_set_numthreads = nth;
1671        }
1672      }
1673#endif
1674
1675      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1676                    "master_th=%p, gtid=%d\n",
1677                    root, parent_team, master_th, gtid));
1678      __kmp_internal_fork(loc, gtid, parent_team);
1679      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1680                    "master_th=%p, gtid=%d\n",
1681                    root, parent_team, master_th, gtid));
1682
1683      /* Invoke microtask for MASTER thread */
1684      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1685                    parent_team->t.t_id, parent_team->t.t_pkfn));
1686
1687      if (!parent_team->t.t_invoke(gtid)) {
1688        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1689      }
1690      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1691                    parent_team->t.t_id, parent_team->t.t_pkfn));
1692      KMP_MB(); /* Flush all pending memory write invalidates.  */
1693
1694      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1695
1696      return TRUE;
1697    } // Parallel closely nested in teams construct
1698#endif /* OMP_40_ENABLED */
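    /* Illustrative usage sketch (not part of the runtime): the block above
       handles a parallel region closely nested inside a host teams construct
       (microtask != __kmp_teams_master, level == teams_level). The hot team
       created for teams is reused, the arguments are copied, and the master
       invokes the microtask itself. Assuming a compiler with host teams
       support (OpenMP 5.0) that lowers onto this library, a program such as
       the following is expected to exercise this path:

         #include <omp.h>
         #include <stdio.h>

         int main(void) {
           #pragma omp teams num_teams(2) thread_limit(4)
           #pragma omp parallel num_threads(2)
           {
             printf("team %d of %d, thread %d of %d\n",
                    omp_get_team_num(), omp_get_num_teams(),
                    omp_get_thread_num(), omp_get_num_threads());
           }
           return 0;
         }

       The actual number of teams and threads may be reduced depending on the
       available resources and ICV settings. */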
1699
1700#if KMP_DEBUG
1701    if (__kmp_tasking_mode != tskm_immediate_exec) {
1702      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1703                       parent_team->t.t_task_team[master_th->th.th_task_state]);
1704    }
1705#endif
1706
1707    if (parent_team->t.t_active_level >=
1708        master_th->th.th_current_task->td_icvs.max_active_levels) {
1709      nthreads = 1;
1710    } else {
1711#if OMP_40_ENABLED
1712      int enter_teams = ((ap == NULL && active_level == 0) ||
1713                         (ap && teams_level > 0 && teams_level == level));
1714#endif
1715      nthreads =
1716          master_set_numthreads
1717              ? master_set_numthreads
1718              : get__nproc_2(
1719                    parent_team,
1720                    master_tid); // TODO: get nproc directly from current task
1721
1722      // Check whether we need to take the forkjoin lock (no need for a serialized
1723      // parallel outside a teams construct). This code was moved here from
1724      // __kmp_reserve_threads() to speed up nested serialized parallels.
1725      if (nthreads > 1) {
1726        if ((!get__nested(master_th) && (root->r.r_in_parallel
1727#if OMP_40_ENABLED
1728                                         && !enter_teams
1729#endif /* OMP_40_ENABLED */
1730                                         )) ||
1731            (__kmp_library == library_serial)) {
1732          KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1733                        " threads\n",
1734                        gtid, nthreads));
1735          nthreads = 1;
1736        }
1737      }
1738      if (nthreads > 1) {
1739        /* determine how many new threads we can use */
1740        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1741        nthreads = __kmp_reserve_threads(
1742            root, parent_team, master_tid, nthreads
1743#if OMP_40_ENABLED
1744            /* AC: If we execute teams from a parallel region (on the host), then
1745               teams should be created, but each can only have 1 thread if
1746               nesting is disabled. If teams is called from a serial region, then
1747               teams and their threads should be created regardless of the
1748               nesting setting. */
1749            ,
1750            enter_teams
1751#endif /* OMP_40_ENABLED */
1752            );
1753        if (nthreads == 1) {
1754          // Release the lock for single-threaded execution here; for multi-threaded
1755          // execution it will be released later, after the team of threads has been
1756          // created and initialized
1757          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1758        }
1759      }
1760    }
1761    KMP_DEBUG_ASSERT(nthreads > 0);
1762
1763    // If we temporarily changed the set number of threads then restore it now
1764    master_th->th.th_set_nproc = 0;
1765
1766    /* create a serialized parallel region? */
1767    if (nthreads == 1) {
1768/* josh todo: hypothetical question: what do we do for OS X*? */
1769#if KMP_OS_LINUX &&                                                            \
1770    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1771      void *args[argc];
1772#else
1773      void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1774#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1775          KMP_ARCH_AARCH64) */
1776
1777      KA_TRACE(20,
1778               ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1779
1780      __kmpc_serialized_parallel(loc, gtid);
1781
1782      if (call_context == fork_context_intel) {
1783        /* TODO this sucks, use the compiler itself to pass args! :) */
1784        master_th->th.th_serial_team->t.t_ident = loc;
1785#if OMP_40_ENABLED
1786        if (!ap) {
1787          // revert change made in __kmpc_serialized_parallel()
1788          master_th->th.th_serial_team->t.t_level--;
1789// Get args from parent team for teams construct
1790
1791#if OMPT_SUPPORT
1792          void *dummy;
1793          void **exit_runtime_p;
1794          ompt_task_info_t *task_info;
1795
1796          ompt_lw_taskteam_t lw_taskteam;
1797
1798          if (ompt_enabled.enabled) {
1799            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1800                                    &ompt_parallel_data, return_address);
1801
1802            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1803            // don't use lw_taskteam after linking. content was swaped
1804
1805            task_info = OMPT_CUR_TASK_INFO(master_th);
1806            exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1807            if (ompt_enabled.ompt_callback_implicit_task) {
1808              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1809                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1810                  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1811              OMPT_CUR_TASK_INFO(master_th)
1812                  ->thread_num = __kmp_tid_from_gtid(gtid);
1813            }
1814
1815            /* OMPT state */
1816            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1817          } else {
1818            exit_runtime_p = &dummy;
1819          }
1820#endif
1821
1822          {
1823            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1824            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1825            __kmp_invoke_microtask(microtask, gtid, 0, argc,
1826                                   parent_team->t.t_argv
1827#if OMPT_SUPPORT
1828                                   ,
1829                                   exit_runtime_p
1830#endif
1831                                   );
1832          }
1833
1834#if OMPT_SUPPORT
1835          if (ompt_enabled.enabled) {
1836            *exit_runtime_p = NULL;
1837            if (ompt_enabled.ompt_callback_implicit_task) {
1838              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1839                  ompt_scope_end, NULL, &(task_info->task_data), 1,
1840                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1841            }
1842
1843            __ompt_lw_taskteam_unlink(master_th);
1844            if (ompt_enabled.ompt_callback_parallel_end) {
1845              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1846                  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1847                  OMPT_INVOKER(call_context), return_address);
1848            }
1849            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1850          }
1851#endif
1852        } else if (microtask == (microtask_t)__kmp_teams_master) {
1853          KMP_DEBUG_ASSERT(master_th->th.th_team ==
1854                           master_th->th.th_serial_team);
1855          team = master_th->th.th_team;
1856          // team->t.t_pkfn = microtask;
1857          team->t.t_invoke = invoker;
1858          __kmp_alloc_argv_entries(argc, team, TRUE);
1859          team->t.t_argc = argc;
1860          argv = (void **)team->t.t_argv;
1861          if (ap) {
1862            for (i = argc - 1; i >= 0; --i)
1863// TODO: revert workaround for Intel(R) 64 tracker #96
1864#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1865              *argv++ = va_arg(*ap, void *);
1866#else
1867              *argv++ = va_arg(ap, void *);
1868#endif
1869          } else {
1870            for (i = 0; i < argc; ++i)
1871              // Get args from parent team for teams construct
1872              argv[i] = parent_team->t.t_argv[i];
1873          }
1874          // AC: revert change made in __kmpc_serialized_parallel()
1875          //     because initial code in teams should have level=0
1876          team->t.t_level--;
1877          // AC: call special invoker for outer "parallel" of teams construct
1878          invoker(gtid);
1879        } else {
1880#endif /* OMP_40_ENABLED */
1881          argv = args;
1882          for (i = argc - 1; i >= 0; --i)
1883// TODO: revert workaround for Intel(R) 64 tracker #96
1884#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1885            *argv++ = va_arg(*ap, void *);
1886#else
1887          *argv++ = va_arg(ap, void *);
1888#endif
1889          KMP_MB();
1890
1891#if OMPT_SUPPORT
1892          void *dummy;
1893          void **exit_runtime_p;
1894          ompt_task_info_t *task_info;
1895
1896          ompt_lw_taskteam_t lw_taskteam;
1897
1898          if (ompt_enabled.enabled) {
1899            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1900                                    &ompt_parallel_data, return_address);
1901            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1902            // don't use lw_taskteam after linking. content was swaped
1903            task_info = OMPT_CUR_TASK_INFO(master_th);
1904            exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1905
1906            /* OMPT implicit task begin */
1907            implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1908            if (ompt_enabled.ompt_callback_implicit_task) {
1909              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1910                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1911                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1912              OMPT_CUR_TASK_INFO(master_th)
1913                  ->thread_num = __kmp_tid_from_gtid(gtid);
1914            }
1915
1916            /* OMPT state */
1917            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1918          } else {
1919            exit_runtime_p = &dummy;
1920          }
1921#endif
1922
1923          {
1924            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1925            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1926            __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1927#if OMPT_SUPPORT
1928                                   ,
1929                                   exit_runtime_p
1930#endif
1931                                   );
1932          }
1933
1934#if OMPT_SUPPORT
1935          if (ompt_enabled.enabled) {
1936            *exit_runtime_p = NULL;
1937            if (ompt_enabled.ompt_callback_implicit_task) {
1938              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1939                  ompt_scope_end, NULL, &(task_info->task_data), 1,
1940                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1941            }
1942
1943            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1944            __ompt_lw_taskteam_unlink(master_th);
1945            if (ompt_enabled.ompt_callback_parallel_end) {
1946              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1947                  &ompt_parallel_data, parent_task_data,
1948                  OMPT_INVOKER(call_context), return_address);
1949            }
1950            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1951          }
1952#endif
1953#if OMP_40_ENABLED
1954        }
1955#endif /* OMP_40_ENABLED */
1956      } else if (call_context == fork_context_gnu) {
1957#if OMPT_SUPPORT
1958        ompt_lw_taskteam_t lwt;
1959        __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1960                                return_address);
1961
1962        lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1963        __ompt_lw_taskteam_link(&lwt, master_th, 1);
1964// don't use lw_taskteam after linking. content was swaped
1965#endif
1966
1967        // we were called from GNU native code
1968        KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1969        return FALSE;
1970      } else {
1971        KMP_ASSERT2(call_context < fork_context_last,
1972                    "__kmp_fork_call: unknown fork_context parameter");
1973      }
1974
1975      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1976      KMP_MB();
1977      return FALSE;
1978    } // if (nthreads == 1)
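    /* Illustrative usage sketch (not part of the runtime): when execution
       reaches this point with nthreads > 1, __kmp_reserve_threads has already
       decided how many threads can actually be granted (thread limit, dynamic
       adjustment, pool size). Assuming the thread-limit-var is honored as the
       cap, the user-visible effect looks like this when run with, for
       example, OMP_THREAD_LIMIT=4:

         #include <omp.h>
         #include <stdio.h>

         int main(void) {
           #pragma omp parallel num_threads(16)
           {
             #pragma omp master
             printf("granted %d of 16 requested threads (limit %d)\n",
                    omp_get_num_threads(), omp_get_thread_limit());
           }
           return 0;
         }

       With the limit set to 4, the region is expected to run with a team of
       at most 4 threads rather than the 16 requested. */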
1979
1980    // GEH: only modify the executing flag in the case when not serialized;
1981    //      the serialized case is handled in __kmpc_serialized_parallel
1982    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1983                  "curtask=%p, curtask_max_aclevel=%d\n",
1984                  parent_team->t.t_active_level, master_th,
1985                  master_th->th.th_current_task,
1986                  master_th->th.th_current_task->td_icvs.max_active_levels));
1987    // TODO: GEH - cannot do this assertion because root thread not set up as
1988    // executing
1989    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1990    master_th->th.th_current_task->td_flags.executing = 0;
1991
1992#if OMP_40_ENABLED
1993    if (!master_th->th.th_teams_microtask || level > teams_level)
1994#endif /* OMP_40_ENABLED */
1995    {
1996      /* Increment our nested depth level */
1997      KMP_ATOMIC_INC(&root->r.r_in_parallel);
1998    }
1999
2000    // See if we need to make a copy of the ICVs.
2001    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2002    if ((level + 1 < __kmp_nested_nth.used) &&
2003        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2004      nthreads_icv = __kmp_nested_nth.nth[level + 1];
2005    } else {
2006      nthreads_icv = 0; // don't update
2007    }
2008
2009#if OMP_40_ENABLED
2010    // Figure out the proc_bind_policy for the new team.
2011    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2012    kmp_proc_bind_t proc_bind_icv =
2013        proc_bind_default; // proc_bind_default means don't update
2014    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2015      proc_bind = proc_bind_false;
2016    } else {
2017      if (proc_bind == proc_bind_default) {
2018        // No proc_bind clause specified; use current proc-bind-var for this
2019        // parallel region
2020        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2021      }
2022      /* else: The proc_bind policy was specified explicitly on parallel clause.
2023         This overrides proc-bind-var for this parallel region, but does not
2024         change proc-bind-var. */
2025      // Figure the value of proc-bind-var for the child threads.
2026      if ((level + 1 < __kmp_nested_proc_bind.used) &&
2027          (__kmp_nested_proc_bind.bind_types[level + 1] !=
2028           master_th->th.th_current_task->td_icvs.proc_bind)) {
2029        proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2030      }
2031    }
2032
2033    // Reset for next parallel region
2034    master_th->th.th_set_proc_bind = proc_bind_default;
2035#endif /* OMP_40_ENABLED */
2036
2037    if ((nthreads_icv > 0)
2038#if OMP_40_ENABLED
2039        || (proc_bind_icv != proc_bind_default)
2040#endif /* OMP_40_ENABLED */
2041            ) {
2042      kmp_internal_control_t new_icvs;
2043      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2044      new_icvs.next = NULL;
2045      if (nthreads_icv > 0) {
2046        new_icvs.nproc = nthreads_icv;
2047      }
2048
2049#if OMP_40_ENABLED
2050      if (proc_bind_icv != proc_bind_default) {
2051        new_icvs.proc_bind = proc_bind_icv;
2052      }
2053#endif /* OMP_40_ENABLED */
2054
2055      /* allocate a new parallel team */
2056      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057      team = __kmp_allocate_team(root, nthreads, nthreads,
2058#if OMPT_SUPPORT
2059                                 ompt_parallel_data,
2060#endif
2061#if OMP_40_ENABLED
2062                                 proc_bind,
2063#endif
2064                                 &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2065    } else {
2066      /* allocate a new parallel team */
2067      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2068      team = __kmp_allocate_team(root, nthreads, nthreads,
2069#if OMPT_SUPPORT
2070                                 ompt_parallel_data,
2071#endif
2072#if OMP_40_ENABLED
2073                                 proc_bind,
2074#endif
2075                                 &master_th->th.th_current_task->td_icvs,
2076                                 argc USE_NESTED_HOT_ARG(master_th));
2077    }
2078    KF_TRACE(
2079        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2080
2081    /* setup the new team */
2082    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2083    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2084    KMP_CHECK_UPDATE(team->t.t_ident, loc);
2085    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2086    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2087#if OMPT_SUPPORT
2088    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2089                          return_address);
2090#endif
2091    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2092// TODO: parent_team->t.t_level == INT_MAX ???
2093#if OMP_40_ENABLED
2094    if (!master_th->th.th_teams_microtask || level > teams_level) {
2095#endif /* OMP_40_ENABLED */
2096      int new_level = parent_team->t.t_level + 1;
2097      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2098      new_level = parent_team->t.t_active_level + 1;
2099      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2100#if OMP_40_ENABLED
2101    } else {
2102      // AC: Do not increase parallel level at start of the teams construct
2103      int new_level = parent_team->t.t_level;
2104      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2105      new_level = parent_team->t.t_active_level;
2106      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2107    }
2108#endif /* OMP_40_ENABLED */
2109    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2110    // set master's schedule as new run-time schedule
2111    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2112
2113#if OMP_40_ENABLED
2114    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2115#endif
2116#if OMP_50_ENABLED
2117    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2118#endif
2119
2120    // Update the floating point rounding in the team if required.
2121    propagateFPControl(team);
2122
2123    if (__kmp_tasking_mode != tskm_immediate_exec) {
2124      // Set master's task team to the team's task team. Unless this is a hot
2125      // team, it should be NULL.
2126      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2127                       parent_team->t.t_task_team[master_th->th.th_task_state]);
2128      KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2129                    "%p, new task_team %p / team %p\n",
2130                    __kmp_gtid_from_thread(master_th),
2131                    master_th->th.th_task_team, parent_team,
2132                    team->t.t_task_team[master_th->th.th_task_state], team));
2133
2134      if (active_level || master_th->th.th_task_team) {
2135        // Take a memo of master's task_state
2136        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2137        if (master_th->th.th_task_state_top >=
2138            master_th->th.th_task_state_stack_sz) { // increase size
2139          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2140          kmp_uint8 *old_stack, *new_stack;
2141          kmp_uint32 i;
2142          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2143          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2144            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2145          }
2146          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2147               ++i) { // zero-init rest of stack
2148            new_stack[i] = 0;
2149          }
2150          old_stack = master_th->th.th_task_state_memo_stack;
2151          master_th->th.th_task_state_memo_stack = new_stack;
2152          master_th->th.th_task_state_stack_sz = new_size;
2153          __kmp_free(old_stack);
2154        }
2155        // Store master's task_state on stack
2156        master_th->th
2157            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2158            master_th->th.th_task_state;
2159        master_th->th.th_task_state_top++;
2160#if KMP_NESTED_HOT_TEAMS
2161        if (master_th->th.th_hot_teams &&
2162            active_level < __kmp_hot_teams_max_level &&
2163            team == master_th->th.th_hot_teams[active_level].hot_team) {
2164          // Restore master's nested state if nested hot team
2165          master_th->th.th_task_state =
2166              master_th->th
2167                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2168        } else {
2169#endif
2170          master_th->th.th_task_state = 0;
2171#if KMP_NESTED_HOT_TEAMS
2172        }
2173#endif
2174      }
2175#if !KMP_NESTED_HOT_TEAMS
2176      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2177                       (team == root->r.r_hot_team));
2178#endif
2179    }
2180
2181    KA_TRACE(
2182        20,
2183        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2184         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2185         team->t.t_nproc));
2186    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2187                     (team->t.t_master_tid == 0 &&
2188                      (team->t.t_parent == root->r.r_root_team ||
2189                       team->t.t_parent->t.t_serialized)));
2190    KMP_MB();
2191
2192    /* now, setup the arguments */
2193    argv = (void **)team->t.t_argv;
2194#if OMP_40_ENABLED
2195    if (ap) {
2196#endif /* OMP_40_ENABLED */
2197      for (i = argc - 1; i >= 0; --i) {
2198// TODO: revert workaround for Intel(R) 64 tracker #96
2199#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2200        void *new_argv = va_arg(*ap, void *);
2201#else
2202      void *new_argv = va_arg(ap, void *);
2203#endif
2204        KMP_CHECK_UPDATE(*argv, new_argv);
2205        argv++;
2206      }
2207#if OMP_40_ENABLED
2208    } else {
2209      for (i = 0; i < argc; ++i) {
2210        // Get args from parent team for teams construct
2211        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2212      }
2213    }
2214#endif /* OMP_40_ENABLED */
2215
2216    /* now actually fork the threads */
2217    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2218    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2219      root->r.r_active = TRUE;
2220
2221    __kmp_fork_team_threads(root, team, master_th, gtid);
2222    __kmp_setup_icv_copy(team, nthreads,
2223                         &master_th->th.th_current_task->td_icvs, loc);
2224
2225#if OMPT_SUPPORT
2226    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2227#endif
2228
2229    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2230
2231#if USE_ITT_BUILD
2232    if (team->t.t_active_level == 1 // only report frames at level 1
2233#if OMP_40_ENABLED
2234        && !master_th->th.th_teams_microtask // not in teams construct
2235#endif /* OMP_40_ENABLED */
2236        ) {
2237#if USE_ITT_NOTIFY
2238      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2239          (__kmp_forkjoin_frames_mode == 3 ||
2240           __kmp_forkjoin_frames_mode == 1)) {
2241        kmp_uint64 tmp_time = 0;
2242        if (__itt_get_timestamp_ptr)
2243          tmp_time = __itt_get_timestamp();
2244        // Internal fork - report frame begin
2245        master_th->th.th_frame_time = tmp_time;
2246        if (__kmp_forkjoin_frames_mode == 3)
2247          team->t.t_region_time = tmp_time;
2248      } else
2249// only one notification scheme (either "submit" or "forking/joined", not both)
2250#endif /* USE_ITT_NOTIFY */
2251          if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2252              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2253        // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2254        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2255      }
2256    }
2257#endif /* USE_ITT_BUILD */
2258
2259    /* now go on and do the work */
2260    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2261    KMP_MB();
2262    KF_TRACE(10,
2263             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2264              root, team, master_th, gtid));
2265
2266#if USE_ITT_BUILD
2267    if (__itt_stack_caller_create_ptr) {
2268      team->t.t_stack_id =
2269          __kmp_itt_stack_caller_create(); // create new stack stitching id
2270      // before entering fork barrier
2271    }
2272#endif /* USE_ITT_BUILD */
2273
2274#if OMP_40_ENABLED
2275    // AC: skip __kmp_internal_fork at teams construct, let only master
2276    // threads execute
2277    if (ap)
2278#endif /* OMP_40_ENABLED */
2279    {
2280      __kmp_internal_fork(loc, gtid, team);
2281      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2282                    "master_th=%p, gtid=%d\n",
2283                    root, team, master_th, gtid));
2284    }
2285
2286    if (call_context == fork_context_gnu) {
2287      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2288      return TRUE;
2289    }
2290
2291    /* Invoke microtask for MASTER thread */
2292    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2293                  team->t.t_id, team->t.t_pkfn));
2294  } // END of timer KMP_fork_call block
2295
2296  if (!team->t.t_invoke(gtid)) {
2297    KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2298  }
2299  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2300                team->t.t_id, team->t.t_pkfn));
2301  KMP_MB(); /* Flush all pending memory write invalidates.  */
2302
2303  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2304
2305#if OMPT_SUPPORT
2306  if (ompt_enabled.enabled) {
2307    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2308  }
2309#endif
2310
2311  return TRUE;
2312}
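/* Illustrative usage sketch (not part of the runtime): __kmp_fork_call above
   consults the nested nthreads list (__kmp_nested_nth) and the nested
   proc-bind list / proc_bind clause at level + 1 when deciding whether the
   new team needs its own copy of the ICVs. Assuming the usual environment
   variable spellings, the nested behaviour can be observed with a run such as

     OMP_NESTED=true OMP_NUM_THREADS=4,2 OMP_PROC_BIND=spread,close ./a.out

   for a program like:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp parallel                    // level 1: 4 threads, spread
       {
         #pragma omp parallel proc_bind(close) // level 2: 2 threads, close
         {
           #pragma omp master
           printf("level %d, team size %d\n", omp_get_level(),
                  omp_get_num_threads());
         }
       }
       return 0;
     }

   The exact team sizes and placements depend on the machine, the affinity
   settings and the remaining ICVs. */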
2313
2314#if OMPT_SUPPORT
2315static inline void __kmp_join_restore_state(kmp_info_t *thread,
2316                                            kmp_team_t *team) {
2317  // restore state outside the region
2318  thread->th.ompt_thread_info.state =
2319      ((team->t.t_serialized) ? ompt_state_work_serial
2320                              : ompt_state_work_parallel);
2321}
2322
2323static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2324                                   kmp_team_t *team, ompt_data_t *parallel_data,
2325                                   fork_context_e fork_context, void *codeptr) {
2326  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2327  if (ompt_enabled.ompt_callback_parallel_end) {
2328    ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2329        parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2330        codeptr);
2331  }
2332
2333  task_info->frame.enter_frame = ompt_data_none;
2334  __kmp_join_restore_state(thread, team);
2335}
2336#endif
2337
2338void __kmp_join_call(ident_t *loc, int gtid
2339#if OMPT_SUPPORT
2340                     ,
2341                     enum fork_context_e fork_context
2342#endif
2343#if OMP_40_ENABLED
2344                     ,
2345                     int exit_teams
2346#endif /* OMP_40_ENABLED */
2347                     ) {
2348  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2349  kmp_team_t *team;
2350  kmp_team_t *parent_team;
2351  kmp_info_t *master_th;
2352  kmp_root_t *root;
2353  int master_active;
2354  int i;
2355
2356  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2357
2358  /* setup current data */
2359  master_th = __kmp_threads[gtid];
2360  root = master_th->th.th_root;
2361  team = master_th->th.th_team;
2362  parent_team = team->t.t_parent;
2363
2364  master_th->th.th_ident = loc;
2365
2366#if OMPT_SUPPORT
2367  if (ompt_enabled.enabled) {
2368    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2369  }
2370#endif
2371
2372#if KMP_DEBUG
2373  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2374    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2375                  "th_task_team = %p\n",
2376                  __kmp_gtid_from_thread(master_th), team,
2377                  team->t.t_task_team[master_th->th.th_task_state],
2378                  master_th->th.th_task_team));
2379    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2380                     team->t.t_task_team[master_th->th.th_task_state]);
2381  }
2382#endif
2383
2384  if (team->t.t_serialized) {
2385#if OMP_40_ENABLED
2386    if (master_th->th.th_teams_microtask) {
2387      // We are in teams construct
2388      int level = team->t.t_level;
2389      int tlevel = master_th->th.th_teams_level;
2390      if (level == tlevel) {
2391        // AC: we haven't incremented it earlier at start of teams construct,
2392        //     so do it here - at the end of teams construct
2393        team->t.t_level++;
2394      } else if (level == tlevel + 1) {
2395        // AC: we are exiting parallel inside teams, need to increment
2396        // serialization in order to restore it in the next call to
2397        // __kmpc_end_serialized_parallel
2398        team->t.t_serialized++;
2399      }
2400    }
2401#endif /* OMP_40_ENABLED */
2402    __kmpc_end_serialized_parallel(loc, gtid);
2403
2404#if OMPT_SUPPORT
2405    if (ompt_enabled.enabled) {
2406      __kmp_join_restore_state(master_th, parent_team);
2407    }
2408#endif
2409
2410    return;
2411  }
2412
2413  master_active = team->t.t_master_active;
2414
2415#if OMP_40_ENABLED
2416  if (!exit_teams)
2417#endif /* OMP_40_ENABLED */
2418  {
2419    // AC: No barrier for internal teams at exit from the teams construct.
2420    //     But there is a barrier for the external team (the league).
2421    __kmp_internal_join(loc, gtid, team);
2422  }
2423#if OMP_40_ENABLED
2424  else {
2425    master_th->th.th_task_state =
2426        0; // AC: no tasking in teams (out of any parallel)
2427  }
2428#endif /* OMP_40_ENABLED */
2429
2430  KMP_MB();
2431
2432#if OMPT_SUPPORT
2433  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2434  void *codeptr = team->t.ompt_team_info.master_return_address;
2435#endif
2436
2437#if USE_ITT_BUILD
2438  if (__itt_stack_caller_create_ptr) {
2439    __kmp_itt_stack_caller_destroy(
2440        (__itt_caller)team->t
2441            .t_stack_id); // destroy the stack stitching id after join barrier
2442  }
2443
2444  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2445  if (team->t.t_active_level == 1
2446#if OMP_40_ENABLED
2447      && !master_th->th.th_teams_microtask /* not in teams construct */
2448#endif /* OMP_40_ENABLED */
2449      ) {
2450    master_th->th.th_ident = loc;
2451    // only one notification scheme (either "submit" or "forking/joined", not
2452    // both)
2453    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2454        __kmp_forkjoin_frames_mode == 3)
2455      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2456                             master_th->th.th_frame_time, 0, loc,
2457                             master_th->th.th_team_nproc, 1);
2458    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2459             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2460      __kmp_itt_region_joined(gtid);
2461  } // active_level == 1
2462#endif /* USE_ITT_BUILD */
2463
2464#if OMP_40_ENABLED
2465  if (master_th->th.th_teams_microtask && !exit_teams &&
2466      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2467      team->t.t_level == master_th->th.th_teams_level + 1) {
2468    // AC: We need to leave the team structure intact at the end of a parallel
2469    // inside the teams construct, so that the next parallel reuses the same
2470    // (hot) team; only adjust the nesting levels
2471
2472    /* Decrement our nested depth level */
2473    team->t.t_level--;
2474    team->t.t_active_level--;
2475    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2476
2477    /* Restore number of threads in the team if needed */
2478    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2479      int old_num = master_th->th.th_team_nproc;
2480      int new_num = master_th->th.th_teams_size.nth;
2481      kmp_info_t **other_threads = team->t.t_threads;
2482      team->t.t_nproc = new_num;
2483      for (i = 0; i < old_num; ++i) {
2484        other_threads[i]->th.th_team_nproc = new_num;
2485      }
2486      // Adjust states of non-used threads of the team
2487      for (i = old_num; i < new_num; ++i) {
2488        // Re-initialize thread's barrier data.
2489        int b;
2490        kmp_balign_t *balign = other_threads[i]->th.th_bar;
2491        for (b = 0; b < bs_last_barrier; ++b) {
2492          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2493          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2494#if USE_DEBUGGER
2495          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2496#endif
2497        }
2498        if (__kmp_tasking_mode != tskm_immediate_exec) {
2499          // Synchronize thread's task state
2500          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2501        }
2502      }
2503    }
2504
2505#if OMPT_SUPPORT
2506    if (ompt_enabled.enabled) {
2507      __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2508                      codeptr);
2509    }
2510#endif
2511
2512    return;
2513  }
2514#endif /* OMP_40_ENABLED */
2515
2516  /* do cleanup and restore the parent team */
2517  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2518  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2519
2520  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2521
2522  /* jc: The following lock has instructions with REL and ACQ semantics,
2523     separating the parallel user code called in this parallel region
2524     from the serial user code called after this function returns. */
2525  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2526
2527#if OMP_40_ENABLED
2528  if (!master_th->th.th_teams_microtask ||
2529      team->t.t_level > master_th->th.th_teams_level)
2530#endif /* OMP_40_ENABLED */
2531  {
2532    /* Decrement our nested depth level */
2533    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2534  }
2535  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2536
2537#if OMPT_SUPPORT
2538  if (ompt_enabled.enabled) {
2539    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2540    if (ompt_enabled.ompt_callback_implicit_task) {
2541      int ompt_team_size = team->t.t_nproc;
2542      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2543          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2544          OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2545    }
2546
2547    task_info->frame.exit_frame = ompt_data_none;
2548    task_info->task_data = ompt_data_none;
2549  }
2550#endif
2551
2552  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2553                master_th, team));
2554  __kmp_pop_current_task_from_thread(master_th);
2555
2556#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2557  // Restore master thread's partition.
2558  master_th->th.th_first_place = team->t.t_first_place;
2559  master_th->th.th_last_place = team->t.t_last_place;
2560#endif /* OMP_40_ENABLED */
2561#if OMP_50_ENABLED
2562  master_th->th.th_def_allocator = team->t.t_def_allocator;
2563#endif
2564
2565  updateHWFPControl(team);
2566
2567  if (root->r.r_active != master_active)
2568    root->r.r_active = master_active;
2569
2570  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2571                            master_th)); // this will free worker threads
2572
2573  /* this race was fun to find. make sure the following is in the critical
2574     region otherwise assertions may fail occasionally since the old team may be
2575     reallocated and the hierarchy appears inconsistent. it is actually safe to
2576     run and won't cause any bugs, but will cause those assertion failures. it's
2577     only one deref&assign so might as well put this in the critical region */
2578  master_th->th.th_team = parent_team;
2579  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2580  master_th->th.th_team_master = parent_team->t.t_threads[0];
2581  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2582
2583  /* restore serialized team, if need be */
2584  if (parent_team->t.t_serialized &&
2585      parent_team != master_th->th.th_serial_team &&
2586      parent_team != root->r.r_root_team) {
2587    __kmp_free_team(root,
2588                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2589    master_th->th.th_serial_team = parent_team;
2590  }
2591
2592  if (__kmp_tasking_mode != tskm_immediate_exec) {
2593    if (master_th->th.th_task_state_top >
2594        0) { // Restore task state from memo stack
2595      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2596      // Remember master's state if we re-use this nested hot team
2597      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2598          master_th->th.th_task_state;
2599      --master_th->th.th_task_state_top; // pop
2600      // Now restore state at this level
2601      master_th->th.th_task_state =
2602          master_th->th
2603              .th_task_state_memo_stack[master_th->th.th_task_state_top];
2604    }
2605    // Copy the task team from the parent team to the master thread
2606    master_th->th.th_task_team =
2607        parent_team->t.t_task_team[master_th->th.th_task_state];
2608    KA_TRACE(20,
2609             ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2610              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2611              parent_team));
2612  }
2613
2614  // TODO: GEH - cannot do this assertion because root thread not set up as
2615  // executing
2616  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2617  master_th->th.th_current_task->td_flags.executing = 1;
2618
2619  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2620
2621#if OMPT_SUPPORT
2622  if (ompt_enabled.enabled) {
2623    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2624                    codeptr);
2625  }
2626#endif
2627
2628  KMP_MB();
2629  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2630}
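/* Illustrative usage sketch (not part of the runtime): after the join barrier,
   __kmp_join_call above frees (or parks) the worker threads, pops the
   master's task state, and restores the parent team, so the thread that
   encountered the parallel construct continues as if it had never left its
   enclosing team. Assuming a standard OpenMP compiler targeting this library:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp parallel num_threads(4)
       {
         // parallel work
       }
       // after the implicit join the initial thread is back on the root team
       printf("after join: level=%d, team size=%d\n", omp_get_level(),
              omp_get_num_threads()); // expected: level=0, team size=1
       #pragma omp parallel num_threads(4)
       {
         // a second region typically reuses the same (hot) team of workers
       }
       return 0;
     }
*/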
2631
2632/* Check whether we should push an internal control record onto the
2633   serial team stack.  If so, do it.  */
2634void __kmp_save_internal_controls(kmp_info_t *thread) {
2635
2636  if (thread->th.th_team != thread->th.th_serial_team) {
2637    return;
2638  }
2639  if (thread->th.th_team->t.t_serialized > 1) {
2640    int push = 0;
2641
2642    if (thread->th.th_team->t.t_control_stack_top == NULL) {
2643      push = 1;
2644    } else {
2645      if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2646          thread->th.th_team->t.t_serialized) {
2647        push = 1;
2648      }
2649    }
2650    if (push) { /* push a record on the serial team's stack */
2651      kmp_internal_control_t *control =
2652          (kmp_internal_control_t *)__kmp_allocate(
2653              sizeof(kmp_internal_control_t));
2654
2655      copy_icvs(control, &thread->th.th_current_task->td_icvs);
2656
2657      control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2658
2659      control->next = thread->th.th_team->t.t_control_stack_top;
2660      thread->th.th_team->t.t_control_stack_top = control;
2661    }
2662  }
2663}
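/* Illustrative usage sketch (not part of the runtime): the control records
   pushed above are what allow __kmpc_end_serialized_parallel to restore the
   ICVs of the enclosing level, so ICV changes made inside a serialized region
   do not leak out of it. A minimal example of the user-visible behaviour,
   assuming a standard OpenMP compiler:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_num_threads(4);
       #pragma omp parallel num_threads(1) // runs serialized: a team of one
       {
         omp_set_num_threads(8); // affects only this nesting level
         printf("inside:  omp_get_max_threads() = %d\n",
                omp_get_max_threads()); // expected: 8
       }
       printf("outside: omp_get_max_threads() = %d\n",
              omp_get_max_threads()); // expected: 4 again
       return 0;
     }
*/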
2664
2665/* Changes set_nproc */
2666void __kmp_set_num_threads(int new_nth, int gtid) {
2667  kmp_info_t *thread;
2668  kmp_root_t *root;
2669
2670  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2671  KMP_DEBUG_ASSERT(__kmp_init_serial);
2672
2673  if (new_nth < 1)
2674    new_nth = 1;
2675  else if (new_nth > __kmp_max_nth)
2676    new_nth = __kmp_max_nth;
2677
2678  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2679  thread = __kmp_threads[gtid];
2680  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2681    return; // nothing to do
2682
2683  __kmp_save_internal_controls(thread);
2684
2685  set__nproc(thread, new_nth);
2686
2687  // If this omp_set_num_threads() call will cause the hot team size to be
2688  // reduced (in the absence of a num_threads clause), then reduce it now,
2689  // rather than waiting for the next parallel region.
2690  root = thread->th.th_root;
2691  if (__kmp_init_parallel && (!root->r.r_active) &&
2692      (root->r.r_hot_team->t.t_nproc > new_nth)
2693#if KMP_NESTED_HOT_TEAMS
2694      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2695#endif
2696      ) {
2697    kmp_team_t *hot_team = root->r.r_hot_team;
2698    int f;
2699
2700    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2701
2702    // Release the extra threads we don't need any more.
2703    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2704      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2705      if (__kmp_tasking_mode != tskm_immediate_exec) {
2706        // When decreasing team size, threads no longer in the team should
2707        // release their reference to the task team.
2708        hot_team->t.t_threads[f]->th.th_task_team = NULL;
2709      }
2710      __kmp_free_thread(hot_team->t.t_threads[f]);
2711      hot_team->t.t_threads[f] = NULL;
2712    }
2713    hot_team->t.t_nproc = new_nth;
2714#if KMP_NESTED_HOT_TEAMS
2715    if (thread->th.th_hot_teams) {
2716      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2717      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2718    }
2719#endif
2720
2721    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2722
2723    // Update the t_nproc field in the threads that are still active.
2724    for (f = 0; f < new_nth; f++) {
2725      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2726      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2727    }
2728    // Special flag to mark that omp_set_num_threads() was called
2729    hot_team->t.t_size_changed = -1;
2730  }
2731}
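/* Illustrative usage sketch (not part of the runtime): __kmp_set_num_threads
   above backs omp_set_num_threads(); when the requested value is smaller than
   the current hot team and no parallel region is active, the hot team is
   trimmed immediately instead of waiting for the next fork. Assuming a
   standard OpenMP compiler targeting this library:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_num_threads(8);
       #pragma omp parallel
       {
         // first region: a hot team of (up to) 8 threads is created
       }
       omp_set_num_threads(2); // hot team is shrunk to 2 threads right here
       #pragma omp parallel
       {
         #pragma omp master
         printf("second region team size: %d\n", omp_get_num_threads());
       }
       return 0;
     }

   The second region is expected to print a team size of 2. */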
2732
2733/* Changes max_active_levels */
2734void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2735  kmp_info_t *thread;
2736
2737  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2738                "%d = (%d)\n",
2739                gtid, max_active_levels));
2740  KMP_DEBUG_ASSERT(__kmp_init_serial);
2741
2742  // validate max_active_levels
2743  if (max_active_levels < 0) {
2744    KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2745    // We ignore this call if the user has specified a negative value.
2746    // The current setting won't be changed. The last valid setting will be
2747    // used. A warning will be issued (if warnings are allowed as controlled by
2748    // the KMP_WARNINGS env var).
2749    KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2750                  "max_active_levels for thread %d = (%d)\n",
2751                  gtid, max_active_levels));
2752    return;
2753  }
2754  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2755    // it's OK, the max_active_levels is within the valid range: [ 0;
2756    // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2757    // We allow a zero value. (implementation defined behavior)
2758  } else {
2759    KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2760                KMP_MAX_ACTIVE_LEVELS_LIMIT);
2761    max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2762    // Current upper limit is MAX_INT. (implementation defined behavior)
2763    // If the input exceeds the upper limit, we correct the input to be the
2764    // upper limit. (implementation defined behavior)
2765    // In practice, control never reaches this branch while the limit is MAX_INT.
2766  }
2767  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2768                "max_active_levels for thread %d = (%d)\n",
2769                gtid, max_active_levels));
2770
2771  thread = __kmp_threads[gtid];
2772
2773  __kmp_save_internal_controls(thread);
2774
2775  set__max_active_levels(thread, max_active_levels);
2776}
2777
2778/* Gets max_active_levels */
2779int __kmp_get_max_active_levels(int gtid) {
2780  kmp_info_t *thread;
2781
2782  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2783  KMP_DEBUG_ASSERT(__kmp_init_serial);
2784
2785  thread = __kmp_threads[gtid];
2786  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2787  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2788                "curtask_maxaclevel=%d\n",
2789                gtid, thread->th.th_current_task,
2790                thread->th.th_current_task->td_icvs.max_active_levels));
2791  return thread->th.th_current_task->td_icvs.max_active_levels;
2792}
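/* Illustrative usage sketch (not part of the runtime): the two routines above
   back omp_set_max_active_levels() / omp_get_max_active_levels(). Negative
   values are ignored with a warning and values above
   KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped. A minimal example, assuming a
   standard OpenMP compiler targeting this library:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_nested(1);            // allow nested active regions
       omp_set_max_active_levels(2); // at most two active levels
       printf("%d\n", omp_get_max_active_levels()); // 2

       omp_set_max_active_levels(-3); // ignored with a warning
       printf("%d\n", omp_get_max_active_levels()); // still 2

       #pragma omp parallel num_threads(2)
       #pragma omp parallel num_threads(2)
       #pragma omp parallel num_threads(2) // third level: serialized
       if (omp_get_thread_num() == 0)
         printf("level %d has %d thread(s)\n", omp_get_level(),
                omp_get_num_threads());
       return 0;
     }
*/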
2793
2794/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2795void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2796  kmp_info_t *thread;
2797  //    kmp_team_t *team;
2798
2799  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2800                gtid, (int)kind, chunk));
2801  KMP_DEBUG_ASSERT(__kmp_init_serial);
2802
2803  // Check if the kind parameter is valid, correct if needed.
2804  // Valid parameters should fit in one of two intervals - standard or extended:
2805  //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2806  // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2807  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2808      (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2809    // TODO: Hint needs attention in case we change the default schedule.
2810    __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2811              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2812              __kmp_msg_null);
2813    kind = kmp_sched_default;
2814    chunk = 0; // ignore chunk value in case of bad kind
2815  }
2816
2817  thread = __kmp_threads[gtid];
2818
2819  __kmp_save_internal_controls(thread);
2820
2821  if (kind < kmp_sched_upper_std) {
2822    if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2823      // differentiate static chunked vs. unchunked: an invalid chunk value
2824      // indicates the unchunked schedule (which is the default)
2825      thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2826    } else {
2827      thread->th.th_current_task->td_icvs.sched.r_sched_type =
2828          __kmp_sch_map[kind - kmp_sched_lower - 1];
2829    }
2830  } else {
2831    //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2832    //    kmp_sched_lower - 2 ];
2833    thread->th.th_current_task->td_icvs.sched.r_sched_type =
2834        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2835                      kmp_sched_lower - 2];
2836  }
2837  if (kind == kmp_sched_auto || chunk < 1) {
2838    // ignore parameter chunk for schedule auto
2839    thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2840  } else {
2841    thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2842  }
2843}
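
// Illustrative usage sketch (not part of the runtime). Assuming this routine
// backs the standard omp_set_schedule() entry point and that omp_sched_t
// values correspond to kmp_sched_t as in the comment near the top of the
// routine:
//
//   omp_set_schedule(omp_sched_static, 0);  // chunk < 1 -> unchunked static
//   omp_set_schedule(omp_sched_static, 4);  // -> chunked static, chunk 4
//   omp_set_schedule(omp_sched_auto, 16);   // chunk ignored for auto,
//                                           // KMP_DEFAULT_CHUNK is stored
//   omp_set_schedule((omp_sched_t)999, 4);  // out of range: warning issued,
//                                           // default schedule used instead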
2844
2845/* Gets def_sched_var ICV values */
2846void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2847  kmp_info_t *thread;
2848  enum sched_type th_type;
2849
2850  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2851  KMP_DEBUG_ASSERT(__kmp_init_serial);
2852
2853  thread = __kmp_threads[gtid];
2854
2855  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2856
2857  switch (th_type) {
2858  case kmp_sch_static:
2859  case kmp_sch_static_greedy:
2860  case kmp_sch_static_balanced:
2861    *kind = kmp_sched_static;
2862    *chunk = 0; // chunk was not set; signal this fact with a zero value
2863    return;
2864  case kmp_sch_static_chunked:
2865    *kind = kmp_sched_static;
2866    break;
2867  case kmp_sch_dynamic_chunked:
2868    *kind = kmp_sched_dynamic;
2869    break;
2870  case kmp_sch_guided_chunked:
2871  case kmp_sch_guided_iterative_chunked:
2872  case kmp_sch_guided_analytical_chunked:
2873    *kind = kmp_sched_guided;
2874    break;
2875  case kmp_sch_auto:
2876    *kind = kmp_sched_auto;
2877    break;
2878  case kmp_sch_trapezoidal:
2879    *kind = kmp_sched_trapezoidal;
2880    break;
2881#if KMP_STATIC_STEAL_ENABLED
2882  case kmp_sch_static_steal:
2883    *kind = kmp_sched_static_steal;
2884    break;
2885#endif
2886  default:
2887    KMP_FATAL(UnknownSchedulingType, th_type);
2888  }
2889
2890  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2891}
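
// Note (illustrative only): the switch above collapses internal schedule kinds
// back to the user-visible ones, e.g. kmp_sch_static_greedy and
// kmp_sch_static_balanced both report (kmp_sched_static, chunk == 0), while
// the guided variants all report kmp_sched_guided with the stored chunk value.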
2892
2893int __kmp_get_ancestor_thread_num(int gtid, int level) {
2894
2895  int ii, dd;
2896  kmp_team_t *team;
2897  kmp_info_t *thr;
2898
2899  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2900  KMP_DEBUG_ASSERT(__kmp_init_serial);
2901
2902  // validate level
2903  if (level == 0)
2904    return 0;
2905  if (level < 0)
2906    return -1;
2907  thr = __kmp_threads[gtid];
2908  team = thr->th.th_team;
2909  ii = team->t.t_level;
2910  if (level > ii)
2911    return -1;
2912
2913#if OMP_40_ENABLED
2914  if (thr->th.th_teams_microtask) {
2915    // AC: we are in a teams region where nested teams share the same level
2916    int tlevel = thr->th.th_teams_level; // the level of the teams construct
2917    if (level <=
2918        tlevel) { // otherwise usual algorithm works (will not touch the teams)
2919      KMP_DEBUG_ASSERT(ii >= tlevel);
2920      // AC: Since we need to step past the teams league, we artificially
2921      // increase ii
2922      if (ii == tlevel) {
2923        ii += 2; // three teams have same level
2924      } else {
2925        ii++; // two teams have same level
2926      }
2927    }
2928  }
2929#endif
2930
2931  if (ii == level)
2932    return __kmp_tid_from_gtid(gtid);
2933
2934  dd = team->t.t_serialized;
2935  level++;
2936  while (ii > level) {
2937    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2938    }
2939    if ((team->t.t_serialized) && (!dd)) {
2940      team = team->t.t_parent;
2941      continue;
2942    }
2943    if (ii > level) {
2944      team = team->t.t_parent;
2945      dd = team->t.t_serialized;
2946      ii--;
2947    }
2948  }
2949
2950  return (dd > 1) ? (0) : (team->t.t_master_tid);
2951}
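
// Illustrative sketch (not part of the runtime). Assuming this routine backs
// omp_get_ancestor_thread_num(), a worker in a two-level nest (with nested
// parallelism enabled) would see:
//
//   #pragma omp parallel num_threads(2)     // level 1
//   #pragma omp parallel num_threads(3)     // level 2
//   {
//     omp_get_ancestor_thread_num(0);  // 0: the initial (implicit) task
//     omp_get_ancestor_thread_num(2);  // this thread's id in the inner team
//     omp_get_ancestor_thread_num(1);  // the enclosing outer thread's id
//     omp_get_ancestor_thread_num(3);  // -1: deeper than the current nest
//     omp_get_ancestor_thread_num(-1); // -1: negative levels are rejected
//   }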
2952
2953int __kmp_get_team_size(int gtid, int level) {
2954
2955  int ii, dd;
2956  kmp_team_t *team;
2957  kmp_info_t *thr;
2958
2959  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2960  KMP_DEBUG_ASSERT(__kmp_init_serial);
2961
2962  // validate level
2963  if (level == 0)
2964    return 1;
2965  if (level < 0)
2966    return -1;
2967  thr = __kmp_threads[gtid];
2968  team = thr->th.th_team;
2969  ii = team->t.t_level;
2970  if (level > ii)
2971    return -1;
2972
2973#if OMP_40_ENABLED
2974  if (thr->th.th_teams_microtask) {
2975    // AC: we are in a teams region where nested teams share the same level
2976    int tlevel = thr->th.th_teams_level; // the level of the teams construct
2977    if (level <=
2978        tlevel) { // otherwise usual algorithm works (will not touch the teams)
2979      KMP_DEBUG_ASSERT(ii >= tlevel);
2980      // AC: Since we need to step past the teams league, we artificially
2981      // increase ii
2982      if (ii == tlevel) {
2983        ii += 2; // three teams have same level
2984      } else {
2985        ii++; // two teams have same level
2986      }
2987    }
2988  }
2989#endif
2990
2991  while (ii > level) {
2992    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2993    }
2994    if (team->t.t_serialized && (!dd)) {
2995      team = team->t.t_parent;
2996      continue;
2997    }
2998    if (ii > level) {
2999      team = team->t.t_parent;
3000      ii--;
3001    }
3002  }
3003
3004  return team->t.t_nproc;
3005}
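
// Illustrative sketch (not part of the runtime). Assuming this routine backs
// omp_get_team_size(), the same two-level nest as above (2 outer threads, 3
// inner threads) reports team_size(0) == 1, team_size(1) == 2,
// team_size(2) == 3, and -1 for negative levels or levels deeper than the
// current nest.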
3006
3007kmp_r_sched_t __kmp_get_schedule_global() {
3008  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
3009  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3010  // independently, so the up-to-date schedule has to be assembled here.
3011
3012  kmp_r_sched_t r_sched;
3013
3014  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3015  // __kmp_guided. __kmp_sched should keep its original value, so the user can
3016  // set KMP_SCHEDULE multiple times and thus get different run-time schedules
3017  // in different roots (even in OMP 2.5)
3018  if (__kmp_sched == kmp_sch_static) {
3019    // replace STATIC with more detailed schedule (balanced or greedy)
3020    r_sched.r_sched_type = __kmp_static;
3021  } else if (__kmp_sched == kmp_sch_guided_chunked) {
3022    // replace GUIDED with more detailed schedule (iterative or analytical)
3023    r_sched.r_sched_type = __kmp_guided;
3024  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3025    r_sched.r_sched_type = __kmp_sched;
3026  }
3027
3028  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3029    // __kmp_chunk may be wrong here (if it was never set)
3030    r_sched.chunk = KMP_DEFAULT_CHUNK;
3031  } else {
3032    r_sched.chunk = __kmp_chunk;
3033  }
3034
3035  return r_sched;
3036}
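
// Worked example (illustrative only): with KMP_SCHEDULE selecting a plain
// static schedule, __kmp_sched == kmp_sch_static here, so r_sched_type becomes
// whichever detailed kind __kmp_static currently holds (balanced or greedy);
// if __kmp_chunk was never set it may fall below KMP_DEFAULT_CHUNK, in which
// case the default chunk is reported instead.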
3037
3038/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3039   at least argc number of *t_argv entries for the requested team. */
3040static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3041
3042  KMP_DEBUG_ASSERT(team);
3043  if (!realloc || argc > team->t.t_max_argc) {
3044
3045    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3046                   "current entries=%d\n",
3047                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3048    /* if previously allocated heap space for args, free them */
3049    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3050      __kmp_free((void *)team->t.t_argv);
3051
3052    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3053      /* use unused space in the cache line for arguments */
3054      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3055      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3056                     "argv entries\n",
3057                     team->t.t_id, team->t.t_max_argc));
3058      team->t.t_argv = &team->t.t_inline_argv[0];
3059      if (__kmp_storage_map) {
3060        __kmp_print_storage_map_gtid(
3061            -1, &team->t.t_inline_argv[0],
3062            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3063            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3064            team->t.t_id);
3065      }
3066    } else {
3067      /* allocate space for arguments in the heap */
3068      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3069                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
3070                               : 2 * argc;
3071      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3072                     "argv entries\n",
3073                     team->t.t_id, team->t.t_max_argc));
3074      team->t.t_argv =
3075          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3076      if (__kmp_storage_map) {
3077        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3078                                     &team->t.t_argv[team->t.t_max_argc],
3079                                     sizeof(void *) * team->t.t_max_argc,
3080                                     "team_%d.t_argv", team->t.t_id);
3081      }
3082    }
3083  }
3084}
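
// Worked example (illustrative only) of the sizing policy above:
//   argc <= KMP_INLINE_ARGV_ENTRIES          -> use t_inline_argv, no heap
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2  -> allocate
//                                               KMP_MIN_MALLOC_ARGV_ENTRIES
//   larger argc                              -> allocate 2 * argc entries
// With realloc == TRUE nothing is reallocated until argc exceeds the current
// t_max_argc, so repeated calls with small argc reuse the existing storage.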
3085
3086static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3087  int i;
3088  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3089  team->t.t_threads =
3090      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3091  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3092      sizeof(dispatch_shared_info_t) * num_disp_buff);
3093  team->t.t_dispatch =
3094      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3095  team->t.t_implicit_task_taskdata =
3096      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3097  team->t.t_max_nproc = max_nth;
3098
3099  /* setup dispatch buffers */
3100  for (i = 0; i < num_disp_buff; ++i) {
3101    team->t.t_disp_buffer[i].buffer_index = i;
3102#if OMP_45_ENABLED
3103    team->t.t_disp_buffer[i].doacross_buf_idx = i;
3104#endif
3105  }
3106}
3107
3108static void __kmp_free_team_arrays(kmp_team_t *team) {
3109  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3110  int i;
3111  for (i = 0; i < team->t.t_max_nproc; ++i) {
3112    if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3113      __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3114      team->t.t_dispatch[i].th_disp_buffer = NULL;
3115    }
3116  }
3117#if KMP_USE_HIER_SCHED
3118  __kmp_dispatch_free_hierarchies(team);
3119#endif
3120  __kmp_free(team->t.t_threads);
3121  __kmp_free(team->t.t_disp_buffer);
3122  __kmp_free(team->t.t_dispatch);
3123  __kmp_free(team->t.t_implicit_task_taskdata);
3124  team->t.t_threads = NULL;
3125  team->t.t_disp_buffer = NULL;
3126  team->t.t_dispatch = NULL;
3127  team->t.t_implicit_task_taskdata = 0;
3128}
3129
3130static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3131  kmp_info_t **oldThreads = team->t.t_threads;
3132
3133  __kmp_free(team->t.t_disp_buffer);
3134  __kmp_free(team->t.t_dispatch);
3135  __kmp_free(team->t.t_implicit_task_taskdata);
3136  __kmp_allocate_team_arrays(team, max_nth);
3137
3138  KMP_MEMCPY(team->t.t_threads, oldThreads,
3139             team->t.t_nproc * sizeof(kmp_info_t *));
3140
3141  __kmp_free(oldThreads);
3142}
3143
3144static kmp_internal_control_t __kmp_get_global_icvs(void) {
3145
3146  kmp_r_sched_t r_sched =
3147      __kmp_get_schedule_global(); // get current state of scheduling globals
3148
3149#if OMP_40_ENABLED
3150  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3151#endif /* OMP_40_ENABLED */
3152
3153  kmp_internal_control_t g_icvs = {
3154    0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3155    (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3156    // for nested parallelism (per thread)
3157    (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3158    // adjustment of threads (per thread)
3159    (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3160    // whether blocktime is explicitly set
3161    __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3162#if KMP_USE_MONITOR
3163    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3164// intervals
3165#endif
3166    __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3167    // next parallel region (per thread)
3168    // (use a maximum upper bound if __kmp_parallel_initialize not called yet)
3169    __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3170    // for max_active_levels
3171    r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3172// {sched,chunk} pair
3173#if OMP_40_ENABLED
3174    __kmp_nested_proc_bind.bind_types[0],
3175    __kmp_default_device,
3176#endif /* OMP_40_ENABLED */
3177    NULL // struct kmp_internal_control *next;
3178  };
3179
3180  return g_icvs;
3181}
3182
3183static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3184
3185  kmp_internal_control_t gx_icvs;
3186  gx_icvs.serial_nesting_level =
3187      0; // probably =team->t.t_serial like in save_inter_controls
3188  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3189  gx_icvs.next = NULL;
3190
3191  return gx_icvs;
3192}
3193
3194static void __kmp_initialize_root(kmp_root_t *root) {
3195  int f;
3196  kmp_team_t *root_team;
3197  kmp_team_t *hot_team;
3198  int hot_team_max_nth;
3199  kmp_r_sched_t r_sched =
3200      __kmp_get_schedule_global(); // get current state of scheduling globals
3201  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3202  KMP_DEBUG_ASSERT(root);
3203  KMP_ASSERT(!root->r.r_begin);
3204
3205  /* setup the root state structure */
3206  __kmp_init_lock(&root->r.r_begin_lock);
3207  root->r.r_begin = FALSE;
3208  root->r.r_active = FALSE;
3209  root->r.r_in_parallel = 0;
3210  root->r.r_blocktime = __kmp_dflt_blocktime;
3211  root->r.r_nested = __kmp_dflt_nested;
3212  root->r.r_cg_nthreads = 1;
3213
3214  /* setup the root team for this task */
3215  /* allocate the root team structure */
3216  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3217
3218  root_team =
3219      __kmp_allocate_team(root,
3220                          1, // new_nproc
3221                          1, // max_nproc
3222#if OMPT_SUPPORT
3223                          ompt_data_none, // root parallel id
3224#endif
3225#if OMP_40_ENABLED
3226                          __kmp_nested_proc_bind.bind_types[0],
3227#endif
3228                          &r_icvs,
3229                          0 // argc
3230                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3231                          );
3232#if USE_DEBUGGER
3233  // Non-NULL value should be assigned to make the debugger display the root
3234  // team.
3235  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3236#endif
3237
3238  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3239
3240  root->r.r_root_team = root_team;
3241  root_team->t.t_control_stack_top = NULL;
3242
3243  /* initialize root team */
3244  root_team->t.t_threads[0] = NULL;
3245  root_team->t.t_nproc = 1;
3246  root_team->t.t_serialized = 1;
3247  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3248  root_team->t.t_sched.sched = r_sched.sched;
3249  KA_TRACE(
3250      20,
3251      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3252       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3253
3254  /* setup the  hot team for this task */
3255  /* allocate the hot team structure */
3256  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3257
3258  hot_team =
3259      __kmp_allocate_team(root,
3260                          1, // new_nproc
3261                          __kmp_dflt_team_nth_ub * 2, // max_nproc
3262#if OMPT_SUPPORT
3263                          ompt_data_none, // root parallel id
3264#endif
3265#if OMP_40_ENABLED
3266                          __kmp_nested_proc_bind.bind_types[0],
3267#endif
3268                          &r_icvs,
3269                          0 // argc
3270                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3271                          );
3272  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3273
3274  root->r.r_hot_team = hot_team;
3275  root_team->t.t_control_stack_top = NULL;
3276
3277  /* first-time initialization */
3278  hot_team->t.t_parent = root_team;
3279
3280  /* initialize hot team */
3281  hot_team_max_nth = hot_team->t.t_max_nproc;
3282  for (f = 0; f < hot_team_max_nth; ++f) {
3283    hot_team->t.t_threads[f] = NULL;
3284  }
3285  hot_team->t.t_nproc = 1;
3286  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3287  hot_team->t.t_sched.sched = r_sched.sched;
3288  hot_team->t.t_size_changed = 0;
3289}
3290
3291#ifdef KMP_DEBUG
3292
3293typedef struct kmp_team_list_item {
3294  kmp_team_p const *entry;
3295  struct kmp_team_list_item *next;
3296} kmp_team_list_item_t;
3297typedef kmp_team_list_item_t *kmp_team_list_t;
3298
3299static void __kmp_print_structure_team_accum( // Add team to list of teams.
3300    kmp_team_list_t list, // List of teams.
3301    kmp_team_p const *team // Team to add.
3302    ) {
3303
3304  // List must terminate with item where both entry and next are NULL.
3305  // Team is added to the list only once.
3306  // List is sorted in ascending order by team id.
3307  // Team id is *not* a key.
3308
3309  kmp_team_list_t l;
3310
3311  KMP_DEBUG_ASSERT(list != NULL);
3312  if (team == NULL) {
3313    return;
3314  }
3315
3316  __kmp_print_structure_team_accum(list, team->t.t_parent);
3317  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3318
3319  // Search list for the team.
3320  l = list;
3321  while (l->next != NULL && l->entry != team) {
3322    l = l->next;
3323  }
3324  if (l->next != NULL) {
3325    return; // Team has been added before, exit.
3326  }
3327
3328  // Team is not found. Search list again for insertion point.
3329  l = list;
3330  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3331    l = l->next;
3332  }
3333
3334  // Insert team.
3335  {
3336    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3337        sizeof(kmp_team_list_item_t));
3338    *item = *l;
3339    l->entry = team;
3340    l->next = item;
3341  }
3342}
3343
3344static void __kmp_print_structure_team(char const *title,
3345                                       kmp_team_p const *team) {
3347  __kmp_printf("%s", title);
3348  if (team != NULL) {
3349    __kmp_printf("%2x %p\n", team->t.t_id, team);
3350  } else {
3351    __kmp_printf(" - (nil)\n");
3352  }
3353}
3354
3355static void __kmp_print_structure_thread(char const *title,
3356                                         kmp_info_p const *thread) {
3357  __kmp_printf("%s", title);
3358  if (thread != NULL) {
3359    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3360  } else {
3361    __kmp_printf(" - (nil)\n");
3362  }
3363}
3364
3365void __kmp_print_structure(void) {
3366
3367  kmp_team_list_t list;
3368
3369  // Initialize list of teams.
3370  list =
3371      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3372  list->entry = NULL;
3373  list->next = NULL;
3374
3375  __kmp_printf("\n------------------------------\nGlobal Thread "
3376               "Table\n------------------------------\n");
3377  {
3378    int gtid;
3379    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3380      __kmp_printf("%2d", gtid);
3381      if (__kmp_threads != NULL) {
3382        __kmp_printf(" %p", __kmp_threads[gtid]);
3383      }
3384      if (__kmp_root != NULL) {
3385        __kmp_printf(" %p", __kmp_root[gtid]);
3386      }
3387      __kmp_printf("\n");
3388    }
3389  }
3390
3391  // Print out __kmp_threads array.
3392  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3393               "----------\n");
3394  if (__kmp_threads != NULL) {
3395    int gtid;
3396    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3397      kmp_info_t const *thread = __kmp_threads[gtid];
3398      if (thread != NULL) {
3399        __kmp_printf("GTID %2d %p:\n", gtid, thread);
3400        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3401        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3402        __kmp_print_structure_team("    Serial Team:  ",
3403                                   thread->th.th_serial_team);
3404        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3405        __kmp_print_structure_thread("    Master:       ",
3406                                     thread->th.th_team_master);
3407        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3408        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3409#if OMP_40_ENABLED
3410        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3411#endif
3412        __kmp_print_structure_thread("    Next in pool: ",
3413                                     thread->th.th_next_pool);
3414        __kmp_printf("\n");
3415        __kmp_print_structure_team_accum(list, thread->th.th_team);
3416        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3417      }
3418    }
3419  } else {
3420    __kmp_printf("Threads array is not allocated.\n");
3421  }
3422
3423  // Print out __kmp_root array.
3424  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3425               "--------\n");
3426  if (__kmp_root != NULL) {
3427    int gtid;
3428    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3429      kmp_root_t const *root = __kmp_root[gtid];
3430      if (root != NULL) {
3431        __kmp_printf("GTID %2d %p:\n", gtid, root);
3432        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3433        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3434        __kmp_print_structure_thread("    Uber Thread:  ",
3435                                     root->r.r_uber_thread);
3436        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3437        __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3438        __kmp_printf("    In Parallel:  %2d\n",
3439                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3440        __kmp_printf("\n");
3441        __kmp_print_structure_team_accum(list, root->r.r_root_team);
3442        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3443      }
3444    }
3445  } else {
3446    __kmp_printf("Ubers array is not allocated.\n");
3447  }
3448
3449  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3450               "--------\n");
3451  while (list->next != NULL) {
3452    kmp_team_p const *team = list->entry;
3453    int i;
3454    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3455    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3456    __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3457    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3458    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3459    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3460    for (i = 0; i < team->t.t_nproc; ++i) {
3461      __kmp_printf("    Thread %2d:      ", i);
3462      __kmp_print_structure_thread("", team->t.t_threads[i]);
3463    }
3464    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3465    __kmp_printf("\n");
3466    list = list->next;
3467  }
3468
3469  // Print out __kmp_thread_pool and __kmp_team_pool.
3470  __kmp_printf("\n------------------------------\nPools\n----------------------"
3471               "--------\n");
3472  __kmp_print_structure_thread("Thread pool:          ",
3473                               CCAST(kmp_info_t *, __kmp_thread_pool));
3474  __kmp_print_structure_team("Team pool:            ",
3475                             CCAST(kmp_team_t *, __kmp_team_pool));
3476  __kmp_printf("\n");
3477
3478  // Free team list.
3479  while (list != NULL) {
3480    kmp_team_list_item_t *item = list;
3481    list = list->next;
3482    KMP_INTERNAL_FREE(item);
3483  }
3484}
3485
3486#endif
3487
3488//---------------------------------------------------------------------------
3489//  Stuff for per-thread fast random number generator
3490//  Table of primes
3491static const unsigned __kmp_primes[] = {
3492    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3493    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3494    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3495    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3496    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3497    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3498    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3499    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3500    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3501    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3502    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3503
3504//---------------------------------------------------------------------------
3505//  __kmp_get_random: Get a random number using a linear congruential method.
3506unsigned short __kmp_get_random(kmp_info_t *thread) {
3507  unsigned x = thread->th.th_x;
3508  unsigned short r = x >> 16;
3509
3510  thread->th.th_x = x * thread->th.th_a + 1;
3511
3512  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3513                thread->th.th_info.ds.ds_tid, r));
3514
3515  return r;
3516}
3517//--------------------------------------------------------
3518// __kmp_init_random: Initialize a random number generator
3519void __kmp_init_random(kmp_info_t *thread) {
3520  unsigned seed = thread->th.th_info.ds.ds_tid;
3521
3522  thread->th.th_a =
3523      __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3524  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3525  KA_TRACE(30,
3526           ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3527}
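
// Worked example (illustrative only): for the thread with ds_tid == 0 the seed
// is 0, so th_a = __kmp_primes[0] = 0x9e3779b1 and th_x = 1 * th_a + 1. Each
// __kmp_get_random call returns the top 16 bits of th_x and then advances the
// state by the linear congruential step x = x * a + 1 (mod 2^32 via unsigned
// wrap-around):
//
//   unsigned x = 0x9e3779b1u + 1;   // state right after __kmp_init_random
//   unsigned short r1 = x >> 16;    // first value returned
//   x = x * 0x9e3779b1u + 1;        // state after the first call
//   unsigned short r2 = x >> 16;    // second value returned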
3528
3529#if KMP_OS_WINDOWS
3530/* reclaim array entries for root threads that are already dead, returns number
3531 * reclaimed */
3532static int __kmp_reclaim_dead_roots(void) {
3533  int i, r = 0;
3534
3535  for (i = 0; i < __kmp_threads_capacity; ++i) {
3536    if (KMP_UBER_GTID(i) &&
3537        !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3538        !__kmp_root[i]
3539             ->r.r_active) { // AC: reclaim only roots died in non-active state
3540      r += __kmp_unregister_root_other_thread(i);
3541    }
3542  }
3543  return r;
3544}
3545#endif
3546
3547/* This function attempts to create free entries in __kmp_threads and
3548   __kmp_root, and returns the number of free entries generated.
3549
3550   For Windows* OS static library, the first mechanism used is to reclaim array
3551   entries for root threads that are already dead.
3552
3553   On all platforms, expansion is attempted on the arrays __kmp_threads and
3554   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3555   capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3556   threadprivate cache array has been created. Synchronization with
3557   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3558
3559   After any dead root reclamation, the arrays are expanded if the clipping
3560   value allows the expansion to yield a total of nNeed free slots. If it
3561   does not, nothing is done beyond the possible initial root thread
3562   reclamation.
3563
3564   If the argument is negative, the behavior is undefined. */
3565static int __kmp_expand_threads(int nNeed) {
3566  int added = 0;
3567  int minimumRequiredCapacity;
3568  int newCapacity;
3569  kmp_info_t **newThreads;
3570  kmp_root_t **newRoot;
3571
3572// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3573// resizing __kmp_threads does not need additional protection if foreign
3574// threads are present
3575
3576#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3577  /* only for Windows static library */
3578  /* reclaim array entries for root threads that are already dead */
3579  added = __kmp_reclaim_dead_roots();
3580
3581  if (nNeed) {
3582    nNeed -= added;
3583    if (nNeed < 0)
3584      nNeed = 0;
3585  }
3586#endif
3587  if (nNeed <= 0)
3588    return added;
3589
3590  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3591  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3592  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3593  // > __kmp_max_nth in one of two ways:
3594  //
3595  // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3596  //    may not be reused by another thread, so we may need to increase
3597  //    __kmp_threads_capacity to __kmp_max_nth + 1.
3598  //
3599  // 2) New foreign root(s) are encountered.  We always register new foreign
3600  //    roots. This may cause a smaller # of threads to be allocated at
3601  //    subsequent parallel regions, but the worker threads hang around (and
3602  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3603  //
3604  // Anyway, that is the reason for moving the check to see if
3605  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3606  // instead of having it performed here. -BB
3607
3608  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3609
3610  /* compute expansion headroom to check if we can expand */
3611  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3612    /* possible expansion too small -- give up */
3613    return added;
3614  }
3615  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3616
3617  newCapacity = __kmp_threads_capacity;
3618  do {
3619    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3620                                                          : __kmp_sys_max_nth;
3621  } while (newCapacity < minimumRequiredCapacity);
3622  newThreads = (kmp_info_t **)__kmp_allocate(
3623      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3624  newRoot =
3625      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3626  KMP_MEMCPY(newThreads, __kmp_threads,
3627             __kmp_threads_capacity * sizeof(kmp_info_t *));
3628  KMP_MEMCPY(newRoot, __kmp_root,
3629             __kmp_threads_capacity * sizeof(kmp_root_t *));
3630
3631  kmp_info_t **temp_threads = __kmp_threads;
3632  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3633  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3634  __kmp_free(temp_threads);
3635  added += newCapacity - __kmp_threads_capacity;
3636  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3637
3638  if (newCapacity > __kmp_tp_capacity) {
3639    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3640    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3641      __kmp_threadprivate_resize_cache(newCapacity);
3642    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3643      *(volatile int *)&__kmp_tp_capacity = newCapacity;
3644    }
3645    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3646  }
3647
3648  return added;
3649}
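
// Worked example (illustrative only) of the growth policy above: starting from
// __kmp_threads_capacity == 32 with nNeed == 5, minimumRequiredCapacity is 37
// and the do/while doubles 32 -> 64, so 32 new slots are reported as added.
// Doubling is clipped at __kmp_sys_max_nth, and if even the clipped expansion
// cannot provide nNeed free slots the function gives up without resizing
// (returning only whatever dead-root reclamation produced on Windows static
// builds).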
3650
3651/* Register the current thread as a root thread and obtain our gtid. We must
3652   have the __kmp_initz_lock held at this point. The argument is TRUE only if
3653   we are the thread that calls from __kmp_do_serial_initialize() */
3654int __kmp_register_root(int initial_thread) {
3655  kmp_info_t *root_thread;
3656  kmp_root_t *root;
3657  int gtid;
3658  int capacity;
3659  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3660  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3661  KMP_MB();
3662
3663  /* 2007-03-02:
3664     If the initial thread did not invoke the OpenMP RTL yet, and this thread
3665     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3666     condition does not work as expected -- it may return false (meaning there
3667     is at least one empty slot in the __kmp_threads array), while the only
3668     free slot may be #0, which is reserved for the initial thread and so
3669     cannot be used for this one. The following code works around this bug.
3670
3671     However, the right solution seems to be not to reserve slot #0 for the
3672     initial thread, because:
3673     (1) there is no magic in slot #0,
3674     (2) we cannot detect the initial thread reliably (the first thread that
3675        performs serial initialization may not be the real initial thread).
3676  */
3677  capacity = __kmp_threads_capacity;
3678  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3679    --capacity;
3680  }
3681
3682  /* see if there are too many threads */
3683  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3684    if (__kmp_tp_cached) {
3685      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3686                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3687                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3688    } else {
3689      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3690                  __kmp_msg_null);
3691    }
3692  }
3693
3694  /* find an available thread slot */
3695  /* Don't reassign the zero slot since we need that to only be used by initial
3696     thread */
3697  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3698       gtid++)
3699    ;
3700  KA_TRACE(1,
3701           ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3702  KMP_ASSERT(gtid < __kmp_threads_capacity);
3703
3704  /* update global accounting */
3705  __kmp_all_nth++;
3706  TCW_4(__kmp_nth, __kmp_nth + 1);
3707
3708  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3709  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3710  if (__kmp_adjust_gtid_mode) {
3711    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3712      if (TCR_4(__kmp_gtid_mode) != 2) {
3713        TCW_4(__kmp_gtid_mode, 2);
3714      }
3715    } else {
3716      if (TCR_4(__kmp_gtid_mode) != 1) {
3717        TCW_4(__kmp_gtid_mode, 1);
3718      }
3719    }
3720  }
3721
3722#ifdef KMP_ADJUST_BLOCKTIME
3723  /* Adjust blocktime to zero if necessary            */
3724  /* Middle initialization might not have occurred yet */
3725  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3726    if (__kmp_nth > __kmp_avail_proc) {
3727      __kmp_zero_bt = TRUE;
3728    }
3729  }
3730#endif /* KMP_ADJUST_BLOCKTIME */
3731
3732  /* setup this new hierarchy */
3733  if (!(root = __kmp_root[gtid])) {
3734    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3735    KMP_DEBUG_ASSERT(!root->r.r_root_team);
3736  }
3737
3738#if KMP_STATS_ENABLED
3739  // Initialize stats as soon as possible (right after gtid assignment).
3740  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3741  __kmp_stats_thread_ptr->startLife();
3742  KMP_SET_THREAD_STATE(SERIAL_REGION);
3743  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3744#endif
3745  __kmp_initialize_root(root);
3746
3747  /* setup new root thread structure */
3748  if (root->r.r_uber_thread) {
3749    root_thread = root->r.r_uber_thread;
3750  } else {
3751    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3752    if (__kmp_storage_map) {
3753      __kmp_print_thread_storage_map(root_thread, gtid);
3754    }
3755    root_thread->th.th_info.ds.ds_gtid = gtid;
3756#if OMPT_SUPPORT
3757    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3758#endif
3759    root_thread->th.th_root = root;
3760    if (__kmp_env_consistency_check) {
3761      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3762    }
3763#if USE_FAST_MEMORY
3764    __kmp_initialize_fast_memory(root_thread);
3765#endif /* USE_FAST_MEMORY */
3766
3767#if KMP_USE_BGET
3768    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3769    __kmp_initialize_bget(root_thread);
3770#endif
3771    __kmp_init_random(root_thread); // Initialize random number generator
3772  }
3773
3774  /* setup the serial team held in reserve by the root thread */
3775  if (!root_thread->th.th_serial_team) {
3776    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3777    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3778    root_thread->th.th_serial_team =
3779        __kmp_allocate_team(root, 1, 1,
3780#if OMPT_SUPPORT
3781                            ompt_data_none, // root parallel id
3782#endif
3783#if OMP_40_ENABLED
3784                            proc_bind_default,
3785#endif
3786                            &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3787  }
3788  KMP_ASSERT(root_thread->th.th_serial_team);
3789  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3790                root_thread->th.th_serial_team));
3791
3792  /* drop root_thread into place */
3793  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3794
3795  root->r.r_root_team->t.t_threads[0] = root_thread;
3796  root->r.r_hot_team->t.t_threads[0] = root_thread;
3797  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3798  // AC: the team is created in reserve, not for execution (unused for now).
3799  root_thread->th.th_serial_team->t.t_serialized = 0;
3800  root->r.r_uber_thread = root_thread;
3801
3802  /* initialize the thread, get it ready to go */
3803  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3804  TCW_4(__kmp_init_gtid, TRUE);
3805
3806  /* prepare the master thread for get_gtid() */
3807  __kmp_gtid_set_specific(gtid);
3808
3809#if USE_ITT_BUILD
3810  __kmp_itt_thread_name(gtid);
3811#endif /* USE_ITT_BUILD */
3812
3813#ifdef KMP_TDATA_GTID
3814  __kmp_gtid = gtid;
3815#endif
3816  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3817  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3818
3819  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3820                "plain=%u\n",
3821                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3822                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3823                KMP_INIT_BARRIER_STATE));
3824  { // Initialize barrier data.
3825    int b;
3826    for (b = 0; b < bs_last_barrier; ++b) {
3827      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3828#if USE_DEBUGGER
3829      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3830#endif
3831    }
3832  }
3833  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3834                   KMP_INIT_BARRIER_STATE);
3835
3836#if KMP_AFFINITY_SUPPORTED
3837#if OMP_40_ENABLED
3838  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3839  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3840  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3841  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3842#endif
3843  if (TCR_4(__kmp_init_middle)) {
3844    __kmp_affinity_set_init_mask(gtid, TRUE);
3845  }
3846#endif /* KMP_AFFINITY_SUPPORTED */
3847#if OMP_50_ENABLED
3848  root_thread->th.th_def_allocator = __kmp_def_allocator;
3849  root_thread->th.th_prev_level = 0;
3850  root_thread->th.th_prev_num_threads = 1;
3851#endif
3852
3853  __kmp_root_counter++;
3854
3855#if OMPT_SUPPORT
3856  if (!initial_thread && ompt_enabled.enabled) {
3857
3858    kmp_info_t *root_thread = ompt_get_thread();
3859
3860    ompt_set_thread_state(root_thread, ompt_state_overhead);
3861
3862    if (ompt_enabled.ompt_callback_thread_begin) {
3863      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3864          ompt_thread_initial, __ompt_get_thread_data_internal());
3865    }
3866    ompt_data_t *task_data;
3867    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3868    if (ompt_enabled.ompt_callback_task_create) {
3869      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3870          NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3871      // initial task has nothing to return to
3872    }
3873
3874    ompt_set_thread_state(root_thread, ompt_state_work_serial);
3875  }
3876#endif
3877
3878  KMP_MB();
3879  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3880
3881  return gtid;
3882}
3883
3884#if KMP_NESTED_HOT_TEAMS
3885static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3886                                const int max_level) {
3887  int i, n, nth;
3888  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3889  if (!hot_teams || !hot_teams[level].hot_team) {
3890    return 0;
3891  }
3892  KMP_DEBUG_ASSERT(level < max_level);
3893  kmp_team_t *team = hot_teams[level].hot_team;
3894  nth = hot_teams[level].hot_team_nth;
3895  n = nth - 1; // master is not freed
3896  if (level < max_level - 1) {
3897    for (i = 0; i < nth; ++i) {
3898      kmp_info_t *th = team->t.t_threads[i];
3899      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3900      if (i > 0 && th->th.th_hot_teams) {
3901        __kmp_free(th->th.th_hot_teams);
3902        th->th.th_hot_teams = NULL;
3903      }
3904    }
3905  }
3906  __kmp_free_team(root, team, NULL);
3907  return n;
3908}
3909#endif
3910
3911// Resets a root thread and clears its root and hot teams.
3912// Returns the number of __kmp_threads entries directly and indirectly freed.
3913static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3914  kmp_team_t *root_team = root->r.r_root_team;
3915  kmp_team_t *hot_team = root->r.r_hot_team;
3916  int n = hot_team->t.t_nproc;
3917  int i;
3918
3919  KMP_DEBUG_ASSERT(!root->r.r_active);
3920
3921  root->r.r_root_team = NULL;
3922  root->r.r_hot_team = NULL;
3923  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3924  // before call to __kmp_free_team().
3925  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3926#if KMP_NESTED_HOT_TEAMS
3927  if (__kmp_hot_teams_max_level >
3928      0) { // need to free nested hot teams and their threads if any
3929    for (i = 0; i < hot_team->t.t_nproc; ++i) {
3930      kmp_info_t *th = hot_team->t.t_threads[i];
3931      if (__kmp_hot_teams_max_level > 1) {
3932        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3933      }
3934      if (th->th.th_hot_teams) {
3935        __kmp_free(th->th.th_hot_teams);
3936        th->th.th_hot_teams = NULL;
3937      }
3938    }
3939  }
3940#endif
3941  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3942
3943  // Before we can reap the thread, we need to make certain that all other
3944  // threads in the teams that had this root as ancestor have stopped trying to
3945  // steal tasks.
3946  if (__kmp_tasking_mode != tskm_immediate_exec) {
3947    __kmp_wait_to_unref_task_teams();
3948  }
3949
3950#if KMP_OS_WINDOWS
3951  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3952  KA_TRACE(
3953      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3954           "\n",
3955           (LPVOID) & (root->r.r_uber_thread->th),
3956           root->r.r_uber_thread->th.th_info.ds.ds_thread));
3957  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3958#endif /* KMP_OS_WINDOWS */
3959
3960#if OMPT_SUPPORT
3961  if (ompt_enabled.ompt_callback_thread_end) {
3962    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3963        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3964  }
3965#endif
3966
3967  TCW_4(__kmp_nth,
3968        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3969  root->r.r_cg_nthreads--;
3970
3971  __kmp_reap_thread(root->r.r_uber_thread, 1);
3972
3973  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3974  // it instead of freeing it.
3975  root->r.r_uber_thread = NULL;
3976  /* mark root as no longer in use */
3977  root->r.r_begin = FALSE;
3978
3979  return n;
3980}
3981
3982void __kmp_unregister_root_current_thread(int gtid) {
3983  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3984  /* This lock should be OK, since unregister_root_current_thread is never
3985     called during an abort, only during a normal close. Furthermore, if you
3986     have the forkjoin lock, you should never try to get the initz lock. */
3987  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3988  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3989    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3990                  "exiting T#%d\n",
3991                  gtid));
3992    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3993    return;
3994  }
3995  kmp_root_t *root = __kmp_root[gtid];
3996
3997  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3998  KMP_ASSERT(KMP_UBER_GTID(gtid));
3999  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4000  KMP_ASSERT(root->r.r_active == FALSE);
4001
4002  KMP_MB();
4003
4004#if OMP_45_ENABLED
4005  kmp_info_t *thread = __kmp_threads[gtid];
4006  kmp_team_t *team = thread->th.th_team;
4007  kmp_task_team_t *task_team = thread->th.th_task_team;
4008
4009  // we need to wait for the proxy tasks before finishing the thread
4010  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4011#if OMPT_SUPPORT
4012    // the runtime is shutting down so we won't report any events
4013    thread->th.ompt_thread_info.state = ompt_state_undefined;
4014#endif
4015    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4016  }
4017#endif
4018
4019  __kmp_reset_root(gtid, root);
4020
4021  /* free up this thread slot */
4022  __kmp_gtid_set_specific(KMP_GTID_DNE);
4023#ifdef KMP_TDATA_GTID
4024  __kmp_gtid = KMP_GTID_DNE;
4025#endif
4026
4027  KMP_MB();
4028  KC_TRACE(10,
4029           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4030
4031  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4032}
4033
4034#if KMP_OS_WINDOWS
4035/* __kmp_forkjoin_lock must be already held
4036   Unregisters a root thread that is not the current thread.  Returns the number
4037   of __kmp_threads entries freed as a result. */
4038static int __kmp_unregister_root_other_thread(int gtid) {
4039  kmp_root_t *root = __kmp_root[gtid];
4040  int r;
4041
4042  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4043  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4044  KMP_ASSERT(KMP_UBER_GTID(gtid));
4045  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4046  KMP_ASSERT(root->r.r_active == FALSE);
4047
4048  r = __kmp_reset_root(gtid, root);
4049  KC_TRACE(10,
4050           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4051  return r;
4052}
4053#endif
4054
4055#if KMP_DEBUG
4056void __kmp_task_info() {
4057
4058  kmp_int32 gtid = __kmp_entry_gtid();
4059  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4060  kmp_info_t *this_thr = __kmp_threads[gtid];
4061  kmp_team_t *steam = this_thr->th.th_serial_team;
4062  kmp_team_t *team = this_thr->th.th_team;
4063
4064  __kmp_printf(
4065      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4066      "ptask=%p\n",
4067      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4068      team->t.t_implicit_task_taskdata[tid].td_parent);
4069}
4070#endif // KMP_DEBUG
4071
4072/* TODO optimize with one big memclr, take out what isn't needed, split
4073   responsibility to workers as much as possible, and delay initialization of
4074   features as much as possible  */
4075static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4076                                  int tid, int gtid) {
4077  /* this_thr->th.th_info.ds.ds_gtid is setup in
4078     kmp_allocate_thread/create_worker.
4079     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4080  kmp_info_t *master = team->t.t_threads[0];
4081  KMP_DEBUG_ASSERT(this_thr != NULL);
4082  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4083  KMP_DEBUG_ASSERT(team);
4084  KMP_DEBUG_ASSERT(team->t.t_threads);
4085  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4086  KMP_DEBUG_ASSERT(master);
4087  KMP_DEBUG_ASSERT(master->th.th_root);
4088
4089  KMP_MB();
4090
4091  TCW_SYNC_PTR(this_thr->th.th_team, team);
4092
4093  this_thr->th.th_info.ds.ds_tid = tid;
4094  this_thr->th.th_set_nproc = 0;
4095  if (__kmp_tasking_mode != tskm_immediate_exec)
4096    // When tasking is possible, threads are not safe to reap until they are
4097    // done tasking; this will be set when tasking code is exited in wait
4098    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4099  else // no tasking --> always safe to reap
4100    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4101#if OMP_40_ENABLED
4102  this_thr->th.th_set_proc_bind = proc_bind_default;
4103#if KMP_AFFINITY_SUPPORTED
4104  this_thr->th.th_new_place = this_thr->th.th_current_place;
4105#endif
4106#endif
4107  this_thr->th.th_root = master->th.th_root;
4108
4109  /* setup the thread's cache of the team structure */
4110  this_thr->th.th_team_nproc = team->t.t_nproc;
4111  this_thr->th.th_team_master = master;
4112  this_thr->th.th_team_serialized = team->t.t_serialized;
4113  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4114
4115  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4116
4117  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4118                tid, gtid, this_thr, this_thr->th.th_current_task));
4119
4120  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4121                           team, tid, TRUE);
4122
4123  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4124                tid, gtid, this_thr, this_thr->th.th_current_task));
4125  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4126  // __kmp_initialize_team()?
4127
4128  /* TODO no worksharing in speculative threads */
4129  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4130
4131  this_thr->th.th_local.this_construct = 0;
4132
4133  if (!this_thr->th.th_pri_common) {
4134    this_thr->th.th_pri_common =
4135        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4136    if (__kmp_storage_map) {
4137      __kmp_print_storage_map_gtid(
4138          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4139          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4140    }
4141    this_thr->th.th_pri_head = NULL;
4142  }
4143
4144  /* Initialize dynamic dispatch */
4145  {
4146    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4147    // Use team max_nproc since this will never change for the team.
4148    size_t disp_size =
4149        sizeof(dispatch_private_info_t) *
4150        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4151    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4152                  team->t.t_max_nproc));
4153    KMP_ASSERT(dispatch);
4154    KMP_DEBUG_ASSERT(team->t.t_dispatch);
4155    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4156
4157    dispatch->th_disp_index = 0;
4158#if OMP_45_ENABLED
4159    dispatch->th_doacross_buf_idx = 0;
4160#endif
4161    if (!dispatch->th_disp_buffer) {
4162      dispatch->th_disp_buffer =
4163          (dispatch_private_info_t *)__kmp_allocate(disp_size);
4164
4165      if (__kmp_storage_map) {
4166        __kmp_print_storage_map_gtid(
4167            gtid, &dispatch->th_disp_buffer[0],
4168            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4169                                          ? 1
4170                                          : __kmp_dispatch_num_buffers],
4171            disp_size, "th_%d.th_dispatch.th_disp_buffer "
4172                       "(team_%d.t_dispatch[%d].th_disp_buffer)",
4173            gtid, team->t.t_id, gtid);
4174      }
4175    } else {
4176      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4177    }
4178
4179    dispatch->th_dispatch_pr_current = 0;
4180    dispatch->th_dispatch_sh_current = 0;
4181
4182    dispatch->th_deo_fcn = 0; /* ORDERED     */
4183    dispatch->th_dxo_fcn = 0; /* END ORDERED */
4184  }
4185
4186  this_thr->th.th_next_pool = NULL;
4187
4188  if (!this_thr->th.th_task_state_memo_stack) {
4189    size_t i;
4190    this_thr->th.th_task_state_memo_stack =
4191        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4192    this_thr->th.th_task_state_top = 0;
4193    this_thr->th.th_task_state_stack_sz = 4;
4194    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4195         ++i) // zero init the stack
4196      this_thr->th.th_task_state_memo_stack[i] = 0;
4197  }
4198
4199  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4200  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4201
4202  KMP_MB();
4203}
4204
4205/* Allocate a new thread for the requesting team. This is only called from
4206   within a forkjoin critical section. We will first try to get an available
4207   thread from the thread pool; if none is available, we will fork a new one,
4208   assuming we are able to create one. This should be assured, as the caller
4209   should have checked this first. */
4210kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4211                                  int new_tid) {
4212  kmp_team_t *serial_team;
4213  kmp_info_t *new_thr;
4214  int new_gtid;
4215
4216  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4217  KMP_DEBUG_ASSERT(root && team);
4218#if !KMP_NESTED_HOT_TEAMS
4219  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4220#endif
4221  KMP_MB();
4222
4223  /* first, try to get one from the thread pool */
4224  if (__kmp_thread_pool) {
4225
4226    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4227    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4228    if (new_thr == __kmp_thread_pool_insert_pt) {
4229      __kmp_thread_pool_insert_pt = NULL;
4230    }
4231    TCW_4(new_thr->th.th_in_pool, FALSE);
4232    // Don't touch th_active_in_pool or th_active.
4233    // The worker thread adjusts those flags as it sleeps/awakens.
4234    __kmp_thread_pool_nth--;
4235
4236    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4237                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4238    KMP_ASSERT(!new_thr->th.th_team);
4239    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4240    KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4241
4242    /* setup the thread structure */
4243    __kmp_initialize_info(new_thr, team, new_tid,
4244                          new_thr->th.th_info.ds.ds_gtid);
4245    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4246
4247    TCW_4(__kmp_nth, __kmp_nth + 1);
4248    root->r.r_cg_nthreads++;
4249
4250    new_thr->th.th_task_state = 0;
4251    new_thr->th.th_task_state_top = 0;
4252    new_thr->th.th_task_state_stack_sz = 4;
4253
4254#ifdef KMP_ADJUST_BLOCKTIME
4255    /* Adjust blocktime back to zero if necessary */
4256    /* Middle initialization might not have occurred yet */
4257    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4258      if (__kmp_nth > __kmp_avail_proc) {
4259        __kmp_zero_bt = TRUE;
4260      }
4261    }
4262#endif /* KMP_ADJUST_BLOCKTIME */
4263
4264#if KMP_DEBUG
4265    // If the thread entered the pool via __kmp_free_thread, its wait_flag
4266    // should not be KMP_BARRIER_PARENT_FLAG.
4267    int b;
4268    kmp_balign_t *balign = new_thr->th.th_bar;
4269    for (b = 0; b < bs_last_barrier; ++b)
4270      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4271#endif
4272
4273    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4274                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4275
4276    KMP_MB();
4277    return new_thr;
4278  }
4279
4280  /* no, we'll fork a new one */
4281  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4282  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4283
4284#if KMP_USE_MONITOR
4285  // If this is the first worker thread the RTL is creating, then also
4286  // launch the monitor thread.  We try to do this as early as possible.
4287  if (!TCR_4(__kmp_init_monitor)) {
4288    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4289    if (!TCR_4(__kmp_init_monitor)) {
4290      KF_TRACE(10, ("before __kmp_create_monitor\n"));
4291      TCW_4(__kmp_init_monitor, 1);
4292      __kmp_create_monitor(&__kmp_monitor);
4293      KF_TRACE(10, ("after __kmp_create_monitor\n"));
4294#if KMP_OS_WINDOWS
4295      // AC: wait until the monitor has started. This is a fix for CQ232808.
4296      // The reason is that if the library is loaded/unloaded in a loop with
4297      // small (parallel) work in between, then there is a high probability that
4298      // the monitor thread starts only after the library has shut down. At that
4299      // point it is too late to cope with the problem, because when the master
4300      // is in DllMain (process detach) the monitor has no chance to start (it is
4301      // blocked), and the master has no means to inform the monitor that the
4302      // library has gone, because all the memory which the monitor could access
4303      // is going to be released/reset.
4304      while (TCR_4(__kmp_init_monitor) < 2) {
4305        KMP_YIELD(TRUE);
4306      }
4307      KF_TRACE(10, ("after monitor thread has started\n"));
4308#endif
4309    }
4310    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4311  }
4312#endif
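
  // Illustrative sketch (not compiled, not part of the runtime): the
  // double-checked initialization used above to launch the monitor thread at
  // most once, rewritten with standard C++ primitives instead of the
  // runtime's TCR/TCW macros and bootstrap lock.  All names are hypothetical.
#if 0
#include <atomic>
#include <mutex>

static std::atomic<int> example_init_done(0);
static std::mutex example_init_lock;

static void example_init_once(void (*do_init)(void)) {
  // Cheap unlocked check first: skips the lock on the common
  // already-initialized path.
  if (!example_init_done.load(std::memory_order_acquire)) {
    std::lock_guard<std::mutex> guard(example_init_lock);
    // Re-check under the lock so the initializer runs exactly once.
    if (!example_init_done.load(std::memory_order_relaxed)) {
      do_init();
      example_init_done.store(1, std::memory_order_release);
    }
  }
}
#endif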
4313
4314  KMP_MB();
4315  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4316    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4317  }
4318
4319  /* allocate space for it. */
4320  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4321
4322  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4323
4324  if (__kmp_storage_map) {
4325    __kmp_print_thread_storage_map(new_thr, new_gtid);
4326  }
4327
4328  // add the reserved serialized team, initialized from the team's master thread
4329  {
4330    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4331    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4332    new_thr->th.th_serial_team = serial_team =
4333        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4334#if OMPT_SUPPORT
4335                                          ompt_data_none, // root parallel id
4336#endif
4337#if OMP_40_ENABLED
4338                                          proc_bind_default,
4339#endif
4340                                          &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4341  }
4342  KMP_ASSERT(serial_team);
4343  serial_team->t.t_serialized = 0; // AC: the team is created in reserve,
4344  // not for execution (it is unused for now).
4345  serial_team->t.t_threads[0] = new_thr;
4346  KF_TRACE(10,
4347           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4348            new_thr));
4349
4350  /* setup the thread structures */
4351  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4352
4353#if USE_FAST_MEMORY
4354  __kmp_initialize_fast_memory(new_thr);
4355#endif /* USE_FAST_MEMORY */
4356
4357#if KMP_USE_BGET
4358  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4359  __kmp_initialize_bget(new_thr);
4360#endif
4361
4362  __kmp_init_random(new_thr); // Initialize random number generator
4363
4364  /* Initialize these only once when thread is grabbed for a team allocation */
4365  KA_TRACE(20,
4366           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4367            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4368
4369  int b;
4370  kmp_balign_t *balign = new_thr->th.th_bar;
4371  for (b = 0; b < bs_last_barrier; ++b) {
4372    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4373    balign[b].bb.team = NULL;
4374    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4375    balign[b].bb.use_oncore_barrier = 0;
4376  }
4377
4378  new_thr->th.th_spin_here = FALSE;
4379  new_thr->th.th_next_waiting = 0;
4380#if KMP_OS_UNIX
4381  new_thr->th.th_blocking = false;
4382#endif
4383
4384#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4385  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4386  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4387  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4388  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4389#endif
4390#if OMP_50_ENABLED
4391  new_thr->th.th_def_allocator = __kmp_def_allocator;
4392  new_thr->th.th_prev_level = 0;
4393  new_thr->th.th_prev_num_threads = 1;
4394#endif
4395
4396  TCW_4(new_thr->th.th_in_pool, FALSE);
4397  new_thr->th.th_active_in_pool = FALSE;
4398  TCW_4(new_thr->th.th_active, TRUE);
4399
4400  /* adjust the global counters */
4401  __kmp_all_nth++;
4402  __kmp_nth++;
4403
4404  root->r.r_cg_nthreads++;
4405
4406  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4407  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4408  if (__kmp_adjust_gtid_mode) {
4409    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4410      if (TCR_4(__kmp_gtid_mode) != 2) {
4411        TCW_4(__kmp_gtid_mode, 2);
4412      }
4413    } else {
4414      if (TCR_4(__kmp_gtid_mode) != 1) {
4415        TCW_4(__kmp_gtid_mode, 1);
4416      }
4417    }
4418  }
4419
4420#ifdef KMP_ADJUST_BLOCKTIME
4421  /* Adjust blocktime back to zero if necessary       */
4422  /* Middle initialization might not have occurred yet */
4423  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4424    if (__kmp_nth > __kmp_avail_proc) {
4425      __kmp_zero_bt = TRUE;
4426    }
4427  }
4428#endif /* KMP_ADJUST_BLOCKTIME */
4429
4430  /* actually fork it and create the new worker thread */
4431  KF_TRACE(
4432      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4433  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4434  KF_TRACE(10,
4435           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4436
4437  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4438                new_gtid));
4439  KMP_MB();
4440  return new_thr;
4441}
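
// Illustrative sketch (not compiled, not part of the runtime): the pool-first
// allocation strategy used by __kmp_allocate_thread above -- reuse an object
// from the free list if one is available, otherwise construct a new one.  The
// real routine additionally reinitializes barrier state, ICVs, affinity and
// the serial team; the types and names below are hypothetical.
#if 0
#include <cstddef>

struct example_worker {
  int id;
  example_worker *next_pool; // link used only while the worker is pooled
};

static example_worker *example_allocate_worker(example_worker **pool,
                                               int *next_id) {
  if (*pool != NULL) {
    example_worker *w = *pool; // pop the head of the free list
    *pool = w->next_pool;
    w->next_pool = NULL;
    return w;                  // a reused worker keeps its old id (gtid)
  }
  example_worker *w = new example_worker(); // pool empty: make a new one
  w->id = (*next_id)++;
  w->next_pool = NULL;
  return w;
}
#endif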
4442
4443/* Reinitialize team for reuse.
4444   The hot team code calls this routine at every fork barrier, so EPCC barrier
4445   tests are extremely sensitive to changes in it, esp. writes to the team
4446   struct, which cause a cache invalidation in all threads.
4447   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4448static void __kmp_reinitialize_team(kmp_team_t *team,
4449                                    kmp_internal_control_t *new_icvs,
4450                                    ident_t *loc) {
4451  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4452                team->t.t_threads[0], team));
4453  KMP_DEBUG_ASSERT(team && new_icvs);
4454  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4455  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4456
4457  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4458  // Copy ICVs to the master thread's implicit taskdata
4459  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4460  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4461
4462  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4463                team->t.t_threads[0], team));
4464}
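
// Illustrative sketch (not compiled): the "write only if it changed" idea
// behind KMP_CHECK_UPDATE, used above to avoid dirtying cache lines of the
// shared team structure on the fork hot path.  The helper name is
// hypothetical.
#if 0
template <typename T>
static inline void example_check_update(T &dst, const T &src) {
  // An unconditional store would mark the containing cache line modified on
  // every fork, forcing invalidations in all team members; comparing first
  // keeps the line shared when nothing actually changed.
  if (dst != src)
    dst = src;
}
#endif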
4465
4466/* Initialize the team data structure.
4467   This assumes the t_threads and t_max_nproc are already set.
4468   Also, we don't touch the arguments */
4469static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4470                                  kmp_internal_control_t *new_icvs,
4471                                  ident_t *loc) {
4472  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4473
4474  /* verify */
4475  KMP_DEBUG_ASSERT(team);
4476  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4477  KMP_DEBUG_ASSERT(team->t.t_threads);
4478  KMP_MB();
4479
4480  team->t.t_master_tid = 0; /* not needed */
4481  /* team->t.t_master_bar;        not needed */
4482  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4483  team->t.t_nproc = new_nproc;
4484
4485  /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4486  team->t.t_next_pool = NULL;
4487  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4488   * up hot team */
4489
4490  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4491  team->t.t_invoke = NULL; /* not needed */
4492
4493  // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4494  team->t.t_sched.sched = new_icvs->sched.sched;
4495
4496#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4497  team->t.t_fp_control_saved = FALSE; /* not needed */
4498  team->t.t_x87_fpu_control_word = 0; /* not needed */
4499  team->t.t_mxcsr = 0; /* not needed */
4500#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4501
4502  team->t.t_construct = 0;
4503
4504  team->t.t_ordered.dt.t_value = 0;
4505  team->t.t_master_active = FALSE;
4506
4507  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4508
4509#ifdef KMP_DEBUG
4510  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4511#endif
4512#if KMP_OS_WINDOWS
4513  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4514#endif
4515
4516  team->t.t_control_stack_top = NULL;
4517
4518  __kmp_reinitialize_team(team, new_icvs, loc);
4519
4520  KMP_MB();
4521  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4522}
4523
4524#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4525/* Sets full mask for thread and returns old mask, no changes to structures. */
4526static void
4527__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4528  if (KMP_AFFINITY_CAPABLE()) {
4529    int status;
4530    if (old_mask != NULL) {
4531      status = __kmp_get_system_affinity(old_mask, TRUE);
4532      int error = errno;
4533      if (status != 0) {
4534        __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4535                    __kmp_msg_null);
4536      }
4537    }
4538    __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4539  }
4540}
4541#endif
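
// Illustrative sketch (not compiled): the save-then-widen pattern used by
// __kmp_set_thread_affinity_mask_full_tmp above, expressed with the plain
// Linux affinity API instead of the runtime's wrappers.  Linux-specific and
// hypothetical; the caller is expected to restore the saved mask once the
// workers it spawned have bound themselves.
#if 0
#include <sched.h>

static void example_set_full_mask_tmp(cpu_set_t *old_mask,
                                      const cpu_set_t *full_mask) {
  if (old_mask)
    sched_getaffinity(0, sizeof(*old_mask), old_mask); // save current mask
  sched_setaffinity(0, sizeof(*full_mask), full_mask); // widen to all CPUs
}
#endif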
4542
4543#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4544
4545// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4546// It calculates the worker + master thread's partition based upon the parent
4547// thread's partition, and binds each worker to a place in its partition.
4548// The master thread's partition should already include its current binding.
4549static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4550  // Copy the master thread's place partition to the team struct
4551  kmp_info_t *master_th = team->t.t_threads[0];
4552  KMP_DEBUG_ASSERT(master_th != NULL);
4553  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4554  int first_place = master_th->th.th_first_place;
4555  int last_place = master_th->th.th_last_place;
4556  int masters_place = master_th->th.th_current_place;
4557  team->t.t_first_place = first_place;
4558  team->t.t_last_place = last_place;
4559
4560  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4561                "bound to place %d partition = [%d,%d]\n",
4562                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4563                team->t.t_id, masters_place, first_place, last_place));
4564
4565  switch (proc_bind) {
4566
4567  case proc_bind_default:
4568    // serial teams might have the proc_bind policy set to proc_bind_default;
4569    // it doesn't matter, as we don't rebind the master thread for any policy.
4570    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4571    break;
4572
4573  case proc_bind_master: {
4574    int f;
4575    int n_th = team->t.t_nproc;
4576    for (f = 1; f < n_th; f++) {
4577      kmp_info_t *th = team->t.t_threads[f];
4578      KMP_DEBUG_ASSERT(th != NULL);
4579      th->th.th_first_place = first_place;
4580      th->th.th_last_place = last_place;
4581      th->th.th_new_place = masters_place;
4582#if OMP_50_ENABLED
4583      if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4584          team->t.t_display_affinity != 1) {
4585        team->t.t_display_affinity = 1;
4586      }
4587#endif
4588
4589      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4590                     "partition = [%d,%d]\n",
4591                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4592                     f, masters_place, first_place, last_place));
4593    }
4594  } break;
4595
4596  case proc_bind_close: {
4597    int f;
4598    int n_th = team->t.t_nproc;
4599    int n_places;
4600    if (first_place <= last_place) {
4601      n_places = last_place - first_place + 1;
4602    } else {
4603      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4604    }
4605    if (n_th <= n_places) {
4606      int place = masters_place;
4607      for (f = 1; f < n_th; f++) {
4608        kmp_info_t *th = team->t.t_threads[f];
4609        KMP_DEBUG_ASSERT(th != NULL);
4610
4611        if (place == last_place) {
4612          place = first_place;
4613        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4614          place = 0;
4615        } else {
4616          place++;
4617        }
4618        th->th.th_first_place = first_place;
4619        th->th.th_last_place = last_place;
4620        th->th.th_new_place = place;
4621#if OMP_50_ENABLED
4622        if (__kmp_display_affinity && place != th->th.th_current_place &&
4623            team->t.t_display_affinity != 1) {
4624          team->t.t_display_affinity = 1;
4625        }
4626#endif
4627
4628        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4629                       "partition = [%d,%d]\n",
4630                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4631                       team->t.t_id, f, place, first_place, last_place));
4632      }
4633    } else {
4634      int S, rem, gap, s_count;
4635      S = n_th / n_places;
4636      s_count = 0;
4637      rem = n_th - (S * n_places);
4638      gap = rem > 0 ? n_places / rem : n_places;
4639      int place = masters_place;
4640      int gap_ct = gap;
4641      for (f = 0; f < n_th; f++) {
4642        kmp_info_t *th = team->t.t_threads[f];
4643        KMP_DEBUG_ASSERT(th != NULL);
4644
4645        th->th.th_first_place = first_place;
4646        th->th.th_last_place = last_place;
4647        th->th.th_new_place = place;
4648#if OMP_50_ENABLED
4649        if (__kmp_display_affinity && place != th->th.th_current_place &&
4650            team->t.t_display_affinity != 1) {
4651          team->t.t_display_affinity = 1;
4652        }
4653#endif
4654        s_count++;
4655
4656        if ((s_count == S) && rem && (gap_ct == gap)) {
4657          // do nothing; add an extra thread to this place on the next iteration
4658        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4659          // we added an extra thread to this place; move to next place
4660          if (place == last_place) {
4661            place = first_place;
4662          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4663            place = 0;
4664          } else {
4665            place++;
4666          }
4667          s_count = 0;
4668          gap_ct = 1;
4669          rem--;
4670        } else if (s_count == S) { // place full; don't add extra
4671          if (place == last_place) {
4672            place = first_place;
4673          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4674            place = 0;
4675          } else {
4676            place++;
4677          }
4678          gap_ct++;
4679          s_count = 0;
4680        }
4681
4682        KA_TRACE(100,
4683                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4684                  "partition = [%d,%d]\n",
4685                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4686                  th->th.th_new_place, first_place, last_place));
4687      }
4688      KMP_DEBUG_ASSERT(place == masters_place);
4689    }
4690  } break;
4691
4692  case proc_bind_spread: {
4693    int f;
4694    int n_th = team->t.t_nproc;
4695    int n_places;
4696    int thidx;
4697    if (first_place <= last_place) {
4698      n_places = last_place - first_place + 1;
4699    } else {
4700      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4701    }
4702    if (n_th <= n_places) {
4703      int place = -1;
4704
4705      if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4706        int S = n_places / n_th;
4707        int s_count, rem, gap, gap_ct;
4708
4709        place = masters_place;
4710        rem = n_places - n_th * S;
4711        gap = rem ? n_th / rem : 1;
4712        gap_ct = gap;
4713        thidx = n_th;
4714        if (update_master_only == 1)
4715          thidx = 1;
4716        for (f = 0; f < thidx; f++) {
4717          kmp_info_t *th = team->t.t_threads[f];
4718          KMP_DEBUG_ASSERT(th != NULL);
4719
4720          th->th.th_first_place = place;
4721          th->th.th_new_place = place;
4722#if OMP_50_ENABLED
4723          if (__kmp_display_affinity && place != th->th.th_current_place &&
4724              team->t.t_display_affinity != 1) {
4725            team->t.t_display_affinity = 1;
4726          }
4727#endif
4728          s_count = 1;
4729          while (s_count < S) {
4730            if (place == last_place) {
4731              place = first_place;
4732            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4733              place = 0;
4734            } else {
4735              place++;
4736            }
4737            s_count++;
4738          }
4739          if (rem && (gap_ct == gap)) {
4740            if (place == last_place) {
4741              place = first_place;
4742            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4743              place = 0;
4744            } else {
4745              place++;
4746            }
4747            rem--;
4748            gap_ct = 0;
4749          }
4750          th->th.th_last_place = place;
4751          gap_ct++;
4752
4753          if (place == last_place) {
4754            place = first_place;
4755          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4756            place = 0;
4757          } else {
4758            place++;
4759          }
4760
4761          KA_TRACE(100,
4762                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4763                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4764                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4765                    f, th->th.th_new_place, th->th.th_first_place,
4766                    th->th.th_last_place, __kmp_affinity_num_masks));
4767        }
4768      } else {
4769        /* Having a uniform space of available computation places, we can create
4770           T partitions of round(P/T) size and put a thread into the first
4771           place of each partition. */
4772        double current = static_cast<double>(masters_place);
4773        double spacing =
4774            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4775        int first, last;
4776        kmp_info_t *th;
4777
4778        thidx = n_th + 1;
4779        if (update_master_only == 1)
4780          thidx = 1;
4781        for (f = 0; f < thidx; f++) {
4782          first = static_cast<int>(current);
4783          last = static_cast<int>(current + spacing) - 1;
4784          KMP_DEBUG_ASSERT(last >= first);
4785          if (first >= n_places) {
4786            if (masters_place) {
4787              first -= n_places;
4788              last -= n_places;
4789              if (first == (masters_place + 1)) {
4790                KMP_DEBUG_ASSERT(f == n_th);
4791                first--;
4792              }
4793              if (last == masters_place) {
4794                KMP_DEBUG_ASSERT(f == (n_th - 1));
4795                last--;
4796              }
4797            } else {
4798              KMP_DEBUG_ASSERT(f == n_th);
4799              first = 0;
4800              last = 0;
4801            }
4802          }
4803          if (last >= n_places) {
4804            last = (n_places - 1);
4805          }
4806          place = first;
4807          current += spacing;
4808          if (f < n_th) {
4809            KMP_DEBUG_ASSERT(0 <= first);
4810            KMP_DEBUG_ASSERT(n_places > first);
4811            KMP_DEBUG_ASSERT(0 <= last);
4812            KMP_DEBUG_ASSERT(n_places > last);
4813            KMP_DEBUG_ASSERT(last_place >= first_place);
4814            th = team->t.t_threads[f];
4815            KMP_DEBUG_ASSERT(th);
4816            th->th.th_first_place = first;
4817            th->th.th_new_place = place;
4818            th->th.th_last_place = last;
4819#if OMP_50_ENABLED
4820            if (__kmp_display_affinity && place != th->th.th_current_place &&
4821                team->t.t_display_affinity != 1) {
4822              team->t.t_display_affinity = 1;
4823            }
4824#endif
4825            KA_TRACE(100,
4826                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4827                      "partition = [%d,%d], spacing = %.4f\n",
4828                      __kmp_gtid_from_thread(team->t.t_threads[f]),
4829                      team->t.t_id, f, th->th.th_new_place,
4830                      th->th.th_first_place, th->th.th_last_place, spacing));
4831          }
4832        }
4833      }
4834      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4835    } else {
4836      int S, rem, gap, s_count;
4837      S = n_th / n_places;
4838      s_count = 0;
4839      rem = n_th - (S * n_places);
4840      gap = rem > 0 ? n_places / rem : n_places;
4841      int place = masters_place;
4842      int gap_ct = gap;
4843      thidx = n_th;
4844      if (update_master_only == 1)
4845        thidx = 1;
4846      for (f = 0; f < thidx; f++) {
4847        kmp_info_t *th = team->t.t_threads[f];
4848        KMP_DEBUG_ASSERT(th != NULL);
4849
4850        th->th.th_first_place = place;
4851        th->th.th_last_place = place;
4852        th->th.th_new_place = place;
4853#if OMP_50_ENABLED
4854        if (__kmp_display_affinity && place != th->th.th_current_place &&
4855            team->t.t_display_affinity != 1) {
4856          team->t.t_display_affinity = 1;
4857        }
4858#endif
4859        s_count++;
4860
4861        if ((s_count == S) && rem && (gap_ct == gap)) {
4862          // do nothing; add an extra thread to this place on the next iteration
4863        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4864          // we added an extra thread to this place; move on to next place
4865          if (place == last_place) {
4866            place = first_place;
4867          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4868            place = 0;
4869          } else {
4870            place++;
4871          }
4872          s_count = 0;
4873          gap_ct = 1;
4874          rem--;
4875        } else if (s_count == S) { // place is full; don't add extra thread
4876          if (place == last_place) {
4877            place = first_place;
4878          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4879            place = 0;
4880          } else {
4881            place++;
4882          }
4883          gap_ct++;
4884          s_count = 0;
4885        }
4886
4887        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4888                       "partition = [%d,%d]\n",
4889                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4890                       team->t.t_id, f, th->th.th_new_place,
4891                       th->th.th_first_place, th->th.th_last_place));
4892      }
4893      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4894    }
4895  } break;
4896
4897  default:
4898    break;
4899  }
4900
4901  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4902}
4903
4904#endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
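
// Illustrative sketch (not compiled): two pieces of the place-partitioning
// arithmetic used by __kmp_partition_places above, written as stand-alone
// helpers.  Names are hypothetical; the real routine also updates the thread
// structures and starts counting from the master's place.
#if 0
// Round-robin advance to the next place, wrapping both at the end of the
// team's partition and at the end of the global place list.
static int example_next_place(int place, int first_place, int last_place,
                              int num_places) {
  if (place == last_place)
    return first_place;
  if (place == num_places - 1)
    return 0;
  return place + 1;
}

// Uniform "spread" partition: thread f of n_th gets a contiguous block of
// roughly n_places / n_th places; floating-point spacing spreads the
// remainder evenly instead of packing it at the front.
static void example_spread_block(int f, int n_th, int n_places, int *first,
                                 int *last) {
  double spacing = (double)(n_places + 1) / (double)n_th;
  double current = spacing * (double)f;
  *first = (int)current;
  *last = (int)(current + spacing) - 1;
  if (*last >= n_places)
    *last = n_places - 1;
}
#endif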
4905
4906/* allocate a new team data structure to use.  take one off of the free pool if
4907   available */
4908kmp_team_t *
4909__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4910#if OMPT_SUPPORT
4911                    ompt_data_t ompt_parallel_data,
4912#endif
4913#if OMP_40_ENABLED
4914                    kmp_proc_bind_t new_proc_bind,
4915#endif
4916                    kmp_internal_control_t *new_icvs,
4917                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4918  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4919  int f;
4920  kmp_team_t *team;
4921  int use_hot_team = !root->r.r_active;
4922  int level = 0;
4923
4924  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4925  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4926  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4927  KMP_MB();
4928
4929#if KMP_NESTED_HOT_TEAMS
4930  kmp_hot_team_ptr_t *hot_teams;
4931  if (master) {
4932    team = master->th.th_team;
4933    level = team->t.t_active_level;
4934    if (master->th.th_teams_microtask) { // in teams construct?
4935      if (master->th.th_teams_size.nteams > 1 &&
4936          ( // #teams > 1
4937              team->t.t_pkfn ==
4938                  (microtask_t)__kmp_teams_master || // inner fork of the teams
4939              master->th.th_teams_level <
4940                  team->t.t_level)) { // or nested parallel inside the teams
4941        ++level; // not increment if #teams==1, or for outer fork of the teams;
4942        // increment otherwise
4943      }
4944    }
4945    hot_teams = master->th.th_hot_teams;
4946    if (level < __kmp_hot_teams_max_level && hot_teams &&
4947        hot_teams[level]
4948            .hot_team) { // hot team has already been allocated for given level
4949      use_hot_team = 1;
4950    } else {
4951      use_hot_team = 0;
4952    }
4953  }
4954#endif
4955  // Optimization to use a "hot" team
4956  if (use_hot_team && new_nproc > 1) {
4957    KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4958#if KMP_NESTED_HOT_TEAMS
4959    team = hot_teams[level].hot_team;
4960#else
4961    team = root->r.r_hot_team;
4962#endif
4963#if KMP_DEBUG
4964    if (__kmp_tasking_mode != tskm_immediate_exec) {
4965      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4966                    "task_team[1] = %p before reinit\n",
4967                    team->t.t_task_team[0], team->t.t_task_team[1]));
4968    }
4969#endif
4970
4971    // Has the number of threads changed?
4972    /* Let's assume the most common case is that the number of threads is
4973       unchanged, and put that case first. */
4974    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4975      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4976      // This case can mean that omp_set_num_threads() was called and the hot
4977      // team size was already reduced, so we check the special flag
4978      if (team->t.t_size_changed == -1) {
4979        team->t.t_size_changed = 1;
4980      } else {
4981        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4982      }
4983
4984      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4985      kmp_r_sched_t new_sched = new_icvs->sched;
4986      // set master's schedule as new run-time schedule
4987      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4988
4989      __kmp_reinitialize_team(team, new_icvs,
4990                              root->r.r_uber_thread->th.th_ident);
4991
4992      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4993                    team->t.t_threads[0], team));
4994      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4995
4996#if OMP_40_ENABLED
4997#if KMP_AFFINITY_SUPPORTED
4998      if ((team->t.t_size_changed == 0) &&
4999          (team->t.t_proc_bind == new_proc_bind)) {
5000        if (new_proc_bind == proc_bind_spread) {
5001          __kmp_partition_places(
5002              team, 1); // add flag to update only master for spread
5003        }
5004        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5005                       "proc_bind = %d, partition = [%d,%d]\n",
5006                       team->t.t_id, new_proc_bind, team->t.t_first_place,
5007                       team->t.t_last_place));
5008      } else {
5009        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5010        __kmp_partition_places(team);
5011      }
5012#else
5013      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5014#endif /* KMP_AFFINITY_SUPPORTED */
5015#endif /* OMP_40_ENABLED */
5016    } else if (team->t.t_nproc > new_nproc) {
5017      KA_TRACE(20,
5018               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5019                new_nproc));
5020
5021      team->t.t_size_changed = 1;
5022#if KMP_NESTED_HOT_TEAMS
5023      if (__kmp_hot_teams_mode == 0) {
5024        // AC: in this mode the saved thread count should match the team's value;
5025        // it can be bigger in mode 1, when the hot team keeps threads in reserve
5026        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5027        hot_teams[level].hot_team_nth = new_nproc;
5028#endif // KMP_NESTED_HOT_TEAMS
5029        /* release the extra threads we don't need any more */
5030        for (f = new_nproc; f < team->t.t_nproc; f++) {
5031          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5032          if (__kmp_tasking_mode != tskm_immediate_exec) {
5033            // When decreasing team size, threads no longer in the team should
5034            // unref task team.
5035            team->t.t_threads[f]->th.th_task_team = NULL;
5036          }
5037          __kmp_free_thread(team->t.t_threads[f]);
5038          team->t.t_threads[f] = NULL;
5039        }
5040#if KMP_NESTED_HOT_TEAMS
5041      } // (__kmp_hot_teams_mode == 0)
5042      else {
5043        // When keeping extra threads in the team, switch them to wait on their
5044        // own b_go flag
5045        for (f = new_nproc; f < team->t.t_nproc; ++f) {
5046          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5047          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5048          for (int b = 0; b < bs_last_barrier; ++b) {
5049            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5050              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5051            }
5052            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5053          }
5054        }
5055      }
5056#endif // KMP_NESTED_HOT_TEAMS
5057      team->t.t_nproc = new_nproc;
5058      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5059      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5060      __kmp_reinitialize_team(team, new_icvs,
5061                              root->r.r_uber_thread->th.th_ident);
5062
5063      /* update the remaining threads */
5064      for (f = 0; f < new_nproc; ++f) {
5065        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5066      }
5067      // restore the current task state of the master thread: should be the
5068      // implicit task
5069      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5070                    team->t.t_threads[0], team));
5071
5072      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5073
5074#ifdef KMP_DEBUG
5075      for (f = 0; f < team->t.t_nproc; f++) {
5076        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5077                         team->t.t_threads[f]->th.th_team_nproc ==
5078                             team->t.t_nproc);
5079      }
5080#endif
5081
5082#if OMP_40_ENABLED
5083      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5084#if KMP_AFFINITY_SUPPORTED
5085      __kmp_partition_places(team);
5086#endif
5087#endif
5088    } else { // team->t.t_nproc < new_nproc
5089#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5090      kmp_affin_mask_t *old_mask;
5091      if (KMP_AFFINITY_CAPABLE()) {
5092        KMP_CPU_ALLOC(old_mask);
5093      }
5094#endif
5095
5096      KA_TRACE(20,
5097               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5098                new_nproc));
5099
5100      team->t.t_size_changed = 1;
5101
5102#if KMP_NESTED_HOT_TEAMS
5103      int avail_threads = hot_teams[level].hot_team_nth;
5104      if (new_nproc < avail_threads)
5105        avail_threads = new_nproc;
5106      kmp_info_t **other_threads = team->t.t_threads;
5107      for (f = team->t.t_nproc; f < avail_threads; ++f) {
5108        // Adjust barrier data of reserved threads (if any) of the team
5109        // Other data will be set in __kmp_initialize_info() below.
5110        int b;
5111        kmp_balign_t *balign = other_threads[f]->th.th_bar;
5112        for (b = 0; b < bs_last_barrier; ++b) {
5113          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5114          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5115#if USE_DEBUGGER
5116          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5117#endif
5118        }
5119      }
5120      if (hot_teams[level].hot_team_nth >= new_nproc) {
5121        // we have all the needed threads in reserve, no need to allocate any;
5122        // this is only possible in mode 1 (mode 0 cannot have reserved threads)
5123        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5124        team->t.t_nproc = new_nproc; // just get reserved threads involved
5125      } else {
5126        // we may have some threads in reserve, but not enough
5127        team->t.t_nproc =
5128            hot_teams[level]
5129                .hot_team_nth; // get reserved threads involved if any
5130        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5131#endif // KMP_NESTED_HOT_TEAMS
5132        if (team->t.t_max_nproc < new_nproc) {
5133          /* reallocate larger arrays */
5134          __kmp_reallocate_team_arrays(team, new_nproc);
5135          __kmp_reinitialize_team(team, new_icvs, NULL);
5136        }
5137
5138#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5139        /* Temporarily set the full mask for the master thread before creating
5140           workers. The reason is that workers inherit the affinity from the
5141           master, so if a lot of workers are created on a single core quickly,
5142           they don't get a chance to set their own affinity for a long time. */
5143        __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5144#endif
5145
5146        /* allocate new threads for the hot team */
5147        for (f = team->t.t_nproc; f < new_nproc; f++) {
5148          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5149          KMP_DEBUG_ASSERT(new_worker);
5150          team->t.t_threads[f] = new_worker;
5151
5152          KA_TRACE(20,
5153                   ("__kmp_allocate_team: team %d init T#%d arrived: "
5154                    "join=%llu, plain=%llu\n",
5155                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5156                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5157                    team->t.t_bar[bs_plain_barrier].b_arrived));
5158
5159          { // Initialize barrier data for new threads.
5160            int b;
5161            kmp_balign_t *balign = new_worker->th.th_bar;
5162            for (b = 0; b < bs_last_barrier; ++b) {
5163              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5164              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5165                               KMP_BARRIER_PARENT_FLAG);
5166#if USE_DEBUGGER
5167              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5168#endif
5169            }
5170          }
5171        }
5172
5173#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5174        if (KMP_AFFINITY_CAPABLE()) {
5175          /* Restore initial master thread's affinity mask */
5176          __kmp_set_system_affinity(old_mask, TRUE);
5177          KMP_CPU_FREE(old_mask);
5178        }
5179#endif
5180#if KMP_NESTED_HOT_TEAMS
5181      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5182#endif // KMP_NESTED_HOT_TEAMS
5183      /* make sure everyone is synchronized */
5184      int old_nproc = team->t.t_nproc; // save old value and use to update only
5185      // new threads below
5186      __kmp_initialize_team(team, new_nproc, new_icvs,
5187                            root->r.r_uber_thread->th.th_ident);
5188
5189      /* reinitialize the threads */
5190      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5191      for (f = 0; f < team->t.t_nproc; ++f)
5192        __kmp_initialize_info(team->t.t_threads[f], team, f,
5193                              __kmp_gtid_from_tid(f, team));
5194      if (level) { // set th_task_state for new threads in nested hot team
5195        // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5196        // only need to set the th_task_state for the new threads. th_task_state
5197        // for master thread will not be accurate until after this in
5198        // __kmp_fork_call(), so we look to the master's memo_stack to get the
5199        // correct value.
5200        for (f = old_nproc; f < team->t.t_nproc; ++f)
5201          team->t.t_threads[f]->th.th_task_state =
5202              team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5203      } else { // set th_task_state for new threads in non-nested hot team
5204        int old_state =
5205            team->t.t_threads[0]->th.th_task_state; // copy master's state
5206        for (f = old_nproc; f < team->t.t_nproc; ++f)
5207          team->t.t_threads[f]->th.th_task_state = old_state;
5208      }
5209
5210#ifdef KMP_DEBUG
5211      for (f = 0; f < team->t.t_nproc; ++f) {
5212        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5213                         team->t.t_threads[f]->th.th_team_nproc ==
5214                             team->t.t_nproc);
5215      }
5216#endif
5217
5218#if OMP_40_ENABLED
5219      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5220#if KMP_AFFINITY_SUPPORTED
5221      __kmp_partition_places(team);
5222#endif
5223#endif
5224    } // Check changes in number of threads
5225
5226#if OMP_40_ENABLED
5227    kmp_info_t *master = team->t.t_threads[0];
5228    if (master->th.th_teams_microtask) {
5229      for (f = 1; f < new_nproc; ++f) {
5230        // propagate teams construct specific info to workers
5231        kmp_info_t *thr = team->t.t_threads[f];
5232        thr->th.th_teams_microtask = master->th.th_teams_microtask;
5233        thr->th.th_teams_level = master->th.th_teams_level;
5234        thr->th.th_teams_size = master->th.th_teams_size;
5235      }
5236    }
5237#endif /* OMP_40_ENABLED */
5238#if KMP_NESTED_HOT_TEAMS
5239    if (level) {
5240      // Sync barrier state for nested hot teams, not needed for outermost hot
5241      // team.
5242      for (f = 1; f < new_nproc; ++f) {
5243        kmp_info_t *thr = team->t.t_threads[f];
5244        int b;
5245        kmp_balign_t *balign = thr->th.th_bar;
5246        for (b = 0; b < bs_last_barrier; ++b) {
5247          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5248          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5249#if USE_DEBUGGER
5250          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5251#endif
5252        }
5253      }
5254    }
5255#endif // KMP_NESTED_HOT_TEAMS
5256
5257    /* reallocate space for arguments if necessary */
5258    __kmp_alloc_argv_entries(argc, team, TRUE);
5259    KMP_CHECK_UPDATE(team->t.t_argc, argc);
5260    // The hot team re-uses the previous task team,
5261    // if untouched during the previous release->gather phase.
5262
5263    KF_TRACE(10, (" hot_team = %p\n", team));
5264
5265#if KMP_DEBUG
5266    if (__kmp_tasking_mode != tskm_immediate_exec) {
5267      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5268                    "task_team[1] = %p after reinit\n",
5269                    team->t.t_task_team[0], team->t.t_task_team[1]));
5270    }
5271#endif
5272
5273#if OMPT_SUPPORT
5274    __ompt_team_assign_id(team, ompt_parallel_data);
5275#endif
5276
5277    KMP_MB();
5278
5279    return team;
5280  }
5281
5282  /* next, let's try to take one from the team pool */
5283  KMP_MB();
5284  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5285    /* TODO: consider resizing undersized teams instead of reaping them, now
5286       that we have a resizing mechanism */
5287    if (team->t.t_max_nproc >= max_nproc) {
5288      /* take this team from the team pool */
5289      __kmp_team_pool = team->t.t_next_pool;
5290
5291      /* setup the team for fresh use */
5292      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5293
5294      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5295                    "task_team[1] %p to NULL\n",
5296                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
5297      team->t.t_task_team[0] = NULL;
5298      team->t.t_task_team[1] = NULL;
5299
5300      /* reallocate space for arguments if necessary */
5301      __kmp_alloc_argv_entries(argc, team, TRUE);
5302      KMP_CHECK_UPDATE(team->t.t_argc, argc);
5303
5304      KA_TRACE(
5305          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5306               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5307      { // Initialize barrier data.
5308        int b;
5309        for (b = 0; b < bs_last_barrier; ++b) {
5310          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5311#if USE_DEBUGGER
5312          team->t.t_bar[b].b_master_arrived = 0;
5313          team->t.t_bar[b].b_team_arrived = 0;
5314#endif
5315        }
5316      }
5317
5318#if OMP_40_ENABLED
5319      team->t.t_proc_bind = new_proc_bind;
5320#endif
5321
5322      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5323                    team->t.t_id));
5324
5325#if OMPT_SUPPORT
5326      __ompt_team_assign_id(team, ompt_parallel_data);
5327#endif
5328
5329      KMP_MB();
5330
5331      return team;
5332    }
5333
5334    /* reap team if it is too small, then loop back and check the next one */
5335    // not sure if this is wise, but it will be redone during the hot-teams
5336    // rewrite.
5337    /* TODO: Use technique to find the right size hot-team, don't reap them */
5338    team = __kmp_reap_team(team);
5339    __kmp_team_pool = team;
5340  }
5341
5342  /* nothing available in the pool, no matter, make a new team! */
5343  KMP_MB();
5344  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5345
5346  /* and set it up */
5347  team->t.t_max_nproc = max_nproc;
5348  /* NOTE well, for some reason allocating one big buffer and dividing it up
5349     seems to really hurt performance a lot on the P4, so let's not use this. */
5350  __kmp_allocate_team_arrays(team, max_nproc);
5351
5352  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5353  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5354
5355  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5356                "%p to NULL\n",
5357                &team->t.t_task_team[0], &team->t.t_task_team[1]));
5358  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5359  // memory, no need to duplicate
5360  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5361  // memory, no need to duplicate
5362
5363  if (__kmp_storage_map) {
5364    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5365  }
5366
5367  /* allocate space for arguments */
5368  __kmp_alloc_argv_entries(argc, team, FALSE);
5369  team->t.t_argc = argc;
5370
5371  KA_TRACE(20,
5372           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5373            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5374  { // Initialize barrier data.
5375    int b;
5376    for (b = 0; b < bs_last_barrier; ++b) {
5377      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5378#if USE_DEBUGGER
5379      team->t.t_bar[b].b_master_arrived = 0;
5380      team->t.t_bar[b].b_team_arrived = 0;
5381#endif
5382    }
5383  }
5384
5385#if OMP_40_ENABLED
5386  team->t.t_proc_bind = new_proc_bind;
5387#endif
5388
5389#if OMPT_SUPPORT
5390  __ompt_team_assign_id(team, ompt_parallel_data);
5391  team->t.ompt_serialized_team_info = NULL;
5392#endif
5393
5394  KMP_MB();
5395
5396  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5397                team->t.t_id));
5398
5399  return team;
5400}
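
// Illustrative sketch (not compiled): the team-pool scan used by
// __kmp_allocate_team above -- reuse a pooled team whose capacity is large
// enough, reap undersized ones as they are encountered, and fall back to a
// fresh allocation.  Types and names are hypothetical.
#if 0
#include <cstddef>

struct example_team {
  int max_nproc;           // capacity of the team's arrays
  example_team *next_pool; // link used only while the team is pooled
};

static example_team *example_allocate_team(example_team **pool,
                                           int max_nproc) {
  while (*pool != NULL) {
    example_team *t = *pool;
    *pool = t->next_pool;   // unlink the head in either case
    if (t->max_nproc >= max_nproc)
      return t;             // big enough: reuse it
    delete t;               // too small: reap it and keep scanning
  }
  example_team *t = new example_team(); // nothing suitable pooled
  t->max_nproc = max_nproc;
  t->next_pool = NULL;
  return t;
}
#endif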
5401
5402/* TODO implement hot-teams at all levels */
5403/* TODO implement lazy thread release on demand (disband request) */
5404
5405/* free the team.  return it to the team pool.  release all the threads
5406 * associated with it */
5407void __kmp_free_team(kmp_root_t *root,
5408                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5409  int f;
5410  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5411                team->t.t_id));
5412
5413  /* verify state */
5414  KMP_DEBUG_ASSERT(root);
5415  KMP_DEBUG_ASSERT(team);
5416  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5417  KMP_DEBUG_ASSERT(team->t.t_threads);
5418
5419  int use_hot_team = team == root->r.r_hot_team;
5420#if KMP_NESTED_HOT_TEAMS
5421  int level;
5422  kmp_hot_team_ptr_t *hot_teams;
5423  if (master) {
5424    level = team->t.t_active_level - 1;
5425    if (master->th.th_teams_microtask) { // in teams construct?
5426      if (master->th.th_teams_size.nteams > 1) {
5427        ++level; // level was not increased in teams construct for
5428        // team_of_masters
5429      }
5430      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5431          master->th.th_teams_level == team->t.t_level) {
5432        ++level; // level was not increased in teams construct for
5433        // team_of_workers before the parallel
5434      } // team->t.t_level will be increased inside parallel
5435    }
5436    hot_teams = master->th.th_hot_teams;
5437    if (level < __kmp_hot_teams_max_level) {
5438      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5439      use_hot_team = 1;
5440    }
5441  }
5442#endif // KMP_NESTED_HOT_TEAMS
5443
5444  /* team is done working */
5445  TCW_SYNC_PTR(team->t.t_pkfn,
5446               NULL); // Important for Debugging Support Library.
5447#if KMP_OS_WINDOWS
5448  team->t.t_copyin_counter = 0; // init counter for possible reuse
5449#endif
5450  // Do not reset pointer to parent team to NULL for hot teams.
5451
5452  /* if we are non-hot team, release our threads */
5453  if (!use_hot_team) {
5454    if (__kmp_tasking_mode != tskm_immediate_exec) {
5455      // Wait for threads to reach reapable state
5456      for (f = 1; f < team->t.t_nproc; ++f) {
5457        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5458        kmp_info_t *th = team->t.t_threads[f];
5459        volatile kmp_uint32 *state = &th->th.th_reap_state;
5460        while (*state != KMP_SAFE_TO_REAP) {
5461#if KMP_OS_WINDOWS
5462          // On Windows a thread can be killed at any time, check this
5463          DWORD ecode;
5464          if (!__kmp_is_thread_alive(th, &ecode)) {
5465            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5466            break;
5467          }
5468#endif
5469          // first check if thread is sleeping
5470          kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5471          if (fl.is_sleeping())
5472            fl.resume(__kmp_gtid_from_thread(th));
5473          KMP_CPU_PAUSE();
5474        }
5475      }
5476
5477      // Delete task teams
5478      int tt_idx;
5479      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5480        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5481        if (task_team != NULL) {
5482          for (f = 0; f < team->t.t_nproc;
5483               ++f) { // Have all threads unref task teams
5484            team->t.t_threads[f]->th.th_task_team = NULL;
5485          }
5486          KA_TRACE(
5487              20,
5488              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5489               __kmp_get_gtid(), task_team, team->t.t_id));
5490#if KMP_NESTED_HOT_TEAMS
5491          __kmp_free_task_team(master, task_team);
5492#endif
5493          team->t.t_task_team[tt_idx] = NULL;
5494        }
5495      }
5496    }
5497
5498    // Reset pointer to parent team only for non-hot teams.
5499    team->t.t_parent = NULL;
5500    team->t.t_level = 0;
5501    team->t.t_active_level = 0;
5502
5503    /* free the worker threads */
5504    for (f = 1; f < team->t.t_nproc; ++f) {
5505      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5506      __kmp_free_thread(team->t.t_threads[f]);
5507      team->t.t_threads[f] = NULL;
5508    }
5509
5510    /* put the team back in the team pool */
5511    /* TODO limit size of team pool, call reap_team if pool too large */
5512    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5513    __kmp_team_pool = (volatile kmp_team_t *)team;
5514  }
5515
5516  KMP_MB();
5517}
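
// Illustrative sketch (not compiled): the "wait until the worker is safe to
// reap" spin used in __kmp_free_team above, reduced to a polite poll on an
// atomic flag.  The real loop additionally wakes a worker that is sleeping on
// its fork/join barrier flag and, on Windows, bails out if the thread has
// already died.  Names are hypothetical.
#if 0
#include <atomic>
#include <thread>

static void example_wait_reapable(const std::atomic<int> *state,
                                  int safe_value) {
  while (state->load(std::memory_order_acquire) != safe_value)
    std::this_thread::yield(); // don't burn a core while waiting
}
#endif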
5518
5519/* reap the team.  destroy it, reclaim all its resources and free its memory */
5520kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5521  kmp_team_t *next_pool = team->t.t_next_pool;
5522
5523  KMP_DEBUG_ASSERT(team);
5524  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5525  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5526  KMP_DEBUG_ASSERT(team->t.t_threads);
5527  KMP_DEBUG_ASSERT(team->t.t_argv);
5528
5529  /* TODO clean the threads that are a part of this? */
5530
5531  /* free stuff */
5532  __kmp_free_team_arrays(team);
5533  if (team->t.t_argv != &team->t.t_inline_argv[0])
5534    __kmp_free((void *)team->t.t_argv);
5535  __kmp_free(team);
5536
5537  KMP_MB();
5538  return next_pool;
5539}
5540
5541// Free the thread.  Don't reap it, just place it on the pool of available
5542// threads.
5543//
5544// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5545// binding for the affinity mechanism to be useful.
5546//
5547// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5548// However, we want to avoid a potential performance problem by always
5549// scanning through the list to find the correct point at which to insert
5550// the thread (potential N**2 behavior).  To do this we keep track of the
5551// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5552// With single-level parallelism, threads will always be added to the tail
5553// of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5554// parallelism, all bets are off and we may need to scan through the entire
5555// free list.
5556//
5557// This change also has a potentially large performance benefit, for some
5558// applications.  Previously, as threads were freed from the hot team, they
5559// would be placed back on the free list in inverse order.  If the hot team
5560// grew back to it's original size, then the freed thread would be placed
5561// back on the hot team in reverse order.  This could cause bad cache
5562// locality problems on programs where the size of the hot team regularly
5563// grew and shrunk.
5564//
5565// Now, for single-level parallelism, the OMP tid is always == gtid.
5566void __kmp_free_thread(kmp_info_t *this_th) {
5567  int gtid;
5568  kmp_info_t **scan;
5569  kmp_root_t *root = this_th->th.th_root;
5570
5571  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5572                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5573
5574  KMP_DEBUG_ASSERT(this_th);
5575
5576  // When moving the thread to the pool, switch it to wait on its own b_go flag
5577  // and an uninitialized (NULL) team.
5578  int b;
5579  kmp_balign_t *balign = this_th->th.th_bar;
5580  for (b = 0; b < bs_last_barrier; ++b) {
5581    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5582      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5583    balign[b].bb.team = NULL;
5584    balign[b].bb.leaf_kids = 0;
5585  }
5586  this_th->th.th_task_state = 0;
5587  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5588
5589  /* put thread back on the free pool */
5590  TCW_PTR(this_th->th.th_team, NULL);
5591  TCW_PTR(this_th->th.th_root, NULL);
5592  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5593
5594  /* If the implicit task assigned to this thread can be used by other threads,
5595   * then multiple threads may share the data and try to free the task in
5596   * __kmp_reap_thread at exit. This duplicate use of the task data is more
5597   * likely when the hot team is disabled, but it can occur even when the hot
5598   * team is enabled. */
5599  __kmp_free_implicit_task(this_th);
5600  this_th->th.th_current_task = NULL;
5601
5602  // If the __kmp_thread_pool_insert_pt is already past the new insert
5603  // point, then we need to re-scan the entire list.
5604  gtid = this_th->th.th_info.ds.ds_gtid;
5605  if (__kmp_thread_pool_insert_pt != NULL) {
5606    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5607    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5608      __kmp_thread_pool_insert_pt = NULL;
5609    }
5610  }
5611
5612  // Scan down the list to find the place to insert the thread.
5613  // scan is the address of a link in the list, possibly the address of
5614  // __kmp_thread_pool itself.
5615  //
5616  // In the absence of nested parallelism, the for loop will have 0 iterations.
5617  if (__kmp_thread_pool_insert_pt != NULL) {
5618    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5619  } else {
5620    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5621  }
5622  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5623       scan = &((*scan)->th.th_next_pool))
5624    ;
5625
5626  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5627  // to its address.
5628  TCW_PTR(this_th->th.th_next_pool, *scan);
5629  __kmp_thread_pool_insert_pt = *scan = this_th;
5630  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5631                   (this_th->th.th_info.ds.ds_gtid <
5632                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5633  TCW_4(this_th->th.th_in_pool, TRUE);
5634  __kmp_thread_pool_nth++;
5635
5636  TCW_4(__kmp_nth, __kmp_nth - 1);
5637  root->r.r_cg_nthreads--;
5638
5639#ifdef KMP_ADJUST_BLOCKTIME
5640  /* Adjust blocktime back to user setting or default if necessary */
5641  /* Middle initialization might never have occurred                */
5642  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5643    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5644    if (__kmp_nth <= __kmp_avail_proc) {
5645      __kmp_zero_bt = FALSE;
5646    }
5647  }
5648#endif /* KMP_ADJUST_BLOCKTIME */
5649
5650  KMP_MB();
5651}
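
// Illustrative sketch (not compiled): the sorted free-list insertion used by
// __kmp_free_thread above.  The pool is kept ordered by gtid, and a cached
// insertion point avoids rescanning the whole list in the common single-level
// case.  Types and names are hypothetical.
#if 0
#include <cstddef>

struct example_node {
  int gtid;
  example_node *next;
};

static example_node *example_insert_pt = NULL; // last insertion point

static void example_pool_insert(example_node **pool, example_node *n) {
  // If the cached point is already past the new node, fall back to a full
  // scan from the head of the list.
  if (example_insert_pt != NULL && example_insert_pt->gtid > n->gtid)
    example_insert_pt = NULL;

  example_node **scan =
      example_insert_pt ? &example_insert_pt->next : pool;
  while (*scan != NULL && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;

  n->next = *scan;               // splice in front of the first larger gtid
  example_insert_pt = *scan = n; // remember where we inserted
}
#endif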
5652
5653/* ------------------------------------------------------------------------ */
5654
5655void *__kmp_launch_thread(kmp_info_t *this_thr) {
5656  int gtid = this_thr->th.th_info.ds.ds_gtid;
5657  /*    void                 *stack_data;*/
5658  kmp_team_t *(*volatile pteam);
5659
5660  KMP_MB();
5661  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5662
5663  if (__kmp_env_consistency_check) {
5664    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5665  }
5666
5667#if OMPT_SUPPORT
5668  ompt_data_t *thread_data;
5669  if (ompt_enabled.enabled) {
5670    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5671    *thread_data = ompt_data_none;
5672
5673    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5674    this_thr->th.ompt_thread_info.wait_id = 0;
5675    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5676    if (ompt_enabled.ompt_callback_thread_begin) {
5677      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5678          ompt_thread_worker, thread_data);
5679    }
5680  }
5681#endif
5682
5683#if OMPT_SUPPORT
5684  if (ompt_enabled.enabled) {
5685    this_thr->th.ompt_thread_info.state = ompt_state_idle;
5686  }
5687#endif
5688  /* This is the place where threads wait for work */
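  /* Descriptive summary (added note, not from the original source): each pass
     through the loop below parks the worker in the fork barrier, then -- once a
     team and a microtask (t_pkfn) have been assigned -- invokes the microtask
     via t_invoke() and rejoins through the join barrier, repeating until g_done
     is set during library shutdown. */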
5689  while (!TCR_4(__kmp_global.g.g_done)) {
5690    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5691    KMP_MB();
5692
5693    /* wait for work to do */
5694    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5695
5696    /* No tid yet since not part of a team */
5697    __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5698
5699#if OMPT_SUPPORT
5700    if (ompt_enabled.enabled) {
5701      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5702    }
5703#endif
5704
5705    pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5706
5707    /* have we been allocated? */
5708    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5709      /* we were just woken up, so run our new task */
5710      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5711        int rc;
5712        KA_TRACE(20,
5713                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5714                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5715                  (*pteam)->t.t_pkfn));
5716
5717        updateHWFPControl(*pteam);
5718
5719#if OMPT_SUPPORT
5720        if (ompt_enabled.enabled) {
5721          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5722        }
5723#endif
5724
5725        rc = (*pteam)->t.t_invoke(gtid);
5726        KMP_ASSERT(rc);
5727
5728        KMP_MB();
5729        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5730                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5731                      (*pteam)->t.t_pkfn));
5732      }
5733#if OMPT_SUPPORT
5734      if (ompt_enabled.enabled) {
5735        /* no frame set while outside task */
5736        __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5737
5738        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5739      }
5740#endif
5741      /* join barrier after parallel region */
5742      __kmp_join_barrier(gtid);
5743    }
5744  }
5745  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5746
5747#if OMPT_SUPPORT
5748  if (ompt_enabled.ompt_callback_thread_end) {
5749    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5750  }
5751#endif
5752
5753  this_thr->th.th_task_team = NULL;
5754  /* run the destructors for the threadprivate data for this thread */
5755  __kmp_common_destroy_gtid(gtid);
5756
5757  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5758  KMP_MB();
5759  return this_thr;
5760}
5761
5762/* ------------------------------------------------------------------------ */
5763
5764void __kmp_internal_end_dest(void *specific_gtid) {
5765#if KMP_COMPILER_ICC
5766#pragma warning(push)
5767#pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5768// significant bits
5769#endif
5770  // Make sure no significant bits are lost
5771  int gtid = (kmp_intptr_t)specific_gtid - 1;
5772#if KMP_COMPILER_ICC
5773#pragma warning(pop)
5774#endif
5775
5776  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5777  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5778   * because 0 is reserved for the nothing-stored case */
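  /* Example (added for illustration): gtid 0 is stored as 1 and gtid 5 as 6,
     so a raw TLS value of 0 unambiguously means "no gtid stored". */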
5779
5780  /* josh: One reason for setting the gtid specific data even when it is being
5781     destroyed by pthread is to allow gtid lookup through thread specific data
5782     (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5783     that gets executed in the call to __kmp_internal_end_thread, actually
5784     gets the gtid through the thread specific data.  Setting it here seems
5785     rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5786     to run smoothly.
5787     todo: get rid of this after we remove the dependence on
5788     __kmp_gtid_get_specific  */
5789  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5790    __kmp_gtid_set_specific(gtid);
5791#ifdef KMP_TDATA_GTID
5792  __kmp_gtid = gtid;
5793#endif
5794  __kmp_internal_end_thread(gtid);
5795}
5796
5797#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5798
5799// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5800// cases destructors work perfectly, but in real libomp.so I have no evidence it
5801// is ever called. However, the -fini linker option in makefile.mk works fine.
5802
5803__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5804  __kmp_internal_end_atexit();
5805}
5806
5807void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5808
5809#endif
5810
5811/* [Windows] josh: when the atexit handler is called, there may still be more
5812   than one thread alive */
5813void __kmp_internal_end_atexit(void) {
5814  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5815  /* [Windows]
5816     josh: ideally, we want to completely shut down the library in this atexit
5817     handler, but stat code that depends on thread specific data for gtid fails
5818     because that data becomes unavailable at some point during the shutdown, so
5819     we call __kmp_internal_end_thread instead. We should eventually remove the
5820     dependency on __kmp_get_specific_gtid in the stat code and use
5821     __kmp_internal_end_library to cleanly shut down the library.
5822
5823     // TODO: Can some of this comment about GVS be removed?
5824     I suspect that the offending stat code is executed when the calling thread
5825     tries to clean up a dead root thread's data structures, resulting in GVS
5826     code trying to close the GVS structures for that thread, but since the stat
5827     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5828     the calling thread is cleaning up itself instead of another thread, it gets
5829     confused. This happens because allowing a thread to unregister and clean up
5830     another thread is a recent modification for addressing an issue.
5831     Based on the current design (20050722), a thread may end up
5832     trying to unregister another thread only if thread death does not trigger
5833     the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5834     thread specific data destructor function to detect thread death. For
5835     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5836     is nothing. Thus, the workaround is applicable only to the Windows static
5837     stat library. */
5838  __kmp_internal_end_library(-1);
5839#if KMP_OS_WINDOWS
5840  __kmp_close_console();
5841#endif
5842}
5843
5844static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5845  // It is assumed __kmp_forkjoin_lock is acquired.
5846
5847  int gtid;
5848
5849  KMP_DEBUG_ASSERT(thread != NULL);
5850
5851  gtid = thread->th.th_info.ds.ds_gtid;
5852
5853  if (!is_root) {
5854
5855    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5856      /* Assume the threads are at the fork barrier here */
5857      KA_TRACE(
5858          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5859               gtid));
5860      /* Need release fence here to prevent seg faults for tree forkjoin barrier
5861       * (GEH) */
5862      ANNOTATE_HAPPENS_BEFORE(thread);
5863      kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5864      __kmp_release_64(&flag);
5865    }
5866
5867    // Terminate OS thread.
5868    __kmp_reap_worker(thread);
5869
5870    // The thread was killed asynchronously.  If it was actively
5871    // spinning in the thread pool, decrement the global count.
5872    //
5873    // There is a small timing hole here - if the worker thread was just waking
5874    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5875    // not yet decremented the global counter __kmp_thread_pool_active_nth, then
5876    // the global counter might not get updated.
5877    //
5878    // Currently, this can only happen as the library is unloaded,
5879    // so there are no harmful side effects.
5880    if (thread->th.th_active_in_pool) {
5881      thread->th.th_active_in_pool = FALSE;
5882      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5883      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5884    }
5885
5886    // Decrement # of [worker] threads in the pool.
5887    KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5888    --__kmp_thread_pool_nth;
5889  }
5890
5891  __kmp_free_implicit_task(thread);
5892
5893// Free the fast memory for tasking
5894#if USE_FAST_MEMORY
5895  __kmp_free_fast_memory(thread);
5896#endif /* USE_FAST_MEMORY */
5897
5898  __kmp_suspend_uninitialize_thread(thread);
5899
5900  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5901  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5902
5903  --__kmp_all_nth;
5904// __kmp_nth was decremented when the thread was added to the pool.
5905
5906#ifdef KMP_ADJUST_BLOCKTIME
5907  /* Adjust blocktime back to user setting or default if necessary */
5908  /* Middle initialization might never have occurred                */
5909  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5910    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5911    if (__kmp_nth <= __kmp_avail_proc) {
5912      __kmp_zero_bt = FALSE;
5913    }
5914  }
5915#endif /* KMP_ADJUST_BLOCKTIME */
5916
5917  /* free the memory being used */
5918  if (__kmp_env_consistency_check) {
5919    if (thread->th.th_cons) {
5920      __kmp_free_cons_stack(thread->th.th_cons);
5921      thread->th.th_cons = NULL;
5922    }
5923  }
5924
5925  if (thread->th.th_pri_common != NULL) {
5926    __kmp_free(thread->th.th_pri_common);
5927    thread->th.th_pri_common = NULL;
5928  }
5929
5930  if (thread->th.th_task_state_memo_stack != NULL) {
5931    __kmp_free(thread->th.th_task_state_memo_stack);
5932    thread->th.th_task_state_memo_stack = NULL;
5933  }
5934
5935#if KMP_USE_BGET
5936  if (thread->th.th_local.bget_data != NULL) {
5937    __kmp_finalize_bget(thread);
5938  }
5939#endif
5940
5941#if KMP_AFFINITY_SUPPORTED
5942  if (thread->th.th_affin_mask != NULL) {
5943    KMP_CPU_FREE(thread->th.th_affin_mask);
5944    thread->th.th_affin_mask = NULL;
5945  }
5946#endif /* KMP_AFFINITY_SUPPORTED */
5947
5948#if KMP_USE_HIER_SCHED
5949  if (thread->th.th_hier_bar_data != NULL) {
5950    __kmp_free(thread->th.th_hier_bar_data);
5951    thread->th.th_hier_bar_data = NULL;
5952  }
5953#endif
5954
5955  __kmp_reap_team(thread->th.th_serial_team);
5956  thread->th.th_serial_team = NULL;
5957  __kmp_free(thread);
5958
5959  KMP_MB();
5960
5961} // __kmp_reap_thread
5962
5963static void __kmp_internal_end(void) {
5964  int i;
5965
5966  /* First, unregister the library */
5967  __kmp_unregister_library();
5968
5969#if KMP_OS_WINDOWS
5970  /* In Win static library, we can't tell when a root actually dies, so we
5971     reclaim the data structures for any root threads that have died but not
5972     unregistered themselves, in order to shut down cleanly.
5973     In Win dynamic library we also can't tell when a thread dies.  */
5974  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5975// dead roots
5976#endif
5977
5978  for (i = 0; i < __kmp_threads_capacity; i++)
5979    if (__kmp_root[i])
5980      if (__kmp_root[i]->r.r_active)
5981        break;
5982  KMP_MB(); /* Flush all pending memory write invalidates.  */
5983  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5984
5985  if (i < __kmp_threads_capacity) {
5986#if KMP_USE_MONITOR
5987    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5988    KMP_MB(); /* Flush all pending memory write invalidates.  */
5989
5990    // Need to check that monitor was initialized before reaping it. If we are
5991    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5992    // __kmp_monitor will appear to contain valid data, but it is only valid in
5993    // the parent process, not the child.
5994    // New behavior (201008): instead of keying off of the flag
5995    // __kmp_init_parallel, the monitor thread creation is keyed off
5996    // of the new flag __kmp_init_monitor.
5997    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5998    if (TCR_4(__kmp_init_monitor)) {
5999      __kmp_reap_monitor(&__kmp_monitor);
6000      TCW_4(__kmp_init_monitor, 0);
6001    }
6002    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6003    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6004#endif // KMP_USE_MONITOR
6005  } else {
6006/* TODO move this to cleanup code */
6007#ifdef KMP_DEBUG
6008    /* make sure that everything has properly ended */
6009    for (i = 0; i < __kmp_threads_capacity; i++) {
6010      if (__kmp_root[i]) {
6011        //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6012        //                    there can be uber threads alive here
6013        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6014      }
6015    }
6016#endif
6017
6018    KMP_MB();
6019
6020    // Reap the worker threads.
6021    // This is valid for now, but be careful if threads are reaped sooner.
6022    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6023      // Get the next thread from the pool.
6024      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6025      __kmp_thread_pool = thread->th.th_next_pool;
6026      // Reap it.
6027      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6028      thread->th.th_next_pool = NULL;
6029      thread->th.th_in_pool = FALSE;
6030      __kmp_reap_thread(thread, 0);
6031    }
6032    __kmp_thread_pool_insert_pt = NULL;
6033
6034    // Reap teams.
6035    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6036      // Get the next team from the pool.
6037      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6038      __kmp_team_pool = team->t.t_next_pool;
6039      // Reap it.
6040      team->t.t_next_pool = NULL;
6041      __kmp_reap_team(team);
6042    }
6043
6044    __kmp_reap_task_teams();
6045
6046#if KMP_OS_UNIX
6047    // Threads that are not reaped should not access any resources since they
6048    // are going to be deallocated soon, so the shutdown sequence should wait
6049    // until all threads either exit the final spin-waiting loop or begin
6050    // sleeping after the given blocktime.
6051    for (i = 0; i < __kmp_threads_capacity; i++) {
6052      kmp_info_t *thr = __kmp_threads[i];
6053      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6054        KMP_CPU_PAUSE();
6055    }
6056#endif
6057
6058    for (i = 0; i < __kmp_threads_capacity; ++i) {
6059      // TBD: Add some checking...
6060      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6061    }
6062
6063    /* Make sure all threadprivate destructors get run by joining with all
6064       worker threads before resetting this flag */
6065    TCW_SYNC_4(__kmp_init_common, FALSE);
6066
6067    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6068    KMP_MB();
6069
6070#if KMP_USE_MONITOR
6071    // See note above: One of the possible fixes for CQ138434 / CQ140126
6072    //
6073    // FIXME: push both code fragments down and CSE them?
6074    // push them into __kmp_cleanup() ?
6075    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6076    if (TCR_4(__kmp_init_monitor)) {
6077      __kmp_reap_monitor(&__kmp_monitor);
6078      TCW_4(__kmp_init_monitor, 0);
6079    }
6080    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6081    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6082#endif
6083  } /* else !__kmp_global.t_active */
6084  TCW_4(__kmp_init_gtid, FALSE);
6085  KMP_MB(); /* Flush all pending memory write invalidates.  */
6086
6087  __kmp_cleanup();
6088#if OMPT_SUPPORT
6089  ompt_fini();
6090#endif
6091}
6092
6093void __kmp_internal_end_library(int gtid_req) {
6094  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6095  /* this shouldn't be a race condition because __kmp_internal_end() is the
6096     only place to clear __kmp_serial_init */
6097  /* we'll check this later too, after we get the lock */
6098  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6099  // redundant, because the next check will work in any case.
6100  if (__kmp_global.g.g_abort) {
6101    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6102    /* TODO abort? */
6103    return;
6104  }
6105  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6106    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6107    return;
6108  }
6109
6110  KMP_MB(); /* Flush all pending memory write invalidates.  */
6111
6112  /* find out who we are and what we should do */
6113  {
6114    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6115    KA_TRACE(
6116        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6117    if (gtid == KMP_GTID_SHUTDOWN) {
6118      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6119                    "already shutdown\n"));
6120      return;
6121    } else if (gtid == KMP_GTID_MONITOR) {
6122      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6123                    "registered, or system shutdown\n"));
6124      return;
6125    } else if (gtid == KMP_GTID_DNE) {
6126      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6127                    "shutdown\n"));
6128      /* we don't know who we are, but we may still shut down the library */
6129    } else if (KMP_UBER_GTID(gtid)) {
6130      /* unregister ourselves as an uber thread.  gtid is no longer valid */
6131      if (__kmp_root[gtid]->r.r_active) {
6132        __kmp_global.g.g_abort = -1;
6133        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6134        KA_TRACE(10,
6135                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6136                  gtid));
6137        return;
6138      } else {
6139        KA_TRACE(
6140            10,
6141            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6142        __kmp_unregister_root_current_thread(gtid);
6143      }
6144    } else {
6145/* worker threads may call this function through the atexit handler, if they
6146 * call exit() */
6147/* For now, skip the usual subsequent processing and just dump the debug buffer.
6148   TODO: do a thorough shutdown instead */
6149#ifdef DUMP_DEBUG_ON_EXIT
6150      if (__kmp_debug_buf)
6151        __kmp_dump_debug_buffer();
6152#endif
6153      return;
6154    }
6155  }
6156  /* synchronize the termination process */
6157  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6158
6159  /* have we already finished */
6160  if (__kmp_global.g.g_abort) {
6161    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6162    /* TODO abort? */
6163    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6164    return;
6165  }
6166  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6167    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6168    return;
6169  }
6170
6171  /* We need this lock to enforce mutual exclusion between this reading of
6172     __kmp_threads_capacity and the writing by __kmp_register_root.
6173     Alternatively, we can use a counter of roots that is atomically updated by
6174     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6175     __kmp_internal_end_*.  */
6176  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6177
6178  /* now we can safely conduct the actual termination */
6179  __kmp_internal_end();
6180
6181  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6182  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6183
6184  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6185
6186#ifdef DUMP_DEBUG_ON_EXIT
6187  if (__kmp_debug_buf)
6188    __kmp_dump_debug_buffer();
6189#endif
6190
6191#if KMP_OS_WINDOWS
6192  __kmp_close_console();
6193#endif
6194
6195  __kmp_fini_allocator();
6196
6197} // __kmp_internal_end_library
6198
6199void __kmp_internal_end_thread(int gtid_req) {
6200  int i;
6201
6202  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6203  /* this shouldn't be a race condition because __kmp_internal_end() is the
6204   * only place to clear __kmp_serial_init */
6205  /* we'll check this later too, after we get the lock */
6206  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6207  // redundant, because the next check will work in any case.
6208  if (__kmp_global.g.g_abort) {
6209    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6210    /* TODO abort? */
6211    return;
6212  }
6213  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6214    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6215    return;
6216  }
6217
6218  KMP_MB(); /* Flush all pending memory write invalidates.  */
6219
6220  /* find out who we are and what we should do */
6221  {
6222    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6223    KA_TRACE(10,
6224             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6225    if (gtid == KMP_GTID_SHUTDOWN) {
6226      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6227                    "already shutdown\n"));
6228      return;
6229    } else if (gtid == KMP_GTID_MONITOR) {
6230      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6231                    "registered, or system shutdown\n"));
6232      return;
6233    } else if (gtid == KMP_GTID_DNE) {
6234      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6235                    "shutdown\n"));
6236      return;
6237      /* we don't know who we are */
6238    } else if (KMP_UBER_GTID(gtid)) {
6239      /* unregister ourselves as an uber thread.  gtid is no longer valid */
6240      if (__kmp_root[gtid]->r.r_active) {
6241        __kmp_global.g.g_abort = -1;
6242        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6243        KA_TRACE(10,
6244                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6245                  gtid));
6246        return;
6247      } else {
6248        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6249                      gtid));
6250        __kmp_unregister_root_current_thread(gtid);
6251      }
6252    } else {
6253      /* just a worker thread, let's leave */
6254      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6255
6256      if (gtid >= 0) {
6257        __kmp_threads[gtid]->th.th_task_team = NULL;
6258      }
6259
6260      KA_TRACE(10,
6261               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6262                gtid));
6263      return;
6264    }
6265  }
6266#if KMP_DYNAMIC_LIB
6267  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
6268  // uber thread, because it is better to shut down later in the library
6269  // destructor. The reason for this change is a performance problem when a
6270  // non-OpenMP thread in a loop forks and joins many OpenMP threads. We can
6271  // save a lot of time keeping worker threads alive until program shutdown.
6272  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6273  // and Windows(DPD200287443) that occurs when using critical sections from
6274  // foreign threads.
6275  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6276  return;
6277#endif
6278  /* synchronize the termination process */
6279  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6280
6281  /* have we already finished */
6282  if (__kmp_global.g.g_abort) {
6283    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6284    /* TODO abort? */
6285    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6286    return;
6287  }
6288  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6289    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6290    return;
6291  }
6292
6293  /* We need this lock to enforce mutual exclusion between this reading of
6294     __kmp_threads_capacity and the writing by __kmp_register_root.
6295     Alternatively, we can use a counter of roots that is atomically updated by
6296     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6297     __kmp_internal_end_*.  */
6298
6299  /* should we finish the run-time?  are all siblings done? */
6300  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6301
6302  for (i = 0; i < __kmp_threads_capacity; ++i) {
6303    if (KMP_UBER_GTID(i)) {
6304      KA_TRACE(
6305          10,
6306          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6307      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6308      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6309      return;
6310    }
6311  }
6312
6313  /* now we can safely conduct the actual termination */
6314
6315  __kmp_internal_end();
6316
6317  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6318  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6319
6320  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6321
6322#ifdef DUMP_DEBUG_ON_EXIT
6323  if (__kmp_debug_buf)
6324    __kmp_dump_debug_buffer();
6325#endif
6326} // __kmp_internal_end_thread
6327
6328// -----------------------------------------------------------------------------
6329// Library registration stuff.
6330
6331static long __kmp_registration_flag = 0;
6332// Random value used to indicate library initialization.
6333static char *__kmp_registration_str = NULL;
6334// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6335
6336static inline char *__kmp_reg_status_name() {
6337  /* On RHEL 3u5, if linked statically, getpid() returns different values in
6338     each thread. If registration and unregistration happen in different threads
6339     (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6340     cannot be found, because its name will contain a different pid. */
6341  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6342} // __kmp_reg_status_name
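// Illustrative example (the concrete values are assumptions, not from the
// source): for a process with pid 12345 the variable is named
// "__KMP_REGISTERED_LIB_12345" and, per the "%p-%lx-%s" format used below, its
// value looks like "0x7f12a3b45678-cafe1a2b-libomp.so"; a second copy of the
// runtime parses these three fields to decide whether the registering copy is
// still alive.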
6343
6344void __kmp_register_library_startup(void) {
6345
6346  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6347  int done = 0;
6348  union {
6349    double dtime;
6350    long ltime;
6351  } time;
6352#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6353  __kmp_initialize_system_tick();
6354#endif
6355  __kmp_read_system_time(&time.dtime);
6356  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6357  __kmp_registration_str =
6358      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6359                       __kmp_registration_flag, KMP_LIBRARY_FILE);
6360
6361  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6362                __kmp_registration_str));
6363
6364  while (!done) {
6365
6366    char *value = NULL; // Actual value of the environment variable.
6367
6368    // Set the environment variable, but don't overwrite it if it already exists.
6369    __kmp_env_set(name, __kmp_registration_str, 0);
6370    // Check that the variable was actually set.
6371    value = __kmp_env_get(name);
6372    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6373
6374      done = 1; // Ok, environment variable set successfully, exit the loop.
6375
6376    } else {
6377
6378      // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6379      // Check whether it is alive or dead.
6380      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6381      char *tail = value;
6382      char *flag_addr_str = NULL;
6383      char *flag_val_str = NULL;
6384      char const *file_name = NULL;
6385      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6386      __kmp_str_split(tail, '-', &flag_val_str, &tail);
6387      file_name = tail;
6388      if (tail != NULL) {
6389        long *flag_addr = 0;
6390        long flag_val = 0;
6391        KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6392        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6393        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6394          // First, check whether environment-encoded address is mapped into
6395          // addr space.
6396          // If so, dereference it to see if it still has the right value.
6397          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6398            neighbor = 1;
6399          } else {
6400            // If not, then we know the other copy of the library is no longer
6401            // running.
6402            neighbor = 2;
6403          }
6404        }
6405      }
6406      switch (neighbor) {
6407      case 0: // Cannot parse environment variable -- neighbor status unknown.
6408        // Assume it is an incompatible format from a future version of the
6409        // library, and assume the other library is alive.
6410        // WARN( ... ); // TODO: Issue a warning.
6411        file_name = "unknown library";
6412      // Attention! Falling through to the next case is intentional.
6413      case 1: { // Neighbor is alive.
6414        // Check whether duplicate libraries are allowed.
6415        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6416        if (!__kmp_str_match_true(duplicate_ok)) {
6417          // That's not allowed. Issue fatal error.
6418          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6419                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6420        }
6421        KMP_INTERNAL_FREE(duplicate_ok);
6422        __kmp_duplicate_library_ok = 1;
6423        done = 1; // Exit the loop.
6424      } break;
6425      case 2: { // Neighbor is dead.
6426        // Clear the variable and try to register library again.
6427        __kmp_env_unset(name);
6428      } break;
6429      default: { KMP_DEBUG_ASSERT(0); } break;
6430      }
6431    }
6432    KMP_INTERNAL_FREE((void *)value);
6433  }
6434  KMP_INTERNAL_FREE((void *)name);
6435
6436} // func __kmp_register_library_startup
6437
6438void __kmp_unregister_library(void) {
6439
6440  char *name = __kmp_reg_status_name();
6441  char *value = __kmp_env_get(name);
6442
6443  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6444  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6445  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6446    // Ok, this is our variable. Delete it.
6447    __kmp_env_unset(name);
6448  }
6449
6450  KMP_INTERNAL_FREE(__kmp_registration_str);
6451  KMP_INTERNAL_FREE(value);
6452  KMP_INTERNAL_FREE(name);
6453
6454  __kmp_registration_flag = 0;
6455  __kmp_registration_str = NULL;
6456
6457} // __kmp_unregister_library
6458
6459// End of Library registration stuff.
6460// -----------------------------------------------------------------------------
6461
6462#if KMP_MIC_SUPPORTED
6463
6464static void __kmp_check_mic_type() {
6465  kmp_cpuid_t cpuid_state = {0};
6466  kmp_cpuid_t *cs_p = &cpuid_state;
6467  __kmp_x86_cpuid(1, 0, cs_p);
6468  // We don't support mic1 at the moment
6469  if ((cs_p->eax & 0xff0) == 0xB10) {
6470    __kmp_mic_type = mic2;
6471  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6472    __kmp_mic_type = mic3;
6473  } else {
6474    __kmp_mic_type = non_mic;
6475  }
6476}
6477
6478#endif /* KMP_MIC_SUPPORTED */
6479
6480static void __kmp_do_serial_initialize(void) {
6481  int i, gtid;
6482  int size;
6483
6484  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6485
6486  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6487  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6488  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6489  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6490  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6491
6492#if OMPT_SUPPORT
6493  ompt_pre_init();
6494#endif
6495
6496  __kmp_validate_locks();
6497
6498  /* Initialize internal memory allocator */
6499  __kmp_init_allocator();
6500
6501  /* Register the library startup via an environment variable and check to see
6502     whether another copy of the library is already registered. */
6503
6504  __kmp_register_library_startup();
6505
6506  /* TODO reinitialization of library */
6507  if (TCR_4(__kmp_global.g.g_done)) {
6508    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6509  }
6510
6511  __kmp_global.g.g_abort = 0;
6512  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6513
6514/* initialize the locks */
6515#if KMP_USE_ADAPTIVE_LOCKS
6516#if KMP_DEBUG_ADAPTIVE_LOCKS
6517  __kmp_init_speculative_stats();
6518#endif
6519#endif
6520#if KMP_STATS_ENABLED
6521  __kmp_stats_init();
6522#endif
6523  __kmp_init_lock(&__kmp_global_lock);
6524  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6525  __kmp_init_lock(&__kmp_debug_lock);
6526  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6527  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6528  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6529  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6530  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6531  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6532  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6533  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6534  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6535  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6536  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6537  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6538  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6539  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6540  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6541#if KMP_USE_MONITOR
6542  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6543#endif
6544  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6545
6546  /* conduct initialization and initial setup of configuration */
6547
6548  __kmp_runtime_initialize();
6549
6550#if KMP_MIC_SUPPORTED
6551  __kmp_check_mic_type();
6552#endif
6553
6554// Some global variable initialization moved here from kmp_env_initialize()
6555#ifdef KMP_DEBUG
6556  kmp_diag = 0;
6557#endif
6558  __kmp_abort_delay = 0;
6559
6560  // From __kmp_init_dflt_team_nth()
6561  /* assume the entire machine will be used */
6562  __kmp_dflt_team_nth_ub = __kmp_xproc;
6563  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6564    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6565  }
6566  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6567    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6568  }
6569  __kmp_max_nth = __kmp_sys_max_nth;
6570  __kmp_cg_max_nth = __kmp_sys_max_nth;
6571  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6572  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6573    __kmp_teams_max_nth = __kmp_sys_max_nth;
6574  }
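  // Illustrative example (machine size is an assumption): on a 64-proc system,
  // __kmp_dflt_team_nth_ub and __kmp_teams_max_nth both start at
  // __kmp_xproc == 64, while __kmp_max_nth and __kmp_cg_max_nth start at the
  // system-wide cap __kmp_sys_max_nth; __kmp_env_initialize() below may still
  // adjust these values.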
6575
6576  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6577  // part
6578  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6579#if KMP_USE_MONITOR
6580  __kmp_monitor_wakeups =
6581      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6582  __kmp_bt_intervals =
6583      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6584#endif
6585  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6586  __kmp_library = library_throughput;
6587  // From KMP_SCHEDULE initialization
6588  __kmp_static = kmp_sch_static_balanced;
6589// AC: do not use analytical here, because it is non-monotonic
6590//__kmp_guided = kmp_sch_guided_iterative_chunked;
6591//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6592// need to repeat assignment
6593// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6594// bit control and barrier method control parts
6595#if KMP_FAST_REDUCTION_BARRIER
6596#define kmp_reduction_barrier_gather_bb ((int)1)
6597#define kmp_reduction_barrier_release_bb ((int)1)
6598#define kmp_reduction_barrier_gather_pat bp_hyper_bar
6599#define kmp_reduction_barrier_release_pat bp_hyper_bar
6600#endif // KMP_FAST_REDUCTION_BARRIER
6601  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6602    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6603    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6604    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6605    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6606#if KMP_FAST_REDUCTION_BARRIER
6607    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6608      // lin_64 ): hyper,1
6609      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6610      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6611      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6612      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6613    }
6614#endif // KMP_FAST_REDUCTION_BARRIER
6615  }
6616#if KMP_FAST_REDUCTION_BARRIER
6617#undef kmp_reduction_barrier_release_pat
6618#undef kmp_reduction_barrier_gather_pat
6619#undef kmp_reduction_barrier_release_bb
6620#undef kmp_reduction_barrier_gather_bb
6621#endif // KMP_FAST_REDUCTION_BARRIER
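// Descriptive note (an assumption based on how branch bits are used elsewhere
// in the runtime): a branch-bits value of b corresponds to a barrier branching
// factor of 1 << b, so the reduction barrier's "hyper,1" setting above selects
// a binary hypercube, while the other barrier types keep the (typically wider)
// default fan-in.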
6622#if KMP_MIC_SUPPORTED
6623  if (__kmp_mic_type == mic2) { // KNC
6624    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6625    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6626    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6627        1; // forkjoin release
6628    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6629    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6630  }
6631#if KMP_FAST_REDUCTION_BARRIER
6632  if (__kmp_mic_type == mic2) { // KNC
6633    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6634    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6635  }
6636#endif // KMP_FAST_REDUCTION_BARRIER
6637#endif // KMP_MIC_SUPPORTED
6638
6639// From KMP_CHECKS initialization
6640#ifdef KMP_DEBUG
6641  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6642#else
6643  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6644#endif
6645
6646  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6647  __kmp_foreign_tp = TRUE;
6648
6649  __kmp_global.g.g_dynamic = FALSE;
6650  __kmp_global.g.g_dynamic_mode = dynamic_default;
6651
6652  __kmp_env_initialize(NULL);
6653
6654// Print all messages in message catalog for testing purposes.
6655#ifdef KMP_DEBUG
6656  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6657  if (__kmp_str_match_true(val)) {
6658    kmp_str_buf_t buffer;
6659    __kmp_str_buf_init(&buffer);
6660    __kmp_i18n_dump_catalog(&buffer);
6661    __kmp_printf("%s", buffer.str);
6662    __kmp_str_buf_free(&buffer);
6663  }
6664  __kmp_env_free(&val);
6665#endif
6666
6667  __kmp_threads_capacity =
6668      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6669  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6670  __kmp_tp_capacity = __kmp_default_tp_capacity(
6671      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6672
6673  // If the library is shut down properly, both pools must be NULL. Just in
6674  // case, set them to NULL -- some memory may leak, but subsequent code will
6675  // work even if pools are not freed.
6676  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6677  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6678  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6679  __kmp_thread_pool = NULL;
6680  __kmp_thread_pool_insert_pt = NULL;
6681  __kmp_team_pool = NULL;
6682
6683  /* Allocate all of the variable sized records */
6684  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6685   * expandable */
6686  /* Since allocation is cache-aligned, just add extra padding at the end */
6687  size =
6688      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6689      CACHE_LINE;
6690  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6691  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6692                               sizeof(kmp_info_t *) * __kmp_threads_capacity);
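  // Layout sketch of the single allocation above (descriptive note):
  //   [ kmp_info_t *[capacity] | kmp_root_t *[capacity] | CACHE_LINE padding ]
  //     ^ __kmp_threads          ^ __kmp_root
  // Keeping both arrays in one block lets the runtime grow them together
  // (see __kmp_expand_threads()).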
6693
6694  /* init thread counts */
6695  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6696                   0); // Asserts fail if the library is reinitializing and
6697  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6698  __kmp_all_nth = 0;
6699  __kmp_nth = 0;
6700
6701  /* setup the uber master thread and hierarchy */
6702  gtid = __kmp_register_root(TRUE);
6703  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6704  KMP_ASSERT(KMP_UBER_GTID(gtid));
6705  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6706
6707  KMP_MB(); /* Flush all pending memory write invalidates.  */
6708
6709  __kmp_common_initialize();
6710
6711#if KMP_OS_UNIX
6712  /* invoke the child fork handler */
6713  __kmp_register_atfork();
6714#endif
6715
6716#if !KMP_DYNAMIC_LIB
6717  {
6718    /* Invoke the exit handler when the program finishes, only for the static
6719       library. For the dynamic library, we already have _fini and DllMain. */
6720    int rc = atexit(__kmp_internal_end_atexit);
6721    if (rc != 0) {
6722      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6723                  __kmp_msg_null);
6724    }
6725  }
6726#endif
6727
6728#if KMP_HANDLE_SIGNALS
6729#if KMP_OS_UNIX
6730  /* NOTE: make sure that this is called before the user installs their own
6731     signal handlers so that the user handlers are called first. This way they
6732     can return false, not call our handler, avoid terminating the library, and
6733     continue execution where they left off. */
6734  __kmp_install_signals(FALSE);
6735#endif /* KMP_OS_UNIX */
6736#if KMP_OS_WINDOWS
6737  __kmp_install_signals(TRUE);
6738#endif /* KMP_OS_WINDOWS */
6739#endif
6740
6741  /* we have finished the serial initialization */
6742  __kmp_init_counter++;
6743
6744  __kmp_init_serial = TRUE;
6745
6746  if (__kmp_settings) {
6747    __kmp_env_print();
6748  }
6749
6750#if OMP_40_ENABLED
6751  if (__kmp_display_env || __kmp_display_env_verbose) {
6752    __kmp_env_print_2();
6753  }
6754#endif // OMP_40_ENABLED
6755
6756#if OMPT_SUPPORT
6757  ompt_post_init();
6758#endif
6759
6760  KMP_MB();
6761
6762  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6763}
6764
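// Added note (descriptive, not from the original source): the entry points
// below -- __kmp_serial_initialize, __kmp_middle_initialize and
// __kmp_parallel_initialize -- all use the same double-checked pattern: an
// unsynchronized fast-path test of the init flag, then a re-test under
// __kmp_initz_lock before doing the real work, so concurrent first calls
// initialize each stage exactly once.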
6765void __kmp_serial_initialize(void) {
6766  if (__kmp_init_serial) {
6767    return;
6768  }
6769  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6770  if (__kmp_init_serial) {
6771    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6772    return;
6773  }
6774  __kmp_do_serial_initialize();
6775  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6776}
6777
6778static void __kmp_do_middle_initialize(void) {
6779  int i, j;
6780  int prev_dflt_team_nth;
6781
6782  if (!__kmp_init_serial) {
6783    __kmp_do_serial_initialize();
6784  }
6785
6786  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6787
6788  // Save the previous value for the __kmp_dflt_team_nth so that
6789  // we can avoid some reinitialization if it hasn't changed.
6790  prev_dflt_team_nth = __kmp_dflt_team_nth;
6791
6792#if KMP_AFFINITY_SUPPORTED
6793  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6794  // number of cores on the machine.
6795  __kmp_affinity_initialize();
6796
6797  // Run through the __kmp_threads array and set the affinity mask
6798  // for each root thread that is currently registered with the RTL.
6799  for (i = 0; i < __kmp_threads_capacity; i++) {
6800    if (TCR_PTR(__kmp_threads[i]) != NULL) {
6801      __kmp_affinity_set_init_mask(i, TRUE);
6802    }
6803  }
6804#endif /* KMP_AFFINITY_SUPPORTED */
6805
6806  KMP_ASSERT(__kmp_xproc > 0);
6807  if (__kmp_avail_proc == 0) {
6808    __kmp_avail_proc = __kmp_xproc;
6809  }
6810
6811  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6812  // correct them now
6813  j = 0;
6814  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6815    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6816        __kmp_avail_proc;
6817    j++;
6818  }
6819
6820  if (__kmp_dflt_team_nth == 0) {
6821#ifdef KMP_DFLT_NTH_CORES
6822    // Default #threads = #cores
6823    __kmp_dflt_team_nth = __kmp_ncores;
6824    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6825                  "__kmp_ncores (%d)\n",
6826                  __kmp_dflt_team_nth));
6827#else
6828    // Default #threads = #available OS procs
6829    __kmp_dflt_team_nth = __kmp_avail_proc;
6830    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6831                  "__kmp_avail_proc(%d)\n",
6832                  __kmp_dflt_team_nth));
6833#endif /* KMP_DFLT_NTH_CORES */
6834  }
6835
6836  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6837    __kmp_dflt_team_nth = KMP_MIN_NTH;
6838  }
6839  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6840    __kmp_dflt_team_nth = __kmp_sys_max_nth;
6841  }
6842
6843  // There's no harm in continuing if the following check fails,
6844  // but it indicates an error in the previous logic.
6845  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6846
6847  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6848    // Run through the __kmp_threads array and set the num threads icv for each
6849    // root thread that is currently registered with the RTL (which has not
6850    // already explicitly set its nthreads-var with a call to
6851    // omp_set_num_threads()).
6852    for (i = 0; i < __kmp_threads_capacity; i++) {
6853      kmp_info_t *thread = __kmp_threads[i];
6854      if (thread == NULL)
6855        continue;
6856      if (thread->th.th_current_task->td_icvs.nproc != 0)
6857        continue;
6858
6859      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6860    }
6861  }
6862  KA_TRACE(
6863      20,
6864      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6865       __kmp_dflt_team_nth));
6866
6867#ifdef KMP_ADJUST_BLOCKTIME
6868  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6869  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6870    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6871    if (__kmp_nth > __kmp_avail_proc) {
6872      __kmp_zero_bt = TRUE;
6873    }
6874  }
6875#endif /* KMP_ADJUST_BLOCKTIME */
6876
6877  /* we have finished middle initialization */
6878  TCW_SYNC_4(__kmp_init_middle, TRUE);
6879
6880  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6881}
6882
6883void __kmp_middle_initialize(void) {
6884  if (__kmp_init_middle) {
6885    return;
6886  }
6887  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6888  if (__kmp_init_middle) {
6889    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6890    return;
6891  }
6892  __kmp_do_middle_initialize();
6893  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6894}
6895
6896void __kmp_parallel_initialize(void) {
6897  int gtid = __kmp_entry_gtid(); // this might be a new root
6898
6899  /* synchronize parallel initialization (for sibling) */
6900  if (TCR_4(__kmp_init_parallel))
6901    return;
6902  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6903  if (TCR_4(__kmp_init_parallel)) {
6904    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6905    return;
6906  }
6907
6908  /* TODO reinitialization after we have already shut down */
6909  if (TCR_4(__kmp_global.g.g_done)) {
6910    KA_TRACE(
6911        10,
6912        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6913    __kmp_infinite_loop();
6914  }
6915
6916  /* jc: The lock __kmp_initz_lock is already held, so calling
6917     __kmp_serial_initialize would cause a deadlock.  So we call
6918     __kmp_do_serial_initialize directly. */
6919  if (!__kmp_init_middle) {
6920    __kmp_do_middle_initialize();
6921  }
6922
6923  /* begin initialization */
6924  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6925  KMP_ASSERT(KMP_UBER_GTID(gtid));
6926
6927#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6928  // Save the FP control regs.
6929  // Worker threads will set theirs to these values at thread startup.
6930  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6931  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6932  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6933#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6934
6935#if KMP_OS_UNIX
6936#if KMP_HANDLE_SIGNALS
6937  /*  must be after __kmp_serial_initialize  */
6938  __kmp_install_signals(TRUE);
6939#endif
6940#endif
6941
6942  __kmp_suspend_initialize();
6943
6944#if defined(USE_LOAD_BALANCE)
6945  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6946    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6947  }
6948#else
6949  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6950    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6951  }
6952#endif
6953
6954  if (__kmp_version) {
6955    __kmp_print_version_2();
6956  }
6957
6958  /* we have finished parallel initialization */
6959  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6960
6961  KMP_MB();
6962  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6963
6964  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6965}
6966
6967/* ------------------------------------------------------------------------ */
6968
6969void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6970                                   kmp_team_t *team) {
6971  kmp_disp_t *dispatch;
6972
6973  KMP_MB();
6974
6975  /* none of the threads have encountered any constructs, yet. */
6976  this_thr->th.th_local.this_construct = 0;
6977#if KMP_CACHE_MANAGE
6978  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6979#endif /* KMP_CACHE_MANAGE */
6980  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6981  KMP_DEBUG_ASSERT(dispatch);
6982  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6983  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6984  // this_thr->th.th_info.ds.ds_tid ] );
6985
6986  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6987#if OMP_45_ENABLED
6988  dispatch->th_doacross_buf_idx =
6989      0; /* reset the doacross dispatch buffer counter */
6990#endif
6991  if (__kmp_env_consistency_check)
6992    __kmp_push_parallel(gtid, team->t.t_ident);
6993
6994  KMP_MB(); /* Flush all pending memory write invalidates.  */
6995}
6996
6997void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6998                                  kmp_team_t *team) {
6999  if (__kmp_env_consistency_check)
7000    __kmp_pop_parallel(gtid, team->t.t_ident);
7001
7002  __kmp_finish_implicit_task(this_thr);
7003}
7004
7005int __kmp_invoke_task_func(int gtid) {
7006  int rc;
7007  int tid = __kmp_tid_from_gtid(gtid);
7008  kmp_info_t *this_thr = __kmp_threads[gtid];
7009  kmp_team_t *team = this_thr->th.th_team;
7010
7011  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7012#if USE_ITT_BUILD
7013  if (__itt_stack_caller_create_ptr) {
7014    __kmp_itt_stack_callee_enter(
7015        (__itt_caller)
7016            team->t.t_stack_id); // inform ittnotify about entering user's code
7017  }
7018#endif /* USE_ITT_BUILD */
7019#if INCLUDE_SSC_MARKS
7020  SSC_MARK_INVOKING();
7021#endif
7022
7023#if OMPT_SUPPORT
7024  void *dummy;
7025  void **exit_runtime_p;
7026  ompt_data_t *my_task_data;
7027  ompt_data_t *my_parallel_data;
7028  int ompt_team_size;
7029
7030  if (ompt_enabled.enabled) {
7031    exit_runtime_p = &(
7032        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7033  } else {
7034    exit_runtime_p = &dummy;
7035  }
7036
7037  my_task_data =
7038      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7039  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7040  if (ompt_enabled.ompt_callback_implicit_task) {
7041    ompt_team_size = team->t.t_nproc;
7042    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7043        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7044        __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7045    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7046  }
7047#endif
7048
7049  {
7050    KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
7051    KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
7052    rc =
7053        __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7054                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7055#if OMPT_SUPPORT
7056                               ,
7057                               exit_runtime_p
7058#endif
7059                               );
7060#if OMPT_SUPPORT
7061    *exit_runtime_p = NULL;
7062#endif
7063  }
7064
7065#if USE_ITT_BUILD
7066  if (__itt_stack_caller_create_ptr) {
7067    __kmp_itt_stack_callee_leave(
7068        (__itt_caller)
7069            team->t.t_stack_id); // inform ittnotify about leaving user's code
7070  }
7071#endif /* USE_ITT_BUILD */
7072  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7073
7074  return rc;
7075}
7076
7077#if OMP_40_ENABLED
7078void __kmp_teams_master(int gtid) {
7079  // This routine is called by all master threads in the teams construct
7080  kmp_info_t *thr = __kmp_threads[gtid];
7081  kmp_team_t *team = thr->th.th_team;
7082  ident_t *loc = team->t.t_ident;
7083  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7084  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7085  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7086  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7087                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7088// Launch the league of teams now, but do not let the workers execute
7089// (they hang on the fork barrier until the next parallel region)
7090#if INCLUDE_SSC_MARKS
7091  SSC_MARK_FORKING();
7092#endif
7093  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7094                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7095                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7096#if INCLUDE_SSC_MARKS
7097  SSC_MARK_JOINING();
7098#endif
7099
7100  // AC: last parameter "1" eliminates the join barrier, which won't work because
7101  // the worker threads are in a fork barrier waiting for more parallel regions
7102  __kmp_join_call(loc, gtid
7103#if OMPT_SUPPORT
7104                  ,
7105                  fork_context_intel
7106#endif
7107                  ,
7108                  1);
7109}
7110
7111int __kmp_invoke_teams_master(int gtid) {
7112  kmp_info_t *this_thr = __kmp_threads[gtid];
7113  kmp_team_t *team = this_thr->th.th_team;
7114#if KMP_DEBUG
7115  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7116    KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7117                     (void *)__kmp_teams_master);
7118#endif
7119  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7120  __kmp_teams_master(gtid);
7121  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7122  return 1;
7123}
7124#endif /* OMP_40_ENABLED */
7125
7126/* This sets the requested number of threads for the next parallel region
7127   encountered by this team. Since this should be enclosed in the forkjoin
7128   critical section, it should avoid race conditions with asymmetrical nested
7129   parallelism. */
7130
7131void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7132  kmp_info_t *thr = __kmp_threads[gtid];
7133
7134  if (num_threads > 0)
7135    thr->th.th_set_nproc = num_threads;
7136}
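// Usage note (added description; the caller named here is an assumption about
// typical compiled code): the num_threads clause is implemented by recording
// the request in th_set_nproc -- usually via the __kmpc_push_num_threads entry
// point -- immediately before the corresponding __kmp_fork_call(), which
// consumes and then clears the value.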
7137
7138#if OMP_40_ENABLED
7139
7140/* this sets the requested number of teams for the teams region and/or
7141   the number of threads for the next parallel region encountered  */
7142void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7143                          int num_threads) {
7144  kmp_info_t *thr = __kmp_threads[gtid];
7145  KMP_DEBUG_ASSERT(num_teams >= 0);
7146  KMP_DEBUG_ASSERT(num_threads >= 0);
7147
7148  if (num_teams == 0)
7149    num_teams = 1; // default number of teams is 1.
7150  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7151    if (!__kmp_reserve_warn) {
7152      __kmp_reserve_warn = 1;
7153      __kmp_msg(kmp_ms_warning,
7154                KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7155                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7156    }
7157    num_teams = __kmp_teams_max_nth;
7158  }
7159  // Set number of teams (number of threads in the outer "parallel" of the
7160  // teams)
7161  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7162
7163  // Remember the number of threads for inner parallel regions
7164  if (num_threads == 0) {
7165    if (!TCR_4(__kmp_init_middle))
7166      __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7167    num_threads = __kmp_avail_proc / num_teams;
7168    if (num_teams * num_threads > __kmp_teams_max_nth) {
7169      // adjust num_threads without a warning since it is not a user setting
7170      num_threads = __kmp_teams_max_nth / num_teams;
7171    }
7172  } else {
7173    if (num_teams * num_threads > __kmp_teams_max_nth) {
7174      int new_threads = __kmp_teams_max_nth / num_teams;
7175      if (!__kmp_reserve_warn) { // user asked for too many threads
7176        __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7177        __kmp_msg(kmp_ms_warning,
7178                  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7179                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7180      }
7181      num_threads = new_threads;
7182    }
7183  }
7184  thr->th.th_teams_size.nth = num_threads;
7185}
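/* Worked example of the clamping above (all numbers hypothetical): with
   __kmp_teams_max_nth == 256, a request of num_teams=64, num_threads=8 gives
   64 * 8 = 512 > 256, so num_threads is reduced to 256 / 64 = 4 and a warning
   is issued once.  With num_teams=64 and num_threads=0, if __kmp_avail_proc is
   128 the default becomes 128 / 64 = 2 threads per team, with no warning. */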
7186
7187// Set the proc_bind var to use in the following parallel region.
7188void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7189  kmp_info_t *thr = __kmp_threads[gtid];
7190  thr->th.th_set_proc_bind = proc_bind;
7191}
7192
7193#endif /* OMP_40_ENABLED */
7194
7195/* Launch the worker threads into the microtask. */
7196
7197void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7198  kmp_info_t *this_thr = __kmp_threads[gtid];
7199
7200#ifdef KMP_DEBUG
7201  int f;
7202#endif /* KMP_DEBUG */
7203
7204  KMP_DEBUG_ASSERT(team);
7205  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7206  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7207  KMP_MB(); /* Flush all pending memory write invalidates.  */
7208
7209  team->t.t_construct = 0; /* no single directives seen yet */
7210  /* thread 0 enters the ordered section first */
7211  team->t.t_ordered.dt.t_value = 0;
7212
7213  /* Reset the identifiers on the dispatch buffer */
7214  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7215  if (team->t.t_max_nproc > 1) {
7216    int i;
7217    for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7218      team->t.t_disp_buffer[i].buffer_index = i;
7219#if OMP_45_ENABLED
7220      team->t.t_disp_buffer[i].doacross_buf_idx = i;
7221#endif
7222    }
7223  } else {
7224    team->t.t_disp_buffer[0].buffer_index = 0;
7225#if OMP_45_ENABLED
7226    team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7227#endif
7228  }
7229
7230  KMP_MB(); /* Flush all pending memory write invalidates.  */
7231  KMP_ASSERT(this_thr->th.th_team == team);
7232
7233#ifdef KMP_DEBUG
7234  for (f = 0; f < team->t.t_nproc; f++) {
7235    KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7236                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7237  }
7238#endif /* KMP_DEBUG */
7239
7240  /* release the worker threads so they may begin working */
7241  __kmp_fork_barrier(gtid, 0);
7242}
7243
7244void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7245  kmp_info_t *this_thr = __kmp_threads[gtid];
7246
7247  KMP_DEBUG_ASSERT(team);
7248  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7249  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7250  KMP_MB(); /* Flush all pending memory write invalidates.  */
7251
7252/* Join barrier after fork */
7253
7254#ifdef KMP_DEBUG
7255  if (__kmp_threads[gtid] &&
7256      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7257    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7258                 __kmp_threads[gtid]);
7259    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7260                 "team->t.t_nproc=%d\n",
7261                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7262                 team->t.t_nproc);
7263    __kmp_print_structure();
7264  }
7265  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7266                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7267#endif /* KMP_DEBUG */
7268
7269  __kmp_join_barrier(gtid); /* wait for everyone */
7270#if OMPT_SUPPORT
7271  if (ompt_enabled.enabled &&
7272      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7273    int ds_tid = this_thr->th.th_info.ds.ds_tid;
7274    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7275    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7276#if OMPT_OPTIONAL
7277    void *codeptr = NULL;
7278    if (KMP_MASTER_TID(ds_tid) &&
7279        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7280         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7281      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7282
7283    if (ompt_enabled.ompt_callback_sync_region_wait) {
7284      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7285          ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7286    }
7287    if (ompt_enabled.ompt_callback_sync_region) {
7288      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7289          ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7290    }
7291#endif
7292    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7293      // TODO: Can this be ompt_task_initial?
7294      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7295          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit);
7295    }
7296  }
7297#endif
7298
7299  KMP_MB(); /* Flush all pending memory write invalidates.  */
7300  KMP_ASSERT(this_thr->th.th_team == team);
7301}
7302
7303/* ------------------------------------------------------------------------ */
7304
7305#ifdef USE_LOAD_BALANCE
7306
7307// Return the number of worker threads actively spinning in the hot team,
7308// if we are at the outermost level of parallelism.  Otherwise, return 0.
7309static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7310  int i;
7311  int retval;
7312  kmp_team_t *hot_team;
7313
7314  if (root->r.r_active) {
7315    return 0;
7316  }
7317  hot_team = root->r.r_hot_team;
7318  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7319    return hot_team->t.t_nproc - 1; // Don't count master thread
7320  }
7321
7322  // Skip the master thread - it is accounted for elsewhere.
7323  retval = 0;
7324  for (i = 1; i < hot_team->t.t_nproc; i++) {
7325    if (hot_team->t.t_threads[i]->th.th_active) {
7326      retval++;
7327    }
7328  }
7329  return retval;
7330}
7331
7332// Perform an automatic adjustment to the number of
7333// threads used by the next parallel region.
7334static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7335  int retval;
7336  int pool_active;
7337  int hot_team_active;
7338  int team_curr_active;
7339  int system_active;
7340
7341  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7342                set_nproc));
7343  KMP_DEBUG_ASSERT(root);
7344  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7345                       ->th.th_current_task->td_icvs.dynamic == TRUE);
7346  KMP_DEBUG_ASSERT(set_nproc > 1);
7347
7348  if (set_nproc == 1) {
7349    KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7350    return 1;
7351  }
7352
7353  // Threads that are active in the thread pool, active in the hot team for this
7354  // particular root (if we are at the outer par level), and the currently
7355  // executing thread (to become the master) are available to add to the new
7356  // team, but are currently contributing to the system load, and must be
7357  // accounted for.
7358  pool_active = __kmp_thread_pool_active_nth;
7359  hot_team_active = __kmp_active_hot_team_nproc(root);
7360  team_curr_active = pool_active + hot_team_active + 1;
7361
7362  // Check the system load.
7363  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7364  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7365                "hot team active = %d\n",
7366                system_active, pool_active, hot_team_active));
7367
7368  if (system_active < 0) {
7369    // There was an error reading the necessary info from /proc, so use the
7370    // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7371    // = dynamic_thread_limit, we shouldn't wind up getting back here.
7372    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7373    KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7374
7375    // Make this call behave like the thread limit algorithm.
7376    retval = __kmp_avail_proc - __kmp_nth +
7377             (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7378    if (retval > set_nproc) {
7379      retval = set_nproc;
7380    }
7381    if (retval < KMP_MIN_NTH) {
7382      retval = KMP_MIN_NTH;
7383    }
7384
7385    KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7386                  retval));
7387    return retval;
7388  }
7389
7390  // There is a slight delay in the load balance algorithm in detecting new
7391  // running procs. The real system load at this instant should be at least as
7392  // large as the number of active OMP threads available to add to the team.
7393  if (system_active < team_curr_active) {
7394    system_active = team_curr_active;
7395  }
7396  retval = __kmp_avail_proc - system_active + team_curr_active;
7397  if (retval > set_nproc) {
7398    retval = set_nproc;
7399  }
7400  if (retval < KMP_MIN_NTH) {
7401    retval = KMP_MIN_NTH;
7402  }
7403
7404  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7405  return retval;
7406} // __kmp_load_balance_nproc()
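/* Worked example of the formula above (all numbers hypothetical): with
   __kmp_avail_proc = 8, pool_active = 2 and hot_team_active = 3, the threads
   already contributing to the load are team_curr_active = 2 + 3 + 1 = 6.  If
   the measured system_active is 10, then retval = 8 - 10 + 6 = 4, i.e. four
   threads can be used without oversubscribing, and the result is further
   clamped to [KMP_MIN_NTH, set_nproc]. */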
7407
7408#endif /* USE_LOAD_BALANCE */
7409
7410/* ------------------------------------------------------------------------ */
7411
7412/* NOTE: this is called with the __kmp_init_lock held */
7413void __kmp_cleanup(void) {
7414  int f;
7415
7416  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7417
7418  if (TCR_4(__kmp_init_parallel)) {
7419#if KMP_HANDLE_SIGNALS
7420    __kmp_remove_signals();
7421#endif
7422    TCW_4(__kmp_init_parallel, FALSE);
7423  }
7424
7425  if (TCR_4(__kmp_init_middle)) {
7426#if KMP_AFFINITY_SUPPORTED
7427    __kmp_affinity_uninitialize();
7428#endif /* KMP_AFFINITY_SUPPORTED */
7429    __kmp_cleanup_hierarchy();
7430    TCW_4(__kmp_init_middle, FALSE);
7431  }
7432
7433  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7434
7435  if (__kmp_init_serial) {
7436    __kmp_runtime_destroy();
7437    __kmp_init_serial = FALSE;
7438  }
7439
7440  __kmp_cleanup_threadprivate_caches();
7441
7442  for (f = 0; f < __kmp_threads_capacity; f++) {
7443    if (__kmp_root[f] != NULL) {
7444      __kmp_free(__kmp_root[f]);
7445      __kmp_root[f] = NULL;
7446    }
7447  }
7448  __kmp_free(__kmp_threads);
7449  // __kmp_threads and __kmp_root were allocated together as a single block,
7450  // so there is no need to free __kmp_root separately.
7451  __kmp_threads = NULL;
7452  __kmp_root = NULL;
7453  __kmp_threads_capacity = 0;
7454
7455#if KMP_USE_DYNAMIC_LOCK
7456  __kmp_cleanup_indirect_user_locks();
7457#else
7458  __kmp_cleanup_user_locks();
7459#endif
7460
7461#if KMP_AFFINITY_SUPPORTED
7462  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7463  __kmp_cpuinfo_file = NULL;
7464#endif /* KMP_AFFINITY_SUPPORTED */
7465
7466#if KMP_USE_ADAPTIVE_LOCKS
7467#if KMP_DEBUG_ADAPTIVE_LOCKS
7468  __kmp_print_speculative_stats();
7469#endif
7470#endif
7471  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7472  __kmp_nested_nth.nth = NULL;
7473  __kmp_nested_nth.size = 0;
7474  __kmp_nested_nth.used = 0;
7475  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7476  __kmp_nested_proc_bind.bind_types = NULL;
7477  __kmp_nested_proc_bind.size = 0;
7478  __kmp_nested_proc_bind.used = 0;
7479#if OMP_50_ENABLED
7480  if (__kmp_affinity_format) {
7481    KMP_INTERNAL_FREE(__kmp_affinity_format);
7482    __kmp_affinity_format = NULL;
7483  }
7484#endif
7485
7486  __kmp_i18n_catclose();
7487
7488#if KMP_USE_HIER_SCHED
7489  __kmp_hier_scheds.deallocate();
7490#endif
7491
7492#if KMP_STATS_ENABLED
7493  __kmp_stats_fini();
7494#endif
7495
7496  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7497}
7498
7499/* ------------------------------------------------------------------------ */
7500
7501int __kmp_ignore_mppbeg(void) {
7502  char *env;
7503
7504  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7505    if (__kmp_str_match_false(env))
7506      return FALSE;
7507  }
7508  // By default __kmpc_begin() is no-op.
7509  return TRUE;
7510}
7511
7512int __kmp_ignore_mppend(void) {
7513  char *env;
7514
7515  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7516    if (__kmp_str_match_false(env))
7517      return FALSE;
7518  }
7519  // By default __kmpc_end() is no-op.
7520  return TRUE;
7521}
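/* Example of the environment protocol above: by default both routines return
   TRUE, so __kmpc_begin()/__kmpc_end() are no-ops; exporting, e.g.,
   KMP_IGNORE_MPPEND=false (any value accepted by __kmp_str_match_false) makes
   __kmp_ignore_mppend() return FALSE, allowing __kmpc_end() to do real
   shutdown work. */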
7522
7523void __kmp_internal_begin(void) {
7524  int gtid;
7525  kmp_root_t *root;
7526
7527  /* this is a very important step as it registers new sibling threads and
7528     assigns each new uber thread a gtid */
7529  gtid = __kmp_entry_gtid();
7530  root = __kmp_threads[gtid]->th.th_root;
7531  KMP_ASSERT(KMP_UBER_GTID(gtid));
7532
7533  if (root->r.r_begin)
7534    return;
7535  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7536  if (root->r.r_begin) {
7537    __kmp_release_lock(&root->r.r_begin_lock, gtid);
7538    return;
7539  }
7540
7541  root->r.r_begin = TRUE;
7542
7543  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7544}
7545
7546/* ------------------------------------------------------------------------ */
7547
7548void __kmp_user_set_library(enum library_type arg) {
7549  int gtid;
7550  kmp_root_t *root;
7551  kmp_info_t *thread;
7552
7553  /* first, make sure we are initialized so we can get our gtid */
7554
7555  gtid = __kmp_entry_gtid();
7556  thread = __kmp_threads[gtid];
7557
7558  root = thread->th.th_root;
7559
7560  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7561                library_serial));
7562  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7563                                  thread */
7564    KMP_WARNING(SetLibraryIncorrectCall);
7565    return;
7566  }
7567
7568  switch (arg) {
7569  case library_serial:
7570    thread->th.th_set_nproc = 0;
7571    set__nproc(thread, 1);
7572    break;
7573  case library_turnaround:
7574    thread->th.th_set_nproc = 0;
7575    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7576                                           : __kmp_dflt_team_nth_ub);
7577    break;
7578  case library_throughput:
7579    thread->th.th_set_nproc = 0;
7580    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7581                                           : __kmp_dflt_team_nth_ub);
7582    break;
7583  default:
7584    KMP_FATAL(UnknownLibraryType, arg);
7585  }
7586
7587  __kmp_aux_set_library(arg);
7588}
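/* Minimal usage sketch (assuming the Intel-compatible extensions declared in
   omp.h, which are expected to forward to this routine):

       #include <omp.h>
       int main() {
         kmp_set_library_throughput(); // same effect as KMP_LIBRARY=throughput
         #pragma omp parallel
         { }
         return 0;
       }
*/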
7589
7590void __kmp_aux_set_stacksize(size_t arg) {
7591  if (!__kmp_init_serial)
7592    __kmp_serial_initialize();
7593
7594#if KMP_OS_DARWIN
7595  if (arg & (0x1000 - 1)) {
7596    arg &= ~(0x1000 - 1);
7597    if (arg + 0x1000 > arg) /* only round up if it does not overflow */
7598      arg += 0x1000;
7599  }
7600#endif
7601  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7602
7603  /* only change the default stacksize before the first parallel region */
7604  if (!TCR_4(__kmp_init_parallel)) {
7605    size_t value = arg; /* argument is in bytes */
7606
7607    if (value < __kmp_sys_min_stksize)
7608      value = __kmp_sys_min_stksize;
7609    else if (value > KMP_MAX_STKSIZE)
7610      value = KMP_MAX_STKSIZE;
7611
7612    __kmp_stksize = value;
7613
7614    __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7615  }
7616
7617  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7618}
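/* Usage sketch (assuming the Intel-compatible kmp_set_stacksize_s extension,
   which is expected to reach this routine; the value shown is arbitrary):

       #include <omp.h>
       int main() {
         kmp_set_stacksize_s(4 * 1024 * 1024); // request 4 MiB worker stacks
         #pragma omp parallel
         { }
         return 0;
       }

   The request only takes effect before the first parallel region and is
   clamped to [__kmp_sys_min_stksize, KMP_MAX_STKSIZE]. */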
7619
7620/* set the behaviour of the runtime library */
7621/* TODO this can cause some odd behaviour with sibling parallelism... */
7622void __kmp_aux_set_library(enum library_type arg) {
7623  __kmp_library = arg;
7624
7625  switch (__kmp_library) {
7626  case library_serial: {
7627    KMP_INFORM(LibraryIsSerial);
7628    (void)__kmp_change_library(TRUE);
7629  } break;
7630  case library_turnaround:
7631    (void)__kmp_change_library(TRUE);
7632    break;
7633  case library_throughput:
7634    (void)__kmp_change_library(FALSE);
7635    break;
7636  default:
7637    KMP_FATAL(UnknownLibraryType, arg);
7638  }
7639}
7640
7641/* Get team information common to all teams-construct API routines */
7642// Returns NULL if not inside a teams construct
7643static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7644  kmp_info_t *thr = __kmp_entry_thread();
7645  teams_serialized = 0;
7646  if (thr->th.th_teams_microtask) {
7647    kmp_team_t *team = thr->th.th_team;
7648    int tlevel = thr->th.th_teams_level; // the level of the teams construct
7649    int ii = team->t.t_level;
7650    teams_serialized = team->t.t_serialized;
7651    int level = tlevel + 1;
7652    KMP_DEBUG_ASSERT(ii >= tlevel);
7653    while (ii > level) {
7654      for (teams_serialized = team->t.t_serialized;
7655           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7656      }
7657      if (team->t.t_serialized && (!teams_serialized)) {
7658        team = team->t.t_parent;
7659        continue;
7660      }
7661      if (ii > level) {
7662        team = team->t.t_parent;
7663        ii--;
7664      }
7665    }
7666    return team;
7667  }
7668  return NULL;
7669}
7670
7671int __kmp_aux_get_team_num() {
7672  int serialized;
7673  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7674  if (team) {
7675    if (serialized > 1) {
7676      return 0; // teams region is serialized ( 1 team of 1 thread ).
7677    } else {
7678      return team->t.t_master_tid;
7679    }
7680  }
7681  return 0;
7682}
7683
7684int __kmp_aux_get_num_teams() {
7685  int serialized;
7686  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7687  if (team) {
7688    if (serialized > 1) {
7689      return 1;
7690    } else {
7691      return team->t.t_parent->t.t_nproc;
7692    }
7693  }
7694  return 1;
7695}
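/* Illustrative user-level view (a sketch; omp_get_team_num() and
   omp_get_num_teams() are the standard routines that report these values):

       #include <omp.h>
       #include <stdio.h>
       int main() {
       #pragma omp teams num_teams(4)
         printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
         return 0;
       }

   Inside the construct omp_get_team_num() is in [0, omp_get_num_teams()-1];
   outside of any teams construct the helpers above return 0 and 1. */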
7696
7697/* ------------------------------------------------------------------------ */
7698
7699#if OMP_50_ENABLED
7700/*
7701 * Affinity Format Parser
7702 *
7703 * Field is in form of: %[[[0].]size]type
7704 * % and type are required (%% means print a literal '%')
7705 * type is either single char or long name surrounded by {},
7706 * e.g., N or {num_threads}
7707 * 0 => leading zeros
7708 * . => right justified when size is specified
7709 * by default output is left justified
7710 * size is the *minimum* field length
7711 * All other characters are printed as is
7712 *
7713 * Available field types:
7714 * t {team_num}          - omp_get_team_num()
7715 * T {num_teams}         - omp_get_num_teams()
7716 * L {nesting_level}     - omp_get_level()
7717 * n {thread_num}        - omp_get_thread_num()
7718 * N {num_threads}       - omp_get_num_threads()
7719 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
7720 * H {host}              - name of host machine
7721 * P {process_id}        - process id (integer)
7722 * i {native_thread_id}  - native thread identifier (integer)
7723 * A {thread_affinity}   - comma separated list of integers or integer ranges
7724 *                         (values of affinity mask)
7723 *
7724 * Implementation-specific field types can be added
7725 * If a type is unknown, print "undefined"
7726*/
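/* Example (the expanded values after "=>" are made up for illustration):

       "OMP: host=%H pid=%P thread %0.4n of %N, affinity {%A}"
   =>  "OMP: host=node017 pid=12345 thread 0002 of 8, affinity {0,4}"

   Here %0.4n prints the thread number zero-padded and right-justified in a
   4-character field, and %A prints the affinity mask as a list of CPU ids. */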
7727
7728// Structure holding the short name, long name, and corresponding data type
7729// for snprintf.  A table of these will represent the entire valid keyword
7730// field types.
7731typedef struct kmp_affinity_format_field_t {
7732  char short_name; // from spec e.g., L -> thread level
7733  const char *long_name; // from spec thread_level -> thread level
7734  char field_format; // data type for snprintf (typically 'd' or 's'
7735  // for integer or string)
7736} kmp_affinity_format_field_t;
7737
7738static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7739#if KMP_AFFINITY_SUPPORTED
7740    {'A', "thread_affinity", 's'},
7741#endif
7742    {'t', "team_num", 'd'},
7743    {'T', "num_teams", 'd'},
7744    {'L', "nesting_level", 'd'},
7745    {'n', "thread_num", 'd'},
7746    {'N', "num_threads", 'd'},
7747    {'a', "ancestor_tnum", 'd'},
7748    {'H', "host", 's'},
7749    {'P', "process_id", 'd'},
7750    {'i', "native_thread_id", 'd'}};
7751
7752// Return the number of characters it takes to hold the field
7753static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7754                                            const char **ptr,
7755                                            kmp_str_buf_t *field_buffer) {
7756  int rc, format_index, field_value;
7757  const char *width_left, *width_right;
7758  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7759  static const int FORMAT_SIZE = 20;
7760  char format[FORMAT_SIZE] = {0};
7761  char absolute_short_name = 0;
7762
7763  KMP_DEBUG_ASSERT(gtid >= 0);
7764  KMP_DEBUG_ASSERT(th);
7765  KMP_DEBUG_ASSERT(**ptr == '%');
7766  KMP_DEBUG_ASSERT(field_buffer);
7767
7768  __kmp_str_buf_clear(field_buffer);
7769
7770  // Skip the initial %
7771  (*ptr)++;
7772
7773  // Check for %% first
7774  if (**ptr == '%') {
7775    __kmp_str_buf_cat(field_buffer, "%", 1);
7776    (*ptr)++; // skip over the second %
7777    return 1;
7778  }
7779
7780  // Parse field modifiers if they are present
7781  pad_zeros = false;
7782  if (**ptr == '0') {
7783    pad_zeros = true;
7784    (*ptr)++; // skip over 0
7785  }
7786  right_justify = false;
7787  if (**ptr == '.') {
7788    right_justify = true;
7789    (*ptr)++; // skip over .
7790  }
7791  // Parse width of field: [width_left, width_right)
7792  width_left = width_right = NULL;
7793  if (**ptr >= '0' && **ptr <= '9') {
7794    width_left = *ptr;
7795    SKIP_DIGITS(*ptr);
7796    width_right = *ptr;
7797  }
7798
7799  // Create the format for KMP_SNPRINTF based on flags parsed above
7800  format_index = 0;
7801  format[format_index++] = '%';
7802  if (!right_justify)
7803    format[format_index++] = '-';
7804  if (pad_zeros)
7805    format[format_index++] = '0';
7806  if (width_left && width_right) {
7807    int i = 0;
7808    // Only allow 8-digit field widths.
7809    // This also prevents overflowing the format variable.
7810    while (i < 8 && width_left < width_right) {
7811      format[format_index++] = *width_left;
7812      width_left++;
7813      i++;
7814    }
7815  }
7816
7817  // Parse a name (long or short)
7818  // Canonicalize the name into absolute_short_name
7819  found_valid_name = false;
7820  parse_long_name = (**ptr == '{');
7821  if (parse_long_name)
7822    (*ptr)++; // skip initial left brace
7823  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7824                             sizeof(__kmp_affinity_format_table[0]);
7825       ++i) {
7826    char short_name = __kmp_affinity_format_table[i].short_name;
7827    const char *long_name = __kmp_affinity_format_table[i].long_name;
7828    char field_format = __kmp_affinity_format_table[i].field_format;
7829    if (parse_long_name) {
7830      int length = KMP_STRLEN(long_name);
7831      if (strncmp(*ptr, long_name, length) == 0) {
7832        found_valid_name = true;
7833        (*ptr) += length; // skip the long name
7834      }
7835    } else if (**ptr == short_name) {
7836      found_valid_name = true;
7837      (*ptr)++; // skip the short name
7838    }
7839    if (found_valid_name) {
7840      format[format_index++] = field_format;
7841      format[format_index++] = '\0';
7842      absolute_short_name = short_name;
7843      break;
7844    }
7845  }
7846  if (parse_long_name) {
7847    if (**ptr != '}') {
7848      absolute_short_name = 0;
7849    } else {
7850      (*ptr)++; // skip over the right brace
7851    }
7852  }
7853
7854  // Attempt to fill the buffer with the requested
7855  // value using snprintf within __kmp_str_buf_print()
7856  switch (absolute_short_name) {
7857  case 't':
7858    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7859    break;
7860  case 'T':
7861    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7862    break;
7863  case 'L':
7864    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7865    break;
7866  case 'n':
7867    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7868    break;
7869  case 'H': {
7870    static const int BUFFER_SIZE = 256;
7871    char buf[BUFFER_SIZE];
7872    __kmp_expand_host_name(buf, BUFFER_SIZE);
7873    rc = __kmp_str_buf_print(field_buffer, format, buf);
7874  } break;
7875  case 'P':
7876    rc = __kmp_str_buf_print(field_buffer, format, getpid());
7877    break;
7878  case 'i':
7879    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7880    break;
7881  case 'N':
7882    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7883    break;
7884  case 'a':
7885    field_value =
7886        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7887    rc = __kmp_str_buf_print(field_buffer, format, field_value);
7888    break;
7889#if KMP_AFFINITY_SUPPORTED
7890  case 'A': {
7891    kmp_str_buf_t buf;
7892    __kmp_str_buf_init(&buf);
7893    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7894    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7895    __kmp_str_buf_free(&buf);
7896  } break;
7897#endif
7898  default:
7899    // According to the spec, if an implementation does not have info for the
7900    // field type, then "undefined" is printed
7901    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7902    // Skip the field
7903    if (parse_long_name) {
7904      SKIP_TOKEN(*ptr);
7905      if (**ptr == '}')
7906        (*ptr)++;
7907    } else {
7908      (*ptr)++;
7909    }
7910  }
7911
7912  KMP_ASSERT(format_index <= FORMAT_SIZE);
7913  return rc;
7914}
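/* Worked example of the format construction above: for the field "%0.5n" the
   parser sets pad_zeros and right_justify, records the width "5", matches the
   short name 'n' (field_format 'd'), and builds the snprintf format "%05d",
   so thread number 3 prints as "00003".  For "%5n" (no '0', no '.') the format
   becomes "%-5d" and the value is left-justified: "3    ". */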
7915
7916/*
7917 * Return the number of characters needed to hold the affinity string
7918 * (not including the terminating null byte).
7919 * The resulting string is printed to buffer, which the caller can then
7920 * handle afterwards.
7921 */
7922size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7923                                  kmp_str_buf_t *buffer) {
7924  const char *parse_ptr;
7925  size_t retval;
7926  const kmp_info_t *th;
7927  kmp_str_buf_t field;
7928
7929  KMP_DEBUG_ASSERT(buffer);
7930  KMP_DEBUG_ASSERT(gtid >= 0);
7931
7932  __kmp_str_buf_init(&field);
7933  __kmp_str_buf_clear(buffer);
7934
7935  th = __kmp_threads[gtid];
7936  retval = 0;
7937
7938  // If format is NULL or zero-length string, then we use
7939  // affinity-format-var ICV
7940  parse_ptr = format;
7941  if (parse_ptr == NULL || *parse_ptr == '\0') {
7942    parse_ptr = __kmp_affinity_format;
7943  }
7944  KMP_DEBUG_ASSERT(parse_ptr);
7945
7946  while (*parse_ptr != '\0') {
7947    // Parse a field
7948    if (*parse_ptr == '%') {
7949      // Put field in the buffer
7950      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7951      __kmp_str_buf_catbuf(buffer, &field);
7952      retval += rc;
7953    } else {
7954      // Put literal character in buffer
7955      __kmp_str_buf_cat(buffer, parse_ptr, 1);
7956      retval++;
7957      parse_ptr++;
7958    }
7959  }
7960  __kmp_str_buf_free(&field);
7961  return retval;
7962}
7963
7964// Displays the affinity string to stdout
7965void __kmp_aux_display_affinity(int gtid, const char *format) {
7966  kmp_str_buf_t buf;
7967  __kmp_str_buf_init(&buf);
7968  __kmp_aux_capture_affinity(gtid, format, &buf);
7969  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7970  __kmp_str_buf_free(&buf);
7971}
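/* These helpers back the OpenMP 5.0 affinity-display support; a minimal
   user-level sketch (assuming the standard omp_display_affinity prototype):

       #include <omp.h>
       int main() {
       #pragma omp parallel
         omp_display_affinity(NULL); // NULL/"" => use affinity-format-var ICV
         return 0;
       }
*/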
7972#endif // OMP_50_ENABLED
7973
7974/* ------------------------------------------------------------------------ */
7975
7976void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7977  int blocktime = arg; /* argument is in milliseconds */
7978#if KMP_USE_MONITOR
7979  int bt_intervals;
7980#endif
7981  int bt_set;
7982
7983  __kmp_save_internal_controls(thread);
7984
7985  /* Normalize and set blocktime for the teams */
7986  if (blocktime < KMP_MIN_BLOCKTIME)
7987    blocktime = KMP_MIN_BLOCKTIME;
7988  else if (blocktime > KMP_MAX_BLOCKTIME)
7989    blocktime = KMP_MAX_BLOCKTIME;
7990
7991  set__blocktime_team(thread->th.th_team, tid, blocktime);
7992  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7993
7994#if KMP_USE_MONITOR
7995  /* Calculate and set blocktime intervals for the teams */
7996  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7997
7998  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7999  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8000#endif
8001
8002  /* Record that the blocktime has been explicitly set */
8003  bt_set = TRUE;
8004
8005  set__bt_set_team(thread->th.th_team, tid, bt_set);
8006  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8007#if KMP_USE_MONITOR
8008  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8009                "bt_intervals=%d, monitor_updates=%d\n",
8010                __kmp_gtid_from_tid(tid, thread->th.th_team),
8011                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8012                __kmp_monitor_wakeups));
8013#else
8014  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8015                __kmp_gtid_from_tid(tid, thread->th.th_team),
8016                thread->th.th_team->t.t_id, tid, blocktime));
8017#endif
8018}
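/* Usage sketch (assuming the Intel-compatible kmp_set_blocktime extension,
   which is expected to land here; the value is arbitrary): calling

       kmp_set_blocktime(0);

   from user code, or setting KMP_BLOCKTIME=0 in the environment, makes idle
   workers stop spin-waiting immediately; the argument is in milliseconds and
   is clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]. */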
8019
8020void __kmp_aux_set_defaults(char const *str, int len) {
8021  if (!__kmp_init_serial) {
8022    __kmp_serial_initialize();
8023  }
8024  __kmp_env_initialize(str);
8025
8026  if (__kmp_settings
8027#if OMP_40_ENABLED
8028      || __kmp_display_env || __kmp_display_env_verbose
8029#endif // OMP_40_ENABLED
8030      ) {
8031    __kmp_env_print();
8032  }
8033} // __kmp_aux_set_defaults
8034
8035/* ------------------------------------------------------------------------ */
8036/* internal fast reduction routines */
8037
8038PACKED_REDUCTION_METHOD_T
8039__kmp_determine_reduction_method(
8040    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8041    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8042    kmp_critical_name *lck) {
8043
8044  // Default reduction method: critical construct ( lck != NULL, like in current
8045  // PAROPT )
8046  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8047  // can be selected by RTL
8048  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8049  // can be selected by RTL
8050  // Finally, it is up to the OpenMP RTL to decide which method to select
8051  // among those generated by PAROPT.
8052
8053  PACKED_REDUCTION_METHOD_T retval;
8054
8055  int team_size;
8056
8057  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8058  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8059
8060#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8061  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8062#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8063
8064  retval = critical_reduce_block;
8065
8066  // another way of getting the team size (with 1 dynamic dereference) is slower
8067  team_size = __kmp_get_team_num_threads(global_tid);
8068  if (team_size == 1) {
8069
8070    retval = empty_reduce_block;
8071
8072  } else {
8073
8074    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8075
8076#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8077
8078#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8079    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8080
8081    int teamsize_cutoff = 4;
8082
8083#if KMP_MIC_SUPPORTED
8084    if (__kmp_mic_type != non_mic) {
8085      teamsize_cutoff = 8;
8086    }
8087#endif
8088    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8089    if (tree_available) {
8090      if (team_size <= teamsize_cutoff) {
8091        if (atomic_available) {
8092          retval = atomic_reduce_block;
8093        }
8094      } else {
8095        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8096      }
8097    } else if (atomic_available) {
8098      retval = atomic_reduce_block;
8099    }
8100#else
8101#error "Unknown or unsupported OS"
8102#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8103       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8104
8105#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8106
8107#if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD
8108
8109    // basic tuning
8110
8111    if (atomic_available) {
8112      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8113        retval = atomic_reduce_block;
8114      }
8115    } // otherwise: use critical section
8116
8117#elif KMP_OS_DARWIN
8118
8119    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8120    if (atomic_available && (num_vars <= 3)) {
8121      retval = atomic_reduce_block;
8122    } else if (tree_available) {
8123      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8124          (reduce_size < (2000 * sizeof(kmp_real64)))) {
8125        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8126      }
8127    } // otherwise: use critical section
8128
8129#else
8130#error "Unknown or unsupported OS"
8131#endif
8132
8133#else
8134#error "Unknown or unsupported architecture"
8135#endif
8136  }
8137
8138  // KMP_FORCE_REDUCTION
8139
8140  // If the team is serialized (team_size == 1), ignore the forced reduction
8141  // method and stay with the unsynchronized method (empty_reduce_block)
8142  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8143      team_size != 1) {
8144
8145    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8146
8147    int atomic_available, tree_available;
8148
8149    switch ((forced_retval = __kmp_force_reduction_method)) {
8150    case critical_reduce_block:
8151      KMP_ASSERT(lck); // lck should be != 0
8152      break;
8153
8154    case atomic_reduce_block:
8155      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8156      if (!atomic_available) {
8157        KMP_WARNING(RedMethodNotSupported, "atomic");
8158        forced_retval = critical_reduce_block;
8159      }
8160      break;
8161
8162    case tree_reduce_block:
8163      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8164      if (!tree_available) {
8165        KMP_WARNING(RedMethodNotSupported, "tree");
8166        forced_retval = critical_reduce_block;
8167      } else {
8168#if KMP_FAST_REDUCTION_BARRIER
8169        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8170#endif
8171      }
8172      break;
8173
8174    default:
8175      KMP_ASSERT(0); // "unsupported method specified"
8176    }
8177
8178    retval = forced_retval;
8179  }
8180
8181  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8182
8183#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8184#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8185
8186  return (retval);
8187}
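/* Worked example of the selection above (hypothetical inputs): on an x86_64
   Linux host that is not a MIC (teamsize_cutoff == 4), with both the atomic
   and tree methods generated by the compiler, a team of 4 threads selects
   atomic_reduce_block, a team of 16 selects
   TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER, and a serialized team of one
   thread always gets empty_reduce_block, regardless of KMP_FORCE_REDUCTION. */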
8188
8189// this function is for testing set/get/determine reduce method
8190kmp_int32 __kmp_get_reduce_method(void) {
8191  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8192}
8193