1/*
2 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#define __KMP_IMP
14#include "omp.h" /* extern "C" declarations of user-visible routines */
15#include "kmp.h"
16#include "kmp_error.h"
17#include "kmp_i18n.h"
18#include "kmp_itt.h"
19#include "kmp_lock.h"
20#include "kmp_stats.h"
21#include "ompt-specific.h"
22
23#define MAX_MESSAGE 512
24
25// flags will be used in future, e.g. to implement openmp_strict library
26// restrictions
27
28/*!
29 * @ingroup STARTUP_SHUTDOWN
30 * @param loc   in   source location information
31 * @param flags in   for future use (currently ignored)
32 *
33 * Initialize the runtime library. This call is optional; if it is not made then
34 * it will be implicitly called by attempts to use other library functions.
35 */
36void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37  // By default __kmpc_begin() is no-op.
38  char *env;
39  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40      __kmp_str_match_true(env)) {
41    __kmp_middle_initialize();
42    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43  } else if (__kmp_ignore_mppbeg() == FALSE) {
44    // By default __kmp_ignore_mppbeg() returns TRUE.
45    __kmp_internal_begin();
46    KC_TRACE(10, ("__kmpc_begin: called\n"));
47  }
48}
49
50/*!
51 * @ingroup STARTUP_SHUTDOWN
52 * @param loc source location information
53 *
54 * Shutdown the runtime library. This is also optional, and even if called will
55 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56 * zero.
57 */
58void __kmpc_end(ident_t *loc) {
59  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62  // returns FALSE and __kmpc_end() will unregister this root (it can cause
63  // library shut down).
64  if (__kmp_ignore_mppend() == FALSE) {
65    KC_TRACE(10, ("__kmpc_end: called\n"));
66    KA_TRACE(30, ("__kmpc_end\n"));
67
68    __kmp_internal_end_thread(-1);
69  }
70#if KMP_OS_WINDOWS && OMPT_SUPPORT
71  // Normal exit process on Windows does not allow worker threads of the final
72  // parallel region to finish reporting their events, so shutting down the
73  // library here fixes the issue at least for the cases where __kmpc_end() is
74  // placed properly.
75  if (ompt_enabled.enabled)
76    __kmp_internal_end_library(__kmp_gtid_get_specific());
77#endif
78}
79
80/*!
81@ingroup THREAD_STATES
82@param loc Source location information.
83@return The global thread index of the active thread.
84
85This function can be called in any context.
86
87If the runtime has ony been entered at the outermost level from a
88single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89that which would be returned by omp_get_thread_num() in the outermost
90active parallel construct. (Or zero if there is no active parallel
91construct, since the master thread is necessarily thread zero).
92
93If multiple non-OpenMP threads all enter an OpenMP construct then this
94will be a unique thread identifier among all the threads created by
95the OpenMP runtime (but the value cannot be defined in terms of
96OpenMP thread ids returned by omp_get_thread_num()).
97*/
98kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99  kmp_int32 gtid = __kmp_entry_gtid();
100
101  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102
103  return gtid;
104}
105
106/*!
107@ingroup THREAD_STATES
108@param loc Source location information.
109@return The number of threads under control of the OpenMP<sup>*</sup> runtime
110
111This function can be called in any context.
112It returns the total number of threads under the control of the OpenMP runtime.
113That is not a number that can be determined by any OpenMP standard calls, since
114the library may be called from more than one non-OpenMP thread, and this
115reflects the total over all such calls. Similarly the runtime maintains
116underlying threads even when they are not active (since the cost of creating
117and destroying OS threads is high), this call counts all such threads even if
118they are not waiting for work.
119*/
120kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121  KC_TRACE(10,
122           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123
124  return TCR_4(__kmp_all_nth);
125}
126
127/*!
128@ingroup THREAD_STATES
129@param loc Source location information.
130@return The thread number of the calling thread in the innermost active parallel
131construct.
132*/
133kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135  return __kmp_tid_from_gtid(__kmp_entry_gtid());
136}
137
138/*!
139@ingroup THREAD_STATES
140@param loc Source location information.
141@return The number of threads in the innermost active parallel construct.
142*/
143kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145
146  return __kmp_entry_thread()->th.th_team->t.t_nproc;
147}
148
149/*!
150 * @ingroup DEPRECATED
151 * @param loc location description
152 *
153 * This function need not be called. It always returns TRUE.
154 */
155kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156#ifndef KMP_DEBUG
157
158  return TRUE;
159
160#else
161
162  const char *semi2;
163  const char *semi3;
164  int line_no;
165
166  if (__kmp_par_range == 0) {
167    return TRUE;
168  }
169  semi2 = loc->psource;
170  if (semi2 == NULL) {
171    return TRUE;
172  }
173  semi2 = strchr(semi2, ';');
174  if (semi2 == NULL) {
175    return TRUE;
176  }
177  semi2 = strchr(semi2 + 1, ';');
178  if (semi2 == NULL) {
179    return TRUE;
180  }
181  if (__kmp_par_range_filename[0]) {
182    const char *name = semi2 - 1;
183    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184      name--;
185    }
186    if ((*name == '/') || (*name == ';')) {
187      name++;
188    }
189    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190      return __kmp_par_range < 0;
191    }
192  }
193  semi3 = strchr(semi2 + 1, ';');
194  if (__kmp_par_range_routine[0]) {
195    if ((semi3 != NULL) && (semi3 > semi2) &&
196        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197      return __kmp_par_range < 0;
198    }
199  }
200  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202      return __kmp_par_range > 0;
203    }
204    return __kmp_par_range < 0;
205  }
206  return TRUE;
207
208#endif /* KMP_DEBUG */
209}
210
211/*!
212@ingroup THREAD_STATES
213@param loc Source location information.
214@return 1 if this thread is executing inside an active parallel region, zero if
215not.
216*/
217kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218  return __kmp_entry_thread()->th.th_root->r.r_active;
219}
220
221/*!
222@ingroup PARALLEL
223@param loc source location information
224@param global_tid global thread number
225@param num_threads number of threads requested for this parallel construct
226
227Set the number of threads to be used by the next fork spawned by this thread.
228This call is only required if the parallel construct has a `num_threads` clause.
229*/
230void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                             kmp_int32 num_threads) {
232  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                global_tid, num_threads));
234
235  __kmp_push_num_threads(loc, global_tid, num_threads);
236}
237
238void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240
241  /* the num_threads are automatically popped */
242}
243
244void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                           kmp_int32 proc_bind) {
246  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                proc_bind));
248
249  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250}
251
252/*!
253@ingroup PARALLEL
254@param loc  source location information
255@param argc  total number of arguments in the ellipsis
256@param microtask  pointer to callback routine consisting of outlined parallel
257construct
258@param ...  pointers to shared variables that aren't global
259
260Do the actual fork and call the microtask in the relevant number of threads.
261*/
262void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263  int gtid = __kmp_entry_gtid();
264
265#if (KMP_STATS_ENABLED)
266  // If we were in a serial region, then stop the serial timer, record
267  // the event, and start parallel region timer
268  stats_state_e previous_state = KMP_GET_THREAD_STATE();
269  if (previous_state == stats_state_e::SERIAL_REGION) {
270    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271  } else {
272    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273  }
274  int inParallel = __kmpc_in_parallel(loc);
275  if (inParallel) {
276    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277  } else {
278    KMP_COUNT_BLOCK(OMP_PARALLEL);
279  }
280#endif
281
282  // maybe to save thr_state is enough here
283  {
284    va_list ap;
285    va_start(ap, microtask);
286
287#if OMPT_SUPPORT
288    ompt_frame_t *ompt_frame;
289    if (ompt_enabled.enabled) {
290      kmp_info_t *master_th = __kmp_threads[gtid];
291      kmp_team_t *parent_team = master_th->th.th_team;
292      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293      if (lwt)
294        ompt_frame = &(lwt->ompt_task_info.frame);
295      else {
296        int tid = __kmp_tid_from_gtid(gtid);
297        ompt_frame = &(
298            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299      }
300      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301      OMPT_STORE_RETURN_ADDRESS(gtid);
302    }
303#endif
304
305#if INCLUDE_SSC_MARKS
306    SSC_MARK_FORKING();
307#endif
308    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311                    kmp_va_addr_of(ap));
312#if INCLUDE_SSC_MARKS
313    SSC_MARK_JOINING();
314#endif
315    __kmp_join_call(loc, gtid
316#if OMPT_SUPPORT
317                    ,
318                    fork_context_intel
319#endif
320                    );
321
322    va_end(ap);
323  }
324
325#if KMP_STATS_ENABLED
326  if (previous_state == stats_state_e::SERIAL_REGION) {
327    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
328  } else {
329    KMP_POP_PARTITIONED_TIMER();
330  }
331#endif // KMP_STATS_ENABLED
332}
333
334/*!
335@ingroup PARALLEL
336@param loc source location information
337@param global_tid global thread number
338@param num_teams number of teams requested for the teams construct
339@param num_threads number of threads per team requested for the teams construct
340
341Set the number of teams to be used by the teams construct.
342This call is only required if the teams construct has a `num_teams` clause
343or a `thread_limit` clause (or both).
344*/
345void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
346                           kmp_int32 num_teams, kmp_int32 num_threads) {
347  KA_TRACE(20,
348           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349            global_tid, num_teams, num_threads));
350
351  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
352}
353
354/*!
355@ingroup PARALLEL
356@param loc  source location information
357@param argc  total number of arguments in the ellipsis
358@param microtask  pointer to callback routine consisting of outlined teams
359construct
360@param ...  pointers to shared variables that aren't global
361
362Do the actual fork and call the microtask in the relevant number of threads.
363*/
364void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
365                       ...) {
366  int gtid = __kmp_entry_gtid();
367  kmp_info_t *this_thr = __kmp_threads[gtid];
368  va_list ap;
369  va_start(ap, microtask);
370
371#if KMP_STATS_ENABLED
372  KMP_COUNT_BLOCK(OMP_TEAMS);
373  stats_state_e previous_state = KMP_GET_THREAD_STATE();
374  if (previous_state == stats_state_e::SERIAL_REGION) {
375    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
376  } else {
377    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
378  }
379#endif
380
381  // remember teams entry point and nesting level
382  this_thr->th.th_teams_microtask = microtask;
383  this_thr->th.th_teams_level =
384      this_thr->th.th_team->t.t_level; // AC: can be >0 on host
385
386#if OMPT_SUPPORT
387  kmp_team_t *parent_team = this_thr->th.th_team;
388  int tid = __kmp_tid_from_gtid(gtid);
389  if (ompt_enabled.enabled) {
390    parent_team->t.t_implicit_task_taskdata[tid]
391        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
392  }
393  OMPT_STORE_RETURN_ADDRESS(gtid);
394#endif
395
396  // check if __kmpc_push_num_teams called, set default number of teams
397  // otherwise
398  if (this_thr->th.th_teams_size.nteams == 0) {
399    __kmp_push_num_teams(loc, gtid, 0, 0);
400  }
401  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
402  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
403  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
404
405  __kmp_fork_call(
406      loc, gtid, fork_context_intel, argc,
407      VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
408      VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
409  __kmp_join_call(loc, gtid
410#if OMPT_SUPPORT
411                  ,
412                  fork_context_intel
413#endif
414                  );
415
416  // Pop current CG root off list
417  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
418  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
419  this_thr->th.th_cg_roots = tmp->up;
420  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
421                 " to node %p. cg_nthreads was %d\n",
422                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
423  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
424  int i = tmp->cg_nthreads--;
425  if (i == 1) { // check is we are the last thread in CG (not always the case)
426    __kmp_free(tmp);
427  }
428  // Restore current task's thread_limit from CG root
429  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
430  this_thr->th.th_current_task->td_icvs.thread_limit =
431      this_thr->th.th_cg_roots->cg_thread_limit;
432
433  this_thr->th.th_teams_microtask = NULL;
434  this_thr->th.th_teams_level = 0;
435  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
436  va_end(ap);
437#if KMP_STATS_ENABLED
438  if (previous_state == stats_state_e::SERIAL_REGION) {
439    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
440  } else {
441    KMP_POP_PARTITIONED_TIMER();
442  }
443#endif // KMP_STATS_ENABLED
444}
445
446// I don't think this function should ever have been exported.
447// The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
448// openmp code ever called it, but it's been exported from the RTL for so
449// long that I'm afraid to remove the definition.
450int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
451
452/*!
453@ingroup PARALLEL
454@param loc  source location information
455@param global_tid  global thread number
456
457Enter a serialized parallel construct. This interface is used to handle a
458conditional parallel region, like this,
459@code
460#pragma omp parallel if (condition)
461@endcode
462when the condition is false.
463*/
464void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
465// The implementation is now in kmp_runtime.cpp so that it can share static
466// functions with kmp_fork_call since the tasks to be done are similar in
467// each case.
468#if OMPT_SUPPORT
469  OMPT_STORE_RETURN_ADDRESS(global_tid);
470#endif
471  __kmp_serialized_parallel(loc, global_tid);
472}
473
474/*!
475@ingroup PARALLEL
476@param loc  source location information
477@param global_tid  global thread number
478
479Leave a serialized parallel construct.
480*/
481void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
482  kmp_internal_control_t *top;
483  kmp_info_t *this_thr;
484  kmp_team_t *serial_team;
485
486  KC_TRACE(10,
487           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
488
489  /* skip all this code for autopar serialized loops since it results in
490     unacceptable overhead */
491  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
492    return;
493
494  // Not autopar code
495  if (!TCR_4(__kmp_init_parallel))
496    __kmp_parallel_initialize();
497
498  __kmp_resume_if_soft_paused();
499
500  this_thr = __kmp_threads[global_tid];
501  serial_team = this_thr->th.th_serial_team;
502
503  kmp_task_team_t *task_team = this_thr->th.th_task_team;
504  // we need to wait for the proxy tasks before finishing the thread
505  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
506    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
507
508  KMP_MB();
509  KMP_DEBUG_ASSERT(serial_team);
510  KMP_ASSERT(serial_team->t.t_serialized);
511  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
512  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
513  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
514  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
515
516#if OMPT_SUPPORT
517  if (ompt_enabled.enabled &&
518      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
519    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
520    if (ompt_enabled.ompt_callback_implicit_task) {
521      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
522          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
523          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
524    }
525
526    // reset clear the task id only after unlinking the task
527    ompt_data_t *parent_task_data;
528    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
529
530    if (ompt_enabled.ompt_callback_parallel_end) {
531      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
532          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
533          ompt_parallel_invoker_program | ompt_parallel_team,
534          OMPT_LOAD_RETURN_ADDRESS(global_tid));
535    }
536    __ompt_lw_taskteam_unlink(this_thr);
537    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
538  }
539#endif
540
541  /* If necessary, pop the internal control stack values and replace the team
542   * values */
543  top = serial_team->t.t_control_stack_top;
544  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
545    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
546    serial_team->t.t_control_stack_top = top->next;
547    __kmp_free(top);
548  }
549
550  // if( serial_team -> t.t_serialized > 1 )
551  serial_team->t.t_level--;
552
553  /* pop dispatch buffers stack */
554  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
555  {
556    dispatch_private_info_t *disp_buffer =
557        serial_team->t.t_dispatch->th_disp_buffer;
558    serial_team->t.t_dispatch->th_disp_buffer =
559        serial_team->t.t_dispatch->th_disp_buffer->next;
560    __kmp_free(disp_buffer);
561  }
562  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
563
564  --serial_team->t.t_serialized;
565  if (serial_team->t.t_serialized == 0) {
566
567/* return to the parallel section */
568
569#if KMP_ARCH_X86 || KMP_ARCH_X86_64
570    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
571      __kmp_clear_x87_fpu_status_word();
572      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
573      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
574    }
575#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
576
577    this_thr->th.th_team = serial_team->t.t_parent;
578    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
579
580    /* restore values cached in the thread */
581    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
582    this_thr->th.th_team_master =
583        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
584    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
585
586    /* TODO the below shouldn't need to be adjusted for serialized teams */
587    this_thr->th.th_dispatch =
588        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
589
590    __kmp_pop_current_task_from_thread(this_thr);
591
592    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
593    this_thr->th.th_current_task->td_flags.executing = 1;
594
595    if (__kmp_tasking_mode != tskm_immediate_exec) {
596      // Copy the task team from the new child / old parent team to the thread.
597      this_thr->th.th_task_team =
598          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
599      KA_TRACE(20,
600               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
601                "team %p\n",
602                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
603    }
604  } else {
605    if (__kmp_tasking_mode != tskm_immediate_exec) {
606      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
607                    "depth of serial team %p to %d\n",
608                    global_tid, serial_team, serial_team->t.t_serialized));
609    }
610  }
611
612  if (__kmp_env_consistency_check)
613    __kmp_pop_parallel(global_tid, NULL);
614#if OMPT_SUPPORT
615  if (ompt_enabled.enabled)
616    this_thr->th.ompt_thread_info.state =
617        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
618                                           : ompt_state_work_parallel);
619#endif
620}
621
622/*!
623@ingroup SYNCHRONIZATION
624@param loc  source location information.
625
626Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
627depending on the memory ordering convention obeyed by the compiler
628even that may not be necessary).
629*/
630void __kmpc_flush(ident_t *loc) {
631  KC_TRACE(10, ("__kmpc_flush: called\n"));
632
633  /* need explicit __mf() here since use volatile instead in library */
634  KMP_MB(); /* Flush all pending memory write invalidates.  */
635
636#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
637#if KMP_MIC
638// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
639// We shouldn't need it, though, since the ABI rules require that
640// * If the compiler generates NGO stores it also generates the fence
641// * If users hand-code NGO stores they should insert the fence
642// therefore no incomplete unordered stores should be visible.
643#else
644  // C74404
645  // This is to address non-temporal store instructions (sfence needed).
646  // The clflush instruction is addressed either (mfence needed).
647  // Probably the non-temporal load monvtdqa instruction should also be
648  // addressed.
649  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
650  if (!__kmp_cpuinfo.initialized) {
651    __kmp_query_cpuid(&__kmp_cpuinfo);
652  }
653  if (!__kmp_cpuinfo.sse2) {
654    // CPU cannot execute SSE2 instructions.
655  } else {
656#if KMP_COMPILER_ICC
657    _mm_mfence();
658#elif KMP_COMPILER_MSVC
659    MemoryBarrier();
660#else
661    __sync_synchronize();
662#endif // KMP_COMPILER_ICC
663  }
664#endif // KMP_MIC
665#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
666       KMP_ARCH_RISCV64)
667// Nothing to see here move along
668#elif KMP_ARCH_PPC64
669// Nothing needed here (we have a real MB above).
670#if KMP_OS_CNK
671  // The flushing thread needs to yield here; this prevents a
672  // busy-waiting thread from saturating the pipeline. flush is
673  // often used in loops like this:
674  // while (!flag) {
675  //   #pragma omp flush(flag)
676  // }
677  // and adding the yield here is good for at least a 10x speedup
678  // when running >2 threads per core (on the NAS LU benchmark).
679  __kmp_yield();
680#endif
681#else
682#error Unknown or unsupported architecture
683#endif
684
685#if OMPT_SUPPORT && OMPT_OPTIONAL
686  if (ompt_enabled.ompt_callback_flush) {
687    ompt_callbacks.ompt_callback(ompt_callback_flush)(
688        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
689  }
690#endif
691}
692
693/* -------------------------------------------------------------------------- */
694/*!
695@ingroup SYNCHRONIZATION
696@param loc source location information
697@param global_tid thread id.
698
699Execute a barrier.
700*/
701void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
702  KMP_COUNT_BLOCK(OMP_BARRIER);
703  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
704
705  if (!TCR_4(__kmp_init_parallel))
706    __kmp_parallel_initialize();
707
708  __kmp_resume_if_soft_paused();
709
710  if (__kmp_env_consistency_check) {
711    if (loc == 0) {
712      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
713    }
714    __kmp_check_barrier(global_tid, ct_barrier, loc);
715  }
716
717#if OMPT_SUPPORT
718  ompt_frame_t *ompt_frame;
719  if (ompt_enabled.enabled) {
720    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
721    if (ompt_frame->enter_frame.ptr == NULL)
722      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
723    OMPT_STORE_RETURN_ADDRESS(global_tid);
724  }
725#endif
726  __kmp_threads[global_tid]->th.th_ident = loc;
727  // TODO: explicit barrier_wait_id:
728  //   this function is called when 'barrier' directive is present or
729  //   implicit barrier at the end of a worksharing construct.
730  // 1) better to add a per-thread barrier counter to a thread data structure
731  // 2) set to 0 when a new team is created
732  // 4) no sync is required
733
734  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
735#if OMPT_SUPPORT && OMPT_OPTIONAL
736  if (ompt_enabled.enabled) {
737    ompt_frame->enter_frame = ompt_data_none;
738  }
739#endif
740}
741
742/* The BARRIER for a MASTER section is always explicit   */
743/*!
744@ingroup WORK_SHARING
745@param loc  source location information.
746@param global_tid  global thread number .
747@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
748*/
749kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
750  int status = 0;
751
752  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
753
754  if (!TCR_4(__kmp_init_parallel))
755    __kmp_parallel_initialize();
756
757  __kmp_resume_if_soft_paused();
758
759  if (KMP_MASTER_GTID(global_tid)) {
760    KMP_COUNT_BLOCK(OMP_MASTER);
761    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
762    status = 1;
763  }
764
765#if OMPT_SUPPORT && OMPT_OPTIONAL
766  if (status) {
767    if (ompt_enabled.ompt_callback_master) {
768      kmp_info_t *this_thr = __kmp_threads[global_tid];
769      kmp_team_t *team = this_thr->th.th_team;
770
771      int tid = __kmp_tid_from_gtid(global_tid);
772      ompt_callbacks.ompt_callback(ompt_callback_master)(
773          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
774          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
775          OMPT_GET_RETURN_ADDRESS(0));
776    }
777  }
778#endif
779
780  if (__kmp_env_consistency_check) {
781#if KMP_USE_DYNAMIC_LOCK
782    if (status)
783      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
784    else
785      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
786#else
787    if (status)
788      __kmp_push_sync(global_tid, ct_master, loc, NULL);
789    else
790      __kmp_check_sync(global_tid, ct_master, loc, NULL);
791#endif
792  }
793
794  return status;
795}
796
797/*!
798@ingroup WORK_SHARING
799@param loc  source location information.
800@param global_tid  global thread number .
801
802Mark the end of a <tt>master</tt> region. This should only be called by the
803thread that executes the <tt>master</tt> region.
804*/
805void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
806  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
807
808  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
809  KMP_POP_PARTITIONED_TIMER();
810
811#if OMPT_SUPPORT && OMPT_OPTIONAL
812  kmp_info_t *this_thr = __kmp_threads[global_tid];
813  kmp_team_t *team = this_thr->th.th_team;
814  if (ompt_enabled.ompt_callback_master) {
815    int tid = __kmp_tid_from_gtid(global_tid);
816    ompt_callbacks.ompt_callback(ompt_callback_master)(
817        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
818        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
819        OMPT_GET_RETURN_ADDRESS(0));
820  }
821#endif
822
823  if (__kmp_env_consistency_check) {
824    if (global_tid < 0)
825      KMP_WARNING(ThreadIdentInvalid);
826
827    if (KMP_MASTER_GTID(global_tid))
828      __kmp_pop_sync(global_tid, ct_master, loc);
829  }
830}
831
832/*!
833@ingroup WORK_SHARING
834@param loc  source location information.
835@param gtid  global thread number.
836
837Start execution of an <tt>ordered</tt> construct.
838*/
839void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
840  int cid = 0;
841  kmp_info_t *th;
842  KMP_DEBUG_ASSERT(__kmp_init_serial);
843
844  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
845
846  if (!TCR_4(__kmp_init_parallel))
847    __kmp_parallel_initialize();
848
849  __kmp_resume_if_soft_paused();
850
851#if USE_ITT_BUILD
852  __kmp_itt_ordered_prep(gtid);
853// TODO: ordered_wait_id
854#endif /* USE_ITT_BUILD */
855
856  th = __kmp_threads[gtid];
857
858#if OMPT_SUPPORT && OMPT_OPTIONAL
859  kmp_team_t *team;
860  ompt_wait_id_t lck;
861  void *codeptr_ra;
862  if (ompt_enabled.enabled) {
863    OMPT_STORE_RETURN_ADDRESS(gtid);
864    team = __kmp_team_from_gtid(gtid);
865    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
866    /* OMPT state update */
867    th->th.ompt_thread_info.wait_id = lck;
868    th->th.ompt_thread_info.state = ompt_state_wait_ordered;
869
870    /* OMPT event callback */
871    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
872    if (ompt_enabled.ompt_callback_mutex_acquire) {
873      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
874          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
875          codeptr_ra);
876    }
877  }
878#endif
879
880  if (th->th.th_dispatch->th_deo_fcn != 0)
881    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
882  else
883    __kmp_parallel_deo(&gtid, &cid, loc);
884
885#if OMPT_SUPPORT && OMPT_OPTIONAL
886  if (ompt_enabled.enabled) {
887    /* OMPT state update */
888    th->th.ompt_thread_info.state = ompt_state_work_parallel;
889    th->th.ompt_thread_info.wait_id = 0;
890
891    /* OMPT event callback */
892    if (ompt_enabled.ompt_callback_mutex_acquired) {
893      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
894          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
895    }
896  }
897#endif
898
899#if USE_ITT_BUILD
900  __kmp_itt_ordered_start(gtid);
901#endif /* USE_ITT_BUILD */
902}
903
904/*!
905@ingroup WORK_SHARING
906@param loc  source location information.
907@param gtid  global thread number.
908
909End execution of an <tt>ordered</tt> construct.
910*/
911void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
912  int cid = 0;
913  kmp_info_t *th;
914
915  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
916
917#if USE_ITT_BUILD
918  __kmp_itt_ordered_end(gtid);
919// TODO: ordered_wait_id
920#endif /* USE_ITT_BUILD */
921
922  th = __kmp_threads[gtid];
923
924  if (th->th.th_dispatch->th_dxo_fcn != 0)
925    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
926  else
927    __kmp_parallel_dxo(&gtid, &cid, loc);
928
929#if OMPT_SUPPORT && OMPT_OPTIONAL
930  OMPT_STORE_RETURN_ADDRESS(gtid);
931  if (ompt_enabled.ompt_callback_mutex_released) {
932    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
933        ompt_mutex_ordered,
934        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
935            ->t.t_ordered.dt.t_value,
936        OMPT_LOAD_RETURN_ADDRESS(gtid));
937  }
938#endif
939}
940
941#if KMP_USE_DYNAMIC_LOCK
942
943static __forceinline void
944__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
945                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
946  // Pointer to the allocated indirect lock is written to crit, while indexing
947  // is ignored.
948  void *idx;
949  kmp_indirect_lock_t **lck;
950  lck = (kmp_indirect_lock_t **)crit;
951  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
952  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
953  KMP_SET_I_LOCK_LOCATION(ilk, loc);
954  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
955  KA_TRACE(20,
956           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
957#if USE_ITT_BUILD
958  __kmp_itt_critical_creating(ilk->lock, loc);
959#endif
960  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
961  if (status == 0) {
962#if USE_ITT_BUILD
963    __kmp_itt_critical_destroyed(ilk->lock);
964#endif
965    // We don't really need to destroy the unclaimed lock here since it will be
966    // cleaned up at program exit.
967    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
968  }
969  KMP_DEBUG_ASSERT(*lck != NULL);
970}
971
972// Fast-path acquire tas lock
973#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
974  {                                                                            \
975    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
976    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
977    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
978    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
979        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
980      kmp_uint32 spins;                                                        \
981      KMP_FSYNC_PREPARE(l);                                                    \
982      KMP_INIT_YIELD(spins);                                                   \
983      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
984      do {                                                                     \
985        if (TCR_4(__kmp_nth) >                                                 \
986            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
987          KMP_YIELD(TRUE);                                                     \
988        } else {                                                               \
989          KMP_YIELD_SPIN(spins);                                               \
990        }                                                                      \
991        __kmp_spin_backoff(&backoff);                                          \
992      } while (                                                                \
993          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
994          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
995    }                                                                          \
996    KMP_FSYNC_ACQUIRED(l);                                                     \
997  }
998
999// Fast-path test tas lock
1000#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1001  {                                                                            \
1002    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1003    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1004    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1005    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1006         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1007  }
1008
1009// Fast-path release tas lock
1010#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1011  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1012
1013#if KMP_USE_FUTEX
1014
1015#include <sys/syscall.h>
1016#include <unistd.h>
1017#ifndef FUTEX_WAIT
1018#define FUTEX_WAIT 0
1019#endif
1020#ifndef FUTEX_WAKE
1021#define FUTEX_WAKE 1
1022#endif
1023
1024// Fast-path acquire futex lock
1025#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1026  {                                                                            \
1027    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1028    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1029    KMP_MB();                                                                  \
1030    KMP_FSYNC_PREPARE(ftx);                                                    \
1031    kmp_int32 poll_val;                                                        \
1032    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1033                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1034                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1035      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1036      if (!cond) {                                                             \
1037        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1038                                         poll_val |                            \
1039                                             KMP_LOCK_BUSY(1, futex))) {       \
1040          continue;                                                            \
1041        }                                                                      \
1042        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1043      }                                                                        \
1044      kmp_int32 rc;                                                            \
1045      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1046                        NULL, NULL, 0)) != 0) {                                \
1047        continue;                                                              \
1048      }                                                                        \
1049      gtid_code |= 1;                                                          \
1050    }                                                                          \
1051    KMP_FSYNC_ACQUIRED(ftx);                                                   \
1052  }
1053
1054// Fast-path test futex lock
1055#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1056  {                                                                            \
1057    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1058    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1059                                    KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1060      KMP_FSYNC_ACQUIRED(ftx);                                                 \
1061      rc = TRUE;                                                               \
1062    } else {                                                                   \
1063      rc = FALSE;                                                              \
1064    }                                                                          \
1065  }
1066
1067// Fast-path release futex lock
1068#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1069  {                                                                            \
1070    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1071    KMP_MB();                                                                  \
1072    KMP_FSYNC_RELEASING(ftx);                                                  \
1073    kmp_int32 poll_val =                                                       \
1074        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1075    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1076      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1077              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1078    }                                                                          \
1079    KMP_MB();                                                                  \
1080    KMP_YIELD_OVERSUB();                                                       \
1081  }
1082
1083#endif // KMP_USE_FUTEX
1084
1085#else // KMP_USE_DYNAMIC_LOCK
1086
1087static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1088                                                      ident_t const *loc,
1089                                                      kmp_int32 gtid) {
1090  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1091
1092  // Because of the double-check, the following load doesn't need to be volatile
1093  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1094
1095  if (lck == NULL) {
1096    void *idx;
1097
1098    // Allocate & initialize the lock.
1099    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1100    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1101    __kmp_init_user_lock_with_checks(lck);
1102    __kmp_set_user_lock_location(lck, loc);
1103#if USE_ITT_BUILD
1104    __kmp_itt_critical_creating(lck);
1105// __kmp_itt_critical_creating() should be called *before* the first usage
1106// of underlying lock. It is the only place where we can guarantee it. There
1107// are chances the lock will destroyed with no usage, but it is not a
1108// problem, because this is not real event seen by user but rather setting
1109// name for object (lock). See more details in kmp_itt.h.
1110#endif /* USE_ITT_BUILD */
1111
1112    // Use a cmpxchg instruction to slam the start of the critical section with
1113    // the lock pointer.  If another thread beat us to it, deallocate the lock,
1114    // and use the lock that the other thread allocated.
1115    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1116
1117    if (status == 0) {
1118// Deallocate the lock and reload the value.
1119#if USE_ITT_BUILD
1120      __kmp_itt_critical_destroyed(lck);
1121// Let ITT know the lock is destroyed and the same memory location may be reused
1122// for another purpose.
1123#endif /* USE_ITT_BUILD */
1124      __kmp_destroy_user_lock_with_checks(lck);
1125      __kmp_user_lock_free(&idx, gtid, lck);
1126      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1127      KMP_DEBUG_ASSERT(lck != NULL);
1128    }
1129  }
1130  return lck;
1131}
1132
1133#endif // KMP_USE_DYNAMIC_LOCK
1134
1135/*!
1136@ingroup WORK_SHARING
1137@param loc  source location information.
1138@param global_tid  global thread number .
1139@param crit identity of the critical section. This could be a pointer to a lock
1140associated with the critical section, or some other suitably unique value.
1141
1142Enter code protected by a `critical` construct.
1143This function blocks until the executing thread can enter the critical section.
1144*/
1145void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1146                     kmp_critical_name *crit) {
1147#if KMP_USE_DYNAMIC_LOCK
1148#if OMPT_SUPPORT && OMPT_OPTIONAL
1149  OMPT_STORE_RETURN_ADDRESS(global_tid);
1150#endif // OMPT_SUPPORT
1151  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1152#else
1153  KMP_COUNT_BLOCK(OMP_CRITICAL);
1154#if OMPT_SUPPORT && OMPT_OPTIONAL
1155  ompt_state_t prev_state = ompt_state_undefined;
1156  ompt_thread_info_t ti;
1157#endif
1158  kmp_user_lock_p lck;
1159
1160  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1161
1162  // TODO: add THR_OVHD_STATE
1163
1164  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1165  KMP_CHECK_USER_LOCK_INIT();
1166
1167  if ((__kmp_user_lock_kind == lk_tas) &&
1168      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1169    lck = (kmp_user_lock_p)crit;
1170  }
1171#if KMP_USE_FUTEX
1172  else if ((__kmp_user_lock_kind == lk_futex) &&
1173           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1174    lck = (kmp_user_lock_p)crit;
1175  }
1176#endif
1177  else { // ticket, queuing or drdpa
1178    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1179  }
1180
1181  if (__kmp_env_consistency_check)
1182    __kmp_push_sync(global_tid, ct_critical, loc, lck);
1183
1184// since the critical directive binds to all threads, not just the current
1185// team we have to check this even if we are in a serialized team.
1186// also, even if we are the uber thread, we still have to conduct the lock,
1187// as we have to contend with sibling threads.
1188
1189#if USE_ITT_BUILD
1190  __kmp_itt_critical_acquiring(lck);
1191#endif /* USE_ITT_BUILD */
1192#if OMPT_SUPPORT && OMPT_OPTIONAL
1193  OMPT_STORE_RETURN_ADDRESS(gtid);
1194  void *codeptr_ra = NULL;
1195  if (ompt_enabled.enabled) {
1196    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1197    /* OMPT state update */
1198    prev_state = ti.state;
1199    ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1200    ti.state = ompt_state_wait_critical;
1201
1202    /* OMPT event callback */
1203    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1204    if (ompt_enabled.ompt_callback_mutex_acquire) {
1205      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1206          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1207          (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1208    }
1209  }
1210#endif
1211  // Value of 'crit' should be good for using as a critical_id of the critical
1212  // section directive.
1213  __kmp_acquire_user_lock_with_checks(lck, global_tid);
1214
1215#if USE_ITT_BUILD
1216  __kmp_itt_critical_acquired(lck);
1217#endif /* USE_ITT_BUILD */
1218#if OMPT_SUPPORT && OMPT_OPTIONAL
1219  if (ompt_enabled.enabled) {
1220    /* OMPT state update */
1221    ti.state = prev_state;
1222    ti.wait_id = 0;
1223
1224    /* OMPT event callback */
1225    if (ompt_enabled.ompt_callback_mutex_acquired) {
1226      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1227          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1228    }
1229  }
1230#endif
1231  KMP_POP_PARTITIONED_TIMER();
1232
1233  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1234  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1235#endif // KMP_USE_DYNAMIC_LOCK
1236}
1237
1238#if KMP_USE_DYNAMIC_LOCK
1239
1240// Converts the given hint to an internal lock implementation
1241static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1242#if KMP_USE_TSX
1243#define KMP_TSX_LOCK(seq) lockseq_##seq
1244#else
1245#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1246#endif
1247
1248#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1249#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1250#else
1251#define KMP_CPUINFO_RTM 0
1252#endif
1253
1254  // Hints that do not require further logic
1255  if (hint & kmp_lock_hint_hle)
1256    return KMP_TSX_LOCK(hle);
1257  if (hint & kmp_lock_hint_rtm)
1258    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1259  if (hint & kmp_lock_hint_adaptive)
1260    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1261
1262  // Rule out conflicting hints first by returning the default lock
1263  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1264    return __kmp_user_lock_seq;
1265  if ((hint & omp_lock_hint_speculative) &&
1266      (hint & omp_lock_hint_nonspeculative))
1267    return __kmp_user_lock_seq;
1268
1269  // Do not even consider speculation when it appears to be contended
1270  if (hint & omp_lock_hint_contended)
1271    return lockseq_queuing;
1272
1273  // Uncontended lock without speculation
1274  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1275    return lockseq_tas;
1276
1277  // HLE lock for speculation
1278  if (hint & omp_lock_hint_speculative)
1279    return KMP_TSX_LOCK(hle);
1280
1281  return __kmp_user_lock_seq;
1282}
1283
1284#if OMPT_SUPPORT && OMPT_OPTIONAL
1285#if KMP_USE_DYNAMIC_LOCK
1286static kmp_mutex_impl_t
1287__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1288  if (user_lock) {
1289    switch (KMP_EXTRACT_D_TAG(user_lock)) {
1290    case 0:
1291      break;
1292#if KMP_USE_FUTEX
1293    case locktag_futex:
1294      return kmp_mutex_impl_queuing;
1295#endif
1296    case locktag_tas:
1297      return kmp_mutex_impl_spin;
1298#if KMP_USE_TSX
1299    case locktag_hle:
1300      return kmp_mutex_impl_speculative;
1301#endif
1302    default:
1303      return kmp_mutex_impl_none;
1304    }
1305    ilock = KMP_LOOKUP_I_LOCK(user_lock);
1306  }
1307  KMP_ASSERT(ilock);
1308  switch (ilock->type) {
1309#if KMP_USE_TSX
1310  case locktag_adaptive:
1311  case locktag_rtm:
1312    return kmp_mutex_impl_speculative;
1313#endif
1314  case locktag_nested_tas:
1315    return kmp_mutex_impl_spin;
1316#if KMP_USE_FUTEX
1317  case locktag_nested_futex:
1318#endif
1319  case locktag_ticket:
1320  case locktag_queuing:
1321  case locktag_drdpa:
1322  case locktag_nested_ticket:
1323  case locktag_nested_queuing:
1324  case locktag_nested_drdpa:
1325    return kmp_mutex_impl_queuing;
1326  default:
1327    return kmp_mutex_impl_none;
1328  }
1329}
1330#else
1331// For locks without dynamic binding
1332static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1333  switch (__kmp_user_lock_kind) {
1334  case lk_tas:
1335    return kmp_mutex_impl_spin;
1336#if KMP_USE_FUTEX
1337  case lk_futex:
1338#endif
1339  case lk_ticket:
1340  case lk_queuing:
1341  case lk_drdpa:
1342    return kmp_mutex_impl_queuing;
1343#if KMP_USE_TSX
1344  case lk_hle:
1345  case lk_rtm:
1346  case lk_adaptive:
1347    return kmp_mutex_impl_speculative;
1348#endif
1349  default:
1350    return kmp_mutex_impl_none;
1351  }
1352}
1353#endif // KMP_USE_DYNAMIC_LOCK
1354#endif // OMPT_SUPPORT && OMPT_OPTIONAL
1355
1356/*!
1357@ingroup WORK_SHARING
1358@param loc  source location information.
1359@param global_tid  global thread number.
1360@param crit identity of the critical section. This could be a pointer to a lock
1361associated with the critical section, or some other suitably unique value.
1362@param hint the lock hint.
1363
1364Enter code protected by a `critical` construct with a hint. The hint value is
1365used to suggest a lock implementation. This function blocks until the executing
1366thread can enter the critical section unless the hint suggests use of
1367speculative execution and the hardware supports it.
1368*/
1369void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1370                               kmp_critical_name *crit, uint32_t hint) {
1371  KMP_COUNT_BLOCK(OMP_CRITICAL);
1372  kmp_user_lock_p lck;
1373#if OMPT_SUPPORT && OMPT_OPTIONAL
1374  ompt_state_t prev_state = ompt_state_undefined;
1375  ompt_thread_info_t ti;
1376  // This is the case, if called from __kmpc_critical:
1377  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1378  if (!codeptr)
1379    codeptr = OMPT_GET_RETURN_ADDRESS(0);
1380#endif
1381
1382  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1383
1384  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1385  // Check if it is initialized.
1386  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1387  if (*lk == 0) {
1388    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1389    if (KMP_IS_D_LOCK(lckseq)) {
1390      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1391                                  KMP_GET_D_TAG(lckseq));
1392    } else {
1393      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1394    }
1395  }
1396  // Branch for accessing the actual lock object and set operation. This
1397  // branching is inevitable since this lock initialization does not follow the
1398  // normal dispatch path (lock table is not used).
1399  if (KMP_EXTRACT_D_TAG(lk) != 0) {
1400    lck = (kmp_user_lock_p)lk;
1401    if (__kmp_env_consistency_check) {
1402      __kmp_push_sync(global_tid, ct_critical, loc, lck,
1403                      __kmp_map_hint_to_lock(hint));
1404    }
1405#if USE_ITT_BUILD
1406    __kmp_itt_critical_acquiring(lck);
1407#endif
1408#if OMPT_SUPPORT && OMPT_OPTIONAL
1409    if (ompt_enabled.enabled) {
1410      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1411      /* OMPT state update */
1412      prev_state = ti.state;
1413      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1414      ti.state = ompt_state_wait_critical;
1415
1416      /* OMPT event callback */
1417      if (ompt_enabled.ompt_callback_mutex_acquire) {
1418        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1419            ompt_mutex_critical, (unsigned int)hint,
1420            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1421            codeptr);
1422      }
1423    }
1424#endif
1425#if KMP_USE_INLINED_TAS
1426    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1427      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1428    } else
1429#elif KMP_USE_INLINED_FUTEX
1430    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1431      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1432    } else
1433#endif
1434    {
1435      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1436    }
1437  } else {
1438    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1439    lck = ilk->lock;
1440    if (__kmp_env_consistency_check) {
1441      __kmp_push_sync(global_tid, ct_critical, loc, lck,
1442                      __kmp_map_hint_to_lock(hint));
1443    }
1444#if USE_ITT_BUILD
1445    __kmp_itt_critical_acquiring(lck);
1446#endif
1447#if OMPT_SUPPORT && OMPT_OPTIONAL
1448    if (ompt_enabled.enabled) {
1449      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1450      /* OMPT state update */
1451      prev_state = ti.state;
1452      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1453      ti.state = ompt_state_wait_critical;
1454
1455      /* OMPT event callback */
1456      if (ompt_enabled.ompt_callback_mutex_acquire) {
1457        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1458            ompt_mutex_critical, (unsigned int)hint,
1459            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1460            codeptr);
1461      }
1462    }
1463#endif
1464    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1465  }
1466  KMP_POP_PARTITIONED_TIMER();
1467
1468#if USE_ITT_BUILD
1469  __kmp_itt_critical_acquired(lck);
1470#endif /* USE_ITT_BUILD */
1471#if OMPT_SUPPORT && OMPT_OPTIONAL
1472  if (ompt_enabled.enabled) {
1473    /* OMPT state update */
1474    ti.state = prev_state;
1475    ti.wait_id = 0;
1476
1477    /* OMPT event callback */
1478    if (ompt_enabled.ompt_callback_mutex_acquired) {
1479      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1480          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1481    }
1482  }
1483#endif
1484
1485  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1486  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1487} // __kmpc_critical_with_hint
1488
1489#endif // KMP_USE_DYNAMIC_LOCK
1490
1491/*!
1492@ingroup WORK_SHARING
1493@param loc  source location information.
1494@param global_tid  global thread number .
1495@param crit identity of the critical section. This could be a pointer to a lock
1496associated with the critical section, or some other suitably unique value.
1497
1498Leave a critical section, releasing any lock that was held during its execution.
1499*/
1500void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1501                         kmp_critical_name *crit) {
1502  kmp_user_lock_p lck;
1503
1504  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1505
1506#if KMP_USE_DYNAMIC_LOCK
1507  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1508    lck = (kmp_user_lock_p)crit;
1509    KMP_ASSERT(lck != NULL);
1510    if (__kmp_env_consistency_check) {
1511      __kmp_pop_sync(global_tid, ct_critical, loc);
1512    }
1513#if USE_ITT_BUILD
1514    __kmp_itt_critical_releasing(lck);
1515#endif
1516#if KMP_USE_INLINED_TAS
1517    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1518      KMP_RELEASE_TAS_LOCK(lck, global_tid);
1519    } else
1520#elif KMP_USE_INLINED_FUTEX
1521    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1522      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1523    } else
1524#endif
1525    {
1526      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1527    }
1528  } else {
1529    kmp_indirect_lock_t *ilk =
1530        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1531    KMP_ASSERT(ilk != NULL);
1532    lck = ilk->lock;
1533    if (__kmp_env_consistency_check) {
1534      __kmp_pop_sync(global_tid, ct_critical, loc);
1535    }
1536#if USE_ITT_BUILD
1537    __kmp_itt_critical_releasing(lck);
1538#endif
1539    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1540  }
1541
1542#else // KMP_USE_DYNAMIC_LOCK
1543
1544  if ((__kmp_user_lock_kind == lk_tas) &&
1545      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1546    lck = (kmp_user_lock_p)crit;
1547  }
1548#if KMP_USE_FUTEX
1549  else if ((__kmp_user_lock_kind == lk_futex) &&
1550           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1551    lck = (kmp_user_lock_p)crit;
1552  }
1553#endif
1554  else { // ticket, queuing or drdpa
1555    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1556  }
1557
1558  KMP_ASSERT(lck != NULL);
1559
1560  if (__kmp_env_consistency_check)
1561    __kmp_pop_sync(global_tid, ct_critical, loc);
1562
1563#if USE_ITT_BUILD
1564  __kmp_itt_critical_releasing(lck);
1565#endif /* USE_ITT_BUILD */
1566  // Value of 'crit' should be good for using as a critical_id of the critical
1567  // section directive.
1568  __kmp_release_user_lock_with_checks(lck, global_tid);
1569
1570#endif // KMP_USE_DYNAMIC_LOCK
1571
1572#if OMPT_SUPPORT && OMPT_OPTIONAL
1573  /* OMPT release event triggers after lock is released; place here to trigger
1574   * for all #if branches */
1575  OMPT_STORE_RETURN_ADDRESS(global_tid);
1576  if (ompt_enabled.ompt_callback_mutex_released) {
1577    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1578        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1579        OMPT_LOAD_RETURN_ADDRESS(0));
1580  }
1581#endif
1582
1583  KMP_POP_PARTITIONED_TIMER();
1584  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1585}
1586
1587/*!
1588@ingroup SYNCHRONIZATION
1589@param loc source location information
1590@param global_tid thread id.
1591@return one if the thread should execute the master block, zero otherwise
1592
1593Start execution of a combined barrier and master. The barrier is executed inside
1594this function.
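
A hedged sketch of the expected call pattern (<tt>loc</tt> and <tt>gtid</tt>
are placeholders for the compiler-generated location and thread number):
@code
if (__kmpc_barrier_master(loc, gtid)) {
  // code executed only by the master thread
  __kmpc_end_barrier_master(loc, gtid); // releases the other waiting threads
}
@endcode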
1595*/
1596kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1597  int status;
1598
1599  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1600
1601  if (!TCR_4(__kmp_init_parallel))
1602    __kmp_parallel_initialize();
1603
1604  __kmp_resume_if_soft_paused();
1605
1606  if (__kmp_env_consistency_check)
1607    __kmp_check_barrier(global_tid, ct_barrier, loc);
1608
1609#if OMPT_SUPPORT
1610  ompt_frame_t *ompt_frame;
1611  if (ompt_enabled.enabled) {
1612    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1613    if (ompt_frame->enter_frame.ptr == NULL)
1614      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1615    OMPT_STORE_RETURN_ADDRESS(global_tid);
1616  }
1617#endif
1618#if USE_ITT_NOTIFY
1619  __kmp_threads[global_tid]->th.th_ident = loc;
1620#endif
1621  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1622#if OMPT_SUPPORT && OMPT_OPTIONAL
1623  if (ompt_enabled.enabled) {
1624    ompt_frame->enter_frame = ompt_data_none;
1625  }
1626#endif
1627
1628  return (status != 0) ? 0 : 1;
1629}
1630
1631/*!
1632@ingroup SYNCHRONIZATION
1633@param loc source location information
1634@param global_tid thread id.
1635
1636Complete the execution of a combined barrier and master. This function should
1637only be called at the completion of the <tt>master</tt> code. Other threads will
1638still be waiting at the barrier and this call releases them.
1639*/
1640void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1641  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1642
1643  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1644}
1645
1646/*!
1647@ingroup SYNCHRONIZATION
1648@param loc source location information
1649@param global_tid thread id.
1650@return one if the thread should execute the master block, zero otherwise
1651
1652Start execution of a combined barrier and master(nowait) construct.
1653The barrier is executed inside this function.
There is no equivalent "end" function: the other threads do not wait for the
master code to finish, so each thread simply continues once the barrier
completes.
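
A hedged sketch of the expected call pattern (<tt>loc</tt> and <tt>gtid</tt>
are placeholders):
@code
if (__kmpc_barrier_master_nowait(loc, gtid)) {
  // code executed only by the master thread; no "end" call is emitted
}
@endcode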
1655*/
1656kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1657  kmp_int32 ret;
1658
1659  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1660
1661  if (!TCR_4(__kmp_init_parallel))
1662    __kmp_parallel_initialize();
1663
1664  __kmp_resume_if_soft_paused();
1665
1666  if (__kmp_env_consistency_check) {
1667    if (loc == 0) {
1668      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1669    }
1670    __kmp_check_barrier(global_tid, ct_barrier, loc);
1671  }
1672
1673#if OMPT_SUPPORT
1674  ompt_frame_t *ompt_frame;
1675  if (ompt_enabled.enabled) {
1676    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1677    if (ompt_frame->enter_frame.ptr == NULL)
1678      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1679    OMPT_STORE_RETURN_ADDRESS(global_tid);
1680  }
1681#endif
1682#if USE_ITT_NOTIFY
1683  __kmp_threads[global_tid]->th.th_ident = loc;
1684#endif
1685  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1686#if OMPT_SUPPORT && OMPT_OPTIONAL
1687  if (ompt_enabled.enabled) {
1688    ompt_frame->enter_frame = ompt_data_none;
1689  }
1690#endif
1691
1692  ret = __kmpc_master(loc, global_tid);
1693
1694  if (__kmp_env_consistency_check) {
    /* there is no __kmpc_end_master call, so the (stats)  */
    /* actions of __kmpc_end_master are performed here     */
1697
1698    if (global_tid < 0) {
1699      KMP_WARNING(ThreadIdentInvalid);
1700    }
1701    if (ret) {
1702      /* only one thread should do the pop since only */
1703      /* one did the push (see __kmpc_master())       */
1704
1705      __kmp_pop_sync(global_tid, ct_master, loc);
1706    }
1707  }
1708
1709  return (ret);
1710}
1711
1712/* The BARRIER for a SINGLE process section is always explicit   */
1713/*!
1714@ingroup WORK_SHARING
1715@param loc  source location information
1716@param global_tid  global thread number
1717@return One if this thread should execute the single construct, zero otherwise.
1718
1719Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
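
As a hedged illustration (a lowering sketch, not literal compiler output), a
<tt>single</tt> construct without a <tt>nowait</tt> clause might become:
@code
if (__kmpc_single(loc, gtid)) {
  // body executed by exactly one thread of the team
  __kmpc_end_single(loc, gtid);
}
__kmpc_barrier(loc, gtid); // explicit barrier added by the compiler
@endcode
With a <tt>nowait</tt> clause the trailing barrier is simply omitted.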
1722*/
1723
1724kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1725  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1726
1727  if (rc) {
1728    // We are going to execute the single statement, so we should count it.
1729    KMP_COUNT_BLOCK(OMP_SINGLE);
1730    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1731  }
1732
1733#if OMPT_SUPPORT && OMPT_OPTIONAL
1734  kmp_info_t *this_thr = __kmp_threads[global_tid];
1735  kmp_team_t *team = this_thr->th.th_team;
1736  int tid = __kmp_tid_from_gtid(global_tid);
1737
1738  if (ompt_enabled.enabled) {
1739    if (rc) {
1740      if (ompt_enabled.ompt_callback_work) {
1741        ompt_callbacks.ompt_callback(ompt_callback_work)(
1742            ompt_work_single_executor, ompt_scope_begin,
1743            &(team->t.ompt_team_info.parallel_data),
1744            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1745            1, OMPT_GET_RETURN_ADDRESS(0));
1746      }
1747    } else {
1748      if (ompt_enabled.ompt_callback_work) {
1749        ompt_callbacks.ompt_callback(ompt_callback_work)(
1750            ompt_work_single_other, ompt_scope_begin,
1751            &(team->t.ompt_team_info.parallel_data),
1752            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1753            1, OMPT_GET_RETURN_ADDRESS(0));
1754        ompt_callbacks.ompt_callback(ompt_callback_work)(
1755            ompt_work_single_other, ompt_scope_end,
1756            &(team->t.ompt_team_info.parallel_data),
1757            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1758            1, OMPT_GET_RETURN_ADDRESS(0));
1759      }
1760    }
1761  }
1762#endif
1763
1764  return rc;
1765}
1766
1767/*!
1768@ingroup WORK_SHARING
1769@param loc  source location information
1770@param global_tid  global thread number
1771
1772Mark the end of a <tt>single</tt> construct.  This function should
1773only be called by the thread that executed the block of code protected
1774by the `single` construct.
1775*/
1776void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1777  __kmp_exit_single(global_tid);
1778  KMP_POP_PARTITIONED_TIMER();
1779
1780#if OMPT_SUPPORT && OMPT_OPTIONAL
1781  kmp_info_t *this_thr = __kmp_threads[global_tid];
1782  kmp_team_t *team = this_thr->th.th_team;
1783  int tid = __kmp_tid_from_gtid(global_tid);
1784
1785  if (ompt_enabled.ompt_callback_work) {
1786    ompt_callbacks.ompt_callback(ompt_callback_work)(
1787        ompt_work_single_executor, ompt_scope_end,
1788        &(team->t.ompt_team_info.parallel_data),
1789        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1790        OMPT_GET_RETURN_ADDRESS(0));
1791  }
1792#endif
1793}
1794
1795/*!
1796@ingroup WORK_SHARING
1797@param loc Source location
1798@param global_tid Global thread id
1799
1800Mark the end of a statically scheduled loop.
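
A hedged sketch of how this call pairs with the matching init call, here using
__kmpc_for_static_init_4 for a 32-bit signed loop (variable names are
illustrative only; kmp_sch_static denotes the plain static schedule):
@code
kmp_int32 lower = 0, upper = n - 1, stride = 1, last = 0;
__kmpc_for_static_init_4(loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 0); // incr = 1, chunk = 0
for (kmp_int32 i = lower; i <= upper; ++i) {
  // loop body for this thread's chunk
}
__kmpc_for_static_fini(loc, gtid);
@endcode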
1801*/
1802void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1803  KMP_POP_PARTITIONED_TIMER();
1804  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1805
1806#if OMPT_SUPPORT && OMPT_OPTIONAL
1807  if (ompt_enabled.ompt_callback_work) {
1808    ompt_work_t ompt_work_type = ompt_work_loop;
1809    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1810    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1811    // Determine workshare type
1812    if (loc != NULL) {
1813      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1814        ompt_work_type = ompt_work_loop;
1815      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1816        ompt_work_type = ompt_work_sections;
1817      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1818        ompt_work_type = ompt_work_distribute;
1819      } else {
1820        // use default set above.
1821        // a warning about this case is provided in __kmpc_for_static_init
1822      }
1823      KMP_DEBUG_ASSERT(ompt_work_type);
1824    }
1825    ompt_callbacks.ompt_callback(ompt_callback_work)(
1826        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1827        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1828  }
1829#endif
1830  if (__kmp_env_consistency_check)
1831    __kmp_pop_workshare(global_tid, ct_pdo, loc);
1832}
1833
1834// User routines which take C-style arguments (call by value)
1835// different from the Fortran equivalent routines
1836
1837void ompc_set_num_threads(int arg) {
1838  // !!!!! TODO: check the per-task binding
1839  __kmp_set_num_threads(arg, __kmp_entry_gtid());
1840}
1841
1842void ompc_set_dynamic(int flag) {
1843  kmp_info_t *thread;
1844
1845  /* For the thread-private implementation of the internal controls */
1846  thread = __kmp_entry_thread();
1847
1848  __kmp_save_internal_controls(thread);
1849
1850  set__dynamic(thread, flag ? TRUE : FALSE);
1851}
1852
1853void ompc_set_nested(int flag) {
1854  kmp_info_t *thread;
1855
1856  /* For the thread-private internal controls implementation */
1857  thread = __kmp_entry_thread();
1858
1859  __kmp_save_internal_controls(thread);
1860
1861  set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1862}
1863
1864void ompc_set_max_active_levels(int max_active_levels) {
1865  /* TO DO */
1866  /* we want per-task implementation of this internal control */
1867
1868  /* For the per-thread internal controls implementation */
1869  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1870}
1871
1872void ompc_set_schedule(omp_sched_t kind, int modifier) {
1873  // !!!!! TODO: check the per-task binding
1874  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1875}
1876
1877int ompc_get_ancestor_thread_num(int level) {
1878  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1879}
1880
1881int ompc_get_team_size(int level) {
1882  return __kmp_get_team_size(__kmp_entry_gtid(), level);
1883}
1884
1885/* OpenMP 5.0 Affinity Format API */
1886
1887void ompc_set_affinity_format(char const *format) {
1888  if (!__kmp_init_serial) {
1889    __kmp_serial_initialize();
1890  }
1891  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1892                         format, KMP_STRLEN(format) + 1);
1893}
1894
1895size_t ompc_get_affinity_format(char *buffer, size_t size) {
1896  size_t format_size;
1897  if (!__kmp_init_serial) {
1898    __kmp_serial_initialize();
1899  }
1900  format_size = KMP_STRLEN(__kmp_affinity_format);
1901  if (buffer && size) {
1902    __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1903                           format_size + 1);
1904  }
1905  return format_size;
1906}
1907
1908void ompc_display_affinity(char const *format) {
1909  int gtid;
1910  if (!TCR_4(__kmp_init_middle)) {
1911    __kmp_middle_initialize();
1912  }
1913  gtid = __kmp_get_gtid();
1914  __kmp_aux_display_affinity(gtid, format);
1915}
1916
1917size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1918                             char const *format) {
1919  int gtid;
1920  size_t num_required;
1921  kmp_str_buf_t capture_buf;
1922  if (!TCR_4(__kmp_init_middle)) {
1923    __kmp_middle_initialize();
1924  }
1925  gtid = __kmp_get_gtid();
1926  __kmp_str_buf_init(&capture_buf);
1927  num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1928  if (buffer && buf_size) {
1929    __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1930                           capture_buf.used + 1);
1931  }
1932  __kmp_str_buf_free(&capture_buf);
1933  return num_required;
1934}
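
// Usage sketch (hypothetical caller, not part of the runtime): the routine can
// be called with a NULL/zero buffer first to learn the required size, then
// again with a buffer large enough to hold the result. The +1 below assumes,
// as the used+1 copy above suggests, that the return value does not include
// the terminating nul; the format string is illustrative only.
//
//   size_t needed = ompc_capture_affinity(NULL, 0, "thread %n bound to %A");
//   char *buf = (char *)malloc(needed + 1);
//   ompc_capture_affinity(buf, needed + 1, "thread %n bound to %A");
//   ... use buf ...
//   free(buf);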
1935
1936void kmpc_set_stacksize(int arg) {
1937  // __kmp_aux_set_stacksize initializes the library if needed
1938  __kmp_aux_set_stacksize(arg);
1939}
1940
1941void kmpc_set_stacksize_s(size_t arg) {
1942  // __kmp_aux_set_stacksize initializes the library if needed
1943  __kmp_aux_set_stacksize(arg);
1944}
1945
1946void kmpc_set_blocktime(int arg) {
1947  int gtid, tid;
1948  kmp_info_t *thread;
1949
1950  gtid = __kmp_entry_gtid();
1951  tid = __kmp_tid_from_gtid(gtid);
1952  thread = __kmp_thread_from_gtid(gtid);
1953
1954  __kmp_aux_set_blocktime(arg, thread, tid);
1955}
1956
1957void kmpc_set_library(int arg) {
1958  // __kmp_user_set_library initializes the library if needed
1959  __kmp_user_set_library((enum library_type)arg);
1960}
1961
1962void kmpc_set_defaults(char const *str) {
1963  // __kmp_aux_set_defaults initializes the library if needed
1964  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1965}
1966
1967void kmpc_set_disp_num_buffers(int arg) {
1968  // ignore after initialization because some teams have already
1969  // allocated dispatch buffers
1970  if (__kmp_init_serial == 0 && arg > 0)
1971    __kmp_dispatch_num_buffers = arg;
1972}
1973
1974int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1975#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1976  return -1;
1977#else
1978  if (!TCR_4(__kmp_init_middle)) {
1979    __kmp_middle_initialize();
1980  }
1981  return __kmp_aux_set_affinity_mask_proc(proc, mask);
1982#endif
1983}
1984
1985int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1986#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1987  return -1;
1988#else
1989  if (!TCR_4(__kmp_init_middle)) {
1990    __kmp_middle_initialize();
1991  }
1992  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1993#endif
1994}
1995
1996int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1997#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1998  return -1;
1999#else
2000  if (!TCR_4(__kmp_init_middle)) {
2001    __kmp_middle_initialize();
2002  }
2003  return __kmp_aux_get_affinity_mask_proc(proc, mask);
2004#endif
2005}
2006
2007/* -------------------------------------------------------------------------- */
2008/*!
2009@ingroup THREADPRIVATE
2010@param loc       source location information
2011@param gtid      global thread number
2012@param cpy_size  size of the cpy_data buffer
2013@param cpy_data  pointer to data to be copied
2014@param cpy_func  helper function to call for copying data
2015@param didit     flag variable: 1=single thread; 0=not single thread
2016
2017__kmpc_copyprivate implements the interface for the private data broadcast
2018needed for the copyprivate clause associated with a single region in an
2019OpenMP<sup>*</sup> program (both C and Fortran).
2020All threads participating in the parallel region call this routine.
2021One of the threads (called the single thread) should have the <tt>didit</tt>
2022variable set to 1 and all other threads should have that variable set to 0.
2023All threads pass a pointer to a data buffer (cpy_data) that they have built.
2024
2025The OpenMP specification forbids the use of nowait on the single region when a
2026copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2027barrier internally to avoid race conditions, so the code generation for the
2028single region should avoid generating a barrier after the call to @ref
2029__kmpc_copyprivate.
2030
2031The <tt>gtid</tt> parameter is the global thread id for the current thread.
2032The <tt>loc</tt> parameter is a pointer to source location information.
2033
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location; each of the other threads then calls the
routine pointed to by cpy_func, passing its own cpy_data buffer as the
destination and the single thread's buffer as the source.
2038
2039The cpy_func routine used for the copy and the contents of the data area defined
2040by cpy_data and cpy_size may be built in any fashion that will allow the copy
2041to be done. For instance, the cpy_data buffer can hold the actual data to be
2042copied or it may hold a list of pointers to the data. The cpy_func routine must
2043interpret the cpy_data buffer appropriately.
2044
2045The interface to cpy_func is as follows:
2046@code
2047void cpy_func( void *destination, void *source )
2048@endcode
2049where void *destination is the cpy_data pointer for the thread being copied to
2050and void *source is the cpy_data pointer for the thread being copied from.
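
As a hedged example (one possible compiler strategy, not the only one),
broadcasting a single scalar could pass the variable's address directly as
cpy_data together with a trivial helper:
@code
static void copy_int(void *dst, void *src) {
  *(int *)dst = *(int *)src; // dst/src are the cpy_data pointers of two threads
}
// at the end of the single region, executed by every thread of the team:
__kmpc_copyprivate(loc, gtid, sizeof(int), &x, copy_int, didit);
@endcode
where <tt>x</tt> is the copyprivate variable of the calling thread and
<tt>didit</tt> is 1 only on the thread that executed the single block.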
2051*/
2052void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2053                        void *cpy_data, void (*cpy_func)(void *, void *),
2054                        kmp_int32 didit) {
2055  void **data_ptr;
2056
2057  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2058
2059  KMP_MB();
2060
2061  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2062
2063  if (__kmp_env_consistency_check) {
2064    if (loc == 0) {
2065      KMP_WARNING(ConstructIdentInvalid);
2066    }
2067  }
2068
2069  // ToDo: Optimize the following two barriers into some kind of split barrier
2070
2071  if (didit)
2072    *data_ptr = cpy_data;
2073
2074#if OMPT_SUPPORT
2075  ompt_frame_t *ompt_frame;
2076  if (ompt_enabled.enabled) {
2077    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2078    if (ompt_frame->enter_frame.ptr == NULL)
2079      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2080    OMPT_STORE_RETURN_ADDRESS(gtid);
2081  }
2082#endif
2083/* This barrier is not a barrier region boundary */
2084#if USE_ITT_NOTIFY
2085  __kmp_threads[gtid]->th.th_ident = loc;
2086#endif
2087  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2088
2089  if (!didit)
2090    (*cpy_func)(cpy_data, *data_ptr);
2091
2092// Consider next barrier a user-visible barrier for barrier region boundaries
2093// Nesting checks are already handled by the single construct checks
2094
2095#if OMPT_SUPPORT
2096  if (ompt_enabled.enabled) {
2097    OMPT_STORE_RETURN_ADDRESS(gtid);
2098  }
2099#endif
2100#if USE_ITT_NOTIFY
  // TODO: check if it is needed (e.g. tasks can overwrite the location)
  __kmp_threads[gtid]->th.th_ident = loc;
2103#endif
2104  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2105#if OMPT_SUPPORT && OMPT_OPTIONAL
2106  if (ompt_enabled.enabled) {
2107    ompt_frame->enter_frame = ompt_data_none;
2108  }
2109#endif
2110}
2111
2112/* -------------------------------------------------------------------------- */
2113
2114#define INIT_LOCK __kmp_init_user_lock_with_checks
2115#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2116#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2117#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2118#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2119#define ACQUIRE_NESTED_LOCK_TIMED                                              \
2120  __kmp_acquire_nested_user_lock_with_checks_timed
2121#define RELEASE_LOCK __kmp_release_user_lock_with_checks
2122#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2123#define TEST_LOCK __kmp_test_user_lock_with_checks
2124#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2125#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2126#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2127
2128// TODO: Make check abort messages use location info & pass it into
2129// with_checks routines
2130
2131#if KMP_USE_DYNAMIC_LOCK
2132
2133// internal lock initializer
2134static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2135                                                    kmp_dyna_lockseq_t seq) {
2136  if (KMP_IS_D_LOCK(seq)) {
2137    KMP_INIT_D_LOCK(lock, seq);
2138#if USE_ITT_BUILD
2139    __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2140#endif
2141  } else {
2142    KMP_INIT_I_LOCK(lock, seq);
2143#if USE_ITT_BUILD
2144    kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2145    __kmp_itt_lock_creating(ilk->lock, loc);
2146#endif
2147  }
2148}
2149
2150// internal nest lock initializer
2151static __forceinline void
2152__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2153                               kmp_dyna_lockseq_t seq) {
2154#if KMP_USE_TSX
2155  // Don't have nested lock implementation for speculative locks
2156  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2157    seq = __kmp_user_lock_seq;
2158#endif
2159  switch (seq) {
2160  case lockseq_tas:
2161    seq = lockseq_nested_tas;
2162    break;
2163#if KMP_USE_FUTEX
2164  case lockseq_futex:
2165    seq = lockseq_nested_futex;
2166    break;
2167#endif
2168  case lockseq_ticket:
2169    seq = lockseq_nested_ticket;
2170    break;
2171  case lockseq_queuing:
2172    seq = lockseq_nested_queuing;
2173    break;
2174  case lockseq_drdpa:
2175    seq = lockseq_nested_drdpa;
2176    break;
2177  default:
2178    seq = lockseq_nested_queuing;
2179  }
2180  KMP_INIT_I_LOCK(lock, seq);
2181#if USE_ITT_BUILD
2182  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2183  __kmp_itt_lock_creating(ilk->lock, loc);
2184#endif
2185}
2186
2187/* initialize the lock with a hint */
2188void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2189                                uintptr_t hint) {
2190  KMP_DEBUG_ASSERT(__kmp_init_serial);
2191  if (__kmp_env_consistency_check && user_lock == NULL) {
2192    KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2193  }
2194
2195  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2196
2197#if OMPT_SUPPORT && OMPT_OPTIONAL
2198  // This is the case, if called from omp_init_lock_with_hint:
2199  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2200  if (!codeptr)
2201    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2202  if (ompt_enabled.ompt_callback_lock_init) {
2203    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2204        ompt_mutex_lock, (omp_lock_hint_t)hint,
2205        __ompt_get_mutex_impl_type(user_lock),
2206        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2207  }
2208#endif
2209}
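
// A hedged usage sketch of the user-level entry point that reaches this code
// (omp_init_lock_with_hint from <omp.h>); omp_lock_hint_speculative is one of
// the standard hint constants and is shown only as an example:
//
//   omp_lock_t l;
//   omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
//   omp_set_lock(&l);
//   ... protected work ...
//   omp_unset_lock(&l);
//   omp_destroy_lock(&l);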
2210
2211/* initialize the lock with a hint */
2212void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2213                                     void **user_lock, uintptr_t hint) {
2214  KMP_DEBUG_ASSERT(__kmp_init_serial);
2215  if (__kmp_env_consistency_check && user_lock == NULL) {
2216    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2217  }
2218
2219  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2220
2221#if OMPT_SUPPORT && OMPT_OPTIONAL
2222  // This is the case, if called from omp_init_lock_with_hint:
2223  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2224  if (!codeptr)
2225    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2226  if (ompt_enabled.ompt_callback_lock_init) {
2227    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2228        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2229        __ompt_get_mutex_impl_type(user_lock),
2230        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2231  }
2232#endif
2233}
2234
2235#endif // KMP_USE_DYNAMIC_LOCK
2236
2237/* initialize the lock */
2238void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2239#if KMP_USE_DYNAMIC_LOCK
2240
2241  KMP_DEBUG_ASSERT(__kmp_init_serial);
2242  if (__kmp_env_consistency_check && user_lock == NULL) {
2243    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2244  }
2245  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2246
2247#if OMPT_SUPPORT && OMPT_OPTIONAL
2248  // This is the case, if called from omp_init_lock_with_hint:
2249  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2250  if (!codeptr)
2251    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2252  if (ompt_enabled.ompt_callback_lock_init) {
2253    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2254        ompt_mutex_lock, omp_lock_hint_none,
2255        __ompt_get_mutex_impl_type(user_lock),
2256        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2257  }
2258#endif
2259
2260#else // KMP_USE_DYNAMIC_LOCK
2261
2262  static char const *const func = "omp_init_lock";
2263  kmp_user_lock_p lck;
2264  KMP_DEBUG_ASSERT(__kmp_init_serial);
2265
2266  if (__kmp_env_consistency_check) {
2267    if (user_lock == NULL) {
2268      KMP_FATAL(LockIsUninitialized, func);
2269    }
2270  }
2271
2272  KMP_CHECK_USER_LOCK_INIT();
2273
2274  if ((__kmp_user_lock_kind == lk_tas) &&
2275      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2276    lck = (kmp_user_lock_p)user_lock;
2277  }
2278#if KMP_USE_FUTEX
2279  else if ((__kmp_user_lock_kind == lk_futex) &&
2280           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2281    lck = (kmp_user_lock_p)user_lock;
2282  }
2283#endif
2284  else {
2285    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2286  }
2287  INIT_LOCK(lck);
2288  __kmp_set_user_lock_location(lck, loc);
2289
2290#if OMPT_SUPPORT && OMPT_OPTIONAL
2291  // This is the case, if called from omp_init_lock_with_hint:
2292  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2293  if (!codeptr)
2294    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2295  if (ompt_enabled.ompt_callback_lock_init) {
2296    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2297        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2298        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2299  }
2300#endif
2301
2302#if USE_ITT_BUILD
2303  __kmp_itt_lock_creating(lck);
2304#endif /* USE_ITT_BUILD */
2305
2306#endif // KMP_USE_DYNAMIC_LOCK
2307} // __kmpc_init_lock
2308
2309/* initialize the lock */
2310void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2311#if KMP_USE_DYNAMIC_LOCK
2312
2313  KMP_DEBUG_ASSERT(__kmp_init_serial);
2314  if (__kmp_env_consistency_check && user_lock == NULL) {
2315    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2316  }
2317  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2318
2319#if OMPT_SUPPORT && OMPT_OPTIONAL
2320  // This is the case, if called from omp_init_lock_with_hint:
2321  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2322  if (!codeptr)
2323    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2324  if (ompt_enabled.ompt_callback_lock_init) {
2325    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2326        ompt_mutex_nest_lock, omp_lock_hint_none,
2327        __ompt_get_mutex_impl_type(user_lock),
2328        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2329  }
2330#endif
2331
2332#else // KMP_USE_DYNAMIC_LOCK
2333
2334  static char const *const func = "omp_init_nest_lock";
2335  kmp_user_lock_p lck;
2336  KMP_DEBUG_ASSERT(__kmp_init_serial);
2337
2338  if (__kmp_env_consistency_check) {
2339    if (user_lock == NULL) {
2340      KMP_FATAL(LockIsUninitialized, func);
2341    }
2342  }
2343
2344  KMP_CHECK_USER_LOCK_INIT();
2345
2346  if ((__kmp_user_lock_kind == lk_tas) &&
2347      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2348       OMP_NEST_LOCK_T_SIZE)) {
2349    lck = (kmp_user_lock_p)user_lock;
2350  }
2351#if KMP_USE_FUTEX
2352  else if ((__kmp_user_lock_kind == lk_futex) &&
2353           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2354            OMP_NEST_LOCK_T_SIZE)) {
2355    lck = (kmp_user_lock_p)user_lock;
2356  }
2357#endif
2358  else {
2359    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2360  }
2361
2362  INIT_NESTED_LOCK(lck);
2363  __kmp_set_user_lock_location(lck, loc);
2364
2365#if OMPT_SUPPORT && OMPT_OPTIONAL
2366  // This is the case, if called from omp_init_lock_with_hint:
2367  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2368  if (!codeptr)
2369    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2370  if (ompt_enabled.ompt_callback_lock_init) {
2371    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2372        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2373        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2374  }
2375#endif
2376
2377#if USE_ITT_BUILD
2378  __kmp_itt_lock_creating(lck);
2379#endif /* USE_ITT_BUILD */
2380
2381#endif // KMP_USE_DYNAMIC_LOCK
2382} // __kmpc_init_nest_lock
2383
2384void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2385#if KMP_USE_DYNAMIC_LOCK
2386
2387#if USE_ITT_BUILD
2388  kmp_user_lock_p lck;
2389  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2390    lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2391  } else {
2392    lck = (kmp_user_lock_p)user_lock;
2393  }
2394  __kmp_itt_lock_destroyed(lck);
2395#endif
2396#if OMPT_SUPPORT && OMPT_OPTIONAL
2397  // This is the case, if called from omp_init_lock_with_hint:
2398  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2399  if (!codeptr)
2400    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2401  if (ompt_enabled.ompt_callback_lock_destroy) {
2402    kmp_user_lock_p lck;
2403    if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2404      lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2405    } else {
2406      lck = (kmp_user_lock_p)user_lock;
2407    }
2408    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2409        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2410  }
2411#endif
2412  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2413#else
2414  kmp_user_lock_p lck;
2415
2416  if ((__kmp_user_lock_kind == lk_tas) &&
2417      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2418    lck = (kmp_user_lock_p)user_lock;
2419  }
2420#if KMP_USE_FUTEX
2421  else if ((__kmp_user_lock_kind == lk_futex) &&
2422           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2423    lck = (kmp_user_lock_p)user_lock;
2424  }
2425#endif
2426  else {
2427    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2428  }
2429
2430#if OMPT_SUPPORT && OMPT_OPTIONAL
2431  // This is the case, if called from omp_init_lock_with_hint:
2432  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2433  if (!codeptr)
2434    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2435  if (ompt_enabled.ompt_callback_lock_destroy) {
2436    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2437        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2438  }
2439#endif
2440
2441#if USE_ITT_BUILD
2442  __kmp_itt_lock_destroyed(lck);
2443#endif /* USE_ITT_BUILD */
2444  DESTROY_LOCK(lck);
2445
2446  if ((__kmp_user_lock_kind == lk_tas) &&
2447      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2448    ;
2449  }
2450#if KMP_USE_FUTEX
2451  else if ((__kmp_user_lock_kind == lk_futex) &&
2452           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2453    ;
2454  }
2455#endif
2456  else {
2457    __kmp_user_lock_free(user_lock, gtid, lck);
2458  }
2459#endif // KMP_USE_DYNAMIC_LOCK
2460} // __kmpc_destroy_lock
2461
2462/* destroy the lock */
2463void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2464#if KMP_USE_DYNAMIC_LOCK
2465
2466#if USE_ITT_BUILD
2467  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2468  __kmp_itt_lock_destroyed(ilk->lock);
2469#endif
2470#if OMPT_SUPPORT && OMPT_OPTIONAL
2471  // This is the case, if called from omp_init_lock_with_hint:
2472  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2473  if (!codeptr)
2474    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2475  if (ompt_enabled.ompt_callback_lock_destroy) {
2476    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2477        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2478  }
2479#endif
2480  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2481
2482#else // KMP_USE_DYNAMIC_LOCK
2483
2484  kmp_user_lock_p lck;
2485
2486  if ((__kmp_user_lock_kind == lk_tas) &&
2487      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2488       OMP_NEST_LOCK_T_SIZE)) {
2489    lck = (kmp_user_lock_p)user_lock;
2490  }
2491#if KMP_USE_FUTEX
2492  else if ((__kmp_user_lock_kind == lk_futex) &&
2493           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2494            OMP_NEST_LOCK_T_SIZE)) {
2495    lck = (kmp_user_lock_p)user_lock;
2496  }
2497#endif
2498  else {
2499    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2500  }
2501
2502#if OMPT_SUPPORT && OMPT_OPTIONAL
2503  // This is the case, if called from omp_init_lock_with_hint:
2504  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2505  if (!codeptr)
2506    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2507  if (ompt_enabled.ompt_callback_lock_destroy) {
2508    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2509        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2510  }
2511#endif
2512
2513#if USE_ITT_BUILD
2514  __kmp_itt_lock_destroyed(lck);
2515#endif /* USE_ITT_BUILD */
2516
2517  DESTROY_NESTED_LOCK(lck);
2518
2519  if ((__kmp_user_lock_kind == lk_tas) &&
2520      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2521       OMP_NEST_LOCK_T_SIZE)) {
2522    ;
2523  }
2524#if KMP_USE_FUTEX
2525  else if ((__kmp_user_lock_kind == lk_futex) &&
2526           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2527            OMP_NEST_LOCK_T_SIZE)) {
2528    ;
2529  }
2530#endif
2531  else {
2532    __kmp_user_lock_free(user_lock, gtid, lck);
2533  }
2534#endif // KMP_USE_DYNAMIC_LOCK
2535} // __kmpc_destroy_nest_lock
2536
2537void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2538  KMP_COUNT_BLOCK(OMP_set_lock);
2539#if KMP_USE_DYNAMIC_LOCK
2540  int tag = KMP_EXTRACT_D_TAG(user_lock);
2541#if USE_ITT_BUILD
2542  __kmp_itt_lock_acquiring(
2543      (kmp_user_lock_p)
2544          user_lock); // itt function will get to the right lock object.
2545#endif
2546#if OMPT_SUPPORT && OMPT_OPTIONAL
2547  // This is the case, if called from omp_init_lock_with_hint:
2548  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2549  if (!codeptr)
2550    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2551  if (ompt_enabled.ompt_callback_mutex_acquire) {
2552    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2553        ompt_mutex_lock, omp_lock_hint_none,
2554        __ompt_get_mutex_impl_type(user_lock),
2555        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2556  }
2557#endif
2558#if KMP_USE_INLINED_TAS
2559  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2560    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2561  } else
2562#elif KMP_USE_INLINED_FUTEX
2563  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2564    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2565  } else
2566#endif
2567  {
2568    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2569  }
2570#if USE_ITT_BUILD
2571  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2572#endif
2573#if OMPT_SUPPORT && OMPT_OPTIONAL
2574  if (ompt_enabled.ompt_callback_mutex_acquired) {
2575    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2576        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2577  }
2578#endif
2579
2580#else // KMP_USE_DYNAMIC_LOCK
2581
2582  kmp_user_lock_p lck;
2583
2584  if ((__kmp_user_lock_kind == lk_tas) &&
2585      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2586    lck = (kmp_user_lock_p)user_lock;
2587  }
2588#if KMP_USE_FUTEX
2589  else if ((__kmp_user_lock_kind == lk_futex) &&
2590           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2591    lck = (kmp_user_lock_p)user_lock;
2592  }
2593#endif
2594  else {
2595    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2596  }
2597
2598#if USE_ITT_BUILD
2599  __kmp_itt_lock_acquiring(lck);
2600#endif /* USE_ITT_BUILD */
2601#if OMPT_SUPPORT && OMPT_OPTIONAL
2602  // This is the case, if called from omp_init_lock_with_hint:
2603  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2604  if (!codeptr)
2605    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2606  if (ompt_enabled.ompt_callback_mutex_acquire) {
2607    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2608        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2609        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2610  }
2611#endif
2612
2613  ACQUIRE_LOCK(lck, gtid);
2614
2615#if USE_ITT_BUILD
2616  __kmp_itt_lock_acquired(lck);
2617#endif /* USE_ITT_BUILD */
2618
2619#if OMPT_SUPPORT && OMPT_OPTIONAL
2620  if (ompt_enabled.ompt_callback_mutex_acquired) {
2621    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2622        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2623  }
2624#endif
2625
2626#endif // KMP_USE_DYNAMIC_LOCK
2627}
2628
2629void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2630#if KMP_USE_DYNAMIC_LOCK
2631
2632#if USE_ITT_BUILD
2633  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2634#endif
2635#if OMPT_SUPPORT && OMPT_OPTIONAL
2636  // This is the case, if called from omp_init_lock_with_hint:
2637  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2638  if (!codeptr)
2639    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2640  if (ompt_enabled.enabled) {
2641    if (ompt_enabled.ompt_callback_mutex_acquire) {
2642      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2643          ompt_mutex_nest_lock, omp_lock_hint_none,
2644          __ompt_get_mutex_impl_type(user_lock),
2645          (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2646    }
2647  }
2648#endif
2649  int acquire_status =
2650      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2651  (void) acquire_status;
2652#if USE_ITT_BUILD
2653  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2654#endif
2655
2656#if OMPT_SUPPORT && OMPT_OPTIONAL
2657  if (ompt_enabled.enabled) {
2658    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2659      if (ompt_enabled.ompt_callback_mutex_acquired) {
2660        // lock_first
2661        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2662            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2663            codeptr);
2664      }
2665    } else {
2666      if (ompt_enabled.ompt_callback_nest_lock) {
2667        // lock_next
2668        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2669            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2670      }
2671    }
2672  }
2673#endif
2674
2675#else // KMP_USE_DYNAMIC_LOCK
2676  int acquire_status;
2677  kmp_user_lock_p lck;
2678
2679  if ((__kmp_user_lock_kind == lk_tas) &&
2680      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2681       OMP_NEST_LOCK_T_SIZE)) {
2682    lck = (kmp_user_lock_p)user_lock;
2683  }
2684#if KMP_USE_FUTEX
2685  else if ((__kmp_user_lock_kind == lk_futex) &&
2686           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2687            OMP_NEST_LOCK_T_SIZE)) {
2688    lck = (kmp_user_lock_p)user_lock;
2689  }
2690#endif
2691  else {
2692    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2693  }
2694
2695#if USE_ITT_BUILD
2696  __kmp_itt_lock_acquiring(lck);
2697#endif /* USE_ITT_BUILD */
2698#if OMPT_SUPPORT && OMPT_OPTIONAL
2699  // This is the case, if called from omp_init_lock_with_hint:
2700  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2701  if (!codeptr)
2702    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2703  if (ompt_enabled.enabled) {
2704    if (ompt_enabled.ompt_callback_mutex_acquire) {
2705      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2706          ompt_mutex_nest_lock, omp_lock_hint_none,
2707          __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2708          codeptr);
2709    }
2710  }
2711#endif
2712
2713  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2714
2715#if USE_ITT_BUILD
2716  __kmp_itt_lock_acquired(lck);
2717#endif /* USE_ITT_BUILD */
2718
2719#if OMPT_SUPPORT && OMPT_OPTIONAL
2720  if (ompt_enabled.enabled) {
2721    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2722      if (ompt_enabled.ompt_callback_mutex_acquired) {
2723        // lock_first
2724        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2725            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2726      }
2727    } else {
2728      if (ompt_enabled.ompt_callback_nest_lock) {
2729        // lock_next
2730        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2731            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2732      }
2733    }
2734  }
2735#endif
2736
2737#endif // KMP_USE_DYNAMIC_LOCK
2738}
2739
2740void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2741#if KMP_USE_DYNAMIC_LOCK
2742
2743  int tag = KMP_EXTRACT_D_TAG(user_lock);
2744#if USE_ITT_BUILD
2745  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2746#endif
2747#if KMP_USE_INLINED_TAS
2748  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2749    KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2750  } else
2751#elif KMP_USE_INLINED_FUTEX
2752  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2753    KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2754  } else
2755#endif
2756  {
2757    __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2758  }
2759
2760#if OMPT_SUPPORT && OMPT_OPTIONAL
2761  // This is the case, if called from omp_init_lock_with_hint:
2762  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2763  if (!codeptr)
2764    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2765  if (ompt_enabled.ompt_callback_mutex_released) {
2766    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2767        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2768  }
2769#endif
2770
2771#else // KMP_USE_DYNAMIC_LOCK
2772
2773  kmp_user_lock_p lck;
2774
2775  /* Can't use serial interval since not block structured */
2776  /* release the lock */
2777
2778  if ((__kmp_user_lock_kind == lk_tas) &&
2779      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2780#if KMP_OS_LINUX &&                                                            \
2781    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2782// "fast" path implemented to fix customer performance issue
2783#if USE_ITT_BUILD
2784    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2785#endif /* USE_ITT_BUILD */
2786    TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2787    KMP_MB();
2788
2789#if OMPT_SUPPORT && OMPT_OPTIONAL
2790    // This is the case, if called from omp_init_lock_with_hint:
2791    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2792    if (!codeptr)
2793      codeptr = OMPT_GET_RETURN_ADDRESS(0);
2794    if (ompt_enabled.ompt_callback_mutex_released) {
2795      ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2797    }
2798#endif
2799
2800    return;
2801#else
2802    lck = (kmp_user_lock_p)user_lock;
2803#endif
2804  }
2805#if KMP_USE_FUTEX
2806  else if ((__kmp_user_lock_kind == lk_futex) &&
2807           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2808    lck = (kmp_user_lock_p)user_lock;
2809  }
2810#endif
2811  else {
2812    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2813  }
2814
2815#if USE_ITT_BUILD
2816  __kmp_itt_lock_releasing(lck);
2817#endif /* USE_ITT_BUILD */
2818
2819  RELEASE_LOCK(lck, gtid);
2820
2821#if OMPT_SUPPORT && OMPT_OPTIONAL
2822  // This is the case, if called from omp_init_lock_with_hint:
2823  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2824  if (!codeptr)
2825    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2826  if (ompt_enabled.ompt_callback_mutex_released) {
2827    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2828        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2829  }
2830#endif
2831
2832#endif // KMP_USE_DYNAMIC_LOCK
2833}
2834
2835/* release the lock */
2836void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2837#if KMP_USE_DYNAMIC_LOCK
2838
2839#if USE_ITT_BUILD
2840  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2841#endif
2842  int release_status =
2843      KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2844  (void) release_status;
2845
2846#if OMPT_SUPPORT && OMPT_OPTIONAL
2847  // This is the case, if called from omp_init_lock_with_hint:
2848  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2849  if (!codeptr)
2850    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2851  if (ompt_enabled.enabled) {
2852    if (release_status == KMP_LOCK_RELEASED) {
2853      if (ompt_enabled.ompt_callback_mutex_released) {
2854        // release_lock_last
2855        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2856            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2857            codeptr);
2858      }
2859    } else if (ompt_enabled.ompt_callback_nest_lock) {
2860      // release_lock_prev
2861      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2862          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2863    }
2864  }
2865#endif
2866
2867#else // KMP_USE_DYNAMIC_LOCK
2868
2869  kmp_user_lock_p lck;
2870
2871  /* Can't use serial interval since not block structured */
2872
2873  if ((__kmp_user_lock_kind == lk_tas) &&
2874      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2875       OMP_NEST_LOCK_T_SIZE)) {
2876#if KMP_OS_LINUX &&                                                            \
2877    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2878    // "fast" path implemented to fix customer performance issue
2879    kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2880#if USE_ITT_BUILD
2881    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2882#endif /* USE_ITT_BUILD */
2883
2884#if OMPT_SUPPORT && OMPT_OPTIONAL
2885    int release_status = KMP_LOCK_STILL_HELD;
2886#endif
2887
2888    if (--(tl->lk.depth_locked) == 0) {
2889      TCW_4(tl->lk.poll, 0);
2890#if OMPT_SUPPORT && OMPT_OPTIONAL
2891      release_status = KMP_LOCK_RELEASED;
2892#endif
2893    }
2894    KMP_MB();
2895
2896#if OMPT_SUPPORT && OMPT_OPTIONAL
2897    // This is the case, if called from omp_init_lock_with_hint:
2898    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2899    if (!codeptr)
2900      codeptr = OMPT_GET_RETURN_ADDRESS(0);
2901    if (ompt_enabled.enabled) {
2902      if (release_status == KMP_LOCK_RELEASED) {
2903        if (ompt_enabled.ompt_callback_mutex_released) {
2904          // release_lock_last
2905          ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
              ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
              codeptr);
2907        }
2908      } else if (ompt_enabled.ompt_callback_nest_lock) {
2909        // release_lock_previous
2910        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2912      }
2913    }
2914#endif
2915
2916    return;
2917#else
2918    lck = (kmp_user_lock_p)user_lock;
2919#endif
2920  }
2921#if KMP_USE_FUTEX
2922  else if ((__kmp_user_lock_kind == lk_futex) &&
2923           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2924            OMP_NEST_LOCK_T_SIZE)) {
2925    lck = (kmp_user_lock_p)user_lock;
2926  }
2927#endif
2928  else {
2929    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2930  }
2931
2932#if USE_ITT_BUILD
2933  __kmp_itt_lock_releasing(lck);
2934#endif /* USE_ITT_BUILD */
2935
2936  int release_status;
2937  release_status = RELEASE_NESTED_LOCK(lck, gtid);
2938#if OMPT_SUPPORT && OMPT_OPTIONAL
2939  // This is the case, if called from omp_init_lock_with_hint:
2940  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2941  if (!codeptr)
2942    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2943  if (ompt_enabled.enabled) {
2944    if (release_status == KMP_LOCK_RELEASED) {
2945      if (ompt_enabled.ompt_callback_mutex_released) {
2946        // release_lock_last
2947        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2948            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2949      }
2950    } else if (ompt_enabled.ompt_callback_nest_lock) {
2951      // release_lock_previous
2952      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2954    }
2955  }
2956#endif
2957
2958#endif // KMP_USE_DYNAMIC_LOCK
2959}
2960
2961/* try to acquire the lock */
2962int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2963  KMP_COUNT_BLOCK(OMP_test_lock);
2964
2965#if KMP_USE_DYNAMIC_LOCK
2966  int rc;
2967  int tag = KMP_EXTRACT_D_TAG(user_lock);
2968#if USE_ITT_BUILD
2969  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2970#endif
2971#if OMPT_SUPPORT && OMPT_OPTIONAL
2972  // This is the case, if called from omp_init_lock_with_hint:
2973  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2974  if (!codeptr)
2975    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2976  if (ompt_enabled.ompt_callback_mutex_acquire) {
2977    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2978        ompt_mutex_lock, omp_lock_hint_none,
2979        __ompt_get_mutex_impl_type(user_lock),
2980        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2981  }
2982#endif
2983#if KMP_USE_INLINED_TAS
2984  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2985    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2986  } else
2987#elif KMP_USE_INLINED_FUTEX
2988  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2989    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2990  } else
2991#endif
2992  {
2993    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2994  }
2995  if (rc) {
2996#if USE_ITT_BUILD
2997    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2998#endif
2999#if OMPT_SUPPORT && OMPT_OPTIONAL
3000    if (ompt_enabled.ompt_callback_mutex_acquired) {
3001      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3002          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3003    }
3004#endif
3005    return FTN_TRUE;
3006  } else {
3007#if USE_ITT_BUILD
3008    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3009#endif
3010    return FTN_FALSE;
3011  }
3012
3013#else // KMP_USE_DYNAMIC_LOCK
3014
3015  kmp_user_lock_p lck;
3016  int rc;
3017
3018  if ((__kmp_user_lock_kind == lk_tas) &&
3019      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3020    lck = (kmp_user_lock_p)user_lock;
3021  }
3022#if KMP_USE_FUTEX
3023  else if ((__kmp_user_lock_kind == lk_futex) &&
3024           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3025    lck = (kmp_user_lock_p)user_lock;
3026  }
3027#endif
3028  else {
3029    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3030  }
3031
3032#if USE_ITT_BUILD
3033  __kmp_itt_lock_acquiring(lck);
3034#endif /* USE_ITT_BUILD */
3035#if OMPT_SUPPORT && OMPT_OPTIONAL
3036  // This is the case, if called from omp_init_lock_with_hint:
3037  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3038  if (!codeptr)
3039    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3040  if (ompt_enabled.ompt_callback_mutex_acquire) {
3041    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3042        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3043        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3044  }
3045#endif
3046
3047  rc = TEST_LOCK(lck, gtid);
3048#if USE_ITT_BUILD
3049  if (rc) {
3050    __kmp_itt_lock_acquired(lck);
3051  } else {
3052    __kmp_itt_lock_cancelled(lck);
3053  }
3054#endif /* USE_ITT_BUILD */
3055#if OMPT_SUPPORT && OMPT_OPTIONAL
3056  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3057    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3058        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3059  }
3060#endif
3061
3062  return (rc ? FTN_TRUE : FTN_FALSE);
3063
3064/* Can't use serial interval since not block structured */
3065
3066#endif // KMP_USE_DYNAMIC_LOCK
3067}
3068
3069/* try to acquire the lock */
3070int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3071#if KMP_USE_DYNAMIC_LOCK
3072  int rc;
3073#if USE_ITT_BUILD
3074  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3075#endif
3076#if OMPT_SUPPORT && OMPT_OPTIONAL
3077  // This is the case, if called from omp_init_lock_with_hint:
3078  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3079  if (!codeptr)
3080    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3081  if (ompt_enabled.ompt_callback_mutex_acquire) {
3082    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3083        ompt_mutex_nest_lock, omp_lock_hint_none,
3084        __ompt_get_mutex_impl_type(user_lock),
3085        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3086  }
3087#endif
3088  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3089#if USE_ITT_BUILD
3090  if (rc) {
3091    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3092  } else {
3093    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3094  }
3095#endif
3096#if OMPT_SUPPORT && OMPT_OPTIONAL
3097  if (ompt_enabled.enabled && rc) {
3098    if (rc == 1) {
3099      if (ompt_enabled.ompt_callback_mutex_acquired) {
3100        // lock_first
3101        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3102            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3103            codeptr);
3104      }
3105    } else {
3106      if (ompt_enabled.ompt_callback_nest_lock) {
3107        // lock_next
3108        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3109            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3110      }
3111    }
3112  }
3113#endif
3114  return rc;
3115
3116#else // KMP_USE_DYNAMIC_LOCK
3117
3118  kmp_user_lock_p lck;
3119  int rc;
3120
3121  if ((__kmp_user_lock_kind == lk_tas) &&
3122      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3123       OMP_NEST_LOCK_T_SIZE)) {
3124    lck = (kmp_user_lock_p)user_lock;
3125  }
3126#if KMP_USE_FUTEX
3127  else if ((__kmp_user_lock_kind == lk_futex) &&
3128           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3129            OMP_NEST_LOCK_T_SIZE)) {
3130    lck = (kmp_user_lock_p)user_lock;
3131  }
3132#endif
3133  else {
3134    lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3135  }
3136
3137#if USE_ITT_BUILD
3138  __kmp_itt_lock_acquiring(lck);
3139#endif /* USE_ITT_BUILD */
3140
3141#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3143  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3144  if (!codeptr)
3145    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3153#endif
3154
3155  rc = TEST_NESTED_LOCK(lck, gtid);
3156#if USE_ITT_BUILD
3157  if (rc) {
3158    __kmp_itt_lock_acquired(lck);
3159  } else {
3160    __kmp_itt_lock_cancelled(lck);
3161  }
3162#endif /* USE_ITT_BUILD */
3163#if OMPT_SUPPORT && OMPT_OPTIONAL
3164  if (ompt_enabled.enabled && rc) {
3165    if (rc == 1) {
3166      if (ompt_enabled.ompt_callback_mutex_acquired) {
3167        // lock_first
3168        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3169            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3170      }
3171    } else {
3172      if (ompt_enabled.ompt_callback_nest_lock) {
3173        // lock_next
3174        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3176      }
3177    }
3178  }
3179#endif
3180  return rc;
3181
3182/* Can't use serial interval since not block structured */
3183
3184#endif // KMP_USE_DYNAMIC_LOCK
3185}
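
/* Illustrative sketch (not part of the runtime): __kmpc_test_nest_lock backs
   omp_test_nest_lock(), which returns the new nesting count on success and 0
   on failure.  The rc == 1 vs. rc > 1 distinction above is what drives the
   OMPT "lock_first" vs. "lock_next" callbacks.

   omp_nest_lock_t nl;
   omp_init_nest_lock(&nl);
   int d1 = omp_test_nest_lock(&nl);  // 1: first acquisition by this thread
   int d2 = omp_test_nest_lock(&nl);  // 2: re-acquisition by the owner
   omp_unset_nest_lock(&nl);          // still held (depth 1)
   omp_unset_nest_lock(&nl);          // released
   omp_destroy_nest_lock(&nl);
*/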
3186
3187// Interface to fast scalable reduce methods routines
3188
3189// keep the selected method in a thread local structure for cross-function
3190// usage: will be used in __kmpc_end_reduce* functions;
3191// another solution: to re-determine the method one more time in
3192// __kmpc_end_reduce* functions (new prototype required then)
3193// AT: which solution is better?
3194#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3195  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3196
3197#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3198  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3199
3200// description of the packed_reduction_method variable: look at the macros in
3201// kmp.h
3202
3203// used in a critical section reduce block
3204static __forceinline void
3205__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3206                                          kmp_critical_name *crit) {
3207
  // this lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it is used for an internal purpose only)
  //            why was it visible in the previous implementation?
  //            should we keep it visible in the new reduce block?
3212  kmp_user_lock_p lck;
3213
3214#if KMP_USE_DYNAMIC_LOCK
3215
3216  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3217  // Check if it is initialized.
3218  if (*lk == 0) {
3219    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3220      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3221                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
3222    } else {
3223      __kmp_init_indirect_csptr(crit, loc, global_tid,
3224                                KMP_GET_I_TAG(__kmp_user_lock_seq));
3225    }
3226  }
3227  // Branch for accessing the actual lock object and set operation. This
3228  // branching is inevitable since this lock initialization does not follow the
3229  // normal dispatch path (lock table is not used).
3230  if (KMP_EXTRACT_D_TAG(lk) != 0) {
3231    lck = (kmp_user_lock_p)lk;
3232    KMP_DEBUG_ASSERT(lck != NULL);
3233    if (__kmp_env_consistency_check) {
3234      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3235    }
3236    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3237  } else {
3238    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3239    lck = ilk->lock;
3240    KMP_DEBUG_ASSERT(lck != NULL);
3241    if (__kmp_env_consistency_check) {
3242      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3243    }
3244    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3245  }
3246
3247#else // KMP_USE_DYNAMIC_LOCK
3248
3249  // We know that the fast reduction code is only emitted by Intel compilers
3250  // with 32 byte critical sections. If there isn't enough space, then we
3251  // have to use a pointer.
3252  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3253    lck = (kmp_user_lock_p)crit;
3254  } else {
3255    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3256  }
3257  KMP_DEBUG_ASSERT(lck != NULL);
3258
3259  if (__kmp_env_consistency_check)
3260    __kmp_push_sync(global_tid, ct_critical, loc, lck);
3261
3262  __kmp_acquire_user_lock_with_checks(lck, global_tid);
3263
3264#endif // KMP_USE_DYNAMIC_LOCK
3265}
3266
3267// used in a critical section reduce block
3268static __forceinline void
3269__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3270                                        kmp_critical_name *crit) {
3271
3272  kmp_user_lock_p lck;
3273
3274#if KMP_USE_DYNAMIC_LOCK
3275
3276  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3277    lck = (kmp_user_lock_p)crit;
3278    if (__kmp_env_consistency_check)
3279      __kmp_pop_sync(global_tid, ct_critical, loc);
3280    KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3281  } else {
3282    kmp_indirect_lock_t *ilk =
3283        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3284    if (__kmp_env_consistency_check)
3285      __kmp_pop_sync(global_tid, ct_critical, loc);
3286    KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3287  }
3288
3289#else // KMP_USE_DYNAMIC_LOCK
3290
3291  // We know that the fast reduction code is only emitted by Intel compilers
3292  // with 32 byte critical sections. If there isn't enough space, then we have
3293  // to use a pointer.
3294  if (__kmp_base_user_lock_size > 32) {
3295    lck = *((kmp_user_lock_p *)crit);
3296    KMP_ASSERT(lck != NULL);
3297  } else {
3298    lck = (kmp_user_lock_p)crit;
3299  }
3300
3301  if (__kmp_env_consistency_check)
3302    __kmp_pop_sync(global_tid, ct_critical, loc);
3303
3304  __kmp_release_user_lock_with_checks(lck, global_tid);
3305
3306#endif // KMP_USE_DYNAMIC_LOCK
3307} // __kmp_end_critical_section_reduce_block
3308
3309static __forceinline int
3310__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3311                                     int *task_state) {
3312  kmp_team_t *team;
3313
  // Check if we are inside a teams construct.
3315  if (th->th.th_teams_microtask) {
3316    *team_p = team = th->th.th_team;
3317    if (team->t.t_level == th->th.th_teams_level) {
3318      // This is reduction at teams construct.
3319      KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3320      // Let's swap teams temporarily for the reduction.
3321      th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3322      th->th.th_team = team->t.t_parent;
3323      th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3324      th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3325      *task_state = th->th.th_task_state;
3326      th->th.th_task_state = 0;
3327
3328      return 1;
3329    }
3330  }
3331  return 0;
3332}
3333
3334static __forceinline void
3335__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3336  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3337  th->th.th_info.ds.ds_tid = 0;
3338  th->th.th_team = team;
3339  th->th.th_team_nproc = team->t.t_nproc;
3340  th->th.th_task_team = team->t.t_task_team[task_state];
3341  th->th.th_task_state = task_state;
3342}
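
/* Context (sketch): the swap/restore pair above is exercised when a reduction
   is attached directly to a teams construct, e.g.

   #pragma omp teams num_teams(4) reduction(+ : x)
   x += some_work();   // some_work() is a hypothetical user function

   The initial thread of each team reaches the __kmpc_reduce* entry points with
   tid == 0; the runtime temporarily presents it as a member of the parent team
   so the reduction can combine the per-team partial results. */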
3343
3344/* 2.a.i. Reduce Block without a terminating barrier */
3345/*!
3346@ingroup SYNCHRONIZATION
3347@param loc source location information
3348@param global_tid global thread number
3349@param num_vars number of items (variables) to be reduced
3350@param reduce_size size of data in bytes to be reduced
3351@param reduce_data pointer to data to be reduced
3352@param reduce_func callback function providing reduction operation on two
3353operands and returning result of reduction in lhs_data
3354@param lck pointer to the unique lock data structure
3355@result 1 for the master thread, 0 for all other team threads, 2 for all team
3356threads if atomic reduction needed
3357
3358The nowait version is used for a reduce clause with the nowait argument.
3359*/
3360kmp_int32
3361__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3362                     size_t reduce_size, void *reduce_data,
3363                     void (*reduce_func)(void *lhs_data, void *rhs_data),
3364                     kmp_critical_name *lck) {
3365
3366  KMP_COUNT_BLOCK(REDUCE_nowait);
3367  int retval = 0;
3368  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3369  kmp_info_t *th;
3370  kmp_team_t *team;
3371  int teams_swapped = 0, task_state;
3372  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3373
3374  // why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.
3376
3377  // do not call __kmp_serial_initialize(), it will be called by
3378  // __kmp_parallel_initialize() if needed
3379  // possible detection of false-positive race by the threadchecker ???
3380  if (!TCR_4(__kmp_init_parallel))
3381    __kmp_parallel_initialize();
3382
3383  __kmp_resume_if_soft_paused();
3384
3385// check correctness of reduce block nesting
3386#if KMP_USE_DYNAMIC_LOCK
3387  if (__kmp_env_consistency_check)
3388    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3389#else
3390  if (__kmp_env_consistency_check)
3391    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3392#endif
3393
3394  th = __kmp_thread_from_gtid(global_tid);
3395  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3396
  // The packed_reduction_method value will be reused by __kmpc_end_reduce*
  // functions, so the value must be kept in a variable.
  // The variable should be either a construct-specific or a thread-specific
  // property, not a team-specific one
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on the next construct).
  // An ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (and if a construct-specific or team-specific variable were shared,
  //     unnecessary extra synchronization would be needed).
  // A thread-specific variable is better with regard to the two issues above
  // (next construct and extra syncs), so a thread-specific
  // "th_local.packed_reduction_method" variable is used currently.
  // Each thread executes the 'determine' and 'set' lines (there is no need to
  // restrict this to one thread; doing so would only add unnecessary syncs).
3412
3413  packed_reduction_method = __kmp_determine_reduction_method(
3414      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3415  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3416
3417  OMPT_REDUCTION_DECL(th, global_tid);
3418  if (packed_reduction_method == critical_reduce_block) {
3419
3420    OMPT_REDUCTION_BEGIN;
3421
3422    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3423    retval = 1;
3424
3425  } else if (packed_reduction_method == empty_reduce_block) {
3426
3427    OMPT_REDUCTION_BEGIN;
3428
3429    // usage: if team size == 1, no synchronization is required ( Intel
3430    // platforms only )
3431    retval = 1;
3432
3433  } else if (packed_reduction_method == atomic_reduce_block) {
3434
3435    retval = 2;
3436
    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (this is not ideal: the checking block is closed by this 'pop'
    //      although the atomic operation has not been executed yet; it will be
    //      executed slightly later, literally on the next instruction)
3443    if (__kmp_env_consistency_check)
3444      __kmp_pop_sync(global_tid, ct_reduce, loc);
3445
3446  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3447                                   tree_reduce_block)) {
3448
// AT: performance issue: a real barrier here
// AT:     (if the master goes slow, other threads are blocked here waiting for
// the master to come and release them)
// AT:     (it's not what a customer might expect when specifying the NOWAIT
// clause)
// AT:     (specifying NOWAIT won't result in improved performance; it will
// just be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might go faster and be more in line with the sense of NOWAIT
// AT: TO DO: run the EPCC benchmark and compare times
3458
3459// this barrier should be invisible to a customer and to the threading profile
3460// tool (it's neither a terminating barrier nor customer's code, it's
3461// used for an internal purpose)
3462#if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events, so we set up the necessary frame/return
    // addresses.
3467    ompt_frame_t *ompt_frame;
3468    if (ompt_enabled.enabled) {
3469      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3470      if (ompt_frame->enter_frame.ptr == NULL)
3471        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3472      OMPT_STORE_RETURN_ADDRESS(global_tid);
3473    }
3474#endif
3475#if USE_ITT_NOTIFY
3476    __kmp_threads[global_tid]->th.th_ident = loc;
3477#endif
3478    retval =
3479        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3480                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3481    retval = (retval != 0) ? (0) : (1);
3482#if OMPT_SUPPORT && OMPT_OPTIONAL
3483    if (ompt_enabled.enabled) {
3484      ompt_frame->enter_frame = ompt_data_none;
3485    }
3486#endif
3487
    // all workers except the master should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3490    if (__kmp_env_consistency_check) {
3491      if (retval == 0) {
3492        __kmp_pop_sync(global_tid, ct_reduce, loc);
3493      }
3494    }
3495
3496  } else {
3497
3498    // should never reach this block
3499    KMP_ASSERT(0); // "unexpected method"
3500  }
3501  if (teams_swapped) {
3502    __kmp_restore_swapped_teams(th, team, task_state);
3503  }
3504  KA_TRACE(
3505      10,
3506      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3507       global_tid, packed_reduction_method, retval));
3508
3509  return retval;
3510}
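
/* Illustrative sketch (not code from this file): a compiler typically
   dispatches on the return value of __kmpc_reduce_nowait(), e.g. for
   "#pragma omp for reduction(+ : sum) nowait".  The names loc, gtid, red_list,
   reduce_func and crit_name below are assumptions made for the example.

   int ret = __kmpc_reduce_nowait(&loc, gtid, 1, sizeof(red_list), red_list,
                                  reduce_func, &crit_name);
   switch (ret) {
   case 1: // this thread finalizes the reduction, then calls the end routine
     sum += sum_local;
     __kmpc_end_reduce_nowait(&loc, gtid, &crit_name);
     break;
   case 2: // atomic method: each thread updates "sum" atomically;
           // __kmpc_end_reduce_nowait() is not called on this path
     break;
   default: // 0: the tree barrier already consumed this thread's contribution
     break;
   }
*/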
3511
3512/*!
3513@ingroup SYNCHRONIZATION
3514@param loc source location information
3515@param global_tid global thread id.
3516@param lck pointer to the unique lock data structure
3517
3518Finish the execution of a reduce nowait.
3519*/
3520void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3521                              kmp_critical_name *lck) {
3522
3523  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3524
3525  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3526
3527  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3528
3529  OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3530
3531  if (packed_reduction_method == critical_reduce_block) {
3532
3533    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3534    OMPT_REDUCTION_END;
3535
3536  } else if (packed_reduction_method == empty_reduce_block) {
3537
3538    // usage: if team size == 1, no synchronization is required ( on Intel
3539    // platforms only )
3540
3541    OMPT_REDUCTION_END;
3542
3543  } else if (packed_reduction_method == atomic_reduce_block) {
3544
    // neither the master nor the other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' and would assert
3549
3550  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3551                                   tree_reduce_block)) {
3552
3553    // only master gets here
3554    // OMPT: tree reduction is annotated in the barrier code
3555
3556  } else {
3557
3558    // should never reach this block
3559    KMP_ASSERT(0); // "unexpected method"
3560  }
3561
3562  if (__kmp_env_consistency_check)
3563    __kmp_pop_sync(global_tid, ct_reduce, loc);
3564
3565  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3566                global_tid, packed_reduction_method));
3567
3568  return;
3569}
3570
3571/* 2.a.ii. Reduce Block with a terminating barrier */
3572
3573/*!
3574@ingroup SYNCHRONIZATION
3575@param loc source location information
3576@param global_tid global thread number
3577@param num_vars number of items (variables) to be reduced
3578@param reduce_size size of data in bytes to be reduced
3579@param reduce_data pointer to data to be reduced
3580@param reduce_func callback function providing reduction operation on two
3581operands and returning result of reduction in lhs_data
3582@param lck pointer to the unique lock data structure
3583@result 1 for the master thread, 0 for all other team threads, 2 for all team
3584threads if atomic reduction needed
3585
3586A blocking reduce that includes an implicit barrier.
3587*/
3588kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3589                        size_t reduce_size, void *reduce_data,
3590                        void (*reduce_func)(void *lhs_data, void *rhs_data),
3591                        kmp_critical_name *lck) {
3592  KMP_COUNT_BLOCK(REDUCE_wait);
3593  int retval = 0;
3594  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3595  kmp_info_t *th;
3596  kmp_team_t *team;
3597  int teams_swapped = 0, task_state;
3598
3599  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3600
3601  // why do we need this initialization here at all?
  // A reduction clause cannot be a stand-alone directive.
3603
3604  // do not call __kmp_serial_initialize(), it will be called by
3605  // __kmp_parallel_initialize() if needed
3606  // possible detection of false-positive race by the threadchecker ???
3607  if (!TCR_4(__kmp_init_parallel))
3608    __kmp_parallel_initialize();
3609
3610  __kmp_resume_if_soft_paused();
3611
3612// check correctness of reduce block nesting
3613#if KMP_USE_DYNAMIC_LOCK
3614  if (__kmp_env_consistency_check)
3615    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3616#else
3617  if (__kmp_env_consistency_check)
3618    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3619#endif
3620
3621  th = __kmp_thread_from_gtid(global_tid);
3622  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3623
3624  packed_reduction_method = __kmp_determine_reduction_method(
3625      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3626  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3627
3628  OMPT_REDUCTION_DECL(th, global_tid);
3629
3630  if (packed_reduction_method == critical_reduce_block) {
3631
3632    OMPT_REDUCTION_BEGIN;
3633    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3634    retval = 1;
3635
3636  } else if (packed_reduction_method == empty_reduce_block) {
3637
3638    OMPT_REDUCTION_BEGIN;
3639    // usage: if team size == 1, no synchronization is required ( Intel
3640    // platforms only )
3641    retval = 1;
3642
3643  } else if (packed_reduction_method == atomic_reduce_block) {
3644
3645    retval = 2;
3646
3647  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3648                                   tree_reduce_block)) {
3649
3650// case tree_reduce_block:
3651// this barrier should be visible to a customer and to the threading profile
3652// tool (it's a terminating barrier on constructs if NOWAIT not specified)
3653#if OMPT_SUPPORT
3654    ompt_frame_t *ompt_frame;
3655    if (ompt_enabled.enabled) {
3656      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3657      if (ompt_frame->enter_frame.ptr == NULL)
3658        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3659      OMPT_STORE_RETURN_ADDRESS(global_tid);
3660    }
3661#endif
3662#if USE_ITT_NOTIFY
3663    __kmp_threads[global_tid]->th.th_ident =
3664        loc; // needed for correct notification of frames
3665#endif
3666    retval =
3667        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3668                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3669    retval = (retval != 0) ? (0) : (1);
3670#if OMPT_SUPPORT && OMPT_OPTIONAL
3671    if (ompt_enabled.enabled) {
3672      ompt_frame->enter_frame = ompt_data_none;
3673    }
3674#endif
3675
    // all workers except the master should do this pop here
    // (none of the workers except the master will enter __kmpc_end_reduce())
3678    if (__kmp_env_consistency_check) {
3679      if (retval == 0) { // 0: all other workers; 1: master
3680        __kmp_pop_sync(global_tid, ct_reduce, loc);
3681      }
3682    }
3683
3684  } else {
3685
3686    // should never reach this block
3687    KMP_ASSERT(0); // "unexpected method"
3688  }
3689  if (teams_swapped) {
3690    __kmp_restore_swapped_teams(th, team, task_state);
3691  }
3692
3693  KA_TRACE(10,
3694           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3695            global_tid, packed_reduction_method, retval));
3696  return retval;
3697}
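
/* Illustrative sketch (assumed code generation, analogous to the nowait case
   above): with the blocking form both the "1" and the "2" paths call
   __kmpc_end_reduce(), so every thread reaches the terminating barrier that
   __kmpc_end_reduce() executes for the critical, empty and atomic methods.

   int ret = __kmpc_reduce(&loc, gtid, 1, sizeof(red_list), red_list,
                           reduce_func, &crit_name);
   if (ret == 1) {
     sum += sum_local;            // finalize the reduction on this thread
     __kmpc_end_reduce(&loc, gtid, &crit_name);
   } else if (ret == 2) {
     // atomic update of "sum" with sum_local
     __kmpc_end_reduce(&loc, gtid, &crit_name);
   }
*/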
3698
3699/*!
3700@ingroup SYNCHRONIZATION
3701@param loc source location information
3702@param global_tid global thread id.
3703@param lck pointer to the unique lock data structure
3704
3705Finish the execution of a blocking reduce.
3706The <tt>lck</tt> pointer must be the same as that used in the corresponding
3707start function.
3708*/
3709void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3710                       kmp_critical_name *lck) {
3711
3712  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3713  kmp_info_t *th;
3714  kmp_team_t *team;
3715  int teams_swapped = 0, task_state;
3716
3717  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3718
3719  th = __kmp_thread_from_gtid(global_tid);
3720  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3721
3722  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3723
3724  // this barrier should be visible to a customer and to the threading profile
3725  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3726  OMPT_REDUCTION_DECL(th, global_tid);
3727
3728  if (packed_reduction_method == critical_reduce_block) {
3729    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3730
3731    OMPT_REDUCTION_END;
3732
3733// TODO: implicit barrier: should be exposed
3734#if OMPT_SUPPORT
3735    ompt_frame_t *ompt_frame;
3736    if (ompt_enabled.enabled) {
3737      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3738      if (ompt_frame->enter_frame.ptr == NULL)
3739        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3740      OMPT_STORE_RETURN_ADDRESS(global_tid);
3741    }
3742#endif
3743#if USE_ITT_NOTIFY
3744    __kmp_threads[global_tid]->th.th_ident = loc;
3745#endif
3746    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3747#if OMPT_SUPPORT && OMPT_OPTIONAL
3748    if (ompt_enabled.enabled) {
3749      ompt_frame->enter_frame = ompt_data_none;
3750    }
3751#endif
3752
3753  } else if (packed_reduction_method == empty_reduce_block) {
3754
3755    OMPT_REDUCTION_END;
3756
3757// usage: if team size==1, no synchronization is required (Intel platforms only)
3758
3759// TODO: implicit barrier: should be exposed
3760#if OMPT_SUPPORT
3761    ompt_frame_t *ompt_frame;
3762    if (ompt_enabled.enabled) {
3763      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3764      if (ompt_frame->enter_frame.ptr == NULL)
3765        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3766      OMPT_STORE_RETURN_ADDRESS(global_tid);
3767    }
3768#endif
3769#if USE_ITT_NOTIFY
3770    __kmp_threads[global_tid]->th.th_ident = loc;
3771#endif
3772    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3773#if OMPT_SUPPORT && OMPT_OPTIONAL
3774    if (ompt_enabled.enabled) {
3775      ompt_frame->enter_frame = ompt_data_none;
3776    }
3777#endif
3778
3779  } else if (packed_reduction_method == atomic_reduce_block) {
3780
3781#if OMPT_SUPPORT
3782    ompt_frame_t *ompt_frame;
3783    if (ompt_enabled.enabled) {
3784      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3785      if (ompt_frame->enter_frame.ptr == NULL)
3786        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3787      OMPT_STORE_RETURN_ADDRESS(global_tid);
3788    }
3789#endif
3790// TODO: implicit barrier: should be exposed
3791#if USE_ITT_NOTIFY
3792    __kmp_threads[global_tid]->th.th_ident = loc;
3793#endif
3794    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3795#if OMPT_SUPPORT && OMPT_OPTIONAL
3796    if (ompt_enabled.enabled) {
3797      ompt_frame->enter_frame = ompt_data_none;
3798    }
3799#endif
3800
3801  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3802                                   tree_reduce_block)) {
3803
3804    // only master executes here (master releases all other workers)
3805    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3806                            global_tid);
3807
3808  } else {
3809
3810    // should never reach this block
3811    KMP_ASSERT(0); // "unexpected method"
3812  }
3813  if (teams_swapped) {
3814    __kmp_restore_swapped_teams(th, team, task_state);
3815  }
3816
3817  if (__kmp_env_consistency_check)
3818    __kmp_pop_sync(global_tid, ct_reduce, loc);
3819
3820  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3821                global_tid, packed_reduction_method));
3822
3823  return;
3824}
3825
3826#undef __KMP_GET_REDUCTION_METHOD
3827#undef __KMP_SET_REDUCTION_METHOD
3828
3829/* end of interface to fast scalable reduce routines */
3830
3831kmp_uint64 __kmpc_get_taskid() {
3832
3833  kmp_int32 gtid;
3834  kmp_info_t *thread;
3835
3836  gtid = __kmp_get_gtid();
3837  if (gtid < 0) {
3838    return 0;
3839  }
3840  thread = __kmp_thread_from_gtid(gtid);
3841  return thread->th.th_current_task->td_task_id;
3842
3843} // __kmpc_get_taskid
3844
3845kmp_uint64 __kmpc_get_parent_taskid() {
3846
3847  kmp_int32 gtid;
3848  kmp_info_t *thread;
3849  kmp_taskdata_t *parent_task;
3850
3851  gtid = __kmp_get_gtid();
3852  if (gtid < 0) {
3853    return 0;
3854  }
3855  thread = __kmp_thread_from_gtid(gtid);
3856  parent_task = thread->th.th_current_task->td_parent;
3857  return (parent_task == NULL ? 0 : parent_task->td_task_id);
3858
3859} // __kmpc_get_parent_taskid
3860
3861/*!
3862@ingroup WORK_SHARING
3863@param loc  source location information.
3864@param gtid  global thread number.
3865@param num_dims  number of associated doacross loops.
3866@param dims  info on loops bounds.
3867
3868Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3871*/
3872void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3873                          const struct kmp_dim *dims) {
3874  int j, idx;
3875  kmp_int64 last, trace_count;
3876  kmp_info_t *th = __kmp_threads[gtid];
3877  kmp_team_t *team = th->th.th_team;
3878  kmp_uint32 *flags;
3879  kmp_disp_t *pr_buf = th->th.th_dispatch;
3880  dispatch_shared_info_t *sh_buf;
3881
3882  KA_TRACE(
3883      20,
3884      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3885       gtid, num_dims, !team->t.t_serialized));
3886  KMP_DEBUG_ASSERT(dims != NULL);
3887  KMP_DEBUG_ASSERT(num_dims > 0);
3888
3889  if (team->t.t_serialized) {
3890    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3891    return; // no dependencies if team is serialized
3892  }
3893  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3894  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3895  // the next loop
3896  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3897
3898  // Save bounds info into allocated private buffer
3899  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3900  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3901      th, sizeof(kmp_int64) * (4 * num_dims + 1));
3902  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3903  pr_buf->th_doacross_info[0] =
3904      (kmp_int64)num_dims; // first element is number of dimensions
3905  // Save also address of num_done in order to access it later without knowing
3906  // the buffer index
3907  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3908  pr_buf->th_doacross_info[2] = dims[0].lo;
3909  pr_buf->th_doacross_info[3] = dims[0].up;
3910  pr_buf->th_doacross_info[4] = dims[0].st;
3911  last = 5;
3912  for (j = 1; j < num_dims; ++j) {
3913    kmp_int64
3914        range_length; // To keep ranges of all dimensions but the first dims[0]
3915    if (dims[j].st == 1) { // most common case
3916      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3917      range_length = dims[j].up - dims[j].lo + 1;
3918    } else {
3919      if (dims[j].st > 0) {
3920        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3921        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3922      } else { // negative increment
3923        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3924        range_length =
3925            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3926      }
3927    }
3928    pr_buf->th_doacross_info[last++] = range_length;
3929    pr_buf->th_doacross_info[last++] = dims[j].lo;
3930    pr_buf->th_doacross_info[last++] = dims[j].up;
3931    pr_buf->th_doacross_info[last++] = dims[j].st;
3932  }
3933
3934  // Compute total trip count.
3935  // Start with range of dims[0] which we don't need to keep in the buffer.
3936  if (dims[0].st == 1) { // most common case
3937    trace_count = dims[0].up - dims[0].lo + 1;
3938  } else if (dims[0].st > 0) {
3939    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3940    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3941  } else { // negative increment
3942    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3943    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3944  }
3945  for (j = 1; j < num_dims; ++j) {
3946    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3947  }
3948  KMP_DEBUG_ASSERT(trace_count > 0);
3949
3950  // Check if shared buffer is not occupied by other loop (idx -
3951  // __kmp_dispatch_num_buffers)
3952  if (idx != sh_buf->doacross_buf_idx) {
3953    // Shared buffer is occupied, wait for it to be free
3954    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3955                 __kmp_eq_4, NULL);
3956  }
3957#if KMP_32_BIT_ARCH
3958  // Check if we are the first thread. After the CAS the first thread gets 0,
3959  // others get 1 if initialization is in progress, allocated pointer otherwise.
3960  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3961  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3962      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3963#else
3964  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3965      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3966#endif
3967  if (flags == NULL) {
3968    // we are the first thread, allocate the array of flags
3969    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3970    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3971    KMP_MB();
3972    sh_buf->doacross_flags = flags;
3973  } else if (flags == (kmp_uint32 *)1) {
3974#if KMP_32_BIT_ARCH
3975    // initialization is still in progress, need to wait
3976    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3977#else
3978    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3979#endif
3980      KMP_YIELD(TRUE);
3981    KMP_MB();
3982  } else {
3983    KMP_MB();
3984  }
3985  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3986  pr_buf->th_doacross_flags =
3987      sh_buf->doacross_flags; // save private copy in order to not
3988  // touch shared buffer on each iteration
3989  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3990}
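
/* Illustrative sketch (assumed compiler output, not produced in this file):
   for a doacross loop such as

   #pragma omp for ordered(1)
   for (i = 2; i < 9; i += 2) {
   #pragma omp ordered depend(sink : i - 2)
     use(result[i - 2]);                     // use()/compute() are hypothetical
     result[i] = compute(i);
   #pragma omp ordered depend(source)
   }

   each thread in the team roughly performs (field order of kmp_dim assumed to
   be lo/up/st, matching the accesses above):

   struct kmp_dim dim = {2, 8, 2};           // lo, up, st (inclusive bounds)
   __kmpc_doacross_init(&loc, gtid, 1, &dim);
   for (each assigned iteration i) {
     kmp_int64 vec = i - 2;
     __kmpc_doacross_wait(&loc, gtid, &vec); // block until sink iter is posted
     ... loop body ...
     vec = i;
     __kmpc_doacross_post(&loc, gtid, &vec); // mark this iteration as done
   }
   __kmpc_doacross_fini(&loc, gtid);
*/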
3991
3992void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3993  kmp_int32 shft, num_dims, i;
3994  kmp_uint32 flag;
3995  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3996  kmp_info_t *th = __kmp_threads[gtid];
3997  kmp_team_t *team = th->th.th_team;
3998  kmp_disp_t *pr_buf;
3999  kmp_int64 lo, up, st;
4000
4001  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4002  if (team->t.t_serialized) {
4003    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4004    return; // no dependencies if team is serialized
4005  }
4006
4007  // calculate sequential iteration number and check out-of-bounds condition
4008  pr_buf = th->th.th_dispatch;
4009  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4010  num_dims = pr_buf->th_doacross_info[0];
4011  lo = pr_buf->th_doacross_info[2];
4012  up = pr_buf->th_doacross_info[3];
4013  st = pr_buf->th_doacross_info[4];
4014#if OMPT_SUPPORT && OMPT_OPTIONAL
4015  ompt_dependence_t deps[num_dims];
4016#endif
4017  if (st == 1) { // most common case
4018    if (vec[0] < lo || vec[0] > up) {
4019      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4020                    "bounds [%lld,%lld]\n",
4021                    gtid, vec[0], lo, up));
4022      return;
4023    }
4024    iter_number = vec[0] - lo;
4025  } else if (st > 0) {
4026    if (vec[0] < lo || vec[0] > up) {
4027      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4028                    "bounds [%lld,%lld]\n",
4029                    gtid, vec[0], lo, up));
4030      return;
4031    }
4032    iter_number = (kmp_uint64)(vec[0] - lo) / st;
4033  } else { // negative increment
4034    if (vec[0] > lo || vec[0] < up) {
4035      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4036                    "bounds [%lld,%lld]\n",
4037                    gtid, vec[0], lo, up));
4038      return;
4039    }
4040    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4041  }
4042#if OMPT_SUPPORT && OMPT_OPTIONAL
4043  deps[0].variable.value = iter_number;
4044  deps[0].dependence_type = ompt_dependence_type_sink;
4045#endif
4046  for (i = 1; i < num_dims; ++i) {
4047    kmp_int64 iter, ln;
4048    kmp_int32 j = i * 4;
4049    ln = pr_buf->th_doacross_info[j + 1];
4050    lo = pr_buf->th_doacross_info[j + 2];
4051    up = pr_buf->th_doacross_info[j + 3];
4052    st = pr_buf->th_doacross_info[j + 4];
4053    if (st == 1) {
4054      if (vec[i] < lo || vec[i] > up) {
4055        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4056                      "bounds [%lld,%lld]\n",
4057                      gtid, vec[i], lo, up));
4058        return;
4059      }
4060      iter = vec[i] - lo;
4061    } else if (st > 0) {
4062      if (vec[i] < lo || vec[i] > up) {
4063        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4064                      "bounds [%lld,%lld]\n",
4065                      gtid, vec[i], lo, up));
4066        return;
4067      }
4068      iter = (kmp_uint64)(vec[i] - lo) / st;
4069    } else { // st < 0
4070      if (vec[i] > lo || vec[i] < up) {
4071        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4072                      "bounds [%lld,%lld]\n",
4073                      gtid, vec[i], lo, up));
4074        return;
4075      }
4076      iter = (kmp_uint64)(lo - vec[i]) / (-st);
4077    }
4078    iter_number = iter + ln * iter_number;
4079#if OMPT_SUPPORT && OMPT_OPTIONAL
4080    deps[i].variable.value = iter;
4081    deps[i].dependence_type = ompt_dependence_type_sink;
4082#endif
4083  }
4084  shft = iter_number % 32; // use 32-bit granularity
4085  iter_number >>= 5; // divided by 32
4086  flag = 1 << shft;
4087  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4088    KMP_YIELD(TRUE);
4089  }
4090  KMP_MB();
4091#if OMPT_SUPPORT && OMPT_OPTIONAL
4092  if (ompt_enabled.ompt_callback_dependences) {
4093    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4094        &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4095  }
4096#endif
4097  KA_TRACE(20,
4098           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4099            gtid, (iter_number << 5) + shft));
4100}
4101
4102void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4103  kmp_int32 shft, num_dims, i;
4104  kmp_uint32 flag;
4105  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4106  kmp_info_t *th = __kmp_threads[gtid];
4107  kmp_team_t *team = th->th.th_team;
4108  kmp_disp_t *pr_buf;
4109  kmp_int64 lo, st;
4110
4111  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4112  if (team->t.t_serialized) {
4113    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4114    return; // no dependencies if team is serialized
4115  }
4116
4117  // calculate sequential iteration number (same as in "wait" but no
4118  // out-of-bounds checks)
4119  pr_buf = th->th.th_dispatch;
4120  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4121  num_dims = pr_buf->th_doacross_info[0];
4122  lo = pr_buf->th_doacross_info[2];
4123  st = pr_buf->th_doacross_info[4];
4124#if OMPT_SUPPORT && OMPT_OPTIONAL
4125  ompt_dependence_t deps[num_dims];
4126#endif
4127  if (st == 1) { // most common case
4128    iter_number = vec[0] - lo;
4129  } else if (st > 0) {
4130    iter_number = (kmp_uint64)(vec[0] - lo) / st;
4131  } else { // negative increment
4132    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4133  }
4134#if OMPT_SUPPORT && OMPT_OPTIONAL
4135  deps[0].variable.value = iter_number;
4136  deps[0].dependence_type = ompt_dependence_type_source;
4137#endif
4138  for (i = 1; i < num_dims; ++i) {
4139    kmp_int64 iter, ln;
4140    kmp_int32 j = i * 4;
4141    ln = pr_buf->th_doacross_info[j + 1];
4142    lo = pr_buf->th_doacross_info[j + 2];
4143    st = pr_buf->th_doacross_info[j + 4];
4144    if (st == 1) {
4145      iter = vec[i] - lo;
4146    } else if (st > 0) {
4147      iter = (kmp_uint64)(vec[i] - lo) / st;
4148    } else { // st < 0
4149      iter = (kmp_uint64)(lo - vec[i]) / (-st);
4150    }
4151    iter_number = iter + ln * iter_number;
4152#if OMPT_SUPPORT && OMPT_OPTIONAL
4153    deps[i].variable.value = iter;
4154    deps[i].dependence_type = ompt_dependence_type_source;
4155#endif
4156  }
4157#if OMPT_SUPPORT && OMPT_OPTIONAL
4158  if (ompt_enabled.ompt_callback_dependences) {
4159    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4160        &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4161  }
4162#endif
4163  shft = iter_number % 32; // use 32-bit granularity
4164  iter_number >>= 5; // divided by 32
4165  flag = 1 << shft;
4166  KMP_MB();
4167  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4168    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4169  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4170                (iter_number << 5) + shft));
4171}
4172
4173void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4174  kmp_int32 num_done;
4175  kmp_info_t *th = __kmp_threads[gtid];
4176  kmp_team_t *team = th->th.th_team;
4177  kmp_disp_t *pr_buf = th->th.th_dispatch;
4178
4179  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4180  if (team->t.t_serialized) {
4181    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4182    return; // nothing to do
4183  }
4184  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4185  if (num_done == th->th.th_team_nproc) {
4186    // we are the last thread, need to free shared resources
4187    int idx = pr_buf->th_doacross_buf_idx - 1;
4188    dispatch_shared_info_t *sh_buf =
4189        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4190    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4191                     (kmp_int64)&sh_buf->doacross_num_done);
4192    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4193    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4194    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4195    sh_buf->doacross_flags = NULL;
4196    sh_buf->doacross_num_done = 0;
4197    sh_buf->doacross_buf_idx +=
4198        __kmp_dispatch_num_buffers; // free buffer for future re-use
4199  }
4200  // free private resources (need to keep buffer index forever)
4201  pr_buf->th_doacross_flags = NULL;
4202  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4203  pr_buf->th_doacross_info = NULL;
4204  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4205}
4206
4207/* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4208void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4209  return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4210}
4211
4212void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4213  __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4214}
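
/* Example (user-level sketch): these are the C/C++ entry points for the OpenMP
   5.0 memory allocator API, e.g.

   double *buf =
       (double *)omp_alloc(1024 * sizeof(double), omp_default_mem_alloc);
   if (buf) {
     // ... use buf ...
     omp_free(buf, omp_default_mem_alloc);
   }
*/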
4215
4216int __kmpc_get_target_offload(void) {
4217  if (!__kmp_init_serial) {
4218    __kmp_serial_initialize();
4219  }
4220  return __kmp_target_offload;
4221}
4222
4223int __kmpc_pause_resource(kmp_pause_status_t level) {
4224  if (!__kmp_init_serial) {
4225    return 1; // Can't pause if runtime is not initialized
4226  }
4227  return __kmp_pause_resource(level);
4228}
4229