/* Copyright (C) 2007-2022 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the maintenance of tasks in response to task
   creation and termination.  */

#include "libgomp.h"
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "gomp-constants.h"

typedef struct gomp_task_depend_entry *hash_entry_type;

static inline void *
htab_alloc (size_t size)
{
  return gomp_malloc (size);
}

static inline void
htab_free (void *ptr)
{
  free (ptr);
}

#include "hashtab.h"

static inline hashval_t
htab_hash (hash_entry_type element)
{
  return hash_pointer (element->addr);
}

static inline bool
htab_eq (hash_entry_type x, hash_entry_type y)
{
  return x->addr == y->addr;
}
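
/* Dependence entries are keyed purely by the address recorded for the
   depend clause: two entries belong to the same dependence chain if and
   only if their addresses compare equal.  PARENT->depend_hash below maps
   such addresses to chains of struct gomp_task_depend_entry.  */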

/* Create a new task data structure.  */

void
gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task,
                struct gomp_task_icv *prev_icv)
{
  /* It would seem that using memset here would be a win, but it turns
     out that partially filling gomp_task allows us to keep the
     overhead of task creation low.  In the nqueens-1.c test, for a
     sufficiently large N, we drop the overhead from 5-6% to 1%.

     Note, the nqueens-1.c test in serial mode is a good test to
     benchmark the overhead of creating tasks as there are millions of
     tiny tasks created that all run undeferred.  */
  task->parent = parent_task;
  priority_queue_init (&task->children_queue);
  task->taskgroup = NULL;
  task->dependers = NULL;
  task->depend_hash = NULL;
  task->taskwait = NULL;
  task->depend_count = 0;
  task->completion_sem = NULL;
  task->deferred_p = false;
  task->icv = *prev_icv;
  task->kind = GOMP_TASK_IMPLICIT;
  task->in_tied_task = false;
  task->final_task = false;
  task->copy_ctors_done = false;
  task->parent_depends_on = false;
}

/* Clean up a task, after completing it.  */

void
gomp_end_task (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_task *task = thr->task;

  gomp_finish_task (task);
  thr->task = task->parent;
}

/* Clear the parent field of every task in LIST.  */

static inline void
gomp_clear_parent_in_list (struct priority_list *list)
{
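  /* LIST->tasks points into a circular doubly-linked list, so one full
     walk starting from the head visits every task exactly once.  */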
  struct priority_node *p = list->tasks;
  if (p)
    do
      {
        priority_node_to_task (PQ_CHILDREN, p)->parent = NULL;
        p = p->next;
      }
    while (p != list->tasks);
}

/* Splay tree version of gomp_clear_parent_in_list.

   Clear the parent field of every task in NODE within SP, and free
   the node when done.  */

static void
gomp_clear_parent_in_tree (prio_splay_tree sp, prio_splay_tree_node node)
{
  if (!node)
    return;
  prio_splay_tree_node left = node->left, right = node->right;
  gomp_clear_parent_in_list (&node->key.l);
#if _LIBGOMP_CHECKING_
  memset (node, 0xaf, sizeof (*node));
#endif
  /* No need to remove the node from the tree.  We're nuking
     everything, so just free the nodes and our caller can clear the
     entire splay tree.  */
  free (node);
  gomp_clear_parent_in_tree (sp, left);
  gomp_clear_parent_in_tree (sp, right);
}

/* Clear the parent field of every task in Q and remove every task
   from Q.  */

static inline void
gomp_clear_parent (struct priority_queue *q)
{
  if (priority_queue_multi_p (q))
    {
      gomp_clear_parent_in_tree (&q->t, q->t.root);
      /* All the nodes have been cleared in gomp_clear_parent_in_tree.
         No need to remove anything.  We can just nuke everything.  */
      q->t.root = NULL;
    }
  else
    gomp_clear_parent_in_list (&q->l);
}

/* Helper function for GOMP_task and gomp_create_target_task.

   For a TASK with in/out dependencies, fill in the various dependency
   queues.  PARENT is the parent of said task.  DEPEND is as in
   GOMP_task.  */

static void
gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,
                         void **depend)
{
  size_t ndepend = (uintptr_t) depend[0];
  size_t i;
  hash_entry_type ent;

  if (ndepend)
    {
      /* depend[0] is total # */
      size_t nout = (uintptr_t) depend[1]; /* # of out: and inout: */
      /* ndepend - nout is # of in: */
      for (i = 0; i < ndepend; i++)
        {
          task->depend[i].addr = depend[2 + i];
          task->depend[i].is_in = i >= nout;
        }
    }
  else
    {
      ndepend = (uintptr_t) depend[1]; /* total # */
      size_t nout = (uintptr_t) depend[2]; /* # of out: and inout: */
      size_t nmutexinoutset = (uintptr_t) depend[3]; /* # of mutexinoutset: */
      /* For now we treat mutexinoutset like out, which is compliant, but
         inefficient.  */
      size_t nin = (uintptr_t) depend[4]; /* # of in: */
      /* ndepend - nout - nmutexinoutset - nin is # of depobjs */
      size_t normal = nout + nmutexinoutset + nin;
      size_t n = 0;
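      /* Entries past NORMAL are omp_depend_t objects: each is a pointer to
         a two element array where D[0] holds the address and D[1] the
         GOMP_DEPEND_* kind.  Record the out/inout/mutexinoutset ones first
         and the in: ones in the last loop below, mirroring the layout of
         the normal entries.  */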
      for (i = normal; i < ndepend; i++)
        {
          void **d = (void **) (uintptr_t) depend[5 + i];
          switch ((uintptr_t) d[1])
            {
            case GOMP_DEPEND_OUT:
            case GOMP_DEPEND_INOUT:
            case GOMP_DEPEND_MUTEXINOUTSET:
              break;
            case GOMP_DEPEND_IN:
              continue;
            default:
              gomp_fatal ("unknown omp_depend_t dependence type %d",
                          (int) (uintptr_t) d[1]);
            }
          task->depend[n].addr = d[0];
          task->depend[n++].is_in = 0;
        }
      for (i = 0; i < normal; i++)
        {
          task->depend[n].addr = depend[5 + i];
          task->depend[n++].is_in = i >= nout + nmutexinoutset;
        }
      for (i = normal; i < ndepend; i++)
        {
          void **d = (void **) (uintptr_t) depend[5 + i];
          if ((uintptr_t) d[1] != GOMP_DEPEND_IN)
            continue;
          task->depend[n].addr = d[0];
          task->depend[n++].is_in = 1;
        }
    }
  task->depend_count = ndepend;
  task->num_dependees = 0;
  if (parent->depend_hash == NULL)
    parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
  for (i = 0; i < ndepend; i++)
    {
      task->depend[i].next = NULL;
      task->depend[i].prev = NULL;
      task->depend[i].task = task;
      task->depend[i].redundant = false;
      task->depend[i].redundant_out = false;

      hash_entry_type *slot = htab_find_slot (&parent->depend_hash,
                                              &task->depend[i], INSERT);
      hash_entry_type out = NULL, last = NULL;
      if (*slot)
        {
          /* If multiple depend clauses of this task use the same address,
             all but the first one are redundant.  As inout/out come first,
             if any of them is inout/out, it will win, which is the right
             semantics.  */
          if ((*slot)->task == task)
            {
              task->depend[i].redundant = true;
              continue;
            }
          for (ent = *slot; ent; ent = ent->next)
            {
              if (ent->redundant_out)
                break;

              last = ent;

              /* depend(in:...) doesn't depend on earlier depend(in:...).  */
              if (task->depend[i].is_in && ent->is_in)
                continue;

              if (!ent->is_in)
                out = ent;

              struct gomp_task *tsk = ent->task;
              if (tsk->dependers == NULL)
                {
                  tsk->dependers
                    = gomp_malloc (sizeof (struct gomp_dependers_vec)
                                   + 6 * sizeof (struct gomp_task *));
                  tsk->dependers->n_elem = 1;
                  tsk->dependers->allocated = 6;
                  tsk->dependers->elem[0] = task;
                  task->num_dependees++;
                  continue;
                }
              /* We already have some other dependency on tsk from an
                 earlier depend clause.  */
              else if (tsk->dependers->n_elem
                       && (tsk->dependers->elem[tsk->dependers->n_elem - 1]
                           == task))
                continue;
              else if (tsk->dependers->n_elem == tsk->dependers->allocated)
                {
                  tsk->dependers->allocated
                    = tsk->dependers->allocated * 2 + 2;
                  tsk->dependers
                    = gomp_realloc (tsk->dependers,
                                    sizeof (struct gomp_dependers_vec)
                                    + (tsk->dependers->allocated
                                       * sizeof (struct gomp_task *)));
                }
              tsk->dependers->elem[tsk->dependers->n_elem++] = task;
              task->num_dependees++;
            }
          task->depend[i].next = *slot;
          (*slot)->prev = &task->depend[i];
        }
      *slot = &task->depend[i];

      /* There is no need to store more than one depend({,in}out:) task per
         address in the hash table chain for the purpose of creation of
         deferred tasks, because each out depends on all earlier outs, thus it
         is enough to record just the last depend({,in}out:).  For depend(in:),
         we need to keep all of the previous ones not terminated yet, because
         a later depend({,in}out:) might need to depend on all of them.  So, if
         the new task's clause is depend({,in}out:), we know there is at most
         one other depend({,in}out:) clause in the list (out).  For
         non-deferred tasks we want to see all outs, so they are moved to the
         end of the chain; after the first redundant_out entry all following
         entries should be redundant_out.  */
      if (!task->depend[i].is_in && out)
        {
          if (out != last)
            {
              out->next->prev = out->prev;
              out->prev->next = out->next;
              out->next = last->next;
              out->prev = last;
              last->next = out;
              if (out->next)
                out->next->prev = out;
            }
          out->redundant_out = true;
        }
    }
}

/* Called when encountering an explicit task directive.  If IF_CLAUSE is
   false, then we must not delay in executing the task.  If UNTIED is true,
   then the task may be executed by any member of the team.

   DEPEND is an array containing:
     if depend[0] is non-zero, then:
        depend[0]: number of depend elements.
        depend[1]: number of depend elements of type "out/inout".
        depend[2..N+1]: address of [1..N]th depend element.
     otherwise, when depend[0] is zero, then:
        depend[1]: number of depend elements.
        depend[2]: number of depend elements of type "out/inout".
        depend[3]: number of depend elements of type "mutexinoutset".
        depend[4]: number of depend elements of type "in".
        depend[5..4+depend[2]+depend[3]+depend[4]]: address of depend elements
        depend[5+depend[2]+depend[3]+depend[4]..4+depend[1]]: address of
                   omp_depend_t objects.  */
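
/* As a worked illustration of the first (legacy) encoding above, a
   hypothetical "#pragma omp task depend(out: x) depend(in: y, z)" would
   be passed a DEPEND array of five elements:

       depend[0] = (void *) 3   three depend elements in total
       depend[1] = (void *) 1   one of them is out/inout
       depend[2] = &x           out/inout addresses come first
       depend[3] = &y           followed by the in addresses
       depend[4] = &z  */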

void
GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
           long arg_size, long arg_align, bool if_clause, unsigned flags,
           void **depend, int priority_arg, void *detach)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  int priority = 0;

#ifdef HAVE_BROKEN_POSIX_SEMAPHORES
  /* If pthread_mutex_* is used for omp_*lock*, then each task must be
     tied to one thread all the time.  This means UNTIED tasks must be
     tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN
     might run on a different thread than FN.  */
  if (cpyfn)
    if_clause = false;
  flags &= ~GOMP_TASK_FLAG_UNTIED;
#endif

  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
  if (__builtin_expect (gomp_cancel_var, 0) && team)
    {
      if (gomp_team_barrier_cancelled (&team->barrier))
        return;
      if (thr->task->taskgroup)
        {
          if (thr->task->taskgroup->cancelled)
            return;
          if (thr->task->taskgroup->workshare
              && thr->task->taskgroup->prev
              && thr->task->taskgroup->prev->cancelled)
            return;
        }
    }

  if (__builtin_expect ((flags & GOMP_TASK_FLAG_PRIORITY) != 0, 0))
    {
      priority = priority_arg;
      if (priority > gomp_max_task_priority_var)
        priority = gomp_max_task_priority_var;
    }

  if (!if_clause || team == NULL
      || (thr->task && thr->task->final_task)
      || team->task_count > 64 * team->nthreads)
    {
      struct gomp_task task;
      gomp_sem_t completion_sem;

      /* If there are depend clauses and earlier deferred sibling tasks
         with depend clauses, check if there isn't a dependency.  If there
         is, we need to wait for them.  There is no need to handle
         depend clauses for non-deferred tasks other than this, because
         the parent task is suspended until the child task finishes and thus
         it can't start further child tasks.  */
      if ((flags & GOMP_TASK_FLAG_DEPEND)
          && thr->task && thr->task->depend_hash)
        gomp_task_maybe_wait_for_dependencies (depend);

      gomp_init_task (&task, thr->task, gomp_icv (false));
      task.kind = GOMP_TASK_UNDEFERRED;
      task.final_task = (thr->task && thr->task->final_task)
                        || (flags & GOMP_TASK_FLAG_FINAL);
      task.priority = priority;

      if ((flags & GOMP_TASK_FLAG_DETACH) != 0)
        {
          gomp_sem_init (&completion_sem, 0);
          task.completion_sem = &completion_sem;
          *(void **) detach = &task;
          if (data)
            *(void **) data = &task;

          gomp_debug (0, "Thread %d: new event: %p\n",
                      thr->ts.team_id, &task);
        }

      if (thr->task)
        {
          task.in_tied_task = thr->task->in_tied_task;
          task.taskgroup = thr->task->taskgroup;
        }
      thr->task = &task;
      if (__builtin_expect (cpyfn != NULL, 0))
        {
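          /* CPYFN may assume that its destination is ARG_ALIGN aligned, so
             over-allocate the on-stack buffer by ARG_ALIGN - 1 bytes and
             round the pointer up by hand.  */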
          char buf[arg_size + arg_align - 1];
          char *arg = (char *) (((uintptr_t) buf + arg_align - 1)
                                & ~(uintptr_t) (arg_align - 1));
          cpyfn (arg, data);
          fn (arg);
        }
      else
        fn (data);

      if ((flags & GOMP_TASK_FLAG_DETACH) != 0)
        {
          gomp_sem_wait (&completion_sem);
          gomp_sem_destroy (&completion_sem);
        }

      /* Access to "children" is normally done inside a task_lock
         mutex region, but the only way this particular task.children
         can be set is if this thread's task work function (fn)
         creates children.  So since the setter is *this* thread, we
         need no barriers here when testing for non-NULL.  We can have
         task.children set by the current thread then changed by a
         child thread, but seeing a stale non-NULL value is not a
         problem.  Once past the task_lock acquisition, this thread
         will see the real value of task.children.  */
      if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED))
        {
          gomp_mutex_lock (&team->task_lock);
          gomp_clear_parent (&task.children_queue);
          gomp_mutex_unlock (&team->task_lock);
        }
      gomp_end_task ();
    }
  else
    {
      struct gomp_task *task;
      struct gomp_task *parent = thr->task;
      struct gomp_taskgroup *taskgroup = parent->taskgroup;
      char *arg;
      bool do_wake;
      size_t depend_size = 0;

      if (flags & GOMP_TASK_FLAG_DEPEND)
        depend_size = ((uintptr_t) (depend[0] ? depend[0] : depend[1])
                       * sizeof (struct gomp_task_depend_entry));
      task = gomp_malloc (sizeof (*task) + depend_size
                          + arg_size + arg_align - 1);
      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
                      & ~(uintptr_t) (arg_align - 1));
      gomp_init_task (task, parent, gomp_icv (false));
      task->priority = priority;
      task->kind = GOMP_TASK_UNDEFERRED;
      task->in_tied_task = parent->in_tied_task;
      task->taskgroup = taskgroup;
      task->deferred_p = true;
      if ((flags & GOMP_TASK_FLAG_DETACH) != 0)
        {
          task->detach_team = team;

          *(void **) detach = task;
          if (data)
            *(void **) data = task;

          gomp_debug (0, "Thread %d: new event: %p\n", thr->ts.team_id, task);
        }
      thr->task = task;
      if (cpyfn)
        {
          cpyfn (arg, data);
          task->copy_ctors_done = true;
        }
      else
        memcpy (arg, data, arg_size);
      thr->task = parent;
      task->kind = GOMP_TASK_WAITING;
      task->fn = fn;
      task->fn_data = arg;
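      /* GOMP_TASK_FLAG_FINAL is bit 1 of FLAGS, so the shift turns the
         masked flag directly into a 0/1 value.  */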
      task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
      gomp_mutex_lock (&team->task_lock);
      /* If parallel or taskgroup has been cancelled, don't start new
         tasks.  */
      if (__builtin_expect (gomp_cancel_var, 0)
          && !task->copy_ctors_done)
        {
          if (gomp_team_barrier_cancelled (&team->barrier))
            {
            do_cancel:
              gomp_mutex_unlock (&team->task_lock);
              gomp_finish_task (task);
              free (task);
              return;
            }
          if (taskgroup)
            {
              if (taskgroup->cancelled)
                goto do_cancel;
              if (taskgroup->workshare
                  && taskgroup->prev
                  && taskgroup->prev->cancelled)
                goto do_cancel;
            }
        }
      if (taskgroup)
        taskgroup->num_children++;
      if (depend_size)
        {
          gomp_task_handle_depend (task, parent, depend);
          if (task->num_dependees)
            {
              /* Tasks that depend on other tasks are not put into the
                 various waiting queues, so we are done for now.  Said
                 tasks are instead put into the queues via
                 gomp_task_run_post_handle_dependers() after their
                 dependencies have been satisfied, at which point they
                 can be picked up by the various scheduling points.  */
              gomp_mutex_unlock (&team->task_lock);
              return;
            }
        }

      priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
                             task, priority,
                             PRIORITY_INSERT_BEGIN,
                             /*adjust_parent_depends_on=*/false,
                             task->parent_depends_on);
      if (taskgroup)
        priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
                               task, priority,
                               PRIORITY_INSERT_BEGIN,
                               /*adjust_parent_depends_on=*/false,
                               task->parent_depends_on);

      priority_queue_insert (PQ_TEAM, &team->task_queue,
                             task, priority,
                             PRIORITY_INSERT_END,
                             /*adjust_parent_depends_on=*/false,
                             task->parent_depends_on);

      ++team->task_count;
      ++team->task_queued_count;
      gomp_team_barrier_set_task_pending (&team->barrier);
      do_wake = team->task_running_count + !parent->in_tied_task
                < team->nthreads;
      gomp_mutex_unlock (&team->task_lock);
      if (do_wake)
        gomp_team_barrier_wake (&team->barrier, 1);
    }
}
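
/* For illustration only (the exact lowering is chosen by the compiler and
   varies with the clauses used), a plain "#pragma omp task" whose outlined
   body is FN and whose captured frame is DATA would be invoked roughly as

     GOMP_task (fn, &data, NULL, sizeof (data), __alignof__ (data),
                true, 0, NULL, 0, NULL);

   i.e. no copy constructor, if(true), no flags, no depend clauses,
   default priority and no detach event.  */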

ialias (GOMP_taskgroup_start)
ialias (GOMP_taskgroup_end)
ialias (GOMP_taskgroup_reduction_register)

#define TYPE long
#define UTYPE unsigned long
#define TYPE_is_long 1
#include "taskloop.c"
#undef TYPE
#undef UTYPE
#undef TYPE_is_long

#define TYPE unsigned long long
#define UTYPE TYPE
#define GOMP_taskloop GOMP_taskloop_ull
#include "taskloop.c"
#undef TYPE
#undef UTYPE
#undef GOMP_taskloop

static void inline
priority_queue_move_task_first (enum priority_queue_type type,
                                struct priority_queue *head,
                                struct gomp_task *task)
{
#if _LIBGOMP_CHECKING_
  if (!priority_queue_task_in_queue_p (type, head, task))
    gomp_fatal ("Attempt to move first missing task %p", task);
#endif
  struct priority_list *list;
  if (priority_queue_multi_p (head))
    {
      list = priority_queue_lookup_priority (head, task->priority);
#if _LIBGOMP_CHECKING_
      if (!list)
        gomp_fatal ("Unable to find priority %d", task->priority);
#endif
    }
  else
    list = &head->l;
  priority_list_remove (list, task_to_priority_node (type, task), 0);
  priority_list_insert (type, list, task, task->priority,
                        PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN,
                        task->parent_depends_on);
}

/* Actual body of GOMP_PLUGIN_target_task_completion that is executed
   with team->task_lock held, or is executed in the thread that called
   gomp_target_task_fn if GOMP_PLUGIN_target_task_completion has been
   run before it acquires team->task_lock.  */

static void
gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task)
{
  struct gomp_task *parent = task->parent;
  if (parent)
    priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue,
                                    task);

  struct gomp_taskgroup *taskgroup = task->taskgroup;
  if (taskgroup)
    priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
                                    task);

  priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
                         PRIORITY_INSERT_BEGIN, false,
                         task->parent_depends_on);
  task->kind = GOMP_TASK_WAITING;
  if (parent && parent->taskwait)
    {
      if (parent->taskwait->in_taskwait)
        {
          /* One more task has had its dependencies met.
             Inform any waiters.  */
          parent->taskwait->in_taskwait = false;
          gomp_sem_post (&parent->taskwait->taskwait_sem);
        }
      else if (parent->taskwait->in_depend_wait)
        {
          /* One more task has had its dependencies met.
             Inform any waiters.  */
          parent->taskwait->in_depend_wait = false;
          gomp_sem_post (&parent->taskwait->taskwait_sem);
        }
    }
  if (taskgroup && taskgroup->in_taskgroup_wait)
    {
      /* One more task has had its dependencies met.
         Inform any waiters.  */
      taskgroup->in_taskgroup_wait = false;
      gomp_sem_post (&taskgroup->taskgroup_sem);
    }

  ++team->task_queued_count;
  gomp_team_barrier_set_task_pending (&team->barrier);
  /* This can't be done after releasing team->task_lock, as
     gomp_target_task_completion is run from an unrelated thread and
     therefore in between gomp_mutex_unlock and gomp_team_barrier_wake
     the team could be gone already.  */
  if (team->nthreads > team->task_running_count)
    gomp_team_barrier_wake (&team->barrier, 1);
}

/* Signal that a target task TTASK has completed the asynchronously
   running phase and should be requeued as a task to handle the
   variable unmapping.  */

void
GOMP_PLUGIN_target_task_completion (void *data)
{
  struct gomp_target_task *ttask = (struct gomp_target_task *) data;
  struct gomp_task *task = ttask->task;
  struct gomp_team *team = ttask->team;

  gomp_mutex_lock (&team->task_lock);
  if (ttask->state == GOMP_TARGET_TASK_READY_TO_RUN)
    {
      ttask->state = GOMP_TARGET_TASK_FINISHED;
      gomp_mutex_unlock (&team->task_lock);
      return;
    }
  ttask->state = GOMP_TARGET_TASK_FINISHED;
  gomp_target_task_completion (team, task);
  gomp_mutex_unlock (&team->task_lock);
}

static void gomp_task_run_post_handle_depend_hash (struct gomp_task *);

/* Called for nowait target tasks.  */

bool
gomp_create_target_task (struct gomp_device_descr *devicep,
                         void (*fn) (void *), size_t mapnum, void **hostaddrs,
                         size_t *sizes, unsigned short *kinds,
                         unsigned int flags, void **depend, void **args,
                         enum gomp_target_task_state state)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
  if (__builtin_expect (gomp_cancel_var, 0) && team)
    {
      if (gomp_team_barrier_cancelled (&team->barrier))
        return true;
      if (thr->task->taskgroup)
        {
          if (thr->task->taskgroup->cancelled)
            return true;
          if (thr->task->taskgroup->workshare
              && thr->task->taskgroup->prev
              && thr->task->taskgroup->prev->cancelled)
            return true;
        }
    }

  struct gomp_target_task *ttask;
  struct gomp_task *task;
  struct gomp_task *parent = thr->task;
  struct gomp_taskgroup *taskgroup = parent->taskgroup;
  bool do_wake;
  size_t depend_size = 0;
  uintptr_t depend_cnt = 0;
  size_t tgt_align = 0, tgt_size = 0;
  uintptr_t args_cnt = 0;

  if (depend != NULL)
    {
      depend_cnt = (uintptr_t) (depend[0] ? depend[0] : depend[1]);
      depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry);
    }
  if (fn)
    {
      /* GOMP_MAP_FIRSTPRIVATE mappings need to be copied first, as they
         are firstprivate on the target task.  */
      size_t i;
      for (i = 0; i < mapnum; i++)
        if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
          {
            size_t align = (size_t) 1 << (kinds[i] >> 8);
            if (tgt_align < align)
              tgt_align = align;
            tgt_size = (tgt_size + align - 1) & ~(align - 1);
            tgt_size += sizes[i];
          }
      if (tgt_align)
        tgt_size += tgt_align - 1;
      else
        tgt_size = 0;
      if (args)
        {
          void **cargs = args;
          while (*cargs)
            {
              intptr_t id = (intptr_t) *cargs++;
              if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
                cargs++;
            }
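          /* Count the terminating NULL as well so that the memcpy into
             TTASK->args below preserves it.  */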
          args_cnt = cargs + 1 - args;
        }
    }

  task = gomp_malloc (sizeof (*task) + depend_size
                      + sizeof (*ttask)
                      + args_cnt * sizeof (void *)
                      + mapnum * (sizeof (void *) + sizeof (size_t)
                                  + sizeof (unsigned short))
                      + tgt_size);
  gomp_init_task (task, parent, gomp_icv (false));
  task->priority = 0;
  task->kind = GOMP_TASK_WAITING;
  task->in_tied_task = parent->in_tied_task;
  task->taskgroup = taskgroup;
  ttask = (struct gomp_target_task *) &task->depend[depend_cnt];
  ttask->devicep = devicep;
  ttask->fn = fn;
  ttask->mapnum = mapnum;
  memcpy (ttask->hostaddrs, hostaddrs, mapnum * sizeof (void *));
  if (args_cnt)
    {
      ttask->args = (void **) &ttask->hostaddrs[mapnum];
      memcpy (ttask->args, args, args_cnt * sizeof (void *));
      ttask->sizes = (size_t *) &ttask->args[args_cnt];
    }
  else
    {
      ttask->args = args;
      ttask->sizes = (size_t *) &ttask->hostaddrs[mapnum];
    }
  memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t));
  ttask->kinds = (unsigned short *) &ttask->sizes[mapnum];
  memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short));
  if (tgt_align)
    {
      char *tgt = (char *) &ttask->kinds[mapnum];
      size_t i;
      uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
      if (al)
        tgt += tgt_align - al;
      tgt_size = 0;
      for (i = 0; i < mapnum; i++)
        if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
          {
            size_t align = (size_t) 1 << (kinds[i] >> 8);
            tgt_size = (tgt_size + align - 1) & ~(align - 1);
            memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
            ttask->hostaddrs[i] = tgt + tgt_size;
            tgt_size = tgt_size + sizes[i];
          }
    }
  ttask->flags = flags;
  ttask->state = state;
  ttask->task = task;
  ttask->team = team;
  task->fn = NULL;
  task->fn_data = ttask;
  task->final_task = 0;
  gomp_mutex_lock (&team->task_lock);
  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
  if (__builtin_expect (gomp_cancel_var, 0))
    {
      if (gomp_team_barrier_cancelled (&team->barrier))
        {
        do_cancel:
          gomp_mutex_unlock (&team->task_lock);
          gomp_finish_task (task);
          free (task);
          return true;
        }
      if (taskgroup)
        {
          if (taskgroup->cancelled)
            goto do_cancel;
          if (taskgroup->workshare
              && taskgroup->prev
              && taskgroup->prev->cancelled)
            goto do_cancel;
        }
    }
  if (depend_size)
    {
      gomp_task_handle_depend (task, parent, depend);
      if (task->num_dependees)
        {
          if (taskgroup)
            taskgroup->num_children++;
          gomp_mutex_unlock (&team->task_lock);
          return true;
        }
    }
  if (state == GOMP_TARGET_TASK_DATA)
    {
      gomp_task_run_post_handle_depend_hash (task);
      gomp_mutex_unlock (&team->task_lock);
      gomp_finish_task (task);
      free (task);
      return false;
    }
  if (taskgroup)
    taskgroup->num_children++;
  /* For async offloading, if we don't need to wait for dependencies,
     run gomp_target_task_fn right away, essentially scheduling the
     mapping part of the task in the current thread.  */
  if (devicep != NULL
      && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
    {
      priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
                             PRIORITY_INSERT_END,
                             /*adjust_parent_depends_on=*/false,
                             task->parent_depends_on);
      if (taskgroup)
        priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
                               task, 0, PRIORITY_INSERT_END,
                               /*adjust_parent_depends_on=*/false,
                               task->parent_depends_on);
      task->pnode[PQ_TEAM].next = NULL;
      task->pnode[PQ_TEAM].prev = NULL;
      task->kind = GOMP_TASK_TIED;
      ++team->task_count;
      gomp_mutex_unlock (&team->task_lock);

      thr->task = task;
      gomp_target_task_fn (task->fn_data);
      thr->task = parent;

      gomp_mutex_lock (&team->task_lock);
      task->kind = GOMP_TASK_ASYNC_RUNNING;
      /* If GOMP_PLUGIN_target_task_completion has run already
         in between gomp_target_task_fn and the mutex lock,
         perform the requeuing here.  */
      if (ttask->state == GOMP_TARGET_TASK_FINISHED)
        gomp_target_task_completion (team, task);
      else
        ttask->state = GOMP_TARGET_TASK_RUNNING;
      gomp_mutex_unlock (&team->task_lock);
      return true;
    }
  priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
                         PRIORITY_INSERT_BEGIN,
                         /*adjust_parent_depends_on=*/false,
                         task->parent_depends_on);
  if (taskgroup)
    priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, task, 0,
                           PRIORITY_INSERT_BEGIN,
                           /*adjust_parent_depends_on=*/false,
                           task->parent_depends_on);
  priority_queue_insert (PQ_TEAM, &team->task_queue, task, 0,
                         PRIORITY_INSERT_END,
                         /*adjust_parent_depends_on=*/false,
                         task->parent_depends_on);
  ++team->task_count;
  ++team->task_queued_count;
  gomp_team_barrier_set_task_pending (&team->barrier);
  do_wake = team->task_running_count + !parent->in_tied_task
            < team->nthreads;
  gomp_mutex_unlock (&team->task_lock);
  if (do_wake)
    gomp_team_barrier_wake (&team->barrier, 1);
  return true;
}

/* Given a parent_depends_on task in LIST, move it to the front of its
   priority so it is run as soon as possible.

   Care is taken to update the list's LAST_PARENT_DEPENDS_ON field.

   We rearrange the queue such that all parent_depends_on tasks are
   first, and last_parent_depends_on points to the last such task we
   rearranged.  For example, given the following tasks in a queue
   where PD[123] are the parent_depends_on tasks:

        task->children
        |
        V
        C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4

        We rearrange such that:

        task->children
        |              +--- last_parent_depends_on
        |              |
        V              V
        PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4.  */

static void inline
priority_list_upgrade_task (struct priority_list *list,
                            struct priority_node *node)
{
  struct priority_node *last_parent_depends_on
    = list->last_parent_depends_on;
  if (last_parent_depends_on)
    {
      node->prev->next = node->next;
      node->next->prev = node->prev;
      node->prev = last_parent_depends_on;
      node->next = last_parent_depends_on->next;
      node->prev->next = node;
      node->next->prev = node;
    }
  else if (node != list->tasks)
    {
      node->prev->next = node->next;
      node->next->prev = node->prev;
      node->prev = list->tasks->prev;
      node->next = list->tasks;
      list->tasks = node;
      node->prev->next = node;
      node->next->prev = node;
    }
  list->last_parent_depends_on = node;
}

/* Given a parent_depends_on TASK in its parent's children_queue, move
   it to the front of its priority so it is run as soon as possible.

   PARENT is passed as an optimization.

   (This function could be defined in priority_queue.c, but we want it
   inlined, and putting it in priority_queue.h is not an option, given
   that gomp_task has not been properly defined at that point).  */

static void inline
priority_queue_upgrade_task (struct gomp_task *task,
                             struct gomp_task *parent)
{
  struct priority_queue *head = &parent->children_queue;
  struct priority_node *node = &task->pnode[PQ_CHILDREN];
#if _LIBGOMP_CHECKING_
  if (!task->parent_depends_on)
    gomp_fatal ("priority_queue_upgrade_task: task must be a "
                "parent_depends_on task");
  if (!priority_queue_task_in_queue_p (PQ_CHILDREN, head, task))
    gomp_fatal ("priority_queue_upgrade_task: cannot find task=%p", task);
#endif
  if (priority_queue_multi_p (head))
    {
      struct priority_list *list
        = priority_queue_lookup_priority (head, task->priority);
      priority_list_upgrade_task (list, node);
    }
  else
    priority_list_upgrade_task (&head->l, node);
}

/* Given a CHILD_TASK in LIST that is about to be executed, move it out of
   the way in LIST so that other tasks can be considered for
   execution.  LIST contains tasks of type TYPE.

   Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
   if applicable.  */

static void inline
priority_list_downgrade_task (enum priority_queue_type type,
                              struct priority_list *list,
                              struct gomp_task *child_task)
{
  struct priority_node *node = task_to_priority_node (type, child_task);
  if (list->tasks == node)
    list->tasks = node->next;
  else if (node->next != list->tasks)
    {
      /* The task in NODE is about to become TIED and TIED tasks
         cannot come before WAITING tasks.  If we're about to
         leave the queue in such an indeterminate state, rewire
         things appropriately.  However, a TIED task at the end is
         perfectly fine.  */
      struct gomp_task *next_task = priority_node_to_task (type, node->next);
      if (next_task->kind == GOMP_TASK_WAITING)
        {
          /* Remove from list.  */
          node->prev->next = node->next;
          node->next->prev = node->prev;
          /* Rewire at the end.  */
          node->next = list->tasks;
          node->prev = list->tasks->prev;
          list->tasks->prev->next = node;
          list->tasks->prev = node;
        }
    }

  /* If the current task is the last_parent_depends_on for its
     priority, adjust last_parent_depends_on appropriately.  */
  if (__builtin_expect (child_task->parent_depends_on, 0)
      && list->last_parent_depends_on == node)
    {
      struct gomp_task *prev_child = priority_node_to_task (type, node->prev);
      if (node->prev != node
          && prev_child->kind == GOMP_TASK_WAITING
          && prev_child->parent_depends_on)
        list->last_parent_depends_on = node->prev;
      else
        {
          /* There are no more parent_depends_on entries waiting
             to run, clear the list.  */
          list->last_parent_depends_on = NULL;
        }
    }
}

/* Given a TASK in HEAD that is about to be executed, move it out of
   the way so that other tasks can be considered for execution.  HEAD
   contains tasks of type TYPE.

   Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
   if applicable.

   (This function could be defined in priority_queue.c, but we want it
   inlined, and putting it in priority_queue.h is not an option, given
   that gomp_task has not been properly defined at that point).  */

static void inline
priority_queue_downgrade_task (enum priority_queue_type type,
                               struct priority_queue *head,
                               struct gomp_task *task)
{
#if _LIBGOMP_CHECKING_
  if (!priority_queue_task_in_queue_p (type, head, task))
    gomp_fatal ("Attempt to downgrade missing task %p", task);
#endif
  if (priority_queue_multi_p (head))
    {
      struct priority_list *list
        = priority_queue_lookup_priority (head, task->priority);
      priority_list_downgrade_task (type, list, task);
    }
  else
    priority_list_downgrade_task (type, &head->l, task);
}

/* Setup CHILD_TASK to execute.  This is done by setting the task to
   TIED, and updating all relevant queues so that CHILD_TASK is no
   longer chosen for scheduling.  Also, remove CHILD_TASK from the
   overall team task queue entirely.

   Return TRUE if task or its containing taskgroup has been
   cancelled.  */

static inline bool
gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
                   struct gomp_team *team)
{
#if _LIBGOMP_CHECKING_
  if (child_task->parent)
    priority_queue_verify (PQ_CHILDREN,
                           &child_task->parent->children_queue, true);
  if (child_task->taskgroup)
    priority_queue_verify (PQ_TASKGROUP,
                           &child_task->taskgroup->taskgroup_queue, false);
  priority_queue_verify (PQ_TEAM, &team->task_queue, false);
#endif

  /* Task is about to go tied, move it out of the way.  */
  if (parent)
    priority_queue_downgrade_task (PQ_CHILDREN, &parent->children_queue,
                                   child_task);

  /* Task is about to go tied, move it out of the way.  */
  struct gomp_taskgroup *taskgroup = child_task->taskgroup;
  if (taskgroup)
    priority_queue_downgrade_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
                                   child_task);

  priority_queue_remove (PQ_TEAM, &team->task_queue, child_task,
                         MEMMODEL_RELAXED);
  child_task->pnode[PQ_TEAM].next = NULL;
  child_task->pnode[PQ_TEAM].prev = NULL;
  child_task->kind = GOMP_TASK_TIED;

  if (--team->task_queued_count == 0)
    gomp_team_barrier_clear_task_pending (&team->barrier);
  if (__builtin_expect (gomp_cancel_var, 0)
      && !child_task->copy_ctors_done)
    {
      if (gomp_team_barrier_cancelled (&team->barrier))
        return true;
      if (taskgroup)
        {
          if (taskgroup->cancelled)
            return true;
          if (taskgroup->workshare
              && taskgroup->prev
              && taskgroup->prev->cancelled)
            return true;
        }
    }
  return false;
}

static void
gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
{
  struct gomp_task *parent = child_task->parent;
  size_t i;

  for (i = 0; i < child_task->depend_count; i++)
    if (!child_task->depend[i].redundant)
      {
        if (child_task->depend[i].next)
          child_task->depend[i].next->prev = child_task->depend[i].prev;
        if (child_task->depend[i].prev)
          child_task->depend[i].prev->next = child_task->depend[i].next;
        else
          {
            hash_entry_type *slot
              = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
                                NO_INSERT);
            if (*slot != &child_task->depend[i])
              abort ();
            if (child_task->depend[i].next)
              *slot = child_task->depend[i].next;
            else
              htab_clear_slot (parent->depend_hash, slot);
          }
      }
}

/* After a CHILD_TASK has been run, adjust the dependency queue for
   each task that depends on CHILD_TASK, to record the fact that there
   is one less dependency to worry about.  If a task that depended on
   CHILD_TASK now has no dependencies, place it in the various queues
   so it gets scheduled to run.

   TEAM is the team to which CHILD_TASK belongs.  */

static size_t
gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
                                     struct gomp_team *team)
{
  struct gomp_task *parent = child_task->parent;
  size_t i, count = child_task->dependers->n_elem, ret = 0;
  for (i = 0; i < count; i++)
    {
      struct gomp_task *task = child_task->dependers->elem[i];

      /* CHILD_TASK satisfies a dependency for TASK.  Keep track of
         TASK's remaining dependencies.  Once TASK has no other
         dependencies, put it into the various queues so it will get
         scheduled for execution.  */
      if (--task->num_dependees != 0)
        continue;

      struct gomp_taskgroup *taskgroup = task->taskgroup;
      if (parent)
        {
          priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
                                 task, task->priority,
                                 PRIORITY_INSERT_BEGIN,
                                 /*adjust_parent_depends_on=*/true,
                                 task->parent_depends_on);
          if (parent->taskwait)
            {
              if (parent->taskwait->in_taskwait)
                {
                  /* One more task has had its dependencies met.
                     Inform any waiters.  */
                  parent->taskwait->in_taskwait = false;
                  gomp_sem_post (&parent->taskwait->taskwait_sem);
                }
              else if (parent->taskwait->in_depend_wait)
                {
                  /* One more task has had its dependencies met.
                     Inform any waiters.  */
                  parent->taskwait->in_depend_wait = false;
                  gomp_sem_post (&parent->taskwait->taskwait_sem);
                }
            }
        }
      else
        task->parent = NULL;
      if (taskgroup)
        {
          priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
                                 task, task->priority,
                                 PRIORITY_INSERT_BEGIN,
                                 /*adjust_parent_depends_on=*/false,
                                 task->parent_depends_on);
          if (taskgroup->in_taskgroup_wait)
            {
              /* One more task has had its dependencies met.
                 Inform any waiters.  */
              taskgroup->in_taskgroup_wait = false;
              gomp_sem_post (&taskgroup->taskgroup_sem);
            }
        }
      priority_queue_insert (PQ_TEAM, &team->task_queue,
                             task, task->priority,
                             PRIORITY_INSERT_END,
                             /*adjust_parent_depends_on=*/false,
                             task->parent_depends_on);
      ++team->task_count;
      ++team->task_queued_count;
      ++ret;
    }
  free (child_task->dependers);
  child_task->dependers = NULL;
  if (ret > 1)
    gomp_team_barrier_set_task_pending (&team->barrier);
  return ret;
}

static inline size_t
gomp_task_run_post_handle_depend (struct gomp_task *child_task,
                                  struct gomp_team *team)
{
  if (child_task->depend_count == 0)
    return 0;

  /* If the parent is gone already, the hash table has been freed and
     nothing will use it anymore; no need to remove anything from it.  */
  if (child_task->parent != NULL)
    gomp_task_run_post_handle_depend_hash (child_task);

  if (child_task->dependers == NULL)
    return 0;

  return gomp_task_run_post_handle_dependers (child_task, team);
}

/* Remove CHILD_TASK from its parent.  */

static inline void
gomp_task_run_post_remove_parent (struct gomp_task *child_task)
{
  struct gomp_task *parent = child_task->parent;
  if (parent == NULL)
    return;

  /* If this was the last task the parent was depending on,
     synchronize with gomp_task_maybe_wait_for_dependencies so it can
     clean up and return.  */
  if (__builtin_expect (child_task->parent_depends_on, 0)
      && --parent->taskwait->n_depend == 0
      && parent->taskwait->in_depend_wait)
    {
      parent->taskwait->in_depend_wait = false;
      gomp_sem_post (&parent->taskwait->taskwait_sem);
    }

  if (priority_queue_remove (PQ_CHILDREN, &parent->children_queue,
                             child_task, MEMMODEL_RELEASE)
      && parent->taskwait && parent->taskwait->in_taskwait)
    {
      parent->taskwait->in_taskwait = false;
      gomp_sem_post (&parent->taskwait->taskwait_sem);
    }
  child_task->pnode[PQ_CHILDREN].next = NULL;
  child_task->pnode[PQ_CHILDREN].prev = NULL;
}

/* Remove CHILD_TASK from its taskgroup.  */

static inline void
gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
{
  struct gomp_taskgroup *taskgroup = child_task->taskgroup;
  if (taskgroup == NULL)
    return;
  bool empty = priority_queue_remove (PQ_TASKGROUP,
                                      &taskgroup->taskgroup_queue,
                                      child_task, MEMMODEL_RELAXED);
  child_task->pnode[PQ_TASKGROUP].next = NULL;
  child_task->pnode[PQ_TASKGROUP].prev = NULL;
  if (taskgroup->num_children > 1)
    --taskgroup->num_children;
  else
    {
      /* We access taskgroup->num_children in GOMP_taskgroup_end
         outside of the task lock mutex region, so
         need a release barrier here to ensure memory
         written by child_task->fn above is flushed
         before the NULL is written.  */
      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
    }
  if (empty && taskgroup->in_taskgroup_wait)
    {
      taskgroup->in_taskgroup_wait = false;
      gomp_sem_post (&taskgroup->taskgroup_sem);
    }
}

void
gomp_barrier_handle_tasks (gomp_barrier_state_t state)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_task *task = thr->task;
  struct gomp_task *child_task = NULL;
  struct gomp_task *to_free = NULL;
  int do_wake = 0;

  gomp_mutex_lock (&team->task_lock);
  if (gomp_barrier_last_thread (state))
    {
      if (team->task_count == 0)
        {
          gomp_team_barrier_done (&team->barrier, state);
          gomp_mutex_unlock (&team->task_lock);
          gomp_team_barrier_wake (&team->barrier, 0);
          return;
        }
      gomp_team_barrier_set_waiting_for_tasks (&team->barrier);
    }

  while (1)
    {
      bool cancelled = false;

      if (!priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED))
        {
          bool ignored;
          child_task
            = priority_queue_next_task (PQ_TEAM, &team->task_queue,
                                        PQ_IGNORED, NULL,
                                        &ignored);
          cancelled = gomp_task_run_pre (child_task, child_task->parent,
                                         team);
          if (__builtin_expect (cancelled, 0))
            {
              if (to_free)
                {
                  gomp_finish_task (to_free);
                  free (to_free);
                  to_free = NULL;
                }
              goto finish_cancelled;
            }
          team->task_running_count++;
          child_task->in_tied_task = true;
        }
      else if (team->task_count == 0
               && gomp_team_barrier_waiting_for_tasks (&team->barrier))
        {
          gomp_team_barrier_done (&team->barrier, state);
          gomp_mutex_unlock (&team->task_lock);
          gomp_team_barrier_wake (&team->barrier, 0);
          if (to_free)
            {
              gomp_finish_task (to_free);
              free (to_free);
            }
          return;
        }
      gomp_mutex_unlock (&team->task_lock);
      if (do_wake)
        {
          gomp_team_barrier_wake (&team->barrier, do_wake);
          do_wake = 0;
        }
      if (to_free)
        {
          gomp_finish_task (to_free);
          free (to_free);
          to_free = NULL;
        }
      if (child_task)
        {
          thr->task = child_task;
          if (__builtin_expect (child_task->fn == NULL, 0))
            {
              if (gomp_target_task_fn (child_task->fn_data))
                {
                  thr->task = task;
                  gomp_mutex_lock (&team->task_lock);
                  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
                  team->task_running_count--;
                  struct gomp_target_task *ttask
                    = (struct gomp_target_task *) child_task->fn_data;
                  /* If GOMP_PLUGIN_target_task_completion has run already
                     in between gomp_target_task_fn and the mutex lock,
                     perform the requeuing here.  */
                  if (ttask->state == GOMP_TARGET_TASK_FINISHED)
                    gomp_target_task_completion (team, child_task);
                  else
                    ttask->state = GOMP_TARGET_TASK_RUNNING;
                  child_task = NULL;
                  continue;
                }
            }
          else
            child_task->fn (child_task->fn_data);
          thr->task = task;
        }
      else
        return;
      gomp_mutex_lock (&team->task_lock);
      if (child_task)
        {
          if (child_task->detach_team)
            {
              assert (child_task->detach_team == team);
              child_task->kind = GOMP_TASK_DETACHED;
              ++team->task_detach_count;
              --team->task_running_count;
              gomp_debug (0,
                          "thread %d: task with event %p finished without "
                          "completion event fulfilled in team barrier\n",
                          thr->ts.team_id, child_task);
              child_task = NULL;
              continue;
            }

         finish_cancelled:;
          size_t new_tasks
            = gomp_task_run_post_handle_depend (child_task, team);
          gomp_task_run_post_remove_parent (child_task);
          gomp_clear_parent (&child_task->children_queue);
          gomp_task_run_post_remove_taskgroup (child_task);
          to_free = child_task;
          if (!cancelled)
            team->task_running_count--;
          child_task = NULL;
          if (new_tasks > 1)
            {
              do_wake = team->nthreads - team->task_running_count;
              if (do_wake > new_tasks)
                do_wake = new_tasks;
            }
          --team->task_count;
        }
    }
}
1505
1506/* Called when encountering a taskwait directive.
1507
1508   Wait for all children of the current task.  */
1509
1510void
1511GOMP_taskwait (void)
1512{
1513  struct gomp_thread *thr = gomp_thread ();
1514  struct gomp_team *team = thr->ts.team;
1515  struct gomp_task *task = thr->task;
1516  struct gomp_task *child_task = NULL;
1517  struct gomp_task *to_free = NULL;
1518  struct gomp_taskwait taskwait;
1519  int do_wake = 0;
1520
1521  /* The acquire barrier on load of task->children here synchronizes
1522     with the write of a NULL in gomp_task_run_post_remove_parent.  It is
1523     not necessary that we synchronize with other non-NULL writes at
1524     this point, but we must ensure that all writes to memory by a
1525     child thread task work function are seen before we exit from
1526     GOMP_taskwait.  */
1527  if (task == NULL
1528      || priority_queue_empty_p (&task->children_queue, MEMMODEL_ACQUIRE))
1529    return;
1530
1531  memset (&taskwait, 0, sizeof (taskwait));
1532  bool child_q = false;
1533  gomp_mutex_lock (&team->task_lock);
1534  while (1)
1535    {
1536      bool cancelled = false;
1537      if (priority_queue_empty_p (&task->children_queue, MEMMODEL_RELAXED))
1538	{
1539	  bool destroy_taskwait = task->taskwait != NULL;
1540	  task->taskwait = NULL;
1541	  gomp_mutex_unlock (&team->task_lock);
1542	  if (to_free)
1543	    {
1544	      gomp_finish_task (to_free);
1545	      free (to_free);
1546	    }
1547	  if (destroy_taskwait)
1548	    gomp_sem_destroy (&taskwait.taskwait_sem);
1549	  return;
1550	}
1551      struct gomp_task *next_task
1552	= priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
1553				    PQ_TEAM, &team->task_queue, &child_q);
1554      if (next_task->kind == GOMP_TASK_WAITING)
1555	{
1556	  child_task = next_task;
1557	  cancelled
1558	    = gomp_task_run_pre (child_task, task, team);
1559	  if (__builtin_expect (cancelled, 0))
1560	    {
1561	      if (to_free)
1562		{
1563		  gomp_finish_task (to_free);
1564		  free (to_free);
1565		  to_free = NULL;
1566		}
1567	      goto finish_cancelled;
1568	    }
1569	}
1570      else
1571	{
1572	/* All tasks we are waiting for are either running in other
1573	   threads, are detached and waiting for the completion event to be
1574	   fulfilled, or they are tasks that have not had their
1575	   dependencies met (so they're not even in the queue).  Wait
1576	   for them.  */
1577	  if (task->taskwait == NULL)
1578	    {
1579	      taskwait.in_depend_wait = false;
1580	      gomp_sem_init (&taskwait.taskwait_sem, 0);
1581	      task->taskwait = &taskwait;
1582	    }
1583	  taskwait.in_taskwait = true;
1584	}
1585      gomp_mutex_unlock (&team->task_lock);
1586      if (do_wake)
1587	{
1588	  gomp_team_barrier_wake (&team->barrier, do_wake);
1589	  do_wake = 0;
1590	}
1591      if (to_free)
1592	{
1593	  gomp_finish_task (to_free);
1594	  free (to_free);
1595	  to_free = NULL;
1596	}
1597      if (child_task)
1598	{
1599	  thr->task = child_task;
1600	  if (__builtin_expect (child_task->fn == NULL, 0))
1601	    {
1602	      if (gomp_target_task_fn (child_task->fn_data))
1603		{
1604		  thr->task = task;
1605		  gomp_mutex_lock (&team->task_lock);
1606		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
1607		  struct gomp_target_task *ttask
1608		    = (struct gomp_target_task *) child_task->fn_data;
1609		  /* If GOMP_PLUGIN_target_task_completion has run already
1610		     in between gomp_target_task_fn and the mutex lock,
1611		     perform the requeuing here.  */
1612		  if (ttask->state == GOMP_TARGET_TASK_FINISHED)
1613		    gomp_target_task_completion (team, child_task);
1614		  else
1615		    ttask->state = GOMP_TARGET_TASK_RUNNING;
1616		  child_task = NULL;
1617		  continue;
1618		}
1619	    }
1620	  else
1621	    child_task->fn (child_task->fn_data);
1622	  thr->task = task;
1623	}
1624      else
1625	gomp_sem_wait (&taskwait.taskwait_sem);
1626      gomp_mutex_lock (&team->task_lock);
1627      if (child_task)
1628	{
1629	  if (child_task->detach_team)
1630	    {
1631	      assert (child_task->detach_team == team);
1632	      child_task->kind = GOMP_TASK_DETACHED;
1633	      ++team->task_detach_count;
1634	      gomp_debug (0,
1635			  "thread %d: task with event %p finished without "
1636			  "completion event fulfilled in taskwait\n",
1637			  thr->ts.team_id, child_task);
1638	      child_task = NULL;
1639	      continue;
1640	    }
1641
1642	 finish_cancelled:;
1643	  size_t new_tasks
1644	    = gomp_task_run_post_handle_depend (child_task, team);
1645
1646	  if (child_q)
1647	    {
1648	      priority_queue_remove (PQ_CHILDREN, &task->children_queue,
1649				     child_task, MEMMODEL_RELAXED);
1650	      child_task->pnode[PQ_CHILDREN].next = NULL;
1651	      child_task->pnode[PQ_CHILDREN].prev = NULL;
1652	    }
1653
1654	  gomp_clear_parent (&child_task->children_queue);
1655
1656	  gomp_task_run_post_remove_taskgroup (child_task);
1657
1658	  to_free = child_task;
1659	  child_task = NULL;
1660	  team->task_count--;
1661	  if (new_tasks > 1)
1662	    {
1663	      do_wake = team->nthreads - team->task_running_count
1664			- !task->in_tied_task;
1665	      if (do_wake > new_tasks)
1666		do_wake = new_tasks;
1667	    }
1668	}
1669    }
1670}
1671
1672/* Called when encountering a taskwait directive with depend clause(s).
1673   Wait as if it was an mergeable included task construct with empty body.  */
1674
1675void
1676GOMP_taskwait_depend (void **depend)
1677{
1678  struct gomp_thread *thr = gomp_thread ();
1679  struct gomp_team *team = thr->ts.team;
1680
1681  /* If parallel or taskgroup has been cancelled, return early.  */
1682  if (__builtin_expect (gomp_cancel_var, 0) && team)
1683    {
1684      if (gomp_team_barrier_cancelled (&team->barrier))
1685	return;
1686      if (thr->task->taskgroup)
1687	{
1688	  if (thr->task->taskgroup->cancelled)
1689	    return;
1690	  if (thr->task->taskgroup->workshare
1691	      && thr->task->taskgroup->prev
1692	      && thr->task->taskgroup->prev->cancelled)
1693	    return;
1694	}
1695    }
1696
1697  if (thr->task && thr->task->depend_hash)
1698    gomp_task_maybe_wait_for_dependencies (depend);
1699}
1700
1701/* An undeferred task is about to run.  Wait for all tasks that this
1702   undeferred task depends on.
1703
1704   This is done by first putting all known ready dependencies
1705   (dependencies that have their own dependencies met) at the top of
1706   the scheduling queues.  Then we iterate through these imminently
1707   ready tasks (and possibly other high priority tasks), and run them.
1708   If we run out of ready dependencies to execute, we either wait for
1709   the remaining dependencies to finish, or wait for them to get
1710   scheduled so we can run them.
1711
1712   DEPEND is as in GOMP_task.  */
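
/* A minimal sketch of the simple DEPEND layout as read by the loop below
   (the array is always constructed by the compiler; a and b stand for any
   user variables): for the clauses depend(out: a) depend(in: b) it would be

     void *depend[] = { (void *) 2,   // number of depend clauses
			(void *) 1,   // out/inout clauses come first
			&a, &b };

   When depend[0] is 0, the extended layout handled by the ndepend == 0
   branch below is used instead: depend[1] holds the total count, the
   sub-counts in depend[2..4] classify the entries, and the addresses
   (or omp_depend_t pointers) start at depend[5].  */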
1713
1714void
1715gomp_task_maybe_wait_for_dependencies (void **depend)
1716{
1717  struct gomp_thread *thr = gomp_thread ();
1718  struct gomp_task *task = thr->task;
1719  struct gomp_team *team = thr->ts.team;
1720  struct gomp_task_depend_entry elem, *ent = NULL;
1721  struct gomp_taskwait taskwait;
1722  size_t orig_ndepend = (uintptr_t) depend[0];
1723  size_t nout = (uintptr_t) depend[1];
1724  size_t ndepend = orig_ndepend;
1725  size_t normal = ndepend;
1726  size_t n = 2;
1727  size_t i;
1728  size_t num_awaited = 0;
1729  struct gomp_task *child_task = NULL;
1730  struct gomp_task *to_free = NULL;
1731  int do_wake = 0;
1732
1733  if (ndepend == 0)
1734    {
1735      ndepend = nout;
1736      nout = (uintptr_t) depend[2] + (uintptr_t) depend[3];
1737      normal = nout + (uintptr_t) depend[4];
1738      n = 5;
1739    }
1740  gomp_mutex_lock (&team->task_lock);
1741  for (i = 0; i < ndepend; i++)
1742    {
1743      elem.addr = depend[i + n];
1744      elem.is_in = i >= nout;
1745      if (__builtin_expect (i >= normal, 0))
1746	{
1747	  void **d = (void **) elem.addr;
1748	  switch ((uintptr_t) d[1])
1749	    {
1750	    case GOMP_DEPEND_IN:
1751	      break;
1752	    case GOMP_DEPEND_OUT:
1753	    case GOMP_DEPEND_INOUT:
1754	    case GOMP_DEPEND_MUTEXINOUTSET:
1755	      elem.is_in = 0;
1756	      break;
1757	    default:
1758	      gomp_fatal ("unknown omp_depend_t dependence type %d",
1759			  (int) (uintptr_t) d[1]);
1760	    }
1761	  elem.addr = d[0];
1762	}
1763      ent = htab_find (task->depend_hash, &elem);
1764      for (; ent; ent = ent->next)
1765	if (elem.is_in && ent->is_in)
1766	  continue;
1767	else
1768	  {
1769	    struct gomp_task *tsk = ent->task;
1770	    if (!tsk->parent_depends_on)
1771	      {
1772		tsk->parent_depends_on = true;
1773		++num_awaited;
1774		/* If dependency TSK itself has no dependencies and is
1775		   ready to run, move it up front so that we run it as
1776		   soon as possible.  */
1777		if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
1778		  priority_queue_upgrade_task (tsk, task);
1779	      }
1780	  }
1781    }
1782  if (num_awaited == 0)
1783    {
1784      gomp_mutex_unlock (&team->task_lock);
1785      return;
1786    }
1787
1788  memset (&taskwait, 0, sizeof (taskwait));
1789  taskwait.n_depend = num_awaited;
1790  gomp_sem_init (&taskwait.taskwait_sem, 0);
1791  task->taskwait = &taskwait;
1792
1793  while (1)
1794    {
1795      bool cancelled = false;
1796      if (taskwait.n_depend == 0)
1797	{
1798	  task->taskwait = NULL;
1799	  gomp_mutex_unlock (&team->task_lock);
1800	  if (to_free)
1801	    {
1802	      gomp_finish_task (to_free);
1803	      free (to_free);
1804	    }
1805	  gomp_sem_destroy (&taskwait.taskwait_sem);
1806	  return;
1807	}
1808
1809      /* Theoretically, when we have multiple priorities, we should
1810	 choose between the highest priority item in
1811	 task->children_queue and team->task_queue here, so we should
1812	 use priority_queue_next_task().  However, since we are
1813	 running an undeferred task, perhaps that makes all tasks it
1814	 depends on undeferred, thus having a priority of INF?  This
1815	 would make it unnecessary to take anything but the
1816	 dependencies into account here.
1817
1818	 On the other hand, if we want to use priority_queue_next_task(),
1819	 care should be taken to only use priority_queue_remove()
1820	 below if the task was actually removed from the children
1821	 queue.  */
1822      bool ignored;
1823      struct gomp_task *next_task
1824	= priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
1825				    PQ_IGNORED, NULL, &ignored);
1826
1827      if (next_task->kind == GOMP_TASK_WAITING)
1828	{
1829	  child_task = next_task;
1830	  cancelled
1831	    = gomp_task_run_pre (child_task, task, team);
1832	  if (__builtin_expect (cancelled, 0))
1833	    {
1834	      if (to_free)
1835		{
1836		  gomp_finish_task (to_free);
1837		  free (to_free);
1838		  to_free = NULL;
1839		}
1840	      goto finish_cancelled;
1841	    }
1842	}
1843      else
1844	/* All tasks we are waiting for are either running in other
1845	   threads, or they are tasks that have not had their
1846	   dependencies met (so they're not even in the queue).  Wait
1847	   for them.  */
1848	taskwait.in_depend_wait = true;
1849      gomp_mutex_unlock (&team->task_lock);
1850      if (do_wake)
1851	{
1852	  gomp_team_barrier_wake (&team->barrier, do_wake);
1853	  do_wake = 0;
1854	}
1855      if (to_free)
1856	{
1857	  gomp_finish_task (to_free);
1858	  free (to_free);
1859	  to_free = NULL;
1860	}
1861      if (child_task)
1862	{
1863	  thr->task = child_task;
1864	  if (__builtin_expect (child_task->fn == NULL, 0))
1865	    {
1866	      if (gomp_target_task_fn (child_task->fn_data))
1867		{
1868		  thr->task = task;
1869		  gomp_mutex_lock (&team->task_lock);
1870		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
1871		  struct gomp_target_task *ttask
1872		    = (struct gomp_target_task *) child_task->fn_data;
1873		  /* If GOMP_PLUGIN_target_task_completion has run already
1874		     in between gomp_target_task_fn and the mutex lock,
1875		     perform the requeuing here.  */
1876		  if (ttask->state == GOMP_TARGET_TASK_FINISHED)
1877		    gomp_target_task_completion (team, child_task);
1878		  else
1879		    ttask->state = GOMP_TARGET_TASK_RUNNING;
1880		  child_task = NULL;
1881		  continue;
1882		}
1883	    }
1884	  else
1885	    child_task->fn (child_task->fn_data);
1886	  thr->task = task;
1887	}
1888      else
1889	gomp_sem_wait (&taskwait.taskwait_sem);
1890      gomp_mutex_lock (&team->task_lock);
1891      if (child_task)
1892	{
1893	 finish_cancelled:;
1894	  size_t new_tasks
1895	    = gomp_task_run_post_handle_depend (child_task, team);
1896	  if (child_task->parent_depends_on)
1897	    --taskwait.n_depend;
1898
1899	  priority_queue_remove (PQ_CHILDREN, &task->children_queue,
1900				 child_task, MEMMODEL_RELAXED);
1901	  child_task->pnode[PQ_CHILDREN].next = NULL;
1902	  child_task->pnode[PQ_CHILDREN].prev = NULL;
1903
1904	  gomp_clear_parent (&child_task->children_queue);
1905	  gomp_task_run_post_remove_taskgroup (child_task);
1906	  to_free = child_task;
1907	  child_task = NULL;
1908	  team->task_count--;
1909	  if (new_tasks > 1)
1910	    {
1911	      do_wake = team->nthreads - team->task_running_count
1912			- !task->in_tied_task;
1913	      if (do_wake > new_tasks)
1914		do_wake = new_tasks;
1915	    }
1916	}
1917    }
1918}
1919
1920/* Called when encountering a taskyield directive.  */
1921
1922void
1923GOMP_taskyield (void)
1924{
1925  /* Nothing at the moment.  */
1926}
1927
1928static inline struct gomp_taskgroup *
1929gomp_taskgroup_init (struct gomp_taskgroup *prev)
1930{
1931  struct gomp_taskgroup *taskgroup
1932    = gomp_malloc (sizeof (struct gomp_taskgroup));
1933  taskgroup->prev = prev;
1934  priority_queue_init (&taskgroup->taskgroup_queue);
1935  taskgroup->reductions = prev ? prev->reductions : NULL;
1936  taskgroup->in_taskgroup_wait = false;
1937  taskgroup->cancelled = false;
1938  taskgroup->workshare = false;
1939  taskgroup->num_children = 0;
1940  gomp_sem_init (&taskgroup->taskgroup_sem, 0);
1941  return taskgroup;
1942}
1943
1944void
1945GOMP_taskgroup_start (void)
1946{
1947  struct gomp_thread *thr = gomp_thread ();
1948  struct gomp_team *team = thr->ts.team;
1949  struct gomp_task *task = thr->task;
1950
1951  /* If team is NULL, all tasks are executed as
1952     GOMP_TASK_UNDEFERRED tasks and thus all child tasks of the
1953     taskgroup and their descendant tasks will be finished
1954     by the time GOMP_taskgroup_end is called.  */
1955  if (team == NULL)
1956    return;
1957  task->taskgroup = gomp_taskgroup_init (task->taskgroup);
1958}
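
/* Illustrative only: a user-level construct such as

     #pragma omp taskgroup
     {
       #pragma omp task
       do_work ();    // do_work stands for arbitrary user code
     }

   is emitted by the compiler as a call to GOMP_taskgroup_start (), the body,
   and a matching call to GOMP_taskgroup_end (), which waits for all tasks
   created in the region and for their descendant tasks.  */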
1959
1960void
1961GOMP_taskgroup_end (void)
1962{
1963  struct gomp_thread *thr = gomp_thread ();
1964  struct gomp_team *team = thr->ts.team;
1965  struct gomp_task *task = thr->task;
1966  struct gomp_taskgroup *taskgroup;
1967  struct gomp_task *child_task = NULL;
1968  struct gomp_task *to_free = NULL;
1969  int do_wake = 0;
1970
1971  if (team == NULL)
1972    return;
1973  taskgroup = task->taskgroup;
1974  if (__builtin_expect (taskgroup == NULL, 0)
1975      && thr->ts.level == 0)
1976    {
1977      /* This can happen if GOMP_taskgroup_start is called when
1978	 thr->ts.team == NULL, but inside the taskgroup there
1979	 is a #pragma omp target nowait that creates an implicit
1980	 team with a single thread.  In this case, we want to wait
1981	 for all outstanding tasks in this team.  */
1982      gomp_team_barrier_wait (&team->barrier);
1983      return;
1984    }
1985
1986  /* The acquire barrier on load of taskgroup->num_children here
1987     synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
1988     It is not necessary that we synchronize with other non-0 writes at
1989     this point, but we must ensure that all writes to memory by a
1990     child thread task work function are seen before we exit from
1991     GOMP_taskgroup_end.  */
1992  if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
1993    goto finish;
1994
1995  bool unused;
1996  gomp_mutex_lock (&team->task_lock);
1997  while (1)
1998    {
1999      bool cancelled = false;
2000      if (priority_queue_empty_p (&taskgroup->taskgroup_queue,
2001				  MEMMODEL_RELAXED))
2002	{
2003	  if (taskgroup->num_children)
2004	    {
2005	      if (priority_queue_empty_p (&task->children_queue,
2006					  MEMMODEL_RELAXED))
2007		goto do_wait;
2008	      child_task
2009		= priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
2010					    PQ_TEAM, &team->task_queue,
2011					    &unused);
2012	    }
2013	  else
2014	    {
2015	      gomp_mutex_unlock (&team->task_lock);
2016	      if (to_free)
2017		{
2018		  gomp_finish_task (to_free);
2019		  free (to_free);
2020		}
2021	      goto finish;
2022	    }
2023	}
2024      else
2025	child_task
2026	  = priority_queue_next_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
2027				      PQ_TEAM, &team->task_queue, &unused);
2028      if (child_task->kind == GOMP_TASK_WAITING)
2029	{
2030	  cancelled
2031	    = gomp_task_run_pre (child_task, child_task->parent, team);
2032	  if (__builtin_expect (cancelled, 0))
2033	    {
2034	      if (to_free)
2035		{
2036		  gomp_finish_task (to_free);
2037		  free (to_free);
2038		  to_free = NULL;
2039		}
2040	      goto finish_cancelled;
2041	    }
2042	}
2043      else
2044	{
2045	  child_task = NULL;
2046	 do_wait:
2047	  /* All tasks we are waiting for are either running in other
2048	     threads, or they are tasks that have not had their
2049	     dependencies met (so they're not even in the queue).  Wait
2050	     for them.  */
2051	  taskgroup->in_taskgroup_wait = true;
2052	}
2053      gomp_mutex_unlock (&team->task_lock);
2054      if (do_wake)
2055	{
2056	  gomp_team_barrier_wake (&team->barrier, do_wake);
2057	  do_wake = 0;
2058	}
2059      if (to_free)
2060	{
2061	  gomp_finish_task (to_free);
2062	  free (to_free);
2063	  to_free = NULL;
2064	}
2065      if (child_task)
2066	{
2067	  thr->task = child_task;
2068	  if (__builtin_expect (child_task->fn == NULL, 0))
2069	    {
2070	      if (gomp_target_task_fn (child_task->fn_data))
2071		{
2072		  thr->task = task;
2073		  gomp_mutex_lock (&team->task_lock);
2074		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
2075		  struct gomp_target_task *ttask
2076		    = (struct gomp_target_task *) child_task->fn_data;
2077		  /* If GOMP_PLUGIN_target_task_completion has run already
2078		     in between gomp_target_task_fn and the mutex lock,
2079		     perform the requeuing here.  */
2080		  if (ttask->state == GOMP_TARGET_TASK_FINISHED)
2081		    gomp_target_task_completion (team, child_task);
2082		  else
2083		    ttask->state = GOMP_TARGET_TASK_RUNNING;
2084		  child_task = NULL;
2085		  continue;
2086		}
2087	    }
2088	  else
2089	    child_task->fn (child_task->fn_data);
2090	  thr->task = task;
2091	}
2092      else
2093	gomp_sem_wait (&taskgroup->taskgroup_sem);
2094      gomp_mutex_lock (&team->task_lock);
2095      if (child_task)
2096	{
2097	  if (child_task->detach_team)
2098	    {
2099	      assert (child_task->detach_team == team);
2100	      child_task->kind = GOMP_TASK_DETACHED;
2101	      ++team->task_detach_count;
2102	      gomp_debug (0,
2103			  "thread %d: task with event %p finished without "
2104			  "completion event fulfilled in taskgroup\n",
2105			  thr->ts.team_id, child_task);
2106	      child_task = NULL;
2107	      continue;
2108	    }
2109
2110	 finish_cancelled:;
2111	  size_t new_tasks
2112	    = gomp_task_run_post_handle_depend (child_task, team);
2113	  gomp_task_run_post_remove_parent (child_task);
2114	  gomp_clear_parent (&child_task->children_queue);
2115	  gomp_task_run_post_remove_taskgroup (child_task);
2116	  to_free = child_task;
2117	  child_task = NULL;
2118	  team->task_count--;
2119	  if (new_tasks > 1)
2120	    {
2121	      do_wake = team->nthreads - team->task_running_count
2122			- !task->in_tied_task;
2123	      if (do_wake > new_tasks)
2124		do_wake = new_tasks;
2125	    }
2126	}
2127    }
2128
2129 finish:
2130  task->taskgroup = taskgroup->prev;
2131  gomp_sem_destroy (&taskgroup->taskgroup_sem);
2132  free (taskgroup);
2133}
2134
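/* Register the task reduction descriptor array DATA (in the format described
   above GOMP_taskgroup_reduction_register below) for NTHREADS threads and
   chain it to the previously registered descriptors OLD, if any.  If ORIG is
   non-NULL (worksharing task reductions), the per-thread memory has already
   been allocated by the thread that encountered the construct first and is
   reused; otherwise it is allocated and zeroed here.  A hash table over the
   entries is built so that GOMP_task_reduction_remap can look them up.  */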
2135static inline __attribute__((always_inline)) void
2136gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig,
2137			 unsigned nthreads)
2138{
2139  size_t total_cnt = 0;
2140  uintptr_t *d = data;
2141  struct htab *old_htab = NULL, *new_htab;
2142  do
2143    {
2144      if (__builtin_expect (orig != NULL, 0))
2145	{
2146	  /* For worksharing task reductions, memory has been allocated
2147	     already by some other thread that encountered the construct
2148	     earlier.  */
2149	  d[2] = orig[2];
2150	  d[6] = orig[6];
2151	  orig = (uintptr_t *) orig[4];
2152	}
2153      else
2154	{
2155	  size_t sz = d[1] * nthreads;
2156	  /* Should use omp_alloc if d[3] is not -1.  */
2157	  void *ptr = gomp_aligned_alloc (d[2], sz);
2158	  memset (ptr, '\0', sz);
2159	  d[2] = (uintptr_t) ptr;
2160	  d[6] = d[2] + sz;
2161	}
2162      d[5] = 0;
2163      total_cnt += d[0];
2164      if (d[4] == 0)
2165	{
2166	  d[4] = (uintptr_t) old;
2167	  break;
2168	}
2169      else
2170	d = (uintptr_t *) d[4];
2171    }
2172  while (1);
2173  if (old && old[5])
2174    {
2175      old_htab = (struct htab *) old[5];
2176      total_cnt += htab_elements (old_htab);
2177    }
2178  new_htab = htab_create (total_cnt);
2179  if (old_htab)
2180    {
2181      /* Copy old hash table, like in htab_expand.  */
2182      hash_entry_type *p, *olimit;
2183      new_htab->n_elements = htab_elements (old_htab);
2184      olimit = old_htab->entries + old_htab->size;
2185      p = old_htab->entries;
2186      do
2187	{
2188	  hash_entry_type x = *p;
2189	  if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
2190	    *find_empty_slot_for_expand (new_htab, htab_hash (x)) = x;
2191	  p++;
2192	}
2193      while (p < olimit);
2194    }
2195  d = data;
2196  do
2197    {
2198      size_t j;
2199      for (j = 0; j < d[0]; ++j)
2200	{
2201	  uintptr_t *p = d + 7 + j * 3;
2202	  p[2] = (uintptr_t) d;
2203	  /* Ugly hack: hash_entry_type is defined for the task dependencies,
2204	     which hash on the first element, a pointer.  We need to hash
2205	     these entries too, on the first sizeof (uintptr_t) bytes, which
2206	     also contain a pointer.  Hide the cast from the compiler.  */
2207	  hash_entry_type n;
2208	  __asm ("" : "=g" (n) : "0" (p));
2209	  *htab_find_slot (&new_htab, n, INSERT) = n;
2210	}
2211      if (d[4] == (uintptr_t) old)
2212	break;
2213      else
2214	d = (uintptr_t *) d[4];
2215    }
2216  while (1);
2217  d[5] = (uintptr_t) new_htab;
2218}
2219
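/* Create an artificial team consisting of just the current thread, so that
   constructs which require a team and an implicit task (such as orphaned
   taskgroup reductions, see the callers below) can be used outside of a
   parallel region.  */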
2220static void
2221gomp_create_artificial_team (void)
2222{
2223  struct gomp_thread *thr = gomp_thread ();
2224  struct gomp_task_icv *icv;
2225  struct gomp_team *team = gomp_new_team (1);
2226  struct gomp_task *task = thr->task;
2227  struct gomp_task **implicit_task = &task;
2228  icv = task ? &task->icv : &gomp_global_icv;
2229  team->prev_ts = thr->ts;
2230  thr->ts.team = team;
2231  thr->ts.team_id = 0;
2232  thr->ts.work_share = &team->work_shares[0];
2233  thr->ts.last_work_share = NULL;
2234#ifdef HAVE_SYNC_BUILTINS
2235  thr->ts.single_count = 0;
2236#endif
2237  thr->ts.static_trip = 0;
2238  thr->task = &team->implicit_task[0];
2239  gomp_init_task (thr->task, NULL, icv);
2240  while (*implicit_task
2241	 && (*implicit_task)->kind != GOMP_TASK_IMPLICIT)
2242    implicit_task = &(*implicit_task)->parent;
2243  if (*implicit_task)
2244    {
2245      thr->task = *implicit_task;
2246      gomp_end_task ();
2247      free (*implicit_task);
2248      thr->task = &team->implicit_task[0];
2249    }
2250#ifdef LIBGOMP_USE_PTHREADS
2251  else
2252    pthread_setspecific (gomp_thread_destructor, thr);
2253#endif
2254  if (implicit_task != &task)
2255    {
2256      *implicit_task = thr->task;
2257      thr->task = task;
2258    }
2259}
2260
2261/* The format of data is:
2262   data[0]	cnt
2263   data[1]	size
2264   data[2]	alignment (on output array pointer)
2265   data[3]	allocator (-1 if malloc allocator)
2266   data[4]	next pointer
2267   data[5]	used internally (htab pointer)
2268   data[6]	used internally (end of array)
2269   cnt times
2270   ent[0]	address
2271   ent[1]	offset
2272   ent[2]	used internally (pointer to data[0])
2273   The entries are sorted by increasing offset, so that a binary
2274   search can be performed.  Normally, data[8] is 0; the exception is
2275   worksharing construct task reductions in a cancellable parallel,
2276   where at offset 0 there should be space for a pointer and an integer
2277   which are used internally.  */
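
/* Illustrative sketch of the user-level view (the descriptor array itself is
   always constructed by the compiler; sum stands for any user variable): for

     #pragma omp taskgroup task_reduction(+: sum)
     {
       #pragma omp task in_reduction(+: sum)
       sum += 1;
     }

   the compiler registers one descriptor array in the format above via
   GOMP_taskgroup_reduction_register, and tasks with in_reduction clauses
   obtain their thread-private copies through GOMP_task_reduction_remap.  */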
2278
2279void
2280GOMP_taskgroup_reduction_register (uintptr_t *data)
2281{
2282  struct gomp_thread *thr = gomp_thread ();
2283  struct gomp_team *team = thr->ts.team;
2284  struct gomp_task *task;
2285  unsigned nthreads;
2286  if (__builtin_expect (team == NULL, 0))
2287    {
2288      /* The task reduction code needs a team and task, so for
2289	 orphaned taskgroups just create the implicit team.  */
2290      gomp_create_artificial_team ();
2291      ialias_call (GOMP_taskgroup_start) ();
2292      team = thr->ts.team;
2293    }
2294  nthreads = team->nthreads;
2295  task = thr->task;
2296  gomp_reduction_register (data, task->taskgroup->reductions, NULL, nthreads);
2297  task->taskgroup->reductions = data;
2298}
2299
2300void
2301GOMP_taskgroup_reduction_unregister (uintptr_t *data)
2302{
2303  uintptr_t *d = data;
2304  htab_free ((struct htab *) data[5]);
2305  do
2306    {
2307      gomp_aligned_free ((void *) d[2]);
2308      d = (uintptr_t *) d[4];
2309    }
2310  while (d && !d[5]);
2311}
2312ialias (GOMP_taskgroup_reduction_unregister)
2313
2314/* For i = 0 to cnt-1, remap ptrs[i], which is either the address of the
2315   original list item or the address of a previously remapped original
2316   list item, to the address of the private copy, and store that to ptrs[i].
2317   For i < cntorig, additionally set ptrs[cnt+i] to the address of
2318   the original list item.  */
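
/* For example (continuing the illustrative task_reduction sketch above): if
   ptrs[0] is &sum on entry, on return ptrs[0] points at the calling thread's
   private slot for sum, computed below as base + team_id * per-thread size
   + offset within the chunk.  */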
2319
2320void
2321GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
2322{
2323  struct gomp_thread *thr = gomp_thread ();
2324  struct gomp_task *task = thr->task;
2325  unsigned id = thr->ts.team_id;
2326  uintptr_t *data = task->taskgroup->reductions;
2327  uintptr_t *d;
2328  struct htab *reduction_htab = (struct htab *) data[5];
2329  size_t i;
2330  for (i = 0; i < cnt; ++i)
2331    {
2332      hash_entry_type ent, n;
2333      __asm ("" : "=g" (ent) : "0" (ptrs + i));
2334      n = htab_find (reduction_htab, ent);
2335      if (n)
2336	{
2337	  uintptr_t *p;
2338	  __asm ("" : "=g" (p) : "0" (n));
2339	  /* At this point, p[0] should be equal to (uintptr_t) ptrs[i],
2340	     p[1] is the offset within the allocated chunk for each
2341	     thread, p[2] is the array registered with
2342	     GOMP_taskgroup_reduction_register, d[2] is the base of the
2343	     allocated memory and d[1] is the size of the allocated chunk
2344	     for one thread.  */
2345	  d = (uintptr_t *) p[2];
2346	  ptrs[i] = (void *) (d[2] + id * d[1] + p[1]);
2347	  if (__builtin_expect (i < cntorig, 0))
2348	    ptrs[cnt + i] = (void *) p[0];
2349	  continue;
2350	}
2351      d = data;
2352      while (d != NULL)
2353	{
2354	  if ((uintptr_t) ptrs[i] >= d[2] && (uintptr_t) ptrs[i] < d[6])
2355	    break;
2356	  d = (uintptr_t *) d[4];
2357	}
2358      if (d == NULL)
2359	gomp_fatal ("couldn't find matching task_reduction or reduction with "
2360		    "task modifier for %p", ptrs[i]);
2361      uintptr_t off = ((uintptr_t) ptrs[i] - d[2]) % d[1];
2362      ptrs[i] = (void *) (d[2] + id * d[1] + off);
2363      if (__builtin_expect (i < cntorig, 0))
2364	{
2365	  size_t lo = 0, hi = d[0] - 1;
2366	  while (lo <= hi)
2367	    {
2368	      size_t m = (lo + hi) / 2;
2369	      if (d[7 + 3 * m + 1] < off)
2370		lo = m + 1;
2371	      else if (d[7 + 3 * m + 1] == off)
2372		{
2373		  ptrs[cnt + i] = (void *) d[7 + 3 * m];
2374		  break;
2375		}
2376	      else
2377		hi = m - 1;
2378	    }
2379	  if (lo > hi)
2380	    gomp_fatal ("couldn't find matching task_reduction or reduction "
2381			"with task modifier for %p", ptrs[i]);
2382	}
2383    }
2384}
2385
2386struct gomp_taskgroup *
2387gomp_parallel_reduction_register (uintptr_t *data, unsigned nthreads)
2388{
2389  struct gomp_taskgroup *taskgroup = gomp_taskgroup_init (NULL);
2390  gomp_reduction_register (data, NULL, NULL, nthreads);
2391  taskgroup->reductions = data;
2392  return taskgroup;
2393}
2394
2395void
2396gomp_workshare_task_reduction_register (uintptr_t *data, uintptr_t *orig)
2397{
2398  struct gomp_thread *thr = gomp_thread ();
2399  struct gomp_team *team = thr->ts.team;
2400  struct gomp_task *task = thr->task;
2401  unsigned nthreads = team->nthreads;
2402  gomp_reduction_register (data, task->taskgroup->reductions, orig, nthreads);
2403  task->taskgroup->reductions = data;
2404}
2405
2406void
2407gomp_workshare_taskgroup_start (void)
2408{
2409  struct gomp_thread *thr = gomp_thread ();
2410  struct gomp_team *team = thr->ts.team;
2411  struct gomp_task *task;
2412
2413  if (team == NULL)
2414    {
2415      gomp_create_artificial_team ();
2416      team = thr->ts.team;
2417    }
2418  task = thr->task;
2419  task->taskgroup = gomp_taskgroup_init (task->taskgroup);
2420  task->taskgroup->workshare = true;
2421}
2422
2423void
2424GOMP_workshare_task_reduction_unregister (bool cancelled)
2425{
2426  struct gomp_thread *thr = gomp_thread ();
2427  struct gomp_task *task = thr->task;
2428  struct gomp_team *team = thr->ts.team;
2429  uintptr_t *data = task->taskgroup->reductions;
2430  ialias_call (GOMP_taskgroup_end) ();
2431  if (thr->ts.team_id == 0)
2432    ialias_call (GOMP_taskgroup_reduction_unregister) (data);
2433  else
2434    htab_free ((struct htab *) data[5]);
2435
2436  if (!cancelled)
2437    gomp_team_barrier_wait (&team->barrier);
2438}
2439
2440int
2441omp_in_final (void)
2442{
2443  struct gomp_thread *thr = gomp_thread ();
2444  return thr->task && thr->task->final_task;
2445}
2446
2447ialias (omp_in_final)
2448
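/* omp_fulfill_event completes the completion event of a task that carries a
   detach clause.  An illustrative user-level sketch (async_start is a
   placeholder for some user mechanism that later triggers the fulfilment;
   it is not part of libgomp):

     omp_event_handle_t ev;
     #pragma omp task detach(ev)
     async_start (ev);

     ...
     omp_fulfill_event (ev);   // allows the detached task to complete

   The handle value encodes a pointer to the underlying struct gomp_task,
   as the cast below shows.  */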
2449void
2450omp_fulfill_event (omp_event_handle_t event)
2451{
2452  struct gomp_task *task = (struct gomp_task *) event;
2453  if (!task->deferred_p)
2454    {
2455      if (gomp_sem_getcount (task->completion_sem) > 0)
2456	gomp_fatal ("omp_fulfill_event: %p event already fulfilled!\n", task);
2457
2458      gomp_debug (0, "omp_fulfill_event: %p event for undeferred task\n",
2459		  task);
2460      gomp_sem_post (task->completion_sem);
2461      return;
2462    }
2463
2464  struct gomp_team *team = __atomic_load_n (&task->detach_team,
2465					    MEMMODEL_RELAXED);
2466  if (!team)
2467    gomp_fatal ("omp_fulfill_event: %p event is invalid or has already "
2468		"been fulfilled!\n", task);
2469
2470  gomp_mutex_lock (&team->task_lock);
2471  if (task->kind != GOMP_TASK_DETACHED)
2472    {
2473      /* The task has not finished running yet.  */
2474      gomp_debug (0,
2475		  "omp_fulfill_event: %p event fulfilled for unfinished "
2476		  "task\n", task);
2477      __atomic_store_n (&task->detach_team, NULL, MEMMODEL_RELAXED);
2478      gomp_mutex_unlock (&team->task_lock);
2479      return;
2480    }
2481
2482  gomp_debug (0, "omp_fulfill_event: %p event fulfilled for finished task\n",
2483	      task);
2484  size_t new_tasks = gomp_task_run_post_handle_depend (task, team);
2485  gomp_task_run_post_remove_parent (task);
2486  gomp_clear_parent (&task->children_queue);
2487  gomp_task_run_post_remove_taskgroup (task);
2488  team->task_count--;
2489  team->task_detach_count--;
2490
2491  int do_wake = 0;
2492  bool shackled_thread_p = team == gomp_thread ()->ts.team;
2493  if (new_tasks > 0)
2494    {
2495      /* Wake up threads to run new tasks.  */
2496      gomp_team_barrier_set_task_pending (&team->barrier);
2497      do_wake = team->nthreads - team->task_running_count;
2498      if (do_wake > new_tasks)
2499	do_wake = new_tasks;
2500    }
2501
2502  if (!shackled_thread_p
2503      && !do_wake
2504      && team->task_detach_count == 0
2505      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
2506    /* Ensure that at least one thread is woken up to signal that the
2507       barrier can finish.  */
2508    do_wake = 1;
2509
2510  /* If we are running in an unshackled thread, the team might vanish before
2511     gomp_team_barrier_wake is run if we release the lock first, so keep the
2512     lock for the call in that case.  */
2513  if (shackled_thread_p)
2514    gomp_mutex_unlock (&team->task_lock);
2515  if (do_wake)
2516    gomp_team_barrier_wake (&team->barrier, do_wake);
2517  if (!shackled_thread_p)
2518    gomp_mutex_unlock (&team->task_lock);
2519
2520  gomp_finish_task (task);
2521  free (task);
2522}
2523
2524ialias (omp_fulfill_event)
2525