/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the maintenance of threads in response to team
   creation and termination.  */

#include "libgomp.h"
#include "pool.h"
#include <stdlib.h>
#include <string.h>

#ifdef LIBGOMP_USE_PTHREADS
pthread_attr_t gomp_thread_attr;

/* This key is for the thread destructor.  */
pthread_key_t gomp_thread_destructor;


/* This is the libgomp per-thread data structure.  */
#if defined HAVE_TLS || defined USE_EMUTLS
__thread struct gomp_thread gomp_tls_data;
#else
pthread_key_t gomp_tls_key;
#endif


/* This structure is used to communicate across pthread_create.  */

struct gomp_thread_start_data
{
  void (*fn) (void *);
  void *fn_data;
  struct gomp_team_state ts;
  struct gomp_task *task;
  struct gomp_thread_pool *thread_pool;
  unsigned int place;
  bool nested;
  pthread_t handle;
};


/* This function is a pthread_create entry point.  This contains the idle
   loop in which a thread waits to be called up to become part of a team.  */

static void *
gomp_thread_start (void *xdata)
{
  struct gomp_thread_start_data *data = xdata;
  struct gomp_thread *thr;
  struct gomp_thread_pool *pool;
  void (*local_fn) (void *);
  void *local_data;

#if defined HAVE_TLS || defined USE_EMUTLS
  thr = &gomp_tls_data;
#else
  struct gomp_thread local_thr;
  thr = &local_thr;
  pthread_setspecific (gomp_tls_key, thr);
#endif
  gomp_sem_init (&thr->release, 0);

  /* Extract what we need from data.  */
  local_fn = data->fn;
  local_data = data->fn_data;
  thr->thread_pool = data->thread_pool;
  thr->ts = data->ts;
  thr->task = data->task;
  thr->place = data->place;
#ifdef GOMP_NEEDS_THREAD_HANDLE
  thr->handle = data->handle;
#endif

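  /* Publish this thread's release semaphore so other members of the team
     (e.g. the ordered construct) can post it to wake this thread.  */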
  thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;

  /* Make thread pool local. */
  pool = thr->thread_pool;

  if (data->nested)
    {
      struct gomp_team *team = thr->ts.team;
      struct gomp_task *task = thr->task;

      gomp_barrier_wait (&team->barrier);

      local_fn (local_data);
      gomp_team_barrier_wait_final (&team->barrier);
      gomp_finish_task (task);
      gomp_barrier_wait_last (&team->barrier);
    }
  else
    {
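      /* Non-nested worker: register in the pool and dock on the pool
	 barrier, then keep serving successive teams until released
	 with FN == NULL.  */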
      pool->threads[thr->ts.team_id] = thr;

      gomp_simple_barrier_wait (&pool->threads_dock);
      do
	{
	  struct gomp_team *team = thr->ts.team;
	  struct gomp_task *task = thr->task;

	  local_fn (local_data);
	  gomp_team_barrier_wait_final (&team->barrier);
	  gomp_finish_task (task);

	  gomp_simple_barrier_wait (&pool->threads_dock);

	  local_fn = thr->fn;
	  local_data = thr->data;
	  thr->fn = NULL;
	}
      while (local_fn);
    }

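  /* Either the nested team has finished or we were undocked without new
     work (FN == NULL); tear down the per-thread state and let the thread
     terminate.  */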
  gomp_sem_destroy (&thr->release);
  pthread_detach (pthread_self ());
  thr->thread_pool = NULL;
  thr->task = NULL;
  return NULL;
}
#endif

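/* Return the last team cached in the thread pool if it was created for
   the same number of threads; reuse is only attempted when the calling
   thread is not already part of a team.  */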
static inline struct gomp_team *
get_last_team (unsigned nthreads)
{
  struct gomp_thread *thr = gomp_thread ();
  if (thr->ts.team == NULL)
    {
      struct gomp_thread_pool *pool = gomp_get_thread_pool (thr, nthreads);
      struct gomp_team *last_team = pool->last_team;
      if (last_team != NULL && last_team->nthreads == nthreads)
        {
          pool->last_team = NULL;
          return last_team;
        }
    }
  return NULL;
}

/* Create a new team data structure.  */

struct gomp_team *
gomp_new_team (unsigned nthreads)
{
  struct gomp_team *team;
  int i;

  team = get_last_team (nthreads);
  if (team == NULL)
    {
      size_t extra = sizeof (team->ordered_release[0])
		     + sizeof (team->implicit_task[0]);
      team = team_malloc (sizeof (*team) + nthreads * extra);

#ifndef HAVE_SYNC_BUILTINS
      gomp_mutex_init (&team->work_share_list_free_lock);
#endif
      gomp_barrier_init (&team->barrier, nthreads);
      gomp_mutex_init (&team->task_lock);

      team->nthreads = nthreads;
    }

  team->work_share_chunk = 8;
#ifdef HAVE_SYNC_BUILTINS
  team->single_count = 0;
#endif
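  /* Set up the first work share inline in the team and chain the remaining
     pre-allocated work shares (indices 1 through 7) into the allocation
     free list; WORK_SHARE_CHUNK above matches the size of that array.  */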
  team->work_shares_to_free = &team->work_shares[0];
  gomp_init_work_share (&team->work_shares[0], 0, nthreads);
  team->work_shares[0].next_alloc = NULL;
  team->work_share_list_free = NULL;
  team->work_share_list_alloc = &team->work_shares[1];
  for (i = 1; i < 7; i++)
    team->work_shares[i].next_free = &team->work_shares[i + 1];
  team->work_shares[i].next_free = NULL;

  gomp_sem_init (&team->master_release, 0);
  team->ordered_release = (void *) &team->implicit_task[nthreads];
  team->ordered_release[0] = &team->master_release;

  priority_queue_init (&team->task_queue);
  team->task_count = 0;
  team->task_queued_count = 0;
  team->task_running_count = 0;
  team->work_share_cancelled = 0;
  team->team_cancelled = 0;

  return team;
}


/* Free a team data structure.  */

static void
free_team (struct gomp_team *team)
{
#ifndef HAVE_SYNC_BUILTINS
  gomp_mutex_destroy (&team->work_share_list_free_lock);
#endif
  gomp_barrier_destroy (&team->barrier);
  gomp_mutex_destroy (&team->task_lock);
  priority_queue_free (&team->task_queue);
  team_free (team);
}

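/* Helper run by each docked pool thread when the pool is being torn down:
   wait on the dock barrier one last time, release per-thread resources and
   terminate the thread.  */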
static void
gomp_free_pool_helper (void *thread_pool)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_thread_pool *pool
    = (struct gomp_thread_pool *) thread_pool;
  gomp_simple_barrier_wait_last (&pool->threads_dock);
  gomp_sem_destroy (&thr->release);
  thr->thread_pool = NULL;
  thr->task = NULL;
#ifdef LIBGOMP_USE_PTHREADS
  pthread_detach (pthread_self ());
  pthread_exit (NULL);
#elif defined(__nvptx__)
  asm ("exit;");
#elif defined(__AMDGCN__)
  asm ("s_dcache_wb\n\t"
       "s_endpgm");
#else
#error gomp_free_pool_helper must terminate the thread
#endif
}

/* Free a thread pool and release its threads. */

void
gomp_free_thread (void *arg __attribute__((unused)))
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_thread_pool *pool = thr->thread_pool;
  if (pool)
    {
      if (pool->threads_used > 0)
	{
	  int i;
	  for (i = 1; i < pool->threads_used; i++)
	    {
	      struct gomp_thread *nthr = pool->threads[i];
	      nthr->fn = gomp_free_pool_helper;
	      nthr->data = pool;
	    }
	  /* This barrier undocks threads docked on pool->threads_dock.  */
	  gomp_simple_barrier_wait (&pool->threads_dock);
	  /* And this waits until all threads have called
	     gomp_simple_barrier_wait_last in gomp_free_pool_helper.  */
	  gomp_simple_barrier_wait (&pool->threads_dock);
	  /* Now it is safe to destroy the barrier and free the pool.  */
	  gomp_simple_barrier_destroy (&pool->threads_dock);

#ifdef HAVE_SYNC_BUILTINS
	  __sync_fetch_and_add (&gomp_managed_threads,
				1L - pool->threads_used);
#else
	  gomp_mutex_lock (&gomp_managed_threads_lock);
	  gomp_managed_threads -= pool->threads_used - 1L;
	  gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
	}
      if (pool->last_team)
	free_team (pool->last_team);
#ifndef __nvptx__
      team_free (pool->threads);
      team_free (pool);
#endif
      thr->thread_pool = NULL;
    }
  if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
    gomp_team_end ();
  if (thr->task != NULL)
    {
      struct gomp_task *task = thr->task;
      gomp_end_task ();
      free (task);
    }
}

/* Launch a team.  */

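/* For reference, a host parallel region drives the routines in this file
   roughly as follows (a simplified sketch, not verbatim from parallel.c):

     struct gomp_team *team = gomp_new_team (num_threads);
     gomp_team_start (fn, data, num_threads, flags, team, NULL);
     fn (data);         // the master runs its share of the region
     gomp_team_end ();  // wait for the team and tear it down

   The worker threads run FN from gomp_thread_start above.  */
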
#ifdef LIBGOMP_USE_PTHREADS
void
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
		 unsigned flags, struct gomp_team *team,
		 struct gomp_taskgroup *taskgroup)
{
  struct gomp_thread_start_data *start_data;
  struct gomp_thread *thr, *nthr;
  struct gomp_task *task;
  struct gomp_task_icv *icv;
  bool nested;
  struct gomp_thread_pool *pool;
  unsigned i, n, old_threads_used = 0;
  pthread_attr_t thread_attr, *attr;
  unsigned long nthreads_var;
  char bind, bind_var;
  unsigned int s = 0, rest = 0, p = 0, k = 0;
  unsigned int affinity_count = 0;
  struct gomp_thread **affinity_thr = NULL;
  bool force_display = false;

  thr = gomp_thread ();
  nested = thr->ts.level;
  pool = thr->thread_pool;
  task = thr->task;
  icv = task ? &task->icv : &gomp_global_icv;
  if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
    {
      gomp_init_affinity ();
      if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1)
	gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
				      thr->place);
    }

  /* Always save the previous state, even if this isn't a nested team.
     In particular, we should save any work share state from an outer
     orphaned work share construct.  */
  team->prev_ts = thr->ts;

  thr->ts.team = team;
  thr->ts.team_id = 0;
  ++thr->ts.level;
  if (nthreads > 1)
    ++thr->ts.active_level;
  thr->ts.work_share = &team->work_shares[0];
  thr->ts.last_work_share = NULL;
#ifdef HAVE_SYNC_BUILTINS
  thr->ts.single_count = 0;
#endif
  thr->ts.static_trip = 0;
  thr->task = &team->implicit_task[0];
#ifdef GOMP_NEEDS_THREAD_HANDLE
  thr->handle = pthread_self ();
#endif
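  /* Resolve the nthreads-var and bind-var ICVs the implicit tasks of this
     team will use: per-nesting-level lists (OMP_NUM_THREADS, OMP_PROC_BIND)
     supply values for this level, and the low bits of FLAGS (FLAGS & 7)
     carry any proc_bind clause from the parallel directive.  */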
  nthreads_var = icv->nthreads_var;
  if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
      && thr->ts.level < gomp_nthreads_var_list_len)
    nthreads_var = gomp_nthreads_var_list[thr->ts.level];
  bind_var = icv->bind_var;
  if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
    bind_var = flags & 7;
  bind = bind_var;
  if (__builtin_expect (gomp_bind_var_list != NULL, 0)
      && thr->ts.level < gomp_bind_var_list_len)
    bind_var = gomp_bind_var_list[thr->ts.level];
  gomp_init_task (thr->task, task, icv);
  thr->task->taskgroup = taskgroup;
  team->implicit_task[0].icv.nthreads_var = nthreads_var;
  team->implicit_task[0].icv.bind_var = bind_var;

  if (nthreads == 1)
    return;

  i = 1;

  if (__builtin_expect (gomp_places_list != NULL, 0))
    {
      /* Depending on chosen proc_bind model, set subpartition
	 for the master thread and initialize helper variables
	 P and optionally S, K and/or REST used by later place
	 computation for each additional thread.  */
      p = thr->place - 1;
      switch (bind)
	{
	case omp_proc_bind_true:
	case omp_proc_bind_close:
	  if (nthreads > thr->ts.place_partition_len)
	    {
	      /* T > P.  S threads will be placed in each place,
		 and the final REST threads placed one by one
		 into the already occupied places.  */
	      s = nthreads / thr->ts.place_partition_len;
	      rest = nthreads % thr->ts.place_partition_len;
	    }
	  else
	    s = 1;
	  k = 1;
	  break;
	case omp_proc_bind_master:
	  /* Each thread will be bound to master's place.  */
	  break;
	case omp_proc_bind_spread:
	  if (nthreads <= thr->ts.place_partition_len)
	    {
	      /* T <= P.  Each subpartition will have between s and s+1
		 places (subpartitions starting at or after rest will
		 have s places, earlier ones s+1 places), and each thread
		 will be bound to the first place in its subpartition
		 (except for the master thread, which can be bound to
		 another place in its subpartition).  */
	      s = thr->ts.place_partition_len / nthreads;
	      rest = thr->ts.place_partition_len % nthreads;
	      rest = (s + 1) * rest + thr->ts.place_partition_off;
	      if (p < rest)
		{
		  p -= (p - thr->ts.place_partition_off) % (s + 1);
		  thr->ts.place_partition_len = s + 1;
		}
	      else
		{
		  p -= (p - rest) % s;
		  thr->ts.place_partition_len = s;
		}
	      thr->ts.place_partition_off = p;
	    }
	  else
	    {
	      /* T > P.  Each subpartition will have just a single
		 place and we'll place between s and s+1
		 threads into each subpartition.  */
	      s = nthreads / thr->ts.place_partition_len;
	      rest = nthreads % thr->ts.place_partition_len;
	      thr->ts.place_partition_off = p;
	      thr->ts.place_partition_len = 1;
	      k = 1;
	    }
	  break;
	}
    }
  else
    bind = omp_proc_bind_false;

  /* We only allow the reuse of idle threads for non-nested PARALLEL
     regions.  This appears to be implied by the semantics of
     threadprivate variables, but perhaps that's reading too much into
     things.  Certainly it does prevent any locking problems, since
     only the initial program thread will modify gomp_threads.  */
  if (!nested)
    {
      old_threads_used = pool->threads_used;

      if (nthreads <= old_threads_used)
	n = nthreads;
      else if (old_threads_used == 0)
	{
	  n = 0;
	  gomp_simple_barrier_init (&pool->threads_dock, nthreads);
	}
      else
	{
	  n = old_threads_used;

	  /* Increase the barrier threshold to make sure all new
	     threads arrive before the team is released.  */
	  gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
	}

      /* Not true yet, but soon will be.  We're going to release all
	 threads from the dock, and those that aren't part of the
	 team will exit.  */
      pool->threads_used = nthreads;

      /* If necessary, expand the size of the gomp_threads array.  It is
	 expected that changes in the number of threads are rare, thus we
	 make no effort to expand gomp_threads_size geometrically.  */
      if (nthreads >= pool->threads_size)
	{
	  pool->threads_size = nthreads + 1;
	  pool->threads
	    = gomp_realloc (pool->threads,
			    pool->threads_size
			    * sizeof (struct gomp_thread *));
	  /* Add current (master) thread to threads[].  */
	  pool->threads[0] = thr;
	}

      /* Release existing idle threads.  */
      for (; i < n; ++i)
	{
	  unsigned int place_partition_off = thr->ts.place_partition_off;
	  unsigned int place_partition_len = thr->ts.place_partition_len;
	  unsigned int place = 0;
	  if (__builtin_expect (gomp_places_list != NULL, 0))
	    {
	      switch (bind)
		{
		case omp_proc_bind_true:
		case omp_proc_bind_close:
		  if (k == s)
		    {
		      ++p;
		      if (p == (team->prev_ts.place_partition_off
				+ team->prev_ts.place_partition_len))
			p = team->prev_ts.place_partition_off;
		      k = 1;
		      if (i == nthreads - rest)
			s = 1;
		    }
		  else
		    ++k;
		  break;
		case omp_proc_bind_master:
		  break;
		case omp_proc_bind_spread:
		  if (k == 0)
		    {
		      /* T <= P.  */
		      if (p < rest)
			p += s + 1;
		      else
			p += s;
		      if (p == (team->prev_ts.place_partition_off
				+ team->prev_ts.place_partition_len))
			p = team->prev_ts.place_partition_off;
		      place_partition_off = p;
		      if (p < rest)
			place_partition_len = s + 1;
		      else
			place_partition_len = s;
		    }
		  else
		    {
		      /* T > P.  */
		      if (k == s)
			{
			  ++p;
			  if (p == (team->prev_ts.place_partition_off
				    + team->prev_ts.place_partition_len))
			    p = team->prev_ts.place_partition_off;
			  k = 1;
			  if (i == nthreads - rest)
			    s = 1;
			}
		      else
			++k;
		      place_partition_off = p;
		      place_partition_len = 1;
		    }
		  break;
		}
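	      /* If the pool thread in this slot is not already bound to a
		 place inside the partition chosen for team member I, stash
		 the remaining old threads into AFFINITY_THR keyed by place
		 so they can be matched to the places this team wants.  */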
	      if (affinity_thr != NULL
		  || (bind != omp_proc_bind_true
		      && pool->threads[i]->place != p + 1)
		  || pool->threads[i]->place <= place_partition_off
		  || pool->threads[i]->place > (place_partition_off
						+ place_partition_len))
		{
		  unsigned int l;
		  force_display = true;
		  if (affinity_thr == NULL)
		    {
		      unsigned int j;

		      if (team->prev_ts.place_partition_len > 64)
			affinity_thr
			  = gomp_malloc (team->prev_ts.place_partition_len
					 * sizeof (struct gomp_thread *));
		      else
			affinity_thr
			  = gomp_alloca (team->prev_ts.place_partition_len
					 * sizeof (struct gomp_thread *));
		      memset (affinity_thr, '\0',
			      team->prev_ts.place_partition_len
			      * sizeof (struct gomp_thread *));
		      for (j = i; j < old_threads_used; j++)
			{
			  if (pool->threads[j]->place
			      > team->prev_ts.place_partition_off
			      && (pool->threads[j]->place
				  <= (team->prev_ts.place_partition_off
				      + team->prev_ts.place_partition_len)))
			    {
			      l = pool->threads[j]->place - 1
				  - team->prev_ts.place_partition_off;
			      pool->threads[j]->data = affinity_thr[l];
			      affinity_thr[l] = pool->threads[j];
			    }
			  pool->threads[j] = NULL;
			}
		      if (nthreads > old_threads_used)
			memset (&pool->threads[old_threads_used],
				'\0', ((nthreads - old_threads_used)
				       * sizeof (struct gomp_thread *)));
		      n = nthreads;
		      affinity_count = old_threads_used - i;
		    }
		  if (affinity_count == 0)
		    break;
		  l = p;
		  if (affinity_thr[l - team->prev_ts.place_partition_off]
		      == NULL)
		    {
		      if (bind != omp_proc_bind_true)
			continue;
		      for (l = place_partition_off;
			   l < place_partition_off + place_partition_len;
			   l++)
			if (affinity_thr[l - team->prev_ts.place_partition_off]
			    != NULL)
			  break;
		      if (l == place_partition_off + place_partition_len)
			continue;
		    }
		  nthr = affinity_thr[l - team->prev_ts.place_partition_off];
		  affinity_thr[l - team->prev_ts.place_partition_off]
		    = (struct gomp_thread *) nthr->data;
		  affinity_count--;
		  pool->threads[i] = nthr;
		}
	      else
		nthr = pool->threads[i];
	      place = p + 1;
	    }
	  else
	    nthr = pool->threads[i];
	  nthr->ts.team = team;
	  nthr->ts.work_share = &team->work_shares[0];
	  nthr->ts.last_work_share = NULL;
	  nthr->ts.team_id = i;
	  nthr->ts.level = team->prev_ts.level + 1;
	  nthr->ts.active_level = thr->ts.active_level;
	  nthr->ts.place_partition_off = place_partition_off;
	  nthr->ts.place_partition_len = place_partition_len;
#ifdef HAVE_SYNC_BUILTINS
	  nthr->ts.single_count = 0;
#endif
	  nthr->ts.static_trip = 0;
	  nthr->task = &team->implicit_task[i];
	  nthr->place = place;
	  gomp_init_task (nthr->task, task, icv);
	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
	  team->implicit_task[i].icv.bind_var = bind_var;
	  nthr->task->taskgroup = taskgroup;
	  nthr->fn = fn;
	  nthr->data = data;
	  team->ordered_release[i] = &nthr->release;
	}

      if (__builtin_expect (affinity_thr != NULL, 0))
	{
	  /* If AFFINITY_THR is non-NULL just because we had to
	     permute some threads in the pool, but we've managed
	     to find exactly as many old threads as we'd find
	     without affinity, we don't need to handle this
	     specially anymore.  */
	  if (nthreads <= old_threads_used
	      ? (affinity_count == old_threads_used - nthreads)
	      : (i == old_threads_used))
	    {
	      if (team->prev_ts.place_partition_len > 64)
		free (affinity_thr);
	      affinity_thr = NULL;
	      affinity_count = 0;
	    }
	  else
	    {
	      i = 1;
	      /* We are going to compute the places/subpartitions
		 again from the beginning, so we need to reinitialize
		 the vars modified by the switch (bind) above inside
		 the loop to the state they had after the initial
		 switch (bind).  */
	      switch (bind)
		{
		case omp_proc_bind_true:
		case omp_proc_bind_close:
		  if (nthreads > thr->ts.place_partition_len)
		    /* T > P.  S has been changed, so needs
		       to be recomputed.  */
		    s = nthreads / thr->ts.place_partition_len;
		  k = 1;
		  p = thr->place - 1;
		  break;
		case omp_proc_bind_master:
		  /* No vars have been changed.  */
		  break;
		case omp_proc_bind_spread:
		  p = thr->ts.place_partition_off;
		  if (k != 0)
		    {
		      /* T > P.  */
		      s = nthreads / team->prev_ts.place_partition_len;
		      k = 1;
		    }
		  break;
		}

	      /* Increase the barrier threshold to make sure all new
		 threads and all the threads we're going to let die
		 arrive before the team is released.  */
	      if (affinity_count)
		gomp_simple_barrier_reinit (&pool->threads_dock,
					    nthreads + affinity_count);
	    }
	}

      if (i == nthreads)
	goto do_release;

    }

  if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
    {
      long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;

      if (old_threads_used == 0)
	--diff;

#ifdef HAVE_SYNC_BUILTINS
      __sync_fetch_and_add (&gomp_managed_threads, diff);
#else
      gomp_mutex_lock (&gomp_managed_threads_lock);
      gomp_managed_threads += diff;
      gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
    }

  attr = &gomp_thread_attr;
  if (__builtin_expect (gomp_places_list != NULL, 0))
    {
      size_t stacksize;
      pthread_attr_init (&thread_attr);
      if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize))
	pthread_attr_setstacksize (&thread_attr, stacksize);
      attr = &thread_attr;
    }

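  /* Thread-start descriptors for the NTHREADS - I threads still to be
     created are carved out of the master's stack; each new thread copies
     what it needs from its descriptor in gomp_thread_start.  */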
  start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
			    * (nthreads - i));

  /* Launch new threads.  */
  for (; i < nthreads; ++i)
    {
      int err;

      start_data->ts.place_partition_off = thr->ts.place_partition_off;
      start_data->ts.place_partition_len = thr->ts.place_partition_len;
      start_data->place = 0;
      if (__builtin_expect (gomp_places_list != NULL, 0))
	{
	  switch (bind)
	    {
	    case omp_proc_bind_true:
	    case omp_proc_bind_close:
	      if (k == s)
		{
		  ++p;
		  if (p == (team->prev_ts.place_partition_off
			    + team->prev_ts.place_partition_len))
		    p = team->prev_ts.place_partition_off;
		  k = 1;
		  if (i == nthreads - rest)
		    s = 1;
		}
	      else
		++k;
	      break;
	    case omp_proc_bind_master:
	      break;
	    case omp_proc_bind_spread:
	      if (k == 0)
		{
		  /* T <= P.  */
		  if (p < rest)
		    p += s + 1;
		  else
		    p += s;
		  if (p == (team->prev_ts.place_partition_off
			    + team->prev_ts.place_partition_len))
		    p = team->prev_ts.place_partition_off;
		  start_data->ts.place_partition_off = p;
		  if (p < rest)
		    start_data->ts.place_partition_len = s + 1;
		  else
		    start_data->ts.place_partition_len = s;
		}
	      else
		{
		  /* T > P.  */
		  if (k == s)
		    {
		      ++p;
		      if (p == (team->prev_ts.place_partition_off
				+ team->prev_ts.place_partition_len))
			p = team->prev_ts.place_partition_off;
		      k = 1;
		      if (i == nthreads - rest)
			s = 1;
		    }
		  else
		    ++k;
		  start_data->ts.place_partition_off = p;
		  start_data->ts.place_partition_len = 1;
		}
	      break;
	    }
	  start_data->place = p + 1;
	  if (affinity_thr != NULL && pool->threads[i] != NULL)
	    continue;
	  gomp_init_thread_affinity (attr, p);
	}

      start_data->fn = fn;
      start_data->fn_data = data;
      start_data->ts.team = team;
      start_data->ts.work_share = &team->work_shares[0];
      start_data->ts.last_work_share = NULL;
      start_data->ts.team_id = i;
      start_data->ts.level = team->prev_ts.level + 1;
      start_data->ts.active_level = thr->ts.active_level;
#ifdef HAVE_SYNC_BUILTINS
      start_data->ts.single_count = 0;
#endif
      start_data->ts.static_trip = 0;
      start_data->task = &team->implicit_task[i];
      gomp_init_task (start_data->task, task, icv);
      team->implicit_task[i].icv.nthreads_var = nthreads_var;
      team->implicit_task[i].icv.bind_var = bind_var;
      start_data->task->taskgroup = taskgroup;
      start_data->thread_pool = pool;
      start_data->nested = nested;

      attr = gomp_adjust_thread_attr (attr, &thread_attr);
      err = pthread_create (&start_data->handle, attr, gomp_thread_start,
			    start_data);
      start_data++;
      if (err != 0)
	gomp_fatal ("Thread creation failed: %s", strerror (err));
    }

  if (__builtin_expect (attr == &thread_attr, 0))
    pthread_attr_destroy (&thread_attr);

 do_release:
  if (nested)
    gomp_barrier_wait (&team->barrier);
  else
    gomp_simple_barrier_wait (&pool->threads_dock);

  /* Decrease the barrier threshold to match the number of threads
     that should arrive back at the end of this team.  The extra
     threads should be exiting.  Note that we arrange for this test
     to never be true for nested teams.  If AFFINITY_COUNT is non-zero,
     the barrier as well as gomp_managed_threads was temporarily
     set to NTHREADS + AFFINITY_COUNT.  For NTHREADS < OLD_THREADS_USED,
     AFFINITY_COUNT, if non-zero, will always be at least
     OLD_THREADS_USED - NTHREADS.  */
  if (__builtin_expect (nthreads < old_threads_used, 0)
      || __builtin_expect (affinity_count, 0))
    {
      long diff = (long) nthreads - (long) old_threads_used;

      if (affinity_count)
	diff = -affinity_count;

      gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);

#ifdef HAVE_SYNC_BUILTINS
      __sync_fetch_and_add (&gomp_managed_threads, diff);
#else
      gomp_mutex_lock (&gomp_managed_threads_lock);
      gomp_managed_threads += diff;
      gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
    }
  if (__builtin_expect (gomp_display_affinity_var, 0))
    {
      if (nested
	  || nthreads != old_threads_used
	  || force_display)
	{
	  gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
					thr->place);
	  if (nested)
	    {
	      start_data -= nthreads - 1;
	      for (i = 1; i < nthreads; ++i)
		{
		  gomp_display_affinity_thread (
#ifdef LIBGOMP_USE_PTHREADS
						start_data->handle,
#else
						gomp_thread_self (),
#endif
						&start_data->ts,
						start_data->place);
		  start_data++;
		}
	    }
	  else
	    {
	      for (i = 1; i < nthreads; ++i)
		{
		  gomp_thread_handle handle
		    = gomp_thread_to_pthread_t (pool->threads[i]);
		  gomp_display_affinity_thread (handle, &pool->threads[i]->ts,
						pool->threads[i]->place);
		}
	    }
	}
    }
  if (__builtin_expect (affinity_thr != NULL, 0)
      && team->prev_ts.place_partition_len > 64)
    free (affinity_thr);
}
#endif


/* Terminate the current team.  This is only to be called by the master
   thread.  We assume that we must wait for the other threads.  */

void
gomp_team_end (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  /* This barrier handles all pending explicit threads.
     Because #pragma omp cancel parallel might leave the awaited count in
     team->barrier in an inconsistent state, we need to use a different
     counter here.  */
  gomp_team_barrier_wait_final (&team->barrier);
  if (__builtin_expect (team->team_cancelled, 0))
    {
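      /* The team was cancelled: walk the work share list starting at
	 WORK_SHARES_TO_FREE and finish every work share that may have
	 been left behind by the cancelled constructs.  */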
      struct gomp_work_share *ws = team->work_shares_to_free;
      do
	{
	  struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
	  if (next_ws == NULL)
	    gomp_ptrlock_set (&ws->next_ws, ws);
	  gomp_fini_work_share (ws);
	  ws = next_ws;
	}
      while (ws != NULL);
    }
  else
    gomp_fini_work_share (thr->ts.work_share);

  gomp_end_task ();
  thr->ts = team->prev_ts;

  if (__builtin_expect (thr->ts.level != 0, 0))
    {
#ifdef HAVE_SYNC_BUILTINS
      __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
#else
      gomp_mutex_lock (&gomp_managed_threads_lock);
      gomp_managed_threads -= team->nthreads - 1L;
      gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
      /* This barrier has gomp_barrier_wait_last counterparts
	 and ensures the team can be safely destroyed.  */
      gomp_barrier_wait (&team->barrier);
    }

  if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0))
    {
      struct gomp_work_share *ws = team->work_shares[0].next_alloc;
      do
	{
	  struct gomp_work_share *next_ws = ws->next_alloc;
	  free (ws);
	  ws = next_ws;
	}
      while (ws != NULL);
    }
  gomp_sem_destroy (&team->master_release);

  if (__builtin_expect (thr->ts.team != NULL, 0)
      || __builtin_expect (team->nthreads == 1, 0))
    free_team (team);
  else
    {
      struct gomp_thread_pool *pool = thr->thread_pool;
      if (pool->last_team)
	free_team (pool->last_team);
      pool->last_team = team;
      gomp_release_thread_pool (pool);
    }
}

#ifdef LIBGOMP_USE_PTHREADS

/* Constructors for this file.  */

static void __attribute__((constructor))
initialize_team (void)
{
#if !defined HAVE_TLS && !defined USE_EMUTLS
  static struct gomp_thread initial_thread_tls_data;

  pthread_key_create (&gomp_tls_key, NULL);
  pthread_setspecific (gomp_tls_key, &initial_thread_tls_data);
#endif

  if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
    gomp_fatal ("could not create thread pool destructor.");
}

static void __attribute__((destructor))
team_destructor (void)
{
  /* Without this, dlclose on libgomp could lead to subsequent
     crashes.  */
  pthread_key_delete (gomp_thread_destructor);
}

/* Similar to gomp_free_pool_helper, but does not detach the thread;
   gomp_pause_host will pthread_join these threads instead.  */

static void
gomp_pause_pool_helper (void *thread_pool)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_thread_pool *pool
    = (struct gomp_thread_pool *) thread_pool;
  gomp_simple_barrier_wait_last (&pool->threads_dock);
  gomp_sem_destroy (&thr->release);
  thr->thread_pool = NULL;
  thr->task = NULL;
  pthread_exit (NULL);
}

/* Free a thread pool and pthread_join its threads.  Return non-zero on
   failure (e.g. when called from within a parallel region).  */

int
gomp_pause_host (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_thread_pool *pool = thr->thread_pool;
  if (thr->ts.level)
    return -1;
  if (pool)
    {
      if (pool->threads_used > 0)
	{
	  int i;
	  pthread_t *thrs
	    = gomp_alloca (sizeof (pthread_t) * pool->threads_used);
	  for (i = 1; i < pool->threads_used; i++)
	    {
	      struct gomp_thread *nthr = pool->threads[i];
	      nthr->fn = gomp_pause_pool_helper;
	      nthr->data = pool;
	      thrs[i] = gomp_thread_to_pthread_t (nthr);
	    }
	  /* This barrier undocks threads docked on pool->threads_dock.  */
	  gomp_simple_barrier_wait (&pool->threads_dock);
	  /* And this waits until all threads have called
	     gomp_simple_barrier_wait_last in gomp_pause_pool_helper.  */
	  gomp_simple_barrier_wait (&pool->threads_dock);
	  /* Now it is safe to destroy the barrier and free the pool.  */
	  gomp_simple_barrier_destroy (&pool->threads_dock);

#ifdef HAVE_SYNC_BUILTINS
	  __sync_fetch_and_add (&gomp_managed_threads,
				1L - pool->threads_used);
#else
	  gomp_mutex_lock (&gomp_managed_threads_lock);
	  gomp_managed_threads -= pool->threads_used - 1L;
	  gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
	  for (i = 1; i < pool->threads_used; i++)
	    pthread_join (thrs[i], NULL);
	}
      if (pool->last_team)
	free_team (pool->last_team);
#ifndef __nvptx__
      team_free (pool->threads);
      team_free (pool);
#endif
      thr->thread_pool = NULL;
    }
  return 0;
}
#endif

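/* Create a fresh implicit task and ICV set, initialized from the global
   ICVs, for a thread libgomp has not seen before, and register the thread
   destructor so the thread's resources are cleaned up at thread exit.  */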
struct gomp_task_icv *
gomp_new_icv (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
  gomp_init_task (task, NULL, &gomp_global_icv);
  thr->task = task;
#ifdef LIBGOMP_USE_PTHREADS
  pthread_setspecific (gomp_thread_destructor, thr);
#endif
  return &task->icv;
}
