/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */
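
/* For orientation, a rough sketch (not taken from the original sources) of
   the kind of user code that reaches the entry points below; the exact
   sequence of calls the compiler emits may differ, and compute and
   emit_in_order are placeholders:

	#pragma omp for ordered schedule(dynamic)
	for (i = 0; i < n; i++)
	  {
	    compute (i);
	#pragma omp ordered
	    emit_in_order (i);
	  }

   emit_in_order runs in iteration order.  The loop-iteration code elsewhere
   in libgomp maintains the per-work-share queue through gomp_ordered_first,
   gomp_ordered_next and gomp_ordered_last, while the ordered region itself
   is bracketed by GOMP_ordered_start and GOMP_ordered_end.  Doacross loops
   (ordered(n) together with depend(source)/depend(sink)) instead use the
   GOMP_doacross_* entry points later in this file.  */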

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"


/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}
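
/* For illustration (not taken from the original sources): with four threads,
   ordered_cur == 3 and ordered_num_used == 2, slots 3 and 0 of
   ordered_team_ids are occupied, so the computation above wraps around and
   stores the caller's team_id at index (3 + 2) - 4 == 1, the next free slot
   of the circular buffer.  */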

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
	next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent iteration block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have the same situation as in gomp_ordered_first and need
	 to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
	index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}
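
/* Illustrative note (not taken from the original sources): when the queue is
   full, ordered_num_used equals team->nthreads, so the store skipped above
   would land at (ordered_cur + nthreads) wrapped back to ordered_cur, i.e.
   the slot that already holds our own id since we are at the head of the
   queue.  Bumping ordered_cur alone therefore dequeues us from the front
   and requeues us at the back in one step.  */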


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next iteration block.  Static schedules are not first-come,
   first-served like the others, so we move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the ordered section.  Pass ownership
     to the numerically next thread.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
	__attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
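
/* A rough sketch (not taken from the original sources) of the kind of
   doacross loop that drives the code below; the exact calls the compiler
   emits may differ, and f is a placeholder:

	#pragma omp for ordered(2)
	for (i = 1; i < n; i++)
	  for (j = 0; j < m; j++)
	    {
	#pragma omp ordered depend(sink: i - 1, j)
	      a[i][j] = f (a[i - 1][j], a[i][j]);
	#pragma omp ordered depend(source)
	    }

   depend(sink: ...) maps onto GOMP_doacross_wait (or its _ull variant) and
   depend(source) onto GOMP_doacross_post; gomp_doacross_init below sets up
   the per-work-share array those calls synchronize through.  */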

void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
		    size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
	ws->doacross = NULL;
      else
	{
	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
	  doacross->extra = (void *) (doacross + 1);
	  ws->doacross = doacross;
	}
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
	goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
	{
	  unsigned int this_bits;
	  if (counts[i] == 1)
	    this_bits = 1;
	  else
	    this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
			- __builtin_clzl (counts[i] - 1);
	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
	    {
	      bits[i] = this_bits;
	      num_bits += this_bits;
	    }
	  else
	    num_bits = MAX_COLLAPSED_BITS + 1;
	}
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
			  + shift_sz + extra);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
		     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
	{
	  doacross->shift_counts[i - 1] = shift_count;
	  shift_count += bits[i - 1];
	}
      for (ent = 0; ent < num_ents; ent++)
	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
	      sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
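
/* Worked example (not taken from the original sources): for collapsed
   counts { 8, 100 } the loop above computes bits[] = { 3, 7 } and, walking
   backwards, shift_counts[] = { 7, 0 }, so a flattened entry packs count 0
   into bits 7 and above and count 1 into bits 0-6 of a single unsigned
   long, which GOMP_doacross_post and GOMP_doacross_wait can then update and
   compare with one atomic access each.  */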

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
					    + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
	= (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
	flattened |= (unsigned long) counts[i]
		     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
	__atomic_thread_fence (MEMMODEL_RELEASE);
      else
	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
	__atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
	{
	  if (first < doacross->boundary)
	    ent = first / (doacross->q + 1);
	  else
	    ent = (first - doacross->boundary) / doacross->q
		  + doacross->t;
	}
      else
	ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
					    + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
	= (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
	flattened |= (unsigned long) va_arg (ap, long)
		     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
	{
	  __atomic_thread_fence (MEMMODEL_RELEASE);
	  va_end (ap);
	  return;
	}
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
	{
	  unsigned long thisv
	    = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
	  unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
	  if (thisv < cur)
	    {
	      i = doacross->ncounts;
	      break;
	    }
	  if (thisv > cur)
	    break;
	}
      va_end (ap);
      if (i == doacross->ncounts)
	break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
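
/* Illustrative note (not taken from the original sources): the loop above
   compares the awaited counts (each plus one, matching what
   GOMP_doacross_post stores) element by element against the stored values,
   scanning from the outermost count inwards.  A strictly smaller element
   means the dependence is already satisfied, a strictly larger one means it
   is not yet; in that case the thread spins via cpu_relax and re-reads the
   whole vector.  */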

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
			gomp_ull chunk_size, size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
	ws->doacross = NULL;
      else
	{
	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
	  doacross->extra = (void *) (doacross + 1);
	  ws->doacross = doacross;
	}
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
	goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
	{
	  unsigned int this_bits;
	  if (counts[i] == 1)
	    this_bits = 1;
	  else
	    this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
			- __builtin_clzll (counts[i] - 1);
	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
	    {
	      bits[i] = this_bits;
	      num_bits += this_bits;
	    }
	  else
	    num_bits = MAX_COLLAPSED_BITS + 1;
	}
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
	elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
	elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
	abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
			  + shift_sz + extra);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
		     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
	{
	  doacross->shift_counts[i - 1] = shift_count;
	  shift_count += bits[i - 1];
	}
      for (ent = 0; ent < num_ents; ent++)
	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
	      sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      gomp_ull flattened
	= counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
	flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
	__atomic_thread_fence (MEMMODEL_RELEASE);
      else
	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
				      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
	{
	  if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
	}
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
	{
	  gomp_ull cull = counts[i] + 1UL;
	  unsigned long c = (unsigned long) cull;
	  if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
	  c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
	  if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
	}
    }
}
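
/* Illustrative note (not taken from the original sources): when gomp_ull is
   twice as wide as unsigned long (e.g. 64-bit counts on a 32-bit target),
   each count occupies two unsigned longs in the array, the high half at
   index 2 * i and the low half at 2 * i + 1.  GOMP_doacross_ull_wait below
   walks the same layout, checking the high half of each count before the
   low half.  */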

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
	{
	  if (first < doacross->boundary_ull)
	    ent = first / (doacross->q_ull + 1);
	  else
	    ent = (first - doacross->boundary_ull) / doacross->q_ull
		  + doacross->t;
	}
      else
	ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
	flattened |= va_arg (ap, gomp_ull)
		     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
	{
	  __atomic_thread_fence (MEMMODEL_RELEASE);
	  va_end (ap);
	  return;
	}
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
				      + ent * doacross->elt_sz);
      do
	{
	  va_start (ap, first);
	  for (i = 0; i < doacross->ncounts; i++)
	    {
	      gomp_ull thisv
		= (i ? va_arg (ap, gomp_ull) : first) + 1;
	      gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
	      if (thisv < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (thisv > cur)
		break;
	    }
	  va_end (ap);
	  if (i == doacross->ncounts)
	    break;
	  cpu_relax ();
	}
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      do
	{
	  va_start (ap, first);
	  for (i = 0; i < doacross->ncounts; i++)
	    {
	      gomp_ull thisv
		= (i ? va_arg (ap, gomp_ull) : first) + 1;
	      unsigned long t
		= thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
	      unsigned long cur
		= __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
	      if (t < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (t > cur)
		break;
	      t = thisv;
	      cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
	      if (t < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (t > cur)
		break;
	    }
	  va_end (ap);
	  if (i == doacross->ncounts)
	    break;
	  cpu_relax ();
	}
      while (1);
    }
  __sync_synchronize ();
}