/* Copyright (C) 2005 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU OpenMP Library (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or
   (at your option) any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
   more details.

   You should have received a copy of the GNU Lesser General Public License
   along with libgomp; see the file COPYING.LIB.  If not, write to the
   Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
   MA 02110-1301, USA.  */
20
/* As a special exception, if you link this library with other files, some
   of which are compiled with GCC, to produce an executable, this library
   does not by itself cause the resulting executable to be covered by the
   GNU General Public License.  This exception does not however invalidate
   any other reasons why the executable file might be covered by the GNU
   General Public License.  */
27
/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */
30
31#include "libgomp.h"
32#include <stdlib.h>
33
34
35/* This function implements the STATIC scheduling method.  The caller should
36   iterate *pstart <= x < *pend.  Return zero if there are more iterations
37   to perform; nonzero if not.  Return less than 0 if this thread had
38   received the absolutely last iteration.  */
39
40int
41gomp_iter_static_next (long *pstart, long *pend)
42{
43  struct gomp_thread *thr = gomp_thread ();
44  struct gomp_team *team = thr->ts.team;
45  struct gomp_work_share *ws = thr->ts.work_share;
46  unsigned long nthreads = team ? team->nthreads : 1;
47
48  if (thr->ts.static_trip == -1)
49    return -1;
50
51  /* Quick test for degenerate teams and orphaned constructs.  */
52  if (nthreads == 1)
53    {
54      *pstart = ws->next;
55      *pend = ws->end;
56      thr->ts.static_trip = -1;
57      return ws->next == ws->end;
58    }
59
60  /* We interpret chunk_size zero as "unspecified", which means that we
61     should break up the iterations such that each thread makes only one
62     trip through the outer loop.  */
63  if (ws->chunk_size == 0)
64    {
65      unsigned long n, q, i;
66      unsigned long s0, e0;
67      long s, e;
68
69      if (thr->ts.static_trip > 0)
70	return 1;
71
72      /* Compute the total number of iterations.  */
73      s = ws->incr + (ws->incr > 0 ? -1 : 1);
74      n = (ws->end - ws->next + s) / ws->incr;
75      i = thr->ts.team_id;
76
77      /* Compute the "zero-based" start and end points.  That is, as
78         if the loop began at zero and incremented by one.  */
79      q = n / nthreads;
80      q += (q * nthreads != n);
81      s0 = q * i;
82      e0 = s0 + q;
83      if (e0 > n)
84        e0 = n;
85
86      /* Notice when no iterations allocated for this thread.  */
87      if (s0 >= e0)
88	{
89	  thr->ts.static_trip = 1;
90	  return 1;
91	}
92
93      /* Transform these to the actual start and end numbers.  */
94      s = (long)s0 * ws->incr + ws->next;
95      e = (long)e0 * ws->incr + ws->next;
96
97      *pstart = s;
98      *pend = e;
99      thr->ts.static_trip = (e0 == n ? -1 : 1);
100      return 0;
101    }
102  else
103    {
104      unsigned long n, s0, e0, i, c;
105      long s, e;
106
107      /* Otherwise, each thread gets exactly chunk_size iterations
108	 (if available) each time through the loop.  */
109
110      s = ws->incr + (ws->incr > 0 ? -1 : 1);
111      n = (ws->end - ws->next + s) / ws->incr;
112      i = thr->ts.team_id;
113      c = ws->chunk_size;
114
115      /* Initial guess is a C sized chunk positioned nthreads iterations
116	 in, offset by our thread number.  */
117      s0 = (thr->ts.static_trip * nthreads + i) * c;
118      e0 = s0 + c;
119
120      /* Detect overflow.  */
121      if (s0 >= n)
122	return 1;
123      if (e0 > n)
124	e0 = n;
125
126      /* Transform these to the actual start and end numbers.  */
127      s = (long)s0 * ws->incr + ws->next;
128      e = (long)e0 * ws->incr + ws->next;
129
130      *pstart = s;
131      *pend = e;
132
133      if (e0 == n)
134	thr->ts.static_trip = -1;
135      else
136	thr->ts.static_trip++;
137      return 0;
138    }
139}
140
141
142/* This function implements the DYNAMIC scheduling method.  Arguments are
143   as for gomp_iter_static_next.  This function must be called with ws->lock
144   held.  */
145
146bool
147gomp_iter_dynamic_next_locked (long *pstart, long *pend)
148{
149  struct gomp_thread *thr = gomp_thread ();
150  struct gomp_work_share *ws = thr->ts.work_share;
151  long start, end, chunk, left;
152
153  start = ws->next;
154  if (start == ws->end)
155    return false;
156
157  chunk = ws->chunk_size * ws->incr;
158  left = ws->end - start;
159  if (ws->incr < 0)
160    {
161      if (chunk < left)
162	chunk = left;
163    }
164  else
165    {
166      if (chunk > left)
167	chunk = left;
168    }
169  end = start + chunk;
170
171  ws->next = end;
172  *pstart = start;
173  *pend = end;
174  return true;
175}
176
177
178#ifdef HAVE_SYNC_BUILTINS
179/* Similar, but doesn't require the lock held, and uses compare-and-swap
180   instead.  Note that the only memory value that changes is ws->next.  */
181
182bool
183gomp_iter_dynamic_next (long *pstart, long *pend)
184{
185  struct gomp_thread *thr = gomp_thread ();
186  struct gomp_work_share *ws = thr->ts.work_share;
187  long start, end, nend, chunk, incr;
188
189  start = ws->next;
190  end = ws->end;
191  incr = ws->incr;
192  chunk = ws->chunk_size * incr;
193
194  while (1)
195    {
196      long left = end - start;
197      long tmp;
198
199      if (start == end)
200	return false;
201
202      if (incr < 0)
203	{
204	  if (chunk < left)
205	    chunk = left;
206	}
207      else
208	{
209	  if (chunk > left)
210	    chunk = left;
211	}
212      nend = start + chunk;
213
214      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
215      if (__builtin_expect (tmp == start, 1))
216	break;
217
218      start = tmp;
219    }
220
221  *pstart = start;
222  *pend = nend;
223  return true;
224}
225#endif /* HAVE_SYNC_BUILTINS */
226
227
228/* This function implements the GUIDED scheduling method.  Arguments are
229   as for gomp_iter_static_next.  This function must be called with the
230   work share lock held.  */
231
232bool
233gomp_iter_guided_next_locked (long *pstart, long *pend)
234{
235  struct gomp_thread *thr = gomp_thread ();
236  struct gomp_work_share *ws = thr->ts.work_share;
237  struct gomp_team *team = thr->ts.team;
238  unsigned long nthreads = team ? team->nthreads : 1;
239  unsigned long n, q;
240  long start, end;
241
242  if (ws->next == ws->end)
243    return false;
244
245  start = ws->next;
246  n = (ws->end - start) / ws->incr;
247  q = (n + nthreads - 1) / nthreads;
248
249  if (q < ws->chunk_size)
250    q = ws->chunk_size;
251  if (q <= n)
252    end = start + q * ws->incr;
253  else
254    end = ws->end;
255
256  ws->next = end;
257  *pstart = start;
258  *pend = end;
259  return true;
260}
261
262#ifdef HAVE_SYNC_BUILTINS
263/* Similar, but doesn't require the lock held, and uses compare-and-swap
264   instead.  Note that the only memory value that changes is ws->next.  */
265
266bool
267gomp_iter_guided_next (long *pstart, long *pend)
268{
269  struct gomp_thread *thr = gomp_thread ();
270  struct gomp_work_share *ws = thr->ts.work_share;
271  struct gomp_team *team = thr->ts.team;
272  unsigned long nthreads = team ? team->nthreads : 1;
273  long start, end, nend, incr;
274  unsigned long chunk_size;
275
276  start = ws->next;
277  end = ws->end;
278  incr = ws->incr;
279  chunk_size = ws->chunk_size;
280
281  while (1)
282    {
283      unsigned long n, q;
284      long tmp;
285
286      if (start == end)
287	return false;
288
289      n = (end - start) / incr;
290      q = (n + nthreads - 1) / nthreads;
291
292      if (q < chunk_size)
293	q = chunk_size;
294      if (__builtin_expect (q <= n, 1))
295	nend = start + q * incr;
296      else
297	nend = end;
298
299      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
300      if (__builtin_expect (tmp == start, 1))
301	break;
302
303      start = tmp;
304    }
305
306  *pstart = start;
307  *pend = nend;
308  return true;
309}
310#endif /* HAVE_SYNC_BUILTINS */
311