1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
10#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
11
12#include <__algorithm/inplace_merge.h>
13#include <__algorithm/lower_bound.h>
14#include <__algorithm/max.h>
15#include <__algorithm/merge.h>
16#include <__algorithm/upper_bound.h>
17#include <__atomic/atomic.h>
18#include <__config>
19#include <__exception/terminate.h>
20#include <__iterator/iterator_traits.h>
21#include <__iterator/move_iterator.h>
22#include <__memory/allocator.h>
23#include <__memory/construct_at.h>
24#include <__memory/unique_ptr.h>
25#include <__numeric/reduce.h>
26#include <__utility/empty.h>
27#include <__utility/exception_guard.h>
28#include <__utility/move.h>
29#include <__utility/pair.h>
30#include <cstddef>
31#include <new>
32#include <optional>
33
34_LIBCPP_PUSH_MACROS
35#include <__undef_macros>
36
37#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
38
39_LIBCPP_BEGIN_NAMESPACE_STD
40
41namespace __par_backend {
42inline namespace __libdispatch {
43
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
//
// Invokes __func(__context, __i) for each __i in [0, __chunk_count), potentially in parallel. Defined
// out-of-line (exported from the dylib) so the header does not need to include libdispatch itself.
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;
49
50template <class _Func>
51_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
52  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
53    (*static_cast<_Func*>(__context))(__chunk);
54  });
55}
56
// Describes how a range is split into chunks for parallel processing: every chunk has size
// __chunk_size_ except the first one, whose size may differ.
struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;  // size of every chunk except the first
  ptrdiff_t __first_chunk_size_; // size of the first chunk
};
62
// Computes how to split a range of __size elements into chunks for parallel dispatch (see
// __chunk_partitions). Marked [[__gnu__::__const__]] since the result depends only on __size.
[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;
64
65template <class _RandomAccessIterator, class _Functor>
66_LIBCPP_HIDE_FROM_ABI optional<__empty>
67__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
68  // Perform the chunked execution.
69  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
70    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
71    auto __index =
72        __chunk == 0
73            ? 0
74            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
75    __func(__first + __index, __first + __index + __this_chunk_size);
76  });
77
78  return __empty{};
79}
80
81template <class _RandomAccessIterator, class _Functor>
82_LIBCPP_HIDE_FROM_ABI optional<__empty>
83__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
84  return __libdispatch::__dispatch_parallel_for(
85      __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
86}
87
// One boundary produced while splitting a parallel merge into chunks: a position into each of the two
// input sequences, and the matching position in the output sequence.
template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
struct __merge_range {
  __merge_range(_RandomAccessIterator1 __m1, _RandomAccessIterator2 __m2, _RandomAccessIteratorOut __out)
      : __mid1_(__m1), __mid2_(__m2), __result_(__out) {}

  _RandomAccessIterator1 __mid1_;     // position in the first input range
  _RandomAccessIterator2 __mid2_;     // position in the second input range
  _RandomAccessIteratorOut __result_; // corresponding position in the output range
};
97
// Merges the sorted ranges [__first1, __last1) and [__first2, __last2) into __result in parallel.
//
// Chunking proceeds along the longer of the two inputs; for each chunk boundary the matching split
// point in the other input is found with std::lower_bound, yielding __chunk_count_ + 1 boundaries.
// Each adjacent pair of boundaries is then merged independently with __leaf_merge. Returns nullopt
// if the scratch allocation for the boundaries fails, __empty{} otherwise.
template <typename _RandomAccessIterator1,
          typename _RandomAccessIterator2,
          typename _RandomAccessIterator3,
          typename _Compare,
          typename _LeafMerge>
_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
    _RandomAccessIterator1 __first1,
    _RandomAccessIterator1 __last1,
    _RandomAccessIterator2 __first2,
    _RandomAccessIterator2 __last2,
    _RandomAccessIterator3 __result,
    _Compare __comp,
    _LeafMerge __leaf_merge) noexcept {
  // Partition based on the size of the larger input.
  __chunk_partitions __partitions =
      __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

  if (__partitions.__chunk_count_ == 0)
    return __empty{};

  // A single chunk is merged serially, without allocating scratch storage.
  if (__partitions.__chunk_count_ == 1) {
    __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
    return __empty{};
  }

  using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
  // One boundary at the start of each chunk, plus one final boundary at the end of both inputs.
  auto const __n_ranges = __partitions.__chunk_count_ + 1;

  // TODO: use __uninitialized_buffer
  auto __destroy = [=](__merge_range_t* __ptr) {
    std::destroy_n(__ptr, __n_ranges);
    std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
  };

  // Allocation failure is reported through the optional return (nullopt) instead of letting
  // bad_alloc escape this noexcept function.
  unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
      [&]() -> __merge_range_t* {
#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
        try {
#  endif
          return std::allocator<__merge_range_t>().allocate(__n_ranges);
#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
        } catch (const std::bad_alloc&) {
          return nullptr;
        }
#  endif
      }(),
      __destroy);

  if (!__ranges)
    return nullopt;

  // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
  __merge_range_t* __r = __ranges.get();
  // The first boundary is the beginning of both inputs and of the output.
  std::__construct_at(__r++, __first1, __first2, __result);

  // Walk chunks along whichever input is (strictly) larger.
  bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

  // Advances __chunk_size elements along the iterated input, locates the matching split point in the
  // other input via binary search, updates __first1/__first2/__result past the chunk, and returns the
  // resulting boundary.
  auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
    auto [__mid1, __mid2] = [&] {
      if (__iterate_first_range) {
        auto __m1 = __first1 + __chunk_size;
        auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
        return std::make_pair(__m1, __m2);
      } else {
        auto __m2 = __first2 + __chunk_size;
        auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
        return std::make_pair(__m1, __m2);
      }
    }();

    __result += (__mid1 - __first1) + (__mid2 - __first2);
    __first1 = __mid1;
    __first2 = __mid2;
    return {std::move(__mid1), std::move(__mid2), __result};
  };

  // handle first chunk
  std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

  // handle 2 -> N - 1 chunks
  for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
    std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

  // handle last chunk
  std::__construct_at(__r, __last1, __last2, __result);

  // Merge the slice between every pair of adjacent boundaries independently, potentially in parallel.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
    auto __first_iters = __ranges[__index];
    auto __last_iters  = __ranges[__index + 1];
    __leaf_merge(
        __first_iters.__mid1_,
        __last_iters.__mid1_,
        __first_iters.__mid2_,
        __last_iters.__mid2_,
        __first_iters.__result_,
        __comp);
  });

  return __empty{};
}
197
198template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
199_LIBCPP_HIDE_FROM_ABI optional<_Value> __parallel_transform_reduce(
200    _RandomAccessIterator __first,
201    _RandomAccessIterator __last,
202    _Transform __transform,
203    _Value __init,
204    _Combiner __combiner,
205    _Reduction __reduction) {
206  if (__first == __last)
207    return __init;
208
209  auto __partitions = __libdispatch::__partition_chunks(__last - __first);
210
211  auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
212    std::destroy_n(__ptr, __count);
213    std::allocator<_Value>().deallocate(__ptr, __count);
214  };
215
216  // TODO: use __uninitialized_buffer
217  // TODO: allocate one element per worker instead of one element per chunk
218  unique_ptr<_Value[], decltype(__destroy)> __values(
219      std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);
220
221  // __dispatch_apply is noexcept
222  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
223    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
224    auto __index =
225        __chunk == 0
226            ? 0
227            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
228    if (__this_chunk_size != 1) {
229      std::__construct_at(
230          __values.get() + __chunk,
231          __reduction(__first + __index + 2,
232                      __first + __index + __this_chunk_size,
233                      __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
234    } else {
235      std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
236    }
237  });
238
239  return std::reduce(
240      std::make_move_iterator(__values.get()),
241      std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
242      std::move(__init),
243      __combiner);
244}
245
// Stably sorts [__first, __last) in parallel: the range is partitioned into chunks, each chunk is sorted
// independently with __leaf_sort, and the sorted chunks are then merged pairwise (bottom-up), ping-ponging
// between the input range and a scratch buffer until a single sorted run remains.
template <class _RandomAccessIterator, class _Comp, class _LeafSort>
_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort(
    _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
  const auto __size = __last - __first;
  auto __partitions = __libdispatch::__partition_chunks(__size);

  if (__partitions.__chunk_count_ == 0)
    return __empty{};

  // A single chunk is sorted serially; no scratch buffer is needed.
  if (__partitions.__chunk_count_ == 1) {
    __leaf_sort(__first, __last, __comp);
    return __empty{};
  }

  using _Value = __iter_value_type<_RandomAccessIterator>;

  auto __destroy = [__size](_Value* __ptr) {
    std::destroy_n(__ptr, __size);
    std::allocator<_Value>().deallocate(__ptr, __size);
  };

  // TODO: use __uninitialized_buffer
  unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

  // Initialize all elements to a moved-from state
  // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
  // The value taken from *__first is threaded through the whole buffer via chained moves and finally
  // moved back into *__first, leaving every buffer element constructed (but moved-from).
  std::__construct_at(__values.get(), std::move(*__first));
  for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
    std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
  }
  *__first = std::move(__values.get()[__size - 1]);

  // Step 1: sort every chunk independently, potentially in parallel.
  __libdispatch::__dispatch_parallel_for(
      __partitions,
      __first,
      [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
        __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
      });

  // Step 2: repeatedly merge pairs of adjacent sorted chunks, halving the chunk count each round.
  // Results alternate between the input range and the scratch buffer.
  bool __objects_are_in_buffer = false;
  do {
    const auto __old_chunk_size = __partitions.__chunk_size_;
    // With an odd number of chunks, first fold the second chunk into the first one in-place (in
    // whichever storage currently holds the objects) so the remaining pairing comes out even.
    if (__partitions.__chunk_count_ % 2 == 1) {
      auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
        std::inplace_merge(
            __first_chunk_begin,
            __first_chunk_begin + __partitions.__first_chunk_size_,
            __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
            __comp);
      };
      if (__objects_are_in_buffer)
        __inplace_merge_chunks(__values.get());
      else
        __inplace_merge_chunks(__first);
      __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
    } else {
      __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
    }

    // Chunks are merged pairwise, so each round doubles the chunk size and halves the chunk count.
    __partitions.__chunk_size_ *= 2;
    __partitions.__chunk_count_ /= 2;

    // Merge each pair of adjacent sorted chunks from one storage into the other, in parallel.
    auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
      __libdispatch::__dispatch_parallel_for(
          __partitions,
          __from_first,
          [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
            std::merge(std::make_move_iterator(__chunk_first),
                       std::make_move_iterator(__chunk_last - __old_chunk_size),
                       std::make_move_iterator(__chunk_last - __old_chunk_size),
                       std::make_move_iterator(__chunk_last),
                       __to_first + (__chunk_first - __from_first),
                       __comp);
          });
    };

    if (__objects_are_in_buffer)
      __merge_chunks(__values.get(), __first);
    else
      __merge_chunks(__first, __values.get());
    __objects_are_in_buffer = !__objects_are_in_buffer;
  } while (__partitions.__chunk_count_ > 1);

  // If the final round left the sorted result in the scratch buffer, move it back into the input range.
  if (__objects_are_in_buffer) {
    std::move(__values.get(), __values.get() + __size, __first);
  }

  return __empty{};
}
335
// No-op: this backend provides no mechanism to cancel work that has already been dispatched.
_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
337
338} // namespace __libdispatch
339} // namespace __par_backend
340
341_LIBCPP_END_NAMESPACE_STD
342
343#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
344
345_LIBCPP_POP_MACROS
346
347#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
348