//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H

#include <__algorithm/inplace_merge.h>
#include <__algorithm/lower_bound.h>
#include <__algorithm/max.h>
#include <__algorithm/merge.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__cstddef/ptrdiff_t.h>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/allocator.h>
#include <__memory/construct_at.h>
#include <__memory/unique_ptr.h>
#include <__new/exceptions.h>
#include <__numeric/reduce.h>
#include <__pstl/backend_fwd.h>
#include <__pstl/cpu_algos/any_of.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__pstl/cpu_algos/fill.h>
#include <__pstl/cpu_algos/find_if.h>
#include <__pstl/cpu_algos/for_each.h>
#include <__pstl/cpu_algos/merge.h>
#include <__pstl/cpu_algos/stable_sort.h>
#include <__pstl/cpu_algos/transform.h>
#include <__pstl/cpu_algos/transform_reduce.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <optional>

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

#if _LIBCPP_STD_VER >= 17

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;

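// The loops below rely on the chunk layout produced by __partition_chunks: chunk 0 contains
// __first_chunk_size_ elements and every other chunk contains __chunk_size_ elements, so chunk
// __k (for __k > 0) starts at offset __k * __chunk_size_ + (__first_chunk_size_ - __chunk_size_).
// Purely illustrative example (the actual sizes come from __partition_chunks, which is not
// defined in this header): with {__chunk_count_ = 4, __chunk_size_ = 3, __first_chunk_size_ = 2}
// covering 11 elements, chunk 2 starts at 2 * 3 + (2 - 3) == 5, right after the 2 + 3 elements
// of chunks 0 and 1.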
template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}
} // namespace __libdispatch

template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#  if _LIBCPP_HAS_EXCEPTIONS
          try {
#  endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#  if _LIBCPP_HAS_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#  endif
        }(),
        __destroy);

    if (!__ranges)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger range
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

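    // Compute the split points for the parallel merge: walk the longer input range in fixed-size
    // chunks and use std::lower_bound to find the corresponding position in the shorter range, so
    // that consecutive __merge_range entries delimit independent sub-merges writing to disjoint,
    // contiguous parts of the output. Each pair of adjacent entries is then handed to
    // __leaf_merge in parallel via __dispatch_apply below.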
    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters  = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }

  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

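    // Each chunk reduces into its own slot of __values: the first two transformed elements seed
    // the accumulator via __combiner and __reduction folds the rest of the chunk, which avoids
    // requiring a default-constructed or identity _Value per chunk. The per-chunk results are
    // then combined sequentially with std::reduce at the end, together with __init.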
    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index =
          __chunk == 0
              ? 0
              : (__chunk * __partitions.__chunk_size_) +
                    (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

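    // Bottom-up merge of the sorted chunks: each pass merges pairs of adjacent chunks, doubling
    // __chunk_size_ and halving __chunk_count_, ping-ponging the elements between the input range
    // and the __values buffer. When the chunk count is odd, the first chunk is first merged in
    // place with its neighbour so that the remaining chunks pair up evenly; if the last pass
    // leaves the result in the buffer, it is moved back into the input range afterwards.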
    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);

    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

  static constexpr size_t __lane_size = 64;
};

// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP_STD_VER >= 17

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H