[oneDPL][ranges] + support sized output range for copy_if; dpcpp backend
MikeDvorskiy committed Jan 16, 2025
1 parent 78e5794 commit 3e86e51
Showing 5 changed files with 113 additions and 45 deletions.
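In short: the ranges copy_if path now treats the size of the output range as a hard bound on how many selected elements are written, and the pattern reports an offset into the input alongside the number of elements stored. Below is a usage sketch of that behaviour; the entry point, headers, and exact stopping semantics are assumptions inferred from __pattern_copy_if_ranges in this diff, not something the commit itself defines.

// Usage sketch only; oneapi::dpl::ranges::copy_if and the headers are assumed, not part of this commit.
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>

#include <vector>

int main()
{
    std::vector<int> in{1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> out(3); // sized output: room for at most 3 selected elements

    auto res = oneapi::dpl::ranges::copy_if(oneapi::dpl::execution::dpcpp_default, in, out,
                                            [](int x) { return x % 2 == 0; });

    // res.out - out.begin() is the number of elements actually written (at most out.size());
    // res.in marks how far the input was processed.
    return 0;
}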
4 changes: 1 addition & 3 deletions include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
@@ -901,15 +901,13 @@ __pattern_copy_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato
if (__first == __last)
return __result_first;

_It1DifferenceType __n = __last - __first;

auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
auto __buf1 = __keep1(__first, __last);
auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>();
auto __buf2 = __keep2(__result_first, __result_first + __n);

auto __res = __par_backend_hetero::__parallel_copy_if(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
__buf1.all_view(), __buf2.all_view(), __n, __pred);
__buf1.all_view(), __buf2.all_view(), __pred);

::std::size_t __num_copied = __res.get(); //is a blocking call
return __result_first + __num_copied;
21 changes: 12 additions & 9 deletions include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
@@ -536,19 +536,22 @@ __pattern_count_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _

template <typename _BackendTag, typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Predicate,
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
oneapi::dpl::__internal::__difference_t<_Range2>
std::pair<oneapi::dpl::__internal::__difference_t<_Range1>, oneapi::dpl::__internal::__difference_t<_Range2>>
__pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2,
_Predicate __pred, _Assign __assign)
{
oneapi::dpl::__internal::__difference_t<_Range2> __n = __rng1.size();
if (__n == 0)
return 0;
using _Index = oneapi::dpl::__internal::__difference_t<_Range2>;
_Index __n = __rng1.size();
if (__n == 0 || __rng2.empty())
return {0, 0};

auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if(
auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if_out_lim(
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
std::forward<_Range2>(__rng2), __n, __pred, __assign);
std::forward<_Range2>(__rng2), __pred, __assign);

return __res.get(); //is a blocking call
std::array<_Index, 2> __idx;
__res.get_values(__idx); //a blocking call
return {__idx[0], __idx[1]};
}

#if _ONEDPL_CPP20_RANGES_PRESENT
@@ -561,15 +564,15 @@ __pattern_copy_if_ranges(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e
auto __pred_1 = [__pred, __proj](auto&& __val) { return std::invoke(__pred, std::invoke(__proj,
std::forward<decltype(__val)>(__val)));};

auto __res_idx = oneapi::dpl::__internal::__ranges::__pattern_copy_if(__tag,
auto __res = oneapi::dpl::__internal::__ranges::__pattern_copy_if(__tag,
std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::views::all_read(__in_r),
oneapi::dpl::__ranges::views::all_write(__out_r), __pred_1,
oneapi::dpl::__internal::__pstl_assign());

using __return_t = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<_InRange>,
std::ranges::borrowed_iterator_t<_OutRange>>;

return __return_t{std::ranges::begin(__in_r) + std::ranges::size(__in_r), std::ranges::begin(__out_r) + __res_idx};
return __return_t{std::ranges::begin(__in_r) + __res.first, std::ranges::begin(__out_r) + __res.second};
}
#endif //_ONEDPL_CPP20_RANGES_PRESENT

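For reference, a plain-C++20 serial stand-in for the contract __pattern_copy_if now follows: produce one offset into the input and one into the output, which the C++20-ranges wrapper above converts into a std::ranges::copy_if_result. How far the input offset advances once the output fills up is an assumption of this sketch, not something the diff spells out.

#include <cstddef>
#include <ranges>
#include <utility>
#include <vector>

template <typename InRange, typename OutRange, typename Pred>
std::pair<std::ptrdiff_t, std::ptrdiff_t>
serial_copy_if_bounded(const InRange& in, OutRange& out, Pred pred)
{
    std::ptrdiff_t n_in = 0;  // input elements examined
    std::ptrdiff_t n_out = 0; // output elements written
    for (; n_in != std::ranges::ssize(in) && n_out != std::ranges::ssize(out); ++n_in)
        if (pred(in[n_in]))
            out[n_out++] = in[n_in];
    return {n_in, n_out};
}

int main()
{
    std::vector<int> in{1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> out(3);
    auto [n_in, n_out] = serial_copy_if_bounded(in, out, [](int x) { return x % 2 == 0; });
    // The ranges wrapper then returns {begin(in) + n_in, begin(out) + n_out}.
    return 0;
}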
34 changes: 26 additions & 8 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1228,11 +1228,11 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _
/*_Inclusive=*/std::true_type{}, __is_unique_pattern);
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _CreateMaskOp,
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _CreateMaskOp,
typename _CopyByMaskOp>
auto
__parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op,
_InRng&& __in_rng, _OutRng&& __out_rng, _CreateMaskOp __create_mask_op,
_CopyByMaskOp __copy_by_mask_op)
{
using _Size = decltype(__in_rng.size());
using _ReduceOp = std::plus<_Size>;
@@ -1248,7 +1248,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
_MaskAssigner __add_mask_op;

// temporary buffer to store boolean mask
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n);
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __in_rng.size());

return __parallel_transform_scan_base(
__backend_tag, std::forward<_ExecutionPolicy>(__exec),
@@ -1299,7 +1299,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;

return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
std::forward<_Range2>(__result), __n,
std::forward<_Range2>(__result),
_CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
_CopyOp{_ReduceOp{}, _Assign{}});
}
@@ -1360,16 +1360,34 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen
using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;

return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
std::forward<_Range2>(__result), _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
}
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred,
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
auto
__parallel_copy_if_out_lim(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{})
{
using _Size = decltype(__in_rng.size());
using _ReduceOp = std::plus<_Size>;
using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
/*inclusive*/ std::true_type, 1>;

return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
_CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred,
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
auto
__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{})
_InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{})
{
auto __n = __in_rng.size();
using _Size = decltype(__n);
using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;

// Next power of 2 greater than or equal to __n
@@ -1413,7 +1431,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
/*inclusive*/ std::true_type, 1>;

return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
_CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
}
}
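A serial model of the scan-based path that __parallel_copy_if_out_lim feeds into __parallel_scan_copy: build a 0/1 mask from the predicate, inclusive-scan it to get destination slots, then scatter, suppressing any write whose slot lies beyond the sized output — the same guard this commit adds to __copy_by_mask. This is a plain-C++ stand-in, not the SYCL implementation, and the helper name is illustrative.

#include <cstddef>
#include <numeric>
#include <vector>

template <typename T, typename Pred>
std::size_t scan_copy_bounded(const std::vector<T>& in, std::vector<T>& out, Pred pred)
{
    // 1) create_mask: 1 where the predicate holds, 0 elsewhere
    std::vector<std::size_t> mask(in.size());
    for (std::size_t i = 0; i != in.size(); ++i)
        mask[i] = pred(in[i]) ? 1 : 0;

    // 2) inclusive scan: mask[i] becomes "how many selected elements up to and including i"
    std::inclusive_scan(mask.begin(), mask.end(), mask.begin());

    // 3) copy_by_mask: selected element i goes to slot mask[i] - 1,
    //    but only if that slot exists in the sized output
    for (std::size_t i = 0; i != in.size(); ++i)
        if (pred(in[i]))
        {
            const std::size_t out_idx = mask[i] - 1;
            if (out_idx < out.size())
                out[out_idx] = in[i];
        }

    // number of elements actually written, clamped by the output size
    const std::size_t total = in.empty() ? 0 : mask.back();
    return total < out.size() ? total : out.size();
}

The device path runs the same three steps through __parallel_transform_scan_base; what changes in this commit is only that the sizes come from the ranges themselves instead of a separate __n parameter, and that the final write in __copy_by_mask is bounds-checked against the output.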
96 changes: 72 additions & 24 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -665,6 +665,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
_T
__get_value(size_t idx = 0) const
{
assert(__result_n > 0);
assert(idx < __result_n);
if (__use_USM_host && __supports_USM_device)
{
@@ -682,6 +683,26 @@ }
}
}

template <std::size_t _N>
void
get_values(std::array<_T, _N>& __arr) const
{
assert(__result_n > 0);
assert(_N == __result_n);
if (__use_USM_host && __supports_USM_device)
{
std::copy_n(__result_buf.get(), __result_n, __arr.begin());
}
else if (__supports_USM_device)
{
__exec.queue().memcpy(__arr.data(), __scratch_buf.get() + __scratch_n, __result_n * sizeof(_T)).wait();
}
else
{
auto _acc_h = __sycl_buf->get_host_access(sycl::read_only);
std::copy_n(_acc_h.begin() + __scratch_n, __result_n, __arr.begin());
}
}

template <typename _Event>
_T
__wait_and_get_value(_Event&& __event, size_t idx = 0) const
@@ -691,6 +712,49 @@

return __get_value(idx);
}

template <typename _Event, std::size_t _N>
void
__wait_and_get_value(_Event&& __event, std::array<_T, _N>& __arr) const
{
if (is_USM())
__event.wait_and_throw();

return get_values(__arr);
}
};

// The type specifies the polymorphic behaviour for different value types via the overloads
struct __wait_and_get_value
{
template <typename _T>
constexpr auto
operator()(auto&& /*__event*/, const sycl::buffer<_T>& __buf)
{
return __buf.get_host_access(sycl::read_only)[0];
}

template <typename _ExecutionPolicy, typename _T>
constexpr auto
operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
{
return __storage.__wait_and_get_value(__event);
}

template <typename _ExecutionPolicy, typename _T, std::size_t _N>
constexpr void
operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage, std::array<_T, _N>& __arr)
{
return __storage.__wait_and_get_value(__event, __arr);
}

template <typename _T>
constexpr auto
operator()(auto&& __event, const _T& __val)
{
__event.wait_and_throw();
return __val;
}
};

// Tag __async_mode describes a pattern call mode which should be executed asynchronously
@@ -714,29 +778,6 @@ class __future : private std::tuple<_Args...>
{
_Event __my_event;

template <typename _T>
constexpr auto
__wait_and_get_value(const sycl::buffer<_T>& __buf)
{
//according to a contract, returned value is one-element sycl::buffer
return __buf.get_host_access(sycl::read_only)[0];
}

template <typename _ExecutionPolicy, typename _T>
constexpr auto
__wait_and_get_value(const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
{
return __storage.__wait_and_get_value(__my_event);
}

template <typename _T>
constexpr auto
__wait_and_get_value(const _T& __val)
{
wait();
return __val;
}

public:
__future(_Event __e, _Args... __args) : std::tuple<_Args...>(__args...), __my_event(__e) {}
__future(_Event __e, std::tuple<_Args...> __t) : std::tuple<_Args...>(__t), __my_event(__e) {}
@@ -770,13 +811,20 @@ class __future : private std::tuple<_Args...>
#endif
}

template <typename _T, std::size_t _N>
std::enable_if_t<(sizeof...(_Args) > 0)>
get_values(std::array<_T, _N>& __arr)
{
auto& __val = std::get<0>(*this);
__wait_and_get_value{}(event(), __val, __arr);
}

auto
get()
{
if constexpr (sizeof...(_Args) > 0)
{
auto& __val = std::get<0>(*this);
return __wait_and_get_value(__val);
return __wait_and_get_value{}(event(), __val);
}
else
wait();
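The utility-side change, modelled below in standalone C++: waiting-and-reading moves out of __future's private helpers into a separate function object whose overload set picks the right "wait, then read" behaviour from the result holder's type, and a new get_values(std::array&) path returns several scalars (for copy_if, the two offsets) from a single storage object in one blocking read. Every name in the sketch is an illustrative stand-in, not oneDPL's.

#include <array>
#include <cstddef>
#include <iostream>

struct fake_event
{
    void wait_and_throw() const {} // stands in for sycl::event::wait_and_throw
};

template <typename T>
struct two_slot_storage // stands in for __result_and_scratch_storage with __result_n == 2
{
    std::array<T, 2> slots{};
};

struct wait_and_get_value // overload set mirrors the new __wait_and_get_value functor
{
    template <typename T>
    T operator()(const fake_event& e, const two_slot_storage<T>& s) const
    {
        e.wait_and_throw();
        return s.slots[0]; // single-value read, like __future::get()
    }

    template <typename T, std::size_t N>
    void operator()(const fake_event& e, const two_slot_storage<T>& s, std::array<T, N>& out) const
    {
        e.wait_and_throw();
        for (std::size_t i = 0; i != N && i != s.slots.size(); ++i)
            out[i] = s.slots[i]; // multi-value read, like __future::get_values()
    }
};

int main()
{
    fake_event ev;
    two_slot_storage<int> st{{8, 3}}; // e.g. input offset 8, output offset 3

    std::array<int, 2> both{};
    wait_and_get_value{}(ev, st, both);                // blocks, then fills both slots
    std::cout << both[0] << ' ' << both[1] << '\n';    // prints: 8 3
    std::cout << wait_and_get_value{}(ev, st) << '\n'; // prints: 8
    return 0;
}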
3 changes: 2 additions & 1 deletion include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -624,7 +624,8 @@ struct __copy_by_mask
// NOTE: we only need this explicit conversion when we have internal::tuple and
// ::std::tuple as operands, in all the other cases this is not necessary and no conversion
// is performed(i.e. __typle_type is the same type as its operand).
__assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]);
if (__out_idx < __out_acc.size())
__assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]);
}
if (__item_idx == 0)
{
