initial migration of OA and static_map #7705
initial migration of OA and static_map #7705 srinivasyadav18 wants to merge 1 commit into NVIDIA:main from
Conversation
|
Auto-sync is disabled for draft pull requests in this repository. Workflows must be run manually. Contributors can view more details about this message here. |
| _CCCL_HOST_DEVICE inline int __cuda_memcmp(const void* __lhs, const void* __rhs, ::cuda::std::size_t __count) | ||
| { | ||
| auto __lhs_c = reinterpret_cast<const unsigned char*>(__lhs); | ||
| auto __rhs_c = reinterpret_cast<const unsigned char*>(__rhs); | ||
| while (__count--) | ||
| { | ||
| auto const __lhs_v = *__lhs_c++; | ||
| auto const __rhs_v = *__rhs_c++; | ||
| if (__lhs_v < __rhs_v) | ||
| { | ||
| return -1; | ||
| } | ||
| if (__lhs_v > __rhs_v) | ||
| { | ||
| return 1; | ||
| } | ||
| } | ||
| return 0; | ||
| } |
There was a problem hiding this comment.
Does CCCL internally offer something similar? I feel this is a very generic util that we don't need to implement ourselves.
| template <class _Lhs, class _Rhs> | ||
| _CCCL_DEVICE constexpr __equal_result __equal_to(const _Lhs& __lhs, const _Rhs& __rhs) const noexcept | ||
| { | ||
| return __equal(__lhs, __rhs) ? __equal_result::__equal : __equal_result::__unequal; | ||
| } | ||
|
|
||
| template <__is_insert _IsInsert, class _Lhs, class _Rhs> | ||
| _CCCL_DEVICE constexpr __equal_result operator()(const _Lhs& __lhs, const _Rhs& __rhs) const noexcept |
There was a problem hiding this comment.
The purpose of this equal wrapper is to encapsulate both the bitwise sentinel check and the key comparison via the key_equal comparator into a single API, so users don’t need to manually perform a sentinel check before invoking the equality comparison.
However, in cases where the sentinel check has already been performed and only key equality is desired, it is preferable to call __equal_to directly instead of using the wrapper operator, which always performs the sentinel check.
It would be helpful to add documentation here to clarify this distinction and guide users on when to use each path.
There was a problem hiding this comment.
We probably want to use cuda::std::array instead of std::array for the prime array, as it can be calculated on either device or host.
| //! @brief Converts pair to tuple. | ||
| template <class _Key, class _Value> | ||
| struct __slot_to_tuple | ||
| { | ||
| template <class _Slot> | ||
| _CCCL_DEVICE ::cuda::std::tuple<_Key, _Value> operator()(const _Slot& __slot) | ||
| { | ||
| return ::cuda::std::tuple<_Key, _Value>(__slot.first, __slot.second); | ||
| } | ||
| }; | ||
|
|
||
| //! @brief Device functor returning whether the input slot is filled. | ||
| //! | ||
| //! Template parameter: | ||
| //! - `_Key`: Key type | ||
|
|
||
| template <class _Key> | ||
| struct __slot_is_filled | ||
| { | ||
| _Key __empty_key_sentinel; | ||
|
|
||
| template <class _Slot> | ||
| _CCCL_DEVICE bool operator()(const _Slot& __slot) | ||
| { | ||
| return !__detail::__bitwise_compare(::cuda::std::get<0>(__slot), __empty_key_sentinel); | ||
| } | ||
| }; |
There was a problem hiding this comment.
I think they can be safely removed, as they're only used by the legacy impl.
| constexpr _ForwardIt __lower_bound(_ForwardIt __first, _ForwardIt __last, const _Tp& __value) | ||
| { | ||
| using __diff_type = typename std::iterator_traits<_ForwardIt>::difference_type; |
There was a problem hiding this comment.
| constexpr _ForwardIt __lower_bound(_ForwardIt __first, _ForwardIt __last, const _Tp& __value) | |
| { | |
| using __diff_type = typename std::iterator_traits<_ForwardIt>::difference_type; | |
| _CCCL_HOST_DEVICE constexpr _ForwardIt __lower_bound(_ForwardIt __first, _ForwardIt __last, const _Tp& __value) | |
| { | |
| using __diff_type = typename cuda::std::iterator_traits<_ForwardIt>::difference_type; |
This is supposed to be a host-device API.
We could also use CCCL’s lower_bound if it exists and is constexpr under C++17.
| const auto __loop_stride = ::cuda::experimental::cuco::__detail::__grid_stride() / _CgSize; | ||
| auto __idx = ::cuda::experimental::cuco::__detail::__global_thread_id() / _CgSize; | ||
|
|
||
| auto __warp = cg::tiled_partition<32, cg::thread_block>(__block); |
There was a problem hiding this comment.
| auto __warp = cg::tiled_partition<32, cg::thread_block>(__block); | |
| auto __warp = cg::tiled_partition<warp_size, cg::thread_block>(__block); |
Is there any internal util we could use to avoid the magic number? We do have this util in cuco, though.
| template <class _Key, | ||
| class _Tp, | ||
| ::cuda::thread_scope _Scope = ::cuda::thread_scope_device, | ||
| class _KeyEqual = thrust::equal_to<_Key>, |
There was a problem hiding this comment.
| class _KeyEqual = thrust::equal_to<_Key>, | |
| class _KeyEqual = cuda::std::equal_to<_Key>, |
| using key_type = _Key; | ||
| using mapped_type = _Tp; | ||
| using value_type = ::cuda::std::pair<_Key, _Tp>; | ||
| using size_type = ::cuda::std::size_t; |
There was a problem hiding this comment.
If so, we can get rid of cuco::extent as well as the hash sanitizing logic altogether.
| //! @tparam _KeyEqual Binary callable type used to compare two keys for equality | ||
| //! @tparam _ProbingScheme Probing scheme type (e.g., `linear_probing`, `double_hashing`) | ||
| //! @tparam _BucketSize Number of slots per bucket | ||
| //! @tparam _MemoryResource Type of memory resource used for device storage |
There was a problem hiding this comment.
Let’s move all existing documentation here as well.
| { | ||
| /// @brief A valid (post-rounding) extent type. | ||
| template <class _SizeType> | ||
| using __valid_extent = extent<_SizeType, dynamic_extent>; |
There was a problem hiding this comment.
valid_extent is intended as a wrapper around fast_div for runtime sizes, or around compile-time constants otherwise. If we drop compile-time support, there is no reason to keep this type.
Description
closes #7463
Checklist