diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh index 6968f058f85..bfc2a6ed230 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh @@ -29,6 +29,7 @@ #include #include +#include namespace cuda::experimental::stf { @@ -77,7 +78,9 @@ public: size_t sz = this->shape.get_capacity() * sizeof(reserved::KeyValue); // NAIVE method ! - cuda_safe_call(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s)); + // cudaMemcpyAsync is an overload set (cuda_runtime.h alternate-spelling wrapper), + // so it keeps the runtime-status cuda_try form. + cuda_try>(dst, src, sz, kind, s); } void stream_data_allocate( @@ -95,18 +98,26 @@ public: if (memory_node.is_host()) { - // Fallback to a synchronous method - cuda_safe_call(cudaStreamSynchronize(stream)); - cuda_safe_call(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped)); + // Fallback to a synchronous method. cudaHostAlloc is an overload set + // (cuda_runtime.h templated wrapper), so it keeps the runtime-status form. + cuda_try(stream); + base_ptr = cuda_try>(s, cudaHostAllocMapped); memset(base_ptr, 0xff, s); } else { - cuda_safe_call(cudaMallocAsync(&base_ptr, s, stream)); + // cudaMallocAsync is an overload set (templated wrapper), so it keeps the + // runtime-status form. + cuda_try(cudaMallocAsync(&base_ptr, s, stream)); + // Free the buffer if the initialization below throws. + SCOPE(fail) + { + cuda_safe_call(cudaFreeAsync(base_ptr, stream)); + }; // We also need to initialize the hashtable static_assert(reserved::kEmpty == 0xffffffff, "memset expected kEmpty=0xffffffff"); - cuda_safe_call(cudaMemsetAsync(base_ptr, 0xff, s, stream)); + cuda_try(base_ptr, 0xff, s, stream); } local_desc.addr = base_ptr; @@ -120,16 +131,19 @@ public: cudaStream_t stream) override { hashtable& local_desc = this->instance(instance_id); + if (memory_node.is_host()) { // Fallback to a synchronous method - cuda_safe_call(cudaStreamSynchronize(stream)); - cuda_safe_call(cudaFreeHost(local_desc.addr)); + auto cudaStreamSynchronizeResult = cudaStreamSynchronize(stream); + cuda_try(local_desc.addr); + cuda_try(cudaStreamSynchronizeResult); } else { - cuda_safe_call(cudaFreeAsync(local_desc.addr, stream)); + cuda_try(local_desc.addr, stream); } + local_desc.addr = nullptr; // not strictly necessary, but helps debugging } }; diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh index d19ea175674..a2a02fd14ce 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh @@ -232,15 +232,17 @@ public: if constexpr (dimensions == 0) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s)); + // cudaMemcpyAsync is an overload set (cuda_runtime.h adds an alternate-spelling + // wrapper), so it keeps the runtime-status cuda_try form. + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s)); } else if constexpr (dimensions == 1) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s)); + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s)); } else if constexpr (dimensions == 2) { - cuda_safe_call(cudaMemcpy2DAsync( + cuda_try( dst_ptr, dst_instance.stride(1) * sizeof(T), src_ptr, @@ -248,14 +250,14 @@ public: b.extent(0) * sizeof(T), b.extent(1), kind, - s)); + s); } else { // We only support higher dimensions if they are contiguous ! if ((contiguous_dims(src_instance) == dimensions) && (contiguous_dims(dst_instance) == dimensions)) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s)); + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s)); } else { @@ -279,11 +281,7 @@ public: ::std::optional get_memory_type(instance_id_t instance_id) override { - auto s = this->instance(instance_id); - - cudaPointerAttributes attributes{}; - cuda_safe_call(cudaPointerGetAttributes(&attributes, s.data_handle())); - + const auto attributes = cuda_try(this->instance(instance_id).data_handle()); // Implicitly converted to an optional return attributes.type; } diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh index 453823392ac..f89956c72ac 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh @@ -115,7 +115,7 @@ public: if (e.affine_data_place().is_host()) { // TODO make a callback when the situation gets better - cuda_safe_call(cudaStreamSynchronize(s)); + cuda_try(s); // slice_print(in, "in before op"); // slice_print(inout, "inout before op"); @@ -160,7 +160,7 @@ public: if (e.affine_data_place().is_host()) { // TODO make a callback when the situation gets better - cuda_safe_call(cudaStreamSynchronize(s)); + cuda_try(s); if constexpr (dimensions == 1) { for (size_t i = 0; i < out.extent(0); i++)