Skip to content

Commit 1d98f48

Browse files
committed
Tested on RTX5090
1 parent 5cdabd4 commit 1d98f48

File tree

5 files changed

+29
-26
lines changed

5 files changed

+29
-26
lines changed

mlir/cuda-tile/README.md

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -152,7 +152,7 @@ compdb -p build list > compile_commands.json
152152
# func.func private @cuda_shim_stream_destroy(i64)
153153
# func.func private @cuda_shim_free(i64, i64)
154154
# func.func private @cuda_shim_stream_synchronize(i64)
155-
# func.func private @cuda_shim_launch_block_packed(i64, i64, i32, i32, i32, i64, i64, i64, i32)
155+
# func.func private @cuda_shim_launch_grid_packed(i64, i64, i32, i32, i32, i64, i64, i64, i32)
156156
# func.func private @cuda_shim_memcpy_d2h(i64, i64, i64)
157157
# func.func private @cuda_shim_memcpy_h2d(i64, i64, i64)
158158
# func.func private @cuda_shim_malloc(i64, i64, i1) -> i64
@@ -255,7 +255,7 @@ compdb -p build list > compile_commands.json
255255
# %14 = arith.index_cast %intptr_27 : index to i64
256256
# %intptr_28 = memref.extract_aligned_pointer_as_index %alloc_22 : memref<4xi64> -> index
257257
# %15 = arith.index_cast %intptr_28 : index to i64
258-
# call @cuda_shim_launch_block_packed(%4, %3, %c8_i32, %c1_i32, %c1_i32, %5, %14, %15, %c4_i32) : (i64, i64, i32, i32, i32, i64, i64, i64, i32) -> ()
258+
# call @cuda_shim_launch_grid_packed(%4, %3, %c8_i32, %c1_i32, %c1_i32, %5, %14, %15, %c4_i32) : (i64, i64, i32, i32, i32, i64, i64, i64, i32) -> ()
259259
# call @cuda_shim_stream_synchronize(%5) : (i64) -> ()
260260
# call @cuda_shim_memcpy_d2h(%13, %12, %c32_i64) : (i64, i64, i64) -> ()
261261
# memref.dealloc %alloc_21 : memref<4xi64>
@@ -292,7 +292,7 @@ compdb -p build list > compile_commands.json
292292
# llvm.func @cuda_shim_stream_destroy(i64) attributes {sym_visibility = "private"}
293293
# llvm.func @cuda_shim_free(i64, i64) attributes {sym_visibility = "private"}
294294
# llvm.func @cuda_shim_stream_synchronize(i64) attributes {sym_visibility = "private"}
295-
# llvm.func @cuda_shim_launch_block_packed(i64, i64, i32, i32, i32, i64, i64, i64, i32) attributes {sym_visibility = "private"}
295+
# llvm.func @cuda_shim_launch_grid_packed(i64, i64, i32, i32, i32, i64, i64, i64, i32) attributes {sym_visibility = "private"}
296296
# llvm.func @cuda_shim_memcpy_d2h(i64, i64, i64) attributes {sym_visibility = "private"}
297297
# llvm.func @cuda_shim_memcpy_h2d(i64, i64, i64) attributes {sym_visibility = "private"}
298298
# llvm.func @cuda_shim_malloc(i64, i64, i1) -> i64 attributes {sym_visibility = "private"}
@@ -350,7 +350,7 @@ compdb -p build list > compile_commands.json
350350
# !10 = !DISubprogram(name: "cuda_shim_stream_destroy", linkageName: "cuda_shim_stream_destroy", scope: !9, file: !9, line: 1, type: !4, scopeLine: 1, spFlags: DISPFlagOptimized)
351351
# !11 = !DISubprogram(name: "cuda_shim_free", linkageName: "cuda_shim_free", scope: !9, file: !9, line: 1, type: !4, scopeLine: 1, spFlags: DISPFlagOptimized)
352352
# !12 = !DISubprogram(name: "cuda_shim_stream_synchronize", linkageName: "cuda_shim_stream_synchronize", scope: !9, file: !9, line: 1, type: !4, scopeLine: 1, spFlags: DISPFlagOptimized)
353-
# !13 = !DISubprogram(name: "cuda_shim_launch_block_packed", linkageName: "cuda_shim_launch_block_packed", scope: !9, file: !9, line: 1, type: !4, scopeLine: 1, spFlags: DISPFlagOptimized)
353+
# !13 = !DISubprogram(name: "cuda_shim_launch_grid_packed", linkageName: "cuda_shim_launch_grid_packed", scope: !9, file: !9, line: 1, type: !4, scopeLine: 1, spFlags: DISPFlagOptimized)
354354
# !14 = !DISubprogram(name: "cuda_shim_memcpy_d2h", linkageName: "cuda_shim_memcpy_d2h", scope: !9, file: !9, line: 1, type: !4, scopeLine: 1, spFlags: DISPFlagOptimized)
355355
# !15 = !DISubprogram(name: "cuda_shim_memcpy_h2d", linkageName: "cuda_shim_memcpy_h2d", scope: !9, file: !9, line: 1, type: !4, scopeLine: 1, spFlags: DISPFlagOptimized)
356356
# !16 = !DISubprogram(name: "cuda_shim_malloc", linkageName: "cuda_shim_malloc", scope: !9, file: !9, line: 1, type: !4, scopeLine: 1, spFlags: DISPFlagOptimized)

mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -505,13 +505,16 @@ extern "C" void cuda_shim_launch_packed(
505505

506506
// Convenience: 1D launch, shared=0, stream optional
507507
extern "C" void
508-
cuda_shim_launch_block_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
509-
uint32_t blockX, uint32_t blockY, uint32_t blockZ,
510-
uint64_t stream, uint64_t arg_data_ptr,
511-
uint64_t arg_sizes_ptr, uint32_t num_args) {
512-
cuda_shim_launch_packed(module_handle, kernel_name_ptr, 1, 1, 1, blockX,
513-
blockY, blockZ, 0, stream, arg_data_ptr,
514-
arg_sizes_ptr, num_args);
508+
cuda_shim_launch_grid_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
509+
uint32_t gridX, uint32_t gridY, uint32_t gridZ,
510+
uint64_t stream, uint64_t arg_data_ptr,
511+
uint64_t arg_sizes_ptr, uint32_t num_args) {
512+
cuda_shim_launch_packed(module_handle, kernel_name_ptr, gridX, gridY, gridZ,
513+
1, 1, 1, 0, stream, arg_data_ptr, arg_sizes_ptr,
514+
num_args);
515+
// cuda_shim_launch_packed(module_handle, kernel_name_ptr, 1, 1, 1, blockX,
516+
// blockY, blockZ, 0, stream, arg_data_ptr,
517+
// arg_sizes_ptr, num_args);
515518
}
516519

517520
// Optional: global sync (avoid in async pipeline; prefer event/stream sync)

mlir/cuda-tile/Toy/include/cuda_shim/CudaShimBuilder.hpp

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,7 @@ static llvm::DenseMap<CudaShimFn, llvm::StringRef> CudaShimFnNames = {
135135
{CudaShimFn::StreamDestroy, "cuda_shim_stream_destroy"},
136136
{CudaShimFn::StreamSynchronize, "cuda_shim_stream_synchronize"},
137137
{CudaShimFn::LaunchPacked, "cuda_shim_launch_packed"},
138-
{CudaShimFn::LaunchBlockPacked, "cuda_shim_launch_block_packed"},
138+
{CudaShimFn::LaunchBlockPacked, "cuda_shim_launch_grid_packed"},
139139
{CudaShimFn::CtxSynchronize, "cuda_shim_ctx_synchronize"},
140140
};
141141

@@ -333,7 +333,7 @@ class CudaShimRegistry {
333333
// {})};
334334

335335
// case CudaShimFn::LaunchBlockPacked:
336-
// return {"cuda_shim_launch_block_packed",
336+
// return {"cuda_shim_launch_grid_packed",
337337
// rewriter.getFunctionType(
338338
// {
339339
// i64, // module_handle
@@ -498,11 +498,11 @@ void cuda_shim_launch_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
498498
uint32_t num_args);
499499

500500
// Convenience: 1D launch, shared=0, stream optional
501-
void cuda_shim_launch_block_packed(uint64_t module_handle,
502-
uint64_t kernel_name_ptr, uint32_t blockX,
503-
uint32_t blockY, uint32_t blockZ,
504-
uint64_t stream, uint64_t arg_data_ptr,
505-
uint64_t arg_sizes_ptr, uint32_t num_args);
501+
void cuda_shim_launch_grid_packed(uint64_t module_handle,
502+
uint64_t kernel_name_ptr, uint32_t blockX,
503+
uint32_t blockY, uint32_t blockZ,
504+
uint64_t stream, uint64_t arg_data_ptr,
505+
uint64_t arg_sizes_ptr, uint32_t num_args);
506506

507507
// Optional: global sync (avoid in async pipeline; prefer event/stream sync)
508508
void cuda_shim_ctx_synchronize(void);
@@ -560,7 +560,7 @@ buildCudaShimSymbolMap(llvm::orc::MangleAndInterner interner) {
560560

561561
// ---- launch ----
562562
add("cuda_shim_launch_packed", (void *)&cuda_shim_launch_packed);
563-
add("cuda_shim_launch_block_packed", (void *)&cuda_shim_launch_block_packed);
563+
add("cuda_shim_launch_grid_packed", (void *)&cuda_shim_launch_grid_packed);
564564

565565
return syms;
566566
}

mlir/cuda-tile/cuda_shim/cuda_shim.cc

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -505,10 +505,10 @@ extern "C" void cuda_shim_launch_packed(
505505

506506
// Convenience: 1D launch, shared=0, stream optional
507507
extern "C" void
508-
cuda_shim_launch_block_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
509-
uint32_t blockX, uint32_t blockY, uint32_t blockZ,
510-
uint64_t stream, uint64_t arg_data_ptr,
511-
uint64_t arg_sizes_ptr, uint32_t num_args) {
508+
cuda_shim_launch_grid_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
509+
uint32_t blockX, uint32_t blockY, uint32_t blockZ,
510+
uint64_t stream, uint64_t arg_data_ptr,
511+
uint64_t arg_sizes_ptr, uint32_t num_args) {
512512
cuda_shim_launch_packed(module_handle, kernel_name_ptr, 1, 1, 1, blockX,
513513
blockY, blockZ, 0, stream, arg_data_ptr,
514514
arg_sizes_ptr, num_args);
@@ -519,7 +519,7 @@ extern "C" void cuda_shim_ctx_synchronize(void) { mgpuCtxSynchronize(); }
519519

520520
// only for debugging
521521
extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) {
522-
auto *p = reinterpret_cast<const float*>(static_cast<uintptr_t>(dptr));
522+
auto *p = reinterpret_cast<const float *>(static_cast<uintptr_t>(dptr));
523523
for (uint32_t i = 0; i < n; ++i) {
524524
fprintf(stderr, "i=%u v=%f\n", i, p[i]);
525525
}

mlir/cuda-tile/sample/test.mlir

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ module {
33
func.func private @cuda_shim_stream_destroy(i64)
44
func.func private @cuda_shim_free(i64, i64)
55
func.func private @cuda_shim_stream_synchronize(i64)
6-
func.func private @cuda_shim_launch_block_packed(i64, i64, i32, i32, i32, i64, i64, i64, i32)
6+
func.func private @cuda_shim_launch_grid_packed(i64, i64, i32, i32, i32, i64, i64, i64, i32)
77
func.func private @cuda_shim_memcpy_d2h(i64, i64, i64)
88
func.func private @cuda_shim_memcpy_h2d(i64, i64, i64)
99
func.func private @cuda_shim_malloc(i64, i64, i1) -> i64
@@ -135,7 +135,7 @@ module {
135135
%14 = arith.index_cast %intptr_55 : index to i64
136136
%intptr_56 = memref.extract_aligned_pointer_as_index %alloc_36 : memref<4xi64> -> index
137137
%15 = arith.index_cast %intptr_56 : index to i64
138-
call @cuda_shim_launch_block_packed(%4, %3, %c8_i32, %c1_i32, %c1_i32_54, %5, %14, %15, %c4_i32) : (i64, i64, i32, i32, i32, i64, i64, i64, i32) -> ()
138+
call @cuda_shim_launch_grid_packed(%4, %3, %c8_i32, %c1_i32, %c1_i32_54, %5, %14, %15, %c4_i32) : (i64, i64, i32, i32, i32, i64, i64, i64, i32) -> ()
139139
call @cuda_shim_stream_synchronize(%5) : (i64) -> ()
140140
call @cuda_shim_memcpy_d2h(%13, %12, %c32_i64_49) : (i64, i64, i64) -> ()
141141
memref.dealloc %alloc_35 : memref<4xi64>

0 commit comments

Comments
 (0)