From 3128637a0618dacd0459f6c1cc5f287da46be82d Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 23 Dec 2025 13:02:59 +0800 Subject: [PATCH 1/2] issue/808: success qy v0.1.0 --- README.md | 1 + include/infinicore.h | 1 + src/infiniccl-test/main.cpp | 3 +- src/infiniccl/cuda/infiniccl_cuda.h | 2 +- src/infiniccl/infiniccl.cc | 3 + src/infiniop-test/src/main.cpp | 3 +- src/infiniop/devices/handle.cc | 8 +- src/infiniop/devices/nvidia/nvidia_common.cu | 14 ++ src/infiniop/devices/nvidia/nvidia_handle.h | 11 ++ .../devices/nvidia/nvidia_kernel_common.cuh | 2 +- src/infiniop/ops/add/operator.cc | 14 +- src/infiniop/ops/causal_softmax/operator.cc | 14 +- src/infiniop/ops/clip/operator.cc | 14 +- src/infiniop/ops/conv/operator.cc | 14 +- .../nvidia/dequantize_w42f16_kernel.cuh | 42 ++++- .../nvidia/dequantize_w42f16_nvidia.cu | 58 ++++++- src/infiniop/ops/dequantize_awq/operator.cc | 14 +- src/infiniop/ops/gemm/operator.cc | 14 +- src/infiniop/ops/logsoftmax/operator.cc | 14 +- src/infiniop/ops/mul/operator.cc | 14 +- src/infiniop/ops/paged_attention/operator.cc | 14 +- src/infiniop/ops/paged_caching/operator.cc | 14 +- src/infiniop/ops/random_sample/operator.cc | 14 +- src/infiniop/ops/rearrange/operator.cc | 12 +- src/infiniop/ops/relu/cuda/kernel.cuh | 35 ++++ src/infiniop/ops/relu/nvidia/relu_nvidia.cu | 21 ++- src/infiniop/ops/relu/nvidia/relu_nvidia.cuh | 4 +- src/infiniop/ops/relu/operator.cc | 24 +-- src/infiniop/ops/rms_norm/operator.cc | 14 +- src/infiniop/ops/rope/operator.cc | 14 +- src/infiniop/ops/softplus/operator.cc | 14 +- src/infiniop/ops/sub/operator.cc | 14 +- src/infiniop/ops/swiglu/operator.cc | 14 +- .../topkrouter/nvidia/topkrouter_nvidia.cu | 2 +- src/infiniop/ops/topkrouter/operator.cc | 14 +- src/infinirt/cuda/infinirt_cuda.cuh | 2 +- src/infinirt/infinirt.cc | 5 +- test/infiniop/libinfiniop/devices.py | 4 +- test/infiniop/libinfiniop/utils.py | 7 + xmake.lua | 26 +++ xmake/qy.lua | 154 ++++++++++++++++++ 41 files changed, 632 insertions(+), 50 deletions(-) create mode 100644 src/infiniop/ops/relu/cuda/kernel.cuh create mode 100644 xmake/qy.lua diff --git a/README.md b/README.md index 8ef1b9dc8..f08cae84c 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS] | `--metax-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n | `--moore-gpu=[y\|n]` | 是否编译摩尔线程 GPU 接口实现 | n | `--iluvatar-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n +| `--qy-gpu=[y\|n]` | 是否编QY GPU 接口实现 | n | `--hygon-dcu=[y\|n]` | 是否编译海光 DCU 接口实现 | n | `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n | `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n diff --git a/include/infinicore.h b/include/infinicore.h index f9d4662d8..49654937e 100644 --- a/include/infinicore.h +++ b/include/infinicore.h @@ -46,6 +46,7 @@ typedef enum { INFINI_DEVICE_ILUVATAR = 6, INFINI_DEVICE_KUNLUN = 7, INFINI_DEVICE_HYGON = 8, + INFINI_DEVICE_QY = 9, INFINI_DEVICE_TYPE_COUNT } infiniDevice_t; diff --git a/src/infiniccl-test/main.cpp b/src/infiniccl-test/main.cpp index a46648cea..5a596b674 100644 --- a/src/infiniccl-test/main.cpp +++ b/src/infiniccl-test/main.cpp @@ -12,7 +12,7 @@ void printUsage() { std::cout << "infiniccl-test --" << std::endl << std::endl; std::cout << " --" << std::endl; - std::cout << " Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|hygon)." << std::endl + std::cout << " Specify the device type --(nvidia|qy|cambricon|ascend|metax|moore|iluvatar|kunlun|hygon)." << std::endl << std::endl; std::cout << "The program will run tests on all visible devices of the specified device type." << " Use Environmental Variables such as CUDA_VSIBLE_DEVICES to limit visible device IDs."; @@ -45,6 +45,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) { else PARSE_DEVICE("--iluvatar", INFINI_DEVICE_ILUVATAR) else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN) else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON) + else PARSE_DEVICE("--qy", INFINI_DEVICE_QY) else { printUsage(); } diff --git a/src/infiniccl/cuda/infiniccl_cuda.h b/src/infiniccl/cuda/infiniccl_cuda.h index c9d88007f..bea936de0 100644 --- a/src/infiniccl/cuda/infiniccl_cuda.h +++ b/src/infiniccl/cuda/infiniccl_cuda.h @@ -4,7 +4,7 @@ #include "../infiniccl_impl.h" // Windows does not support CUDA -#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)) && defined(ENABLE_CCL) && !defined(_WIN32) +#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API)) && defined(ENABLE_CCL) && !defined(_WIN32) INFINICCL_DEVICE_API_IMPL(cuda) #else INFINICCL_DEVICE_API_NOOP(cuda) diff --git a/src/infiniccl/infiniccl.cc b/src/infiniccl/infiniccl.cc index 179debd77..52aede8b7 100644 --- a/src/infiniccl/infiniccl.cc +++ b/src/infiniccl/infiniccl.cc @@ -26,6 +26,7 @@ __C infiniStatus_t infinicclCommInitAll( COMM_INIT_ALL(INFINI_DEVICE_METAX, metax); COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore); COMM_INIT_ALL(INFINI_DEVICE_KUNLUN, kunlun); + COMM_INIT_ALL(INFINI_DEVICE_QY, cuda); default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -51,6 +52,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) { COMM_DESTROY(INFINI_DEVICE_METAX, metax); COMM_DESTROY(INFINI_DEVICE_MOORE, moore); COMM_DESTROY(INFINI_DEVICE_KUNLUN, kunlun); + COMM_DESTROY(INFINI_DEVICE_QY, cuda); default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -83,6 +85,7 @@ __C infiniStatus_t infinicclAllReduce( ALL_REDUCE(INFINI_DEVICE_METAX, metax); ALL_REDUCE(INFINI_DEVICE_MOORE, moore); ALL_REDUCE(INFINI_DEVICE_KUNLUN, kunlun); + ALL_REDUCE(INFINI_DEVICE_QY, cuda); default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp index 9863f1135..4ad1a72e2 100644 --- a/src/infiniop-test/src/main.cpp +++ b/src/infiniop-test/src/main.cpp @@ -22,7 +22,7 @@ void printUsage() { std::cout << " Path to the test gguf file" << std::endl << std::endl; std::cout << " --[:id]" << std::endl; - std::cout << " (Optional) Specify the device type --(cpu|nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|hygon) and device ID (optional). CPU by default." << std::endl + std::cout << " (Optional) Specify the device type --(cpu|nvidia|qy|cambricon|ascend|metax|moore|iluvatar|kunlun|hygon) and device ID (optional). CPU by default." << std::endl << std::endl; std::cout << " --warmup " << std::endl; std::cout << " (Optional) Number of warmups to perform before timing. Default to 0." << std::endl @@ -79,6 +79,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) { PARSE_DEVICE("--iluvatar", INFINI_DEVICE_ILUVATAR) PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN) PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON) + PARSE_DEVICE("--qy", INFINI_DEVICE_QY) else if (arg == "--warmup" && i + 1 < argc) { args.warmups = std::stoi(argv[++i]); } diff --git a/src/infiniop/devices/handle.cc b/src/infiniop/devices/handle.cc index 90c199b9f..86f3ab271 100644 --- a/src/infiniop/devices/handle.cc +++ b/src/infiniop/devices/handle.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/cpu_handle.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) #include "nvidia/nvidia_handle.h" #endif #ifdef ENABLE_CAMBRICON_API @@ -44,6 +44,9 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, qy); +#endif #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, iluvatar); #endif @@ -87,6 +90,9 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { #ifdef ENABLE_NVIDIA_API DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, qy); +#endif #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, iluvatar); #endif diff --git a/src/infiniop/devices/nvidia/nvidia_common.cu b/src/infiniop/devices/nvidia/nvidia_common.cu index 536dff853..b1435dcb0 100644 --- a/src/infiniop/devices/nvidia/nvidia_common.cu +++ b/src/infiniop/devices/nvidia/nvidia_common.cu @@ -65,8 +65,10 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) { return CUDNN_DATA_HALF; case INFINI_DTYPE_F32: return CUDNN_DATA_FLOAT; +#ifndef ENABLE_QY_API case INFINI_DTYPE_F64: return CUDNN_DATA_DOUBLE; +#endif case INFINI_DTYPE_BF16: return CUDNN_DATA_BFLOAT16; case INFINI_DTYPE_I8: @@ -116,4 +118,16 @@ infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) { } // namespace hygon +namespace qy { + +Handle::Handle(int device_id) + : nvidia::Handle(INFINI_DEVICE_QY, device_id) {} + +infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) { + *handle_ptr = new Handle(device_id); + return INFINI_STATUS_SUCCESS; +} + +} // namespace qy + } // namespace device diff --git a/src/infiniop/devices/nvidia/nvidia_handle.h b/src/infiniop/devices/nvidia/nvidia_handle.h index af1dde721..41be4d18b 100644 --- a/src/infiniop/devices/nvidia/nvidia_handle.h +++ b/src/infiniop/devices/nvidia/nvidia_handle.h @@ -46,6 +46,17 @@ struct Handle : public nvidia::Handle { } // namespace hygon +namespace qy { + +struct Handle : public nvidia::Handle { + Handle(int device_id); + +public: + static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id); +}; + +} // namespace qy + } // namespace device #endif // __INFINIOP_CUDA_HANDLE_H__ diff --git a/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh b/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh index feea4018e..62bab471c 100644 --- a/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh +++ b/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh @@ -50,7 +50,7 @@ exp_(const float val) { return expf(val); } -#if !defined(ENABLE_ILUVATAR_API) && !defined(ENABLE_HYGON_API) +#if !defined(ENABLE_ILUVATAR_API) && !defined(ENABLE_QY_API) && !defined(ENABLE_HYGON_API) __forceinline__ __device__ long double exp_(const long double val) { return expl(val); diff --git a/src/infiniop/ops/add/operator.cc b/src/infiniop/ops/add/operator.cc index 52d19e501..4161da9de 100644 --- a/src/infiniop/ops/add/operator.cc +++ b/src/infiniop/ops/add/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/add_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/add_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -45,6 +45,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -79,6 +82,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -121,6 +127,9 @@ __C infiniStatus_t infiniopAdd( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -157,6 +166,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/causal_softmax/operator.cc b/src/infiniop/ops/causal_softmax/operator.cc index 975a12c32..835fd45ef 100644 --- a/src/infiniop/ops/causal_softmax/operator.cc +++ b/src/infiniop/ops/causal_softmax/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/causal_softmax_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) #include "nvidia/causal_softmax_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -48,6 +48,9 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -87,6 +90,9 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API GET(INFINI_DEVICE_HYGON, nvidia); #endif @@ -131,6 +137,9 @@ __C infiniStatus_t infiniopCausalSoftmax( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -170,6 +179,9 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD #ifdef ENABLE_ILUVATAR_API DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API DESTROY(INFINI_DEVICE_HYGON, nvidia); #endif diff --git a/src/infiniop/ops/clip/operator.cc b/src/infiniop/ops/clip/operator.cc index ac0fefe7d..88a5ac719 100644 --- a/src/infiniop/ops/clip/operator.cc +++ b/src/infiniop/ops/clip/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/clip_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/clip_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -73,6 +76,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax) #endif @@ -112,6 +118,9 @@ __C infiniStatus_t infiniopClip( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -145,6 +154,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/conv/operator.cc b/src/infiniop/ops/conv/operator.cc index df033f44f..4c974febc 100644 --- a/src/infiniop/ops/conv/operator.cc +++ b/src/infiniop/ops/conv/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/conv_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/conv_nvidia.cuh" #endif @@ -42,6 +42,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -70,6 +73,9 @@ infiniopGetConvWorkspaceSize( #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -106,6 +112,9 @@ __C infiniStatus_t infiniopConv( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -130,6 +139,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh index cdb7c85aa..d1dcc0f44 100644 --- a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh +++ b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh @@ -2,7 +2,47 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 -#error "dequantize_s4_to_fp16x2 requires CUDA compute capability >= 7.5" + // 步骤 1: 从一个 32-bit 源数据中解包出 8 个 4-bit 无符号整数。 + // 源数据的内存布局被假定为 [v7, v6, v5, v4, v3, v2, v1, v0], + // 其中每个 'v' 都是一个 4-bit 的半字节 (nibble)。 + const unsigned int v0 = (source >> 0) & 0x0F; + const unsigned int v1 = (source >> 4) & 0x0F; + const unsigned int v2 = (source >> 8) & 0x0F; + const unsigned int v3 = (source >> 12) & 0x0F; + const unsigned int v4 = (source >> 16) & 0x0F; + const unsigned int v5 = (source >> 20) & 0x0F; + const unsigned int v6 = (source >> 24) & 0x0F; + const unsigned int v7 = (source >> 28) & 0x0F; + + // 步骤 2: 对于 signed 4-bit (s4),减去 8 以映射到 [-8, 7] 范围。 + // 定义偏移量 + __half offset = __half(8); + + // 计算 signed 值 + __half hv0 = __half(v0) - offset; + __half hv1 = __half(v1) - offset; + __half hv2 = __half(v2) - offset; + __half hv3 = __half(v3) - offset; + __half hv4 = __half(v4) - offset; + __half hv5 = __half(v5) - offset; + __half hv6 = __half(v6) - offset; + __half hv7 = __half(v7) - offset; + + // 步骤 3: 将 half 值按 PTX 交错顺序打包成 __half2 并存入 result 中。 + // 顺序:result_ptr[0]: low=hv0, high=hv4 + // result_ptr[1]: low=hv1, high=hv5 + // result_ptr[2]: low=hv2, high=hv6 + // result_ptr[3]: low=hv3, high=hv7 + // __halves2half2 函数:low 为第一个参数,high 为第二个参数。 + uint4 result; + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + + result_ptr[0] = __halves2half2(hv0, hv4); + result_ptr[1] = __halves2half2(hv1, hv5); + result_ptr[2] = __halves2half2(hv2, hv6); + result_ptr[3] = __halves2half2(hv3, hv7); + + return result; #else uint4 result; diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu index d0775fded..d83e94c5c 100644 --- a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu +++ b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu @@ -1,4 +1,4 @@ -#ifdef ENABLE_NVIDIA_API +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) #include "../../../devices/nvidia/nvidia_handle.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" @@ -8,6 +8,61 @@ #include "../dequantize_awq.h" #include +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 750) +__global__ void __launch_bounds__(64) + dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors, + int *__restrict__ zeros, half *__restrict__ C, int G) { + // static constexpr uint32_t ZERO = 0x0; + half B_shared[32 * (128 + 8)]; + + half *B_shared_ptr2 = B_shared; + + int N = blockDim.x * gridDim.x; // 2 + int col = (blockIdx.x * blockDim.x + threadIdx.x); + int row = (blockIdx.y * blockDim.y + threadIdx.y); + int index1 = 8 * col + 8 * row * N; + half *C_ptr2 = C + index1; + + int index2 = col + row * N; + int *B_ptr2 = B + index2; + + int index3 = col + (int)(row / G) * N; + int *zeros_ptr2 = zeros + index3; + int index4 = 8 * col + (int)(row / G) * N * 8; + half *scaling_factors_ptr2 = scaling_factors + index4; + + uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2); + uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); + uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2); + + uint32_t B_loaded = *(uint32_t *)B_ptr2; + uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); + + // Reinterpret uint4 components as __half2 + __half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16); + __half2 *B_loaded_zero_h2 = reinterpret_cast<__half2 *>(&B_loaded_zero); + __half2 *B_loaded_scale_h2 = reinterpret_cast<__half2 *>(&B_loaded_scale); + + // Replace PTX sub.f16x2 with __hsub2 for each component + B_loaded_fp16_h2[0] = __hsub2(B_loaded_fp16_h2[0], B_loaded_zero_h2[0]); + B_loaded_fp16_h2[1] = __hsub2(B_loaded_fp16_h2[1], B_loaded_zero_h2[1]); + B_loaded_fp16_h2[2] = __hsub2(B_loaded_fp16_h2[2], B_loaded_zero_h2[2]); + B_loaded_fp16_h2[3] = __hsub2(B_loaded_fp16_h2[3], B_loaded_zero_h2[3]); + + // Replace PTX fma.rn.f16x2 with __hfma2 for each component + B_loaded_fp16_h2[0] = __hfma2(B_loaded_fp16_h2[0], B_loaded_scale_h2[0], __float2half2_rn(0.0f)); + B_loaded_fp16_h2[1] = __hfma2(B_loaded_fp16_h2[1], B_loaded_scale_h2[1], __float2half2_rn(0.0f)); + B_loaded_fp16_h2[2] = __hfma2(B_loaded_fp16_h2[2], B_loaded_scale_h2[2], __float2half2_rn(0.0f)); + B_loaded_fp16_h2[3] = __hfma2(B_loaded_fp16_h2[3], B_loaded_scale_h2[3], __float2half2_rn(0.0f)); + + // Store back to shared memory + *(uint4 *)B_shared_ptr2 = B_loaded_fp16; + + for (int i = 0; i < 8; ++i) { + *(C_ptr2 + i) = B_shared[i]; + } +} +#else __global__ void __launch_bounds__(64) dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors, int *__restrict__ zeros, half *__restrict__ C, int group_size) { @@ -67,6 +122,7 @@ __global__ void __launch_bounds__(64) *(C_ptr2 + i) = B_shared[i]; } } +#endif namespace op::dequantize_awq::nvidia { diff --git a/src/infiniop/ops/dequantize_awq/operator.cc b/src/infiniop/ops/dequantize_awq/operator.cc index 537d0cfbc..fba14e63f 100644 --- a/src/infiniop/ops/dequantize_awq/operator.cc +++ b/src/infiniop/ops/dequantize_awq/operator.cc @@ -2,7 +2,7 @@ #include "../../handle.h" #include "infiniop/ops/dequantize_awq.h" -#ifdef ENABLE_NVIDIA_API +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) #include "nvidia/dequantize_w42f16_nvidia.cuh" #endif #ifdef ENABLE_MOORE_API @@ -39,6 +39,9 @@ __C infiniStatus_t infiniopCreateDequantizeAWQDescriptor( #endif #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, iluvatar); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -63,6 +66,9 @@ __C infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDe #endif #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, iluvatar); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -94,6 +100,9 @@ __C infiniStatus_t infiniopDequantizeAWQ( #endif #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, iluvatar); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -119,6 +128,9 @@ infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc) { #endif #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, iluvatar); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc index 7169046a6..e9fc37bb4 100644 --- a/src/infiniop/ops/gemm/operator.cc +++ b/src/infiniop/ops/gemm/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/gemm_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) #include "nvidia/gemm_nvidia.cuh" #endif #ifdef ENABLE_CAMBRICON_API @@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateGemmDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -99,6 +102,9 @@ infiniopGetGemmWorkspaceSize( #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API GET(INFINI_DEVICE_HYGON, nvidia); #endif @@ -154,6 +160,9 @@ __C infiniStatus_t infiniopGemm( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -199,6 +208,9 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API DELETE(INFINI_DEVICE_HYGON, nvidia); #endif diff --git a/src/infiniop/ops/logsoftmax/operator.cc b/src/infiniop/ops/logsoftmax/operator.cc index ffb78135f..b1ecece04 100644 --- a/src/infiniop/ops/logsoftmax/operator.cc +++ b/src/infiniop/ops/logsoftmax/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/logsoftmax_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/logsoftmax_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -39,6 +39,9 @@ __C infiniStatus_t infiniopCreateLogSoftmaxDescriptor( #ifdef ENABLE_ILUVATAR_API // CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API // CREATE(INFINI_DEVICE_METAX, metax) #endif @@ -66,6 +69,9 @@ __C infiniStatus_t infiniopGetLogSoftmaxWorkspaceSize(infiniopLogSoftmaxDescript #ifdef ENABLE_ILUVATAR_API // GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API // GET(INFINI_DEVICE_METAX, metax) #endif @@ -98,6 +104,9 @@ __C infiniStatus_t infiniopLogSoftmax( #ifdef ENABLE_ILUVATAR_API // CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API // CALCULATE(INFINI_DEVICE_METAX, metax) #endif @@ -125,6 +134,9 @@ __C infiniStatus_t infiniopDestroyLogSoftmaxDescriptor(infiniopLogSoftmaxDescrip #ifdef ENABLE_ILUVATAR_API // DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API // DESTROY(INFINI_DEVICE_METAX, metax) #endif diff --git a/src/infiniop/ops/mul/operator.cc b/src/infiniop/ops/mul/operator.cc index 83fd20e29..951c681fc 100644 --- a/src/infiniop/ops/mul/operator.cc +++ b/src/infiniop/ops/mul/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/mul_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/mul_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateMulDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -73,6 +76,9 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -113,6 +119,9 @@ __C infiniStatus_t infiniopMul( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -146,6 +155,9 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/paged_attention/operator.cc b/src/infiniop/ops/paged_attention/operator.cc index f41adb2cb..d86556656 100644 --- a/src/infiniop/ops/paged_attention/operator.cc +++ b/src/infiniop/ops/paged_attention/operator.cc @@ -2,7 +2,7 @@ #include "../../handle.h" #include "infiniop/ops/paged_attention.h" -#ifdef ENABLE_NVIDIA_API +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) #include "nvidia/paged_attention_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -34,6 +34,9 @@ __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax) #endif @@ -54,6 +57,9 @@ __C infiniStatus_t infiniopGetPagedAttentionWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax) #endif @@ -78,6 +84,9 @@ __C infiniStatus_t infiniopPagedAttention( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax) #endif @@ -97,6 +106,9 @@ __C infiniStatus_t infiniopDestroyPagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API DESTROY(INFINI_DEVICE_METAX, metax) #endif diff --git a/src/infiniop/ops/paged_caching/operator.cc b/src/infiniop/ops/paged_caching/operator.cc index a69b0e07e..2ed79c8cd 100644 --- a/src/infiniop/ops/paged_caching/operator.cc +++ b/src/infiniop/ops/paged_caching/operator.cc @@ -2,7 +2,7 @@ #include "../../handle.h" #include "infiniop/ops/paged_caching.h" -#ifdef ENABLE_NVIDIA_API +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) #include "nvidia/paged_caching_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -29,6 +29,9 @@ __C infiniStatus_t infiniopCreatePagedCachingDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax) #endif @@ -49,6 +52,9 @@ __C infiniStatus_t infiniopGetPagedCachingWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax) #endif @@ -73,6 +79,9 @@ __C infiniStatus_t infiniopPagedCaching( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax) #endif @@ -92,6 +101,9 @@ __C infiniStatus_t infiniopDestroyPagedCachingDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif #ifdef ENABLE_METAX_API DESTROY(INFINI_DEVICE_METAX, metax) #endif diff --git a/src/infiniop/ops/random_sample/operator.cc b/src/infiniop/ops/random_sample/operator.cc index 4d40fb0ac..e237c17ab 100644 --- a/src/infiniop/ops/random_sample/operator.cc +++ b/src/infiniop/ops/random_sample/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/random_sample_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) #include "nvidia/random_sample_nvidia.cuh" #endif #ifdef ENABLE_CAMBRICON_API @@ -50,6 +50,9 @@ infiniopCreateRandomSampleDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -98,6 +101,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize( #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API GET(INFINI_DEVICE_HYGON, nvidia); #endif @@ -156,6 +162,9 @@ __C infiniStatus_t infiniopRandomSample( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -201,6 +210,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor( #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API DELETE(INFINI_DEVICE_HYGON, nvidia); #endif diff --git a/src/infiniop/ops/rearrange/operator.cc b/src/infiniop/ops/rearrange/operator.cc index ae8643659..22a5b814e 100644 --- a/src/infiniop/ops/rearrange/operator.cc +++ b/src/infiniop/ops/rearrange/operator.cc @@ -8,7 +8,7 @@ #ifdef ENABLE_ASCEND_API #include "ascend/rearrange_ascend.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) #include "nvidia/rearrange_nvidia.cuh" #endif #ifdef ENABLE_CAMBRICON_API @@ -52,6 +52,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -96,8 +99,8 @@ __C infiniStatus_t infiniopRearrange( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); #endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); @@ -144,6 +147,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor( #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API DELETE(INFINI_DEVICE_HYGON, nvidia); #endif diff --git a/src/infiniop/ops/relu/cuda/kernel.cuh b/src/infiniop/ops/relu/cuda/kernel.cuh new file mode 100644 index 000000000..d1c92fe3c --- /dev/null +++ b/src/infiniop/ops/relu/cuda/kernel.cuh @@ -0,0 +1,35 @@ +#ifndef __RELU_CUDA_H__ +#define __RELU_CUDA_H__ + +#include + +namespace op::relu::cuda { + +typedef struct ReluOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + + if constexpr (std::is_same_v) { + float x_f = __bfloat162float(x); + float result = (x_f > 0.0f ? x_f : 0.0f); + + return __float2bfloat16(result); + } else if constexpr (std::is_same_v) { + float x_f = __half2float(x); + float result = (x_f > 0.0f ? x_f : 0.0f); + + return __float2half(result); + } else if constexpr (std::is_same_v) { + + return (x > 0.0f ? x : 0.0f); + } else { + return (x > 0.0 ? x : 0.0); + } + } +} ReluOp; + +} // namespace op::relu::cuda + +#endif // __RELU_CUDA_H__ diff --git a/src/infiniop/ops/relu/nvidia/relu_nvidia.cu b/src/infiniop/ops/relu/nvidia/relu_nvidia.cu index 5e9151081..5d84b3c7a 100644 --- a/src/infiniop/ops/relu/nvidia/relu_nvidia.cu +++ b/src/infiniop/ops/relu/nvidia/relu_nvidia.cu @@ -1,7 +1,11 @@ #ifdef ENABLE_NINETOOTHED #include "../../../../../build/ninetoothed/relu.h" +#endif #include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" #include "relu_nvidia.cuh" namespace op::relu::nvidia { @@ -41,6 +45,7 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_INSUFFICIENT_WORKSPACE; } +#ifdef ENABLE_NINETOOTHED const auto &ndim{_info.getNdim()}; const auto &x_shape_{_info.getInputShape(0)}; const auto &x_strides_{_info.getInputStrides(0)}; @@ -72,9 +77,21 @@ infiniStatus_t Descriptor::calculate( default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } +#else + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ReluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ReluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +#endif return INFINI_STATUS_SUCCESS; } } // namespace op::relu::nvidia - -#endif diff --git a/src/infiniop/ops/relu/nvidia/relu_nvidia.cuh b/src/infiniop/ops/relu/nvidia/relu_nvidia.cuh index 20aacbb11..46c3d9b4c 100644 --- a/src/infiniop/ops/relu/nvidia/relu_nvidia.cuh +++ b/src/infiniop/ops/relu/nvidia/relu_nvidia.cuh @@ -1,12 +1,12 @@ #ifndef __RELU_NVIDIA_API_H__ #define __RELU_NVIDIA_API_H__ -#ifdef ENABLE_NINETOOTHED +// #ifdef ENABLE_NINETOOTHED #include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" ELEMENTWISE_DESCRIPTOR(relu, nvidia) -#endif +// #endif #endif // __RELU_NVIDIA_API_H__ diff --git a/src/infiniop/ops/relu/operator.cc b/src/infiniop/ops/relu/operator.cc index b6f3a8deb..093674de6 100644 --- a/src/infiniop/ops/relu/operator.cc +++ b/src/infiniop/ops/relu/operator.cc @@ -5,11 +5,9 @@ #ifdef ENABLE_CPU_API #include "cpu/relu_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) -#ifdef ENABLE_NINETOOTHED +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/relu_nvidia.cuh" #endif -#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED #include "metax/relu_metax.h" @@ -36,14 +34,13 @@ __C infiniStatus_t infiniopCreateReluDescriptor( CREATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API -#ifdef ENABLE_NINETOOTHED CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif -#endif #ifdef ENABLE_ILUVATAR_API -#ifdef ENABLE_NINETOOTHED CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); #endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED @@ -70,14 +67,13 @@ __C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, s GET(INFINI_DEVICE_CPU, cpu) #endif #ifdef ENABLE_NVIDIA_API -#ifdef ENABLE_NINETOOTHED GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#endif #ifdef ENABLE_ILUVATAR_API -#ifdef ENABLE_NINETOOTHED GET(INFINI_DEVICE_ILUVATAR, nvidia) #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) #endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED @@ -111,14 +107,13 @@ __C infiniStatus_t infiniopRelu( CALCULATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API -#ifdef ENABLE_NINETOOTHED CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif -#endif #ifdef ENABLE_ILUVATAR_API -#ifdef ENABLE_NINETOOTHED CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); #endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED @@ -147,14 +142,13 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) { DELETE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API -#ifdef ENABLE_NINETOOTHED DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif -#endif #ifdef ENABLE_ILUVATAR_API -#ifdef ENABLE_NINETOOTHED DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); #endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED diff --git a/src/infiniop/ops/rms_norm/operator.cc b/src/infiniop/ops/rms_norm/operator.cc index 625e5d945..9c18c2d97 100644 --- a/src/infiniop/ops/rms_norm/operator.cc +++ b/src/infiniop/ops/rms_norm/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/rms_norm_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) #include "nvidia/rms_norm_nvidia.cuh" #endif #ifdef ENABLE_ASCEND_API @@ -52,6 +52,9 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -94,6 +97,9 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API GET(INFINI_DEVICE_HYGON, nvidia); #endif @@ -137,6 +143,9 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -179,6 +188,9 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t #ifdef ENABLE_ILUVATAR_API DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API DESTROY(INFINI_DEVICE_HYGON, nvidia); #endif diff --git a/src/infiniop/ops/rope/operator.cc b/src/infiniop/ops/rope/operator.cc index feb0a1716..73e66691f 100644 --- a/src/infiniop/ops/rope/operator.cc +++ b/src/infiniop/ops/rope/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/rope_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) #include "nvidia/rope_nvidia.cuh" #endif #ifdef ENABLE_ASCEND_API @@ -56,6 +56,9 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -98,6 +101,9 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API GET(INFINI_DEVICE_HYGON, nvidia); #endif @@ -149,6 +155,9 @@ __C infiniStatus_t infiniopRoPE( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -192,6 +201,9 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API DELETE(INFINI_DEVICE_HYGON, nvidia); #endif diff --git a/src/infiniop/ops/softplus/operator.cc b/src/infiniop/ops/softplus/operator.cc index 2548f7d34..0511f5fcb 100644 --- a/src/infiniop/ops/softplus/operator.cc +++ b/src/infiniop/ops/softplus/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/softplus_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/softplus_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -37,6 +37,9 @@ __C infiniStatus_t infiniopCreateSoftplusDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -62,6 +65,9 @@ __C infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -95,6 +101,9 @@ __C infiniStatus_t infiniopSoftplus( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -122,6 +131,9 @@ infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/sub/operator.cc b/src/infiniop/ops/sub/operator.cc index ad1ba4b81..3e985da90 100644 --- a/src/infiniop/ops/sub/operator.cc +++ b/src/infiniop/ops/sub/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/sub_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/sub_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -73,6 +76,9 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -113,6 +119,9 @@ __C infiniStatus_t infiniopSub( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -146,6 +155,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/swiglu/operator.cc b/src/infiniop/ops/swiglu/operator.cc index 4d4c3ae78..f30f2d2a8 100644 --- a/src/infiniop/ops/swiglu/operator.cc +++ b/src/infiniop/ops/swiglu/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/swiglu_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) #include "nvidia/swiglu_nvidia.cuh" #endif #ifdef ENABLE_KUNLUN_API @@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CREATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -94,6 +97,9 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API GET(INFINI_DEVICE_HYGON, nvidia); #endif @@ -144,6 +150,9 @@ __C infiniStatus_t infiniopSwiGLU( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API CALCULATE(INFINI_DEVICE_HYGON, nvidia); #endif @@ -189,6 +198,9 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif #ifdef ENABLE_HYGON_API DELETE(INFINI_DEVICE_HYGON, nvidia); #endif diff --git a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu index e44872fcc..905253abd 100644 --- a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu +++ b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu @@ -1,4 +1,4 @@ -#ifdef ENABLE_NVIDIA_API +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) #include "../../../devices/nvidia/nvidia_common.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" diff --git a/src/infiniop/ops/topkrouter/operator.cc b/src/infiniop/ops/topkrouter/operator.cc index 4d43c77ce..7f93a8cf5 100644 --- a/src/infiniop/ops/topkrouter/operator.cc +++ b/src/infiniop/ops/topkrouter/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/topkrouter_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) #include "nvidia/topkrouter_nvidia.cuh" #endif @@ -28,6 +28,9 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor( #endif #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); #endif } @@ -49,6 +52,9 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript #endif #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); #endif } @@ -71,6 +77,9 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void #endif #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); #endif } @@ -92,6 +101,9 @@ __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescrip #endif #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia); #endif } diff --git a/src/infinirt/cuda/infinirt_cuda.cuh b/src/infinirt/cuda/infinirt_cuda.cuh index 99b4a30dc..fddd2d89f 100644 --- a/src/infinirt/cuda/infinirt_cuda.cuh +++ b/src/infinirt/cuda/infinirt_cuda.cuh @@ -3,7 +3,7 @@ #include "../infinirt_impl.h" namespace infinirt::cuda { -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_QY_API) INFINIRT_DEVICE_API_IMPL #else INFINIRT_DEVICE_API_NOOP diff --git a/src/infinirt/infinirt.cc b/src/infinirt/infinirt.cc index 119771475..068412955 100644 --- a/src/infinirt/infinirt.cc +++ b/src/infinirt/infinirt.cc @@ -23,7 +23,7 @@ __C infiniStatus_t infinirtGetAllDeviceCount(int *count_array) { return INFINI_STATUS_NULL_POINTER; } for (size_t i = 0; i < INFINI_DEVICE_TYPE_COUNT; i++) { - if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_KUNLUN || i == INFINI_DEVICE_HYGON) { + if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_QY || i == INFINI_DEVICE_KUNLUN || i == INFINI_DEVICE_HYGON) { count_array[i] = 0; continue; } @@ -80,6 +80,9 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_ case INFINI_DEVICE_HYGON: \ _status = infinirt::cuda::API PARAMS; \ break; \ + case INFINI_DEVICE_QY: \ + _status = infinirt::cuda::API PARAMS; \ + break; \ default: \ _status = INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ } \ diff --git a/test/infiniop/libinfiniop/devices.py b/test/infiniop/libinfiniop/devices.py index 6fcc0b32e..fc4e9e6c5 100644 --- a/test/infiniop/libinfiniop/devices.py +++ b/test/infiniop/libinfiniop/devices.py @@ -8,7 +8,7 @@ class InfiniDeviceEnum: ILUVATAR = 6 KUNLUN = 7 HYGON = 8 - + QY = 9 InfiniDeviceNames = { InfiniDeviceEnum.CPU: "CPU", @@ -20,6 +20,7 @@ class InfiniDeviceEnum: InfiniDeviceEnum.ILUVATAR: "Iluvatar", InfiniDeviceEnum.KUNLUN: "Kunlun", InfiniDeviceEnum.HYGON: "Hygon", + InfiniDeviceEnum.QY: "Qy", } # Mapping that maps InfiniDeviceEnum to torch device string @@ -33,4 +34,5 @@ class InfiniDeviceEnum: InfiniDeviceEnum.ILUVATAR: "cuda", InfiniDeviceEnum.KUNLUN: "cuda", InfiniDeviceEnum.HYGON: "cuda", + InfiniDeviceEnum.QY: "cuda", } diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 162b199fe..2b00f2f93 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -316,6 +316,11 @@ def get_args(): action="store_true", help="Run Iluvatar GPU test", ) + parser.add_argument( + "--qy", + action="store_true", + help="Run QY GPU test", + ) parser.add_argument( "--cambricon", action="store_true", @@ -630,6 +635,8 @@ def get_test_devices(args): devices_to_test.append(InfiniDeviceEnum.NVIDIA) if args.iluvatar: devices_to_test.append(InfiniDeviceEnum.ILUVATAR) + if args.qy: + devices_to_test.append(InfiniDeviceEnum.QY) if args.cambricon: import torch_mlu diff --git a/xmake.lua b/xmake.lua index 2985456be..898ad4042 100644 --- a/xmake.lua +++ b/xmake.lua @@ -100,6 +100,18 @@ if has_config("iluvatar-gpu") then includes("xmake/iluvatar.lua") end +-- QY +option("qy-gpu") + set_default(false) + set_showmenu(true) + set_description("Whether to compile implementations for Qy GPU") +option_end() + +if has_config("qy-gpu") then + add_defines("ENABLE_QY_API") + includes("xmake/qy.lua") +end + -- 沐曦 option("metax-gpu") set_default(false) @@ -201,6 +213,7 @@ target("infinirt") end if has_config("nv-gpu") then add_deps("infinirt-nvidia") + end if has_config("cambricon-mlu") then add_deps("infinirt-cambricon") @@ -217,6 +230,10 @@ target("infinirt") if has_config("iluvatar-gpu") then add_deps("infinirt-iluvatar") end + if has_config("qy-gpu") then + add_deps("infinirt-qy") + add_files("build/.objs/infinirt-qy/rules/qy.cuda/src/infinirt/cuda/*.cu.o", {public = true}) + end if has_config("kunlun-xpu") then add_deps("infinirt-kunlun") end @@ -242,6 +259,11 @@ target("infiniop") if has_config("iluvatar-gpu") then add_deps("infiniop-iluvatar") end + if has_config("qy-gpu") then + add_deps("infiniop-qy") + add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/nvidia/*.cu.o", {public = true}) + add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/nvidia/*.cu.o", {public = true}) + end if has_config("cambricon-mlu") then add_deps("infiniop-cambricon") @@ -292,6 +314,10 @@ target("infiniccl") if has_config("iluvatar-gpu") then add_deps("infiniccl-iluvatar") end + if has_config("qy-gpu") then + add_deps("infiniccl-qy") + add_files("build/.objs/infiniccl-qy/rules/qy.cuda/src/infiniccl/cuda/*.cu.o", {public = true}) + end if has_config("moore-gpu") then add_deps("infiniccl-moore") diff --git a/xmake/qy.lua b/xmake/qy.lua new file mode 100644 index 000000000..4ac5c29b9 --- /dev/null +++ b/xmake/qy.lua @@ -0,0 +1,154 @@ +local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH") +if CUDNN_ROOT ~= nil then + add_includedirs(CUDNN_ROOT .. "/include") +end +add_includedirs("/usr/local/denglin/sdk/include", "../include") +add_linkdirs("/usr/local/denglin/sdk/lib") +add_links("curt", "cublas", "cudnn") +set_languages("cxx17") +add_cxxflags("-std=c++17") -- 显式设置 C++17 +add_cuflags("--std=c++17",{force = true}) -- 确保 CUDA 编译器也使用 C++17 +rule("ignore.o") + set_extensions(".o") -- 防止 xmake 默认处理 + on_build_files(function () end) + +rule("qy.cuda") + set_extensions(".cu") + + -- 缓存所有 .o 文件路径 + local qy_objfiles = {} + + on_load(function (target) + target:add("includedirs", "/usr/local/denglin/sdk/include") + end) + + after_load(function (target) + -- 过滤 cudadevrt/cudart_static + local links = target:get("syslinks") or {} + local filtered = {} + for _, link in ipairs(links) do + if link ~= "cudadevrt" and link ~= "cudart_static" then + table.insert(filtered, link) + end + end + target:set("syslinks", filtered) + end) + + on_buildcmd_file(function (target, batchcmds, sourcefile, opt) + import("core.project.project") + import("core.project.config") + import("core.base.option") + + local dlcc = "/usr/local/denglin/sdk/bin/dlcc" + local sdk_path = "/usr/local/denglin/sdk" + local arch = "dlgput64" + + local relpath = path.relative(sourcefile, project.directory()) + local objfile = path.join(config.buildir(), ".objs", target:name(), "rules", "qy.cuda", relpath .. ".o") + + -- 🟢 强制注册 .o 文件给 target + target:add("objectfiles", objfile) + target:set("buildadd", true) + local argv = { + "-c", sourcefile, + "-o", objfile, + "--cuda-path=" .. sdk_path, + "--cuda-gpu-arch=" .. arch, + "-std=c++17", "-O2", "-fPIC" + } + + for _, incdir in ipairs(target:get("includedirs") or {}) do + table.insert(argv, "-I" .. incdir) + end + for _, def in ipairs(target:get("defines") or {}) do + table.insert(argv, "-D" .. def) + end + + batchcmds:mkdir(path.directory(objfile)) + batchcmds:show_progress(opt.progress, "${color.build.object}compiling.dlcu %s", relpath) + batchcmds:vrunv(dlcc, argv) + end) +target("infiniop-qy") + set_kind("static") + add_deps("infini-utils") + on_install(function (target) end) + + add_rules("qy.cuda", {override = true}) + + if is_plat("windows") then + add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") + add_cuflags("-Xcompiler=/W3", "-Xcompiler=/WX") + add_cxxflags("/FS") + if CUDNN_ROOT ~= nil then + add_linkdirs(CUDNN_ROOT .. "\\lib\\x64") + end + else + add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror") + add_cuflags("-Xcompiler=-fPIC") + add_cuflags("--extended-lambda") + add_culdflags("-Xcompiler=-fPIC") + add_cxxflags("-fPIC") + add_cuflags("--expt-relaxed-constexpr") + if CUDNN_ROOT ~= nil then + add_linkdirs(CUDNN_ROOT .. "/lib") + end + end + + add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations") + + set_languages("cxx17") + add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu") + + if has_config("ninetoothed") then + add_files("../build/ninetoothed/*.c") + end +target_end() + +target("infinirt-qy") + set_kind("static") + add_deps("infini-utils") + on_install(function (target) end) + + add_rules("qy.cuda", {override = true}) + + if is_plat("windows") then + add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") + add_cxxflags("/FS") + else + add_cuflags("-Xcompiler=-fPIC") + add_culdflags("-Xcompiler=-fPIC") + add_cxflags("-fPIC") + end + + set_languages("cxx17") + add_files("../src/infinirt/cuda/*.cu") +target_end() + +target("infiniccl-qy") + set_kind("static") + add_deps("infinirt") + on_install(function (target) end) + if has_config("ccl") then + add_rules("qy.cuda", {override = true}) + + if not is_plat("windows") then + add_cuflags("-Xcompiler=-fPIC") + add_culdflags("-Xcompiler=-fPIC") + add_cxflags("-fPIC") + + local nccl_root = os.getenv("NCCL_ROOT") + if nccl_root then + add_includedirs(nccl_root .. "/include") + add_links(nccl_root .. "/lib/libnccl.so") + else + add_links("nccl") -- Fall back to default nccl linking + end + + add_files("../src/infiniccl/cuda/*.cu") + else + print("[Warning] NCCL is not supported on Windows") + end + end + set_languages("cxx17") + +target_end() From 5b508d2c096ba8c4a8129de4cdcb902ff1d94d39 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 23 Dec 2025 16:47:19 +0800 Subject: [PATCH 2/2] issue/808: support page_attention head_size=64 --- .../nvidia/paged_attention_nvidia.cu | 24 +++++++++++++++++++ test/infiniop/paged_attention.py | 5 ++++ 2 files changed, 29 insertions(+) diff --git a/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu b/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu index d9e612c8e..de4327653 100644 --- a/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu +++ b/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu @@ -104,6 +104,14 @@ infiniStatus_t Descriptor::calculate( _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, stream); + } else if (_info.head_size == 64) { + launchKernel<64, CUDA_BLOCK_SIZE_1024>( + out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, + _info.num_heads, _info.num_seqs, + _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, + _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, + stream); + } else { printf("head_size: %zu\n", _info.head_size); return INFINI_STATUS_BAD_TENSOR_SHAPE; @@ -117,6 +125,14 @@ infiniStatus_t Descriptor::calculate( _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, stream); + } else if (_info.head_size == 64) { + launchKernel<64, CUDA_BLOCK_SIZE_512>( + out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, + _info.num_heads, _info.num_seqs, + _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, + _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, + stream); + } else { printf("head_size: %zu\n", _info.head_size); return INFINI_STATUS_BAD_TENSOR_SHAPE; @@ -129,6 +145,14 @@ infiniStatus_t Descriptor::calculate( _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, stream); + } else if (_info.head_size == 64) { + launchKernel<64, CUDA_BLOCK_SIZE_4096>( + out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, + _info.num_heads, _info.num_seqs, + _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, + _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, + stream); + } else { printf("head_size: %zu", _info.head_size); return INFINI_STATUS_BAD_TENSOR_SHAPE; diff --git a/test/infiniop/paged_attention.py b/test/infiniop/paged_attention.py index 4216ef8ab..a86404965 100644 --- a/test/infiniop/paged_attention.py +++ b/test/infiniop/paged_attention.py @@ -97,6 +97,11 @@ def ref_single_query_cached_kv_attention( (1, 40, 40, 128, 16, 1024, False), (1, 8, 8, 128, 16, 1024, False), (1, 64, 8, 128, 16, 2048, False), + (1, 1, 1, 64, 16, 1024, False), + (1, 40, 40, 64, 16, 1024, False), + (1, 40, 40, 64, 16, 1024, False), + (1, 8, 8, 64, 16, 1024, False), + (1, 64, 8, 64, 16, 2048, False), ] # Data types for testing