diff --git a/include/infinicore/ops/acos.hpp b/include/infinicore/ops/acos.hpp
new file mode 100644
index 000000000..349b2a87e
--- /dev/null
+++ b/include/infinicore/ops/acos.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Acos {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor acos(Tensor input);
+void acos_(Tensor output, Tensor input);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/adaptive_avg_pool1d.hpp b/include/infinicore/ops/adaptive_avg_pool1d.hpp
new file mode 100644
index 000000000..2b94b36db
--- /dev/null
+++ b/include/infinicore/ops/adaptive_avg_pool1d.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class AdaptiveAvgPool1d {
+public:
+    // Schema: execute(output, input)
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor adaptive_avg_pool1d(Tensor input, int64_t output_size);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/addbmm.hpp b/include/infinicore/ops/addbmm.hpp
new file mode 100644
index 000000000..eacbb373a
--- /dev/null
+++ b/include/infinicore/ops/addbmm.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Addbmm {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float, float);
+    static void execute(Tensor output, Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha);
+
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor addbmm(Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f);
+
+void addbmm_(Tensor output, Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/affine_grid.hpp b/include/infinicore/ops/affine_grid.hpp
new file mode 100644
index 000000000..2fa4c648b
--- /dev/null
+++ b/include/infinicore/ops/affine_grid.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <vector>
+
+namespace infinicore::op {
+
+class AffineGrid {
+public:
+    using schema = void (*)(Tensor, Tensor, bool);
+    static void execute(Tensor output, Tensor theta, bool align_corners);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor affine_grid(Tensor theta, const std::vector<int64_t> &size, bool align_corners = false);
+
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/floor.hpp b/include/infinicore/ops/floor.hpp
new file mode 100644
index 000000000..2eb829ba1
--- /dev/null
+++ b/include/infinicore/ops/floor.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Floor {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor floor(Tensor input);
+void floor_(Tensor output, Tensor input);
+} // namespace infinicore::op
diff --git a/include/infiniop.h b/include/infiniop.h
index 92e6f5963..a9f9520eb 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -2,7 +2,11 @@
 #define __INFINIOP_API_H__
 
 #include "infiniop/handle.h"
+#include "infiniop/ops/acos.h"
+#include "infiniop/ops/adaptive_avg_pool1d.h"
 #include "infiniop/ops/add.h"
+#include "infiniop/ops/addbmm.h"
+#include "infiniop/ops/affine_grid.h"
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h
new file mode 100644
index 000000000..f3ff8532c
--- /dev/null
+++ b/include/infiniop/ops/acos.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ACOS_API_H__
+#define __INFINIOP_ACOS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle,
+                                                         infiniopAcosDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t y,
+                                                         infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *y,
+                                         const void *x,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc);
+
+#endif // __INFINIOP_ACOS_API_H__
diff --git a/include/infiniop/ops/adaptive_avg_pool1d.h b/include/infiniop/ops/adaptive_avg_pool1d.h
new file mode 100644
index 000000000..6c83945a7
--- /dev/null
+++ b/include/infiniop/ops/adaptive_avg_pool1d.h
@@ -0,0 +1,34 @@
+#ifndef __INFINIOP_ADAPTIVE_AVG_POOL1D_API_H__
+#define __INFINIOP_ADAPTIVE_AVG_POOL1D_API_H__
+
+#include "../operator_descriptor.h"
+
+// Operator descriptor type
+typedef struct InfiniopDescriptor *infiniopAdaptiveAvgPool1dDescriptor_t;
+
+// 1. Create the operator descriptor
+__C __export infiniStatus_t infiniopCreateAdaptiveAvgPool1dDescriptor(
+    infiniopHandle_t handle,
+    infiniopAdaptiveAvgPool1dDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc);
+
+// 2. Query the workspace size
+__C __export infiniStatus_t infiniopGetAdaptiveAvgPool1dWorkspaceSize(
+    infiniopAdaptiveAvgPool1dDescriptor_t desc,
+    size_t *size);
+
+// 3. Run the computation
+__C __export infiniStatus_t infiniopAdaptiveAvgPool1d(
+    infiniopAdaptiveAvgPool1dDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream);
+
+// 4. Destroy the descriptor
+__C __export infiniStatus_t infiniopDestroyAdaptiveAvgPool1dDescriptor(
+    infiniopAdaptiveAvgPool1dDescriptor_t desc);
+
+#endif // __INFINIOP_ADAPTIVE_AVG_POOL1D_API_H__
diff --git a/include/infiniop/ops/addbmm.h b/include/infiniop/ops/addbmm.h
new file mode 100644
index 000000000..5286d7a4d
--- /dev/null
+++ b/include/infiniop/ops/addbmm.h
@@ -0,0 +1,30 @@
+#ifndef __INFINIOP_ADDBMM_API_H__
+#define __INFINIOP_ADDBMM_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAddbmmDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAddbmmDescriptor(infiniopHandle_t handle,
+                                                           infiniopAddbmmDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t out_desc,
+                                                           infiniopTensorDescriptor_t input_desc,
+                                                           infiniopTensorDescriptor_t batch1_desc,
+                                                           infiniopTensorDescriptor_t batch2_desc,
+                                                           float alpha,
+                                                           float beta);
+
+__C __export infiniStatus_t infiniopGetAddbmmWorkspaceSize(infiniopAddbmmDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAddbmm(infiniopAddbmmDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *output,
+                                           const void *input,
+                                           const void *batch1,
+                                           const void *batch2,
+                                           void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAddbmmDescriptor(infiniopAddbmmDescriptor_t desc);
+
+#endif // __INFINIOP_ADDBMM_API_H__
diff --git a/include/infiniop/ops/affine_grid.h b/include/infiniop/ops/affine_grid.h
new file mode 100644
index 000000000..84fbb673d
--- /dev/null
+++ b/include/infiniop/ops/affine_grid.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_AFFINE_GRID_API_H__
+#define __INFINIOP_AFFINE_GRID_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAffineGridDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAffineGridDescriptor(infiniopHandle_t handle,
+                                                               infiniopAffineGridDescriptor_t *desc_ptr,
+                                                               infiniopTensorDescriptor_t output_desc,
+                                                               infiniopTensorDescriptor_t input_desc,
+                                                               uint8_t align_corners);
+
+__C __export infiniStatus_t infiniopGetAffineGridWorkspaceSize(infiniopAffineGridDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAffineGrid(infiniopAffineGridDescriptor_t desc,
+                                               void *workspace,
+                                               size_t workspace_size,
+                                               void *output,
+                                               const void *input,
+                                               void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAffineGridDescriptor(infiniopAffineGridDescriptor_t desc);
+
+#endif // __INFINIOP_AFFINE_GRID_API_H__
diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h
new file mode 100644
index 000000000..7b76e0cfc
--- /dev/null
+++ b/include/infiniop/ops/floor.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_FLOOR_API_H__
+#define __INFINIOP_FLOOR_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle,
+                                                          infiniopFloorDescriptor_t *desc_ptr,
+                                                          infiniopTensorDescriptor_t output,
+                                                          infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc,
+                                          void *workspace,
+                                          size_t workspace_size,
+                                          void *output,
+                                          const void *input,
+                                          void *stream);
+
+__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc);
+
+#endif
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 5c541ec3c..b1acb8c7b 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -40,6 +40,9 @@
     uint8,
 )
+from infinicore.ops.acos import acos
 from infinicore.ops.add import add
+from infinicore.ops.addbmm import addbmm
+from infinicore.ops.floor import floor
 from infinicore.ops.attention import attention
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
@@ -100,6 +103,9 @@
     "uint8",
     # Operations.
+    "acos",
     "add",
+    "addbmm",
+    "floor",
     "attention",
     "matmul",
     "mul",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..a15b2a028 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,10 +6,14 @@
+from .adaptive_avg_pool1d import adaptive_avg_pool1d
+from .affine_grid import affine_grid
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
 
 __all__ = [
+    "adaptive_avg_pool1d",
+    "affine_grid",
     "causal_softmax",
     "random_sample",
     "rms_norm",
     "silu",
     "swiglu",
diff --git a/python/infinicore/nn/functional/adaptive_avg_pool1d.py b/python/infinicore/nn/functional/adaptive_avg_pool1d.py
new file mode 100644
index 000000000..b0e70d034
--- /dev/null
+++ b/python/infinicore/nn/functional/adaptive_avg_pool1d.py
@@ -0,0 +1,7 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def adaptive_avg_pool1d(input: Tensor, output_size: int) -> Tensor:
+    r"""Apply 1D adaptive average pooling over the input."""
+    return Tensor(_infinicore.adaptive_avg_pool1d(input._underlying, output_size))
diff --git a/python/infinicore/nn/functional/affine_grid.py b/python/infinicore/nn/functional/affine_grid.py
new file mode 100644
index 000000000..9ce5323bd
--- /dev/null
+++ b/python/infinicore/nn/functional/affine_grid.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def affine_grid(theta: Tensor, size: list[int], align_corners: bool = False) -> Tensor:
+    r"""Generate a 2D flow field (sampling grid), given a batch of affine matrices theta."""
+
+    # Call the underlying binding directly:
+    # theta._underlying passes the underlying C++ tensor object;
+    # size (a Python list[int]) is converted automatically to a C++ std::vector.
+    return Tensor(_infinicore.affine_grid(theta._underlying, size, align_corners))
diff --git a/python/infinicore/ops/acos.py b/python/infinicore/ops/acos.py
new file mode 100644
index 000000000..33c57cbaf
--- /dev/null
+++ b/python/infinicore/ops/acos.py
@@ -0,0 +1,9 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def acos(input, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.acos(input._underlying))
+
+    _infinicore.acos_(out._underlying, input._underlying)
+    return out
diff --git a/python/infinicore/ops/addbmm.py b/python/infinicore/ops/addbmm.py
new file mode 100644
index 000000000..858b8f37e
--- /dev/null
+++ b/python/infinicore/ops/addbmm.py
@@ -0,0 +1,25 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def addbmm(input, batch1, batch2, *, beta=1.0, alpha=1.0, out=None):
+    # 1. Out-of-place mode (no out tensor given).
+    if out is None:
+        return Tensor(
+            _infinicore.addbmm(
+                input._underlying,
+                batch1._underlying,
+                batch2._underlying,
+                beta,
+                alpha,
+            )
+        )
+
+    # 2. In-place mode (out tensor given).
+    _infinicore.addbmm_(
+        out._underlying,
+        input._underlying,
+        batch1._underlying,
+        batch2._underlying,
+        beta,
+        alpha,
+    )
+
+    return out
diff --git a/python/infinicore/ops/floor.py b/python/infinicore/ops/floor.py
new file mode 100644
index 000000000..c797c2112
--- /dev/null
+++ b/python/infinicore/ops/floor.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def floor(input, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.floor(input._underlying))
+
+    _infinicore.floor_(out._underlying, input._underlying)
+
+    return out
diff --git a/src/infinicore/ops/acos/acos.cc b/src/infinicore/ops/acos/acos.cc
new file mode 100644
index 000000000..57c67e4d8
--- /dev/null
+++ b/src/infinicore/ops/acos/acos.cc
@@ -0,0 +1,24 @@
+#include "infinicore/ops/acos.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Acos::schema> &Acos::dispatcher() {
+    static common::OpDispatcher<Acos::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Acos::execute(Tensor output, Tensor input) {
+    dispatcher().lookup(context::getDevice().getType())(output, input);
+}
+
+Tensor acos(Tensor input) {
+    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    acos_(output, input);
+    return output;
+}
+
+void acos_(Tensor output, Tensor input) {
+    Acos::execute(output, input);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/acos/acos_infiniop.cc b/src/infinicore/ops/acos/acos_infiniop.cc
new file mode 100644
index 000000000..f59218e27
--- /dev/null
+++ b/src/infinicore/ops/acos/acos_infiniop.cc
@@ -0,0 +1,89 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/acos.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <memory>
+#include <unordered_map>
+
+namespace infinicore::op::acos_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopAcosDescriptor_t> caches(
+    100, // capacity
+    [](infiniopAcosDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyAcosDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+struct WorkspaceEntry {
+    size_t size = 0;
+    std::shared_ptr<Memory> buf = nullptr;
+};
+
+void calculate(Tensor output, Tensor input) {
+    size_t seed = hash_combine(output, input);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    // Fetch the descriptor from the cache, or create it on a miss.
+    auto desc_opt = cache.get(seed);
+    infiniopAcosDescriptor_t desc = nullptr;
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateAcosDescriptor(
+            context::getInfiniopHandle(output->device()), &desc,
+            output->desc(), input->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    static thread_local std::unordered_map<infiniopAcosDescriptor_t, WorkspaceEntry> s_workspace_map;
+    auto it = s_workspace_map.find(desc);
+
+    if (it == s_workspace_map.end()) {
+        size_t workspace_size = 0;
+        INFINICORE_CHECK_ERROR(infiniopGetAcosWorkspaceSize(desc, &workspace_size));
+
+        WorkspaceEntry entry;
+        if (workspace_size > 0) {
+            entry.buf = context::allocateMemory(workspace_size);
+            entry.size = workspace_size;
+        }
+        it = s_workspace_map.emplace(desc, std::move(entry)).first;
+    } else {
+        // Grow the cached workspace if the descriptor now needs more.
+        size_t required_size = 0;
+        INFINICORE_CHECK_ERROR(infiniopGetAcosWorkspaceSize(desc, &required_size));
+        if (required_size > it->second.size) {
+            it->second.buf = context::allocateMemory(required_size);
+            it->second.size = required_size;
+        }
+    }
+
+    void *workspace_ptr = it->second.buf ? it->second.buf->data() : nullptr;
+    size_t workspace_size = it->second.size;
+    INFINICORE_CHECK_ERROR(infiniopAcos(
+        desc,
+        workspace_ptr,
+        workspace_size,
+        output->data(),
+        input->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    Acos::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::acos_impl::infiniop
diff --git a/src/infinicore/ops/adaptive_avg_pool1d/adaptive_avg_pool1d.cc b/src/infinicore/ops/adaptive_avg_pool1d/adaptive_avg_pool1d.cc
new file mode 100644
index 000000000..06d267131
--- /dev/null
+++ b/src/infinicore/ops/adaptive_avg_pool1d/adaptive_avg_pool1d.cc
@@ -0,0 +1,43 @@
+#include "infinicore/ops/adaptive_avg_pool1d.hpp"
+#include <stdexcept>
+#include <string>
+
+namespace infinicore::op {
+
+common::OpDispatcher<AdaptiveAvgPool1d::schema> &AdaptiveAvgPool1d::dispatcher() {
+    static common::OpDispatcher<AdaptiveAvgPool1d::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void AdaptiveAvgPool1d::execute(Tensor output, Tensor input) {
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No AdaptiveAvgPool1d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input);
+}
+
+Tensor adaptive_avg_pool1d(Tensor input, int64_t output_size) {
+    size_t ndim = input->ndim();
+    if (ndim != 2 && ndim != 3) {
+        throw std::runtime_error("AdaptiveAvgPool1d: input tensor must be 2D or 3D.");
+    }
+    if (output_size <= 0) {
+        throw std::runtime_error("AdaptiveAvgPool1d: output_size must be positive.");
+    }
+
+    auto out_shape = input->shape();
+    out_shape[ndim - 1] = static_cast<size_t>(output_size);
+
+    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
+
+    AdaptiveAvgPool1d::execute(output, input);
+
+    return output;
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/adaptive_avg_pool1d/adaptive_avg_pool1d_infiniop.cc b/src/infinicore/ops/adaptive_avg_pool1d/adaptive_avg_pool1d_infiniop.cc
new file mode 100644
index 000000000..af1946ce8
--- /dev/null
+++ b/src/infinicore/ops/adaptive_avg_pool1d/adaptive_avg_pool1d_infiniop.cc
@@ -0,0 +1,96 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/adaptive_avg_pool1d.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <cstddef>
+#include <memory>
+
+namespace infinicore::op::adaptive_avg_pool1d_impl::infiniop {
+
+// 1. Resource context: a descriptor plus its cached workspace.
+struct AdaptiveAvgPool1dContext {
+    infiniopAdaptiveAvgPool1dDescriptor_t desc = nullptr;
+    std::shared_ptr<Memory> workspace_buf = nullptr;
+    size_t workspace_size = 0;
+
+    void *getWorkspacePtr() const {
+        return workspace_buf ? workspace_buf->data() : nullptr;
+    }
+};
+
+// 2. Cache definition.
+thread_local common::OpCache<size_t, AdaptiveAvgPool1dContext> caches(
+    256,
+    [](AdaptiveAvgPool1dContext &ctx) {
+        if (ctx.desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyAdaptiveAvgPool1dDescriptor(ctx.desc));
+            ctx.desc = nullptr;
+        }
+        ctx.workspace_buf = nullptr;
+    });
+
+// 3. Core compute routine.
+void calculate(Tensor output, Tensor input) {
+    size_t seed = reinterpret_cast<size_t>(input.operator->());
+    if (output->ndim() >= 3) {
+        seed ^= (output->shape()[2] << 1);
+    }
+
+    // Fast path: reuse the previous context when the key matches.
+    static thread_local size_t last_seed = 0;
+    static thread_local bool last_ctx_valid = false;
+    static thread_local AdaptiveAvgPool1dContext last_ctx;
+
+    AdaptiveAvgPool1dContext *active_ctx = nullptr;
+
+    if (last_ctx_valid && seed == last_seed) {
+        active_ctx = &last_ctx;
+    } else {
+        auto device_type = context::getDevice().getType();
+        auto device_index = context::getDevice().getIndex();
+        auto &cache = caches.getCache(device_type, device_index);
+
+        auto opt_ctx = cache.get(seed);
+        if (opt_ctx) {
+            last_ctx = *opt_ctx;
+        } else {
+            AdaptiveAvgPool1dContext new_ctx;
+
+            INFINICORE_CHECK_ERROR(infiniopCreateAdaptiveAvgPool1dDescriptor(
+                context::getInfiniopHandle(output->device()),
+                &new_ctx.desc,
+                output->desc(),
+                input->desc()));
+
+            INFINICORE_CHECK_ERROR(infiniopGetAdaptiveAvgPool1dWorkspaceSize(new_ctx.desc, &new_ctx.workspace_size));
+
+            if (new_ctx.workspace_size > 0) {
+                new_ctx.workspace_buf = context::allocateMemory(new_ctx.workspace_size);
+            }
+
+            cache.put(seed, new_ctx);
+            last_ctx = new_ctx;
+        }
+
+        last_seed = seed;
+        last_ctx_valid = true;
+        active_ctx = &last_ctx;
+    }
+
+    INFINICORE_CHECK_ERROR(infiniopAdaptiveAvgPool1d(
+        active_ctx->desc,
+        active_ctx->getWorkspacePtr(),
+        active_ctx->workspace_size,
+        output->data(),
+        input->data(),
+        context::getStream()));
+}
+
+// Registration.
+static bool registered = []() {
+    AdaptiveAvgPool1d::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::adaptive_avg_pool1d_impl::infiniop
diff --git a/src/infinicore/ops/addbmm/addbmm.cc b/src/infinicore/ops/addbmm/addbmm.cc
new file mode 100644
index 000000000..e129c3871
--- /dev/null
+++ b/src/infinicore/ops/addbmm/addbmm.cc
@@ -0,0 +1,33 @@
+#include "../../utils.hpp"
+#include "infinicore/ops/addbmm.hpp"
+
+namespace infinicore::op {
+
+// 1. Initialize the dispatcher.
+common::OpDispatcher<Addbmm::schema> &Addbmm::dispatcher() {
+    static common::OpDispatcher<Addbmm::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Addbmm::execute(Tensor output, Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha) {
+    // Make sure the current context targets the output tensor's device.
+    infinicore::context::setDevice(output->device());
+
+    // Dispatch to the registered backend implementation.
+    dispatcher().lookup(output->device().getType())(output, input, batch1, batch2, beta, alpha);
+}
+
+Tensor addbmm(Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha) {
+    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    Addbmm::execute(output, input, batch1, batch2, beta, alpha);
+    return output;
+}
+
+void addbmm_(Tensor output, Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha) {
+    Addbmm::execute(output, input, batch1, batch2, beta, alpha);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/addbmm/addbmm_infiniop.cc b/src/infinicore/ops/addbmm/addbmm_infiniop.cc
new file mode 100644
index 000000000..0d0fe55e2
--- /dev/null
+++ b/src/infinicore/ops/addbmm/addbmm_infiniop.cc
@@ -0,0 +1,107 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/addbmm.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <cstddef>
+#include <memory>
+
+namespace infinicore::op::addbmm_impl::infiniop {
+
+struct AddbmmContext {
+    infiniopAddbmmDescriptor_t desc = nullptr;
+    std::shared_ptr<Memory> workspace_buf = nullptr;
+    size_t workspace_size = 0;
+
+    void *getWorkspacePtr() const {
+        return workspace_buf ? workspace_buf->data() : nullptr;
+    }
+};
+
+thread_local common::OpCache<size_t, AddbmmContext> caches(
+    256,
+    [](AddbmmContext &ctx) {
+        if (ctx.desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyAddbmmDescriptor(ctx.desc));
+            ctx.desc = nullptr;
+        }
+        ctx.workspace_buf = nullptr;
+    });
+
+inline size_t compute_key(const Tensor &output, const Tensor &input,
+                          const Tensor &batch1, const Tensor &batch2,
+                          float beta, float alpha) {
+    size_t seed = 0;
+    infinicore::hash_combine(seed, reinterpret_cast<size_t>(output.operator->()));
+    infinicore::hash_combine(seed, reinterpret_cast<size_t>(input.operator->()));
+    infinicore::hash_combine(seed, reinterpret_cast<size_t>(batch1.operator->()));
+    infinicore::hash_combine(seed, reinterpret_cast<size_t>(batch2.operator->()));
+    infinicore::hash_combine(seed, beta);
+    infinicore::hash_combine(seed, alpha);
+    return seed;
+}
+
+void calculate(Tensor output, Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha) {
+    size_t seed = compute_key(output, input, batch1, batch2, beta, alpha);
+
+    // Fast path: reuse the previous context when the key matches.
+    static thread_local size_t last_seed = 0;
+    static thread_local bool last_ctx_valid = false;
+    static thread_local AddbmmContext last_ctx;
+
+    AddbmmContext *ctx_ptr = nullptr;
+
+    if (last_ctx_valid && seed == last_seed) {
+        ctx_ptr = &last_ctx;
+    } else {
+        auto device_type = context::getDevice().getType();
+        auto device_index = context::getDevice().getIndex();
+        auto &cache = caches.getCache(device_type, device_index);
+
+        auto opt_ctx = cache.get(seed);
+        if (opt_ctx) {
+            last_ctx = *opt_ctx;
+        } else {
+            AddbmmContext new_ctx;
+
+            INFINICORE_CHECK_ERROR(infiniopCreateAddbmmDescriptor(
+                context::getInfiniopHandle(output->device()),
+                &new_ctx.desc,
+                output->desc(),
+                input->desc(),
+                batch1->desc(),
+                batch2->desc(),
+                alpha,
+                beta));
+
+            INFINICORE_CHECK_ERROR(infiniopGetAddbmmWorkspaceSize(new_ctx.desc, &new_ctx.workspace_size));
+
+            if (new_ctx.workspace_size > 0) {
+                new_ctx.workspace_buf = context::allocateMemory(new_ctx.workspace_size);
+            }
+
+            cache.put(seed, new_ctx);
+            last_ctx = new_ctx;
+        }
+
+        last_seed = seed;
+        last_ctx_valid = true;
+        ctx_ptr = &last_ctx;
+    }
+
+    INFINICORE_CHECK_ERROR(infiniopAddbmm(
+        ctx_ptr->desc,
+        ctx_ptr->getWorkspacePtr(),
+        ctx_ptr->workspace_size,
+        output->data(),
+        input->data(),
+        batch1->data(),
+        batch2->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    Addbmm::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::addbmm_impl::infiniop
diff --git a/src/infinicore/ops/affine_grid/affine_grid.cc b/src/infinicore/ops/affine_grid/affine_grid.cc
new file mode 100644
index 000000000..89365ca54
--- /dev/null
+++ b/src/infinicore/ops/affine_grid/affine_grid.cc
@@ -0,0 +1,58 @@
+#include "infinicore/ops/affine_grid.hpp"
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace infinicore::op {
+
+common::OpDispatcher<AffineGrid::schema> &AffineGrid::dispatcher() {
+    static common::OpDispatcher<AffineGrid::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void AffineGrid::execute(Tensor output, Tensor theta, bool align_corners) {
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No AffineGrid implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, theta, align_corners);
+}
+
+Tensor affine_grid(Tensor theta, const std::vector<int64_t> &size, bool align_corners) {
+    if (theta->ndim() != 3) {
+        throw std::runtime_error("AffineGrid: theta tensor must be 3D (N, 2, 3).");
+    }
+    if (theta->shape()[1] != 2 || theta->shape()[2] != 3) {
+        throw std::runtime_error("AffineGrid: theta tensor shape must be (N, 2, 3).");
+    }
+    if (size.size() != 4) {
+        throw std::runtime_error("AffineGrid: target size length must be 4 (N, C, H, W).");
+    }
+    if (static_cast<int64_t>(theta->shape()[0]) != size[0]) {
+        throw std::runtime_error("AffineGrid: theta batch size does not match target size batch.");
+    }
+
+    if (!theta->is_contiguous()) {
+        theta = theta->contiguous();
+    }
+
+    std::vector<size_t> out_shape;
+    out_shape.reserve(4);
+    out_shape.push_back(static_cast<size_t>(size[0]));
+    out_shape.push_back(static_cast<size_t>(size[2]));
+    out_shape.push_back(static_cast<size_t>(size[3]));
+    out_shape.push_back(2);
+
+    auto output = Tensor::empty(out_shape, theta->dtype(), theta->device());
+
+    AffineGrid::execute(output, theta, align_corners);
+
+    return output;
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/affine_grid/affine_grid_infiniop.cc b/src/infinicore/ops/affine_grid/affine_grid_infiniop.cc
new file mode 100644
index 000000000..9c34e1df4
--- /dev/null
+++ b/src/infinicore/ops/affine_grid/affine_grid_infiniop.cc
@@ -0,0 +1,65 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/affine_grid.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <memory>
+
+namespace infinicore::op::affine_grid_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopAffineGridDescriptor_t> caches(
+    100, // capacity
+    [](infiniopAffineGridDescriptor_t &desc) {
+        if (desc != nullptr) {
+            // Destroy the descriptor.
+            INFINICORE_CHECK_ERROR(infiniopDestroyAffineGridDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor theta, bool align_corners) {
+    size_t seed = hash_combine(output, theta, align_corners);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    // Cache for the current device.
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopAffineGridDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateAffineGridDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            theta->desc(),
+            align_corners));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAffineGridWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopAffineGrid(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        theta->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    AffineGrid::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::affine_grid_impl::infiniop
diff --git a/src/infinicore/ops/floor/floor.cc b/src/infinicore/ops/floor/floor.cc
new file mode 100644
index 000000000..846aa4da8
--- /dev/null
+++ b/src/infinicore/ops/floor/floor.cc
@@ -0,0 +1,22 @@
+#include "infinicore/ops/floor.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Floor::schema> &Floor::dispatcher() {
+    static common::OpDispatcher<Floor::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Floor::execute(Tensor output, Tensor input) {
+    dispatcher().lookup(context::getDevice().getType())(output, input);
+}
+
+Tensor floor(Tensor input) {
+    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
+    floor_(output, input);
+    return output;
+}
+
+void floor_(Tensor output, Tensor input) {
+    Floor::execute(output, input);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/floor/floor_infiniop.cc b/src/infinicore/ops/floor/floor_infiniop.cc
new file mode 100644
index 000000000..55c0dd43a
--- /dev/null
+++ b/src/infinicore/ops/floor/floor_infiniop.cc
@@ -0,0 +1,54 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/floor.hpp"
+#include <memory>
+
+namespace infinicore::op::floor_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopFloorDescriptor_t> caches(
+    100, // capacity
+    [](infiniopFloorDescriptor_t &desc) {
+        if (desc != nullptr) {
+            // Destroy the Floor descriptor.
+            INFINICORE_CHECK_ERROR(infiniopDestroyFloorDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+// Compute implementation.
+void calculate(Tensor output, Tensor input) {
+    size_t seed = hash_combine(output, input);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopFloorDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateFloorDescriptor(
+            context::getInfiniopHandle(output->device()), &desc,
+            output->desc(), input->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetFloorWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopFloor(
+        desc,
+        workspace->data(), workspace_size,
+        output->data(), input->data(), // output first, then input
+        context::getStream()));
+}
+
+static bool registered = []() {
+    Floor::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::floor_impl::infiniop
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 978defa17..383fcbd48 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -3,7 +3,12 @@
 #include <pybind11/pybind11.h>
 
+#include "ops/acos.hpp"
+#include "ops/adaptive_avg_pool1d.hpp"
 #include "ops/add.hpp"
+#include "ops/addbmm.hpp"
+#include "ops/affine_grid.hpp"
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
+#include "ops/floor.hpp"
@@ -15,12 +20,17 @@
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
 
 namespace py = pybind11;
 
 namespace infinicore::ops {
 
 inline void bind(py::module &m) {
     bind_add(m);
+    bind_acos(m);
+    bind_adaptive_avg_pool1d(m);
+    bind_addbmm(m);
+    bind_affine_grid(m);
+    bind_floor(m);
     bind_attention(m);
     bind_causal_softmax(m);
     bind_random_sample(m);
diff --git a/src/infinicore/pybind11/ops/acos.hpp b/src/infinicore/pybind11/ops/acos.hpp
new file mode 100644
index 000000000..c06a8b21a
--- /dev/null
+++ b/src/infinicore/pybind11/ops/acos.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/acos.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_acos(py::module &m) {
+    // Out-of-place binding: output = acos(input)
+    m.def("acos",
+          &op::acos,
+          py::arg("input"),
+          R"doc(Computes the inverse cosine (arccosine) of each element of input.
+
+Returns a new tensor with the arccosine of the elements of input.
+The range of the result is [0, pi].)doc");
+
+    // In-place binding: acos_(output, input)
+    m.def("acos_",
+          &op::acos_,
+          py::arg("output"),
+          py::arg("input"),
+          R"doc(In-place acos operation. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/adaptive_avg_pool1d.hpp b/src/infinicore/pybind11/ops/adaptive_avg_pool1d.hpp
new file mode 100644
index 000000000..ebc79de00
--- /dev/null
+++ b/src/infinicore/pybind11/ops/adaptive_avg_pool1d.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/adaptive_avg_pool1d.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_adaptive_avg_pool1d(py::module &m) {
+    // Function binding: output = adaptive_avg_pool1d(input, output_size)
+    m.def("adaptive_avg_pool1d",
+          &op::adaptive_avg_pool1d,
+          py::arg("input"),
+          py::arg("output_size"),
+          R"doc(Applies a 1D adaptive average pooling over an input signal composed of several input planes.
+
+Args:
+    input (Tensor): Input tensor of shape (C, L) or (N, C, L).
+    output_size (int): The target output size.
+
+Returns:
+    Tensor: Output tensor of shape (C, output_size) or (N, C, output_size).
+)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/addbmm.hpp b/src/infinicore/pybind11/ops/addbmm.hpp
new file mode 100644
index 000000000..6c2417afd
--- /dev/null
+++ b/src/infinicore/pybind11/ops/addbmm.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/addbmm.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_addbmm(py::module &m) {
+    // -----------------------------------------------------------
+    // 1. Out-of-place binding: output = addbmm(...)
+    // -----------------------------------------------------------
+    m.def("addbmm",
+          &op::addbmm,
+          py::arg("input"),
+          py::arg("batch1"),
+          py::arg("batch2"),
+          py::arg("beta") = 1.0f,
+          py::arg("alpha") = 1.0f,
+          R"doc(Performs a batch matrix-matrix product of the matrices stored in batch1 and batch2,
+with a reduced add step (summing over all matrices in the batch).
+
+.. math::
+    \text{out} = \beta \times \text{input} + \alpha \times \sum_{i=0}^{b-1} (\text{batch1}_i \mathbin{@} \text{batch2}_i)
+
+Args:
+    input (Tensor): Matrix to be added. Shape (n, p).
+    batch1 (Tensor): The first batch of matrices to be multiplied. Shape (b, n, m).
+    batch2 (Tensor): The second batch of matrices to be multiplied. Shape (b, m, p).
+    beta (float, optional): Multiplier for input. Default: 1.0.
+    alpha (float, optional): Multiplier for batch1 @ batch2. Default: 1.0.
+
+Returns:
+    Tensor: Output tensor of shape (n, p).
+)doc");
+
+    // -----------------------------------------------------------
+    // 2. In-place binding: addbmm_(out, ...)
+    // -----------------------------------------------------------
+    m.def("addbmm_",
+          &op::addbmm_,
+          py::arg("out"), // the output tensor comes first
+          py::arg("input"),
+          py::arg("batch1"),
+          py::arg("batch2"),
+          py::arg("beta") = 1.0f,
+          py::arg("alpha") = 1.0f,
+          "In-place version of addbmm");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/affine_grid.hpp b/src/infinicore/pybind11/ops/affine_grid.hpp
new file mode 100644
index 000000000..cc50884b2
--- /dev/null
+++ b/src/infinicore/pybind11/ops/affine_grid.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "infinicore/ops/affine_grid.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_affine_grid(py::module &m) {
+    // Function binding: grid = affine_grid(theta, size, align_corners)
+    m.def("affine_grid",
+          &op::affine_grid,
+          py::arg("theta"),
+          py::arg("size"),
+          py::arg("align_corners") = false,
+          R"doc(Generates a 2D flow field (sampling grid), given a batch of affine matrices theta.
+
+Args:
+    theta (Tensor): Input affine matrices of shape (N, 2, 3).
+    size (List[int]): The target output image size (N, C, H, W).
+    align_corners (bool, optional): Geometrically, the input pixels are treated as squares
+        rather than points. If True, the extrema (-1 and 1) refer to the center points of the
+        input's corner pixels; if False, they refer to the corner points of the input's corner
+        pixels, making the sampling more resolution agnostic. Defaults to False.
+
+Returns:
+    Tensor: Output tensor of shape (N, H, W, 2).
+)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/floor.hpp b/src/infinicore/pybind11/ops/floor.hpp
new file mode 100644
index 000000000..b61c21e19
--- /dev/null
+++ b/src/infinicore/pybind11/ops/floor.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/floor.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_floor(py::module &m) {
+    // Out-of-place binding: output = floor(input)
+    m.def("floor",
+          &op::floor,
+          py::arg("input"),
+          R"doc(Computes the floor of each element of input.)doc");
+
+    // In-place binding: floor_(output, input)
+    m.def("floor_",
+          &op::floor_,
+          py::arg("output"),
+          py::arg("input"),
+          R"doc(In-place floor operation. Writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc
new file mode 100644
index 000000000..f1268552e
--- /dev/null
+++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc
@@ -0,0 +1,60 @@
+#include "acos_cpu.h"
+
+namespace op::acos::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_BF16,
+                INFINI_DTYPE_F16,
+                INFINI_DTYPE_F32,
+                INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // Create the CPU elementwise descriptor.
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    // Floating-point types only; integer dtypes are rejected in create().
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<AcosOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<AcosOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<AcosOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<AcosOp, double>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::acos::cpu
diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h
new file mode 100644
index 000000000..dfe71d07a
--- /dev/null
+++ b/src/infiniop/ops/acos/cpu/acos_cpu.h
@@ -0,0 +1,34 @@
+#ifndef __ACOS_CPU_H__
+#define __ACOS_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+// Declare the Descriptor class via the elementwise macro.
+ELEMENTWISE_DESCRIPTOR(acos, cpu)
+
+#include <cmath>
+#include <type_traits>
+
+namespace op::acos::cpu {
+
+typedef struct AcosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        if constexpr (std::is_integral_v<T>) {
+            return static_cast<T>(std::acos(static_cast<double>(x)));
+        } else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+            return std::acos(x);
+        } else {
+            // fp16/bf16 and similar types: compute through float.
+            return static_cast<T>(std::acos(static_cast<float>(x)));
+        }
+    }
+} AcosOp;
+
+} // namespace op::acos::cpu
+
+#endif // __ACOS_CPU_H__
diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh
new file mode 100644
index 000000000..97c302400
--- /dev/null
+++ b/src/infiniop/ops/acos/cuda/kernel.cuh
@@ -0,0 +1,103 @@
+#ifndef __ACOS_CUDA_H__
+#define __ACOS_CUDA_H__
+
+#if ENABLE_METAX_API
+#include <maca_fp16.h>
+#include <maca_bfloat16.h>
+using nv_bfloat162 = __maca_bfloat162;
+#else
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#endif
+
+#include <cmath>
+#include <cstddef>
+#include <type_traits>
+
+namespace op::acos::cuda {
+
+// ----------------------
+// Fast acos approximation
+// ----------------------
+__device__ __forceinline__ float fast_acosf(float x) {
+    // Fast polynomial approximation of acos(x).
+    float ax = fabsf(x);
+    float t = sqrtf(1.0f - ax);
+    float r = ((-0.0187293f * ax + 0.0742610f) * ax - 0.2121144f) * ax + 1.5707288f;
+    return (x >= 0.0f) ? t * r : 3.14159265358979323846f - t * r;
+}
+
+// ----------------------
+// Primary template (declaration)
+// ----------------------
+template <typename T>
+__device__ __forceinline__ T acos_impl(T val);
+
+// ----------------------
+// float kernel (F32)
+// ----------------------
+template <>
+__device__ __forceinline__ float acos_impl<float>(float val) {
+    return fast_acosf(val);
+}
+
+// ----------------------
+// half kernel (F16)
+// ----------------------
+template <>
+__device__ __forceinline__ half acos_impl<half>(half val) {
+    // half has no native acos; compute through float.
+    float f = __half2float(val);
+    return __float2half(fast_acosf(f));
+}
+
+// ----------------------
+// half2 kernel (F16x2 vectorized)
+// ----------------------
+template <>
+__device__ __forceinline__ half2 acos_impl<half2>(half2 val) {
+    float2 f = __half22float2(val);
+    f.x = fast_acosf(f.x);
+    f.y = fast_acosf(f.y);
+    return __float22half2_rn(f);
+}
+
+// ----------------------
+// bfloat16 kernel (BF16)
+// ----------------------
+template <>
+__device__ __forceinline__ cuda_bfloat16 acos_impl<cuda_bfloat16>(cuda_bfloat16 val) {
+    float f = __bfloat162float(val);
+    return __float2bfloat16(fast_acosf(f));
+}
+
+// ----------------------
+// Fallback definition for other types
+// ----------------------
+template <typename T>
+__device__ __forceinline__ T acos_impl(T val) {
+    return static_cast<T>(fast_acosf(static_cast<float>(val)));
+}
+
+// ----------------------
+// AcosOp struct
+// ----------------------
+struct AcosOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a) const {
+        return acos_impl(a);
+    }
+};
+
+} // namespace op::acos::cuda
+
+#endif // __ACOS_CUDA_H__
diff --git a/src/infiniop/ops/acos/metax/acos_metax.h b/src/infiniop/ops/acos/metax/acos_metax.h
new file mode 100644
index 000000000..4c5c2fcbe
--- /dev/null
+++ b/src/infiniop/ops/acos/metax/acos_metax.h
@@ -0,0 +1,8 @@
+#ifndef __ACOS_METAX_API_H__
+#define __ACOS_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(acos, metax)
+
+#endif // __ACOS_METAX_API_H__
diff --git a/src/infiniop/ops/acos/metax/acos_metax.maca b/src/infiniop/ops/acos/metax/acos_metax.maca
new file mode 100644
index 000000000..99a61bd8b
--- /dev/null
+++ b/src/infiniop/ops/acos/metax/acos_metax.maca
@@ -0,0 +1,58 @@
+#include "acos_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::acos::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape);
+
+    // Create the METAX elementwise descriptor.
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::AcosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::AcosOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::acos::metax
diff --git a/src/infiniop/ops/acos/moore/acos_moore.h b/src/infiniop/ops/acos/moore/acos_moore.h
new file mode 100644
index 000000000..50089ecda
--- /dev/null
+++ b/src/infiniop/ops/acos/moore/acos_moore.h
@@ -0,0 +1,8 @@
+#ifndef __ACOS_MOORE_API_H__
+#define __ACOS_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(acos, moore)
+
+#endif // __ACOS_MOORE_API_H__
diff --git a/src/infiniop/ops/acos/moore/acos_moore.mu b/src/infiniop/ops/acos/moore/acos_moore.mu
new file mode 100644
index 000000000..b11480bd9
--- /dev/null
+++ b/src/infiniop/ops/acos/moore/acos_moore.mu
@-0,0 +1,69 @@
+#include "acos_moore.h"
+
+// Generic elementwise descriptor machinery for the Moore platform.
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "acos_moore_kernel.h"
+
+namespace op::acos::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Acos is a unary operator (y = acos(x)).
+    const auto &in_desc = input_desc_vec.at(0);
+    const auto &out_shape = out_desc->shape();
+    const auto &in_shape = in_desc->shape();
+
+    // Acos supports floating-point types only; unlike floor, it does not
+    // produce integer outputs directly.
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // Check that the output shape matches the input shape.
+    CHECK_SAME_SHAPE(out_shape, in_shape);
+
+    // Create the MOORE elementwise descriptor; the macro generates the
+    // boilerplate descriptor initialization.
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Use the moore::AcosOp template defined in acos_moore_kernel.h.
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::AcosOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::AcosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::AcosOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::AcosOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::acos::moore
diff --git a/src/infiniop/ops/acos/moore/acos_moore_kernel.h b/src/infiniop/ops/acos/moore/acos_moore_kernel.h
new file mode 100644
index 000000000..bcf4406b2
--- /dev/null
+++ b/src/infiniop/ops/acos/moore/acos_moore_kernel.h
@@ -0,0 +1,56 @@
+#ifndef __ACOS_MOORE_KERNEL_H__
+#define __ACOS_MOORE_KERNEL_H__
+
+#include <type_traits>
+
+namespace op::acos::moore {
+
+typedef struct AcosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        // -----------------------------------------------------------------
+        // 1. Half2 (FP16x2)
+        // -----------------------------------------------------------------
+        if constexpr (std::is_same_v<T, half2>) {
+            float f1 = __low2float(input);
+            float f2 = __high2float(input);
+            return __floats2half2_rn(::acosf(f1), ::acosf(f2));
+        }
+        // -----------------------------------------------------------------
+        // 2. Half (FP16)
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, half>) {
+            // half falls back to float.
+            float val_f = __half2float(input);
+            return __float2half(::acosf(val_f));
+        }
+        // -----------------------------------------------------------------
+        // 3. Bfloat16
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // BF16 falls back to float.
+            float val_f = __bfloat162float(input);
+            return __float2bfloat16(::acosf(val_f));
+        }
+        // -----------------------------------------------------------------
+        // 4. Float32
+        // -----------------------------------------------------------------
+        else if constexpr (std::is_same_v<T, float>) {
+            return ::acosf(input);
+        }
+        // -----------------------------------------------------------------
+        // 5. Double / other types
+        // -----------------------------------------------------------------
+        else {
+            return ::acos(input);
+        }
+    }
+} AcosOp;
+
+} // namespace op::acos::moore
+
+#endif // __ACOS_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu
new file mode 100644
index 000000000..4baf6139f
--- /dev/null
+++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu
@@ -0,0 +1,70 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "acos_nvidia.cuh"
+
+namespace op::acos::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_BF16,
+                INFINI_DTYPE_F16,
+                INFINI_DTYPE_F32,
+                INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // Create the descriptor via the generic elementwise macro.
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // -----------------------------------------------------------
+    // Dispatch on dtype using cuda::AcosOp.
+    // -----------------------------------------------------------
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::AcosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::AcosOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::acos::nvidia
diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh
new file mode 100644
index 000000000..67c2110e3
--- /dev/null
+++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh
@@ -0,0 +1,6 @@
+#ifndef __ACOS_NVIDIA_CUH__
+#define __ACOS_NVIDIA_CUH__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(acos, nvidia)
+
+#endif // __ACOS_NVIDIA_CUH__
diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc
new file mode 100644
index 000000000..9ba7778b9
--- /dev/null
+++ b/src/infiniop/ops/acos/operator.cc
@@ -0,0 +1,177 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/acos.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/acos_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/acos_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/acos_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/acos_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateAcosDescriptor(
+    infiniopHandle_t handle,
+    infiniopAcosDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input) {
+
+#define CREATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        return op::acos::NAMESPACE::Descriptor::create(                     \
+            handle,                                                         \
+            reinterpret_cast<op::acos::NAMESPACE::Descriptor **>(desc_ptr), \
+            output,                                                         \
+            {input})
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                 \
+    case CASE:                                                                               \
+        *size = reinterpret_cast<op::acos::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef GET
+}
+
+// =======================================================================
+// 3. Run the computation
+// =======================================================================
+__C infiniStatus_t infiniopAcos(
+    infiniopAcosDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                         \
+    case CASE:                                                             \
+        return reinterpret_cast<op::acos::NAMESPACE::Descriptor *>(desc)   \
+            ->calculate(workspace, workspace_size, output, {input}, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                            \
+    case CASE:                                                             \
+        delete reinterpret_cast<op::acos::NAMESPACE::Descriptor *>(desc);  \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
+
+} // extern "C"
diff --git a/src/infiniop/ops/adaptive_avg_pool1d/adaptive_avg_pool1d.h b/src/infiniop/ops/adaptive_avg_pool1d/adaptive_avg_pool1d.h
new file mode 100644
index 000000000..cbe435a7b
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool1d/adaptive_avg_pool1d.h
@@ -0,0 +1,47 @@
+#ifndef __ADAPTIVE_AVG_POOL1D_H__
+#define __ADAPTIVE_AVG_POOL1D_H__
+
+#include "../../operator.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                     \
+                                                                  \
+    namespace op::adaptive_avg_pool1d::NAMESPACE {                \
+    class Descriptor final : public InfiniopDescriptor {          \
+        struct Opaque;                                            \
+        Opaque *_opaque;                                          \
+        AdaptiveAvgPool1dInfo _info;                              \
+        size_t _workspace_size;                                   \
+                                                                  \
+        Descriptor(                                               \
+            Opaque *opaque,                                       \
+            AdaptiveAvgPool1dInfo info,                           \
+            size_t workspace_size,                                \
+            infiniDevice_t device_type,                           \
+            int device_id)                                        \
+            : InfiniopDescriptor{device_type, device_id},         \
+              _opaque(opaque),                                    \
+              _info(info),                                        \
+              _workspace_size(workspace_size) {}                  \
+                                                                  \
+    public:                                                       \
+        ~Descriptor();                                            \
+                                                                  \
+        size_t workspaceSize() const { return _workspace_size; }  \
+                                                                  \
+        static infiniStatus_t create(                             \
+            infiniopHandle_t handle,                              \
+            Descriptor **desc_ptr,                                \
+            infiniopTensorDescriptor_t out_desc,                  \
+            infiniopTensorDescriptor_t in_desc);                  \
+                                                                  \
+        infiniStatus_t calculate(                                 \
+            void *workspace,                                      \
+            size_t workspace_size,                                \
+            void *output,                                         \
+            const void *input,                                    \
+            void *stream) const;                                  \
+    };                                                            \
+    }
+
+#endif // __ADAPTIVE_AVG_POOL1D_H__
diff --git a/src/infiniop/ops/adaptive_avg_pool1d/cpu/adaptive_avg_pool1d_cpu.cc b/src/infiniop/ops/adaptive_avg_pool1d/cpu/adaptive_avg_pool1d_cpu.cc
new file mode 100644
index 000000000..538ed32cf
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool1d/cpu/adaptive_avg_pool1d_cpu.cc
@@ -0,0 +1,126 @@
+#include "adaptive_avg_pool1d_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <algorithm>
+#include <cmath>
+#include <type_traits>
+
+namespace op::adaptive_avg_pool1d::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t in_desc) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    auto result = AdaptiveAvgPool1dInfo::create(out_desc, in_desc);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        nullptr,       // Opaque*
+        result.take(), // Info
+        0,             // workspace size
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename Tdata>
+void calculate(
+    const AdaptiveAvgPool1dInfo &info,
+    void *output,
+    const void *input) {
+
+    size_t num_channels = info.num_channels(); // batch * channels
info.num_channels(); // Batch * Channels + size_t isize = info.input_size(); // L_in + size_t osize = info.output_size(); // L_out + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + + +#pragma omp parallel for + for (size_t c = 0; c < num_channels; ++c) { + const Tdata *in_c = in_ptr + c * isize; + Tdata *out_c = out_ptr + c * osize; + + // 遍历输出的每一个元素 + for (size_t i = 0; i < osize; ++i) { + size_t istart = std::floor((float)(i * isize) / osize); + size_t iend = std::ceil((float)((i + 1) * isize) / osize); + + // 边界修正 + istart = std::max((size_t)0, std::min(istart, isize)); + iend = std::max((size_t)0, std::min(iend, isize)); + + size_t klen = iend - istart; + + // 使用 float 累加防止溢出 + float sum = 0; + for (size_t j = istart; j < iend; ++j) { + + if constexpr (std::is_same::value || std::is_same::value) { + sum += utils::cast(in_c[j]); + } else { + sum += static_cast(in_c[j]); + } + } + + // 计算平均值并回填 + if (klen > 0) { + float avg = sum / static_cast(klen); + if constexpr (std::is_same::value || std::is_same::value) { + out_c[i] = utils::cast(avg); + } else { + out_c[i] = static_cast(avg); + } + } else { + if constexpr (std::is_same::value || std::is_same::value) { + out_c[i] = utils::cast(0.0f); + } else { + out_c[i] = static_cast(0); + } + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + // 从 Info 中获取 dtype + auto dtype = _info.dtype(); + + switch (dtype) { + case INFINI_DTYPE_F16: + cpu::calculate(_info, output, input); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_BF16: + cpu::calculate(_info, output, input); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_F32: + cpu::calculate(_info, output, input); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_F64: + cpu::calculate(_info, output, input); + return INFINI_STATUS_SUCCESS; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::adaptive_avg_pool1d::cpu \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool1d/cpu/adaptive_avg_pool1d_cpu.h b/src/infiniop/ops/adaptive_avg_pool1d/cpu/adaptive_avg_pool1d_cpu.h new file mode 100644 index 000000000..6d40c4dac --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool1d/cpu/adaptive_avg_pool1d_cpu.h @@ -0,0 +1,6 @@ +#ifndef __ADAPTIVE_AVG_POOL1D_CPU_H__ +#define __ADAPTIVE_AVG_POOL1D_CPU_H__ + +#include "../adaptive_avg_pool1d.h" +DESCRIPTOR(cpu) +#endif // __ADAPTIVE_AVG_POOL1D_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool1d/cuda/kernel.cuh b/src/infiniop/ops/adaptive_avg_pool1d/cuda/kernel.cuh new file mode 100644 index 000000000..e8a88bb8c --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool1d/cuda/kernel.cuh @@ -0,0 +1,167 @@ +#ifndef __ADAPTIVE_AVG_POOL1D_CUDA_H__ +#define __ADAPTIVE_AVG_POOL1D_CUDA_H__ + +#include +#include +#include +#include +#include +#include + +namespace op::adaptive_avg_pool1d::cuda { + +template +__device__ __forceinline__ T warp_reduce_sum(T val) { + #pragma unroll + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_down_sync(0xffffffff, val, offset); + } + return val; +} + +template +__device__ __forceinline__ float to_float(const T &x) { + if constexpr (std::is_same_v) return __half2float(x); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + else if constexpr (std::is_same_v) return __bfloat162float(x); +#endif + else return static_cast(x); +} + +template +__device__ __forceinline__ T 
from_float(float x) { + if constexpr (std::is_same_v) return __float2half(x); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + else if constexpr (std::is_same_v) return __float2bfloat16(x); +#endif + else return static_cast(x); +} + + +template +__global__ void global_avg_pool1d_kernel( + T* output, + const T* input, + size_t total_channels, // batch * channels + size_t isize +) { + // 每一个 Block 处理一个 (Batch, Channel) 任务 + size_t channel_idx = blockIdx.x; + if (channel_idx >= total_channels) return; + + const T* channel_input = input + channel_idx * isize; + float sum = 0.0f; + + // Grid-Stride Loop within the channel (handle isize > blockDim.x) + for (size_t i = threadIdx.x; i < isize; i += blockDim.x) { + sum += to_float(channel_input[i]); + } + + // Block 内归约 + // 1. Warp Reduce + sum = warp_reduce_sum(sum); + + // 2. Shared Memory Reduce (跨 Warp) + static __shared__ float shared_sum[32]; // Max 1024 threads / 32 = 32 warps + int lane = threadIdx.x % 32; + int wid = threadIdx.x / 32; + + if (lane == 0) { + shared_sum[wid] = sum; + } + __syncthreads(); + + + if (wid == 0) { + float val = (threadIdx.x < (blockDim.x + 31) / 32) ? shared_sum[lane] : 0.0f; + val = warp_reduce_sum(val); + if (threadIdx.x == 0) { + output[channel_idx] = from_float(val / static_cast(isize)); + } + } +} + + +template +__global__ void adaptive_avg_pool1d_general_kernel( + T* output, + const T* input, + size_t batch_channels, + size_t isize, + size_t osize +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + size_t total_elements = batch_channels * osize; + + // 预计算缩放因子,避免循环内除法 + float stride_factor = static_cast(isize) / static_cast(osize); + + for (; idx < total_elements; idx += stride) { + size_t bc_idx = idx / osize; + size_t out_idx = idx % osize; + + const T* in_ptr = input + bc_idx * isize; + + int istart = static_cast(floorf(out_idx * stride_factor)); + int iend = static_cast(ceilf((out_idx + 1) * stride_factor)); + + // 边界保护 + istart = max(0, istart); + iend = min(static_cast(isize), iend); + + float sum = 0.0f; + int klen = iend - istart; + + + for (int i = istart; i < iend; ++i) { + sum += to_float(in_ptr[i]); + } + + output[idx] = (klen > 0) ? 
from_float(sum / klen) : from_float(0.0f);
+    }
+}
+
+// -------------------------------------------
+// Launcher
+// -------------------------------------------
+template <typename T>
+void launch_adaptive_avg_pool1d(
+    T* output,
+    const T* input,
+    size_t batch_channels,
+    size_t isize,
+    size_t osize,
+    cudaStream_t stream
+) {
+    // Dispatch by strategy.
+    if (osize == 1) {
+        int threads = 256;
+        // Use fewer threads when isize is small.
+        if (isize < 256) threads = 128;
+        if (isize < 128) threads = 64;
+        if (isize < 64) threads = 32;
+
+        dim3 block(threads);
+        dim3 grid(batch_channels);
+
+        global_avg_pool1d_kernel<T><<<grid, block, 0, stream>>>(
+            output, input, batch_channels, isize
+        );
+    } else {
+        size_t total_output = batch_channels * osize;
+        int threads = 256;
+        int blocks = (total_output + threads - 1) / threads;
+
+        if (blocks > 65535) blocks = 65535;
+
+        adaptive_avg_pool1d_general_kernel<T><<<blocks, threads, 0, stream>>>(
+            output, input, batch_channels, isize, osize
+        );
+    }
+}
+
+} // namespace op::adaptive_avg_pool1d::cuda
+
+#endif // __ADAPTIVE_AVG_POOL1D_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool1d/info.h b/src/infiniop/ops/adaptive_avg_pool1d/info.h
new file mode 100644
index 000000000..175d70fa6
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool1d/info.h
@@ -0,0 +1,65 @@
+#ifndef __ADAPTIVE_AVG_POOL1D_INFO_H__
+#define __ADAPTIVE_AVG_POOL1D_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include
+
+namespace op::adaptive_avg_pool1d {
+
+class AdaptiveAvgPool1dInfo {
+    AdaptiveAvgPool1dInfo() = default;
+
+public:
+    size_t _input_size;
+    size_t _output_size;
+    size_t _num_channels;
+    int _dtype;
+
+    size_t input_size() const { return _input_size; }
+    size_t output_size() const { return _output_size; }
+    size_t num_channels() const { return _num_channels; }
+    int dtype() const { return _dtype; }
+
+    static utils::Result<AdaptiveAvgPool1dInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc) {
+
+        // 1. Check dtype consistency.
+        if (out_desc->dtype() != in_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        // 2. Check the rank (at least 2-D: (C, L) or (N, C, L)).
+        size_t ndim = in_desc->ndim();
+        if (ndim < 2 || out_desc->ndim() != ndim) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        // 3. All leading dims must match; flatten them into num_channels.
+        size_t num_channels = 1;
+        for (size_t i = 0; i < ndim - 1; ++i) {
+            if (in_desc->shape()[i] != out_desc->shape()[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+            num_channels *= in_desc->shape()[i];
+        }
+
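+        // Example (assumed shapes): input (N, C, L_in) = (2, 3, 8) with
+        // output (2, 3, 4) flattens to num_channels = 2 * 3 = 6,
+        // input_size = 8, output_size = 4.
+
+        // 4. 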
获取输入和输出的长度 (L) + size_t input_size = in_desc->shape()[ndim - 1]; + size_t output_size = out_desc->shape()[ndim - 1]; + int dtype = in_desc->dtype(); + + + return utils::Result(AdaptiveAvgPool1dInfo{ + input_size, + output_size, + num_channels, + dtype}); + } +}; + +} // namespace op::adaptive_avg_pool1d + +#endif // __ADAPTIVE_AVG_POOL1D_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool1d/metax/adaptive_avg_pool1d_metax.h b/src/infiniop/ops/adaptive_avg_pool1d/metax/adaptive_avg_pool1d_metax.h new file mode 100644 index 000000000..5dd0320ea --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool1d/metax/adaptive_avg_pool1d_metax.h @@ -0,0 +1,8 @@ +#ifndef __ADAPTIVE_AVG_POOL1D_METAX_H__ +#define __ADAPTIVE_AVG_POOL1D_METAX_H__ + +#include "../adaptive_avg_pool1d.h" + +DESCRIPTOR(metax) + +#endif // __ADAPTIVE_AVG_POOL1D_METAX_H__ \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool1d/metax/adaptive_avg_pool1d_metax.maca b/src/infiniop/ops/adaptive_avg_pool1d/metax/adaptive_avg_pool1d_metax.maca new file mode 100644 index 000000000..abb40afc0 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool1d/metax/adaptive_avg_pool1d_metax.maca @@ -0,0 +1,191 @@ +#include "adaptive_avg_pool1d_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include +#include +#include + +namespace op::adaptive_avg_pool1d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + auto handle = reinterpret_cast(handle_); + + // 检查数据类型 + auto dtype = out_desc->dtype(); + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // 调用 Info::create (只传两个参数) + auto result = AdaptiveAvgPool1dInfo::create(out_desc, in_desc); + + // 简单的错误检查,Info::create 返回 Result 类型 + if (!result) { + return INFINI_STATUS_BAD_PARAM; + } + + *desc_ptr = new Descriptor( + new Opaque{handle->internal()}, + result.take(), + 0, + handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + // 准备 MCDNN 参数 + mcdnnDataType_t data_type; + mcdnnPoolingMode_t pooling_mode = MCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + mcdnnTensorFormat_t tensor_format = MCDNN_TENSOR_NCHW; + + float alpha = 1.0f; + float beta = 0.0f; + + switch (_info.dtype()) { + case INFINI_DTYPE_F16: + data_type = MCDNN_DATA_HALF; + break; + case INFINI_DTYPE_F32: + data_type = MCDNN_DATA_FLOAT; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // 维度映射: + // Info 类已经将 N 和 C 展平为 num_channels。 + // 为了适配 mcdnn (NCHW),我们将 num_channels 视为 N (Batch),C=1, H=1, W=Length。 + // 这样做对于 1D Pooling 是数学上等价且高效的。 + int n = static_cast(_info.num_channels()); + int c = 1; + int h_in = 1; + int w_in = static_cast(_info.input_size()); + + int h_out = 1; + int w_out = static_cast(_info.output_size()); + + // 动态计算 Kernel 和 Stride (Adaptive -> Fixed转换) + // 1. Stride = Input / Output + // 2. 
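+    //    (e.g., with assumed values Input = 8, Output = 4:
+    //     Stride = 8 / 4 = 2 and Kernel = 8 - (4 - 1) * 2 = 2)
+    // 2. 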
Kernel = Input - (Output - 1) * Stride + // 注意: 如果 Input 不能被 Output 整除,这种固定 Stride/Kernel 的方式只是近似, + // 但在使用标准 Pooling API 实现 Adaptive Pooling 时是常用做法。 + int stride_w = w_in / w_out; + int kernel_w = w_in - (w_out - 1) * stride_w; + int pad_w = 0; + + int windowHeight = 1; + int windowWidth = kernel_w; + int verticalPadding = 0; + int horizontalPadding = pad_w; + int verticalStride = 1; + int horizontalStride = stride_w; + + return _opaque->internal->useMcdnn( + (hcStream_t)stream, + [&](auto raw_handle) { + mcdnnHandle_t handle = reinterpret_cast(raw_handle); + mcdnnStatus_t status; + + // 1. 输入张量描述符 + mcdnnTensorDescriptor_t input_desc; + status = mcdnnCreateTensorDescriptor(&input_desc); + if (status != MCDNN_STATUS_SUCCESS) return INFINI_STATUS_INTERNAL_ERROR; + + status = mcdnnSetTensor4dDescriptor( + input_desc, + tensor_format, + data_type, + n, c, h_in, w_in); + if (status != MCDNN_STATUS_SUCCESS) { + mcdnnDestroyTensorDescriptor(input_desc); + return INFINI_STATUS_INTERNAL_ERROR; + } + + // 2. 输出张量描述符 + mcdnnTensorDescriptor_t output_desc; + status = mcdnnCreateTensorDescriptor(&output_desc); + if (status != MCDNN_STATUS_SUCCESS) { + mcdnnDestroyTensorDescriptor(input_desc); + return INFINI_STATUS_INTERNAL_ERROR; + } + + status = mcdnnSetTensor4dDescriptor( + output_desc, + tensor_format, + data_type, + n, c, h_out, w_out); + if (status != MCDNN_STATUS_SUCCESS) { + mcdnnDestroyTensorDescriptor(input_desc); + mcdnnDestroyTensorDescriptor(output_desc); + return INFINI_STATUS_INTERNAL_ERROR; + } + + // 3. Pooling 描述符 + mcdnnPoolingDescriptor_t pool_desc; + status = mcdnnCreatePoolingDescriptor(&pool_desc); + if (status != MCDNN_STATUS_SUCCESS) { + mcdnnDestroyTensorDescriptor(input_desc); + mcdnnDestroyTensorDescriptor(output_desc); + return INFINI_STATUS_INTERNAL_ERROR; + } + + status = mcdnnSetPooling2dDescriptor( + pool_desc, + pooling_mode, + MCDNN_NOT_PROPAGATE_NAN, + windowHeight, windowWidth, + verticalPadding, horizontalPadding, + verticalStride, horizontalStride); + + if (status != MCDNN_STATUS_SUCCESS) { + mcdnnDestroyTensorDescriptor(input_desc); + mcdnnDestroyTensorDescriptor(output_desc); + mcdnnDestroyPoolingDescriptor(pool_desc); + return INFINI_STATUS_INTERNAL_ERROR; + } + + // 4. 
执行 + status = mcdnnPoolingForward( + handle, + pool_desc, + &alpha, + input_desc, + input, + &beta, + output_desc, + output); + + // 清理 + mcdnnDestroyTensorDescriptor(input_desc); + mcdnnDestroyTensorDescriptor(output_desc); + mcdnnDestroyPoolingDescriptor(pool_desc); + + if (status != MCDNN_STATUS_SUCCESS) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + return INFINI_STATUS_SUCCESS; + }); +} + +} // namespace op::adaptive_avg_pool1d::metax \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore.h b/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore.h new file mode 100644 index 000000000..ae35d220e --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore.h @@ -0,0 +1,8 @@ +#ifndef __ADAPTIVE_AVG_POOL1D_MOORE_API_H__ +#define __ADAPTIVE_AVG_POOL1D_MOORE_API_H__ + +// 引入上层定义的 Descriptor 宏和基础类 +#include "../adaptive_avg_pool1d.h" +DESCRIPTOR(moore) + +#endif // __ADAPTIVE_AVG_POOL1D_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore.mu b/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore.mu new file mode 100644 index 000000000..63bb73952 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore.mu @@ -0,0 +1,175 @@ +#include "adaptive_avg_pool1d_moore.h" +#include "adaptive_avg_pool1d_moore_kernel.h" + +#include +#include +#include +#include + +#include "../../../devices/moore/moore_handle.h" + +namespace op::adaptive_avg_pool1d::moore { + +// ================================================================== +// 1. Kernel Implementation +// ================================================================== + +template +__global__ void adaptive_avg_pool1d_kernel( + const int total_elements, + const int input_size, + const int output_size, + const T *input, + T *output) { + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < total_elements) { + int w_out = idx % output_size; + int bc = idx / output_size; + + int start = (w_out * input_size) / output_size; + int end = ((w_out + 1) * input_size + output_size - 1) / output_size; + + start = (start < 0) ? 0 : start; + end = (end > input_size) ? input_size : end; + + int kernel_size = end - start; + if (kernel_size < 1) kernel_size = 1; + + const T *in_ptr = input + bc * input_size; + + float sum = 0.0f; + for (int i = start; i < end; ++i) { + T val = in_ptr[i]; + if constexpr (std::is_same_v) { + sum += __half2float(val); + } else if constexpr (std::is_same_v) { + sum += __bfloat162float(val); + } else { + sum += static_cast(val); + } + } + + float avg = sum / static_cast(kernel_size); + + if constexpr (std::is_same_v) { + output[idx] = __float2half(avg); + } else if constexpr (std::is_same_v) { + output[idx] = __float2bfloat16(avg); + } else { + output[idx] = static_cast(avg); + } + } +} + +// ================================================================== +// 2. 
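+// Launch-shape sketch (assumed values): num_channels = 6 and output_size = 4
+// give total_elements = 24, threads = 256, blocks = ceil(24 / 256) = 1.
+// 2. 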
Launcher Implementation +// ================================================================== + +template +void adaptive_avg_pool1d_moore_launch( + const AdaptiveAvgPool1dInfo &info, + T *output, + const T *input, + void *stream) { + + int input_size = info.input_size(); + int output_size = info.output_size(); + + size_t total_elements = info.num_channels() * output_size; + + int threads = 256; + int blocks = (total_elements + threads - 1) / threads; + + adaptive_avg_pool1d_kernel<<>>( + total_elements, + input_size, + output_size, + input, + output + ); +} + +// ================================================================== +// 3. Descriptor Implementation +// ================================================================== + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = AdaptiveAvgPool1dInfo::create(out_desc, in_desc); + if (!info_result) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor( + nullptr, + *info_result, + 0, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype()) { + case INFINI_DTYPE_F16: + adaptive_avg_pool1d_moore_launch( + _info, + static_cast(output), + static_cast(input), + stream); + break; + + case INFINI_DTYPE_BF16: + adaptive_avg_pool1d_moore_launch<__mt_bfloat16>( + _info, + static_cast<__mt_bfloat16 *>(output), + static_cast(input), + stream); + break; + + case INFINI_DTYPE_F32: + adaptive_avg_pool1d_moore_launch( + _info, + static_cast(output), + static_cast(input), + stream); + break; + + case INFINI_DTYPE_F64: + adaptive_avg_pool1d_moore_launch( + _info, + static_cast(output), + static_cast(input), + stream); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::adaptive_avg_pool1d::moore diff --git a/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore_kernel.h b/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore_kernel.h new file mode 100644 index 000000000..281dbc4f1 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool1d/moore/adaptive_avg_pool1d_moore_kernel.h @@ -0,0 +1,61 @@ +#ifndef __ADAPTIVE_AVG_POOL1D_MOORE_KERNEL_H__ +#define __ADAPTIVE_AVG_POOL1D_MOORE_KERNEL_H__ + +#include +#include +#include + +#include + +namespace op::adaptive_avg_pool1d::moore { + +typedef struct AdaptiveAvgPool1dOp { +public: + template + __device__ __forceinline__ void operator()( + const int w_out, + const int input_size, + const int output_size, + const T* input_base, + T* output_ptr + ) const { + + int start = (w_out * input_size) / output_size; + int end = ((w_out + 1) * input_size + output_size - 1) / output_size; + + start = (start < 0) ? 0 : start; + end = (end > input_size) ? input_size : end; + + int kernel_size = end - start; + kernel_size = (kernel_size < 1) ? 
1 : kernel_size; + + float sum = 0.0f; + + for (int i = start; i < end; ++i) { + T val = input_base[i]; + + if constexpr (std::is_same_v) { + sum += __half2float(val); + } else if constexpr (std::is_same_v) { + sum += __bfloat162float(val); + } else { + sum += static_cast(val); + } + } + + float avg = sum / static_cast(kernel_size); + + if constexpr (std::is_same_v) { + *output_ptr = __float2half(avg); + } else if constexpr (std::is_same_v) { + *output_ptr = __float2bfloat16(avg); + } else { + *output_ptr = static_cast(avg); + } + } + +} AdaptiveAvgPool1dOp; + +} // namespace op::adaptive_avg_pool1d::moore + +#endif // __ADAPTIVE_AVG_POOL1D_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/adaptive_avg_pool1d/nvidia/adaptive_avg_pool1d_nvidia.cu b/src/infiniop/ops/adaptive_avg_pool1d/nvidia/adaptive_avg_pool1d_nvidia.cu new file mode 100644 index 000000000..e9699599c --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool1d/nvidia/adaptive_avg_pool1d_nvidia.cu @@ -0,0 +1,100 @@ + +#include "../cuda/kernel.cuh" +#include "adaptive_avg_pool1d_nvidia.cuh" +#include "../../../handle.h" + +namespace op::adaptive_avg_pool1d::nvidia { + + +template +void launch_kernel( + void *output, + const void *input, + size_t num_channels, // 这里实际上是 total_channels (Batch * C) + size_t isize, + size_t osize, + void *stream) { + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + auto cuda_stream = reinterpret_cast(stream); + + cuda::launch_adaptive_avg_pool1d( + out_ptr, + in_ptr, + num_channels, + isize, + osize, + cuda_stream + ); +} + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + // 1. 使用 Info 类解析参数 + auto info_result = AdaptiveAvgPool1dInfo::create(out_desc, in_desc); + if (!info_result) { + return info_result.status(); + } + auto info = info_result.take(); + + // 2. 
Create the Descriptor
+    *desc_ptr = new Descriptor(
+        new Opaque(),      // Opaque pointer
+        info,              // Info object
+        0,                 // Workspace size
+        handle->device,    // Device type
+        handle->device_id  // Device id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    auto num_channels = _info.num_channels(); // usually Batch * Channels
+    auto input_size = _info.input_size();
+    auto output_size = _info.output_size();
+
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, num_channels, input_size, output_size, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        // Use the standard nv_bfloat16 type (host-side dispatch;
+        // __CUDA_ARCH__ guards only apply inside device code).
+        launch_kernel<__nv_bfloat16>(output, input, num_channels, input_size, output_size, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, num_channels, input_size, output_size, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, num_channels, input_size, output_size, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::adaptive_avg_pool1d::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/adaptive_avg_pool1d/nvidia/adaptive_avg_pool1d_nvidia.cuh b/src/infiniop/ops/adaptive_avg_pool1d/nvidia/adaptive_avg_pool1d_nvidia.cuh
new file mode 100644
index 000000000..b95e1088f
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool1d/nvidia/adaptive_avg_pool1d_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __ADAPTIVE_AVG_POOL1D_CUH__
+#define __ADAPTIVE_AVG_POOL1D_CUH__
+
+#include "../adaptive_avg_pool1d.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __ADAPTIVE_AVG_POOL1D_CUH__
diff --git a/src/infiniop/ops/adaptive_avg_pool1d/operator.cc b/src/infiniop/ops/adaptive_avg_pool1d/operator.cc
new file mode 100644
index 000000000..ed24bb837
--- /dev/null
+++ b/src/infiniop/ops/adaptive_avg_pool1d/operator.cc
@@ -0,0 +1,195 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/adaptive_avg_pool1d.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/adaptive_avg_pool1d_cpu.h"
+#endif
+
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/adaptive_avg_pool1d_nvidia.cuh"
+#endif
+
+// [Metax Support]
+#ifdef ENABLE_METAX_API
+#include "metax/adaptive_avg_pool1d_metax.h"
+#endif
+
+// [Moore Threads Support]
+#ifdef ENABLE_MOORE_API
+#include "moore/adaptive_avg_pool1d_moore.h"
+#endif
+
+extern "C" {
+
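+// Illustrative call sequence for this C API (sketch only; `handle`, the
+// descriptors, the buffers, and `stream` are assumed to exist):
+//
+//     infiniopAdaptiveAvgPool1dDescriptor_t desc;
+//     infiniopCreateAdaptiveAvgPool1dDescriptor(handle, &desc, out_desc, in_desc);
+//     size_t workspace_size = 0;
+//     infiniopGetAdaptiveAvgPool1dWorkspaceSize(desc, &workspace_size);
+//     infiniopAdaptiveAvgPool1d(desc, workspace, workspace_size, output, input, stream);
+//     infiniopDestroyAdaptiveAvgPool1dDescriptor(desc);
+
+// =======================================================================
+// 1. 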
创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateAdaptiveAvgPool1dDescriptor( + infiniopHandle_t handle, + infiniopAdaptiveAvgPool1dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::adaptive_avg_pool1d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + + // [Metax 分支] + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + // [Moore 分支] + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetAdaptiveAvgPool1dWorkspaceSize(infiniopAdaptiveAvgPool1dDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (reinterpret_cast(desc)->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + + // [Metax 分支] + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + // [Moore 分支] + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopAdaptiveAvgPool1d( + infiniopAdaptiveAvgPool1dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (reinterpret_cast(desc)->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + + // [Metax 分支] + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + // [Moore 分支] + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + +// ======================================================================= +// 4. 
销毁描述符 +// ======================================================================= +__C infiniStatus_t infiniopDestroyAdaptiveAvgPool1dDescriptor(infiniopAdaptiveAvgPool1dDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (reinterpret_cast(desc)->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + + // [Metax 分支] + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + // [Moore 分支] + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/addbmm.h b/src/infiniop/ops/addbmm/addbmm.h new file mode 100644 index 000000000..25a2ce40c --- /dev/null +++ b/src/infiniop/ops/addbmm/addbmm.h @@ -0,0 +1,52 @@ +#ifndef ADDBMM_H +#define ADDBMM_H + +#include "../../operator.h" +#include "info.h" // 对应 addbmm_info.h + +// 宏定义:用于生成不同命名空间下的 Descriptor 类 +// 注意:addbmm 需要处理 alpha, beta 和多个输入张量 +#define DESCRIPTOR(NAMESPACE) \ + namespace op::addbmm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AddbmmInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + AddbmmInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + /* create 函数接收 Tensor 描述符列表和标量系数 */ \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec, \ + float alpha, \ + float beta); \ + \ + /* calculate 函数接收数据指针列表 */ \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const; \ + }; \ + } + +#endif // ADDBMM_H \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/cpu/addbmm_cpu.cc b/src/infiniop/ops/addbmm/cpu/addbmm_cpu.cc new file mode 100644 index 000000000..f9d8b9021 --- /dev/null +++ b/src/infiniop/ops/addbmm/cpu/addbmm_cpu.cc @@ -0,0 +1,187 @@ +#include "addbmm_cpu.h" +#include +#include +#include +#include +#include +#include +#include "../../../devices/cpu/common_cpu.h" +#include "../../../handle.h" + +namespace op::addbmm::cpu { + +Descriptor::~Descriptor() = default; + +// ================================================================== +// 辅助函数:通用 stride 寻址 +// ================================================================== + +// 计算 2D 张量的偏移量 +inline size_t offset_2d(size_t r, size_t c, const int64_t *strides) { + return r * strides[0] + c * strides[1]; +} + +// 计算 3D 张量的偏移量 +inline size_t offset_3d(size_t b, size_t r, size_t c, const int64_t *strides) { + return b * strides[0] + r * strides[1] + c * strides[2]; +} + +// ================================================================== +// 核心 Kernel 实现 +// ================================================================== + +/** + * @brief Addbmm 核心 CPU 计算函数 (支持任意 Stride) + */ 
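+
+// Reference semantics (PyTorch-style addbmm), for orientation:
+//
+//     output[n][p] = beta * input[n][p]
+//                    + alpha * sum over b, m of batch1[b][n][m] * batch2[b][m][p]
+//
+// with batch1 (b, n, m), batch2 (b, m, p), and input/output (n, p), which is
+// exactly what the loops below compute.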
+template +void calculate_impl( + const AddbmmInfo &info, + void *output, + const void *input, + const void *batch1, + const void *batch2) { + + // [变更 1] 使用 Getter 获取维度 + size_t b_dim = info.b(); + size_t n = info.n(); + size_t m = info.m(); + size_t p = info.p(); + + float alpha = info.alpha(); + float beta = info.beta(); + + // 指针转换 + Tdata *out_ptr = reinterpret_cast(output); + const Tdata *inp_ptr = reinterpret_cast(input); + const Tdata *b1_ptr = reinterpret_cast(batch1); + const Tdata *b2_ptr = reinterpret_cast(batch2); + + + const int64_t *out_strides = info.out_strides().data(); + const int64_t *in_strides = info.in_strides().data(); + const int64_t *b1_strides = info.b1_strides().data(); + const int64_t *b2_strides = info.b2_strides().data(); + + // 1. 初始化 output = beta * input + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < p; ++k) { + size_t out_idx = offset_2d(i, k, out_strides); + size_t in_idx = offset_2d(i, k, in_strides); + + float val_in = (beta != 0.0f) ? utils::cast(inp_ptr[in_idx]) : 0.0f; + + if (beta == 0.0f && alpha == 0.0f) { + out_ptr[out_idx] = utils::cast(0.0f); + } else { + out_ptr[out_idx] = utils::cast(val_in * beta); + } + } + } + + // 2. 累加矩阵乘法: out += alpha * sum(b1 @ b2) + for (size_t b = 0; b < b_dim; ++b) { // Batch + for (size_t i = 0; i < n; ++i) { // Row + for (size_t k = 0; k < p; ++k) { // Col + + float dot_product = 0.0f; + + // 内部点积 (Inner dimension m) + for (size_t j = 0; j < m; ++j) { + size_t b1_idx = offset_3d(b, i, j, b1_strides); + size_t b2_idx = offset_3d(b, j, k, b2_strides); + + float v1 = utils::cast(b1_ptr[b1_idx]); + float v2 = utils::cast(b2_ptr[b2_idx]); + + dot_product += v1 * v2; + } + + // 累加到 Output + size_t out_idx = offset_2d(i, k, out_strides); + float current_val = utils::cast(out_ptr[out_idx]); + out_ptr[out_idx] = utils::cast(current_val + alpha * dot_product); + } + } + } +} + +// ================================================================== +// Descriptor 接口实现 +// ================================================================== + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float alpha, + float beta) { + + if (input_desc_vec.size() != 3) { + return INFINI_STATUS_BAD_PARAM; + } + + infiniopTensorDescriptor_t in_desc = input_desc_vec[0]; + infiniopTensorDescriptor_t batch1_desc = input_desc_vec[1]; + infiniopTensorDescriptor_t batch2_desc = input_desc_vec[2]; + + auto dtype = out_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 创建 Info 对象 + auto result = AddbmmInfo::create(out_desc, in_desc, batch1_desc, batch2_desc, alpha, beta); + CHECK_RESULT(result); + + auto handle = reinterpret_cast(handle_); + + *desc_ptr = new Descriptor( + nullptr, + result.take(), + 0, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (inputs.size() != 3) { + return INFINI_STATUS_BAD_PARAM; + } + + const void *input = inputs[0]; + const void *batch1 = inputs[1]; + const void *batch2 = inputs[2]; + + auto dtype = _info.dtype(); + + switch (dtype) { + case INFINI_DTYPE_F16: + calculate_impl(_info, output, input, batch1, batch2); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_BF16: + calculate_impl(_info, output, input, batch1, batch2); + return 
INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_F32: + calculate_impl(_info, output, input, batch1, batch2); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_F64: + calculate_impl(_info, output, input, batch1, batch2); + return INFINI_STATUS_SUCCESS; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::addbmm::cpu \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/cpu/addbmm_cpu.h b/src/infiniop/ops/addbmm/cpu/addbmm_cpu.h new file mode 100644 index 000000000..b2e30c7d3 --- /dev/null +++ b/src/infiniop/ops/addbmm/cpu/addbmm_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ADDBMM_CPU_H__ +#define __ADDBMM_CPU_H__ + +#include "../addbmm.h" + +DESCRIPTOR(cpu) + +#endif //_GRID_CPU_H_ \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/cuda/kernel.cuh b/src/infiniop/ops/addbmm/cuda/kernel.cuh new file mode 100644 index 000000000..f8483ed1b --- /dev/null +++ b/src/infiniop/ops/addbmm/cuda/kernel.cuh @@ -0,0 +1,155 @@ +#ifndef __ADDBMM_NVIDIA_CUH__ +#define __ADDBMM_NVIDIA_CUH__ + +#include +#include +#include +#include +#include + +namespace op::addbmm::nvidia { + +// --- 常量定义 --- +constexpr int BLOCK_SIZE = 16; // 16x16 线程块,处理 FP32/FP16 比较通用 + +template +__device__ __forceinline__ float to_float_acc(const T &x) { + if constexpr (std::is_same_v) return __half2float(x); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + else if constexpr (std::is_same_v) return __bfloat162float(x); +#endif + else return static_cast(x); +} + +template +__device__ __forceinline__ T from_float_res(float x) { + if constexpr (std::is_same_v) return __float2half(x); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + else if constexpr (std::is_same_v) return __float2bfloat16(x); +#endif + else return static_cast(x); +} + + +template +__global__ void addbmm_tiled_kernel( + T *output, + const T *input, + const T *batch1, + const T *batch2, + size_t b, size_t n, size_t m, size_t p, + float alpha, float beta, + // Strides + ptrdiff_t out_s0, ptrdiff_t out_s1, + ptrdiff_t inp_s0, ptrdiff_t inp_s1, + ptrdiff_t b1_s0, ptrdiff_t b1_s1, ptrdiff_t b1_s2, + ptrdiff_t b2_s0, ptrdiff_t b2_s1, ptrdiff_t b2_s2 +) { + // Block 行列索引 (Output 的坐标) + int row = blockIdx.y * BLOCK_SIZE + threadIdx.y; + int col = blockIdx.x * BLOCK_SIZE + threadIdx.x; + + float acc = 0.0f; + + // Shared Memory 缓存 + // 大小: 2个矩阵 * BLOCK_SIZE * BLOCK_SIZE + __shared__ float s_b1[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float s_b2[BLOCK_SIZE][BLOCK_SIZE]; + + // 遍历每一个 Batch + for (int batch_idx = 0; batch_idx < b; ++batch_idx) { + + // 遍历 K 维度 (即 m 维度),步长为 BLOCK_SIZE + for (int k = 0; k < m; k += BLOCK_SIZE) { + + + if (row < n && (k + threadIdx.x) < m) { + // b1: [batch_idx, row, k + tx] + size_t idx = batch_idx * b1_s0 + row * b1_s1 + (k + threadIdx.x) * b1_s2; + s_b1[threadIdx.y][threadIdx.x] = to_float_acc(batch1[idx]); + } else { + s_b1[threadIdx.y][threadIdx.x] = 0.0f; + } + + if ((k + threadIdx.y) < m && col < p) { + // b2: [batch_idx, k + ty, col] + size_t idx = batch_idx * b2_s0 + (k + threadIdx.y) * b2_s1 + col * b2_s2; + s_b2[threadIdx.y][threadIdx.x] = to_float_acc(batch2[idx]); + } else { + s_b2[threadIdx.y][threadIdx.x] = 0.0f; + } + + // 等待所有线程加载完毕 + __syncthreads(); + + // 2. 计算子块乘积 (Partial Accumulation) + // 循环展开以提高指令吞吐量 + #pragma unroll + for (int e = 0; e < BLOCK_SIZE; ++e) { + acc += s_b1[threadIdx.y][e] * s_b2[e][threadIdx.x]; + } + + // 等待计算完毕,准备加载下一个 tile + __syncthreads(); + } + } + + // 3. 
写入结果 (Scale + Add Input) + if (row < n && col < p) { + float res = 0.0f; + + // Input 部分: beta * input + if (beta != 0.0f) { + size_t inp_idx = row * inp_s0 + col * inp_s1; + res = beta * to_float_acc(input[inp_idx]); + } + + // Matmul 部分: alpha * sum + res += alpha * acc; + + size_t out_idx = row * out_s0 + col * out_s1; + output[out_idx] = from_float_res(res); + } +} + +// ================================================================== +// 2. Launcher +// ================================================================== +template +void launch_kernel( + void *output, const void *input, const void *batch1, const void *batch2, + size_t b, size_t n, size_t m, size_t p, + float alpha, float beta, + // Strides + ptrdiff_t out_s0, ptrdiff_t out_s1, + ptrdiff_t inp_s0, ptrdiff_t inp_s1, + ptrdiff_t b1_s0, ptrdiff_t b1_s1, ptrdiff_t b1_s2, + ptrdiff_t b2_s0, ptrdiff_t b2_s1, ptrdiff_t b2_s2, + void *stream) { + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + auto b1_ptr = reinterpret_cast(batch1); + auto b2_ptr = reinterpret_cast(batch2); + + // 2D Grid 配置 + dim3 block(BLOCK_SIZE, BLOCK_SIZE); // 16x16 threads + dim3 grid( + (p + BLOCK_SIZE - 1) / BLOCK_SIZE, // x轴覆盖 col (p) + (n + BLOCK_SIZE - 1) / BLOCK_SIZE // y轴覆盖 row (n) + ); + + auto cuda_stream = reinterpret_cast(stream); + + addbmm_tiled_kernel<<>>( + out_ptr, in_ptr, b1_ptr, b2_ptr, + b, n, m, p, + alpha, beta, + out_s0, out_s1, inp_s0, inp_s1, + b1_s0, b1_s1, b1_s2, b2_s0, b2_s1, b2_s2 + ); +} + +} // namespace op::addbmm::nvidia + +#endif // __ADDBMM_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/info.h b/src/infiniop/ops/addbmm/info.h new file mode 100644 index 000000000..dd1f12ba3 --- /dev/null +++ b/src/infiniop/ops/addbmm/info.h @@ -0,0 +1,123 @@ +#ifndef __ADDBMM_INFO_H__ +#define __ADDBMM_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::addbmm { + +class AddbmmInfo { + AddbmmInfo() = default; + +public: + // 矩阵运算维度的定义: + // batch1: (b, n, m) + // batch2: (b, m, p) + // input/output: (n, p) + size_t _b; + size_t _n; + size_t _m; + size_t _p; + + // 【新增】步长信息 (Strides) + // 用于处理非连续内存布局 + std::vector _out_strides; // [stride_n, stride_p] + std::vector _in_strides; // [stride_n, stride_p] + std::vector _b1_strides; // [stride_b, stride_n, stride_m] + std::vector _b2_strides; // [stride_b, stride_m, stride_p] + + // 标量系数 + float _alpha; + float _beta; + + // 数据类型 + int _dtype; + + // Getters + size_t b() const { return _b; } + size_t n() const { return _n; } + size_t m() const { return _m; } + size_t p() const { return _p; } + + // 【新增】Strides Getters + const std::vector& out_strides() const { return _out_strides; } + const std::vector& in_strides() const { return _in_strides; } + const std::vector& b1_strides() const { return _b1_strides; } + const std::vector& b2_strides() const { return _b2_strides; } + + float alpha() const { return _alpha; } + float beta() const { return _beta; } + int dtype() const { return _dtype; } + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + infiniopTensorDescriptor_t batch1_desc, + infiniopTensorDescriptor_t batch2_desc, + float alpha, + float beta) { + + // 1. 检查数据类型一致性 + int dtype = out_desc->dtype(); + if (in_desc->dtype() != dtype || + batch1_desc->dtype() != dtype || + batch2_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // 2. 
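+        // Example (assumed shapes): batch1 (4, 2, 3) and batch2 (4, 3, 5)
+        // give b = 4, n = 2, m = 3, p = 5, so input/output must be (2, 5).
+
+        // 2. 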
检查维度数量 (ndim) + if (batch1_desc->ndim() != 3 || batch2_desc->ndim() != 3) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (in_desc->ndim() != 2 || out_desc->ndim() != 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + const auto& b1_shape = batch1_desc->shape(); + const auto& b2_shape = batch2_desc->shape(); + const auto& in_shape = in_desc->shape(); + const auto& out_shape = out_desc->shape(); + + // 3. 解析并校验维度 + size_t b = b1_shape[0]; + size_t n = b1_shape[1]; + size_t m = b1_shape[2]; + + if (b2_shape[0] != b || b2_shape[1] != m) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + size_t p = b2_shape[2]; + + if (out_shape[0] != n || out_shape[1] != p) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (in_shape[0] != n || in_shape[1] != p) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 4. 【新增】提取 Strides + auto out_strides = out_desc->strides(); + auto in_strides = in_desc->strides(); + auto b1_strides = batch1_desc->strides(); + auto b2_strides = batch2_desc->strides(); + + // 5. 返回 Info 对象 + AddbmmInfo info; + info._b = b; info._n = n; info._m = m; info._p = p; + info._out_strides = out_strides; + info._in_strides = in_strides; + info._b1_strides = b1_strides; + info._b2_strides = b2_strides; + info._alpha = alpha; + info._beta = beta; + info._dtype = dtype; + + return utils::Result(info); + } +}; + +} // namespace op::addbmm + +#endif // __ADDBMM_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/metax/addbmm_metax.h b/src/infiniop/ops/addbmm/metax/addbmm_metax.h new file mode 100644 index 000000000..45cdfe64a --- /dev/null +++ b/src/infiniop/ops/addbmm/metax/addbmm_metax.h @@ -0,0 +1,8 @@ +#ifndef __ADDBMM_METAX_H__ +#define __ADDBMM_METAX_H__ + +#include "../addbmm.h" + +DESCRIPTOR(metax) + +#endif // __ADDBMM_METAX_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/metax/addbmm_metax.maca b/src/infiniop/ops/addbmm/metax/addbmm_metax.maca new file mode 100644 index 000000000..a8cf58403 --- /dev/null +++ b/src/infiniop/ops/addbmm/metax/addbmm_metax.maca @@ -0,0 +1,383 @@ +#include "addbmm_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include +#include +#include +#include +#include +#include + +namespace op::addbmm::metax { + +// ================================================================== +// 辅助 Kernel +// ================================================================== +template +__global__ void copy_tensor_kernel( + T* dst, + const T* src, + int batch, int rows, int cols, + int64_t stride_b, int64_t stride_h, int64_t stride_w) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total = batch * rows * cols; + if (idx >= total) return; + + int c = idx % cols; + int r = (idx / cols) % rows; + int b = idx / (rows * cols); + + int64_t src_offset = b * stride_b + r * stride_h + c * stride_w; + dst[idx] = src[src_offset]; +} + +template +__global__ void copy_back_kernel( + T* dst, + const T* src, + int rows, int cols, + int64_t stride_h, int64_t stride_w) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total = rows * cols; + if (idx >= total) return; + + int c = idx % cols; + int r = idx / cols; + + int64_t dst_offset = r * stride_h + c * stride_w; + dst[dst_offset] = src[idx]; +} + + +bool needs_copy(const std::vector& strides, size_t rows, size_t cols) { + if (strides.empty()) return false; + + int64_t stride_row = strides[strides.size() - 2]; + int64_t stride_col = strides[strides.size() - 1]; + + if (stride_col != 1) return true; + if 
(stride_row != static_cast(cols)) return true; + return false; +} + + +std::pair get_rc_strides(const std::vector& s) { + if (s.size() < 2) return {0, 1}; // Should not happen + return {s[s.size() - 2], s[s.size() - 1]}; +} + +// ================================================================== +// Descriptor 实现 +// ================================================================== + +struct Descriptor::Opaque { + std::shared_ptr internal; + bool copy_b1 = false; + bool copy_b2 = false; + bool copy_out = false; + size_t size_b1 = 0; + size_t size_b2 = 0; + size_t size_out = 0; + size_t offset_b1 = 0; + size_t offset_b2 = 0; + size_t offset_out = 0; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float alpha, + float beta) { + + auto handle = reinterpret_cast(handle_); + + if (input_desc_vec.size() != 3) return INFINI_STATUS_BAD_PARAM; + + auto in_desc = input_desc_vec[0]; + auto b1_desc = input_desc_vec[1]; + auto b2_desc = input_desc_vec[2]; + + auto dtype = out_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AddbmmInfo::create(out_desc, in_desc, b1_desc, b2_desc, alpha, beta); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque = new Opaque{handle->internal()}; + + size_t dtype_size = (dtype == INFINI_DTYPE_F32) ? 4 : 2; + size_t total_workspace = 0; + + // B1: (b, n, m) + if (needs_copy(info.b1_strides(), info.n(), info.m())) { + opaque->copy_b1 = true; + opaque->size_b1 = info.b() * info.n() * info.m() * dtype_size; + opaque->offset_b1 = total_workspace; + total_workspace += opaque->size_b1; + total_workspace = (total_workspace + 255) / 256 * 256; + } + + // B2: (b, m, p) + if (needs_copy(info.b2_strides(), info.m(), info.p())) { + opaque->copy_b2 = true; + opaque->size_b2 = info.b() * info.m() * info.p() * dtype_size; + opaque->offset_b2 = total_workspace; + total_workspace += opaque->size_b2; + total_workspace = (total_workspace + 255) / 256 * 256; + } + + // Out: (n, p) + if (needs_copy(info.out_strides(), info.n(), info.p())) { + opaque->copy_out = true; + opaque->size_out = info.n() * info.p() * dtype_size; + opaque->offset_out = total_workspace; + total_workspace += opaque->size_out; + } + + *desc_ptr = new Descriptor(opaque, info, total_workspace, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (inputs.size() != 3) return INFINI_STATUS_BAD_PARAM; + + const void *input_ptr = inputs[0]; + const void *b1_ptr = inputs[1]; + const void *b2_ptr = inputs[2]; + auto hc_stream = (hcStream_t)stream; + + decltype(MACA_R_32F) a_type, b_type, c_type; + mcblasComputeType_t compute_type; + + switch (_info.dtype()) { + case INFINI_DTYPE_F16: + a_type = b_type = c_type = MACA_R_16F; + compute_type = MCBLAS_COMPUTE_32F; + break; + case INFINI_DTYPE_BF16: + a_type = b_type = c_type = MACA_R_16BF; + compute_type = MCBLAS_COMPUTE_32F; + break; + case INFINI_DTYPE_F32: + a_type = b_type = c_type = MACA_R_32F; + compute_type = MCBLAS_COMPUTE_32F; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + char* ws_base = (char*)workspace; + + // --- 准备 B1 --- + const void* b1_active = b1_ptr; + + long long strideb_blas = _info.b1_strides()[0]; + int ldb_blas = 
static_cast(_info.b1_strides()[1]); + + if (_opaque->copy_b1) { + void* b1_ws = ws_base + _opaque->offset_b1; + size_t total = _info.b() * _info.n() * _info.m(); + size_t block = 256; + size_t grid = (total + block - 1) / block; + + // 提取 stride + auto& s = _info.b1_strides(); + int64_t sb = s[0], sh = s[1], sw = s[2]; + + if (_info.dtype() == INFINI_DTYPE_F32) { + copy_tensor_kernel<<>>( + (float*)b1_ws, (const float*)b1_ptr, + _info.b(), _info.n(), _info.m(), sb, sh, sw); + } else if (_info.dtype() == INFINI_DTYPE_F16) { + copy_tensor_kernel<<>>( + (half*)b1_ws, (const half*)b1_ptr, + _info.b(), _info.n(), _info.m(), sb, sh, sw); + } else { + copy_tensor_kernel<__maca_bfloat16><<>>( + (__maca_bfloat16*)b1_ws, (const __maca_bfloat16*)b1_ptr, + _info.b(), _info.n(), _info.m(), sb, sh, sw); + } + b1_active = b1_ws; + strideb_blas = _info.n() * _info.m(); + ldb_blas = _info.m(); + } + + // --- 准备 B2 --- + const void* b2_active = b2_ptr; + long long stridea_blas = _info.b2_strides()[0]; + int lda_blas = static_cast(_info.b2_strides()[1]); + + if (_opaque->copy_b2) { + void* b2_ws = ws_base + _opaque->offset_b2; + size_t total = _info.b() * _info.m() * _info.p(); + size_t block = 256; + size_t grid = (total + block - 1) / block; + + auto& s = _info.b2_strides(); + int64_t sb = s[0], sh = s[1], sw = s[2]; + + if (_info.dtype() == INFINI_DTYPE_F32) { + copy_tensor_kernel<<>>( + (float*)b2_ws, (const float*)b2_ptr, + _info.b(), _info.m(), _info.p(), sb, sh, sw); + } else if (_info.dtype() == INFINI_DTYPE_F16) { + copy_tensor_kernel<<>>( + (half*)b2_ws, (const half*)b2_ptr, + _info.b(), _info.m(), _info.p(), sb, sh, sw); + } else { + copy_tensor_kernel<__maca_bfloat16><<>>( + (__maca_bfloat16*)b2_ws, (const __maca_bfloat16*)b2_ptr, + _info.b(), _info.m(), _info.p(), sb, sh, sw); + } + b2_active = b2_ws; + stridea_blas = _info.m() * _info.p(); + lda_blas = _info.p(); + } + + // --- 准备 Output / Input(Beta) --- + void* out_active = output; + + + auto out_rc = get_rc_strides(_info.out_strides()); + int ldc_blas = static_cast(out_rc.first); // Row Stride + + if (_opaque->copy_out) { + out_active = ws_base + _opaque->offset_out; + ldc_blas = _info.p(); + + size_t total = _info.n() * _info.p(); + size_t block = 256; + size_t grid = (total + block - 1) / block; + + auto in_rc = get_rc_strides(_info.in_strides()); + + if (_info.dtype() == INFINI_DTYPE_F32) { + copy_tensor_kernel<<>>( + (float*)out_active, (const float*)input_ptr, + 1, _info.n(), _info.p(), + 0, in_rc.first, in_rc.second); + } else if (_info.dtype() == INFINI_DTYPE_F16) { + copy_tensor_kernel<<>>( + (half*)out_active, (const half*)input_ptr, + 1, _info.n(), _info.p(), + 0, in_rc.first, in_rc.second); + } else { + copy_tensor_kernel<__maca_bfloat16><<>>( + (__maca_bfloat16*)out_active, (const __maca_bfloat16*)input_ptr, + 1, _info.n(), _info.p(), + 0, in_rc.first, in_rc.second); + } + } else { + if (output != input_ptr) { + size_t total = _info.n() * _info.p(); + size_t block = 256; + size_t grid = (total + block - 1) / block; + + // [修复] 强制使用 kernel copy,并安全获取 stride + auto in_rc = get_rc_strides(_info.in_strides()); + + if (_info.dtype() == INFINI_DTYPE_F32) { + copy_tensor_kernel<<>>( + (float*)output, (const float*)input_ptr, + 1, _info.n(), _info.p(), + 0, in_rc.first, in_rc.second); + } else if (_info.dtype() == INFINI_DTYPE_F16) { + copy_tensor_kernel<<>>( + (half*)output, (const half*)input_ptr, + 1, _info.n(), _info.p(), + 0, in_rc.first, in_rc.second); + } else { + copy_tensor_kernel<__maca_bfloat16><<>>( + 
(__maca_bfloat16*)output, (const __maca_bfloat16*)input_ptr, + 1, _info.n(), _info.p(), + 0, in_rc.first, in_rc.second); + } + } + } + + // --- 执行 BLAS 计算 --- + int m_blas = static_cast(_info.p()); + int n_blas = static_cast(_info.n()); + int k_blas = static_cast(_info.m()); + float alpha = _info.alpha(); + float beta_user = _info.beta(); + size_t batch_size = _info.b(); + int data_width = (_info.dtype() == INFINI_DTYPE_F32 ? 4 : 2); + + auto status = _opaque->internal->useMcblas( + (hcStream_t)stream, + [&](auto raw_handle) { + mcblasHandle_t handle = reinterpret_cast(raw_handle); + + for (size_t i = 0; i < batch_size; ++i) { + float current_beta = (i == 0) ? beta_user : 1.0f; + + const void* curr_b2 = static_cast(b2_active) + i * stridea_blas * data_width; + const void* curr_b1 = static_cast(b1_active) + i * strideb_blas * data_width; + + mcblasStatus_t s = mcblasGemmEx( + handle, + MCBLAS_OP_N, MCBLAS_OP_N, + m_blas, n_blas, k_blas, + &alpha, + curr_b2, a_type, lda_blas, + curr_b1, b_type, ldb_blas, + ¤t_beta, + out_active, c_type, ldc_blas, + compute_type, + MCBLAS_GEMM_DEFAULT + ); + + if (s != MCBLAS_STATUS_SUCCESS) return INFINI_STATUS_INTERNAL_ERROR; + } + return INFINI_STATUS_SUCCESS; + }); + + if (status != INFINI_STATUS_SUCCESS) return status; + + // --- 拷回结果 --- + if (_opaque->copy_out) { + size_t total = _info.n() * _info.p(); + size_t block = 256; + size_t grid = (total + block - 1) / block; + + // [修复] 安全获取 output stride + auto out_rc = get_rc_strides(_info.out_strides()); + + if (_info.dtype() == INFINI_DTYPE_F32) { + copy_back_kernel<<>>( + (float*)output, (const float*)out_active, + _info.n(), _info.p(), + out_rc.first, out_rc.second); + } else if (_info.dtype() == INFINI_DTYPE_F16) { + copy_back_kernel<<>>( + (half*)output, (const half*)out_active, + _info.n(), _info.p(), + out_rc.first, out_rc.second); + } else { + copy_back_kernel<__maca_bfloat16><<>>( + (__maca_bfloat16*)output, (const __maca_bfloat16*)out_active, + _info.n(), _info.p(), + out_rc.first, out_rc.second); + } + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addbmm::metax \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/moore/addbmm_moore.h b/src/infiniop/ops/addbmm/moore/addbmm_moore.h new file mode 100644 index 000000000..3c37093bb --- /dev/null +++ b/src/infiniop/ops/addbmm/moore/addbmm_moore.h @@ -0,0 +1,9 @@ +#ifndef __ADDBMM_MOORE_API_H__ +#define __ADDBMM_MOORE_API_H__ + +#include "../addbmm.h" + + +DESCRIPTOR(moore) + +#endif // __ADDBMM_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/moore/addbmm_moore.mu b/src/infiniop/ops/addbmm/moore/addbmm_moore.mu new file mode 100644 index 000000000..74c751232 --- /dev/null +++ b/src/infiniop/ops/addbmm/moore/addbmm_moore.mu @@ -0,0 +1,240 @@ +#include "addbmm_moore.h" +#include "addbmm_moore_kernel.h" + +#include +#include +#include +#include "../../../devices/moore/moore_handle.h" +#include + +namespace op::addbmm::moore { + +// ================================================================== +// 1. 
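+// Note: the kernel below assigns one thread per output element (n, p) and
+// accumulates over all B batches and the inner dimension M in a register,
+// i.e. B * M multiply-adds per output element.
+// 1. 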
Kernel Implementation (Removed Macros) +// ================================================================== + +template +__global__ void addbmm_kernel( + const size_t B, const size_t N, const size_t M, const size_t P, + const float alpha, const float beta, + T *output, + const T *input, + const T *batch1, + const T *batch2, + const int64_t out_s0, const int64_t out_s1, + const int64_t in_s0, const int64_t in_s1, + const int64_t b1_s0, const int64_t b1_s1, const int64_t b1_s2, + const int64_t b2_s0, const int64_t b2_s1, const int64_t b2_s2) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total_elements = N * P; + + if (idx < total_elements) { + size_t n = idx / P; + size_t p = idx % P; + + float matmul_sum = 0.0f; + + // 预先计算与 b 无关的偏移量部分,略微优化性能 + int64_t b1_n_offset = n * b1_s1; + int64_t b2_p_offset = p * b2_s2; + + for (size_t b = 0; b < B; ++b) { + int64_t b1_b_offset = b * b1_s0; + int64_t b2_b_offset = b * b2_s0; + + for (size_t m = 0; m < M; ++m) { + // 直接计算偏移:Batch1[b, n, m] + int64_t offset1 = b1_b_offset + b1_n_offset + m * b1_s2; + // 直接计算偏移:Batch2[b, m, p] + int64_t offset2 = b2_b_offset + m * b2_s1 + b2_p_offset; + + T val1 = batch1[offset1]; + T val2 = batch2[offset2]; + + float v1_f, v2_f; + if constexpr (std::is_same_v) { + v1_f = __half2float(val1); + v2_f = __half2float(val2); + } else if constexpr (std::is_same_v) { + v1_f = __bfloat162float(val1); + v2_f = __bfloat162float(val2); + } else { + v1_f = static_cast(val1); + v2_f = static_cast(val2); + } + matmul_sum += v1_f * v2_f; + } + } + + // 直接计算偏移:Input[n, p] + int64_t in_offset = n * in_s0 + p * in_s1; + T in_val = input[in_offset]; + + float in_val_f; + if constexpr (std::is_same_v) { + in_val_f = __half2float(in_val); + } else if constexpr (std::is_same_v) { + in_val_f = __bfloat162float(in_val); + } else { + in_val_f = static_cast(in_val); + } + + float result = beta * in_val_f + alpha * matmul_sum; + + // 直接计算偏移:Output[n, p] + int64_t out_offset = n * out_s0 + p * out_s1; + + if constexpr (std::is_same_v) { + output[out_offset] = __float2half(result); + } else if constexpr (std::is_same_v) { + output[out_offset] = __float2bfloat16(result); + } else { + output[out_offset] = static_cast(result); + } + } +} + +// ================================================================== +// 2. Launcher Implementation +// ================================================================== + +template +void addbmm_moore_launch( + const AddbmmInfo &info, + T *output, + const T *input, + const T *batch1, + const T *batch2, + void *stream) { + + size_t total_elements = info.n() * info.p(); + int threads = 256; + int blocks = (total_elements + threads - 1) / threads; + + const auto& out_strides = info.out_strides(); + const auto& in_strides = info.in_strides(); + const auto& b1_strides = info.b1_strides(); + const auto& b2_strides = info.b2_strides(); + + addbmm_kernel<<>>( + info.b(), info.n(), info.m(), info.p(), + info.alpha(), info.beta(), + output, input, batch1, batch2, + out_strides[0], out_strides[1], + in_strides[0], in_strides[1], + b1_strides[0], b1_strides[1], b1_strides[2], + b2_strides[0], b2_strides[1], b2_strides[2] + ); +} + +// ================================================================== +// 3. 
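+ // Cost note: the launcher issues one thread per output element, and each thread serially
+ // accumulates B * M products, i.e. O(N * P * B * M) work with no data reuse. This favors
+ // portability; the nvidia backend's tiled kernel and the metax mcblas path trade that
+ // simplicity for throughput.
+// 3. 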
Descriptor Implementation +// ================================================================== + +Descriptor::~Descriptor() = default; + +// 匹配 std::vector 接口 +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float alpha, + float beta) { + + if (input_desc_vec.size() != 3) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + infiniopTensorDescriptor_t in_desc = input_desc_vec[0]; + infiniopTensorDescriptor_t batch1_desc = input_desc_vec[1]; + infiniopTensorDescriptor_t batch2_desc = input_desc_vec[2]; + + auto handle = reinterpret_cast(handle_); + auto info_result = AddbmmInfo::create(out_desc, in_desc, batch1_desc, batch2_desc, alpha, beta); + + if (!info_result) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor( + nullptr, + *info_result, + 0, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +// 匹配 std::vector 接口 +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (inputs.size() != 3) { + return INFINI_STATUS_BAD_PARAM; + } + + const void *input = inputs[0]; + const void *batch1 = inputs[1]; + const void *batch2 = inputs[2]; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype()) { + case INFINI_DTYPE_F16: + addbmm_moore_launch( + _info, + static_cast(output), + static_cast(input), + static_cast(batch1), + static_cast(batch2), + stream); + break; + + case INFINI_DTYPE_BF16: + addbmm_moore_launch<__mt_bfloat16>( + _info, + static_cast<__mt_bfloat16 *>(output), + static_cast(input), + static_cast(batch1), + static_cast(batch2), + stream); + break; + + case INFINI_DTYPE_F32: + addbmm_moore_launch( + _info, + static_cast(output), + static_cast(input), + static_cast(batch1), + static_cast(batch2), + stream); + break; + + case INFINI_DTYPE_F64: + addbmm_moore_launch( + _info, + static_cast(output), + static_cast(input), + static_cast(batch1), + static_cast(batch2), + stream); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addbmm::moore \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/moore/addbmm_moore_kernel.h b/src/infiniop/ops/addbmm/moore/addbmm_moore_kernel.h new file mode 100644 index 000000000..859fd1398 --- /dev/null +++ b/src/infiniop/ops/addbmm/moore/addbmm_moore_kernel.h @@ -0,0 +1,121 @@ +#ifndef __ADDBMM_MOORE_KERNEL_H__ +#define __ADDBMM_MOORE_KERNEL_H__ + +#include +#include +#include + +#include // 用于 std::is_same_v + +namespace op::addbmm::moore { + + +typedef struct AddbmmOp { +public: + template + __device__ __forceinline__ void operator()( + // 当前线程处理的输出坐标 (n, p) + const int n, + const int p, + + // 维度信息 + const int B, // Batch size + const int M, // 中间维度 + + // 标量系数 + const float alpha, + const float beta, + + // 数据指针 (Base Pointers) + const T* input, + const T* batch1, + const T* batch2, + T* output, + + // Strides (解包传递) + // input/output: (n, p) + const int64_t in_s0, const int64_t in_s1, + const int64_t out_s0, const int64_t out_s1, + // batch1: (b, n, m) + const int64_t b1_s0, const int64_t b1_s1, const int64_t b1_s2, + // batch2: (b, m, p) + const int64_t b2_s0, const int64_t b2_s1, const int64_t b2_s2 + ) const { + + + float matmul_sum = 0.0f; + + + int64_t b1_n_offset = n * b1_s1; + // Batch2 的 p 维度偏移 + int64_t b2_p_offset = p * b2_s2; + + 
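// Per output element (n, p) this functor evaluates
+        //   out[n, p] = beta * in[n, p] + alpha * sum_{b<B} sum_{m<M} b1[b, n, m] * b2[b, m, p]
+        // with every product accumulated in float regardless of T, limiting fp16/bf16 rounding error.
+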
// 遍历 Batch 维度 + for (int b = 0; b < B; ++b) { + + // 预计算当前 Batch 的偏移 + int64_t b1_b_offset = b * b1_s0; + int64_t b2_b_offset = b * b2_s0; + + // 遍历中间维度 M (矩阵乘法) + for (int m = 0; m < M; ++m) { + // 计算实际内存偏移 + // Batch1[b, n, m] -> ptr + b*s0 + n*s1 + m*s2 + int64_t offset1 = b1_b_offset + b1_n_offset + m * b1_s2; + // Batch2[b, m, p] -> ptr + b*s0 + m*s1 + p*s2 + int64_t offset2 = b2_b_offset + m * b2_s1 + b2_p_offset; + + T val1_t = batch1[offset1]; + T val2_t = batch2[offset2]; + + float val1_f, val2_f; + + // 类型转换:T -> float + if constexpr (std::is_same_v) { + val1_f = __half2float(val1_t); + val2_f = __half2float(val2_t); + } else if constexpr (std::is_same_v) { + val1_f = __bfloat162float(val1_t); + val2_f = __bfloat162float(val2_t); + } else { + val1_f = static_cast(val1_t); + val2_f = static_cast(val2_t); + } + + matmul_sum += val1_f * val2_f; + } + } + + + int64_t in_offset = n * in_s0 + p * in_s1; + T in_val_t = input[in_offset]; + float in_val_f; + + if constexpr (std::is_same_v) { + in_val_f = __half2float(in_val_t); + } else if constexpr (std::is_same_v) { + in_val_f = __bfloat162float(in_val_t); + } else { + in_val_f = static_cast(in_val_t); + } + + + float result_f = beta * in_val_f + alpha * matmul_sum; + + // 4. 写回 Output[n, p] + int64_t out_offset = n * out_s0 + p * out_s1; + + if constexpr (std::is_same_v) { + output[out_offset] = __float2half(result_f); + } else if constexpr (std::is_same_v) { + output[out_offset] = __float2bfloat16(result_f); + } else { + output[out_offset] = static_cast(result_f); + } + } + +} AddbmmOp; + +} // namespace op::addbmm::moore + +#endif // __ADDBMM_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/nvidia/addbmm_nvidia.cu b/src/infiniop/ops/addbmm/nvidia/addbmm_nvidia.cu new file mode 100644 index 000000000..918cb9650 --- /dev/null +++ b/src/infiniop/ops/addbmm/nvidia/addbmm_nvidia.cu @@ -0,0 +1,163 @@ +#include "infinicore/ops/addbmm.hpp" // Descriptor 声明 +#include "addbmm_nvidia.cuh" // Tiled Kernel Launcher 定义 +#include "../cuda/kernel.cuh" // Descriptor 基类定义 (关键!) +#include "../../../handle.h" // Handle 定义 +#include + +// ================================================================== +// 匿名命名空间:辅助函数 Wrapper +// ================================================================== +namespace { + +// 引用 Info 类 +using AddbmmInfo = ::op::addbmm::AddbmmInfo; + +// 泛型 Wrapper:负责从 Info 提取参数并调用底层 Launcher +template +void launch_kernel_wrapper( + void *output, + const void *input, + const void *batch1, + const void *batch2, + const AddbmmInfo &info, // 接收 Info 对象 + void *stream) { + + // 1. 提取维度 + size_t b = info.b(); + size_t n = info.n(); + size_t m = info.m(); + size_t p = info.p(); + float alpha = info.alpha(); + float beta = info.beta(); + + // 2. 提取 Strides + const auto& os = info.out_strides(); + const auto& is = info.in_strides(); + const auto& b1s = info.b1_strides(); + const auto& b2s = info.b2_strides(); + + // 3. 
Call the optimized launcher defined in the .cuh
+    // [Key fix] Do not launch addbmm_kernel<<<...>>> here directly;
+    // go through op::addbmm::nvidia::launch_kernel instead
+    ::op::addbmm::nvidia::launch_kernel<T>(
+        output, input, batch1, batch2,
+        b, n, m, p,
+        alpha, beta,
+        // Cast explicitly to ptrdiff_t to match the .cuh signature
+        static_cast<ptrdiff_t>(os[0]), static_cast<ptrdiff_t>(os[1]),
+        static_cast<ptrdiff_t>(is[0]), static_cast<ptrdiff_t>(is[1]),
+        static_cast<ptrdiff_t>(b1s[0]), static_cast<ptrdiff_t>(b1s[1]), static_cast<ptrdiff_t>(b1s[2]),
+        static_cast<ptrdiff_t>(b2s[0]), static_cast<ptrdiff_t>(b2s[1]), static_cast<ptrdiff_t>(b2s[2]),
+        stream
+    );
+}
+
+} // anonymous namespace
+
+// ==================================================================
+// Descriptor member function implementations
+// ==================================================================
+namespace op::addbmm::nvidia {
+
+// Opaque struct definition
+struct Descriptor::Opaque {};
+
+// Destructor
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+    }
+}
+
+// Create implementation
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec, // accepts a vector
+    float alpha,
+    float beta) {
+
+    // 1. Validate arguments
+    if (input_desc_vec.size() != 3) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    // 2. Let Info::create parse and validate the parameters
+    auto info_result = ::op::addbmm::AddbmmInfo::create(
+        out_desc,
+        input_desc_vec[0], // input
+        input_desc_vec[1], // batch1
+        input_desc_vec[2], // batch2
+        alpha,
+        beta
+    );
+
+    if (!info_result) {
+        return info_result.status();
+    }
+
+    // 3. Instantiate the Descriptor
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        0, // the tiled kernel needs no workspace
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Calculate implementation
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // 1. Validate arguments
+    if (inputs.size() != 3) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    const void *input_ptr = inputs[0];
+    const void *batch1_ptr = inputs[1];
+    const void *batch2_ptr = inputs[2];
+
+    // 2. Extract parameters
+    auto dtype = _info.dtype();
+
+    // 3. 
分发 Kernel + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel_wrapper( + output, input_ptr, batch1_ptr, batch2_ptr, _info, stream); + break; + + case INFINI_DTYPE_BF16: + // Host 端无需检查 __CUDA_ARCH__ + launch_kernel_wrapper( + output, input_ptr, batch1_ptr, batch2_ptr, _info, stream); + break; + + case INFINI_DTYPE_F32: + launch_kernel_wrapper( + output, input_ptr, batch1_ptr, batch2_ptr, _info, stream); + break; + + case INFINI_DTYPE_F64: + // 假设 double 也使用 Tiled Kernel (如果 .cuh 支持) + launch_kernel_wrapper( + output, input_ptr, batch1_ptr, batch2_ptr, _info, stream); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addbmm::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/nvidia/addbmm_nvidia.cuh b/src/infiniop/ops/addbmm/nvidia/addbmm_nvidia.cuh new file mode 100644 index 000000000..34a774f07 --- /dev/null +++ b/src/infiniop/ops/addbmm/nvidia/addbmm_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ADDBMM_CUH__ +#define __ADDBMM_CUH__ + +#include "../addbmm.h" + +DESCRIPTOR(nvidia) + +#endif // __GEMM_CUDA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/addbmm/operator.cc b/src/infiniop/ops/addbmm/operator.cc new file mode 100644 index 000000000..3a6159977 --- /dev/null +++ b/src/infiniop/ops/addbmm/operator.cc @@ -0,0 +1,208 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/addbmm.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/addbmm_cpu.h" +#endif + +// Nvidia, 天数智芯(Iluvatar), 昆仑芯(QY) 通常共享 CUDA 实现接口 +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/addbmm_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/addbmm_metax.h" +#endif + +// [Moore Threads Support] +// 添加摩尔线程头文件 +#ifdef ENABLE_MOORE_API +#include "moore/addbmm_moore.h" +#endif + +// ======================================================================= +// [修复] 定义结构体 +// ======================================================================= +struct infiniopAddbmmDescriptor { + int device_type; +}; + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateAddbmmDescriptor( + infiniopHandle_t handle, + infiniopAddbmmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t batch1, + infiniopTensorDescriptor_t batch2, + float alpha, + float beta) { + + // 宏:根据不同后端调用对应的 C++ create 方法 + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::addbmm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + {input, batch1, batch2}, \ + alpha, \ + beta) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + + // [Moore Threads Support] + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 
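+ // Typical end-to-end use of this C API (an illustrative sketch: the handle, tensor
+ // descriptors, device buffers and stream are assumed to exist, and error checking is omitted):
+ //   infiniopAddbmmDescriptor_t desc;
+ //   infiniopCreateAddbmmDescriptor(handle, &desc, out_desc, in_desc, b1_desc, b2_desc,
+ //                                  /*alpha=*/1.0f, /*beta=*/1.0f);
+ //   size_t ws_size = 0;
+ //   infiniopGetAddbmmWorkspaceSize(desc, &ws_size);
+ //   void *ws = /* allocate ws_size bytes on the device */;
+ //   infiniopAddbmm(desc, ws, ws_size, out_ptr, in_ptr, b1_ptr, b2_ptr, stream);
+ //   infiniopDestroyAddbmmDescriptor(desc);
+// 2. 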
获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetAddbmmWorkspaceSize(infiniopAddbmmDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + + // [Moore Threads Support] + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopAddbmm( + infiniopAddbmmDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *batch1, + const void *batch2, + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input, batch1, batch2}, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + + // [Moore Threads Support] + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + +// ======================================================================= +// 4. 
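+ // For reference, with ENABLE_CPU_API defined the CALCULATE macro above expands (modulo the
+ // exact cast target, which follows the per-backend Descriptor type) to roughly:
+ //   case INFINI_DEVICE_CPU:
+ //       return reinterpret_cast<op::addbmm::cpu::Descriptor *>(desc)
+ //           ->calculate(workspace, workspace_size, output, {input, batch1, batch2}, stream);
+// 4. 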
销毁描述符 +// ======================================================================= +__C infiniStatus_t infiniopDestroyAddbmmDescriptor(infiniopAddbmmDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + + // [Moore Threads Support] + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE,moore); + #endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/affine_grid.h b/src/infiniop/ops/affine_grid/affine_grid.h new file mode 100644 index 000000000..06d1b6809 --- /dev/null +++ b/src/infiniop/ops/affine_grid/affine_grid.h @@ -0,0 +1,49 @@ +#ifndef AFFINE_GRID_H +#define AFFINE_GRID_H + +#include "../../operator.h" +#include "info.h" + +// 宏定义:用于生成不同命名空间下的 Descriptor 类 +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::affine_grid::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AffineGridInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + AffineGridInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc, \ + bool align_corners); /* 增加 align_corners 参数 */ \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // AFFINE_GRID_H \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/cpu/affine_grid_cpu.cc b/src/infiniop/ops/affine_grid/cpu/affine_grid_cpu.cc new file mode 100644 index 000000000..659661438 --- /dev/null +++ b/src/infiniop/ops/affine_grid/cpu/affine_grid_cpu.cc @@ -0,0 +1,147 @@ +#include "affine_grid_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include + +namespace op::affine_grid::cpu { + + +template +inline float to_float(T val) { + if constexpr (std::is_same::value || std::is_same::value) { + return utils::cast(val); + } else { + return static_cast(val); + } +} + +template +inline T from_float(float val) { + if constexpr (std::is_same::value || std::is_same::value) { + return utils::cast(val); + } else { + return static_cast(val); + } +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + bool align_corners) { // 接收 align_corners + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 1. 检查数据类型 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 2. 
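+ // Worked example for the coordinate formulas used below, W = 4:
+ //   align_corners = false: x_norm in {-0.75, -0.25, 0.25, 0.75}   (pixel centers)
+ //   align_corners = true:  x_norm in {-1, -1/3, 1/3, 1}           (pixel corners)
+
+    // 2. 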
Create the Info object (forwarding align_corners)
+    auto result = AffineGridInfo::create(out_desc, in_desc, align_corners);
+    CHECK_RESULT(result);
+
+    // 3. Create the Descriptor
+    *desc_ptr = new Descriptor(
+        nullptr,        // Opaque*
+        result.take(),  // Info
+        0,              // workspace size (the CPU AffineGrid path needs no extra workspace)
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename Tdata>
+void calculate_cpu_impl(
+    const AffineGridInfo &info,
+    void *output,
+    const void *input) {
+
+    size_t batch = info.batch();
+    size_t H = info.height();
+    size_t W = info.width();
+    bool align_corners = info.align_corners();
+
+    auto out_ptr = reinterpret_cast<Tdata *>(output);
+    auto in_ptr = reinterpret_cast<const Tdata *>(input);
+
+    // Parallelize over the batch dimension
+#pragma omp parallel for
+    for (size_t n = 0; n < batch; ++n) {
+
+        const Tdata *theta_n = in_ptr + n * 6;
+
+        // Load the affine matrix entries and widen to float
+        float r00 = to_float(theta_n[0]);
+        float r01 = to_float(theta_n[1]);
+        float tx  = to_float(theta_n[2]);
+        float r10 = to_float(theta_n[3]);
+        float r11 = to_float(theta_n[4]);
+        float ty  = to_float(theta_n[5]);
+
+        // Walk the spatial dimensions
+        for (size_t h = 0; h < H; ++h) {
+            for (size_t w = 0; w < W; ++w) {
+                // 1. Compute normalized coordinates in [-1, 1]
+                float x_norm, y_norm;
+
+                if (align_corners) {
+                    x_norm = (W > 1) ? (2.0f * w) / (W - 1.0f) - 1.0f : 0.0f;
+                    y_norm = (H > 1) ? (2.0f * h) / (H - 1.0f) - 1.0f : 0.0f;
+                } else {
+                    x_norm = (2.0f * w + 1.0f) / W - 1.0f;
+                    y_norm = (2.0f * h + 1.0f) / H - 1.0f;
+                }
+
+                // 2. Apply the affine transform
+                float grid_x = r00 * x_norm + r01 * y_norm + tx;
+                float grid_y = r10 * x_norm + r11 * y_norm + ty;
+
+                // 3. Write Output[n, h, w, :]; the (N, H, W, 2) grid is contiguous here
+                size_t offset = (n * H * W + h * W + w) * 2;
+
+                out_ptr[offset + 0] = from_float<Tdata>(grid_x);
+                out_ptr[offset + 1] = from_float<Tdata>(grid_y);
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    // Fetch the dtype from the Info
+    auto dtype = _info.dtype();
+
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input);
+        return INFINI_STATUS_SUCCESS;
+
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input);
+        return INFINI_STATUS_SUCCESS;
+
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input);
+        return INFINI_STATUS_SUCCESS;
+
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input);
+        return INFINI_STATUS_SUCCESS;
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::affine_grid::cpu \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/cpu/affine_grid_cpu.h b/src/infiniop/ops/affine_grid/cpu/affine_grid_cpu.h new file mode 100644 index 000000000..8d954eba8 --- /dev/null +++ b/src/infiniop/ops/affine_grid/cpu/affine_grid_cpu.h @@ -0,0 +1,8 @@ +#ifndef __AFFINE_GRID_CPU_H__
+#define __AFFINE_GRID_CPU_H__
+
+#include "../affine_grid.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __AFFINE_GRID_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/cuda/kernel.cuh b/src/infiniop/ops/affine_grid/cuda/kernel.cuh new file mode 100644 index 000000000..ad390e4f0 --- /dev/null +++ b/src/infiniop/ops/affine_grid/cuda/kernel.cuh @@ -0,0 +1,111 @@ +#ifndef __AFFINE_GRID_CUDA_H__
+#define __AFFINE_GRID_CUDA_H__
+
+#include <cstddef>
+#include <type_traits>
+#if defined(__MACA__) || defined(__MACACC__)
+    #include
+    #include
+    using nv_bfloat162 = __maca_bfloat162;
+    using nv_bfloat16 = __maca_bfloat16;
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+namespace op::affine_grid::cuda {
+
+
+template <typename T>
+__device__ __forceinline__ float to_float_acc(const T &x) {
+    if constexpr (std::is_same_v<T, half>) 
return __half2float(x); + else if constexpr (std::is_same_v) return __bfloat162float(x); + else return static_cast(x); +} + +template +__global__ void affine_grid_kernel( + T * __restrict__ output, // [优化1] 使用 __restrict__ + const T * __restrict__ theta, // [优化1] 使用 __restrict__ + size_t N, + size_t H, + size_t W, + bool align_corners +) { + // 扁平化索引 + size_t total_elements = N * H * W; + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= total_elements) return; + + + float w_scale, h_scale, w_bias, h_bias; + + if (align_corners) { + // align_corners = True: formula is (2*i)/(size-1) - 1 + // => i * (2/(size-1)) - 1 + w_scale = (W > 1) ? 2.0f / (W - 1.0f) : 0.0f; + h_scale = (H > 1) ? 2.0f / (H - 1.0f) : 0.0f; + w_bias = -1.0f; + h_bias = -1.0f; + } else { + // align_corners = False: formula is (2*i + 1)/size - 1 + // => i * (2/size) + (1/size - 1) + w_scale = 2.0f / W; + h_scale = 2.0f / H; + w_bias = 1.0f / W - 1.0f; + h_bias = 1.0f / H - 1.0f; + } + + + size_t w = idx % W; + size_t temp = idx / W; + size_t h = temp % H; + size_t n = temp / H; // 此时 temp = n * H + h + + // 2. 计算归一化坐标 (使用乘法代替除法) + float x_norm = (float)w * w_scale + w_bias; + float y_norm = (float)h * h_scale + h_bias; + + // 如果 align_corners=True 且 size=1,特判修正 + if (align_corners) { + if (W <= 1) x_norm = 0.0f; + if (H <= 1) y_norm = 0.0f; + } + + + const T* theta_ptr = theta + n * 6; + + + float r00 = to_float_acc(theta_ptr[0]); + float r01 = to_float_acc(theta_ptr[1]); + float tx = to_float_acc(theta_ptr[2]); + float r10 = to_float_acc(theta_ptr[3]); + float r11 = to_float_acc(theta_ptr[4]); + float ty = to_float_acc(theta_ptr[5]); + + + float grid_x = r00 * x_norm + r01 * y_norm + tx; + float grid_y = r10 * x_norm + r11 * y_norm + ty; + + // 5. 向量化写入 (Vectorized Store) + if constexpr (std::is_same_v) { + float2* out_vec = reinterpret_cast(output); + out_vec[idx] = make_float2(grid_x, grid_y); + } + else if constexpr (std::is_same_v) { + half2* out_vec = reinterpret_cast(output); + out_vec[idx] = __floats2half2_rn(grid_x, grid_y); + } + else if constexpr (std::is_same_v) { + nv_bfloat162* out_vec = reinterpret_cast(output); + out_vec[idx] = __floats2bfloat162_rn(grid_x, grid_y); + } + else { + output[idx * 2 + 0] = static_cast(grid_x); + output[idx * 2 + 1] = static_cast(grid_y); + } +} + +} // namespace op::affine_grid::cuda + +#endif // __AFFINE_GRID_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/info.h b/src/infiniop/ops/affine_grid/info.h new file mode 100644 index 000000000..e08f0450e --- /dev/null +++ b/src/infiniop/ops/affine_grid/info.h @@ -0,0 +1,78 @@ +#ifndef __AFFINE_GRID_INFO_H__ +#define __AFFINE_GRID_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::affine_grid { + +class AffineGridInfo { + AffineGridInfo() = default; + +public: + size_t _batch; + size_t _height; + size_t _width; + bool _align_corners; + int _dtype; + + size_t batch() const { return _batch; } + size_t height() const { return _height; } + size_t width() const { return _width; } + bool align_corners() const { return _align_corners; } + int dtype() const { return _dtype; } + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + bool align_corners) { + + // 1. 检查数据类型一致性 + if (out_desc->dtype() != in_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // 2. 
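+ // Theta layout per batch entry, row-major (2, 3):
+ //   theta = [[r00, r01, tx],
+ //            [r10, r11, ty]]
+ // e.g. the identity transform [[1, 0, 0], [0, 1, 0]] makes the output grid equal to the
+ // normalized coordinates themselves.
+
+    // 2. 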
检查输入 Theta 的形状 + // 标准 2D Affine Grid 输入必须是 (N, 2, 3) + if (in_desc->ndim() != 3) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (in_desc->shape()[1] != 2 || in_desc->shape()[2] != 3) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 3. 检查输出 Grid 的形状 + // 标准 2D Affine Grid 输出必须是 (N, H, W, 2) + if (out_desc->ndim() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + // 最后一维必须是 2 (代表 x, y 坐标) + if (out_desc->shape()[3] != 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 4. 检查 Batch Size 是否匹配 + if (in_desc->shape()[0] != out_desc->shape()[0]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 5. 提取维度信息 + size_t batch = out_desc->shape()[0]; + size_t height = out_desc->shape()[1]; + size_t width = out_desc->shape()[2]; + int dtype = in_desc->dtype(); + + // 6. 返回 Info 对象 + return utils::Result(AffineGridInfo{ + batch, + height, + width, + align_corners, + dtype}); + } +}; + +} // namespace op::affine_grid + +#endif // __AFFINE_GRID_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/metax/affine_grid_metax.h b/src/infiniop/ops/affine_grid/metax/affine_grid_metax.h new file mode 100644 index 000000000..0aa153d95 --- /dev/null +++ b/src/infiniop/ops/affine_grid/metax/affine_grid_metax.h @@ -0,0 +1,8 @@ +#ifndef __AFFINE_GRID_METAX_H__ +#define __AFFINE_GRID_METAX_H__ + +#include "../affine_grid.h" + +DESCRIPTOR(metax) + +#endif // __AFFINE_GRID_METAX_H__ \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/metax/affine_grid_metax.maca b/src/infiniop/ops/affine_grid/metax/affine_grid_metax.maca new file mode 100644 index 000000000..d8e2aa55f --- /dev/null +++ b/src/infiniop/ops/affine_grid/metax/affine_grid_metax.maca @@ -0,0 +1,179 @@ +#include "affine_grid_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include +#include +#include +#include +#include + + +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" + +namespace op::affine_grid::metax { + +// ================================================================== +// Kernel: Index-Based Double Precision (Maximum Accuracy) +// ================================================================== +__global__ void affine_grid_kernel_f32_double_index( + float * __restrict__ output, + const float * __restrict__ theta, + int N, int H, int W, int align_corners_int, + int in_s_n, int in_s_h, int in_s_w, + int out_s_n, int out_s_h, int out_s_w, int out_s_c +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total_elements = N * H * W; + + if (idx >= total_elements) return; + + // 1. 索引分解 + int w_idx = idx % W; + int temp = idx / W; + int h_idx = temp % H; + int n_idx = temp / H; + + // 2. 坐标生成 (Double Precision + Index Formula) + double x, y; + + if (align_corners_int == 1) { + // align_corners = True + // Formula: (i * 2) / (N - 1) - 1 + if (W > 1) + x = ((double)w_idx * 2.0) / (double)(W - 1) - 1.0; + else + x = 0.0; + + if (H > 1) + y = ((double)h_idx * 2.0) / (double)(H - 1) - 1.0; + else + y = 0.0; + } else { + // align_corners = False + // Formula: (2 * i + 1) / N - 1 + x = ((double)(2 * w_idx + 1)) / (double)W - 1.0; + y = ((double)(2 * h_idx + 1)) / (double)H - 1.0; + } + + // 3. 
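+ // Accuracy note: coordinates and the affine product are evaluated in double on this F32
+ // path, trading throughput for reproducibility; with plain float, 2*i/(N-1) - 1 can round
+ // differently from a reference implementation for large H or W.
+
+    // 3. 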
读取 Theta (Convert to Double) + int theta_base = n_idx * in_s_n; + double r00 = (double)theta[theta_base + 0 * in_s_h + 0 * in_s_w]; + double r01 = (double)theta[theta_base + 0 * in_s_h + 1 * in_s_w]; + double tx = (double)theta[theta_base + 0 * in_s_h + 2 * in_s_w]; + + double r10 = (double)theta[theta_base + 1 * in_s_h + 0 * in_s_w]; + double r11 = (double)theta[theta_base + 1 * in_s_h + 1 * in_s_w]; + double ty = (double)theta[theta_base + 1 * in_s_h + 2 * in_s_w]; + + + double grid_x = r00 * x + r01 * y + tx; + double grid_y = r10 * x + r11 * y + ty; + + // 5. 结果写入 (Cast back to float) + int out_base = n_idx * out_s_n + h_idx * out_s_h + w_idx * out_s_w; + output[out_base + 0 * out_s_c] = (float)grid_x; + output[out_base + 1 * out_s_c] = (float)grid_y; +} + +// ================================================================== +// Launch Helper & Descriptor +// ================================================================== +template +void launch_kernel( + void *output, const void *input, size_t batch, size_t height, size_t width, bool align_corners, + void *stream, const int64_t* in_strides = nullptr, const int64_t* out_strides = nullptr +) { + auto hc_stream = reinterpret_cast(stream); + size_t total_elements = batch * height * width; + size_t block_size = 256; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + if constexpr (std::is_same_v) { + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + int align_corners_i32 = align_corners ? 1 : 0; + + int is_n = 6, is_h = 3, is_w = 1; + if (in_strides) { is_n = (int)in_strides[0]; is_h = (int)in_strides[1]; is_w = (int)in_strides[2]; } + + int os_n = height * width * 2, os_h = width * 2, os_w = 2, os_c = 1; + if (out_strides) { os_n = (int)out_strides[0]; os_h = (int)out_strides[1]; os_w = (int)out_strides[2]; os_c = (int)out_strides[3]; } + + affine_grid_kernel_f32_double_index<<>>( + out_ptr, in_ptr, (int)batch, (int)height, (int)width, align_corners_i32, + is_n, is_h, is_w, os_n, os_h, os_w, os_c + ); + } + else { + // [FP16 / BF16 路径] 使用通用模板 + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + op::affine_grid::cuda::affine_grid_kernel<<>>( + out_ptr, in_ptr, batch, height, width, align_corners + ); + } +} + +// ================================================================== +// Descriptor Implementation +// ================================================================== +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector in_strides; + std::vector out_strides; +}; + +Descriptor::~Descriptor() { if (_opaque) delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t in_desc, bool align_corners) { + auto handle = reinterpret_cast(handle_); + auto info_result = AffineGridInfo::create(out_desc, in_desc, align_corners); + if (!info_result) return info_result.status(); + auto opaque = new Opaque{handle->internal()}; + + // 获取 Input Strides + auto idesc = reinterpret_cast(in_desc); + if (idesc && idesc->ndim() == 3) { + auto s = idesc->strides(); + for (auto val : s) opaque->in_strides.push_back((int64_t)val); + } + // 获取 Output Strides + auto odesc = reinterpret_cast(out_desc); + if (odesc && odesc->ndim() == 4) { + auto s = odesc->strides(); + for (auto val : s) opaque->out_strides.push_back((int64_t)val); + } + *desc_ptr = new Descriptor(opaque, info_result.take(), 0, handle->device, handle->device_id); + return 
INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *output, const void *input, void *stream) const { + auto dtype = _info.dtype(); + const int64_t* in_strides_ptr = _opaque->in_strides.empty() ? nullptr : _opaque->in_strides.data(); + const int64_t* out_strides_ptr = _opaque->out_strides.empty() ? nullptr : _opaque->out_strides.data(); + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel<__half>(output, input, _info.batch(), _info.height(), _info.width(), _info.align_corners(), stream); + break; + case INFINI_DTYPE_BF16: +#if defined(__MACA__) || defined(__MACACC__) + launch_kernel<__maca_bfloat16>(output, input, _info.batch(), _info.height(), _info.width(), _info.align_corners(), stream); +#endif + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _info.batch(), _info.height(), _info.width(), _info.align_corners(), stream, in_strides_ptr, out_strides_ptr); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _info.batch(), _info.height(), _info.width(), _info.align_corners(), stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::affine_grid::metax \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/moore/affine_grid_moore.h b/src/infiniop/ops/affine_grid/moore/affine_grid_moore.h new file mode 100644 index 000000000..fb48416d6 --- /dev/null +++ b/src/infiniop/ops/affine_grid/moore/affine_grid_moore.h @@ -0,0 +1,9 @@ +#ifndef __AFFINE_GRID_MOORE_API_H__ +#define __AFFINE_GRID_MOORE_API_H__ + +#include "../affine_grid.h" + +// 使用 affine_grid.h 中定义的宏生成 op::affine_grid::moore::Descriptor 类定义 +DESCRIPTOR(moore) + +#endif // __AFFINE_GRID_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/moore/affine_grid_moore.mu b/src/infiniop/ops/affine_grid/moore/affine_grid_moore.mu new file mode 100644 index 000000000..e10c990ce --- /dev/null +++ b/src/infiniop/ops/affine_grid/moore/affine_grid_moore.mu @@ -0,0 +1,144 @@ +#include "affine_grid_moore.h" +#include "affine_grid_moore_kernel.h" +#include + +// 引用 Handle 路径 +#include "../../../devices/moore/moore_handle.h" + +namespace op::affine_grid::moore { + +template +__global__ void affine_grid_kernel( + const int N, const int H, const int W, + const bool align_corners, + const T *theta, + T *output) { + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total_pixels = N * H * W; + + if (idx < total_pixels) { + int w = idx % W; + int h = (idx / W) % H; + int n = idx / (W * H); + + const T *current_theta = theta + n * 6; + T *out_ptr = output + idx * 2; + + AffineGridOp op; + op(w, h, W, H, current_theta, align_corners, &out_ptr[0], &out_ptr[1]); + } +} + +// ================================================================== +// 2. Launcher Implementation +// ================================================================== + +template +void affine_grid_moore_launch( + const AffineGridInfo &info, + T *output, + const T *input, + void *stream) { + + size_t num_pixels = info.batch() * info.height() * info.width(); + + int threads = 256; + int blocks = (num_pixels + threads - 1) / threads; + + affine_grid_kernel<<>>( + info.batch(), + info.height(), + info.width(), + info.align_corners(), + input, + output + ); +} + +// ================================================================== +// 3. 
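+ // Launch sizing example: N = 2, H = 32, W = 32 gives num_pixels = 2048, so with
+ // threads = 256 the launcher issues blocks = (2048 + 255) / 256 = 8, one thread
+ // per generated (x, y) pair.
+// 3. 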
Descriptor Implementation +// ================================================================== + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + bool align_corners) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = AffineGridInfo::create(out_desc, in_desc, align_corners); + + if (!info_result) { + + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor( + nullptr, + *info_result, + 0, + handle->device, // 原: handle->device_type() + handle->device_id // 原: handle->device_id() + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_info.dtype()) { + case INFINI_DTYPE_F16: + affine_grid_moore_launch( + _info, + static_cast(output), + static_cast(input), + stream); + break; + + case INFINI_DTYPE_BF16: + + affine_grid_moore_launch<__mt_bfloat16>( + _info, + static_cast<__mt_bfloat16 *>(output), + static_cast(input), + stream); + break; + + case INFINI_DTYPE_F32: + affine_grid_moore_launch( + _info, + static_cast(output), + static_cast(input), + stream); + break; + + case INFINI_DTYPE_F64: + affine_grid_moore_launch( + _info, + static_cast(output), + static_cast(input), + stream); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::affine_grid::moore \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/moore/affine_grid_moore_kernel.h b/src/infiniop/ops/affine_grid/moore/affine_grid_moore_kernel.h new file mode 100644 index 000000000..89d91b48f --- /dev/null +++ b/src/infiniop/ops/affine_grid/moore/affine_grid_moore_kernel.h @@ -0,0 +1,82 @@ +#ifndef __AFFINE_GRID_MOORE_KERNEL_H__ +#define __AFFINE_GRID_MOORE_KERNEL_H__ + +#include +#include +#include // 包含 __mt_bfloat16 定义 + + +namespace op::affine_grid::moore { +typedef struct AffineGridOp { +public: + static constexpr size_t num_dimensions = 2; + + template + __device__ __forceinline__ void operator()( + const int w_idx, const int h_idx, + const int W, const int H, + const T* theta, + const bool align_corners, + T* out_x, T* out_y + ) const { + // 1. 归一化坐标计算 + float x_norm, y_norm; + if (align_corners) { + x_norm = (float)(w_idx * 2 - (W - 1)) / (float)(MAX(W - 1, 1)); + y_norm = (float)(h_idx * 2 - (H - 1)) / (float)(MAX(H - 1, 1)); + } else { + x_norm = (float)(w_idx * 2 + 1) / (float)W - 1.0f; + y_norm = (float)(h_idx * 2 + 1) / (float)H - 1.0f; + } + + // 2. 
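+ // Note: (w_idx * 2 - (W - 1)) / (W - 1) is algebraically the usual 2*w/(W-1) - 1; the
+ // MAX(W - 1, 1) denominator only guards the W == 1 (or H == 1) case against dividing by
+ // zero, where the numerator is already 0 and the coordinate degenerates to 0.
+
+        // 2. 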
仿射变换逻辑 + if constexpr (std::is_same_v) { + float t00 = __half2float(theta[0]); + float t01 = __half2float(theta[1]); + float t02 = __half2float(theta[2]); + float t10 = __half2float(theta[3]); + float t11 = __half2float(theta[4]); + float t12 = __half2float(theta[5]); + + float res_x = t00 * x_norm + t01 * y_norm + t02; + float res_y = t10 * x_norm + t11 * y_norm + t12; + + *out_x = __float2half(res_x); + *out_y = __float2half(res_y); + + } else if constexpr (std::is_same_v) { // 【修改】使用 __mt_bfloat16 + // 显式转换 __mt_bfloat16 -> float + float t00 = __bfloat162float(theta[0]); + float t01 = __bfloat162float(theta[1]); + float t02 = __bfloat162float(theta[2]); + float t10 = __bfloat162float(theta[3]); + float t11 = __bfloat162float(theta[4]); + float t12 = __bfloat162float(theta[5]); + + float res_x = t00 * x_norm + t01 * y_norm + t02; + float res_y = t10 * x_norm + t11 * y_norm + t12; + + // 转换回 __mt_bfloat16 + *out_x = __float2bfloat16(res_x); + *out_y = __float2bfloat16(res_y); + + } else if constexpr (std::is_same_v) { + float res_x = __fadd_rn(__fmul_rn(theta[0], x_norm), __fadd_rn(__fmul_rn(theta[1], y_norm), theta[2])); + float res_y = __fadd_rn(__fmul_rn(theta[3], x_norm), __fadd_rn(__fmul_rn(theta[4], y_norm), theta[5])); + *out_x = res_x; + *out_y = res_y; + } else { + *out_x = theta[0] * static_cast(x_norm) + theta[1] * static_cast(y_norm) + theta[2]; + *out_y = theta[3] * static_cast(x_norm) + theta[4] * static_cast(y_norm) + theta[5]; + } + } + +private: + __device__ __forceinline__ int MAX(int a, int b) const { + return a > b ? a : b; + } + +} AffineGridOp; +} // namespace op::affine_grid::moore + +#endif // __AFFINE_GRID_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/nvidia/affine_grid_nvidia.cu b/src/infiniop/ops/affine_grid/nvidia/affine_grid_nvidia.cu new file mode 100644 index 000000000..0f70e8c1a --- /dev/null +++ b/src/infiniop/ops/affine_grid/nvidia/affine_grid_nvidia.cu @@ -0,0 +1,124 @@ +#include "affine_grid_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../handle.h" + +namespace op::affine_grid::nvidia { + +// ================================================================== +// Kernel Launch Helper +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, // 这里对应 Theta + size_t batch, + size_t height, + size_t width, + bool align_corners, + void *stream) { + + // 指针强转 + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + + // 计算总线程数: N * H * W + // 每个线程负责生成一个 (x, y) 坐标对 + size_t total_elements = batch * height * width; + + + size_t block_size = 256; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + auto cuda_stream = reinterpret_cast(stream); + + + cuda::affine_grid_kernel<<>>( + out_ptr, + in_ptr, + batch, + height, + width, + align_corners + ); +} + +// ================================================================== +// Descriptor Implementation +// ================================================================== + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +// 创建算子描述符 +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + bool align_corners) { + + // 1. 
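+ // The shared CUDA kernel vector-stores each (x, y) pair (float2 / half2 / nv_bfloat162
+ // in kernel.cuh), so it assumes the grid is packed contiguously as (N, H, W, 2); the
+ // Info checks below enforce the (N, 2, 3) theta and (N, H, W, 2) grid contract.
+
+    // 1. 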
使用 Info 类解析并校验参数 + // create 方法内部会检查 Tensor 维度和类型一致性 + auto info_result = AffineGridInfo::create(out_desc, in_desc, align_corners); + if (!info_result) { + return info_result.status(); + } + auto info = info_result.take(); + + // 2. 创建 Descriptor + *desc_ptr = new Descriptor( + new Opaque(), // Opaque 指针 + info, // Info 对象 (包含 N, H, W, align_corners) + 0, // Workspace size (AffineGrid 不需要额外 workspace) + handle->device, // Device Type + handle->device_id // Device ID + ); + + return INFINI_STATUS_SUCCESS; +} + +// 执行计算 +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + auto dtype = _info.dtype(); + auto batch = _info.batch(); + auto height = _info.height(); + auto width = _info.width(); + auto align_corners = _info.align_corners(); + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, batch, height, width, align_corners, stream); + break; + + case INFINI_DTYPE_BF16: + + launch_kernel(output, input, batch, height, width, align_corners, stream); + break; + + case INFINI_DTYPE_F32: + launch_kernel(output, input, batch, height, width, align_corners, stream); + break; + + case INFINI_DTYPE_F64: + + launch_kernel(output, input, batch, height, width, align_corners, stream); + break; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::affine_grid::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/nvidia/affine_grid_nvidia.cuh b/src/infiniop/ops/affine_grid/nvidia/affine_grid_nvidia.cuh new file mode 100644 index 000000000..9864bf3ec --- /dev/null +++ b/src/infiniop/ops/affine_grid/nvidia/affine_grid_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AFFINE_GRID_CUH__ +#define __AFFINE_GRID_CUH__ + +#include "../affine_grid.h" + +DESCRIPTOR(nvidia) + +#endif // __GEMM_CUDA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/affine_grid/operator.cc b/src/infiniop/ops/affine_grid/operator.cc new file mode 100644 index 000000000..acd7f11ef --- /dev/null +++ b/src/infiniop/ops/affine_grid/operator.cc @@ -0,0 +1,177 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/affine_grid.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/affine_grid_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/affine_grid_nvidia.cuh" +#endif + +// 【新增】引入 Moore 后端的 API 头文件 +#ifdef ENABLE_MOORE_API +#include "moore/affine_grid_moore.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/affine_grid_metax.h" +#endif + +__C infiniStatus_t infiniopCreateAffineGridDescriptor( + infiniopHandle_t handle, + infiniopAffineGridDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + uint8_t align_corners) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::affine_grid::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + align_corners) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// 【新增】Moore 后端分发 +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); 
+#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAffineGridWorkspaceSize(infiniopAffineGridDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (reinterpret_cast(desc)->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// 【新增】Moore 后端分发 +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAffineGrid( + infiniopAffineGridDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (reinterpret_cast(desc)->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// 【新增】Moore 后端分发 +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAffineGridDescriptor(infiniopAffineGridDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (reinterpret_cast(desc)->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +// 【新增】Moore 后端分发 +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc b/src/infiniop/ops/floor/cpu/floor_cpu.cc new file mode 100644 index 000000000..ea4261be7 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -0,0 +1,84 @@ +// 【修改点 1】引用 Floor 专用的 CPU 头文件 +#include "floor_cpu.h" + +// 【修改点 2】命名空间必须是 floor,否则 operator.cc 找不到定义 +namespace op::floor::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + // 【修改点 3】Floor 算子通常支持浮点和整数 + // (整数做 floor 
结果不变,但为了通用性建议加上) + CHECK_DTYPE(dtype, + INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_I8, INFINI_DTYPE_U8, + INFINI_DTYPE_I16, INFINI_DTYPE_U16, + INFINI_DTYPE_I32, INFINI_DTYPE_U32, + INFINI_DTYPE_I64, INFINI_DTYPE_U64 + ); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 【修改点 4】分发计算:将 GeluOp 替换为 FloorOp + switch (_dtype) { + // === 浮点类型 === + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + + // === 整数类型 (直接调用 FloorOp,因为 FloorOp 对整数是恒等映射) === + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::floor::cpu \ No newline at end of file diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h new file mode 100644 index 000000000..e26d75d92 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -0,0 +1,38 @@ +#ifndef __FLOOR_CPU_H__ +#define __FLOOR_CPU_H__ + +// 引入基础宏定义 +#include "../../../elementwise/cpu/elementwise_cpu.h" + +// 使用宏声明 Descriptor 类 +ELEMENTWISE_DESCRIPTOR(floor, cpu) + +#include +#include + +namespace op::floor::cpu { + +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + // 1. 整数类型:直接返回 + if constexpr (std::is_integral_v) { + return x; + } + // 2. 标准浮点类型 (float, double):直接调用 std::floor,不降精度 + else if constexpr (std::is_same_v || std::is_same_v) { + return std::floor(x); + } + // 3. 
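+ // Reminder: floor rounds toward negative infinity, not toward zero:
+ // std::floor(-1.5) == -2.0, whereas truncation would give -1.0.
+
+        // 3. 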
半精度类型 (fp16, bf16):先转 float 计算 + else { + return static_cast(std::floor(static_cast(x))); + } + } +} FloorOp; + +} // namespace op::floor::cpu + +#endif // __FLOOR_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh new file mode 100644 index 000000000..a3f232b8e --- /dev/null +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -0,0 +1,68 @@ +#ifndef __FLOOR_CUDA_H__ +#define __FLOOR_CUDA_H__ + +#include +#include // 必须包含:用于 std::is_integral_v 等检查 +#if defined(__MACA__) || defined(__MACACC__) + #include + #include + using nv_bfloat162 = __maca_bfloat162; +#else + #include + #include +#endif + +namespace op::floor::cuda { + +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + + // 1. Half2 (向量化) + if constexpr (std::is_same_v) { + float2 vf = __half22float2(x); + float2 vr = make_float2(floorf(vf.x), floorf(vf.y)); + return __float22half2_rn(vr); + } + // 2. BFloat162 (向量化) + else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + // 已修复:使用 _rn 标准函数 + return __floats2bfloat162_rn(floorf(f0), floorf(f1)); + } + // 3. BFloat16 (标量) + else if constexpr (std::is_same_v) { + return __float2bfloat16(floorf(__bfloat162float(x))); + } + // 4. Half (标量) + else if constexpr (std::is_same_v) { + return __float2half(floorf(__half2float(x))); + } + // 5. Float + else if constexpr (std::is_same_v) { + return floorf(x); + } + // 6. Double + else if constexpr (std::is_same_v) { + // 【关键修复】使用 ::floor 避免与 namespace op::floor 冲突 + return ::floor(x); + } + // 7. 整数 + else if constexpr (std::is_integral_v) { + return x; + } + // 8. 兜底 + else { + // 【关键修复】使用 ::floor + return ::floor(x); + } + } +} FloorOp; + +} // namespace op::floor::cuda + +#endif // __FLOOR_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/floor/metax/floor_metax.h b/src/infiniop/ops/floor/metax/floor_metax.h new file mode 100644 index 000000000..3d293de25 --- /dev/null +++ b/src/infiniop/ops/floor/metax/floor_metax.h @@ -0,0 +1,8 @@ +#ifndef __FLOOR_METAX_API_H__ +#define __FLOOR_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(floor, metax) + +#endif // __FLOOR_METAX_API_H__ diff --git a/src/infiniop/ops/floor/metax/floor_metax.maca b/src/infiniop/ops/floor/metax/floor_metax.maca new file mode 100644 index 000000000..bac10d431 --- /dev/null +++ b/src/infiniop/ops/floor/metax/floor_metax.maca @@ -0,0 +1,69 @@ +#include "floor_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" // 确保这里面定义了 cuda::FloorOp + +namespace op::floor::metax { // 1. 修改命名空间 + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 2. Floor 是单输入算子,只需要获取 input_desc_vec[0] + const auto &input_desc = input_desc_vec.at(0); + + const auto &out_shape = out_desc->shape(); + const auto &in_shape = input_desc->shape(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // 4. 
diff --git a/src/infiniop/ops/floor/metax/floor_metax.h b/src/infiniop/ops/floor/metax/floor_metax.h
new file mode 100644
index 000000000..3d293de25
--- /dev/null
+++ b/src/infiniop/ops/floor/metax/floor_metax.h
@@ -0,0 +1,8 @@
+#ifndef __FLOOR_METAX_API_H__
+#define __FLOOR_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(floor, metax)
+
+#endif // __FLOOR_METAX_API_H__
diff --git a/src/infiniop/ops/floor/metax/floor_metax.maca b/src/infiniop/ops/floor/metax/floor_metax.maca
new file mode 100644
index 000000000..bac10d431
--- /dev/null
+++ b/src/infiniop/ops/floor/metax/floor_metax.maca
@@ -0,0 +1,69 @@
+#include "floor_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh" // defines cuda::FloorOp
+
+namespace op::floor::metax { // 1. namespace for this backend
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // 2. Floor is a unary operator; only input_desc_vec[0] is needed
+    const auto &input_desc = input_desc_vec.at(0);
+
+    const auto &out_shape = out_desc->shape();
+    const auto &in_shape = input_desc->shape();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // 4. check shape consistency (out == in)
+    CHECK_SAME_SHAPE(out_shape, in_shape);
+
+    // create the METAX elementwise descriptor
+    // (the macro handles the generic setup, so it can be reused directly)
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // 5. dispatch to calculate with cuda::FloorOp
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::FloorOp, nv_bfloat162>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::FloorOp, double>(_info, workspace, output, inputs, stream);
+    // integer support (identity mapping); note these cases are only reachable
+    // if CHECK_DTYPE in create() is extended to accept I32/I64 as well
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::FloorOp, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::FloorOp, int64_t>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::floor::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/floor/moore/floor_moore.h b/src/infiniop/ops/floor/moore/floor_moore.h
new file mode 100644
index 000000000..c7d20217c
--- /dev/null
+++ b/src/infiniop/ops/floor/moore/floor_moore.h
@@ -0,0 +1,8 @@
+#ifndef __FLOOR_MOORE_API_H__
+#define __FLOOR_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(floor, moore)
+
+#endif // __FLOOR_MOORE_API_H__
diff --git a/src/infiniop/ops/floor/moore/floor_moore.mu b/src/infiniop/ops/floor/moore/floor_moore.mu
new file mode 100644
index 000000000..e0b91e9d7
--- /dev/null
+++ b/src/infiniop/ops/floor/moore/floor_moore.mu
@@ -0,0 +1,68 @@
+#include "floor_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "floor_moore_kernel.h"
+
+namespace op::floor::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Floor is a unary operator, so we only look at the first input
+    const auto &in_desc = input_desc_vec.at(0);
+    const auto &out_shape = out_desc->shape();
+    const auto &in_shape = in_desc->shape();
+
+    // Floor supports floating-point types generally, and int types (effectively a no-op)
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+
+    // Check that the output shape matches the input shape
+    CHECK_SAME_SHAPE(out_shape, in_shape);
+
+    // create MOORE elementwise descriptor
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Use the moore::FloorOp template
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::FloorOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::FloorOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::FloorOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::FloorOp, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, moore::FloorOp, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, moore::FloorOp, int64_t>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::floor::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/floor/moore/floor_moore_kernel.h b/src/infiniop/ops/floor/moore/floor_moore_kernel.h
new file mode 100644
index 000000000..c78166da5
--- /dev/null
+++ b/src/infiniop/ops/floor/moore/floor_moore_kernel.h
@@ -0,0 +1,39 @@
+#ifndef __FLOOR_MOORE_KERNEL_H__
+#define __FLOOR_MOORE_KERNEL_H__
+
+/*
+ * This file contains the Floor operation implementation for the MUSA backend.
+ */
+
+#include <type_traits>
+
+namespace op::floor::moore {
+typedef struct FloorOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // __h2floor may be missing in the MUSA environment, so split
+            // into floats: extract the low and high halves,
+            float f1 = __low2float(input);
+            float f2 = __high2float(input);
+            // floor each one, then merge back into a half2
+            // (__floats2half2_rn converts with round-to-nearest)
+            return __floats2half2_rn(::floorf(f1), ::floorf(f2));
+        } else if constexpr (std::is_same_v<T, half>) {
+            // __hfloor is missing in the MUSA environment, so go through float
+            return __float2half(::floorf(__half2float(input)));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // bfloat16: compute in float
+            float val_f = __bfloat162float(input);
+            return __float2bfloat16(::floorf(val_f));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return ::floorf(input);
+        } else if constexpr (std::is_integral_v<T>) {
+            // integers: identity, avoiding a lossy round-trip through double
+            // for 64-bit values above 2^53
+            return input;
+        } else {
+            return ::floor(input);
+        }
+    }
+} FloorOp;
+} // namespace op::floor::moore
+
+#endif // __FLOOR_MOORE_KERNEL_H__
\ No newline at end of file
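For comparison, on toolkits where the packed-half intrinsic does exist, the split-floor-merge above collapses to a single call. A minimal sketch; the architecture guard and intrinsic availability are assumptions about the target toolkit, not part of this patch:

    #include <cuda_fp16.h>

    // Packed-half floor: native intrinsic where available, float round-trip otherwise.
    __device__ __forceinline__ half2 floor_half2(half2 v) {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
        return h2floor(v); // one packed intrinsic for both lanes
    #else
        return __floats2half2_rn(floorf(__low2float(v)), floorf(__high2float(v)));
    #endif
    }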
diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu
new file mode 100644
index 000000000..73662bed8
--- /dev/null
+++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu
@@ -0,0 +1,89 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+// pull in the core compute functor (defined in src/infiniop/ops/floor/cuda/kernel.cuh)
+#include "../cuda/kernel.cuh"
+#include "floor_nvidia.cuh"
+
+namespace op::floor::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
+                INFINI_DTYPE_I8, INFINI_DTYPE_U8,
+                INFINI_DTYPE_I16, INFINI_DTYPE_U16,
+                INFINI_DTYPE_I32, INFINI_DTYPE_U32,
+                INFINI_DTYPE_I64, INFINI_DTYPE_U64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // -----------------------------------------------------------
+    // 2. operator dispatch: GeluOp replaced with FloorOp
+    //    the template parameter <256, ...> is the CUDA block size
+    // -----------------------------------------------------------
+    switch (_dtype) {
+    // === floating-point types ===
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::FloorOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::FloorOp, double>(_info, workspace, output, inputs, stream);
+
+    // === integer types (FloorOp handles these correctly by returning the value unchanged) ===
+    case INFINI_DTYPE_I8:
+        return _device_info->calculate<256, cuda::FloorOp, int8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U8:
+        return _device_info->calculate<256, cuda::FloorOp, uint8_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I16:
+        return _device_info->calculate<256, cuda::FloorOp, int16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U16:
+        return _device_info->calculate<256, cuda::FloorOp, uint16_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::FloorOp, int32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U32:
+        return _device_info->calculate<256, cuda::FloorOp, uint32_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::FloorOp, int64_t>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_U64:
+        return _device_info->calculate<256, cuda::FloorOp, uint64_t>(_info, workspace, output, inputs, stream);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::floor::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh
new file mode 100644
index 000000000..1b9001772
--- /dev/null
+++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh
@@ -0,0 +1,6 @@
+#ifndef __FLOOR_NVIDIA_CUH__
+#define __FLOOR_NVIDIA_CUH__
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+ELEMENTWISE_DESCRIPTOR(floor, nvidia)
+
+#endif // __FLOOR_NVIDIA_CUH__
\ No newline at end of file
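The integer cases above rely on floor being the identity on integral values. A tiny host-side check of that property, in plain C++ and independent of this patch:

    #include <cassert>
    #include <cmath>

    int main() {
        for (int i = -3; i <= 3; ++i) {
            // floor(n) == n for every integer n, so the integer
            // dtype paths can reuse FloorOp unchanged
            assert(std::floor(static_cast<double>(i)) == i);
        }
        return 0;
    }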
diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc
new file mode 100644
index 000000000..0eeccb1f0
--- /dev/null
+++ b/src/infiniop/ops/floor/operator.cc
@@ -0,0 +1,195 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/floor.h"
+
+// --- backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/floor_cpu.h"
+#endif
+
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/floor_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/floor_metax.h"
+#endif
+
+// ==========================================
+// 1. add the MOORE header include
+// ==========================================
+#ifdef ENABLE_MOORE_API
+#include "moore/floor_moore.h"
+#endif
+// ==========================================
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateFloorDescriptor(
+    infiniopHandle_t handle,
+    infiniopFloorDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input) {
+
+#define CREATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                \
+        return op::floor::NAMESPACE::Descriptor::create(                      \
+            handle,                                                           \
+            reinterpret_cast<op::floor::NAMESPACE::Descriptor **>(desc_ptr),  \
+            output,                                                           \
+            {input})
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+        // ==========================================
+        // add the MOORE branch
+        // ==========================================
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CREATE
+}
+
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                  \
+    case CASE:                                                                                 \
+        *size = reinterpret_cast<op::floor::NAMESPACE::Descriptor *>(desc)->workspaceSize();  \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+        // ==========================================
+        // add the MOORE branch
+        // ==========================================
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+// =======================================================================
+// 3. Execute (Calculate)
+// =======================================================================
+__C infiniStatus_t infiniopFloor(
+    infiniopFloorDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                    \
+        return reinterpret_cast<const op::floor::NAMESPACE::Descriptor *>(desc)   \
+            ->calculate(workspace, workspace_size, output, {input}, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+        // ==========================================
+        // add the MOORE branch
+        // ==========================================
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                              \
+    case CASE:                                                                \
+        delete reinterpret_cast<op::floor::NAMESPACE::Descriptor *>(desc);    \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+        // ==========================================
+        // add the MOORE branch
+        // ==========================================
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
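Putting the four entry points together, a caller follows the usual create/query/run/destroy sequence. A minimal host-side sketch; handle, tensor descriptors, device buffers, and the workspace allocation are assumed to be set up elsewhere:

    // Call sequence for the C API above (sketch; run_floor is a hypothetical helper).
    infiniStatus_t run_floor(infiniopHandle_t handle,
                             infiniopTensorDescriptor_t y_desc,
                             infiniopTensorDescriptor_t x_desc,
                             void *workspace, size_t workspace_capacity,
                             void *y, const void *x, void *stream) {
        infiniopFloorDescriptor_t desc = nullptr;
        infiniStatus_t status = infiniopCreateFloorDescriptor(handle, &desc, y_desc, x_desc);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }

        size_t required = 0;
        status = infiniopGetFloorWorkspaceSize(desc, &required);
        if (status == INFINI_STATUS_SUCCESS) {
            if (required > workspace_capacity) {
                status = INFINI_STATUS_INSUFFICIENT_WORKSPACE;
            } else {
                status = infiniopFloor(desc, workspace, required, y, x, stream);
            }
        }

        infiniopDestroyFloorDescriptor(desc);
        return status;
    }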
diff --git a/test/infinicore/ops/acos.py b/test/infinicore/ops/acos.py
index 87732413c..d3278f9f2 100644
--- a/test/infinicore/ops/acos.py
+++ b/test/infinicore/ops/acos.py
@@ -97,9 +97,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.acos(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.acos(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.acos(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/adaptive_avg_pool1d.py b/test/infinicore/ops/adaptive_avg_pool1d.py
index 05382724d..82dc57893 100644
--- a/test/infinicore/ops/adaptive_avg_pool1d.py
+++ b/test/infinicore/ops/adaptive_avg_pool1d.py
@@ -70,9 +70,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.adaptive_avg_pool1d(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.adaptive_avg_pool1d(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.adaptive_avg_pool1d(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/addbmm.py b/test/infinicore/ops/addbmm.py
index ba37a1ac1..767c863b4 100644
--- a/test/infinicore/ops/addbmm.py
+++ b/test/infinicore/ops/addbmm.py
@@ -104,11 +104,23 @@ def get_test_cases(self):
         return parse_test_cases()
 
     def torch_operator(self, *args, **kwargs):
+        """
+        Disabled CPU-fallback workaround for testing on the moore platform:
+        original_out_tensor = kwargs.get("out")
+        cpu_args = [arg.cpu() if isinstance(arg, torch.Tensor) else arg for arg in args]
+        cpu_kwargs = {
+            k: v.cpu() if isinstance(v, torch.Tensor) else v
+            for k, v in kwargs.items()
+        }
+        cpu_result = torch.addbmm(*cpu_args, **cpu_kwargs)
+        if original_out_tensor is not None and isinstance(original_out_tensor, torch.Tensor):
+            original_out_tensor.copy_(cpu_result)
+            return original_out_tensor
+        target_device = args[0].device if len(args) > 0 and isinstance(args[0], torch.Tensor) else "musa"
+        return cpu_result.to(target_device)
+        """
         return torch.addbmm(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.addbmm(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.addbmm(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/affine_grid.py b/test/infinicore/ops/affine_grid.py
index 7e782ea6a..df02f8702 100644
--- a/test/infinicore/ops/affine_grid.py
+++ b/test/infinicore/ops/affine_grid.py
@@ -75,9 +75,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.affine_grid(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.affine_grid(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.affine_grid(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/floor.py b/test/infinicore/ops/floor.py
index c7663d39b..f879a4e6d 100644
--- a/test/infinicore/ops/floor.py
+++ b/test/infinicore/ops/floor.py
@@ -87,9 +87,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.floor(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.floor(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.floor(*args, **kwargs)
 
 
 def main():
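For reference, the semantics these floor tests compare against come from torch.floor, which rounds each element toward negative infinity. A plain PyTorch snippet, runnable on CPU:

    import torch

    x = torch.tensor([-1.5, -0.5, 0.5, 2.9])
    print(torch.floor(x))  # tensor([-2., -1.,  0.,  2.])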