From c5f6c1d16dbe964dc23f0f8c89fa7dc19100ff59 Mon Sep 17 00:00:00 2001
From: Felix Mathew
Date: Wed, 4 Mar 2026 12:38:30 -0600
Subject: [PATCH 1/2] Fix inference extension build by including
 torch/nn/functional.h explicitly

---
 src/layers/extensions/inference/impl.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/layers/extensions/inference/impl.cpp b/src/layers/extensions/inference/impl.cpp
index 6419fff..9d37389 100644
--- a/src/layers/extensions/inference/impl.cpp
+++ b/src/layers/extensions/inference/impl.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+#include <torch/nn/functional.h>
 #include "def.h"
 
 namespace F = torch::nn::functional;

From b1e1d69064c3b507ba0de64f06ea01aa87a50fc3 Mon Sep 17 00:00:00 2001
From: Felix Mathew
Date: Wed, 4 Mar 2026 13:09:39 -0600
Subject: [PATCH 2/2] Fix Windows CUDA inference build by avoiding
 torch/extension.h in CUDA TU

---
 src/layers/extensions/inference/common.h  |  2 +-
 src/layers/extensions/inference/def.h     |  2 +-
 src/layers/extensions/inference/kernel.cu | 18 ++++++++++--------
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/layers/extensions/inference/common.h b/src/layers/extensions/inference/common.h
index 7a8bfed..e42ae52 100644
--- a/src/layers/extensions/inference/common.h
+++ b/src/layers/extensions/inference/common.h
@@ -6,7 +6,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
-#include <torch/extension.h>
+#include <torch/types.h>
 
 // T maybe vector type, and may be different from t.dtype
 template <typename T>
diff --git a/src/layers/extensions/inference/def.h b/src/layers/extensions/inference/def.h
index 17e7782..19ecc92 100644
--- a/src/layers/extensions/inference/def.h
+++ b/src/layers/extensions/inference/def.h
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include <torch/extension.h>
+#include <torch/types.h>
 
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
 process_with_mask_cuda(const torch::Tensor& y, const torch::Tensor& scales, const torch::Tensor& means,
diff --git a/src/layers/extensions/inference/kernel.cu b/src/layers/extensions/inference/kernel.cu
index 41ebc78..a855d85 100644
--- a/src/layers/extensions/inference/kernel.cu
+++ b/src/layers/extensions/inference/kernel.cu
@@ -6,7 +6,9 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
-#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/types.h>
 #include "common.h"
 #include "def.h"
 
@@ -121,10 +123,10 @@
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
 process_with_mask_cuda(const torch::Tensor& y, const torch::Tensor& scales, const torch::Tensor& means,
                        const torch::Tensor& mask, const float force_zero_thres) {
-  auto y_res = torch::empty_like(y);
-  auto y_q = torch::empty_like(y);
-  auto y_hat = torch::empty_like(y);
-  auto s_hat = torch::empty_like(y);
+  auto y_res = at::empty_like(y);
+  auto y_q = at::empty_like(y);
+  auto y_hat = at::empty_like(y);
+  auto s_hat = at::empty_like(y);
 
   if (y.dtype() == torch::kFloat32) {
     process_with_mask_dispatcher(y_res, y_q, y_hat, s_hat, y, scales, means,
@@ -855,7 +857,7 @@ __forceinline__ void round_and_to_int8_dispatcher(torch::Tensor& z, torch::Tenso
 }
 
 torch::Tensor round_and_to_int8_cuda(torch::Tensor& z) {
-  auto z_int8 = torch::empty_like(z, at::TensorOptions().dtype(torch::kInt8));
+  auto z_int8 = at::empty_like(z, at::TensorOptions().dtype(torch::kInt8));
   if (z.dtype() == torch::kFloat32) {
     round_and_to_int8_dispatcher(z, z_int8);
   } else if (z.dtype() == torch::kFloat16) {
@@ -900,7 +902,7 @@ __forceinline__ void clamp_reciprocal_with_quant_dispatcher(torch::Tensor& q_dec
 }
 
 torch::Tensor clamp_reciprocal_with_quant_cuda(const torch::Tensor& q_dec, torch::Tensor& y, const float min_val) {
-  auto q_dec_clamp = torch::empty_like(q_dec);
+  auto q_dec_clamp = at::empty_like(q_dec);
   if (q_dec.dtype() == torch::kFloat32) {
     clamp_reciprocal_with_quant_dispatcher(q_dec_clamp, q_dec, y, min_val);
   } else if (q_dec.dtype() == torch::kFloat16) {
@@ -1123,7 +1125,7 @@ torch::Tensor bias_wsilu_depthwise_conv2d_cuda(const torch::Tensor& x, const tor
   const int H = x_shape[2];
   const int W = x_shape[3];
 
-  auto out = torch::empty_like(x);
+  auto out = at::empty_like(x);
 
   const int BLOCK_SIZE = 32;
   const int THREAD_NUM_X = 16;