diff --git a/Deeploy/Targets/GAP9/DMA/L3Dma.py b/Deeploy/Targets/GAP9/DMA/L3Dma.py
index adbf161328..b0139646e5 100644
--- a/Deeploy/Targets/GAP9/DMA/L3Dma.py
+++ b/Deeploy/Targets/GAP9/DMA/L3Dma.py
@@ -6,8 +6,7 @@
 from typing import Dict, Tuple
 
 from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
-from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \
-    PerTensorWaitingStrategy
+from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy
 
 
 class GAP9L3DmaFuture(Future):
@@ -60,5 +59,5 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu
         return operatorRepresentation
 
 
-# Blocking adapter for L3 DMA (used in GAP9 L3 tiling)
-gap9L3DmaHack = BlockingDmaFromAsyncDmaAdapter(GAP9L3Dma())
+# Async L3 DMA for GAP9 L3 tiling
+gap9L3DmaHack = GAP9L3Dma()
diff --git a/Deeploy/TilingExtension/AsyncDma.py b/Deeploy/TilingExtension/AsyncDma.py
index 9679681051..b54bb76f73 100644
--- a/Deeploy/TilingExtension/AsyncDma.py
+++ b/Deeploy/TilingExtension/AsyncDma.py
@@ -266,6 +266,13 @@ def transfer(self,
                                          strideExt[-kernelRank:], strideLoc[-kernelRank:], direction, future)
 
             callStack.extend(dma_code)
+            # The decomposed sub-transfers all share the single per-tensor request
+            # handle (future). Under an asynchronous DMA, in-flight copies would
+            # overwrite that handle before it is waited on (request reuse), causing
+            # an overrun or hang. Wait for each sub-transfer before issuing the next so
+            # that only one is ever in flight on the shared handle. This serializes only
+            # the decomposed (rank > kernel rank) case; other transfers stay fully async.
+            callStack.append(future.wait())
             callStack.append(CodeSnippet(self.NestedForLoopCloseTemplate(nestedLoopDepth), {}))
             return callStack
         elif kernelRank == transferRank:
diff --git a/DeeployTest/Tests/Kernels/FP32/Transpose3D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Transpose3D/inputs.npz
new file mode 100644
index 0000000000..b5d6c2c817
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Transpose3D/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Transpose3D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Transpose3D/network.onnx
new file mode 100644
index 0000000000..95b448f690
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Transpose3D/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Transpose3D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Transpose3D/outputs.npz
new file mode 100644
index 0000000000..0e6308905d
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Transpose3D/outputs.npz differ
diff --git a/DeeployTest/test_gap9_tiled_config.py b/DeeployTest/test_gap9_tiled_config.py
index 764d61f0ca..4b1be9d548 100644
--- a/DeeployTest/test_gap9_tiled_config.py
+++ b/DeeployTest/test_gap9_tiled_config.py
@@ -83,4 +83,14 @@
 L3_DOUBLEBUFFER_MODELS = {
     "Models/miniMobileNet": [60000, 24000, 12000, 6000],
     "Models/miniMobileNetv2": [60000, 32000, 24000, 16000],
+    "Models/MLPerf/KeywordSpotting": [64000],
+    "Models/MLPerf/ImageClassification": [64000],
+    "Models/MLPerf/AnomalyDetection": [64000],
+    "Kernels/Integer/Attention": [60000, 20000, 10000],
+    "Models/Transformer": [60000, 30000],
+    # Regression test for the async-L3 multi-2D-copy decomposition bug: a small-L1
+    # NCHW->NHWC transpose tiles into a rank-3 transfer that the Anydim adapter splits
+    # into a loop of 2D copies sharing one request handle. It hangs under a naive async
+    # L3 DMA and passes once the decomposition waits for each sub-copy.
+    "Kernels/FP32/Transpose3D": [2000],
 }