diff --git a/Deeploy/Targets/GAP9/DMA/L3Dma.py b/Deeploy/Targets/GAP9/DMA/L3Dma.py index adbf161328..b0139646e5 100644 --- a/Deeploy/Targets/GAP9/DMA/L3Dma.py +++ b/Deeploy/Targets/GAP9/DMA/L3Dma.py @@ -6,8 +6,7 @@ from typing import Dict, Tuple from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer -from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \ - PerTensorWaitingStrategy +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy class GAP9L3DmaFuture(Future): @@ -60,5 +59,5 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu return operatorRepresentation -# Blocking adapter for L3 DMA (used in GAP9 L3 tiling) -gap9L3DmaHack = BlockingDmaFromAsyncDmaAdapter(GAP9L3Dma()) +# Async L3 DMA for GAP9 L3 tiling +gap9L3DmaHack = GAP9L3Dma() diff --git a/Deeploy/TilingExtension/AsyncDma.py b/Deeploy/TilingExtension/AsyncDma.py index 9679681051..b54bb76f73 100644 --- a/Deeploy/TilingExtension/AsyncDma.py +++ b/Deeploy/TilingExtension/AsyncDma.py @@ -266,6 +266,13 @@ def transfer(self, strideExt[-kernelRank:], strideLoc[-kernelRank:], direction, future) callStack.extend(dma_code) + # The decomposed sub-transfers all share the single per-tensor request + # handle (future). Under an asynchronous DMA the in-flight copies would + # overwrite that handle before it is waited -> request reuse -> overrun / + # hang. Wait for each sub-transfer before issuing the next so only one is + # ever in flight on the shared handle. This serializes the (rank > kernel- + # rank) decomposition only; non-decomposed transfers stay fully async. + callStack.append(future.wait()) callStack.append(CodeSnippet(self.NestedForLoopCloseTemplate(nestedLoopDepth), {})) return callStack elif kernelRank == transferRank: diff --git a/DeeployTest/Tests/Kernels/FP32/Transpose3D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Transpose3D/inputs.npz new file mode 100644 index 0000000000..b5d6c2c817 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Transpose3D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Transpose3D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Transpose3D/network.onnx new file mode 100644 index 0000000000..95b448f690 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Transpose3D/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Transpose3D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Transpose3D/outputs.npz new file mode 100644 index 0000000000..0e6308905d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Transpose3D/outputs.npz differ diff --git a/DeeployTest/test_gap9_tiled_config.py b/DeeployTest/test_gap9_tiled_config.py index 764d61f0ca..4b1be9d548 100644 --- a/DeeployTest/test_gap9_tiled_config.py +++ b/DeeployTest/test_gap9_tiled_config.py @@ -83,4 +83,14 @@ L3_DOUBLEBUFFER_MODELS = { "Models/miniMobileNet": [60000, 24000, 12000, 6000], "Models/miniMobileNetv2": [60000, 32000, 24000, 16000], + "Models/MLPerf/KeywordSpotting": [64000], + "Models/MLPerf/ImageClassification": [64000], + "Models/MLPerf/AnomalyDetection": [64000], + "Kernels/Integer/Attention": [60000, 20000, 10000], + "Models/Transformer": [60000, 30000], + # Regression for the async-L3 multi-2D-copy decomposition bug: a small-L1 NCHW->NHWC + # transpose tiles into a rank-3 transfer that the Anydim adapter splits into a loop + # of 2D copies sharing one request handle. Hangs under a naive async L3 DMA; passes + # once the decomposition waits each sub-copy. + "Kernels/FP32/Transpose3D": [2000], }