Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions Deeploy/Targets/GAP9/DMA/L3Dma.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
from typing import Dict, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \
PerTensorWaitingStrategy
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy


class GAP9L3DmaFuture(Future):
Expand Down Expand Up @@ -60,5 +59,5 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu
return operatorRepresentation


# Blocking adapter for L3 DMA (used in GAP9 L3 tiling)
gap9L3DmaHack = BlockingDmaFromAsyncDmaAdapter(GAP9L3Dma())
# Async L3 DMA for GAP9 L3 tiling
gap9L3DmaHack = GAP9L3Dma()
7 changes: 7 additions & 0 deletions Deeploy/TilingExtension/AsyncDma.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,13 @@ def transfer(self,
strideExt[-kernelRank:], strideLoc[-kernelRank:], direction, future)

callStack.extend(dma_code)
# The decomposed sub-transfers all share the single per-tensor request
# handle (future). Under an asynchronous DMA, the next sub-transfer would
# overwrite that handle while the previous copy is still in flight and has
# not yet been waited on; reusing the request this way causes an overrun or
# a hang. Wait for each sub-transfer before issuing the next so that only one
# is ever in flight on the shared handle. This serializes only the decomposed
# case (transfer rank > kernel rank); non-decomposed transfers stay fully async.
callStack.append(future.wait())
callStack.append(CodeSnippet(self.NestedForLoopCloseTemplate(nestedLoopDepth), {}))
return callStack
elif kernelRank == transferRank:
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
10 changes: 10 additions & 0 deletions DeeployTest/test_gap9_tiled_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,14 @@
L3_DOUBLEBUFFER_MODELS = {
"Models/miniMobileNet": [60000, 24000, 12000, 6000],
"Models/miniMobileNetv2": [60000, 32000, 24000, 16000],
"Models/MLPerf/KeywordSpotting": [64000],
"Models/MLPerf/ImageClassification": [64000],
"Models/MLPerf/AnomalyDetection": [64000],
"Kernels/Integer/Attention": [60000, 20000, 10000],
"Models/Transformer": [60000, 30000],
# Regression test for the async-L3 multi-2D-copy decomposition bug: with a small L1,
# an NCHW->NHWC transpose tiles into a rank-3 transfer that the Anydim adapter splits
# into a loop of 2D copies sharing one request handle. This hangs under a naive async
# L3 DMA and passes once the decomposition waits on each sub-copy.
"Kernels/FP32/Transpose3D": [2000],

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a model and should be moved into an L3_DOUBLEBUFFER_KERNELS category.

}
Loading