From 57e7b82d977b89d90c1cba21fb007dac33c874df Mon Sep 17 00:00:00 2001 From: runwangdl Date: Thu, 2 Jul 2026 12:08:33 +0000 Subject: [PATCH 1/2] perf(GAP9): -O3 on hot forward kernels (Conv / DWConv / Gemm) Compile the conv / depthwise-conv / Gemm translation units at -O3, appended last so it wins over the SDK's default -Os on the same files. Everything else stays at -Os. -O3 turns on the RISC-V (XpulpV2) hardware loops on the kernels' tight inner loops; on a forward conv the -O3 object has 18 lp.setup HW-loop instructions vs 0 at -Os, at the cost of ~+50% .text on those files. --- TargetLibraries/GAP9/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/TargetLibraries/GAP9/CMakeLists.txt b/TargetLibraries/GAP9/CMakeLists.txt index ca4c3ffbeb..a6282c6785 100644 --- a/TargetLibraries/GAP9/CMakeLists.txt +++ b/TargetLibraries/GAP9/CMakeLists.txt @@ -31,6 +31,16 @@ target_compile_options(deeploygap9 PRIVATE target_link_libraries(deeploygap9 PUBLIC pmsis) +# Compile the hot forward kernels at -O3 (set last so it wins over the SDK's +# default -Os). Conv / depthwise-conv / Gemm dominate GAP9 inference cycles; +# -O3 turns on the RISC-V (XpulpV2) hardware loops on their tight inner loops. 
+set(_KERNEL_O3_FILES + ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/Convolution_fp32.c + ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/DWConvolution_fp32.c + ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/Gemm.c +) +set_source_files_properties(${_KERNEL_O3_FILES} PROPERTIES COMPILE_OPTIONS "-O3") + #RW: Link PULP-NN #RW: Set PULP-NN version and bitwidth for pulp-nn-mixed set(PULPNNVERSION XPULPV2) From 1e91ff743672d7d07c5853e2c3bdecd9a95fbfc3 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Thu, 2 Jul 2026 12:08:33 +0000 Subject: [PATCH 2/2] fix(GAP9): hoist L2->L1 tile-control tables to L2 The hoisted tile-control tables (numTiles / DMA cmd / size / dims / padding / offsets) are read-only lookup tables the cluster controller uses to drive the tiling loop and program DMAs -- not bulk tile data. Previously the L2->L1 tiling pass emitted them with _memoryLevel = self.memory (i.e. "L1"), so they landed in the GAP9 L1 TCDM next to the cluster master stack. On memory-tight nets this both wastes scarce L1 (~11.6 KB on CCT, ~7.0 KB on MobileNetV1, ~2.5 KB on ResNet8) and creates a correctness hazard: a deep master-stack write can clobber a single table entry, turning a DMA cmd into a garbage code pointer so mchan_transfer_wait() hangs forever (observed on MobileNetV1 training). Redirect only the L2->L1 pass to emit these tables in L2. The L3->L2 pass keeps its tables in L2 (== self.memory, unchanged). Platforms that don't tile into a level named "L1" are unaffected. Conversely, because the check is on the level name only, every platform whose inner tile level is named "L1" is redirected (not just GAP9) and is assumed to expose a level named "L2". Tile *data* buffers still go to L1 as before -- only the constant control tables move. 
--- .../CodeTransformationPasses/TilingHoistingMixIn.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py index 8a0c1b9b54..5ec56e375f 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py @@ -60,7 +60,18 @@ def _hoistValues(self, else: cb._type = PointerClass(BasicDataTypes.minimalIntegerType(values)) cb._instance = cb._type(cb.name, ctxt) - cb._memoryLevel = self.memory + # These are constant tile *control* tables (numTiles / DMA cmd / size / + # dims / offsets) read by the (cluster) controller to drive the tiling + # loop and program DMAs -- not bulk tile data. Putting them in the + # innermost tile memory (L1/TCDM) wastes scarce L1 and, on GAP9, places + # them in the contended L1 region next to the cluster master stack: a + # deep stack write can clobber a single table entry, turning a DMA `cmd` + # into a garbage code pointer so mchan_transfer_wait() hangs forever + # (observed on MobileNetV1 training). Keep them in the controller- + # addressable outer memory (L2) instead. Only redirect the L2->L1 pass; + # the L3->L2 pass keeps its tables in L2 (== self.memory), never L3. + # Platforms that don't tile into a level named "L1" are unaffected. + cb._memoryLevel = "L2" if self.memory == "L1" else self.memory return cb def _hoistReference(self,