From ceafd01462fb8bfe7a3ed9640ff09031202c2137 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Fri, 3 Apr 2026 23:26:09 +0200
Subject: [PATCH 1/3] GPU: Add GPUCommonAlignedAlloc.h for aligned buffers, and
 use it for TPCFastTransformPOD

---
 .../Workflow/src/BarrelAlignmentSpec.cxx      |  2 +-
 .../include/TPCCalibration/CalculatedEdx.h    |  4 +-
 .../include/TPCCalibration/TrackDump.h        |  2 +-
 .../TPC/calibration/src/CalculatedEdx.cxx     |  4 +-
 Detectors/TPC/calibration/src/TrackDump.cxx   |  4 +-
 .../reconstruction/test/testGPUCATracking.cxx |  4 +-
 Detectors/TPC/workflow/src/TPCScalerSpec.cxx  |  3 +-
 GPU/Common/CMakeLists.txt                     |  1 +
 GPU/Common/GPUCommonAlignedAlloc.h            | 61 +++++++++++++++++++
 GPU/GPUTracking/Base/GPUReconstruction.cxx    | 16 ++---
 GPU/GPUTracking/Base/GPUReconstruction.h      | 11 ++--
 GPU/GPUTracking/Base/GPUReconstructionCPU.cxx |  4 +-
 GPU/GPUTracking/Global/GPUChainTracking.cxx   |  4 +-
 GPU/GPUTracking/Global/GPUChainTracking.h     |  5 +-
 GPU/GPUTracking/Global/GPUChainTrackingIO.cxx |  2 +-
 .../Standalone/Benchmark/standalone.cxx       |  8 +--
 .../TPCFastTransformPOD.cxx                   | 26 +++++++-
 .../TPCFastTransformPOD.h                     | 59 +++++-------------
 .../include/GPUWorkflow/GPUWorkflowSpec.h     |  6 +-
 .../include/GPUWorkflow/O2GPUDPLDisplay.h     |  3 +-
 GPU/Workflow/src/GPUWorkflowSpec.cxx          |  2 +-
 GPU/Workflow/src/GPUWorkflowTPC.cxx           | 18 ++----
 GPU/Workflow/src/O2GPUDPLDisplay.cxx          |  4 +-
 macro/runTPCRefit.C                           |  2 +-
 24 files changed, 154 insertions(+), 101 deletions(-)
 create mode 100644 GPU/Common/GPUCommonAlignedAlloc.h
diff --git a/Detectors/Align/Workflow/src/BarrelAlignmentSpec.cxx b/Detectors/Align/Workflow/src/BarrelAlignmentSpec.cxx
index e7a64ef544ee0..84674e69529fb 100644
--- a/Detectors/Align/Workflow/src/BarrelAlignmentSpec.cxx
+++ b/Detectors/Align/Workflow/src/BarrelAlignmentSpec.cxx
@@ -118,7 +118,7 @@ class BarrelAlignmentSpec : public Task
 
   o2::tpc::VDriftHelper mTPCVDriftHelper{};
 
-  std::vector<char> mCorrMapBuffer; // buffer to hold the raw map data from CCDB, needed to keep the pointer valid in the CorrectionMapsHelper
+  o2::gpu::aligned_unique_buffer_ptr<o2::gpu::TPCFastTransformPOD> mCorrMapBuffer; // buffer to hold the raw map data from CCDB, needed to keep the pointer valid in the CorrectionMapsHelper
   const o2::gpu::TPCFastTransformPOD* mTPCCorrMaps{};
 
   //
diff --git a/Detectors/TPC/calibration/include/TPCCalibration/CalculatedEdx.h b/Detectors/TPC/calibration/include/TPCCalibration/CalculatedEdx.h
index d62eb8a1ab868..4d8c4e89322a8 100644
--- a/Detectors/TPC/calibration/include/TPCCalibration/CalculatedEdx.h
+++ b/Detectors/TPC/calibration/include/TPCCalibration/CalculatedEdx.h
@@ -228,7 +228,7 @@ class CalculatedEdx
   std::vector<TPCClRefElem>* mTPCTrackClIdxVecInput{nullptr}; ///< input vector with TPC tracks cluster indicies
   const o2::tpc::ClusterNativeAccess* mClusterIndex{nullptr}; ///< needed to access clusternative with tpctracks
   const o2::gpu::TPCFastTransformPOD* mTPCCorrMap{nullptr};   ///< cluster correction maps helper
-  std::vector<char> mTPCCorrMapBuffer;
+  o2::gpu::aligned_unique_buffer_ptr<o2::gpu::TPCFastTransformPOD> mTPCCorrMapBuffer;
   std::vector<unsigned char> mTPCRefitterShMap;                  ///< externally set TPC clusters sharing map
   std::vector<unsigned int> mTPCRefitterOccMap;                  ///< externally set TPC clusters occupancy map
   std::unique_ptr<o2::gpu::GPUO2InterfaceRefit> mRefit{nullptr}; ///< TPC refitter used for TPC tracks refit during the reconstruction
@@ -247,4 +247,4 @@ class CalculatedEdx
 
 } // namespace o2::tpc
 
-#endif
\ No newline at end of file
+#endif
diff --git a/Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h b/Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h
index 3f60b165e4167..4eae504db7220 100644
--- a/Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h
+++ b/Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h
@@ -77,7 +77,7 @@ class TrackDump
     float gyc(float vertexTime = 0) const;
     float zc(float vertexTime = 0) const;
 
-    inline static std::vector<char> corrMapBuffer;                      // buffer for owning the correction map in case of update during runtime
+    inline static o2::gpu::aligned_unique_buffer_ptr<o2::gpu::TPCFastTransformPOD> corrMapBuffer; // buffer for owning the correction map in case of update during runtime
     inline static const o2::gpu::TPCFastTransformPOD* corrMap{nullptr}; // local copy of the correction map for quick access to the transform functions
     static void loadCorrMaps(std::string_view corrMapFile, std::string_view corrMapFileRef = "");
     ClassDefNV(ClusterNativeAdd, 1);
diff --git a/Detectors/TPC/calibration/src/CalculatedEdx.cxx b/Detectors/TPC/calibration/src/CalculatedEdx.cxx
index bbf5b0ca93128..396214775eb76 100644
--- a/Detectors/TPC/calibration/src/CalculatedEdx.cxx
+++ b/Detectors/TPC/calibration/src/CalculatedEdx.cxx
@@ -32,10 +32,10 @@ using namespace o2::tpc;
 
 CalculatedEdx::CalculatedEdx()
 {
-  std::vector<char> buffer;
+  gpu::aligned_unique_buffer_ptr<gpu::TPCFastTransformPOD> buffer;
   gpu::TPCFastTransformPOD::create(buffer, *TPCFastTransformHelperO2::instance()->create(0));
   mTPCCorrMapBuffer = std::move(buffer);
-  mTPCCorrMap = &gpu::TPCFastTransformPOD::get(mTPCCorrMapBuffer.data());
+  mTPCCorrMap = mTPCCorrMapBuffer.get();
 }
 
 void CalculatedEdx::setMembers(std::vector<o2::tpc::TPCClRefElem>* tpcTrackClIdxVecInput, const o2::tpc::ClusterNativeAccess& clIndex, std::vector<o2::tpc::TrackTPC>* vTPCTracksArrayInp)
diff --git a/Detectors/TPC/calibration/src/TrackDump.cxx b/Detectors/TPC/calibration/src/TrackDump.cxx
index 52cf7a4e3c7e3..72042a537dc5f 100644
--- a/Detectors/TPC/calibration/src/TrackDump.cxx
+++ b/Detectors/TPC/calibration/src/TrackDump.cxx
@@ -236,8 +236,8 @@ float TrackDump::ClusterNativeAdd::zc(float vertexTime) const
 void TrackDump::ClusterNativeAdd::loadCorrMaps(std::string_view corrMapFile, std::string_view corrMapFileRef)
 {
   auto fastTransformTmp = gpu::TPCFastTransform::loadFromFile(corrMapFile.data());
-  std::vector<char> buffer;
+  o2::gpu::aligned_unique_buffer_ptr<o2::gpu::TPCFastTransformPOD> buffer;
   gpu::TPCFastTransformPOD::create(buffer, *fastTransformTmp);
   corrMapBuffer = std::move(buffer);
-  corrMap = &gpu::TPCFastTransformPOD::get(corrMapBuffer.data());
+  corrMap = corrMapBuffer.get();
 }
diff --git a/Detectors/TPC/reconstruction/test/testGPUCATracking.cxx b/Detectors/TPC/reconstruction/test/testGPUCATracking.cxx
index 811f474d8491d..20660473f4c37 100644
--- a/Detectors/TPC/reconstruction/test/testGPUCATracking.cxx
+++ b/Detectors/TPC/reconstruction/test/testGPUCATracking.cxx
@@ -74,9 +74,9 @@ BOOST_AUTO_TEST_CASE(CATracking_test1)
   config.configWorkflow.outputs.set(gpudatatypes::InOutType::TPCMergedTracks);
 
   auto fastTransformTmp = TPCFastTransformHelperO2::instance()->create(0);
-  std::vector<char> fastTransformBuf;
+  aligned_unique_buffer_ptr<TPCFastTransformPOD> fastTransformBuf;
   TPCFastTransformPOD::create(fastTransformBuf, *fastTransformTmp);
-  config.configCalib.fastTransform = &TPCFastTransformPOD::get(fastTransformBuf.data());
+  config.configCalib.fastTransform = fastTransformBuf.get();
 
   auto dEdxCalibContainer = GPUO2InterfaceUtils::getCalibdEdxContainerDefault();
   config.configCalib.dEdxCalibContainer = dEdxCalibContainer.get();
diff --git a/Detectors/TPC/workflow/src/TPCScalerSpec.cxx b/Detectors/TPC/workflow/src/TPCScalerSpec.cxx
index 461963fcb261d..09ffa644520cf 100644
--- a/Detectors/TPC/workflow/src/TPCScalerSpec.cxx
+++ b/Detectors/TPC/workflow/src/TPCScalerSpec.cxx
@@ -219,7 +219,8 @@ class TPCScalerSpec : public Task
 
     Output corrMapOutput{header::gDataOriginTPC, "TPCCORRMAP", 0};
     auto outputBuffer = o2::pmr::vector<char>(pc.outputs().getMemoryResource(corrMapOutput));
-    auto* pod = TPCFastTransformPOD::create(outputBuffer, finalMap.getCorrection());
+    outputBuffer.resize(TPCFastTransformPOD::estimateSize(finalMap.getCorrection()));
+    auto* pod = TPCFastTransformPOD::create(outputBuffer.data(), outputBuffer.size(), finalMap.getCorrection());
     const auto& vd = mTPCVDriftHelper.getVDriftObject();
     o2::tpc::TPCFastTransformHelperO2::instance()->updateCalibration(*pod, 0, vd.corrFact, vd.refVDrift, vd.getTimeOffset());
     pc.outputs().adoptContainer(corrMapOutput, std::move(outputBuffer));
diff --git a/GPU/Common/CMakeLists.txt b/GPU/Common/CMakeLists.txt
index 8f7a7c2e169ed..45ca83a3033f0 100644
--- a/GPU/Common/CMakeLists.txt
+++ b/GPU/Common/CMakeLists.txt
@@ -13,6 +13,7 @@ set(MODULE GPUCommon)
 
 set(HDRS_INSTALL
     GPUCommonAlgorithm.h
+    GPUCommonAlignedAlloc.h
     GPUCommonDef.h
     GPUCommonDefAPI.h
     GPUCommonHelpers.h
diff --git a/GPU/Common/GPUCommonAlignedAlloc.h b/GPU/Common/GPUCommonAlignedAlloc.h
new file mode 100644
index 0000000000000..8e028399f4910
--- /dev/null
+++ b/GPU/Common/GPUCommonAlignedAlloc.h
@@ -0,0 +1,80 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUCommonAlignedAlloc.h
+/// \author David Rohr
+
+#ifndef GPUCOMMONALIGNEDALLOC_H
+#define GPUCOMMONALIGNEDALLOC_H
+
+#include <cstddef>
+#include <memory>
+#include <new>
+
+namespace o2::gpu
+{
+
+/// Deleter releasing raw storage through the aligned global ::operator delete.
+/// The storage must come from ::operator new(size, std::align_val_t(alignment())), not from a
+/// new[] expression, since allocation and deallocation functions have to match.
+template <typename T, std::size_t MIN_ALIGN = 0>
+struct alignedDeleter {
+  /// Effective alignment: the larger of MIN_ALIGN and alignof(T). Evaluated lazily, so T may be incomplete until first use.
+  static constexpr std::size_t alignment() noexcept
+  {
+    constexpr std::size_t align = (MIN_ALIGN > alignof(T)) ? MIN_ALIGN : alignof(T);
+    static_assert((align & (align - 1)) == 0, "MIN_ALIGN must be zero or a power of two");
+    return align;
+  }
+  void operator()(void* ptr) const noexcept { ::operator delete(ptr, std::align_val_t(alignment())); }
+};
+
+/// Allocator counterpart of alignedDeleter: storage is aligned to max(MIN_ALIGN, alignof(T)).
+template <typename T, std::size_t MIN_ALIGN = 0>
+struct alignedAllocator {
+  using value_type = T;
+  /// Allocates uninitialized storage for n objects of type T (n * sizeof(T) bytes).
+  T* allocate(std::size_t n)
+  {
+    return static_cast<T*>(::operator new(n * sizeof(T), std::align_val_t(alignedDeleter<T, MIN_ALIGN>::alignment())));
+  }
+  void deallocate(T* ptr, std::size_t) noexcept
+  {
+    alignedDeleter<T, MIN_ALIGN>()(ptr);
+  }
+};
+
+/// Owning handle for an aligned raw byte buffer that is accessed as an object of type T.
+/// Sizes are given in bytes; only raw storage is managed, no constructor or destructor of T is run here.
+template <typename T>
+struct aligned_unique_buffer_ptr : public std::unique_ptr<char[], alignedDeleter<T>> {
+  using base_type = std::unique_ptr<char[], alignedDeleter<T>>;
+  aligned_unique_buffer_ptr() = default;
+  aligned_unique_buffer_ptr(std::size_t n) { alloc(n); }
+  /// Adopts ptr, which is later released through alignedDeleter<T>.
+  aligned_unique_buffer_ptr(T* ptr) { base_type::reset(reinterpret_cast<char*>(ptr)); }
+  char* getraw() { return base_type::get(); }
+  const char* getraw() const { return base_type::get(); }
+  T* get() { return reinterpret_cast<T*>(base_type::get()); }
+  const T* get() const { return reinterpret_cast<const T*>(base_type::get()); }
+  T* operator->() { return get(); }
+  const T* operator->() const { return get(); }
+  /// Replaces the owned buffer by a new one of n bytes (the previous buffer is freed) and returns it.
+  T* alloc(std::size_t n)
+  {
+    base_type::reset(static_cast<char*>(::operator new(n, std::align_val_t(alignedDeleter<T>::alignment()))));
+    return get();
+  }
+};
+
+} // namespace o2::gpu
+
+#endif // GPUCOMMONALIGNEDALLOC_H
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index f6aa62778a061..37468477c3b7d 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -467,7 +467,7 @@ int32_t GPUReconstruction::Exit()
       if (mMemoryResources[i].mReuse >= 0) {
         continue;
       }
-      operator delete(mMemoryResources[i].mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
+      ::operator delete(mMemoryResources[i].mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
       mMemoryResources[i].mPtr = mMemoryResources[i].mPtrDevice = nullptr;
     }
   }
@@ -630,7 +630,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && (control == nullptr || control->useInternal())) {
     if (!(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) {
       if (res->mPtrDevice && res->mReuse < 0) {
-        operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
+        ::operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
       }
       res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize);
       if (res->mReuse >= 0) {
@@ -640,7 +640,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
         }
         res->mPtrDevice = mMemoryResources[res->mReuse].mPtrDevice;
       } else {
-        res->mPtrDevice = operator new(res->mSize + GPUCA_BUFFER_ALIGNMENT, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
+        res->mPtrDevice = ::operator new(res->mSize + GPUCA_BUFFER_ALIGNMENT, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
       }
       res->mPtr = GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(res->mPtrDevice);
       res->SetPointers(res->mPtr);
@@ -733,9 +733,9 @@ void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
-    char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
+    char* retVal = static_cast<char*>(::operator new(size, std::align_val_t(GPUCA_BUFFER_ALIGNMENT))); // raw aligned operator new: the deleter releases with the non-array ::operator delete
     if ((type & GPUMemoryResource::MEMORY_STACK)) {
-      mNonPersistentIndividualDirectAllocations.emplace_back(retVal, alignedDeleter());
+      mNonPersistentIndividualDirectAllocations.emplace_back(retVal, alignedDefaultBufferDeleter());
     } else {
-      mDirectMemoryChunks.emplace_back(retVal, alignedDeleter());
+      mDirectMemoryChunks.emplace_back(retVal, alignedDefaultBufferDeleter());
     }
     return retVal;
   }
@@ -798,7 +798,7 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
   }
-  char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
+  char* retVal = static_cast<char*>(::operator new(size, std::align_val_t(GPUCA_BUFFER_ALIGNMENT))); // raw aligned operator new: the deleter releases with the non-array ::operator delete
   stdspinlock spinlock(mMemoryMutex);
-  mVolatileChunks.emplace_back(retVal, alignedDeleter());
+  mVolatileChunks.emplace_back(retVal, alignedDefaultBufferDeleter());
   return retVal;
 }
 
@@ -876,7 +876,7 @@ void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res)
     std::cout << "Freeing " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
   }
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && res->mReuse < 0) {
-    operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
+    ::operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
   }
   res->mPtr = nullptr;
   res->mPtrDevice = nullptr;
@@ -916,7 +916,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag, cons
       std::cout << "Freeing NonPersistent " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
     }
     if (res->mReuse < 0) {
-      operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
+      ::operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
     }
     res->mPtr = nullptr;
     res->mPtrDevice = nullptr;
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index 9a337c02ad26d..21195af1d4a89 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -32,6 +32,7 @@
 #include "GPUOutputControl.h"
 #include "GPUParam.h"
 #include "GPUConstantMem.h"
+#include "GPUCommonAlignedAlloc.h"
 #include "GPUDef.h"
 
 namespace o2::its
@@ -381,15 +382,13 @@ class GPUReconstruction
     GPUProcessor* proc = nullptr;
     std::vector<uint16_t> res;
   };
-  struct alignedDeleter {
-    void operator()(void* ptr) { ::operator delete[](ptr, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); };
-  };
   std::unordered_map<GPUMemoryReuse::ID, MemoryReuseMeta> mMemoryReuse1to1;
   std::vector<std::tuple<void*, void*, size_t, size_t, uint64_t>> mNonPersistentMemoryStack; // hostPoolAddress, devicePoolAddress, individualAllocationCount, directIndividualAllocationCound, tag
   std::vector<GPUMemoryResource*> mNonPersistentIndividualAllocations;
-  std::vector<std::unique_ptr<char[], alignedDeleter>> mNonPersistentIndividualDirectAllocations;
-  std::vector<std::unique_ptr<char[], alignedDeleter>> mDirectMemoryChunks;
-  std::vector<std::unique_ptr<char[], alignedDeleter>> mVolatileChunks;
+  using alignedDefaultBufferDeleter = alignedDeleter<char, GPUCA_BUFFER_ALIGNMENT>;
+  std::vector<std::unique_ptr<char[], alignedDefaultBufferDeleter>> mNonPersistentIndividualDirectAllocations;
+  std::vector<std::unique_ptr<char[], alignedDefaultBufferDeleter>> mDirectMemoryChunks;
+  std::vector<std::unique_ptr<char[], alignedDefaultBufferDeleter>> mVolatileChunks;
   std::atomic_flag mMemoryMutex = ATOMIC_FLAG_INIT;
 
   std::unique_ptr<GPUReconstructionPipelineContext> mPipelineContext;
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index 752b5f27ded3f..6dd38c4c4d6b7 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -189,7 +189,7 @@ int32_t GPUReconstructionCPU::InitDevice()
       if (mDeviceMemorySize > mHostMemorySize) {
         mHostMemorySize = mDeviceMemorySize;
       }
-      mHostMemoryBase = operator new(mHostMemorySize, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
+      mHostMemoryBase = ::operator new(mHostMemorySize, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
     }
     mHostMemoryPermanent = mHostMemoryBase;
     ClearAllocatedMemory();
@@ -205,7 +205,7 @@ int32_t GPUReconstructionCPU::ExitDevice()
 {
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
     if (mMaster == nullptr) {
-      operator delete(mHostMemoryBase, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
+      ::operator delete(mHostMemoryBase, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
     }
     mHostMemoryPool = mHostMemoryBase = mHostMemoryPoolEnd = mHostMemoryPermanent = nullptr;
     mHostMemorySize = 0;
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index d669f60356101..c5e9fd7630295 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -557,10 +557,10 @@ void GPUChainTracking::AllocateIOMemory()
   AllocateIOMemoryHelper(mIOPtrs.nTRDTriggerRecords, mIOPtrs.trdTrackletIdxFirst, mIOMem.trdTrackletIdxFirst);
 }
 
-void GPUChainTracking::SetTPCFastTransform(std::unique_ptr<TPCFastTransformPOD>&& tpcFastTransform)
+void GPUChainTracking::SetTPCFastTransform(aligned_unique_buffer_ptr<TPCFastTransformPOD>&& tpcFastTransform)
 {
   mTPCFastTransformU = std::move(tpcFastTransform);
   processors()->calibObjects.fastTransform = mTPCFastTransformU.get();
 }
 
 void GPUChainTracking::SetMatLUT(std::unique_ptr<o2::base::MatLayerCylSet>&& lut)
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 2af33f86ab0d7..ccc864e422065 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -18,6 +18,7 @@
 #include "GPUChain.h"
 #include "GPUDataTypesIO.h"
 #include "GPUDataTypesConfig.h"
+#include "GPUCommonAlignedAlloc.h"
 #include <atomic>
 #include <mutex>
 #include <functional>
@@ -182,7 +183,7 @@ class GPUChainTracking : public GPUChain
   const GPUTRDRecoParam* GetTRDRecoParam() const;
   const o2::base::Propagator* GetO2Propagator() const;
   const o2::base::Propagator* GetDeviceO2Propagator();
-  void SetTPCFastTransform(std::unique_ptr<TPCFastTransformPOD>&& tpcFastTransform);
+  void SetTPCFastTransform(aligned_unique_buffer_ptr<TPCFastTransformPOD>&& tpcFastTransform);
   void SetMatLUT(std::unique_ptr<o2::base::MatLayerCylSet>&& lut);
   void SetTRDGeometry(std::unique_ptr<o2::trd::GeometryFlat>&& geo);
   void SetTRDRecoParam(std::unique_ptr<GPUTRDRecoParam>&& par);
@@ -260,7 +261,7 @@ class GPUChainTracking : public GPUChain
   std::unique_ptr<GPUTPCClusterStatistics> mCompressionStatistics;
 
   // Ptr to detector / calibration objects
-  std::unique_ptr<TPCFastTransformPOD> mTPCFastTransformU;           // Global TPC fast transformation object
+  aligned_unique_buffer_ptr<TPCFastTransformPOD> mTPCFastTransformU; // Global TPC fast transformation object
   std::unique_ptr<TPCPadGainCalib> mTPCPadGainCalibU;                // TPC gain calibration and cluster finder parameters
   std::unique_ptr<TPCZSLinkMapping> mTPCZSLinkMappingU;              // TPC Mapping data required by ZS Link decoder
   std::unique_ptr<o2::tpc::CalibdEdxContainer> mdEdxCalibContainerU; // TPC dEdx calibration container
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
index 21bc9a66eac0c..0a879db818c8f 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
@@ -335,7 +335,7 @@ void GPUChainTracking::ReadSettings(const char* dir)
   f = dir;
   f += "tpctransform.dump";
   mTPCFastTransformU = ReadStructFromFile<TPCFastTransformPOD>(f.c_str());
-  processors()->calibObjects.fastTransform = mTPCFastTransformU.get();
+  processors()->calibObjects.fastTransform = (TPCFastTransformPOD*)mTPCFastTransformU.get();
   f = dir;
   f += "tpcpadgaincalib.dump";
   mTPCPadGainCalibU = ReadStructFromFile<TPCPadGainCalib>(f.c_str());
diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
index 93ab5523c6cec..7bc09f81ce2df 100644
--- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
+++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -78,7 +78,7 @@ GPUChainITS *chainITS, *chainITSAsync, *chainITSPipeline;
 std::string eventsDir;
 void unique_ptr_aligned_delete(char* v)
 {
-  operator delete(v, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
+  ::operator delete(v, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
 }
 std::unique_ptr<char, void (*)(char*)> outputmemory(nullptr, unique_ptr_aligned_delete), outputmemoryPipeline(nullptr, unique_ptr_aligned_delete), inputmemory(nullptr, unique_ptr_aligned_delete);
 std::unique_ptr<GPUDisplayFrontendInterface> eventDisplay;
@@ -239,20 +239,20 @@ int32_t ReadConfiguration(int argc, char** argv)
 
   if (configStandalone.outputcontrolmem) {
     bool forceEmptyMemory = getenv("LD_PRELOAD") && strstr(getenv("LD_PRELOAD"), "valgrind") != nullptr;
-    outputmemory.reset((char*)operator new(configStandalone.outputcontrolmem, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)));
+    outputmemory.reset((char*)::operator new(configStandalone.outputcontrolmem, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)));
     if (forceEmptyMemory) {
       printf("Valgrind detected, emptying GPU output memory to avoid false positive undefined reads");
       memset(outputmemory.get(), 0, configStandalone.outputcontrolmem);
     }
     if (configStandalone.proc.doublePipeline) {
-      outputmemoryPipeline.reset((char*)operator new(configStandalone.outputcontrolmem, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)));
+      outputmemoryPipeline.reset((char*)::operator new(configStandalone.outputcontrolmem, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)));
       if (forceEmptyMemory) {
         memset(outputmemoryPipeline.get(), 0, configStandalone.outputcontrolmem);
       }
     }
   }
   if (configStandalone.inputcontrolmem) {
-    inputmemory.reset((char*)operator new(configStandalone.inputcontrolmem, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)));
+    inputmemory.reset((char*)::operator new(configStandalone.inputcontrolmem, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)));
   }
 
   configStandalone.proc.showOutputStat = true;
diff --git a/GPU/TPCFastTransformation/TPCFastTransformPOD.cxx b/GPU/TPCFastTransformation/TPCFastTransformPOD.cxx
index 295b3bb19431e..dc288aeb4b608 100644
--- a/GPU/TPCFastTransformation/TPCFastTransformPOD.cxx
+++ b/GPU/TPCFastTransformation/TPCFastTransformPOD.cxx
@@ -25,7 +25,31 @@ namespace o2
 namespace gpu
 {
 
-#if !defined(GPUCA_NO_ROOT) && !defined(GPUCA_NO_FMT) && !defined(GPUCA_STANDALONE)
+#if !defined(GPUCA_NO_ROOT) && !defined(GPUCA_NO_FMT) && !defined(GPUCA_STANDALONE) && !defined(GPUCA_GPUCODE)
+
+/// Create POD transform from old flat-buffer one. destVector is (re)allocated to the estimated size and owns the returned object
+TPCFastTransformPOD* TPCFastTransformPOD::create(aligned_unique_buffer_ptr<TPCFastTransformPOD>& destVector, const TPCFastTransform& src)
+{
+  size_t size = estimateSize(src);
+  destVector.alloc(size); // allocate exact size, any buffer previously held by destVector is released
+  LOGP(debug, "OrigCorrSize:{} SelfSize: {} Estimated POS size: {}", src.getCorrection().getFlatBufferSize(), sizeof(TPCFastTransformPOD), size);
+  auto res = create(destVector.getraw(), size, src); // delegate to the raw-buffer overload, operating on the freshly allocated storage
+  res->setTimeStamp(src.getTimeStamp()); // the setters below copy the corresponding values over from src
+  res->setVDrift(src.getVDrift());
+  res->setT0(src.getT0());
+  res->setLumi(src.getLumi());
+  res->setIDC(src.getIDC());
+  return res;
+}
+
+TPCFastTransformPOD* TPCFastTransformPOD::create(aligned_unique_buffer_ptr<TPCFastTransformPOD>& destVector, const TPCFastSpaceChargeCorrection& origCorr)
+{
+  // create filling only part corresponding to TPCFastSpaceChargeCorrection. Data members coming from TPCFastTransform (e.g. VDrift, T0..) are not set
+  size_t size = estimateSize(origCorr);
+  destVector.alloc(size); // allocate exact size, any buffer previously held by destVector is released
+  LOGP(debug, "OrigCorrSize:{} SelfSize: {} Estimated POS size: {}", origCorr.getFlatBufferSize(), sizeof(TPCFastTransformPOD), size);
+  return create(destVector.getraw(), size, origCorr); // delegate to the raw-buffer overload; destVector keeps ownership of the storage
+}
 
 size_t TPCFastTransformPOD::estimateSize(const TPCFastSpaceChargeCorrection& origCorr)
 {
diff --git a/GPU/TPCFastTransformation/TPCFastTransformPOD.h b/GPU/TPCFastTransformation/TPCFastTransformPOD.h
index 347fc23ec954f..0fa4c2a251932 100644
--- a/GPU/TPCFastTransformation/TPCFastTransformPOD.h
+++ b/GPU/TPCFastTransformation/TPCFastTransformPOD.h
@@ -19,6 +19,11 @@
 
 #include "GPUCommonRtypes.h"
 #include "TPCFastTransform.h"
+#ifndef GPUCA_GPUCODE
+#include <memory>
+#include <cstdlib>
+#include "GPUCommonAlignedAlloc.h"
+#endif
 
 /*
 Binary buffer should be cast to TPCFastTransformPOD class using static TPCFastTransformPOD& t = get(buffer); method,
@@ -209,20 +214,23 @@ class TPCFastTransformPOD
 
 #if !defined(GPUCA_GPUCODE)
   /// Create POD transform from old flat-buffer one. Provided vector will serve as a buffer
-  template <typename V>
-  static TPCFastTransformPOD* create(V& destVector, const TPCFastTransform& src);
+  static TPCFastTransformPOD* create(aligned_unique_buffer_ptr<TPCFastTransformPOD>& destVector, const TPCFastTransform& src);
 
   /// create filling only part corresponding to TPCFastSpaceChargeCorrection. Data members coming from TPCFastTransform (e.g. VDrift, T0..) are not set
-  template <typename V>
-  static TPCFastTransformPOD* create(V& destVector, const TPCFastSpaceChargeCorrection& src);
+  static TPCFastTransformPOD* create(aligned_unique_buffer_ptr<TPCFastTransformPOD>& destVector, const TPCFastSpaceChargeCorrection& src);
 
-  static TPCFastTransformPOD* create(std::vector<char>& buf, const TPCFastTransformPOD& src)
+  static TPCFastTransformPOD* create(aligned_unique_buffer_ptr<TPCFastTransformPOD>& destVector, const TPCFastTransformPOD& src)
   {
-    buf.resize(src.size());
-    std::memcpy(buf.data(), &src, src.size());
-    return reinterpret_cast<TPCFastTransformPOD*>(buf.data());
+    destVector.alloc(src.size());
+    std::memcpy(destVector.get(), &src, src.size());
+    return destVector.get();
   }
 
+  static TPCFastTransformPOD* create(char* buff, size_t buffSize, const TPCFastTransform& src);
+  static TPCFastTransformPOD* create(char* buff, size_t buffSize, const TPCFastSpaceChargeCorrection& src);
+  static size_t estimateSize(const TPCFastTransform& src) { return estimateSize(src.getCorrection()); }
+  static size_t estimateSize(const TPCFastSpaceChargeCorrection& origCorr);
+
   bool test(const TPCFastTransform& src, int32_t npoints = 100000) const { return test(src.getCorrection(), npoints); }
   bool test(const TPCFastSpaceChargeCorrection& origCorr, int32_t npoints = 100000) const;
 #endif
@@ -244,10 +252,6 @@ class TPCFastTransformPOD
     auto res = offs % AlignmentBytes;
     return res ? offs + (AlignmentBytes - res) : offs;
   }
-  static size_t estimateSize(const TPCFastTransform& src) { return estimateSize(src.getCorrection()); }
-  static size_t estimateSize(const TPCFastSpaceChargeCorrection& origCorr);
-  static TPCFastTransformPOD* create(char* buff, size_t buffSize, const TPCFastTransform& src);
-  static TPCFastTransformPOD* create(char* buff, size_t buffSize, const TPCFastSpaceChargeCorrection& src);
   GPUd() static TPCFastTransformPOD& getNonConst(char* head) { return *reinterpret_cast<TPCFastTransformPOD*>(head); }
 #endif
 
@@ -421,37 +425,6 @@ GPUdi() bool TPCFastTransformPOD::isRealLocalInsideGrid(int32_t sector, int32_t
   return true;
 }
 
-#if !defined(GPUCA_GPUCODE)
-/// Create POD transform from old flat-buffer one. Provided vector will serve as a buffer
-template <typename V>
-TPCFastTransformPOD* TPCFastTransformPOD::create(V& destVector, const TPCFastTransform& src)
-{
-  const auto& origCorr = src.getCorrection();
-  size_t estSize = estimateSize(src);
-  destVector.resize(estSize); // allocate exact size
-  LOGP(debug, "OrigCorrSize:{} SelfSize: {} Estimated POS size: {}", src.getCorrection().getFlatBufferSize(), sizeof(TPCFastTransformPOD), estSize);
-  char* base = destVector.data();
-  auto res = create(destVector.data(), destVector.size(), src);
-  res->setTimeStamp(src.getTimeStamp());
-  res->setVDrift(src.getVDrift());
-  res->setT0(src.getT0());
-  res->setLumi(src.getLumi());
-  res->setIDC(src.getIDC());
-  return res;
-}
-
-template <typename V>
-TPCFastTransformPOD* TPCFastTransformPOD::create(V& destVector, const TPCFastSpaceChargeCorrection& origCorr)
-{
-  // create filling only part corresponding to TPCFastSpaceChargeCorrection. Data members coming from TPCFastTransform (e.g. VDrift, T0..) are not set
-  size_t estSize = estimateSize(origCorr);
-  destVector.resize(estSize); // allocate exact size
-  LOGP(debug, "OrigCorrSize:{} SelfSize: {} Estimated POS size: {}", origCorr.getFlatBufferSize(), sizeof(TPCFastTransformPOD), estSize);
-  char* base = destVector.data();
-  return create(destVector.data(), destVector.size(), origCorr);
-}
-#endif
-
 GPUdi() void TPCFastTransformPOD::TransformLocal(int32_t sector, int32_t row, float& x, float& y, float& z) const
 {
   if (!mApplyCorrection) {
diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h
index e827c2cf0d256..170df79c95981 100644
--- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h
+++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h
@@ -22,6 +22,7 @@
 #include "Framework/ConcreteDataMatcher.h"
 #include "Framework/InitContext.h"
 #include "Framework/CompletionPolicy.h"
+#include "GPUCommonAlignedAlloc.h"
 #include "Algorithm/Parser.h"
 #include <string>
 #include <array>
@@ -154,13 +155,10 @@ class GPURecoWorkflowSpec : public o2::framework::Task
  private:
   struct calibObjectStruct {
     std::vector<char> mUpdatedTransformBuffer;
-    const TPCFastTransformPOD* mFastTransform{nullptr};
     std::unique_ptr<TPCPadGainCalib> mTPCPadGainCalib;
     std::unique_ptr<o2::tpc::CalibdEdxContainer> mdEdxCalibContainer;
     float mInstLumiCTP{-1};
-    // #if !defined(GPUCA_GPUCODE_DEVICE)
-    std::vector<char> mCorrMapBuffer;
-    // #endif
+    aligned_unique_buffer_ptr<TPCFastTransformPOD> mFastTransformBuffer;
   };
 
   /// initialize TPC options from command line
diff --git a/GPU/Workflow/include/GPUWorkflow/O2GPUDPLDisplay.h b/GPU/Workflow/include/GPUWorkflow/O2GPUDPLDisplay.h
index cd9752053d7bb..517e82480565b 100644
--- a/GPU/Workflow/include/GPUWorkflow/O2GPUDPLDisplay.h
+++ b/GPU/Workflow/include/GPUWorkflow/O2GPUDPLDisplay.h
@@ -12,6 +12,7 @@
 #ifndef O2_GPU_DPL_DISPLAY_H
 #define O2_GPU_DPL_DISPLAY_H
 
+#include "GPUCommonAlignedAlloc.h"
 #include "ReconstructionDataFormats/GlobalTrackID.h"
 #include "Framework/Task.h"
 #include <memory>
@@ -59,7 +60,7 @@ class O2GPUDPLDisplaySpec : public o2::framework::Task
   bool mGRPGeomUpdated = false;
   bool mAutoContinuousMaxTimeBin = false;
   bool mGeometryCreated = false;
-  std::vector<char> mBufferFastTransform;
+  aligned_unique_buffer_ptr<TPCFastTransformPOD> mBufferFastTransform;
   o2::dataformats::GlobalTrackID::mask_t mTrkMask;
   o2::dataformats::GlobalTrackID::mask_t mClMask;
   std::unique_ptr<GPUO2InterfaceDisplay> mDisplay;
diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx
index 96bfeec3531ad..be1063fcb915b 100644
--- a/GPU/Workflow/src/GPUWorkflowSpec.cxx
+++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx
@@ -251,7 +251,7 @@ void GPURecoWorkflowSpec::init(InitContext& ic)
     // initialize TPC calib objects
     initFunctionTPCCalib(ic);
 
-    mConfig->configCalib.fastTransform = mCalibObjects.mFastTransform;
+    mConfig->configCalib.fastTransform = mCalibObjects.mFastTransformBuffer.get();
     // mConfig->configCalib.buffer = mCalibObjects.mBuffer; // TODO WRONG
     if (mConfig->configCalib.fastTransform == nullptr) {
       throw std::invalid_argument("GPU workflow: initialization of the TPC transformation failed");
diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx
index cfb65223a12f5..37b2caee29a4c 100644
--- a/GPU/Workflow/src/GPUWorkflowTPC.cxx
+++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx
@@ -105,10 +105,7 @@ void GPURecoWorkflowSpec::initFunctionTPCCalib(InitContext& ic)
   mCalibObjects.mdEdxCalibContainer.reset(new o2::tpc::CalibdEdxContainer());
   mTPCVDriftHelper.reset(new o2::tpc::VDriftHelper());
 
-  std::vector<char> buffer;
-  gpu::TPCFastTransformPOD::create(buffer, *o2::tpc::TPCFastTransformHelperO2::instance()->create(0));
-  mCalibObjects.mCorrMapBuffer = std::move(buffer);
-  mCalibObjects.mFastTransform = &TPCFastTransformPOD::get(mCalibObjects.mCorrMapBuffer.data());
+  gpu::TPCFastTransformPOD::create(mCalibObjects.mFastTransformBuffer, *o2::tpc::TPCFastTransformHelperO2::instance()->create(0));
 
   if (mConfParam->dEdxDisableTopologyPol) {
     LOGP(info, "Disabling loading of track topology correction using polynomials from CCDB");
@@ -348,16 +345,13 @@ bool GPURecoWorkflowSpec::fetchCalibsCCDBTPC<GPUCalibObjectsConst>(ProcessingCon
         mCalibObjects.mInstLumiCTP = pc.inputs().get<float>("lumiCTP");
 
         // get the raw buffer and reinterpret as TPCFastTransformPOD
-        oldCalibObjects.mFastTransform = mCalibObjects.mFastTransform;            // save OLD pointer ✓
-        oldCalibObjects.mCorrMapBuffer = std::move(mCalibObjects.mCorrMapBuffer); // OLD buffer alive ✓
-
+        oldCalibObjects.mFastTransformBuffer = std::move(mCalibObjects.mFastTransformBuffer); // OLD buffer alive ✓
         auto const& raw = pc.inputs().get<const char*>("corrMap");
         const auto* newMap = &gpu::TPCFastTransformPOD::get(raw); // NEW map from DPL
-        std::vector<char> buffer(newMap->size());
-        std::memcpy(buffer.data(), newMap, buffer.size()); // copy NEW map ✓
-        mCalibObjects.mCorrMapBuffer = std::move(buffer);
-        mCalibObjects.mFastTransform = &TPCFastTransformPOD::get(mCalibObjects.mCorrMapBuffer.data());
-        newCalibObjects.fastTransform = mCalibObjects.mFastTransform;
+        aligned_unique_buffer_ptr<TPCFastTransformPOD> buffer(newMap->size());
+        std::memcpy(buffer.get(), newMap, newMap->size()); // copy NEW map ✓
+        mCalibObjects.mFastTransformBuffer = std::move(buffer);
+        newCalibObjects.fastTransform = mCalibObjects.mFastTransformBuffer.get();
         mustUpdate = true;
       }
       if (mTPCVDriftHelper->isUpdated()) {
diff --git a/GPU/Workflow/src/O2GPUDPLDisplay.cxx b/GPU/Workflow/src/O2GPUDPLDisplay.cxx
index 06b7511164fbb..798744f31abbe 100644
--- a/GPU/Workflow/src/O2GPUDPLDisplay.cxx
+++ b/GPU/Workflow/src/O2GPUDPLDisplay.cxx
@@ -65,10 +65,10 @@ void O2GPUDPLDisplaySpec::init(InitContext& ic)
   mConfig->configGRP.solenoidBzNominalGPU = 0;
   mConfParam.reset(new GPUSettingsO2(mConfig->ReadConfigurableParam()));
 
-  std::vector<char> buffer;
+  aligned_unique_buffer_ptr<TPCFastTransformPOD> buffer;
   gpu::TPCFastTransformPOD::create(buffer, *TPCFastTransformHelperO2::instance()->create(0));
   mBufferFastTransform = std::move(buffer);
-  mFastTransform = &TPCFastTransformPOD::get(mBufferFastTransform.data());
+  mFastTransform = mBufferFastTransform.get();
   mConfig->configCalib.fastTransform = mFastTransform;
 
   mTrdGeo.reset(new o2::trd::GeometryFlat());
diff --git a/macro/runTPCRefit.C b/macro/runTPCRefit.C
index a495ffe5987e3..9e5674ad66edb 100644
--- a/macro/runTPCRefit.C
+++ b/macro/runTPCRefit.C
@@ -51,7 +51,7 @@ int runTPCRefit(TString trackFile = "tpctracks.root", TString clusterFile = "tpc
   Propagator::initFieldFromGRP(NameConf::getGRPFileName());
   const auto grp = o2::parameters::GRPObject::loadFrom("o2sim_grp.root");
   float bz = 5.00668f * grp->getL3Current() / 30000.;
-  std::vector<char> buffer;
+  aligned_unique_buffer_ptr<TPCFastTransformPOD> buffer; // owns the aligned storage filled by TPCFastTransformPOD::create() below
   o2::gpu::TPCFastTransformPOD::create(buffer, *TPCFastTransformHelperO2::instance()->create(0));
   const TPCFastTransformPOD corrMap = o2::gpu::TPCFastTransformPOD::get(buffer.data());
   auto* prop = Propagator::Instance();

From 939afbd4e6874802e574235250b69d69fba21fa0 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Fri, 3 Apr 2026 23:26:32 +0200
Subject: [PATCH 2/3] GPU Standalone: Add dumping and reading of dynamic
 structs with larger buffer than sizeof(struct)

---
 GPU/GPUTracking/Base/GPUReconstruction.h      |  4 ++
 GPU/GPUTracking/Base/GPUReconstructionIO.h    | 51 +++++++++++++++++++
 GPU/GPUTracking/Global/GPUChain.h             | 10 ++++
 GPU/GPUTracking/Global/GPUChainTrackingIO.cxx |  6 +--
 4 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index 21195af1d4a89..c8e162a14c870 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -300,9 +300,13 @@ class GPUReconstruction
   template <class T>
   void DumpStructToFile(const T* obj, const char* file);
   template <class T>
+  void DumpDynamicStructToFile(const T* obj, size_t dynamicSize, const char* file);
+  template <class T>
   std::unique_ptr<T> ReadStructFromFile(const char* file);
   template <class T>
   int32_t ReadStructFromFile(const char* file, T* obj);
+  template <class T, auto F>
+  aligned_unique_buffer_ptr<T> ReadDynamicStructFromFile(const char* file);
 
   // Others
   virtual RecoStepField AvailableGPURecoSteps() { return RecoStep::AllRecoSteps; }
diff --git a/GPU/GPUTracking/Base/GPUReconstructionIO.h b/GPU/GPUTracking/Base/GPUReconstructionIO.h
index 810ebfffe1703..c6c15462bc29e 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionIO.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionIO.h
@@ -208,6 +208,57 @@ inline int32_t GPUReconstruction::ReadStructFromFile(const char* file, T* obj)
   return 0;
 }
 
+// Writes obj to file as [sizeof(T)][dynamicSize][dynamicSize raw bytes starting at obj].
+// dynamicSize is the number of bytes dumped starting at obj; ReadDynamicStructFromFile rejects files where it is smaller than sizeof(T).
+template <class T>
+inline void GPUReconstruction::DumpDynamicStructToFile(const T* obj, size_t dynamicSize, const char* file)
+{
+  if (FILE* out = fopen(file, "w+b")) { // nothing is written, and no error is reported, if the file cannot be opened
+    const size_t staticSize = sizeof(T); // same value as sizeof(*obj)
+    fwrite(&staticSize, sizeof(staticSize), 1, out);
+    fwrite(&dynamicSize, sizeof(dynamicSize), 1, out);
+    fwrite(obj, 1, dynamicSize, out);
+    fclose(out);
+  }
+}
+
+// Reads an object written by DumpDynamicStructToFile: [sizeof(T)][total size][total size bytes beginning with the T header].
+// F is the member function of T reporting the total size. Returns nullptr if the file cannot be opened, throws std::runtime_error on invalid or truncated content.
+template <class T, auto F>
+inline aligned_unique_buffer_ptr<T> GPUReconstruction::ReadDynamicStructFromFile(const char* file)
+{
+  std::unique_ptr<FILE, int (*)(FILE*)> fp(fopen(file, "rb"), &fclose); // closes the file on every exit path, including exceptions
+  if (fp == nullptr) {
+    return nullptr;
+  }
+  size_t size = 0, dynsize = 0; // zero-initialized so the error message never prints indeterminate values
+  if (fread(&size, sizeof(size), 1, fp.get()) != 1 || fread(&dynsize, sizeof(dynsize), 1, fp.get()) != 1 || size != sizeof(T) || dynsize < size) {
+    GPUError("ERROR reading %s, invalid size: %ld (%ld buffer size, %ld object size expected)", file, (int64_t)size, (int64_t)dynsize, (int64_t)sizeof(T));
+    throw std::runtime_error("invalid size");
+  }
+  std::unique_ptr<T> tmp = std::make_unique<T>(); // static part is read first so the object can confirm the buffer size before the buffer is allocated
+  if (fread(tmp.get(), sizeof(T), 1, fp.get()) != 1) {
+    GPUError("ERROR reading %s", file);
+    throw std::runtime_error("read error");
+  }
+  if ((tmp.get()->*F)() != dynsize) {
+    GPUError("ERROR reading %s, invalid size: %ld (%ld expected)", file, (int64_t)dynsize, (int64_t)(tmp.get()->*F)());
+    throw std::runtime_error("invalid size");
+  }
+  aligned_unique_buffer_ptr<T> newObj(dynsize);
+  memcpy(newObj.get(), tmp.get(), sizeof(T));
+  const size_t tailSize = dynsize - sizeof(T);
+  const size_t r = fread(newObj.getraw() + sizeof(T), 1, tailSize, fp.get());
+  if (r != tailSize) { // a truncated file would otherwise yield a partially filled object
+    GPUError("ERROR reading %s, file truncated: %ld of %ld dynamic bytes read", file, (int64_t)r, (int64_t)tailSize);
+    throw std::runtime_error("read error");
+  }
+  if (GetProcessingSettings().debugLevel >= 2) {
+    GPUInfo("Read %ld bytes from %s", (int64_t)r, file);
+  }
+  return newObj;
+}
+
 } // namespace o2::gpu
 
 #endif
diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h
index a524fd9ec3992..61107f7893e9c 100644
--- a/GPU/GPUTracking/Global/GPUChain.h
+++ b/GPU/GPUTracking/Global/GPUChain.h
@@ -176,6 +176,16 @@ class GPUChain
   {
     mRec->ReadStructFromFile<T>(file, obj);
   }
+  template <class T>
+  void DumpDynamicStructToFile(const T* obj, size_t dynamicSize, const char* file)
+  {
+    mRec->DumpDynamicStructToFile<T>(obj, dynamicSize, file); // thin forwarder: all arguments are passed unchanged to mRec
+  }
+  template <class T, auto F>
+  aligned_unique_buffer_ptr<T> ReadDynamicStructFromFile(const char* file)
+  {
+    return mRec->ReadDynamicStructFromFile<T, F>(file); // thin forwarder: template arguments and file name are passed unchanged to mRec
+  }
 
   template <class S, int32_t I = 0, typename... Args>
     requires(sizeof(S) >= 0) // Yields better incomplete type errors than calling runKernelCallInterface directly
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
index 0a879db818c8f..f81f29b45c317 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
@@ -295,7 +295,7 @@ void GPUChainTracking::DumpSettings(const char* dir)
   if (processors()->calibObjects.fastTransform != nullptr) {
     f = dir;
     f += "tpctransform.dump";
-    DumpStructToFile(processors()->calibObjects.fastTransform, f.c_str());
+    DumpDynamicStructToFile(processors()->calibObjects.fastTransform, processors()->calibObjects.fastTransform->size(), f.c_str());
   }
   if (processors()->calibObjects.tpcPadGain != nullptr) {
     f = dir;
@@ -334,8 +334,8 @@ void GPUChainTracking::ReadSettings(const char* dir)
   std::string f;
   f = dir;
   f += "tpctransform.dump";
-  mTPCFastTransformU = ReadStructFromFile<TPCFastTransformPOD>(f.c_str());
-  processors()->calibObjects.fastTransform = (TPCFastTransformPOD*)mTPCFastTransformU.get();
+  mTPCFastTransformU = ReadDynamicStructFromFile<TPCFastTransformPOD, &TPCFastTransformPOD::size>(f.c_str());
+  processors()->calibObjects.fastTransform = mTPCFastTransformU.get();
   f = dir;
   f += "tpcpadgaincalib.dump";
   mTPCPadGainCalibU = ReadStructFromFile<TPCPadGainCalib>(f.c_str());

From 5a93e74ebd3b195037c26c91f47e1bc51363c115 Mon Sep 17 00:00:00 2001
From: ALICE Action Bot <alibuild@cern.ch>
Date: Sat, 4 Apr 2026 09:10:08 +0000
Subject: [PATCH 3/3] Please consider the following formatting changes

---
 Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h b/Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h
index 4eae504db7220..adbf3ecf5a299 100644
--- a/Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h
+++ b/Detectors/TPC/calibration/include/TPCCalibration/TrackDump.h
@@ -78,7 +78,7 @@ class TrackDump
     float zc(float vertexTime = 0) const;
 
     inline static o2::gpu::aligned_unique_buffer_ptr<o2::gpu::TPCFastTransformPOD> corrMapBuffer; // buffer for owning the correction map in case of update during runtime
-    inline static const o2::gpu::TPCFastTransformPOD* corrMap{nullptr}; // local copy of the correction map for quick access to the transform functions
+    inline static const o2::gpu::TPCFastTransformPOD* corrMap{nullptr};                           // local copy of the correction map for quick access to the transform functions
     static void loadCorrMaps(std::string_view corrMapFile, std::string_view corrMapFileRef = "");
     ClassDefNV(ClusterNativeAdd, 1);
   };