QuEST-Kit · otbrown · Apr 24, 2026 · Apr 24, 2026 · May 4, 2026 · May 4, 2026
diff --git a/quest/include/environment.h b/quest/include/environment.h
@@ -83,6 +83,14 @@ int isQuESTEnvInit();
 QuESTEnv getQuESTEnv();
 
 
+/** @notyetdoced
+ * Control of the number of GPU threads per block.
+ * These functions are declared alongside the environment API for want of a better existing home. They only
+ * apply to GPU execution; OpenMP users can instead export OMP_NUM_THREADS or call omp_set_num_threads().
+ */
+int getQuESTNumGpuThreadsPerBlock();
+void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);
+
 
 // end de-mangler
 #ifdef __cplusplus

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
@@ -492,22 +492,34 @@ void reportQuESTEnv() {
 void getEnvironmentString(char str[200]) {
     validate_envIsInit(__func__);
 
-    QuESTEnv env = getQuESTEnv();
-
     int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
-    int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
-    int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
+    int cuQuantum = globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled();
+    int gpuDirect = globalEnvPtr->isGpuAccelerated && gpu_isDirectGpuCommPossible();
 
     snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
-        env.isGpuAccelerated,
-        env.isMultithreaded,
-        env.isDistributed,
+        globalEnvPtr->isGpuAccelerated,
+        globalEnvPtr->isMultithreaded,
+        globalEnvPtr->isDistributed,
         numThreads,
-        env.numNodes,
+        globalEnvPtr->numNodes,
         cuQuantum,
         gpuDirect);
 }
 
 
+int getQuESTNumGpuThreadsPerBlock() {
+    validate_envIsInit(__func__);
+    // yields 0 when the environment is not GPU-accelerated; otherwise defers to the internal GPU getter
+    return globalEnvPtr->isGpuAccelerated? gpu_getNumThreadsPerBlock() : 0;
+}
+
+void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock) {
+    validate_envIsInit(__func__);
+
+    // No GPU check is made here: the internal setter raises the error when GPU support is not compiled.
+    // NOTE(review): newThreadsPerBlock is forwarded unvalidated; confirm non-positive values are rejected elsewhere.
+    gpu_setNumThreadsPerBlock(newThreadsPerBlock);
+}
+
 // end de-mangler
 }
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
@@ -79,9 +79,7 @@ int cpu_getAvailableNumThreads() {
 #if COMPILE_OPENMP
     int n = -1;
 
-    #pragma omp parallel shared(n)
-    #pragma omp single
-    n = omp_get_num_threads();
+    n = omp_get_max_threads();
 
     return n;
 #else

diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
@@ -41,6 +41,7 @@
     #include "quest/src/gpu/cuda_to_hip.hpp"
 #endif
 
+int numThreadsPerBlock = 128; // initial GPU threads-per-block; accessed via gpu_getNumThreadsPerBlock() and gpu_setNumThreadsPerBlock()
 
 
 /*
@@ -330,6 +331,24 @@ qindex gpu_getMaxNumConcurrentThreads() {
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock() {
+#if COMPILE_CUDA
+    return numThreadsPerBlock; // current value of the threads-per-block setting
+#else
+    error_gpuQueriedButGpuNotCompiled();
+    return -1; // placeholder result for builds without COMPILE_CUDA, following the error call
+#endif
+}
+
+void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock) {
+#if COMPILE_CUDA
+    // NOTE(review): stored as given; no range check on newThreadsPerBlock is performed here
+    numThreadsPerBlock = newThreadsPerBlock;
+#else
+    error_gpuQueriedButGpuNotCompiled();
+#endif
+}
+
 
 std::array<char,17> getBoundGpuUuid() {
 #if COMPILE_CUDA

diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
@@ -19,7 +19,6 @@
 #include "quest/include/channels.h"
 
 
-
 /*
  * CUDA ERROR HANDLING
  */
@@ -65,6 +64,10 @@ qindex gpu_getMaxNumConcurrentThreads();
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock();
+
+void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock);
+
 void gpu_bindLocalGPUsToNodes();
 
 bool gpu_areAnyNodesBoundToSameGpu();
@@ -76,7 +79,6 @@ void gpu_initCuQuantum();
 void gpu_finalizeCuQuantum();
 
 
-
 /*
  * MEMORY MANAGEMENT
  */
@@ -122,4 +124,4 @@ size_t gpu_getCacheMemoryInBytes();
 
 
 
-#endif // GPU_CONFIG_HPP
+#endif // GPU_CONFIG_HPP
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
@@ -42,23 +42,19 @@
  * THREAD MANAGEMENT
  */
 
-
-const int NUM_THREADS_PER_BLOCK = 128;
-
-
 __forceinline__ __device__ qindex getThreadInd() {
     return blockIdx.x*blockDim.x + threadIdx.x;
 }
 
 
-__host__ qindex getNumBlocks(qindex numThreads) {
+__host__ qindex getNumBlocks(qindex numThreads, const int numThreadsPerBlock) {
 
     /// @todo
     /// improve this with cudaOccupancyMaxPotentialBlockSize(),
     /// making it function specific
 
     // CUDA ceil
-    return ceil(numThreads / static_cast<qreal>(NUM_THREADS_PER_BLOCK));
+    return ceil(numThreads / static_cast<qreal>(numThreadsPerBlock));
 }