diff --git a/cpp/command/benchmark.cpp b/cpp/command/benchmark.cpp
index 81c423235..e140e62ac 100644
--- a/cpp/command/benchmark.cpp
+++ b/cpp/command/benchmark.cpp
@@ -18,7 +18,8 @@
 
 using namespace std;
 
-static NNEvaluator* createNNEval(int maxNumThreads, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params);
+static NNEvaluator* createNNEval(int expectedConcurrentEvals, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params);
+static NNEvaluator* createNNEvalWithBatchSize(int expectedConcurrentEvals, int defaultMaxBatchSize, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params);
 
 static vector<PlayUtils::BenchmarkResults> doFixedTuneThreads(
   const SearchParams& params,
@@ -51,6 +52,52 @@ static const int64_t defaultMaxVisits = 800;
 static constexpr double defaultSecondsPerGameMove = 5.0;
 static const int ternarySearchInitialMax = 32;
 
+static int getDefaultMaxBatchSize(int expectedConcurrentEvals) {
+  return expectedConcurrentEvals <= 8 ? 8 : 4*((expectedConcurrentEvals+3)/4);  // round up to a multiple of 4; anything up to 8 maps to 8
+}
+
+// Appends value to values, skipping it when it lies outside [1,65536] or when
+// an equal element is already stored. Existing elements are never reordered.
+static void addUniqueInt(vector<int>& values, int value) {
+  const bool inRange = 1 <= value && value <= 65536;
+  const bool alreadyPresent =
+    std::find(values.begin(),values.end(),value) != values.end();
+  if(inRange && !alreadyPresent)
+    values.push_back(value);
+}
+
+// Lists the numNNServerThreadsPerModel values to benchmark: the base count
+// times 1, 2 and 4, stopping at the first product above max(base,4).
+static vector<int> getNNServerThreadsToTest(int baseNumNNServerThreads) {
+  testAssert(baseNumNNServerThreads >= 1);
+  const int upperBound = baseNumNNServerThreads > 4 ? baseNumNNServerThreads : 4;
+  vector<int> candidates;
+  for(int factor = 1; factor <= 4; factor *= 2) {
+    const int scaled = baseNumNNServerThreads * factor;
+    if(scaled > upperBound) break;
+    candidates.push_back(scaled);
+  }
+  return candidates;
+}
+
+// Returns the nnMaxBatchSize candidates in ascending order: 8, 16, 32, 64 and the default for
+// numSearchThreads, plus 128 when that default is at least 128. addUniqueInt filters every entry.
+static vector<int> getNNMaxBatchSizesToTest(int numSearchThreads) {
+  testAssert(numSearchThreads >= 1);
+  const int defaultSize = getDefaultMaxBatchSize(numSearchThreads);
+  vector<int> sizes;
+  // Doubling from 8 yields the fixed candidates; the walk reaches 128 only for large defaults.
+  for(int power = 8; power <= (defaultSize >= 128 ? 128 : 64); power *= 2)
+    addUniqueInt(sizes,power);
+  addUniqueInt(sizes,defaultSize);
+  sort(sizes.begin(),sizes.end());
+  return sizes;
+}
+
+static double getNNEvalsPerSecond(const PlayUtils::BenchmarkResults& result) {
+  return result.numNNEvals / (result.totalSeconds + 0.00001);  // the 1e-5 offset keeps the divisor nonzero when totalSeconds is 0
+}
+
 int MainCmds::benchmark(const vector<string>& args) {
   Board::initHash();
   ScoreValue::initTables();
@@ -256,6 +303,7 @@ int MainCmds::benchmark(const vector<string>& args) {
   cout << "Your GTP config is currently set to trtUseFP16 = " << nnEval->getUsingFP16Mode().toString() << endl;
   if(nnEval->getUsingFP16Mode() == enabled_t::False)
     cout << "If you have a strong GPU capable of FP16 tensor cores (e.g. RTX2080) setting this to true may give a large performance boost." << endl;
+  cout << "For repeated TensorRT benchmark or genconfig runs with the same model/GPU/batch size, building with -DUSE_CACHE_TENSORRT_PLAN=1 can greatly reduce startup time." << endl;
 #endif
 #ifdef USE_METAL_BACKEND
   cout << "You are currently using the Metal version of KataGo." << endl;
@@ -320,10 +368,11 @@ static void warmStartNNEval(const CompactSgf& sgf, Logger& logger, const SearchP
   delete bot;
 }
 
-static NNEvaluator* createNNEval(int maxNumThreads, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) {
-  int expectedConcurrentEvals = maxNumThreads;
-  const int defaultMaxBatchSize = std::max(8,((maxNumThreads+3)/4)*4);
+static NNEvaluator* createNNEval(int expectedConcurrentEvals, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) {
+  return createNNEvalWithBatchSize(expectedConcurrentEvals,getDefaultMaxBatchSize(expectedConcurrentEvals),sgf,modelFile,logger,cfg,params);  // forwards with the default batch size derived from expectedConcurrentEvals
+}
 
+static NNEvaluator* createNNEvalWithBatchSize(int expectedConcurrentEvals, int defaultMaxBatchSize, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) {
   Rand seedRand;
 
 #ifdef USE_EIGEN_BACKEND
@@ -632,6 +681,8 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
   vector<int> configDeviceIdxs;
   int configNNCacheSizePowerOfTwo = 20;
   int configNNMutexPoolSizePowerOfTwo = 16;
+  int configNNMaxBatchSize = -1;
+  int configNumNNServerThreadsPerModel = 1;
   int configNumSearchThreads = 6;
 
   cout << endl;
@@ -783,6 +834,8 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
         }
       });
   }
+  if(configDeviceIdxs.size() > 0)
+    configNumNNServerThreadsPerModel = (int)configDeviceIdxs.size();
 #endif
 
   {
@@ -825,9 +878,15 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
   bool skipThreadTuning = false;
   if(FileUtils::exists(outputFile)) {
     int oldConfigNumSearchThreads = -1;
+    int oldConfigNumNNServerThreadsPerModel = -1;
+    int oldConfigNNMaxBatchSize = -1;
     try {
       ConfigParser oldCfg(outputFile);
       oldConfigNumSearchThreads = oldCfg.getInt("numSearchThreads",1,4096);
+      if(oldCfg.contains("numNNServerThreadsPerModel"))
+        oldConfigNumNNServerThreadsPerModel = oldCfg.getInt("numNNServerThreadsPerModel",1,1024);
+      if(oldCfg.contains("nnMaxBatchSize"))
+        oldConfigNNMaxBatchSize = oldCfg.getInt("nnMaxBatchSize",1,65536);
     }
     catch(const StringError&) {
       cout << "NOTE: Overwritten config does not specify numSearchThreads or otherwise could not be parsed." << endl;
@@ -842,6 +901,10 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
       );
       if(skipThreadTuning) {
         configNumSearchThreads = oldConfigNumSearchThreads;
+        if(oldConfigNumNNServerThreadsPerModel > configNumNNServerThreadsPerModel)  // reuse the old value only when it does not drop below the count already derived from the selected devices
+          configNumNNServerThreadsPerModel = oldConfigNumNNServerThreadsPerModel;
+        if(oldConfigNNMaxBatchSize > 0)
+          configNNMaxBatchSize = oldConfigNNMaxBatchSize;
       }
     }
   }
@@ -855,8 +918,10 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
       configMaxTime,
       configMaxPonderTime,
       configDeviceIdxs,
+      configNNMaxBatchSize,
       configNNCacheSizePowerOfTwo,
       configNNMutexPoolSizePowerOfTwo,
+      configNumNNServerThreadsPerModel,
       configNumSearchThreads
     );
   };
@@ -968,6 +1033,155 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
     configNumSearchThreads = results[bestIdx].numThreads;
 
     delete nnEval;
+    nnEval = NULL;
+
+#ifndef USE_EIGEN_BACKEND
+#ifdef USE_TENSORRT_BACKEND
+    cout << "Tip: For repeated TensorRT genconfig runs on the same model/GPU/batch size, a build with -DUSE_CACHE_TENSORRT_PLAN=1 can make startup much faster." << endl;
+#endif
+
+    {
+      int baseNumNNServerThreads = configDeviceIdxs.size() > 0 ? (int)configDeviceIdxs.size() : 1;
+      vector<int> numNNServerThreadsToTest = getNNServerThreadsToTest(baseNumNNServerThreads);
+
+      if(numNNServerThreadsToTest.size() > 1) {
+        cout << endl;
+        cout << "=========================================================================" << endl;
+        cout << "TUNING NEURAL NET SERVER THREADS NOW" << endl;
+        cout << "Tuning numNNServerThreadsPerModel using nnEvals/s at "
+             << configNumSearchThreads << " numSearchThreads." << endl;
+
+        int bestNumNNServerThreads = configNumNNServerThreadsPerModel;  // kept if every candidate fails
+        double bestNNEvalsPerSecond = -1.0;
+
+        for(int numNNServerThreads: numNNServerThreadsToTest) {
+          configNumNNServerThreadsPerModel = numNNServerThreads;
+          updateConfigContents();
+
+          istringstream nnServerInConfig(configFileContents);
+          ConfigParser nnServerCfg(nnServerInConfig);
+          Logger nnServerLogger(&nnServerCfg, logToStdOut);
+          // Held outside the try so a failing candidate is reported and skipped, and the evaluator is freed while its logger is alive.
+          NNEvaluator* nnServerEval = NULL;
+          try {
+            Setup::initializeSession(nnServerCfg);
+            SearchParams nnServerParams = Setup::loadSingleParams(nnServerCfg,Setup::SETUP_FOR_BENCHMARK);
+            nnServerParams.maxVisits = maxVisits;
+            nnServerParams.maxPlayouts = maxVisits;
+            nnServerParams.maxTime = 1e20;
+            nnServerParams.searchFactorAfterOnePass = 1.0;
+            nnServerParams.searchFactorAfterTwoPass = 1.0;
+
+            int maxNumThreadsForBatch = std::max(configNumSearchThreads,numNNServerThreads);
+            nnServerEval = createNNEval(maxNumThreadsForBatch, *sgf, modelFile, nnServerLogger, nnServerCfg, nnServerParams);
+            auto getNNServerDesiredBatchSize = [&](int) { return nnServerEval->getMaxBatchSize(); };  // thread count argument is ignored
+            vector<int> numThreads = {configNumSearchThreads};
+            vector<PlayUtils::BenchmarkResults> nnServerResults = doFixedTuneThreads(
+              nnServerParams,*sgf,numPositionsPerGame,nnServerEval,nnServerLogger,secondsPerGameMove,numThreads,false,getNNServerDesiredBatchSize
+            );
+            testAssert(nnServerResults.size() == 1);
+            double nnEvalsPerSecond = getNNEvalsPerSecond(nnServerResults[0]);
+            cout << "numNNServerThreadsPerModel = " << numNNServerThreads
+                 << ": nnEvals/s = " << Global::strprintf("%.2f",nnEvalsPerSecond)
+                 << " visits/s = " << Global::strprintf("%.2f",nnServerResults[0].totalVisits / (nnServerResults[0].totalSeconds + 0.00001))
+                 << " avgBatchSize = " << Global::strprintf("%.2f",nnServerResults[0].avgBatchSize)
+                 << endl;
+            if(nnEvalsPerSecond > bestNNEvalsPerSecond) {
+              bestNNEvalsPerSecond = nnEvalsPerSecond;
+              bestNumNNServerThreads = numNNServerThreads;
+            }
+          }
+          catch(const StringError& e) {
+            cout << "numNNServerThreadsPerModel = " << numNNServerThreads << " failed: " << e.what() << endl;
+          }
+          delete nnServerEval;
+        }
+
+        configNumNNServerThreadsPerModel = bestNumNNServerThreads;
+        cout << "Using " << configNumNNServerThreadsPerModel
+             << " numNNServerThreadsPerModel based on nnEvals/s!" << endl;
+      }
+    }
+
+    {  // Tune nnMaxBatchSize: benchmark each candidate and keep the one with the highest nnEvals/s.
+      vector<int> nnMaxBatchSizesToTest = getNNMaxBatchSizesToTest(configNumSearchThreads);
+
+      if(nnMaxBatchSizesToTest.size() > 1) {
+        cout << endl;
+        cout << "=========================================================================" << endl;
+        cout << "TUNING NEURAL NET MAX BATCH SIZE NOW" << endl;
+        cout << "Tuning nnMaxBatchSize using nnEvals/s at "
+             << configNumSearchThreads << " numSearchThreads and "
+             << configNumNNServerThreadsPerModel << " numNNServerThreadsPerModel." << endl;
+
+        int bestNNMaxBatchSize = getDefaultMaxBatchSize(configNumSearchThreads);  // kept if every candidate fails
+        double bestNNEvalsPerSecond = -1.0;
+
+        for(int nnMaxBatchSize: nnMaxBatchSizesToTest) {
+          configNNMaxBatchSize = nnMaxBatchSize;
+          updateConfigContents();  // presumably regenerates configFileContents with this candidate - confirm against the lambda's definition
+
+          double nnEvalsPerSecond = -1.0;  // stays negative when the candidate fails, which suppresses the summary line below
+          double visitsPerSecond = -1.0;
+          double avgBatchSize = -1.0;
+          NNEvaluator* batchEval = NULL;  // declared outside the try so it can be deleted after a failure
+
+          try {
+            istringstream batchInConfig(configFileContents);
+            ConfigParser batchCfg(batchInConfig);
+            Logger batchLogger(&batchCfg, logToStdOut);
+            Setup::initializeSession(batchCfg);
+
+            SearchParams batchParams = Setup::loadSingleParams(batchCfg,Setup::SETUP_FOR_BENCHMARK);
+            batchParams.maxVisits = maxVisits;
+            batchParams.maxPlayouts = maxVisits;
+            batchParams.maxTime = 1e20;
+            batchParams.searchFactorAfterOnePass = 1.0;
+            batchParams.searchFactorAfterTwoPass = 1.0;
+
+            int expectedConcurrentEvals = std::max(configNumSearchThreads,configNumNNServerThreadsPerModel);
+            batchEval = createNNEvalWithBatchSize(expectedConcurrentEvals, nnMaxBatchSize, *sgf, modelFile, batchLogger, batchCfg, batchParams);
+            auto getBatchDesiredBatchSize = [&](int currentNumThreads) {
+              (void)currentNumThreads;  // the thread count is deliberately ignored
+              return batchEval->getMaxBatchSize();
+            };
+
+            vector<int> numThreads = {configNumSearchThreads};  // a single entry, so exactly one result is expected
+            vector<PlayUtils::BenchmarkResults> batchResults = doFixedTuneThreads(
+              batchParams,*sgf,numPositionsPerGame,batchEval,batchLogger,secondsPerGameMove,numThreads,false,getBatchDesiredBatchSize
+            );
+            testAssert(batchResults.size() == 1);
+
+            nnEvalsPerSecond = getNNEvalsPerSecond(batchResults[0]);
+            visitsPerSecond = batchResults[0].totalVisits / (batchResults[0].totalSeconds + 0.00001);
+            avgBatchSize = batchResults[0].avgBatchSize;
+
+            if(nnEvalsPerSecond > bestNNEvalsPerSecond) {
+              bestNNEvalsPerSecond = nnEvalsPerSecond;
+              bestNNMaxBatchSize = nnMaxBatchSize;
+            }
+          }
+          catch(const StringError& e) {  // a failing candidate is reported and skipped; the loop continues
+            cout << "nnMaxBatchSize = " << nnMaxBatchSize << " failed: " << e.what() << endl;
+          }
+
+          delete batchEval;  // NOTE(review): batchLogger went out of scope with the try block; confirm the evaluator's destructor does not use it
+
+          if(nnEvalsPerSecond >= 0.0) {
+            cout << "nnMaxBatchSize = " << nnMaxBatchSize
+                 << ": nnEvals/s = " << Global::strprintf("%.2f",nnEvalsPerSecond)
+                 << " visits/s = " << Global::strprintf("%.2f",visitsPerSecond)
+                 << " avgBatchSize = " << Global::strprintf("%.2f",avgBatchSize)
+                 << endl;
+          }
+        }
+
+        configNNMaxBatchSize = bestNNMaxBatchSize;
+        cout << "Using " << configNNMaxBatchSize
+             << " nnMaxBatchSize based on nnEvals/s!" << endl;
+      }
+    }
+#endif
   }
 
   updateConfigContents();
diff --git a/cpp/command/runtests.cpp b/cpp/command/runtests.cpp
index 75bb9760c..21c82b2bd 100644
--- a/cpp/command/runtests.cpp
+++ b/cpp/command/runtests.cpp
@@ -377,8 +377,10 @@ int MainCmds::runtinynntests(const vector<string>& args) {
       maxTime,
       maxPonderTime,
       std::vector<int>(),
+      -1,
       nnCacheSizePowerOfTwo,
       nnMutexPoolSizePowerOfTwo,
+      1,
       numSearchThreads
     );
     istringstream in(cfgStr);
diff --git a/cpp/configs/gtp_example.cfg b/cpp/configs/gtp_example.cfg
index 8a261c4c3..d7994bb8c 100644
--- a/cpp/configs/gtp_example.cfg
+++ b/cpp/configs/gtp_example.cfg
@@ -365,6 +365,11 @@ searchFactorWhenWinningThreshold = 0.95
 # if running out of memory, or using multiple GPUs that expect to share work.
 # nnMaxBatchSize = <integer>
 
+# TensorRT users who repeatedly run genconfig or benchmark with the same
+# model/GPU/batch settings may greatly reduce startup time by building with
+# CMake option -DUSE_CACHE_TENSORRT_PLAN=1. This is not recommended for
+# distributed clients, which update models frequently.
+
 # Controls the neural network cache size, which is the primary RAM/memory use.
 # KataGo will cache up to (2 ** nnCacheSizePowerOfTwo) many neural net
 # evaluations in case of transpositions in the tree.
diff --git a/cpp/program/gtpconfig.cpp b/cpp/program/gtpconfig.cpp
index 3fa8651e5..661decff3 100644
--- a/cpp/program/gtpconfig.cpp
+++ b/cpp/program/gtpconfig.cpp
@@ -265,7 +265,17 @@ searchFactorWhenWinningThreshold = 0.95
 # Maximum number of positions to send to a single GPU at once. The default
 # value is roughly equal to numSearchThreads, but can be specified manually
 # if running out of memory, or using multiple GPUs that expect to share work.
-# nnMaxBatchSize = <integer>
+$$NN_MAX_BATCH_SIZE
+#
+# Number of neural net server threads per model. Usually this is the number of
+# GPUs, but genconfig may tune this higher to run multiple backend contexts on
+# the same GPU when that improves nnEvals/s.
+# numNNServerThreadsPerModel = 1
+#
+# TensorRT users who repeatedly run genconfig or benchmark with the same
+# model/GPU/batch settings may greatly reduce startup time by building with
+# CMake option -DUSE_CACHE_TENSORRT_PLAN=1. This is not recommended for
+# distributed clients, which update models frequently.
 
 # Controls the neural network cache size, which is the primary RAM/memory use.
 # KataGo will cache up to (2 ** nnCacheSizePowerOfTwo) many neural net
@@ -466,10 +476,13 @@ string GTPConfig::makeConfig(
   double maxTime,
   double maxPonderTime,
   const std::vector<int>& deviceIdxs,
+  int nnMaxBatchSize,
   int nnCacheSizePowerOfTwo,
   int nnMutexPoolSizePowerOfTwo,
+  int numNNServerThreadsPerModel,
   int numSearchThreads
 ) {
+  testAssert(numNNServerThreadsPerModel >= 1);
   string config = gtpBasePart1 + gtpBasePart2;
   auto replace = [&](const string& key, const string& replacement) {
     size_t pos = config.find(key);
@@ -518,25 +531,28 @@ string GTPConfig::makeConfig(
   else                                 replace("$$PONDERING", "ponderingEnabled = true\n# maxTimePondering = 60.0");
 
   replace("$$NUM_SEARCH_THREADS", Global::intToString(numSearchThreads));
+  if(nnMaxBatchSize > 0) replace("$$NN_MAX_BATCH_SIZE", "nnMaxBatchSize = " + Global::intToString(nnMaxBatchSize));
+  else replace("$$NN_MAX_BATCH_SIZE", "# nnMaxBatchSize = <integer>");
   replace("$$NN_CACHE_SIZE_POWER_OF_TWO", Global::intToString(nnCacheSizePowerOfTwo));
   replace("$$NN_MUTEX_POOL_SIZE_POWER_OF_TWO", Global::intToString(nnMutexPoolSizePowerOfTwo));
 
-  if(deviceIdxs.size() <= 0) {
+  if(deviceIdxs.size() <= 0 && numNNServerThreadsPerModel <= 1) {
     replace("$$MULTIPLE_GPUS", "");
   }
   else {
     string replacement = "";
-    replacement += "numNNServerThreadsPerModel = " + Global::uint64ToString(deviceIdxs.size()) + "\n";
+    replacement += "numNNServerThreadsPerModel = " + Global::intToString(numNNServerThreadsPerModel) + "\n";
 
-    for(int i = 0; i<deviceIdxs.size(); i++) {
+    for(int i = 0; i<numNNServerThreadsPerModel; i++) {
+      int deviceIdx = deviceIdxs.size() <= 0 ? 0 : deviceIdxs[(size_t)i % deviceIdxs.size()];
 #ifdef USE_CUDA_BACKEND
-      replacement += "cudaDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
+      replacement += "cudaDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdx) + "\n";
 #endif
 #ifdef USE_TENSORRT_BACKEND
-      replacement += "trtDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
+      replacement += "trtDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdx) + "\n";
 #endif
 #ifdef USE_OPENCL_BACKEND
-      replacement += "openclDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
+      replacement += "openclDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdx) + "\n";
 #endif
     }
     replace("$$MULTIPLE_GPUS", replacement);
diff --git a/cpp/program/gtpconfig.h b/cpp/program/gtpconfig.h
index bd4019983..77c040ce2 100644
--- a/cpp/program/gtpconfig.h
+++ b/cpp/program/gtpconfig.h
@@ -12,8 +12,10 @@ namespace GTPConfig {
     double maxTime,
     double maxPonderTime,
     const std::vector<int>& deviceIdxs,
+    int nnMaxBatchSize,
     int nnCacheSizePowerOfTwo,
     int nnMutexPoolSizePowerOfTwo,
+    int numNNServerThreadsPerModel,
     int numSearchThreads
   );
 }