diff --git a/cpp/command/benchmark.cpp b/cpp/command/benchmark.cpp index 81c423235..e140e62ac 100644 --- a/cpp/command/benchmark.cpp +++ b/cpp/command/benchmark.cpp @@ -18,7 +18,8 @@ using namespace std; -static NNEvaluator* createNNEval(int maxNumThreads, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params); +static NNEvaluator* createNNEval(int expectedConcurrentEvals, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params); +static NNEvaluator* createNNEvalWithBatchSize(int expectedConcurrentEvals, int defaultMaxBatchSize, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params); static vector doFixedTuneThreads( const SearchParams& params, @@ -51,6 +52,52 @@ static const int64_t defaultMaxVisits = 800; static constexpr double defaultSecondsPerGameMove = 5.0; static const int ternarySearchInitialMax = 32; +static int getDefaultMaxBatchSize(int expectedConcurrentEvals) { + return std::max(8,((expectedConcurrentEvals+3)/4)*4); +} + +static void addUniqueInt(vector& values, int value) { + if(value <= 0 || value > 65536) + return; + for(int x: values) { + if(x == value) + return; + } + values.push_back(value); +} + +static vector getNNServerThreadsToTest(int baseNumNNServerThreads) { + testAssert(baseNumNNServerThreads >= 1); + vector ret; + const int maxNumNNServerThreadsToTry = std::max(baseNumNNServerThreads,4); + const int multipliers[] = {1,2,4}; + for(int multiplier: multipliers) { + int numThreads = baseNumNNServerThreads * multiplier; + if(numThreads > maxNumNNServerThreadsToTry) + break; + ret.push_back(numThreads); + } + return ret; +} + +static vector getNNMaxBatchSizesToTest(int numSearchThreads) { + testAssert(numSearchThreads >= 1); + const int defaultMaxBatchSize = getDefaultMaxBatchSize(numSearchThreads); + vector ret; + const int fixedCandidates[] = {8,16,32,64}; + for(int batchSize: fixedCandidates) + addUniqueInt(ret,batchSize); + if(defaultMaxBatchSize >= 128) + addUniqueInt(ret,128); + addUniqueInt(ret,defaultMaxBatchSize); + sort(ret.begin(),ret.end()); + return ret; +} + +static double getNNEvalsPerSecond(const PlayUtils::BenchmarkResults& result) { + return result.numNNEvals / (result.totalSeconds + 0.00001); +} + int MainCmds::benchmark(const vector& args) { Board::initHash(); ScoreValue::initTables(); @@ -256,6 +303,7 @@ int MainCmds::benchmark(const vector& args) { cout << "Your GTP config is currently set to trtUseFP16 = " << nnEval->getUsingFP16Mode().toString() << endl; if(nnEval->getUsingFP16Mode() == enabled_t::False) cout << "If you have a strong GPU capable of FP16 tensor cores (e.g. RTX2080) setting this to true may give a large performance boost." << endl; + cout << "For repeated TensorRT benchmark or genconfig runs with the same model/GPU/batch size, building with -DUSE_CACHE_TENSORRT_PLAN=1 can greatly reduce startup time." << endl; #endif #ifdef USE_METAL_BACKEND cout << "You are currently using the Metal version of KataGo." << endl; @@ -320,10 +368,11 @@ static void warmStartNNEval(const CompactSgf& sgf, Logger& logger, const SearchP delete bot; } -static NNEvaluator* createNNEval(int maxNumThreads, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) { - int expectedConcurrentEvals = maxNumThreads; - const int defaultMaxBatchSize = std::max(8,((maxNumThreads+3)/4)*4); +static NNEvaluator* createNNEval(int expectedConcurrentEvals, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) { + return createNNEvalWithBatchSize(expectedConcurrentEvals,getDefaultMaxBatchSize(expectedConcurrentEvals),sgf,modelFile,logger,cfg,params); +} +static NNEvaluator* createNNEvalWithBatchSize(int expectedConcurrentEvals, int defaultMaxBatchSize, const CompactSgf& sgf, const string& modelFile, Logger& logger, ConfigParser& cfg, const SearchParams& params) { Rand seedRand; #ifdef USE_EIGEN_BACKEND @@ -632,6 +681,8 @@ int MainCmds::genconfig(const vector& args, const string& firstCommand) vector configDeviceIdxs; int configNNCacheSizePowerOfTwo = 20; int configNNMutexPoolSizePowerOfTwo = 16; + int configNNMaxBatchSize = -1; + int configNumNNServerThreadsPerModel = 1; int configNumSearchThreads = 6; cout << endl; @@ -783,6 +834,8 @@ int MainCmds::genconfig(const vector& args, const string& firstCommand) } }); } + if(configDeviceIdxs.size() > 0) + configNumNNServerThreadsPerModel = (int)configDeviceIdxs.size(); #endif { @@ -825,9 +878,15 @@ int MainCmds::genconfig(const vector& args, const string& firstCommand) bool skipThreadTuning = false; if(FileUtils::exists(outputFile)) { int oldConfigNumSearchThreads = -1; + int oldConfigNumNNServerThreadsPerModel = -1; + int oldConfigNNMaxBatchSize = -1; try { ConfigParser oldCfg(outputFile); oldConfigNumSearchThreads = oldCfg.getInt("numSearchThreads",1,4096); + if(oldCfg.contains("numNNServerThreadsPerModel")) + oldConfigNumNNServerThreadsPerModel = oldCfg.getInt("numNNServerThreadsPerModel",1,1024); + if(oldCfg.contains("nnMaxBatchSize")) + oldConfigNNMaxBatchSize = oldCfg.getInt("nnMaxBatchSize",1,65536); } catch(const StringError&) { cout << "NOTE: Overwritten config does not specify numSearchThreads or otherwise could not be parsed." << endl; @@ -842,6 +901,10 @@ int MainCmds::genconfig(const vector& args, const string& firstCommand) ); if(skipThreadTuning) { configNumSearchThreads = oldConfigNumSearchThreads; + if(oldConfigNumNNServerThreadsPerModel > 0) + configNumNNServerThreadsPerModel = oldConfigNumNNServerThreadsPerModel; + if(oldConfigNNMaxBatchSize > 0) + configNNMaxBatchSize = oldConfigNNMaxBatchSize; } } } @@ -855,8 +918,10 @@ int MainCmds::genconfig(const vector& args, const string& firstCommand) configMaxTime, configMaxPonderTime, configDeviceIdxs, + configNNMaxBatchSize, configNNCacheSizePowerOfTwo, configNNMutexPoolSizePowerOfTwo, + configNumNNServerThreadsPerModel, configNumSearchThreads ); }; @@ -968,6 +1033,155 @@ int MainCmds::genconfig(const vector& args, const string& firstCommand) configNumSearchThreads = results[bestIdx].numThreads; delete nnEval; + nnEval = NULL; + +#ifndef USE_EIGEN_BACKEND +#ifdef USE_TENSORRT_BACKEND + cout << "Tip: For repeated TensorRT genconfig runs on the same model/GPU/batch size, a build with -DUSE_CACHE_TENSORRT_PLAN=1 can make startup much faster." << endl; +#endif + + { + int baseNumNNServerThreads = configDeviceIdxs.size() > 0 ? (int)configDeviceIdxs.size() : 1; + vector numNNServerThreadsToTest = getNNServerThreadsToTest(baseNumNNServerThreads); + + if(numNNServerThreadsToTest.size() > 1) { + cout << endl; + cout << "=========================================================================" << endl; + cout << "TUNING NEURAL NET SERVER THREADS NOW" << endl; + cout << "Tuning numNNServerThreadsPerModel using nnEvals/s at " + << configNumSearchThreads << " numSearchThreads." << endl; + + int bestNumNNServerThreads = configNumNNServerThreadsPerModel; + double bestNNEvalsPerSecond = -1.0; + + for(int numNNServerThreads: numNNServerThreadsToTest) { + configNumNNServerThreadsPerModel = numNNServerThreads; + updateConfigContents(); + + istringstream nnServerInConfig(configFileContents); + ConfigParser nnServerCfg(nnServerInConfig); + Logger nnServerLogger(&nnServerCfg, logToStdOut); + Setup::initializeSession(nnServerCfg); + + SearchParams nnServerParams = Setup::loadSingleParams(nnServerCfg,Setup::SETUP_FOR_BENCHMARK); + nnServerParams.maxVisits = maxVisits; + nnServerParams.maxPlayouts = maxVisits; + nnServerParams.maxTime = 1e20; + nnServerParams.searchFactorAfterOnePass = 1.0; + nnServerParams.searchFactorAfterTwoPass = 1.0; + + int maxNumThreadsForBatch = std::max(configNumSearchThreads,numNNServerThreads); + NNEvaluator* nnServerEval = createNNEval(maxNumThreadsForBatch, *sgf, modelFile, nnServerLogger, nnServerCfg, nnServerParams); + auto getNNServerDesiredBatchSize = [&](int currentNumThreads) { + (void)currentNumThreads; + return nnServerEval->getMaxBatchSize(); + }; + + vector numThreads = {configNumSearchThreads}; + vector nnServerResults = doFixedTuneThreads( + nnServerParams,*sgf,numPositionsPerGame,nnServerEval,nnServerLogger,secondsPerGameMove,numThreads,false,getNNServerDesiredBatchSize + ); + testAssert(nnServerResults.size() == 1); + double nnEvalsPerSecond = getNNEvalsPerSecond(nnServerResults[0]); + cout << "numNNServerThreadsPerModel = " << numNNServerThreads + << ": nnEvals/s = " << Global::strprintf("%.2f",nnEvalsPerSecond) + << " visits/s = " << Global::strprintf("%.2f",nnServerResults[0].totalVisits / (nnServerResults[0].totalSeconds + 0.00001)) + << " avgBatchSize = " << Global::strprintf("%.2f",nnServerResults[0].avgBatchSize) + << endl; + + if(nnEvalsPerSecond > bestNNEvalsPerSecond) { + bestNNEvalsPerSecond = nnEvalsPerSecond; + bestNumNNServerThreads = numNNServerThreads; + } + + delete nnServerEval; + } + + configNumNNServerThreadsPerModel = bestNumNNServerThreads; + cout << "Using " << configNumNNServerThreadsPerModel + << " numNNServerThreadsPerModel based on nnEvals/s!" << endl; + } + } + + { + vector nnMaxBatchSizesToTest = getNNMaxBatchSizesToTest(configNumSearchThreads); + + if(nnMaxBatchSizesToTest.size() > 1) { + cout << endl; + cout << "=========================================================================" << endl; + cout << "TUNING NEURAL NET MAX BATCH SIZE NOW" << endl; + cout << "Tuning nnMaxBatchSize using nnEvals/s at " + << configNumSearchThreads << " numSearchThreads and " + << configNumNNServerThreadsPerModel << " numNNServerThreadsPerModel." << endl; + + int bestNNMaxBatchSize = getDefaultMaxBatchSize(configNumSearchThreads); + double bestNNEvalsPerSecond = -1.0; + + for(int nnMaxBatchSize: nnMaxBatchSizesToTest) { + configNNMaxBatchSize = nnMaxBatchSize; + updateConfigContents(); + + double nnEvalsPerSecond = -1.0; + double visitsPerSecond = -1.0; + double avgBatchSize = -1.0; + NNEvaluator* batchEval = NULL; + + try { + istringstream batchInConfig(configFileContents); + ConfigParser batchCfg(batchInConfig); + Logger batchLogger(&batchCfg, logToStdOut); + Setup::initializeSession(batchCfg); + + SearchParams batchParams = Setup::loadSingleParams(batchCfg,Setup::SETUP_FOR_BENCHMARK); + batchParams.maxVisits = maxVisits; + batchParams.maxPlayouts = maxVisits; + batchParams.maxTime = 1e20; + batchParams.searchFactorAfterOnePass = 1.0; + batchParams.searchFactorAfterTwoPass = 1.0; + + int expectedConcurrentEvals = std::max(configNumSearchThreads,configNumNNServerThreadsPerModel); + batchEval = createNNEvalWithBatchSize(expectedConcurrentEvals, nnMaxBatchSize, *sgf, modelFile, batchLogger, batchCfg, batchParams); + auto getBatchDesiredBatchSize = [&](int currentNumThreads) { + (void)currentNumThreads; + return batchEval->getMaxBatchSize(); + }; + + vector numThreads = {configNumSearchThreads}; + vector batchResults = doFixedTuneThreads( + batchParams,*sgf,numPositionsPerGame,batchEval,batchLogger,secondsPerGameMove,numThreads,false,getBatchDesiredBatchSize + ); + testAssert(batchResults.size() == 1); + + nnEvalsPerSecond = getNNEvalsPerSecond(batchResults[0]); + visitsPerSecond = batchResults[0].totalVisits / (batchResults[0].totalSeconds + 0.00001); + avgBatchSize = batchResults[0].avgBatchSize; + + if(nnEvalsPerSecond > bestNNEvalsPerSecond) { + bestNNEvalsPerSecond = nnEvalsPerSecond; + bestNNMaxBatchSize = nnMaxBatchSize; + } + } + catch(const StringError& e) { + cout << "nnMaxBatchSize = " << nnMaxBatchSize << " failed: " << e.what() << endl; + } + + delete batchEval; + + if(nnEvalsPerSecond >= 0.0) { + cout << "nnMaxBatchSize = " << nnMaxBatchSize + << ": nnEvals/s = " << Global::strprintf("%.2f",nnEvalsPerSecond) + << " visits/s = " << Global::strprintf("%.2f",visitsPerSecond) + << " avgBatchSize = " << Global::strprintf("%.2f",avgBatchSize) + << endl; + } + } + + configNNMaxBatchSize = bestNNMaxBatchSize; + cout << "Using " << configNNMaxBatchSize + << " nnMaxBatchSize based on nnEvals/s!" << endl; + } + } +#endif } updateConfigContents(); diff --git a/cpp/command/runtests.cpp b/cpp/command/runtests.cpp index 75bb9760c..21c82b2bd 100644 --- a/cpp/command/runtests.cpp +++ b/cpp/command/runtests.cpp @@ -377,8 +377,10 @@ int MainCmds::runtinynntests(const vector& args) { maxTime, maxPonderTime, std::vector(), + -1, nnCacheSizePowerOfTwo, nnMutexPoolSizePowerOfTwo, + 1, numSearchThreads ); istringstream in(cfgStr); diff --git a/cpp/configs/gtp_example.cfg b/cpp/configs/gtp_example.cfg index 8a261c4c3..d7994bb8c 100644 --- a/cpp/configs/gtp_example.cfg +++ b/cpp/configs/gtp_example.cfg @@ -365,6 +365,11 @@ searchFactorWhenWinningThreshold = 0.95 # if running out of memory, or using multiple GPUs that expect to share work. # nnMaxBatchSize = +# TensorRT users who repeatedly run genconfig or benchmark with the same +# model/GPU/batch settings may greatly reduce startup time by building with +# CMake option -DUSE_CACHE_TENSORRT_PLAN=1. This is not recommended for +# distributed clients, which update models frequently. + # Controls the neural network cache size, which is the primary RAM/memory use. # KataGo will cache up to (2 ** nnCacheSizePowerOfTwo) many neural net # evaluations in case of transpositions in the tree. diff --git a/cpp/program/gtpconfig.cpp b/cpp/program/gtpconfig.cpp index 3fa8651e5..661decff3 100644 --- a/cpp/program/gtpconfig.cpp +++ b/cpp/program/gtpconfig.cpp @@ -265,7 +265,17 @@ searchFactorWhenWinningThreshold = 0.95 # Maximum number of positions to send to a single GPU at once. The default # value is roughly equal to numSearchThreads, but can be specified manually # if running out of memory, or using multiple GPUs that expect to share work. -# nnMaxBatchSize = +$$NN_MAX_BATCH_SIZE +# +# Number of neural net server threads per model. Usually this is the number of +# GPUs, but genconfig may tune this higher to run multiple backend contexts on +# the same GPU when that improves nnEvals/s. +# numNNServerThreadsPerModel = 1 +# +# TensorRT users who repeatedly run genconfig or benchmark with the same +# model/GPU/batch settings may greatly reduce startup time by building with +# CMake option -DUSE_CACHE_TENSORRT_PLAN=1. This is not recommended for +# distributed clients, which update models frequently. # Controls the neural network cache size, which is the primary RAM/memory use. # KataGo will cache up to (2 ** nnCacheSizePowerOfTwo) many neural net @@ -466,10 +476,13 @@ string GTPConfig::makeConfig( double maxTime, double maxPonderTime, const std::vector& deviceIdxs, + int nnMaxBatchSize, int nnCacheSizePowerOfTwo, int nnMutexPoolSizePowerOfTwo, + int numNNServerThreadsPerModel, int numSearchThreads ) { + testAssert(numNNServerThreadsPerModel >= 1); string config = gtpBasePart1 + gtpBasePart2; auto replace = [&](const string& key, const string& replacement) { size_t pos = config.find(key); @@ -518,25 +531,28 @@ string GTPConfig::makeConfig( else replace("$$PONDERING", "ponderingEnabled = true\n# maxTimePondering = 60.0"); replace("$$NUM_SEARCH_THREADS", Global::intToString(numSearchThreads)); + if(nnMaxBatchSize > 0) replace("$$NN_MAX_BATCH_SIZE", "nnMaxBatchSize = " + Global::intToString(nnMaxBatchSize)); + else replace("$$NN_MAX_BATCH_SIZE", "# nnMaxBatchSize = "); replace("$$NN_CACHE_SIZE_POWER_OF_TWO", Global::intToString(nnCacheSizePowerOfTwo)); replace("$$NN_MUTEX_POOL_SIZE_POWER_OF_TWO", Global::intToString(nnMutexPoolSizePowerOfTwo)); - if(deviceIdxs.size() <= 0) { + if(deviceIdxs.size() <= 0 && numNNServerThreadsPerModel <= 1) { replace("$$MULTIPLE_GPUS", ""); } else { string replacement = ""; - replacement += "numNNServerThreadsPerModel = " + Global::uint64ToString(deviceIdxs.size()) + "\n"; + replacement += "numNNServerThreadsPerModel = " + Global::intToString(numNNServerThreadsPerModel) + "\n"; - for(int i = 0; i& deviceIdxs, + int nnMaxBatchSize, int nnCacheSizePowerOfTwo, int nnMutexPoolSizePowerOfTwo, + int numNNServerThreadsPerModel, int numSearchThreads ); }