From 7ab5a400e53ec555db5dbf0470f3efc0c8dabc4f Mon Sep 17 00:00:00 2001 From: Nicolas Frisby Date: Sun, 8 Feb 2026 11:40:23 -0500 Subject: [PATCH] Add FdExhaustion-DownloadCount-Hypothesis benchmark --- .../FdExhaustionDownloadCountHypothesis.hs | 63 +++++++++++++++++++ hackage-server.cabal | 21 +++++++ .../Server/Features/DownloadCount.hs | 16 +++-- .../Server/Features/DownloadCount/State.hs | 36 ++++++++++- 4 files changed, 127 insertions(+), 9 deletions(-) create mode 100644 benchmarks/FdExhaustionDownloadCountHypothesis.hs diff --git a/benchmarks/FdExhaustionDownloadCountHypothesis.hs b/benchmarks/FdExhaustionDownloadCountHypothesis.hs new file mode 100644 index 000000000..20774caba --- /dev/null +++ b/benchmarks/FdExhaustionDownloadCountHypothesis.hs @@ -0,0 +1,63 @@ +-- | This benchmark is unusual in that it's only useful to run with a +-- very low limit on how many file descriptors it can have open at +-- once. So run it in a way similar to this: +-- +-- @ +-- $ cabal build FdExhaustion-DownloadCount-Hypothesis && (ulimit -n 44 && cabal run FdExhaustion-DownloadCount-Hypothesis) +-- @ +module Main where + +import Control.Exception (evaluate) +import System.FilePath (()) +import System.IO.Temp (withSystemTempDirectory) + +import Data.SafeCopy (safePut) +import Data.Serialize.Put (runPutLazy) + +import Distribution.Package (PackageName) +import Distribution.Simple.Utils (writeFileAtomic) +import Distribution.Text (display, simpleParse) + +import Distribution.Server.Features.DownloadCount.State +import Distribution.Server.Util.CountingMap + +main :: IO () +main = do + withSystemTempDirectory "hackage-server-FdExhaustion-test" $ \dir -> do + -- Write 100 files, each containing 'fakePerPkg'. + sequence_ + [ writeFileAtomic (dir display (nm :: PackageName)) $ + runPutLazy $ safePut fakePerPkg + | Just nm <- [ simpleParse ("foo" ++ show i) | i <- [1..100 :: Int] ] + ] + + -- The first invocation of 'registerDownloads' on each day + -- does some bookkeeping of the download counts that are + -- stored on disk. + -- + -- This silly logic below this comment is distillation of that + -- that also hammers the file descriptor acquire and release. + readOnDiskStatsLazily dir >>= writeOnDiskStats dir + readOnDiskStatsLazily dir >>= evaluate . initRecentAndTotalDownloads (toEnum 0, toEnum 30) + pure () + +-- | The resulting file is 413,433 bytes. +-- +-- That's enough bytes that the results of BSL.hGetContents will not +-- immediately reach the eof. (The chunksize is rarely above 32 KB.) +-- +-- We create a fake on-disk state where every package coincidentally +-- has thi equivalent (incredibly dense 'OnDiskPerPkg'). +-- +-- TODO if this were /smaller/ this benchmark might need more file +-- descriptors at once? Depends on the exact operational dynamics of +-- 'unsafeInterleaveIO', the context switching of the RTS +-- capabilities, etc. Hard to anticipate /before/ we have a repro. +fakePerPkg :: OnDiskPerPkg +fakePerPkg = + foldl' (\acc k -> cmInsert k 1 acc) cmEmpty keys + where + keys = + [ (toEnum day, version) + | day <- [0..100], Just version <- map (simpleParse . show) [0..100 :: Int] + ] diff --git a/hackage-server.cabal b/hackage-server.cabal index 0b3a43bb5..3204b7556 100644 --- a/hackage-server.cabal +++ b/hackage-server.cabal @@ -718,3 +718,24 @@ test-suite DocTests if !flag(doctests) buildable: False + +benchmark FdExhaustion-DownloadCount-Hypothesis + import: exe-base + import: defaults + + -- This benchmark needs to have the same rtsopts as hackage-server + + -- from exe-base + ghc-options: -threaded -rtsopts + -- from hackage-server above + ghc-options: -with-rtsopts=-I00 + + type: exitcode-stdio-1.0 + hs-source-dirs: benchmarks + main-is: FdExhaustionDownloadCountHypothesis.hs + + build-depends: + , cereal + , hackage-server + , safecopy + , temporary diff --git a/src/Distribution/Server/Features/DownloadCount.hs b/src/Distribution/Server/Features/DownloadCount.hs index 527ac859d..bf56e4935 100644 --- a/src/Distribution/Server/Features/DownloadCount.hs +++ b/src/Distribution/Server/Features/DownloadCount.hs @@ -103,15 +103,18 @@ inMemStateComponent stateDir = do , resetState = inMemStateComponent } +mkOnDiskStatePath :: FilePath -> FilePath +mkOnDiskStatePath stateDir = dcPath stateDir "ondisk" + onDiskStateComponent :: FilePath -> StateComponent OnDiskState OnDiskStats onDiskStateComponent stateDir = StateComponent { stateDesc = "All time download counts" , stateHandle = OnDiskState - , getState = readOnDiskStats (dcPath stateDir "ondisk") + , getState = readOnDiskStatsLazily (mkOnDiskStatePath stateDir) , putState = \onDiskStats -> do --TODO: we should extend the backup system so we can -- write these files out incrementally - writeOnDiskStats (dcPath stateDir "ondisk") onDiskStats + writeOnDiskStats (mkOnDiskStatePath stateDir) onDiskStats reconstructLog (dcPath stateDir) onDiskStats , backupState = \_ -> onDiskBackup , restoreState = onDiskRestore @@ -180,13 +183,14 @@ downloadFeature CoreFeature{} -- Write yesterday's downloads to the log appendToLog (dcPath serverStateDir) inMemStats + let onDiskStateFile = mkOnDiskStatePath serverStateDir -- Update the on-disk statistics and recompute recent downloads - onDiskStats' <- updateHistory inMemStats <$> getState onDiskState - writeOnDiskStats (dcPath serverStateDir "ondisk") onDiskStats' + onDiskStats' <- updateHistory inMemStats <$> readOnDiskStatsLazily onDiskStateFile + writeOnDiskStats onDiskStateFile onDiskStats' --TODO: this is still stupid, writing it out only to read it back -- we should be able to update the in memory ones incrementally (recentDownloads, - totalDownloads) <- computeRecentAndTotalDownloads =<< getState onDiskState + totalDownloads) <- computeRecentAndTotalDownloads =<< readOnDiskStatsLazily onDiskStateFile writeMemState recentDownloadsCache recentDownloads writeMemState totalDownloadsCache totalDownloads @@ -232,7 +236,7 @@ downloadFeature CoreFeature{} onDiskStats <- cmFromCSV csv liftIO $ do --TODO: if the onDiskStats are large, can we stream it? - writeOnDiskStats (dcPath serverStateDir "ondisk") onDiskStats + writeOnDiskStats (mkOnDiskStatePath serverStateDir) onDiskStats (recentDownloads, totalDownloads) <- computeRecentAndTotalDownloads onDiskStats writeMemState recentDownloadsCache recentDownloads diff --git a/src/Distribution/Server/Features/DownloadCount/State.hs b/src/Distribution/Server/Features/DownloadCount/State.hs index cdd591885..7492e3358 100644 --- a/src/Distribution/Server/Features/DownloadCount/State.hs +++ b/src/Distribution/Server/Features/DownloadCount/State.hs @@ -200,12 +200,42 @@ instance MemSize InMemStats where deriveSafeCopy 0 'base ''InMemStats deriveSafeCopy 0 'base ''OnDiskPerPkg -readOnDiskStats :: FilePath -> IO OnDiskStats -readOnDiskStats stateDir = do +-- | This processes all of the files. If you're only interested in a +-- couple of them, use 'readOnDiskStatsLazily' instead, to avoid most +-- of the IO. +readOnDiskStatsEagerly :: FilePath -> IO OnDiskStats +readOnDiskStatsEagerly = readOnDiskStatsHelper False + +-- | Compared to 'readOnDiskStatsEagerly', this defers opening and +-- processing the files; each file will be opened and processed when +-- the corresponding entry in the result map is forced. +-- +-- It is therefore perhaps unwise to call this function and then +-- quickly force every entry of the map. It's not easy to predict +-- whether it will open many files before closing any of them; might +-- depend on the behaviors of the threaded RTS\/the platform\/etc. +-- +-- So if you're going to quickly force many of the packages' entries +-- in the map, you could instead call 'readOnDiskStatsEagerly' to +-- avoid any risk of exhausing file descriptions: every file will be +-- closed before the next is opened. +-- +-- But we haven't /actually demonstrated/ 'readOnDiskStatsLazily' +-- risks exhausting file descriptors; we merely just suspect it +-- might. See the FdExhaustion-DownloadCount-Hypothesis +-- benchmark. Thus, for now, the rest of the codebase continues to use +-- only 'readOnDiskStateLazily', since that's what it used before we +-- started investigating. +readOnDiskStatsLazily :: FilePath -> IO OnDiskStats +readOnDiskStatsLazily = readOnDiskStatsHelper True + +-- | See 'readOnDiskStatsLazily' and 'readOnDiskStatsEagerly'. +readOnDiskStatsHelper :: Bool -> FilePath -> IO OnDiskStats +readOnDiskStatsHelper whetherToInterleaveIO stateDir = do createDirectoryIfMissing True stateDir pkgStrs <- getDirectoryContents stateDir OnDiskStats . NCM 0 . Map.fromList <$> sequence - [ do onDiskPerPkg <- unsafeInterleaveIO $ + [ do onDiskPerPkg <- (if whetherToInterleaveIO then unsafeInterleaveIO else id) $ either (const cmEmpty) id <$> readOnDiskPerPkg pkgFile return (pkgName, onDiskPerPkg)