From 705a14c6c50aa97b0af16694211277bf31320a6c Mon Sep 17 00:00:00 2001 From: Git'Fellow <12234510+solracsf@users.noreply.github.com> Date: Tue, 2 Jun 2026 12:22:44 +0200 Subject: [PATCH 1/2] perf(files): keep systemtag and filecache joins index-friendly Signed-off-by: Git'Fellow <12234510+solracsf@users.noreply.github.com> --- .../DB/QueryBuilder/Partitioned/JoinCondition.php | 2 +- lib/private/Files/Cache/CacheQueryBuilder.php | 5 ++++- lib/private/Files/Cache/QuerySearchHelper.php | 7 ++++++- .../QueryBuilder/Partitioned/JoinConditionTest.php | 14 ++++++++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/lib/private/DB/QueryBuilder/Partitioned/JoinCondition.php b/lib/private/DB/QueryBuilder/Partitioned/JoinCondition.php index 28e4a4a8ebd01..d4d18aa40424a 100644 --- a/lib/private/DB/QueryBuilder/Partitioned/JoinCondition.php +++ b/lib/private/DB/QueryBuilder/Partitioned/JoinCondition.php @@ -149,7 +149,7 @@ private static function clearConditionPart(string $part): string { } elseif (str_starts_with($part, 'to_number(to_char(')) { // oracle cast to int $part = substr($part, strlen('to_number(to_char('), -2); - } elseif (str_starts_with($part, 'to_number(to_char(')) { + } elseif (str_starts_with($part, 'to_char(')) { // oracle cast to string $part = substr($part, strlen('to_char('), -1); } diff --git a/lib/private/Files/Cache/CacheQueryBuilder.php b/lib/private/Files/Cache/CacheQueryBuilder.php index b62dd6f21208e..586ccbc466a49 100644 --- a/lib/private/Files/Cache/CacheQueryBuilder.php +++ b/lib/private/Files/Cache/CacheQueryBuilder.php @@ -33,8 +33,11 @@ public function selectTagUsage(): self { ->selectAlias($this->func()->count('filecache.fileid'), 'number_files') ->selectAlias($this->func()->max('filecache.fileid'), 'ref_file_id') ->from('filecache', 'filecache') + // Compare as strings (objectid = CAST(fileid AS CHAR)) so the systag_by_objectid + // index on the string column objectid stays usable; casting objectid to int (or + // the implicit string->number coercion against the bigint fileid) defeats it. ->leftJoin('filecache', 'systemtag_object_mapping', 'systemtagmap', $this->expr()->andX( - $this->expr()->eq('filecache.fileid', $this->expr()->castColumn('systemtagmap.objectid', IQueryBuilder::PARAM_INT)), + $this->expr()->eq('systemtagmap.objectid', $this->expr()->castColumn('filecache.fileid', IQueryBuilder::PARAM_STR)), $this->expr()->eq('systemtagmap.objecttype', $this->createNamedParameter('files')) )) ->leftJoin('systemtagmap', 'systemtag', 'systemtag', $this->expr()->andX( diff --git a/lib/private/Files/Cache/QuerySearchHelper.php b/lib/private/Files/Cache/QuerySearchHelper.php index 8cbc4591d750b..c5ceee2ae2a2c 100644 --- a/lib/private/Files/Cache/QuerySearchHelper.php +++ b/lib/private/Files/Cache/QuerySearchHelper.php @@ -97,8 +97,13 @@ public function findUsedTagsInCaches(ISearchQuery $searchQuery, array $caches): } protected function equipQueryForSystemTags(CacheQueryBuilder $query, IUser $user): void { + // Compare the join as strings (objectid = CAST(fileid AS CHAR)) instead of + // casting objectid to an integer. objectid is a string column, so casting it + // (or letting MySQL implicitly coerce the string column to a number when comparing + // to the bigint fileid) makes the systag_by_objectid index unusable and turns this + // into a full/BNL join. Casting fileid instead keeps the index on objectid usable. $query->leftJoin('file', 'systemtag_object_mapping', 'systemtagmap', $query->expr()->andX( - $query->expr()->eq('file.fileid', $query->expr()->castColumn('systemtagmap.objectid', IQueryBuilder::PARAM_INT)), + $query->expr()->eq('systemtagmap.objectid', $query->expr()->castColumn('file.fileid', IQueryBuilder::PARAM_STR)), $query->expr()->eq('systemtagmap.objecttype', $query->createNamedParameter('files')) )); $on = $query->expr()->andX($query->expr()->eq('systemtag.id', 'systemtagmap.systemtagid')); diff --git a/tests/lib/DB/QueryBuilder/Partitioned/JoinConditionTest.php b/tests/lib/DB/QueryBuilder/Partitioned/JoinConditionTest.php index 77e78a8b6b729..14c913e40d4f0 100644 --- a/tests/lib/DB/QueryBuilder/Partitioned/JoinConditionTest.php +++ b/tests/lib/DB/QueryBuilder/Partitioned/JoinConditionTest.php @@ -72,4 +72,18 @@ public function testParseCastCondition(string $platform): void { $this->assertEquals('f.fileid', $parsed->toColumn); $this->assertEquals([], $parsed->fromConditions); } + + #[\PHPUnit\Framework\Attributes\DataProvider('platformProvider')] + public function testParseStringCastCondition(string $platform): void { + $query = $this->getBuilder($platform); + + // Mirrors the systemtag<->filecache join: the string objectid column is compared + // to fileid cast to a string (so the index on objectid stays usable). The string + // cast must be stripped on every platform when parsing the join condition. + $condition = $query->expr()->eq('m.objectid', $query->expr()->castColumn('f.fileid', IQueryBuilder::PARAM_STR)); + $parsed = JoinCondition::parse($condition, 'filecache', 'f', 'm'); + $this->assertEquals('m.objectid', $parsed->fromColumn); + $this->assertEquals('f.fileid', $parsed->toColumn); + $this->assertEquals([], $parsed->fromConditions); + } } From b92e7be94a60de7a2473f6a70850a411c6214fac Mon Sep 17 00:00:00 2001 From: Git'Fellow <12234510+solracsf@users.noreply.github.com> Date: Tue, 2 Jun 2026 12:56:23 +0200 Subject: [PATCH 2/2] perf(files): clean orphaned items without a cross-table GROUP BY join Signed-off-by: Git'Fellow <12234510+solracsf@users.noreply.github.com> --- .../lib/BackgroundJob/DeleteOrphanedItems.php | 95 ++++++++++--------- .../DeleteOrphanedItemsJobTest.php | 70 ++++++++++++++ 2 files changed, 121 insertions(+), 44 deletions(-) diff --git a/apps/files/lib/BackgroundJob/DeleteOrphanedItems.php b/apps/files/lib/BackgroundJob/DeleteOrphanedItems.php index e99fa85856a07..baecffb07396c 100644 --- a/apps/files/lib/BackgroundJob/DeleteOrphanedItems.php +++ b/apps/files/lib/BackgroundJob/DeleteOrphanedItems.php @@ -18,7 +18,7 @@ * Delete all share entries that have no matching entries in the file cache table. */ class DeleteOrphanedItems extends TimedJob { - public const CHUNK_SIZE = 200; + public const CHUNK_SIZE = 1000; /** * sets the correct interval for this timed job @@ -46,74 +46,81 @@ public function run($argument) { } /** - * Deleting orphaned system tag mappings + * Delete mapping rows of the 'files' object type whose referenced file no + * longer exists in the file cache. * - * @param string $table - * @param string $idCol - * @param string $typeCol + * The candidate ids are read from the mapping table itself in keyset-paginated + * chunks and each chunk is checked against the filecache primary key. This + * avoids joining the (potentially huge) mapping table against filecache with a + * GROUP BY, which reads the whole mapping table - and on some databases + * materialises a temp table - on every run even when there are no orphans (the + * common case). It also works the same way whether or not filecache is sharded, + * so a single code path covers both. + * + * @param string $table The mapping table to clean up + * @param string $idCol The column referencing the file id + * @param string $typeCol The column holding the object type + * @param bool $numericId Whether $idCol is an integer column. String columns + * hold the numeric file id as text; the keyset cursor is + * compared (and the rows ordered) using the column's own + * type so the index on $idCol stays usable on every + * database instead of being defeated by an implicit cast. * @return int Number of deleted entries */ - protected function cleanUp(string $table, string $idCol, string $typeCol): int { + protected function cleanUp(string $table, string $idCol, string $typeCol, bool $numericId): int { $deletedEntries = 0; $deleteQuery = $this->connection->getQueryBuilder(); $deleteQuery->delete($table) - ->where($deleteQuery->expr()->eq($idCol, $deleteQuery->createParameter('objectid'))); - - if ($this->connection->getShardDefinition('filecache')) { - $sourceIdChunks = $this->getItemIds($table, $idCol, $typeCol, 1000); - foreach ($sourceIdChunks as $sourceIdChunk) { - $deletedSources = $this->findMissingSources($sourceIdChunk); - $deleteQuery->setParameter('objectid', $deletedSources, IQueryBuilder::PARAM_INT_ARRAY); - $deletedEntries += $deleteQuery->executeStatement(); - } - } else { - $query = $this->connection->getQueryBuilder(); - $query->select('t1.' . $idCol) - ->from($table, 't1') - ->where($query->expr()->eq($typeCol, $query->expr()->literal('files'))) - ->leftJoin('t1', 'filecache', 't2', $query->expr()->eq($query->expr()->castColumn('t1.' . $idCol, IQueryBuilder::PARAM_INT), 't2.fileid')) - ->andWhere($query->expr()->isNull('t2.fileid')) - ->groupBy('t1.' . $idCol) - ->setMaxResults(self::CHUNK_SIZE); - - $deleteQuery = $this->connection->getQueryBuilder(); - $deleteQuery->delete($table) - ->where($deleteQuery->expr()->in($idCol, $deleteQuery->createParameter('objectid'))); - - $deletedInLastChunk = self::CHUNK_SIZE; - while ($deletedInLastChunk === self::CHUNK_SIZE) { - $chunk = $query->executeQuery()->fetchFirstColumn(); - $deletedInLastChunk = count($chunk); - - $deleteQuery->setParameter('objectid', $chunk, IQueryBuilder::PARAM_INT_ARRAY); - $deletedEntries += $deleteQuery->executeStatement(); + ->where($deleteQuery->expr()->in($idCol, $deleteQuery->createParameter('objectid'))); + + foreach ($this->getItemIds($table, $idCol, $typeCol, $numericId, self::CHUNK_SIZE) as $idChunk) { + $missingSources = $this->findMissingSources($idChunk); + if (count($missingSources) === 0) { + continue; } + + $deleteQuery->setParameter('objectid', $missingSources, IQueryBuilder::PARAM_INT_ARRAY); + $deletedEntries += $deleteQuery->executeStatement(); } return $deletedEntries; } /** + * Yield the distinct 'files' ids of $table in keyset-paginated chunks. + * + * Chunks are ordered by $idCol and advanced with a `$idCol > cursor` + * comparison so the scan stays on the index covering ($typeCol, $idCol). The + * cursor is bound - and the rows therefore ordered and compared - using the + * column's own type: an integer column numerically, a string column lexically. + * Mixing the two (e.g. comparing a varchar column to an integer) would force + * an implicit cast that defeats the index and makes the ordering and the + * comparison disagree, which could skip chunks. + * * @param string $table * @param string $idCol * @param string $typeCol + * @param bool $numericId Whether $idCol is an integer column * @param int $chunkSize * @return \Iterator * @throws \OCP\DB\Exception */ - private function getItemIds(string $table, string $idCol, string $typeCol, int $chunkSize): \Iterator { + private function getItemIds(string $table, string $idCol, string $typeCol, bool $numericId, int $chunkSize): \Iterator { + $cursorType = $numericId ? IQueryBuilder::PARAM_INT : IQueryBuilder::PARAM_STR; + $query = $this->connection->getQueryBuilder(); $query->select($idCol) ->from($table) ->where($query->expr()->eq($typeCol, $query->expr()->literal('files'))) - ->groupBy($idCol) ->andWhere($query->expr()->gt($idCol, $query->createParameter('min_id'))) + ->groupBy($idCol) + ->orderBy($idCol) ->setMaxResults($chunkSize); - $minId = 0; + $minId = $numericId ? 0 : '0'; while (true) { - $query->setParameter('min_id', $minId); + $query->setParameter('min_id', $minId, $cursorType); $rows = $query->executeQuery()->fetchFirstColumn(); if (count($rows) > 0) { $minId = $rows[count($rows) - 1]; @@ -139,7 +146,7 @@ private function findMissingSources(array $ids): array { * @return int Number of deleted entries */ protected function cleanSystemTags() { - $deletedEntries = $this->cleanUp('systemtag_object_mapping', 'objectid', 'objecttype'); + $deletedEntries = $this->cleanUp('systemtag_object_mapping', 'objectid', 'objecttype', false); $this->logger->debug("$deletedEntries orphaned system tag relations deleted", ['app' => 'DeleteOrphanedItems']); return $deletedEntries; } @@ -150,7 +157,7 @@ protected function cleanSystemTags() { * @return int Number of deleted entries */ protected function cleanUserTags() { - $deletedEntries = $this->cleanUp('vcategory_to_object', 'objid', 'type'); + $deletedEntries = $this->cleanUp('vcategory_to_object', 'objid', 'type', true); $this->logger->debug("$deletedEntries orphaned user tag relations deleted", ['app' => 'DeleteOrphanedItems']); return $deletedEntries; } @@ -161,7 +168,7 @@ protected function cleanUserTags() { * @return int Number of deleted entries */ protected function cleanComments() { - $deletedEntries = $this->cleanUp('comments', 'object_id', 'object_type'); + $deletedEntries = $this->cleanUp('comments', 'object_id', 'object_type', false); $this->logger->debug("$deletedEntries orphaned comments deleted", ['app' => 'DeleteOrphanedItems']); return $deletedEntries; } @@ -172,7 +179,7 @@ protected function cleanComments() { * @return int Number of deleted entries */ protected function cleanCommentMarkers() { - $deletedEntries = $this->cleanUp('comments_read_markers', 'object_id', 'object_type'); + $deletedEntries = $this->cleanUp('comments_read_markers', 'object_id', 'object_type', false); $this->logger->debug("$deletedEntries orphaned comment read marks deleted", ['app' => 'DeleteOrphanedItems']); return $deletedEntries; } diff --git a/apps/files/tests/BackgroundJob/DeleteOrphanedItemsJobTest.php b/apps/files/tests/BackgroundJob/DeleteOrphanedItemsJobTest.php index 8f3ec56c94105..f64d80324dbd9 100644 --- a/apps/files/tests/BackgroundJob/DeleteOrphanedItemsJobTest.php +++ b/apps/files/tests/BackgroundJob/DeleteOrphanedItemsJobTest.php @@ -51,6 +51,35 @@ protected function getMappings(string $table): array { return $mapping; } + protected function createFileCacheEntry(): int { + $path = 'apps/files/tests/deleteorphaneditemsjobtest-' . self::getUniqueID(); + $query = $this->connection->getQueryBuilder(); + $query->insert('filecache') + ->values([ + 'storage' => $query->createNamedParameter(1337, IQueryBuilder::PARAM_INT), + 'path' => $query->createNamedParameter($path), + 'path_hash' => $query->createNamedParameter(md5($path)), + ])->executeStatement(); + return $query->getLastInsertId(); + } + + protected function deleteFileCacheEntry(int $fileId): void { + $query = $this->connection->getQueryBuilder(); + $query->delete('filecache') + ->where($query->expr()->eq('fileid', $query->createNamedParameter($fileId, IQueryBuilder::PARAM_INT))) + ->executeStatement(); + } + + protected function insertSystemTagMapping(int $objectId, int $tagId): void { + $query = $this->connection->getQueryBuilder(); + $query->insert('systemtag_object_mapping') + ->values([ + 'objectid' => $query->createNamedParameter($objectId, IQueryBuilder::PARAM_INT), + 'objecttype' => $query->createNamedParameter('files'), + 'systemtagid' => $query->createNamedParameter($tagId, IQueryBuilder::PARAM_INT), + ])->executeStatement(); + } + /** * Test clearing orphaned system tag mappings */ @@ -100,6 +129,47 @@ public function testClearSystemTagMappings(): void { $this->cleanMapping('systemtag_object_mapping'); } + /** + * The chunked clean-up must delete every orphaned row - including a file + * referenced by several tags - while keeping all rows whose file still + * exists, no matter how many tags it has (the GROUP BY/dedup case). + */ + public function testClearSystemTagMappingsKeepsPresentRemovesOrphans(): void { + $this->cleanMapping('systemtag_object_mapping'); + + $presentA = $this->createFileCacheEntry(); + $presentB = $this->createFileCacheEntry(); + // A file id guaranteed to be absent from the cache: create a row, keep + // its id, then delete it again (auto-increment ids are never reused). + $orphan = $this->createFileCacheEntry(); + $this->deleteFileCacheEntry($orphan); + + // Present file with two tags -> both kept. + $this->insertSystemTagMapping($presentA, 1); + $this->insertSystemTagMapping($presentA, 2); + // Present file with one tag -> kept. + $this->insertSystemTagMapping($presentB, 1); + // Orphaned file with two tags -> both removed. + $this->insertSystemTagMapping($orphan, 1); + $this->insertSystemTagMapping($orphan, 2); + + $this->assertCount(5, $this->getMappings('systemtag_object_mapping')); + + $job = new DeleteOrphanedItems($this->timeFactory, $this->connection, $this->logger); + self::invokePrivate($job, 'cleanSystemTags'); + + $mapping = $this->getMappings('systemtag_object_mapping'); + $remainingIds = array_map(static fn (array $row): int => (int)$row['objectid'], $mapping); + sort($remainingIds); + $expected = [$presentA, $presentA, $presentB]; + sort($expected); + $this->assertSame($expected, $remainingIds); + + $this->deleteFileCacheEntry($presentA); + $this->deleteFileCacheEntry($presentB); + $this->cleanMapping('systemtag_object_mapping'); + } + /** * Test clearing orphaned system tag mappings */