Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 51 additions & 44 deletions apps/files/lib/BackgroundJob/DeleteOrphanedItems.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
* Delete all share entries that have no matching entries in the file cache table.
*/
class DeleteOrphanedItems extends TimedJob {
public const CHUNK_SIZE = 200;
public const CHUNK_SIZE = 1000;

/**
* sets the correct interval for this timed job
Expand Down Expand Up @@ -46,74 +46,81 @@ public function run($argument) {
}

/**
* Deleting orphaned system tag mappings
* Delete mapping rows of the 'files' object type whose referenced file no
* longer exists in the file cache.
*
* @param string $table
* @param string $idCol
* @param string $typeCol
* The candidate ids are read from the mapping table itself in keyset-paginated
* chunks and each chunk is checked against the filecache primary key. This
* avoids joining the (potentially huge) mapping table against filecache with a
* GROUP BY, which reads the whole mapping table - and on some databases
* materialises a temp table - on every run even when there are no orphans (the
* common case). It also works the same way whether or not filecache is sharded,
* so a single code path covers both.
*
* @param string $table The mapping table to clean up
* @param string $idCol The column referencing the file id
* @param string $typeCol The column holding the object type
* @param bool $numericId Whether $idCol is an integer column. String columns
* hold the numeric file id as text; the keyset cursor is
* compared (and the rows ordered) using the column's own
* type so the index on $idCol stays usable on every
* database instead of being defeated by an implicit cast.
* @return int Number of deleted entries
*/
protected function cleanUp(string $table, string $idCol, string $typeCol): int {
protected function cleanUp(string $table, string $idCol, string $typeCol, bool $numericId): int {
$deletedEntries = 0;

$deleteQuery = $this->connection->getQueryBuilder();
$deleteQuery->delete($table)
->where($deleteQuery->expr()->eq($idCol, $deleteQuery->createParameter('objectid')));

if ($this->connection->getShardDefinition('filecache')) {
$sourceIdChunks = $this->getItemIds($table, $idCol, $typeCol, 1000);
foreach ($sourceIdChunks as $sourceIdChunk) {
$deletedSources = $this->findMissingSources($sourceIdChunk);
$deleteQuery->setParameter('objectid', $deletedSources, IQueryBuilder::PARAM_INT_ARRAY);
$deletedEntries += $deleteQuery->executeStatement();
}
} else {
$query = $this->connection->getQueryBuilder();
$query->select('t1.' . $idCol)
->from($table, 't1')
->where($query->expr()->eq($typeCol, $query->expr()->literal('files')))
->leftJoin('t1', 'filecache', 't2', $query->expr()->eq($query->expr()->castColumn('t1.' . $idCol, IQueryBuilder::PARAM_INT), 't2.fileid'))
->andWhere($query->expr()->isNull('t2.fileid'))
->groupBy('t1.' . $idCol)
->setMaxResults(self::CHUNK_SIZE);

$deleteQuery = $this->connection->getQueryBuilder();
$deleteQuery->delete($table)
->where($deleteQuery->expr()->in($idCol, $deleteQuery->createParameter('objectid')));

$deletedInLastChunk = self::CHUNK_SIZE;
while ($deletedInLastChunk === self::CHUNK_SIZE) {
$chunk = $query->executeQuery()->fetchFirstColumn();
$deletedInLastChunk = count($chunk);

$deleteQuery->setParameter('objectid', $chunk, IQueryBuilder::PARAM_INT_ARRAY);
$deletedEntries += $deleteQuery->executeStatement();
->where($deleteQuery->expr()->in($idCol, $deleteQuery->createParameter('objectid')));

foreach ($this->getItemIds($table, $idCol, $typeCol, $numericId, self::CHUNK_SIZE) as $idChunk) {
$missingSources = $this->findMissingSources($idChunk);
if (count($missingSources) === 0) {
continue;
}

$deleteQuery->setParameter('objectid', $missingSources, IQueryBuilder::PARAM_INT_ARRAY);
$deletedEntries += $deleteQuery->executeStatement();
}

return $deletedEntries;
}

/**
* Yield the distinct 'files' ids of $table in keyset-paginated chunks.
*
* Chunks are ordered by $idCol and advanced with a `$idCol > cursor`
* comparison so the scan stays on the index covering ($typeCol, $idCol). The
* cursor is bound - and the rows therefore ordered and compared - using the
* column's own type: an integer column numerically, a string column lexically.
* Mixing the two (e.g. comparing a varchar column to an integer) would force
* an implicit cast that defeats the index and makes the ordering and the
* comparison disagree, which could skip chunks.
*
* @param string $table
* @param string $idCol
* @param string $typeCol
* @param bool $numericId Whether $idCol is an integer column
* @param int $chunkSize
* @return \Iterator<int[]>
* @throws \OCP\DB\Exception
*/
private function getItemIds(string $table, string $idCol, string $typeCol, int $chunkSize): \Iterator {
private function getItemIds(string $table, string $idCol, string $typeCol, bool $numericId, int $chunkSize): \Iterator {
$cursorType = $numericId ? IQueryBuilder::PARAM_INT : IQueryBuilder::PARAM_STR;

$query = $this->connection->getQueryBuilder();
$query->select($idCol)
->from($table)
->where($query->expr()->eq($typeCol, $query->expr()->literal('files')))
->groupBy($idCol)
->andWhere($query->expr()->gt($idCol, $query->createParameter('min_id')))
->groupBy($idCol)
->orderBy($idCol)
->setMaxResults($chunkSize);

$minId = 0;
$minId = $numericId ? 0 : '0';
while (true) {
$query->setParameter('min_id', $minId);
$query->setParameter('min_id', $minId, $cursorType);
$rows = $query->executeQuery()->fetchFirstColumn();
if (count($rows) > 0) {
$minId = $rows[count($rows) - 1];
Expand All @@ -139,7 +146,7 @@ private function findMissingSources(array $ids): array {
* @return int Number of deleted entries
*/
protected function cleanSystemTags() {
$deletedEntries = $this->cleanUp('systemtag_object_mapping', 'objectid', 'objecttype');
$deletedEntries = $this->cleanUp('systemtag_object_mapping', 'objectid', 'objecttype', false);
$this->logger->debug("$deletedEntries orphaned system tag relations deleted", ['app' => 'DeleteOrphanedItems']);
return $deletedEntries;
}
Expand All @@ -150,7 +157,7 @@ protected function cleanSystemTags() {
* @return int Number of deleted entries
*/
protected function cleanUserTags() {
$deletedEntries = $this->cleanUp('vcategory_to_object', 'objid', 'type');
$deletedEntries = $this->cleanUp('vcategory_to_object', 'objid', 'type', true);
$this->logger->debug("$deletedEntries orphaned user tag relations deleted", ['app' => 'DeleteOrphanedItems']);
return $deletedEntries;
}
Expand All @@ -161,7 +168,7 @@ protected function cleanUserTags() {
* @return int Number of deleted entries
*/
protected function cleanComments() {
$deletedEntries = $this->cleanUp('comments', 'object_id', 'object_type');
$deletedEntries = $this->cleanUp('comments', 'object_id', 'object_type', false);
$this->logger->debug("$deletedEntries orphaned comments deleted", ['app' => 'DeleteOrphanedItems']);
return $deletedEntries;
}
Expand All @@ -172,7 +179,7 @@ protected function cleanComments() {
* @return int Number of deleted entries
*/
protected function cleanCommentMarkers() {
$deletedEntries = $this->cleanUp('comments_read_markers', 'object_id', 'object_type');
$deletedEntries = $this->cleanUp('comments_read_markers', 'object_id', 'object_type', false);
$this->logger->debug("$deletedEntries orphaned comment read marks deleted", ['app' => 'DeleteOrphanedItems']);
return $deletedEntries;
}
Expand Down
70 changes: 70 additions & 0 deletions apps/files/tests/BackgroundJob/DeleteOrphanedItemsJobTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,35 @@ protected function getMappings(string $table): array {
return $mapping;
}

protected function createFileCacheEntry(): int {
$path = 'apps/files/tests/deleteorphaneditemsjobtest-' . self::getUniqueID();
$query = $this->connection->getQueryBuilder();
$query->insert('filecache')
->values([
'storage' => $query->createNamedParameter(1337, IQueryBuilder::PARAM_INT),
'path' => $query->createNamedParameter($path),
'path_hash' => $query->createNamedParameter(md5($path)),
])->executeStatement();
return $query->getLastInsertId();
}

protected function deleteFileCacheEntry(int $fileId): void {
$query = $this->connection->getQueryBuilder();
$query->delete('filecache')
->where($query->expr()->eq('fileid', $query->createNamedParameter($fileId, IQueryBuilder::PARAM_INT)))
->executeStatement();
}

protected function insertSystemTagMapping(int $objectId, int $tagId): void {
$query = $this->connection->getQueryBuilder();
$query->insert('systemtag_object_mapping')
->values([
'objectid' => $query->createNamedParameter($objectId, IQueryBuilder::PARAM_INT),
'objecttype' => $query->createNamedParameter('files'),
'systemtagid' => $query->createNamedParameter($tagId, IQueryBuilder::PARAM_INT),
])->executeStatement();
}

/**
* Test clearing orphaned system tag mappings
*/
Expand Down Expand Up @@ -100,6 +129,47 @@ public function testClearSystemTagMappings(): void {
$this->cleanMapping('systemtag_object_mapping');
}

/**
* The chunked clean-up must delete every orphaned row - including a file
* referenced by several tags - while keeping all rows whose file still
* exists, no matter how many tags it has (the GROUP BY/dedup case).
*/
public function testClearSystemTagMappingsKeepsPresentRemovesOrphans(): void {
$this->cleanMapping('systemtag_object_mapping');

$presentA = $this->createFileCacheEntry();
$presentB = $this->createFileCacheEntry();
// A file id guaranteed to be absent from the cache: create a row, keep
// its id, then delete it again (auto-increment ids are never reused).
$orphan = $this->createFileCacheEntry();
$this->deleteFileCacheEntry($orphan);

// Present file with two tags -> both kept.
$this->insertSystemTagMapping($presentA, 1);
$this->insertSystemTagMapping($presentA, 2);
// Present file with one tag -> kept.
$this->insertSystemTagMapping($presentB, 1);
// Orphaned file with two tags -> both removed.
$this->insertSystemTagMapping($orphan, 1);
$this->insertSystemTagMapping($orphan, 2);

$this->assertCount(5, $this->getMappings('systemtag_object_mapping'));

$job = new DeleteOrphanedItems($this->timeFactory, $this->connection, $this->logger);
self::invokePrivate($job, 'cleanSystemTags');

$mapping = $this->getMappings('systemtag_object_mapping');
$remainingIds = array_map(static fn (array $row): int => (int)$row['objectid'], $mapping);
sort($remainingIds);
$expected = [$presentA, $presentA, $presentB];
sort($expected);
$this->assertSame($expected, $remainingIds);

$this->deleteFileCacheEntry($presentA);
$this->deleteFileCacheEntry($presentB);
$this->cleanMapping('systemtag_object_mapping');
}

/**
* Test clearing orphaned system tag mappings
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ private static function clearConditionPart(string $part): string {
} elseif (str_starts_with($part, 'to_number(to_char(')) {
// oracle cast to int
$part = substr($part, strlen('to_number(to_char('), -2);
} elseif (str_starts_with($part, 'to_number(to_char(')) {
} elseif (str_starts_with($part, 'to_char(')) {
// oracle cast to string
$part = substr($part, strlen('to_char('), -1);
}
Expand Down
5 changes: 4 additions & 1 deletion lib/private/Files/Cache/CacheQueryBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,11 @@ public function selectTagUsage(): self {
->selectAlias($this->func()->count('filecache.fileid'), 'number_files')
->selectAlias($this->func()->max('filecache.fileid'), 'ref_file_id')
->from('filecache', 'filecache')
// Compare as strings (objectid = CAST(fileid AS CHAR)) so the systag_by_objectid
// index on the string column objectid stays usable; casting objectid to int (or
// the implicit string->number coercion against the bigint fileid) defeats it.
->leftJoin('filecache', 'systemtag_object_mapping', 'systemtagmap', $this->expr()->andX(
$this->expr()->eq('filecache.fileid', $this->expr()->castColumn('systemtagmap.objectid', IQueryBuilder::PARAM_INT)),
$this->expr()->eq('systemtagmap.objectid', $this->expr()->castColumn('filecache.fileid', IQueryBuilder::PARAM_STR)),
$this->expr()->eq('systemtagmap.objecttype', $this->createNamedParameter('files'))
))
->leftJoin('systemtagmap', 'systemtag', 'systemtag', $this->expr()->andX(
Expand Down
7 changes: 6 additions & 1 deletion lib/private/Files/Cache/QuerySearchHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,13 @@ public function findUsedTagsInCaches(ISearchQuery $searchQuery, array $caches):
}

protected function equipQueryForSystemTags(CacheQueryBuilder $query, IUser $user): void {
// Compare the join as strings (objectid = CAST(fileid AS CHAR)) instead of
// casting objectid to an integer. objectid is a string column, so casting it
// (or letting MySQL implicitly coerce the string column to a number when comparing
// to the bigint fileid) makes the systag_by_objectid index unusable and turns this
// into a full/BNL join. Casting fileid instead keeps the index on objectid usable.
$query->leftJoin('file', 'systemtag_object_mapping', 'systemtagmap', $query->expr()->andX(
$query->expr()->eq('file.fileid', $query->expr()->castColumn('systemtagmap.objectid', IQueryBuilder::PARAM_INT)),
$query->expr()->eq('systemtagmap.objectid', $query->expr()->castColumn('file.fileid', IQueryBuilder::PARAM_STR)),
$query->expr()->eq('systemtagmap.objecttype', $query->createNamedParameter('files'))
));
$on = $query->expr()->andX($query->expr()->eq('systemtag.id', 'systemtagmap.systemtagid'));
Expand Down
14 changes: 14 additions & 0 deletions tests/lib/DB/QueryBuilder/Partitioned/JoinConditionTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,18 @@ public function testParseCastCondition(string $platform): void {
$this->assertEquals('f.fileid', $parsed->toColumn);
$this->assertEquals([], $parsed->fromConditions);
}

#[\PHPUnit\Framework\Attributes\DataProvider('platformProvider')]
public function testParseStringCastCondition(string $platform): void {
$query = $this->getBuilder($platform);

// Mirrors the systemtag<->filecache join: the string objectid column is compared
// to fileid cast to a string (so the index on objectid stays usable). The string
// cast must be stripped on every platform when parsing the join condition.
$condition = $query->expr()->eq('m.objectid', $query->expr()->castColumn('f.fileid', IQueryBuilder::PARAM_STR));
$parsed = JoinCondition::parse($condition, 'filecache', 'f', 'm');
$this->assertEquals('m.objectid', $parsed->fromColumn);
$this->assertEquals('f.fileid', $parsed->toColumn);
$this->assertEquals([], $parsed->fromConditions);
}
}
Loading