From d55875e18c01461847e56073a444015ea5e23393 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 13 Mar 2026 18:44:04 +0530 Subject: [PATCH 1/2] feat: add support for multimodal indexing Signed-off-by: Anupam Kumar --- appinfo/info.xml | 2 +- lib/AppInfo/Application.php | 29 +++++++++++ lib/BackgroundJobs/StorageCrawlJob.php | 9 +++- lib/Command/QueueMultimodalFiles.php | 68 ++++++++++++++++++++++++++ lib/Listener/ShareListener.php | 6 ++- lib/Service/FsEventService.php | 7 ++- lib/Service/StorageService.php | 48 +++++++++++++----- lib/Service/TaskTypeService.php | 59 ++++++++++++++++++++++ 8 files changed, 207 insertions(+), 21 deletions(-) create mode 100644 lib/Command/QueueMultimodalFiles.php create mode 100644 lib/Service/TaskTypeService.php diff --git a/appinfo/info.xml b/appinfo/info.xml index 9394a0a..3a6772d 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -44,11 +44,11 @@ Refer to the [Context Chat Backend's readme](https://github.com/nextcloud/contex OCA\ContextChat\BackgroundJobs\FileSystemListenerJob - OCA\ContextChat\BackgroundJobs\ActionJob OCA\ContextChat\BackgroundJobs\RotateLogsJob OCA\ContextChat\Command\Prompt + OCA\ContextChat\Command\QueueMultimodalFiles OCA\ContextChat\Command\Search OCA\ContextChat\Command\Statistics diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php index 0ea6b81..24e87cb 100644 --- a/lib/AppInfo/Application.php +++ b/lib/AppInfo/Application.php @@ -61,6 +61,35 @@ class Application extends App implements IBootstrap { 'text/org', ]; + public const IMAGE_MIMETYPES = [ + 'image/bmp', + 'image/bpg', + 'image/emf', + 'image/gif', + 'image/heic', + 'image/heif', + 'image/jp2', + 'image/jpeg', + 'image/png', + 'image/svg+xml', + 'image/tga', + 'image/tiff', + 'image/webp', + 'image/x-dcraw', + 'image/x-icon', + ]; + + public const AUDIO_MIMETYPES = [ + 'audio/aac', + 'audio/flac', + 'audio/mp4', + 'audio/mpeg', + 'audio/ogg', + 'audio/wav', + 'audio/webm', + 'audio/x-scpls', + ]; + public function __construct(array $urlParams = []) { parent::__construct(self::APP_ID, $urlParams); } diff --git a/lib/BackgroundJobs/StorageCrawlJob.php b/lib/BackgroundJobs/StorageCrawlJob.php index 1447fed..9436e40 100644 --- a/lib/BackgroundJobs/StorageCrawlJob.php +++ b/lib/BackgroundJobs/StorageCrawlJob.php @@ -10,11 +10,13 @@ namespace OCA\ContextChat\BackgroundJobs; +use OCA\ContextChat\AppInfo\Application; use OCA\ContextChat\Db\QueueFile; use OCA\ContextChat\Logger; use OCA\ContextChat\Service\DiagnosticService; use OCA\ContextChat\Service\QueueService; use OCA\ContextChat\Service\StorageService; +use OCA\ContextChat\Service\TaskTypeService; use OCP\AppFramework\Services\IAppConfig; use OCP\AppFramework\Utility\ITimeFactory; use OCP\BackgroundJob\IJobList; @@ -33,12 +35,13 @@ public function __construct( private StorageService $storageService, private DiagnosticService $diagnosticService, private IAppConfig $appConfig, + private TaskTypeService $taskTypeService, ) { parent::__construct($timeFactory); } /** - * @param array{storage_id:int, root_id:int, overridden_root:int|null, override_root:int|null, last_file_id:int} $argument + * @param array{storage_id:int, root_id:int, overridden_root:int|null, override_root:int|null, last_file_id:int, only_non_textual?:bool} $argument * @return void */ protected function run($argument): void { @@ -46,6 +49,8 @@ protected function run($argument): void { $rootId = $argument['root_id']; $overrideRoot = $argument['overridden_root'] ?? $argument['override_root'] ?? $rootId; $lastFileId = $argument['last_file_id']; + $onlyNonTextual = $argument['only_non_textual'] ?? false; + $mimeTypes = $this->taskTypeService->getMultimodalMimetypes(!$onlyNonTextual); // Remove current iteration $this->jobList->remove(self::class, $argument); @@ -56,7 +61,7 @@ protected function run($argument): void { $mountFilesCount = 0; $lastSuccessfulFileId = -1; - foreach ($this->storageService->getFilesInMount($storageId, $overrideRoot ?? $rootId, $lastFileId, self::BATCH_SIZE) as $fileId) { + foreach ($this->storageService->getFilesInMount($storageId, $overrideRoot ?? $rootId, $lastFileId, self::BATCH_SIZE, $mimeTypes) as $fileId) { $queueFile = new QueueFile(); $queueFile->setStorageId($storageId); $queueFile->setRootId($rootId); diff --git a/lib/Command/QueueMultimodalFiles.php b/lib/Command/QueueMultimodalFiles.php new file mode 100644 index 0000000..0bfd45a --- /dev/null +++ b/lib/Command/QueueMultimodalFiles.php @@ -0,0 +1,68 @@ +setName('context_chat:queue-multimodal-files') + ->setDescription( + 'Queue existing multimodal files (Images and Audio) for indexation.' + . ' Each type of files is queued only if the required TaskProcessing task provider is available.' + . ' OCR for Images and Speech-to-text for Audio.' + . ' See https://docs.nextcloud.com/server/latest/admin_manual/ai/overview.html for more information.' + ); + } + + protected function execute(InputInterface $input, OutputInterface $output) { + if (!$this->taskTypeService->isOcrTaskTypeAvailable()) { + $output->writeln('OCR task type is not available.'); + } + if (!$this->taskTypeService->isSpeechToTextTaskTypeAvailable()) { + $output->writeln('Speech-to-text task type is not available.'); + } + + try { + foreach ($this->storageService->getMounts() as $mount) { + $this->logger->debug('Scheduling StorageCrawlJob storage_id=' . $mount['storage_id'] . ' root_id=' . $mount['root_id' ] . 'override_root=' . $mount['overridden_root']); + $this->jobList->add(StorageCrawlJob::class, [ + 'storage_id' => $mount['storage_id'], + 'root_id' => $mount['root_id' ], + 'overridden_root' => $mount['overridden_root'], + 'last_file_id' => 0, + 'only_non_textual' => true, + ]); + } + } catch (\Exception $e) { + $this->logger->error('Failed to schedule StorageCrawlJob to find files for indexation.', ['exception' => $e]); + $output->writeln('Failed to schedule StorageCrawlJob to find files for indexation: ' . $e->getMessage() . ''); + return 1; + } + + $output->writeln('Multimodal files have been scheduled to be queued for indexation.'); + return 0; + } +} diff --git a/lib/Listener/ShareListener.php b/lib/Listener/ShareListener.php index 528bd4f..6f76805 100644 --- a/lib/Listener/ShareListener.php +++ b/lib/Listener/ShareListener.php @@ -10,12 +10,12 @@ namespace OCA\ContextChat\Listener; -use OCA\ContextChat\AppInfo\Application; use OCA\ContextChat\Logger; use OCA\ContextChat\Public\UpdateAccessOp; use OCA\ContextChat\Service\ActionScheduler; use OCA\ContextChat\Service\ProviderConfigService; use OCA\ContextChat\Service\StorageService; +use OCA\ContextChat\Service\TaskTypeService; use OCP\EventDispatcher\Event; use OCP\EventDispatcher\IEventListener; use OCP\Files\FileInfo; @@ -37,6 +37,7 @@ public function __construct( private IManager $shareManager, private ActionScheduler $actionService, private IGroupManager $groupManager, + private TaskTypeService $taskTypeService, ) { } @@ -145,6 +146,7 @@ public function handle(Event $event): void { private function allowedMimeType(Node $file): bool { $mimeType = $file->getMimeType(); - return in_array($mimeType, Application::MIMETYPES, true); + $mimeTypes = $this->taskTypeService->getMultimodalMimetypes(); + return in_array($mimeType, $mimeTypes, true); } } diff --git a/lib/Service/FsEventService.php b/lib/Service/FsEventService.php index 58adb0f..c6bc432 100644 --- a/lib/Service/FsEventService.php +++ b/lib/Service/FsEventService.php @@ -7,13 +7,11 @@ namespace OCA\ContextChat\Service; -use OCA\ContextChat\AppInfo\Application; use OCA\ContextChat\Db\QueueFile; use OCA\ContextChat\Logger; use OCP\DB\Exception; use OCP\Files\Folder; use OCP\Files\InvalidPathException; -use OCP\Files\IRootFolder; use OCP\Files\Node; use OCP\Files\NotFoundException; @@ -24,7 +22,7 @@ public function __construct( private QueueService $queue, private ActionScheduler $actionService, private StorageService $storageService, - private IRootFolder $rootFolder, + private TaskTypeService $taskTypeService, ) { } @@ -134,7 +132,8 @@ public function onInsert(Node $node, bool $recurse = true, bool $update = false) private function allowedMimeType(Node $file): bool { $mimeType = $file->getMimeType(); - return in_array($mimeType, Application::MIMETYPES, true); + $mimeTypes = $this->taskTypeService->getMultimodalMimetypes(); + return in_array($mimeType, $mimeTypes, true); } private function allowedPath(Node $file): bool { diff --git a/lib/Service/StorageService.php b/lib/Service/StorageService.php index b937259..7709bea 100644 --- a/lib/Service/StorageService.php +++ b/lib/Service/StorageService.php @@ -20,7 +20,6 @@ use OCP\Files\Config\IUserMountCache; use OCP\Files\Folder; use OCP\Files\IMimeTypeLoader; -use OCP\Files\IRootFolder; use OCP\Files\Node; use OCP\FilesMetadata\IFilesMetadataManager; use OCP\IDBConnection; @@ -44,8 +43,8 @@ public function __construct( private IMimeTypeLoader $mimeTypes, private IUserMountCache $userMountCache, private IFilesMetadataManager $metadataManager, - private IRootFolder $rootFolder, private IFileAccess $fileAccess, + private TaskTypeService $taskTypeService, ) { } @@ -84,7 +83,8 @@ public function countFilesInMount(int $storageId, int $rootId): int { return 0; } - $mimeTypes = array_map(fn ($mimeType) => $this->mimeTypes->getId($mimeType), Application::MIMETYPES); + $mimeTypes = $this->taskTypeService->getMultimodalMimetypes(); + $mimeTypesIds = array_map(fn ($mimeType) => $this->mimeTypes->getId($mimeType), $mimeTypes); $qb = $this->getCacheQueryBuilder(); @@ -110,7 +110,7 @@ public function countFilesInMount(int $storageId, int $rootId): int { ->andWhere($qb->expr()->notLike('filecache.path', $qb->createNamedParameter('files_versions/%'))) ->andWhere($qb->expr()->notLike('filecache.path', $qb->createNamedParameter('files_trashbin/%'))) ->andWhere($qb->expr()->eq('filecache.storage', $qb->createNamedParameter($storageId))) - ->andWhere($qb->expr()->in('filecache.mimetype', $qb->createNamedParameter($mimeTypes, IQueryBuilder::PARAM_INT_ARRAY))) + ->andWhere($qb->expr()->in('filecache.mimetype', $qb->createNamedParameter($mimeTypesIds, IQueryBuilder::PARAM_INT_ARRAY))) ->andWhere($qb->expr()->lte('filecache.size', $qb->createNamedParameter(Application::CC_MAX_SIZE, IQueryBuilder::PARAM_INT))) ->andWhere($qb->expr()->gt('filecache.size', $qb->createNamedParameter(0, IQueryBuilder::PARAM_INT))); $result = $qb->executeQuery(); @@ -199,14 +199,24 @@ private function getMountsOld(): \Generator { * @param int $rootId * @param int $lastFileId * @param int $maxResults + * @param list $mimeTypes * @return \Generator */ - public function getFilesInMount(int $storageId, int $rootId, int $lastFileId = 0, int $maxResults = 100): \Generator { + public function getFilesInMount( + int $storageId, + int $rootId, + int $lastFileId = 0, + int $maxResults = 100, + array $mimeTypes = [], + ): \Generator { + if ($mimeTypes === []) { + $mimeTypes = $this->taskTypeService->getMultimodalMimetypes(); + } if (!$this->isFileAccessAvailable()) { - return $this->getFilesInMountOld($storageId, $rootId, $lastFileId, $maxResults); + return $this->getFilesInMountOld($storageId, $rootId, $lastFileId, $maxResults, $mimeTypes); } - return $this->getFilesInMountUsingFileAccess($storageId, $rootId, $lastFileId, $maxResults); + return $this->getFilesInMountUsingFileAccess($storageId, $rootId, $lastFileId, $maxResults, $mimeTypes); } /** @@ -214,10 +224,17 @@ public function getFilesInMount(int $storageId, int $rootId, int $lastFileId = 0 * @param int $rootId * @param int $lastFileId * @param int $maxResults + * @param list $mimeTypes * @return \Generator */ - private function getFilesInMountUsingFileAccess(int $storageId, int $rootId, int $lastFileId = 0, int $maxResults = 100): \Generator { - $mimeTypeIds = array_map(fn ($mimeType) => $this->mimeTypes->getId($mimeType), Application::MIMETYPES); + private function getFilesInMountUsingFileAccess( + int $storageId, + int $rootId, + int $lastFileId = 0, + int $maxResults = 100, + array $mimeTypes = Application::MIMETYPES, + ): \Generator { + $mimeTypeIds = array_map(fn ($mimeType) => $this->mimeTypes->getId($mimeType), $mimeTypes); foreach ($this->fileAccess->getByAncestorInStorage($storageId, $rootId, $lastFileId, $maxResults, $mimeTypeIds, false, true) as $cacheEntry) { yield $cacheEntry['fileid']; } @@ -228,9 +245,16 @@ private function getFilesInMountUsingFileAccess(int $storageId, int $rootId, int * @param int $rootId * @param int $lastFileId * @param int $maxResults + * @param list $mimeTypes * @return \Generator */ - private function getFilesInMountOld(int $storageId, int $rootId, int $lastFileId = 0, int $maxResults = 100): \Generator { + private function getFilesInMountOld( + int $storageId, + int $rootId, + int $lastFileId = 0, + int $maxResults = 100, + array $mimeTypes = Application::MIMETYPES, + ): \Generator { $qb = $this->getCacheQueryBuilder(); try { $qb->selectFileCache(); @@ -249,7 +273,7 @@ private function getFilesInMountOld(int $storageId, int $rootId, int $lastFileId return; } - $mimeTypes = array_map(fn ($mimeType) => $this->mimeTypes->getId($mimeType), Application::MIMETYPES); + $mimeTypesIds = array_map(fn ($mimeType) => $this->mimeTypes->getId($mimeType), $mimeTypes); $qb = $this->getCacheQueryBuilder(); @@ -272,7 +296,7 @@ private function getFilesInMountOld(int $storageId, int $rootId, int $lastFileId ->andWhere($qb->expr()->like('filecache.path', $qb->createNamedParameter($path . '%'))) ->andWhere($qb->expr()->eq('filecache.storage', $qb->createNamedParameter($storageId))) ->andWhere($qb->expr()->gt('filecache.fileid', $qb->createNamedParameter($lastFileId))) - ->andWhere($qb->expr()->in('filecache.mimetype', $qb->createNamedParameter($mimeTypes, IQueryBuilder::PARAM_INT_ARRAY))); + ->andWhere($qb->expr()->in('filecache.mimetype', $qb->createNamedParameter($mimeTypesIds, IQueryBuilder::PARAM_INT_ARRAY))); if ($maxResults !== 0) { $qb->setMaxResults($maxResults); diff --git a/lib/Service/TaskTypeService.php b/lib/Service/TaskTypeService.php new file mode 100644 index 0000000..cbe0bc5 --- /dev/null +++ b/lib/Service/TaskTypeService.php @@ -0,0 +1,59 @@ +taskProcessingManager->getPreferredProvider(self::OCR_TASK_TYPE); + return true; + } catch (\Exception $e) { + $this->logger->debug('OCR task type is not available: ' . $e->getMessage()); + return false; + } + } + + public function isSpeechToTextTaskTypeAvailable(): bool { + try { + $this->taskProcessingManager->getPreferredProvider(self::SPEECH_TO_TEXT_TASK_TYPE); + return true; + } catch (\Exception $e) { + $this->logger->debug('Speech-to-text task type is not available: ' . $e->getMessage()); + return false; + } + } + + /** + * @return list + */ + public function getMultimodalMimetypes(bool $includingTextual = true): array { + $imagesEnabled = $this->isOcrTaskTypeAvailable(); + $audioEnabled = $this->isSpeechToTextTaskTypeAvailable(); + return array_merge( + $includingTextual ? Application::MIMETYPES : [], + $imagesEnabled ? Application::IMAGE_MIMETYPES : [], + $audioEnabled ? Application::AUDIO_MIMETYPES : [] + ); + } +} From 7f69a727475ede4204a4ee1d854dca163647555b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 17 Mar 2026 14:24:08 +0530 Subject: [PATCH 2/2] chore: convert the multimodal file queueing command to a migration Signed-off-by: Anupam Kumar --- appinfo/info.xml | 1 - lib/Command/QueueMultimodalFiles.php | 68 ------------------- .../Version006000000Date20260316135634.php | 68 +++++++++++++++++++ 3 files changed, 68 insertions(+), 69 deletions(-) delete mode 100644 lib/Command/QueueMultimodalFiles.php create mode 100644 lib/Migration/Version006000000Date20260316135634.php diff --git a/appinfo/info.xml b/appinfo/info.xml index 3a6772d..744f35e 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -48,7 +48,6 @@ Refer to the [Context Chat Backend's readme](https://github.com/nextcloud/contex OCA\ContextChat\Command\Prompt - OCA\ContextChat\Command\QueueMultimodalFiles OCA\ContextChat\Command\Search OCA\ContextChat\Command\Statistics diff --git a/lib/Command/QueueMultimodalFiles.php b/lib/Command/QueueMultimodalFiles.php deleted file mode 100644 index 0bfd45a..0000000 --- a/lib/Command/QueueMultimodalFiles.php +++ /dev/null @@ -1,68 +0,0 @@ -setName('context_chat:queue-multimodal-files') - ->setDescription( - 'Queue existing multimodal files (Images and Audio) for indexation.' - . ' Each type of files is queued only if the required TaskProcessing task provider is available.' - . ' OCR for Images and Speech-to-text for Audio.' - . ' See https://docs.nextcloud.com/server/latest/admin_manual/ai/overview.html for more information.' - ); - } - - protected function execute(InputInterface $input, OutputInterface $output) { - if (!$this->taskTypeService->isOcrTaskTypeAvailable()) { - $output->writeln('OCR task type is not available.'); - } - if (!$this->taskTypeService->isSpeechToTextTaskTypeAvailable()) { - $output->writeln('Speech-to-text task type is not available.'); - } - - try { - foreach ($this->storageService->getMounts() as $mount) { - $this->logger->debug('Scheduling StorageCrawlJob storage_id=' . $mount['storage_id'] . ' root_id=' . $mount['root_id' ] . 'override_root=' . $mount['overridden_root']); - $this->jobList->add(StorageCrawlJob::class, [ - 'storage_id' => $mount['storage_id'], - 'root_id' => $mount['root_id' ], - 'overridden_root' => $mount['overridden_root'], - 'last_file_id' => 0, - 'only_non_textual' => true, - ]); - } - } catch (\Exception $e) { - $this->logger->error('Failed to schedule StorageCrawlJob to find files for indexation.', ['exception' => $e]); - $output->writeln('Failed to schedule StorageCrawlJob to find files for indexation: ' . $e->getMessage() . ''); - return 1; - } - - $output->writeln('Multimodal files have been scheduled to be queued for indexation.'); - return 0; - } -} diff --git a/lib/Migration/Version006000000Date20260316135634.php b/lib/Migration/Version006000000Date20260316135634.php new file mode 100644 index 0000000..5ed9c91 --- /dev/null +++ b/lib/Migration/Version006000000Date20260316135634.php @@ -0,0 +1,68 @@ +taskTypeService->isOcrTaskTypeAvailable()) { + $output->warning('[Context Chat] OCR task type is not available, image files will not be indexed.'); + } + if (!$this->taskTypeService->isSpeechToTextTaskTypeAvailable()) { + $output->warning('[Context Chat] Speech-to-text task type is not available, audio files will not be indexed.'); + } + + try { + foreach ($this->storageService->getMounts() as $mount) { + $this->logger->debug('Scheduling StorageCrawlJob storage_id=' . $mount['storage_id'] . ' root_id=' . $mount['root_id' ] . 'override_root=' . $mount['overridden_root']); + $this->jobList->add(StorageCrawlJob::class, [ + 'storage_id' => $mount['storage_id'], + 'root_id' => $mount['root_id' ], + 'overridden_root' => $mount['overridden_root'], + 'last_file_id' => 0, + 'only_non_textual' => true, + ]); + } + } catch (\Exception $e) { + $this->logger->error('Failed to schedule StorageCrawlJob to find files for indexation.', ['exception' => $e]); + $output->warning('Failed to schedule StorageCrawlJob to find files for indexation: ' . $e->getMessage()); + return; + } + + $output->info('Multimodal files have been scheduled to be queued for indexation.'); + } +}