@@ -34,33 +34,47 @@ def load_file(
     ) -> duckdb.DuckDBPyRelation:
         # Open the file, convert it to a RecordBatchReader and then
         # wrap that up as a DuckDBPyRelation so we can filter it.
-        fastq_iter = dnaio.open(filename, open_threads=1)
+        logger.debug("Loading file %s row_limit %s", filename, row_limit)
+
+        # Take up to row_limit records from this file
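+        # (islice accepts None as a stop value, so row_limit=None imposes no limit)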
+        fastq_iter = itertools.islice(dnaio.open(filename, open_threads=1), row_limit)
+
+        def _record_to_dict(record):
+            d = {"sequence": record.sequence}
+            if self.header_column:
+                d["header"] = record.name
+            return d
+
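+        # Quality strings are Phred+33 encoded, so the average score is the mean code point minus 33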
+        def _avg_quality(record):
+            return sum(ord(c) for c in record.qualities) / len(record.qualities) - 33
+
+        pyarrow_schema = pyarrow.schema([pyarrow.field("sequence", pyarrow.string())])
+        if self.header_column:
+            # Schema.append() returns a new schema (pyarrow schemas are immutable), so reassign
+            pyarrow_schema = pyarrow_schema.append(pyarrow.field("header", pyarrow.string()))
+
+        # Generator which batches records 5000 at a time into RecordBatches
         record_batch_iter = (
-            pyarrow.RecordBatch.from_pylist([{'sequence': z.sequence, 'quality_scores': z.qualities} for z in y])
-            for y in itertools.batched(fastq_iter, 5000)
-        )
-        rel = cursor.from_arrow(
-            pyarrow.RecordBatchReader.from_batches(
-                pyarrow.schema({'sequence': 'str', 'quality_scores': 'str'}),
-                record_batch_iter
+            pyarrow.RecordBatch.from_pylist(
+                [
+                    _record_to_dict(record)
+                    for record in batch
+                    if self.min_avg_quality <= 0 or self.min_avg_quality <= _avg_quality(record)
+                ]
             )
+            for batch in itertools.batched(fastq_iter, 5000)
         )
-        if row_limit is not None:
-            pass
-            #rel = rel.limit(row_limit)
-
-        if self.min_avg_quality > 0:
-            rel = rel.filter(
-                "list_aggregate(list_transform(string_split(quality_scores, ''), x -> ord(x)), 'avg') - 33 >= %f"
-                % self.min_avg_quality.value
-            )
+
+        # We can turn that generator of RecordBatches into a temporary table
+        rel = cursor.from_arrow(pyarrow.RecordBatchReader.from_batches(pyarrow_schema, record_batch_iter))
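+        # (note: DuckDB pulls batches from the reader when the relation is evaluated; an Arrow stream can only be read once)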
 
         if self.group:
             rel = rel.aggregate("sequence, count(*) as count")
-        elif self.header_column:
-            rel = rel.project("sequence, name || ' ' || description as header")
-        else:
-            rel = rel.project("sequence")
+
+        logger.debug("Loading file %s row_limit %s done", filename, row_limit)
         return rel
 
     def combine(
@@ -83,23 +97,17 @@ class LoadFastaPlugin(DuckdbLoadFileWithTheLotPlugin):
 
     file_types = [("FASTA", [".fasta", ".fa", ".fasta.gz", ".fa.gz", ".fasta.bz2", ".fa.bz2"])]
 
-    sequence_column = StringParam("Sequence Column", "sequence")
-    header_column = StringParam("Header Column", "header")
-
     def load_file(
         self, cursor: duckdb.DuckDBPyConnection, filename: str, file_param: BaseParam, row_limit: Optional[int] = None
     ) -> duckdb.DuckDBPyRelation:
-        fasta_iter = dnaio.open(filename, open_threads=1)
+        pyarrow_schema = pyarrow.schema(
+            [pyarrow.field("sequence", pyarrow.string()), pyarrow.field("header", pyarrow.string())]
+        )
+
+        fasta_iter = itertools.islice(dnaio.open(filename, open_threads=1), row_limit)
         record_batch_iter = (
-            pyarrow.RecordBatch.from_pylist([{'seq': z.sequence, 'qual': z.qualities} for z in y])
+            pyarrow.RecordBatch.from_pylist([{"sequence": z.sequence, "header": z.name} for z in y])
             for y in itertools.batched(fasta_iter, 5000)
         )
-        rel = cursor.from_arrow(
-            pyarrow.RecordBatchReader.from_batches(
-                pyarrow.schema({'seq': 'str', 'qual': 'str'}),
-                record_batch_iter
-            )
-        )
-        if row_limit is not None:
-            rel = rel.limit(row_limit)
+        rel = cursor.from_arrow(pyarrow.RecordBatchReader.from_batches(pyarrow_schema, record_batch_iter))
         return rel