Commit 9befcc0

refactor: removal of processing pool for duckdb data contract

1 parent c2ad557 commit 9befcc0

File tree

6 files changed: +168 −205 lines changed

poetry.lock

Lines changed: 154 additions & 164 deletions
(Generated file; diff not rendered by default.)

src/dve/core_engine/backends/implementations/duckdb/contract.py

Lines changed: 2 additions & 9 deletions

@@ -3,9 +3,7 @@
 # pylint: disable=R0903
 import logging
 from collections.abc import Iterator
-from concurrent.futures import Future, ProcessPoolExecutor, as_completed
 from functools import partial
-from multiprocessing import cpu_count
 from typing import Any, Optional
 from uuid import uuid4

@@ -71,12 +69,10 @@ def __init__(
         connection: DuckDBPyConnection,
         logger: Optional[logging.Logger] = None,
         debug: bool = False,
-        executor: Optional[ProcessPoolExecutor] = None,
         **kwargs: Any,
     ):
         self.debug = debug
         self._connection = connection
-        self._executor = ProcessPoolExecutor(cpu_count() - 1) if not executor else executor
         """A bool indicating whether to enable debug logging."""

         super().__init__(logger, **kwargs)
@@ -167,11 +163,8 @@ def apply_data_contract(

         batches = pq.ParquetFile(entity_locations[entity_name]).iter_batches(10000)
         msg_count = 0
-        futures: list[Future] = [
-            self._executor.submit(row_validator_helper, batch) for batch in batches
-        ]
-        for future in as_completed(futures):
-            if msgs := future.result():
+        for batch in batches:
+            if msgs := row_validator_helper(arrow_batch=batch):
                 msg_writer.write_queue.put(msgs)
                 msg_count += len(msgs)
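The substantive change is the last hunk: batches from `ParquetFile.iter_batches(10000)` are no longer fanned out to a `ProcessPoolExecutor` and collected with `as_completed`; each batch is validated in-process, in order. A minimal sketch of the new control flow, assuming only what the diff shows (`row_validator_helper` accepts an `arrow_batch` and returns a list of feedback messages; the writer queue is a stand-in object with a `put` method):

    import pyarrow.parquet as pq

    def validate_batches(parquet_path: str, row_validator_helper, write_queue) -> int:
        """Validate each 10,000-row batch synchronously and queue any messages."""
        msg_count = 0
        for batch in pq.ParquetFile(parquet_path).iter_batches(10000):
            # row_validator_helper returns a (possibly empty) list of messages
            if msgs := row_validator_helper(arrow_batch=batch):
                write_queue.put(msgs)
                msg_count += len(msgs)
        return msg_count

The removed version submitted every batch to the pool up front (`[self._executor.submit(row_validator_helper, batch) for batch in batches]`), which materialised all batches before any result was consumed; the loop above processes one batch at a time.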

src/dve/pipeline/duckdb_pipeline.py

Lines changed: 1 addition & 3 deletions

@@ -1,7 +1,6 @@
 """DuckDB implementation for `Pipeline` object."""

 import logging
-from concurrent.futures import ProcessPoolExecutor
 from typing import Optional

 from duckdb import DuckDBPyConnection, DuckDBPyRelation
@@ -34,13 +33,12 @@ def __init__(
         reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
         job_run_id: Optional[int] = None,
         logger: Optional[logging.Logger] = None,
-        executor: Optional[ProcessPoolExecutor] = None,
     ):
         self._connection = connection
         super().__init__(
             processed_files_path,
             audit_tables,
-            DuckDBDataContract(connection=self._connection, executor=executor),
+            DuckDBDataContract(connection=self._connection),
             DuckDBStepImplementations.register_udfs(connection=self._connection),
             rules_path,
             submitted_files_path,
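For callers, the visible API change is that `DDBDVEPipeline` (and, through it, `DuckDBDataContract`) no longer accepts an `executor` argument. A hedged construction sketch, using only keyword arguments that appear in this commit's diffs; the paths are placeholders, the `DuckDBRefDataLoader` import path is assumed by analogy with the Spark loader imported in the test steps, and the real signature may require further arguments:

    import duckdb
    from dve.pipeline.duckdb_pipeline import DDBDVEPipeline
    # import path assumed by analogy with the Spark reference data loader
    from dve.core_engine.backends.implementations.duckdb.reference_data import DuckDBRefDataLoader

    connection = duckdb.connect()  # in-memory database as a stand-in

    pipeline = DDBDVEPipeline(
        connection=connection,
        rules_path="rules/",                  # placeholder path
        submitted_files_path="submissions/",  # placeholder path
        reference_data_loader=DuckDBRefDataLoader,
        # executor=...  <- removed by this commit; passing it is now a TypeError
    )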

tests/features/environment.py

Lines changed: 0 additions & 4 deletions

@@ -1,5 +1,3 @@
-from concurrent.futures import ProcessPoolExecutor
-from multiprocessing import cpu_count
 import shutil
 import tempfile
 from pathlib import Path
@@ -29,7 +27,6 @@ def before_all(context: Context):
     temp_dir = Path(context.dbfs_root.__enter__())
     dbfs_impl = DBFSFilesystemImplementation(temp_dir)
     add_implementation(dbfs_impl)
-    context.process_pool = ProcessPoolExecutor(cpu_count() - 1)


 def before_scenario(context: Context, scenario: Scenario):
@@ -81,4 +78,3 @@ def after_all(context: Context):

     context.connection.close()
     shutil.rmtree(context.ddb_db_file.parent)
-    context.process_pool.shutdown(wait=True, cancel_futures=True)

tests/features/steps/steps_pipeline.py

Lines changed: 5 additions & 11 deletions

@@ -6,12 +6,11 @@

 """
 # pylint: disable=no-name-in-module
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial, reduce
-from itertools import chain
 import operator
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Optional, Tuple
 from uuid import uuid4
 from behave import given, then, when  # type: ignore
 from behave.model import Row, Table
@@ -23,19 +22,16 @@

 import context_tools as ctxt
 import dve.parser.file_handling.service as fh
-from dve.pipeline.utils import SubmissionStatus, load_config

 import polars as pl
 from pyspark.sql import SparkSession
 from dve.core_engine.backends.implementations.duckdb.auditing import DDBAuditingManager
 from dve.core_engine.backends.implementations.spark.auditing import SparkAuditingManager
-from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
 from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
 from dve.pipeline.duckdb_pipeline import DDBDVEPipeline
 from dve.pipeline.spark_pipeline import SparkDVEPipeline

 from utilities import (
-    ERROR_DF_FIELDS,
     load_errors_from_service,
     get_test_file_path,
     SERVICE_TO_STORAGE_PATH_MAPPING,
@@ -74,8 +70,7 @@ def setup_duckdb_pipeline(
     connection: duckdb.DuckDBPyConnection,
     dataset_id: str,
     processing_path: Path,
-    schema_file_name: Optional[str] = None,
-    executor: Optional[ProcessPoolExecutor] = None
+    schema_file_name: Optional[str] = None
 ):

     schema_file_name = f"{dataset_id}.dischema.json" if not schema_file_name else schema_file_name
@@ -97,8 +92,7 @@ def setup_duckdb_pipeline(
         connection=connection,
         rules_path=rules_path,
         submitted_files_path=processing_path.as_posix(),
-        reference_data_loader=DuckDBRefDataLoader,
-        executor=executor
+        reference_data_loader=DuckDBRefDataLoader
     )


@@ -206,7 +200,7 @@ def add_pipeline_to_ctx(
     context: Context, implementation: str, schema_file_name: Optional[str] = None
 ):
     pipeline_map: Dict[str, Callable] = {
-        "duckdb": partial(setup_duckdb_pipeline, connection=context.connection, executor=context.process_pool),
+        "duckdb": partial(setup_duckdb_pipeline, connection=context.connection),
        "spark": partial(setup_spark_pipeline, spark=context.spark_session),
     }
     if not implementation in pipeline_map:
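The step definitions keep their dispatch pattern: `pipeline_map` binds each backend's resource into a `functools.partial`, and only the DuckDB entry loses its `executor` binding. A self-contained toy version of that pattern (the setup functions below are stand-ins, not the real test helpers):

    from functools import partial
    from typing import Callable, Dict

    def setup_duckdb(connection: str, dataset_id: str) -> str:
        # stand-in for setup_duckdb_pipeline
        return f"duckdb pipeline for {dataset_id} on {connection}"

    def setup_spark(spark: str, dataset_id: str) -> str:
        # stand-in for setup_spark_pipeline
        return f"spark pipeline for {dataset_id} via {spark}"

    pipeline_map: Dict[str, Callable[..., str]] = {
        # the backend resource is bound once; callers supply per-scenario arguments
        "duckdb": partial(setup_duckdb, connection="conn-handle"),
        "spark": partial(setup_spark, spark="spark-session"),
    }

    print(pipeline_map["duckdb"](dataset_id="demo"))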

tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py

Lines changed: 6 additions & 14 deletions

@@ -1,6 +1,4 @@
-from concurrent.futures import ProcessPoolExecutor
 import json
-from multiprocessing import cpu_count
 from pathlib import Path
 from typing import Any, Dict, List, Tuple

@@ -32,13 +30,8 @@
     temp_xml_file,
 )

-@pytest.fixture(scope="module")
-def temp_process_pool_executor():
-    with ProcessPoolExecutor(cpu_count() - 1) as pool:
-        yield pool

-
-def test_duckdb_data_contract_csv(temp_csv_file, temp_process_pool_executor):
+def test_duckdb_data_contract_csv(temp_csv_file):
     uri, _, _, mdl = temp_csv_file
     connection = default_connection

@@ -97,7 +90,7 @@ def test_duckdb_data_contract_csv(temp_csv_file, temp_process_pool_executor):
     }
     entity_locations: Dict[str, URI] = {"test_ds": str(uri)}

-    data_contract: DuckDBDataContract = DuckDBDataContract(connection, executor=temp_process_pool_executor)
+    data_contract: DuckDBDataContract = DuckDBDataContract(connection)
     entities, feedback_errors_uri, stage_successful = data_contract.apply_data_contract(get_parent(uri.as_posix()), entities, entity_locations, dc_meta)
     rel: DuckDBPyRelation = entities.get("test_ds")
     assert dict(zip(rel.columns, rel.dtypes)) == {
@@ -108,7 +101,7 @@ def test_duckdb_data_contract_csv(temp_csv_file, temp_process_pool_executor):
     assert stage_successful


-def test_duckdb_data_contract_xml(temp_xml_file, temp_process_pool_executor):
+def test_duckdb_data_contract_xml(temp_xml_file):
     uri, header_model, header_data, class_model, class_data = temp_xml_file
     connection = default_connection
     contract_meta = json.dumps(
@@ -195,7 +188,7 @@ def test_duckdb_data_contract_xml(temp_xml_file, temp_process_pool_executor):
         reporting_fields={"test_header": ["school"], "test_class_info": ["year"]},
     )

-    data_contract: DuckDBDataContract = DuckDBDataContract(connection, executor=temp_process_pool_executor)
+    data_contract: DuckDBDataContract = DuckDBDataContract(connection)
     entities, feedback_errors_uri, stage_successful = data_contract.apply_data_contract(get_parent(uri.as_posix()), entities, entity_locations, dc_meta)
     header_rel: DuckDBPyRelation = entities.get("test_header")
     header_expected_schema: Dict[str, DuckDBPyType] = {
@@ -335,11 +328,10 @@ def test_ddb_data_contract_read_nested_parquet(nested_all_string_parquet):
     }

 def test_duckdb_data_contract_custom_error_details(nested_all_string_parquet_w_errors,
-                                                   nested_parquet_custom_dc_err_details,
-                                                   temp_process_pool_executor):
+                                                   nested_parquet_custom_dc_err_details):
     parquet_uri, contract_meta, _ = nested_all_string_parquet_w_errors
     connection = default_connection
-    data_contract = DuckDBDataContract(connection, executor=temp_process_pool_executor)
+    data_contract = DuckDBDataContract(connection)

     entity = data_contract.read_parquet(path=parquet_uri)
     assert entity.count("*").fetchone()[0] == 2
