diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..f78abe2
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,11 @@
+.git
+.github
+.ipython
+.jupyter
+.pytest_cache
+.venv
+.vscode
+CODE_OF_CONDUCT.md
+CONTRIBUTING.md
+Dockerfile
+LICENSE
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ab4dfd3..1f63478 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -103,7 +103,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install dependencies
-        run: uv sync --dev --group local
+        run: uv sync --dev
 
       - name: Run local tests
         shell: bash
diff --git a/Dockerfile b/Dockerfile
index c25a823..2087969 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,20 @@
+# Dockerfile for running dlt pipelines
+
+# Dockerfile is based heavily on the example uv dockerfile:
+# https://github.com/astral-sh/uv-docker-example
+
 # Use a Python image with uv pre-installed
-FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
+FROM ghcr.io/astral-sh/uv:python3.13-trixie-slim
 
-# Install the project into `/app`
-WORKDIR /app
+# Suppress apt prompts at build time only; ARG (unlike ENV) is not kept in the runtime environment
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install tini and git, removing the apt package lists in the same layer to keep the image small
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        git \
+        tini \
+    && rm -rf /var/lib/apt/lists/*
 
 # Enable bytecode compilation
 ENV UV_COMPILE_BYTECODE=1
@@ -16,6 +28,9 @@ ENV UV_NO_DEV=1
 # Ensure installed tools can be executed out of the box
 ENV UV_TOOL_BIN_DIR=/usr/local/bin
 
+# Install the project into `/app`
+WORKDIR /app
+
 # Install the project's dependencies using the lockfile and settings
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=uv.lock,target=uv.lock \
@@ -35,10 +50,11 @@ ENV PATH="/app/.venv/bin:$PATH"
 RUN groupadd --system --gid 999 nonroot \
  && useradd --system --gid 999 --uid 999 --create-home nonroot
 
+# Copy the entrypoint script into /app; octal mode works on every BuildKit frontend version
+COPY --chmod=0755 ./scripts/entrypoint.sh /app/
+
 # Use the non-root user to run our application
 USER nonroot
 
-# Reset the entrypoint, don't invoke `uv`
-ENTRYPOINT []
-
-# CMD ["uv", "run", "python", "--version"]
+# Absolute path so the entrypoint still resolves if the working directory is overridden at run time
+ENTRYPOINT ["/app/entrypoint.sh"]
diff --git a/pyproject.toml b/pyproject.toml
index d9dd88d..e88041a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,13 +9,11 @@ authors = [
 ]
 
 dependencies = [
-    "biopython>=1.86",
     "click>=8.3.1",
     "dlt[deltalake,filesystem,parquet]>=1.21.0",
     "lxml>=6.0.2",
     "pydantic>=2.12.5",
     "pydantic-settings>=2.12.0",
-    "ruff>=0.14.14",
 ]
 
 [project.scripts]
@@ -25,35 +23,40 @@ uniref_pipeline = "cdm_data_loader_utils.pipelines.uniref_pipeline:cli"
 
 [dependency-groups]
 dev = [
+    "berdl-notebook-utils>=0.0.1",
+    "biopython>=1.86",
     "pytest>=9.0.2",
     "pytest-asyncio>=1.3.0",
     "pytest-cov>=7.0.0",
     "pytest-env>=1.2.0",
+    "ruff>=0.14.14",
 ]
 experimental = [
     "mutmut>=3.4.0",
 ]
-local = [
-    "berdl-notebook-utils>=0.0.1",
-]
-minio = [
-    "boto3[crt]>=1.42.0",
-    "tqdm>=4.67.3",
-]
 models = [
     "genson>=1.3.0",
     "json2python-models>=0.3.1",
 ]
-pipeline = []
 xml = [
     "xmlschema>=4.3.1",
     "xsdata[cli,lxml]>=26.1",
 ]
 
+[project.optional-dependencies]
+# for minio interactions -- see utils/minio.py for more details
+minio = [
+    "boto3[crt]>=1.42.0",
+    "tqdm>=4.67.3",
+]
+
+biopython = [
+    "biopython>=1.86",
+]
+
 [tool.ruff]
 line-length = 120
 target-version = "py313"
-
 # Exclude a variety of commonly ignored directories.
 exclude = [
     "__pypackages__",
@@ -127,7 +130,7 @@ select = [
     "SLOT", # flake8-slots
     "SIM", # flake8-simplify
     "TID", # flake8-tidy-imports
-    "TCH", # flake8-type-checking
+    "TC", # flake8-type-checking
     "INT", # flake8-gettext
     "ARG", # flake8-unused-arguments
     "PTH", # flake8-use-pathlib
@@ -149,7 +152,6 @@ select = [
 
 # Allow autofix for all enabled rules (when `--fix`) is provided.
 fixable = ["ALL"]
-unfixable = []
 
 ignore = [
     # D200: unnecessary-multiline-docstring
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
new file mode 100755
index 0000000..c38a68f
--- /dev/null
+++ b/scripts/entrypoint.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Container entrypoint: dispatch the first argument to the matching pipeline CLI,
+# forwarding all remaining arguments. Each pipeline is exec'd under tini.
+set -euo pipefail
+
+# Ensure at least one argument is provided; usage goes to stderr like the other error path
+if [ "$#" -eq 0 ]; then
+    echo "Usage: $0 {uniref|uniprot} [args...]" >&2
+    exit 1
+fi
+
+cmd="$1"
+shift
+
+case "$cmd" in
+    uniref)
+        # Run the uniref pipeline with any additional arguments via tini
+        exec /usr/bin/tini -- uv run uniref_pipeline "$@"
+        ;;
+    uniprot)
+        # Run the uniprot pipeline with any additional arguments via tini
+        exec /usr/bin/tini -- uv run uniprot_pipeline "$@"
+        ;;
+    *)
+        echo "Error: unknown command '$cmd'; valid commands are 'uniref' or 'uniprot'." >&2
+        exit 1
+        ;;
+esac
diff --git a/uv.lock b/uv.lock
index c9fd9dd..b3483a6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -469,32 +469,35 @@ name = "cdm-data-loader-utils"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
-    { name = "biopython" },
     { name = "click" },
     { name = "dlt", extra = ["deltalake", "filesystem", "parquet"] },
     { name = "lxml" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
-    { name = "ruff" },
+]
+
+[package.optional-dependencies]
+biopython = [
+    { name = "biopython" },
+]
+minio = [
+    { name = "boto3", extra = ["crt"] },
+    { name = "tqdm" },
 ]
 
 [package.dev-dependencies]
 dev = [
+    { name = "berdl-notebook-utils" },
+    { name = "biopython" },
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "pytest-cov" },
     { name = "pytest-env" },
+    { name = "ruff" },
 ]
 experimental = [
     { name = "mutmut" },
 ]
-local = [
-    { name = "berdl-notebook-utils" },
-]
-minio = [
-    { name = "boto3", extra = ["crt"] },
-    { name = "tqdm" },
-]
 models = [
     { name = "genson" },
     { name = "json2python-models" },
@@ -506,33 +509,32 @@ xml = [
 
 [package.metadata]
 requires-dist = [
-    { name = "biopython", specifier = ">=1.86" },
+    { name = "biopython", marker = "extra == 'biopython'", specifier = ">=1.86" },
+    { name = "boto3", extras = ["crt"], marker = "extra == 'minio'", specifier = ">=1.42.0" },
     { name = "click", specifier = ">=8.3.1" },
     { name = "dlt", extras = ["deltalake", "filesystem", "parquet"], specifier = ">=1.21.0" },
     { name = "lxml", specifier = ">=6.0.2" },
     { name = "pydantic", specifier = ">=2.12.5" },
     { name = "pydantic-settings", specifier = ">=2.12.0" },
-    { name = "ruff", specifier = ">=0.14.14" },
+    { name = "tqdm", marker = "extra == 'minio'", specifier = ">=4.67.3" },
 ]
+provides-extras = ["minio", "biopython"]
 
 [package.metadata.requires-dev]
 dev = [
+    { name = "berdl-notebook-utils", git = "https://github.com/BERDataLakehouse/spark_notebook.git?subdirectory=notebook_utils" },
+    { name = "biopython", specifier = ">=1.86" },
     { name = "pytest", specifier = ">=9.0.2" },
     { name = "pytest-asyncio", specifier = ">=1.3.0" },
     { name = "pytest-cov", specifier = ">=7.0.0" },
     { name = "pytest-env", specifier = ">=1.2.0" },
+    { name = "ruff", specifier = ">=0.14.14" },
 ]
 experimental = [{ name = "mutmut", specifier = ">=3.4.0" }]
-local = [{ name = "berdl-notebook-utils", git = "https://github.com/BERDataLakehouse/spark_notebook.git?subdirectory=notebook_utils" }]
-minio = [
-    { name = "boto3", extras = ["crt"], specifier = ">=1.42.0" },
-    { name = "tqdm", specifier = ">=4.67.3" },
-]
 models = [
     { name = "genson", specifier = ">=1.3.0" },
     { name = "json2python-models", specifier = ">=0.3.1" },
 ]
-pipeline = []
 xml = [
     { name = "xmlschema", specifier = ">=4.3.1" },
     { name = "xsdata", extras = ["cli", "lxml"], specifier = ">=26.1" },