diff --git a/docs/code.md b/docs/code.md
index 6debce6..2e465ec 100644
--- a/docs/code.md
+++ b/docs/code.md
@@ -8,6 +8,7 @@ container in order to run:
 * --image "Docker image"
 * --input "path to file in cloud storage"
 * --input-recursive "path to directory in cloud storage"
+* --boot-disk-image "Custom boot disk image to use"
 
 The following explains each option and how to choose which to use.
 
@@ -231,3 +232,12 @@ chmod u+x ${SCRIPT_DIR}/*.sh
 ${SCRIPT_DIR}/script1.sh
 ${SCRIPT_DIR}/script2.sh
 ```
+
+## --boot-disk-image "Custom boot disk image to use"
+
+For most uses, the underlying virtual machine image is transparent to your tasks; the provider default is sufficient. However, there are cases where you may want to specify a custom boot disk image.
+
+When using GPU accelerators, a boot disk image with GPU drivers is needed. The `google-batch` provider will automatically select the `batch-debian` image when `--accelerator-type` and `--accelerator-count` are specified.
+
+If your `dsub` task does not have internet access, it may fail because the `batch-debian` image attempts to download and update the GPU drivers. In that case, pre-build a custom image with the drivers installed, specify it with `--boot-disk-image`, and set `--install-gpu-drivers` to `false`.
+For the `google-batch` provider, information about available boot disk images can be found [here](https://cloud.google.com/batch/docs/view-os-images).
\ No newline at end of file
diff --git a/dsub/_dsub_version.py b/dsub/_dsub_version.py
index 89d7911..050f045 100644
--- a/dsub/_dsub_version.py
+++ b/dsub/_dsub_version.py
@@ -26,4 +26,6 @@
 0.1.3.dev0 -> 0.1.3 -> 0.1.4.dev0 -> ...
 """
 
-DSUB_VERSION = '0.5.1'
+
+DSUB_VERSION = '0.5.2'
+
diff --git a/dsub/commands/dsub.py b/dsub/commands/dsub.py
index 48a1ced..69bfa3c 100644
--- a/dsub/commands/dsub.py
+++ b/dsub/commands/dsub.py
@@ -501,6 +501,20 @@ def _parse_arguments(prog, argv):
           following third-party software onto your job's Compute Engine
           instances: NVIDIA(R) Tesla(R) drivers and NVIDIA(R) CUDA toolkit.
           (default: 0)""")
+  google_common.add_argument(
+      '--boot-disk-image',
+      help="""Custom boot disk image to use (e.g., a deeplearning image with
+          GPU drivers pre-installed). If not specified and an accelerator is
+          present, the `google-batch` provider defaults to 'batch-debian'.
+          (default: None)""")
+  google_common.add_argument(
+      '--install-gpu-drivers',
+      type=lambda x: {'true': True, 'false': False}[x.lower()],
+      default=None,
+      help="""Whether to install GPU drivers. Defaults to true when an
+          accelerator is present. Set to false when
+          using images with pre-installed drivers. Valid values: true, false.
+          (default: auto-detect)""")
   google_common.add_argument(
       '--credentials-file',
       type=str,
@@ -645,7 +659,9 @@ def _get_job_resources(args):
       enable_stackdriver_monitoring=args.enable_stackdriver_monitoring,
       max_retries=args.retries,
       max_preemptible_attempts=args.preemptible,
-      block_external_network=args.block_external_network)
+      block_external_network=args.block_external_network,
+      boot_disk_image=args.boot_disk_image,
+      install_gpu_drivers=args.install_gpu_drivers)
 
 
 def _get_job_metadata(provider, user_id, job_name, script, task_ids,
diff --git a/dsub/lib/job_model.py b/dsub/lib/job_model.py
index a098d47..7b39fde 100644
--- a/dsub/lib/job_model.py
+++ b/dsub/lib/job_model.py
@@ -445,6 +445,8 @@ class Resources(
         'max_retries',
         'max_preemptible_attempts',
         'block_external_network',
+        'boot_disk_image',
+        'install_gpu_drivers',
     ])):
   """Job resource parameters related to CPUs, memory, and disk.
@@ -484,6 +486,8 @@ class Resources( representing always preemtible. block_external_network (bool): Prevents the containers from accessing the external network. + boot_disk_image (str): Custom boot disk image to use + install_gpu_drivers (bool): Whether to install GPU drivers. """ __slots__ = () @@ -515,7 +519,9 @@ def __new__(cls, enable_stackdriver_monitoring=None, max_retries=None, max_preemptible_attempts=None, - block_external_network=None): + block_external_network=None, + boot_disk_image=None, + install_gpu_drivers=None): return super(Resources, cls).__new__(cls, min_cores, min_ram, machine_type, disk_size, disk_type, boot_disk_size, preemptible, image, @@ -525,7 +531,8 @@ def __new__(cls, accelerator_count, nvidia_driver_version, timeout, log_interval, ssh, enable_stackdriver_monitoring, max_retries, max_preemptible_attempts, - block_external_network) + block_external_network, boot_disk_image, + install_gpu_drivers) def ensure_job_params_are_complete(job_params): diff --git a/dsub/providers/google_batch.py b/dsub/providers/google_batch.py index 4fd6c19..73a35d4 100644 --- a/dsub/providers/google_batch.py +++ b/dsub/providers/google_batch.py @@ -698,6 +698,7 @@ def _create_batch_request( entrypoint='/bin/bash', volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'], commands=['-c', continuous_logging_cmd], + options=None ) ) @@ -711,6 +712,7 @@ def _create_batch_request( entrypoint='/bin/bash', volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'], commands=['-c', prepare_command], + options=None ) ) @@ -732,12 +734,15 @@ def _create_batch_request( cp_loop=google_utils.LOCALIZATION_LOOP, ), ], + options=None ) ) user_command_volumes = [f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'] for gcs_volume in self._get_gcs_volumes_for_user_command(mounts): user_command_volumes.append(gcs_volume) + # Add --gpus all option for GPU-enabled containers + container_options = '--gpus all' if job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia') else None runnables.append( # user-command google_batch_operations.build_runnable( @@ -756,6 +761,7 @@ def _create_batch_request( user_script=script_path, ), ], + options=container_options, ) ) @@ -777,6 +783,7 @@ def _create_batch_request( cp_loop=google_utils.DELOCALIZATION_LOOP, ), ], + options=None ) ) @@ -790,6 +797,7 @@ def _create_batch_request( entrypoint='/bin/bash', volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'], commands=['-c', logging_cmd], + options=None ), ) @@ -800,13 +808,23 @@ def _create_batch_request( boot_disk_size = ( job_resources.boot_disk_size if job_resources.boot_disk_size else 0 ) + # Determine boot disk image: use user-specified value, or default to batch-debian for GPU jobs + if job_resources.boot_disk_image: + boot_disk_image = job_resources.boot_disk_image + elif job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia'): + boot_disk_image = 'batch-debian' + else: + boot_disk_image = None + boot_disk = google_batch_operations.build_persistent_disk( size_gb=max(boot_disk_size, job_model.LARGE_BOOT_DISK_SIZE), disk_type=job_model.DEFAULT_DISK_TYPE, + image=boot_disk_image, ) disk = google_batch_operations.build_persistent_disk( size_gb=job_resources.disk_size, disk_type=job_resources.disk_type or job_model.DEFAULT_DISK_TYPE, + image=None ) attached_disk = google_batch_operations.build_attached_disk( disk=disk, device_name=google_utils.DATA_DISK_NAME @@ -834,11 +852,15 @@ def _create_batch_request( provisioning_model=self._get_provisioning_model(task_resources), ) 
+ # Determine whether to install GPU drivers: use user-specified value, or default to True for GPU jobs + if job_resources.install_gpu_drivers is not None: + install_gpu_drivers = job_resources.install_gpu_drivers + else: + install_gpu_drivers = job_resources.accelerator_type is not None + ipt = google_batch_operations.build_instance_policy_or_template( instance_policy=instance_policy, - install_gpu_drivers=True - if job_resources.accelerator_type is not None - else False, + install_gpu_drivers=install_gpu_drivers, ) if job_resources.service_account: diff --git a/dsub/providers/google_batch_operations.py b/dsub/providers/google_batch_operations.py index 301da36..2536e1d 100644 --- a/dsub/providers/google_batch_operations.py +++ b/dsub/providers/google_batch_operations.py @@ -211,13 +211,14 @@ def build_task_group( def build_container( - image_uri: str, entrypoint: str, volumes: List[str], commands: List[str] + image_uri: str, entrypoint: str, volumes: List[str], commands: List[str], options: Optional[str] ) -> batch_v1.types.task.Runnable.Container: container = batch_v1.types.task.Runnable.Container() container.image_uri = image_uri container.entrypoint = entrypoint container.commands = commands container.volumes = volumes + container.options = options return container @@ -229,6 +230,7 @@ def build_runnable( entrypoint: str, volumes: List[str], commands: List[str], + options: Optional[str], ) -> batch_v1.types.task.Runnable: """Build a Runnable object for a Batch request. @@ -241,11 +243,12 @@ def build_runnable( entrypoint (str): Docker image entrypoint path volumes (List[str]): List of volume mounts (host_path:container_path) commands (List[str]): Command arguments to pass to the entrypoint + options (str): Container options such as "--gpus all" Returns: An object representing a Runnable """ - container = build_container(image_uri, entrypoint, volumes, commands) + container = build_container(image_uri, entrypoint, volumes, commands, options) runnable = batch_v1.Runnable() runnable.container = container runnable.background = run_in_background @@ -401,11 +404,12 @@ def build_attached_disk( def build_persistent_disk( - size_gb: int, disk_type: str + size_gb: int, disk_type: str, image: str ) -> batch_v1.types.AllocationPolicy.Disk: disk = batch_v1.AllocationPolicy.Disk() disk.type = disk_type disk.size_gb = size_gb + disk.image = image return disk diff --git a/test/integration/e2e_accelerator.google-batch.sh b/test/integration/e2e_accelerator.google-batch.sh new file mode 100755 index 0000000..9a9a1c6 --- /dev/null +++ b/test/integration/e2e_accelerator.google-batch.sh @@ -0,0 +1,124 @@ +#!/bin/bash + +# Copyright 2025 Verily Life Sciences Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset + +# Test GPU support in Google Batch provider. +# Validates that NVIDIA accelerators trigger: +# 1. --gpus all container option +# 2. batch-debian boot disk image +# 3. 
Actual GPU access in the running container +# +# Required environment variables: +# DOCKER_IMAGE - Google Artifact Registry image with GPU support +# Example: us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest +# PET_SA_EMAIL - Service account with access to GAR image and GPU resources +# Example: my-service-account@my-project.iam.gserviceaccount.com +# +# Optional environment variables (for VPC-SC or custom networking): +# GPU_NETWORK - Network configuration +# Example: projects/my-project/global/networks/my-network +# GPU_SUBNETWORK - Subnetwork configuration +# Example: projects/my-project/regions/us-central1/subnetworks/my-subnet +# GPU_USE_PRIVATE_ADDRESS - Set to any value to use private address + +readonly SCRIPT_DIR="$(dirname "${0}")" + +# Do standard test setup +source "${SCRIPT_DIR}/test_setup_e2e.sh" + +# Check GPU-specific prerequisites +if [[ -z "${DOCKER_IMAGE:-}" ]]; then + 1>&2 echo "ERROR: DOCKER_IMAGE environment variable is not set." + 1>&2 echo "This test requires a Google Artifact Registry image with GPU support." + 1>&2 echo "Set it with: export DOCKER_IMAGE='REGION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG'" + 1>&2 echo "Example: export DOCKER_IMAGE='us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest'" + exit 1 +fi + +if [[ -z "${PET_SA_EMAIL:-}" ]]; then + 1>&2 echo "ERROR: PET_SA_EMAIL environment variable is not set." + 1>&2 echo "This test requires a service account with access to the GAR image and GPU resources." + 1>&2 echo "Set it with: export PET_SA_EMAIL='my-service-account@my-project.iam.gserviceaccount.com'" + exit 1 +fi + +echo "Launching GPU pipeline with Google Batch provider..." +echo " Using image: ${DOCKER_IMAGE}" +echo " Using service account: ${PET_SA_EMAIL}" + +# Test nvidia accelerator enables GPU features +# Uses DOCKER_IMAGE and PET_SA_EMAIL environment variables (required) +# Optionally uses GPU_NETWORK, GPU_SUBNETWORK, and GPU_USE_PRIVATE_ADDRESS if set +run_dsub \ + --provider 'google-batch' \ + --image "${DOCKER_IMAGE}" \ + --service-account "${PET_SA_EMAIL}" \ + ${GPU_NETWORK:+--network "${GPU_NETWORK}"} \ + ${GPU_SUBNETWORK:+--subnetwork "${GPU_SUBNETWORK}"} \ + ${GPU_USE_PRIVATE_ADDRESS:+--use-private-address} \ + --accelerator-type 'nvidia-tesla-t4' \ + --accelerator-count 1 \ + --env NVIDIA_VISIBLE_DEVICES=all \ + --command '\ + echo "=== GPU Detection Test ===" && \ + nvidia-smi && \ + echo "=== Boot Image Test ===" && \ + cat /etc/os-release | grep "ID=" && \ + echo "=== Container GPU Access Test ===" && \ + nvidia-smi -L' \ + --wait + +echo +echo "Checking GPU detection output..." + +# Check that GPU was detected and accessible +RESULT="$(gsutil cat "${STDOUT_LOG}")" + +# Validate GPU hardware was detected +if ! echo "${RESULT}" | grep -qi "Tesla T4"; then + 1>&2 echo "ERROR: Tesla T4 GPU not detected in nvidia-smi output!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +# Validate GPU memory info is present +if ! echo "${RESULT}" | grep -qi "GPU.*Memory"; then + 1>&2 echo "ERROR: GPU Memory information not found!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +# Validate container has GPU access (nvidia-smi -L should list GPUs) +if ! echo "${RESULT}" | grep -qi "GPU 0:"; then + 1>&2 echo "ERROR: Container does not have GPU access (nvidia-smi -L failed)!" 
+ 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +echo +echo "GPU test output (showing GPU was accessible):" +echo "*****************************" +echo "${RESULT}" +echo "*****************************" +echo "SUCCESS: GPU accelerator test passed!" +echo "- GPU hardware detected" +echo "- Container has GPU access" +echo "- batch-debian image used (implied by successful GPU access)" \ No newline at end of file diff --git a/test/integration/e2e_accelerator_vpc_sc.google-batch.sh b/test/integration/e2e_accelerator_vpc_sc.google-batch.sh new file mode 100755 index 0000000..b798992 --- /dev/null +++ b/test/integration/e2e_accelerator_vpc_sc.google-batch.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +# Copyright 2025 Verily Life Sciences Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test GPU support in VPC-SC environments with Google Batch provider. +# Validates that custom boot disk images and driver installation flags work correctly: +# 1. Custom boot disk image with pre-installed drivers +# 2. --install-gpu-drivers false to skip driver downloads +# 3. VPC network configuration for VPC-SC perimeters +# 4. Private IP only mode for VPC-SC compliance +# 5. Actual GPU access in the running container with pre-installed drivers +# +# REQUIRED ENVIRONMENT VARIABLES: +# DOCKER_IMAGE - GPU-enabled container image from Google Artifact Registry +# Example: us-central1-docker.pkg.dev/my-project/my-repo/image:tag +# PET_SA_EMAIL - Service account email with access to GAR image and GPU resources +# Example: my-sa@my-project.iam.gserviceaccount.com +# YOUR_BUCKET - GCS bucket name for test outputs (do NOT include gs:// prefix) +# Example: my-test-bucket +# GPU_NETWORK - VPC network for VPC-SC perimeters +# Example: projects/my-project/global/networks/network +# GPU_SUBNETWORK - VPC subnetwork for VPC-SC perimeters (must match REGIONS) +# Example: projects/my-project/regions/us-west2/subnetworks/subnetwork +# REGIONS - GCP region where the job will run (must match subnetwork region) +# Example: us-west2 +# +# OPTIONAL ENVIRONMENT VARIABLES: +# BOOT_DISK_IMAGE - Custom boot disk image with pre-installed GPU drivers +# Default: projects/${PROJECT_ID}/global/images/deeplearning-driver +# Alternative: projects/deeplearning-platform-release/global/images/family/common-cu121-debian-11-py310 +# +# USAGE: +# export DOCKER_IMAGE='us-central1-docker.pkg.dev/my-project/my-repo/image:tag' +# export PET_SA_EMAIL='my-sa@my-project.iam.gserviceaccount.com' +# export YOUR_BUCKET='my-test-bucket' # Do NOT include gs:// +# export GPU_NETWORK='projects/my-project/global/networks/network' +# export GPU_SUBNETWORK='projects/my-project/regions/us-west2/subnetworks/subnetwork' +# export REGIONS='us-west2' # Must match subnetwork region +# ./test/integration/e2e_accelerator_vpc_sc.google-batch.sh + +set -o errexit +set -o nounset + +readonly SCRIPT_DIR="$(dirname "${0}")" + +# Do standard test setup +source "${SCRIPT_DIR}/test_setup_e2e.sh" + +# Check GPU-specific prerequisites +if [[ -z 
"${DOCKER_IMAGE:-}" ]]; then + 1>&2 echo "ERROR: DOCKER_IMAGE environment variable is not set." + 1>&2 echo "This test requires a Google Artifact Registry image with GPU support." + 1>&2 echo "Set it with: export DOCKER_IMAGE='REGION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG'" + 1>&2 echo "Example: export DOCKER_IMAGE='us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest'" + exit 1 +fi + +if [[ -z "${PET_SA_EMAIL:-}" ]]; then + 1>&2 echo "ERROR: PET_SA_EMAIL environment variable is not set." + 1>&2 echo "This test requires a service account with access to the GAR image and GPU resources." + 1>&2 echo "Set it with: export PET_SA_EMAIL='my-service-account@my-project.iam.gserviceaccount.com'" + exit 1 +fi + +if [[ -z "${YOUR_BUCKET:-}" ]]; then + 1>&2 echo "ERROR: YOUR_BUCKET environment variable is not set." + 1>&2 echo "This test requires a GCS bucket for test outputs." + 1>&2 echo "Set it with: export YOUR_BUCKET='my-test-bucket' (do NOT include gs:// prefix)" + exit 1 +fi + +if [[ -z "${GPU_NETWORK:-}" ]]; then + 1>&2 echo "ERROR: GPU_NETWORK environment variable is not set." + 1>&2 echo "This VPC-SC test requires a VPC network configuration." + 1>&2 echo "Set it with: export GPU_NETWORK='projects/\${GOOGLE_CLOUD_PROJECT}/global/networks/network'" + exit 1 +fi + +if [[ -z "${GPU_SUBNETWORK:-}" ]]; then + 1>&2 echo "ERROR: GPU_SUBNETWORK environment variable is not set." + 1>&2 echo "This VPC-SC test requires a VPC subnetwork configuration." + 1>&2 echo "Set it with: export GPU_SUBNETWORK='projects/\${GOOGLE_CLOUD_PROJECT}/regions/us-west2/subnetworks/subnetwork'" + exit 1 +fi + +if [[ -z "${REGIONS:-}" ]]; then + 1>&2 echo "ERROR: REGIONS environment variable is not set." + 1>&2 echo "This VPC-SC test requires specifying the region (must match subnetwork region)." + 1>&2 echo "Set it with: export REGIONS='us-west2'" + exit 1 +fi + +# Optional: Custom boot disk image (defaults to project-specific deeplearning-driver image) +# For VPC-SC environments, this should be an image with pre-installed GPU drivers +if [[ -z "${BOOT_DISK_IMAGE:-}" ]]; then + # Default to custom Deep Learning image in the project + # This assumes you have created a custom image with GPU drivers pre-installed + BOOT_DISK_IMAGE="projects/${PROJECT_ID}/global/images/deeplearning-driver" + echo "Using default boot disk image: ${BOOT_DISK_IMAGE}" +else + echo "Using custom boot disk image: ${BOOT_DISK_IMAGE}" +fi + +echo "Launching GPU pipeline in VPC-SC mode with Google Batch provider..." 
+echo " Using GAR image: ${DOCKER_IMAGE}" +echo " Using service account: ${PET_SA_EMAIL}" +echo " Using boot disk image: ${BOOT_DISK_IMAGE}" +echo " Using VPC network: ${GPU_NETWORK}" +echo " Using VPC subnetwork: ${GPU_SUBNETWORK}" +echo " Region: ${REGIONS}" +echo " Install GPU drivers: false" +echo " Private IP only: true" + +# Test VPC-SC scenario with custom boot image and no driver installation +# Uses required VPC-SC parameters: GPU_NETWORK, GPU_SUBNETWORK, REGIONS +# Note: Calls dsub directly (not run_dsub) to avoid hardcoded network defaults in test_setup.sh +dsub \ + --provider 'google-batch' \ + --project "${PROJECT_ID}" \ + --regions "${REGIONS}" \ + --logging "${LOGGING}" \ + --image "${DOCKER_IMAGE}" \ + --service-account "${PET_SA_EMAIL}" \ + --network "${GPU_NETWORK}" \ + --subnetwork "${GPU_SUBNETWORK}" \ + --use-private-address \ + --boot-disk-image "${BOOT_DISK_IMAGE}" \ + --boot-disk-size 200 \ + --install-gpu-drivers false \ + --accelerator-type 'nvidia-tesla-t4' \ + --accelerator-count 1 \ + --env NVIDIA_VISIBLE_DEVICES=all \ + --command '\ + echo "=== GPU Detection Test ===" && \ + nvidia-smi && \ + echo "=== Boot Image Test ===" && \ + cat /etc/os-release | grep "ID=" && \ + echo "=== Container GPU Access Test ===" && \ + nvidia-smi -L' \ + --wait + +echo +echo "Checking GPU detection output..." + +# Check that GPU was detected and accessible +RESULT="$(gsutil cat "${STDOUT_LOG}")" + +# Validate GPU hardware was detected +if ! echo "${RESULT}" | grep -qi "Tesla T4"; then + 1>&2 echo "ERROR: Tesla T4 GPU not detected in nvidia-smi output!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +# Validate GPU memory info is present +if ! echo "${RESULT}" | grep -qi "GPU.*Memory"; then + 1>&2 echo "ERROR: GPU Memory information not found!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +# Validate container has GPU access (nvidia-smi -L should list GPUs) +if ! echo "${RESULT}" | grep -qi "GPU 0:"; then + 1>&2 echo "ERROR: Container does not have GPU access (nvidia-smi -L failed)!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +echo +echo "VPC-SC GPU test output (showing GPU was accessible with pre-installed drivers):" +echo "*****************************" +echo "${RESULT}" +echo "*****************************" +echo "SUCCESS: VPC-SC GPU accelerator test passed!" 
+echo "- GPU hardware detected" +echo "- Container has GPU access" +echo "- Custom boot disk image used: ${BOOT_DISK_IMAGE}" +echo "- GPU drivers pre-installed (driver installation was disabled)" diff --git a/test/integration/e2e_logging_paths.sh b/test/integration/e2e_logging_paths.sh index 7cac80c..314cfd5 100755 --- a/test/integration/e2e_logging_paths.sh +++ b/test/integration/e2e_logging_paths.sh @@ -27,7 +27,7 @@ readonly LOGGING_BASE="$(dirname "${LOGGING}")" declare LOGGING_OVERRIDE readonly JOB_NAME="log-test" -readonly JOB_USER="${USER}" +readonly JOB_USER="${USER:-$(whoami)}" # Test a basic job with base logging path echo "Subtest #1: Basic logging path" diff --git a/test/integration/e2e_logging_paths_pattern_tasks.sh b/test/integration/e2e_logging_paths_pattern_tasks.sh index 0150fc0..5524c01 100755 --- a/test/integration/e2e_logging_paths_pattern_tasks.sh +++ b/test/integration/e2e_logging_paths_pattern_tasks.sh @@ -31,7 +31,7 @@ readonly LOGGING_BASE="$(dirname "${LOGGING}")" declare LOGGING_OVERRIDE readonly JOB_NAME=$(logging_paths_tasks_setup::get_job_name) -readonly JOB_USER="${USER}" +readonly JOB_USER="${USER:-$(whoami)}" # Set up the tasks file logging_paths_tasks_setup::write_tasks_file diff --git a/test/integration/io_setup.sh b/test/integration/io_setup.sh index 79d2a24..6cf7caf 100644 --- a/test/integration/io_setup.sh +++ b/test/integration/io_setup.sh @@ -26,6 +26,8 @@ readonly INPUT_BAM_MD5="4afb9b8908959dbd4e2d5c54bf254c93" readonly REQUESTER_PAYS_INPUT_BAM_FULL_PATH="gs://${DSUB_BUCKET_REQUESTER_PAYS}/${INPUT_BAM_FILE}" readonly REQUESTER_PAYS_POPULATION_FILE_FULL_PATH="gs://${DSUB_BUCKET_REQUESTER_PAYS}/${POPULATION_FILE}" +# Set user variable like in other tests +readonly JOB_USER="${USER:-$(whoami)}" # This is the image we use to test the PD mount feature. # Inject the TEST_TOKEN into the name so that multiple tests can run # concurrently. Since the image test can be run multiple times for one @@ -230,7 +232,7 @@ function io_setup::check_dstat() { local dstat_output=$(run_dstat --status '*' --jobs "${job_id}" --full) echo " Checking user-id" - util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER}" + util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${JOB_USER}" echo " Checking logging" util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].logging" "${LOGGING}" diff --git a/test/integration/io_tasks_setup.sh b/test/integration/io_tasks_setup.sh index f44f867..9b93627 100644 --- a/test/integration/io_tasks_setup.sh +++ b/test/integration/io_tasks_setup.sh @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+readonly JOB_USER="${USER:-$(whoami)}" readonly POPULATION_FILE="gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/20131219.superpopulations.tsv" readonly POPULATION_MD5="68a73f849b82071afe11888bac1aa8a7" @@ -119,7 +120,7 @@ function io_tasks_setup::check_dstat() { echo " Check task ${task_id}" echo " Checking user-id" - util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER}" + util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${JOB_USER}" echo " Checking logging" util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].logging" "${LOGGING}/${job_id}.${task_id}.log" diff --git a/test/integration/test_setup.sh b/test/integration/test_setup.sh index 9984f40..769e0f6 100644 --- a/test/integration/test_setup.sh +++ b/test/integration/test_setup.sh @@ -30,6 +30,9 @@ # * Provide functions run_dsub, run_dstat, run_ddel which will call a function # with DSUB_PROVIDER-specific default parameters set. +# Set default USER if not already set (needed for Jupyterlab/Docker environments) +export USER="${USER:-$(whoami)}" + # If the DSUB_PROVIDER is not set, figure it out from the name of the script. # If the script name is ..sh, pull out the provider. # If the script name is .sh, use "local". @@ -89,14 +92,23 @@ function run_dsub() { } function dsub_google-batch() { + # Use REGIONS env var if set, otherwise fall back to LOCATION + local location="${LOCATION:-${REGIONS:-}}" + + # Use environment variables for VPC-SC configuration if set + local network="${GPU_NETWORK:-global/networks/default}" + local subnetwork="${GPU_SUBNETWORK:-regions/us-central1/subnetworks/default}" + local service_account="${PET_SA_EMAIL:-}" + dsub \ --provider google-batch \ --project "${PROJECT_ID}" \ ${location:+--location "${location}"} \ --logging "${LOGGING_OVERRIDE:-${LOGGING}}" \ - --network "global/networks/default" \ - --subnetwork "regions/us-central1/subnetworks/default" \ + --network "${network}" \ + --subnetwork "${subnetwork}" \ --use-private-address \ + ${service_account:+--service-account "${service_account}"} \ "${@}" } diff --git a/test/integration/test_setup_e2e.py b/test/integration/test_setup_e2e.py index 8f01a5f..898d878 100644 --- a/test/integration/test_setup_e2e.py +++ b/test/integration/test_setup_e2e.py @@ -171,16 +171,27 @@ def dsub_google_batch(dsub_args): if val: opt_args.append(var[1], val) - # pyformat: disable - return dsub_command.call([ + # Use environment variables for VPC-SC configuration if set + network = os.environ.get("GPU_NETWORK", "global/networks/default") + subnetwork = os.environ.get("GPU_SUBNETWORK", + "regions/us-central1/subnetworks/default") + location = os.environ.get("LOCATION", os.environ.get("REGIONS", "us-central1")) + service_account = os.environ.get("PET_SA_EMAIL", "") + + args = [ "--provider", "google-batch", "--project", PROJECT_ID, "--logging", LOGGING, - "--regions", "us-central1", - "--network", "global/networks/default", - "--subnetwork", "regions/us-central1/subnetworks/default", + "--regions", location, + "--network", network, + "--subnetwork", subnetwork, "--use-private-address" - ] + opt_args + dsub_args) + ] + if service_account: + args.extend(["--service-account", service_account]) + + # pyformat: disable + return dsub_command.call(args + opt_args + dsub_args) # pyformat: enable diff --git a/test/integration/test_setup_e2e.sh b/test/integration/test_setup_e2e.sh old mode 100644 new mode 100755 index 060f2a1..c5e7cc1 --- a/test/integration/test_setup_e2e.sh +++ b/test/integration/test_setup_e2e.sh @@ 
-81,6 +81,46 @@ else DSUB_BUCKET_REQUESTER_PAYS="dsub-test-requester-pays-public" fi +# GPU-specific prerequisites (optional, only needed for GPU tests) +if [[ -n "${DOCKER_IMAGE:-}" ]]; then + echo " GAR image for GPU tests: ${DOCKER_IMAGE}" + + # Check if PET_SA_EMAIL is also set + if [[ -z "${PET_SA_EMAIL:-}" ]]; then + 1>&2 echo "WARNING: DOCKER_IMAGE is set but PET_SA_EMAIL is not." + 1>&2 echo "GPU tests require both DOCKER_IMAGE and PET_SA_EMAIL to be set." + else + echo " Service account for GPU tests: ${PET_SA_EMAIL}" + + # Validate that the service account can access the GAR image + # echo " Validating service account access to GAR image..." + + # # Extract the repository from the image path + # # Format: REGION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG + # GAR_REPO=$(echo "${DOCKER_IMAGE}" | sed -E 's|^([^/]+/[^/]+/[^/]+)/.*|\1|') + + # # Check if the service account has permission to pull from this repository + # # We'll use gcloud artifacts docker images describe with impersonation + # if ! gcloud artifacts docker images describe "${DOCKER_IMAGE}" \ + # --impersonate-service-account="${PET_SA_EMAIL}" \ + # --quiet 2>/dev/null; then + # 1>&2 echo "WARNING: Service account ${PET_SA_EMAIL} may not have access to ${DOCKER_IMAGE}" + # 1>&2 echo "Please ensure the service account has 'Artifact Registry Reader' role on the repository." + # 1>&2 echo "You can grant access with:" + # 1>&2 echo " gcloud artifacts repositories add-iam-policy-binding REPO_NAME \\" + # 1>&2 echo " --location=LOCATION \\" + # 1>&2 echo " --member=serviceAccount:${PET_SA_EMAIL} \\" + # 1>&2 echo " --role=roles/artifactregistry.reader" + # else + # echo " ✓ Service account has access to GAR image" + # fi + fi +elif [[ -n "${PET_SA_EMAIL:-}" ]]; then + echo " Service account for GPU tests: ${PET_SA_EMAIL}" + 1>&2 echo "WARNING: PET_SA_EMAIL is set but DOCKER_IMAGE is not." + 1>&2 echo "GPU tests require both DOCKER_IMAGE and PET_SA_EMAIL to be set." +fi + # Set standard LOGGING, INPUTS, and OUTPUTS values readonly TEST_GCS_ROOT="gs://${DSUB_BUCKET}/dsub/sh/${DSUB_PROVIDER}/${TEST_NAME}" readonly TEST_GCS_DOCKER_ROOT="gs/${DSUB_BUCKET}/dsub/sh/${DSUB_PROVIDER}/${TEST_NAME}" diff --git a/test/unit/gpu_test.py b/test/unit/gpu_test.py new file mode 100644 index 0000000..0961524 --- /dev/null +++ b/test/unit/gpu_test.py @@ -0,0 +1,243 @@ +# Copyright 2025 Verily Life Sciences Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for GPU support in the Google Batch provider.""" + +import unittest +from dsub.providers import google_batch +from dsub.lib import job_model + + +class TestGPUSupport(unittest.TestCase): + """Test GPU-specific configurations in Google Batch provider.""" + + def _create_test_job_descriptor(self, accelerator_type=None, boot_disk_image=None, install_gpu_drivers=None): + """Create a minimal JobDescriptor for testing. + + Args: + accelerator_type: The accelerator type to use, or None for no accelerator. + boot_disk_image: Custom boot disk image, or None for default. 
+ install_gpu_drivers: Whether to install GPU drivers, or None for default. + + Returns: + A JobDescriptor configured for testing. + """ + job_metadata = { + 'script': job_model.Script('test.sh', 'echo hello'), + 'job-id': 'test-job-id', + 'job-name': 'test-job-name', + 'user-id': 'test-user', + 'user-project': 'test-project', + 'dsub-version': '1-0-0' + } + + job_params = {} + job_model.ensure_job_params_are_complete(job_params) + + task_metadata = {} + task_params = { + 'labels': set(), + 'envs': set(), + 'inputs': set(), + 'outputs': set(), + 'input-recursives': set(), + 'output-recursives': set() + } + + task_resources = job_model.Resources( + logging_path=job_model.LoggingParam( + 'gs://test-bucket/logs.log', 'google-cloud-storage' + ) + ) + task_descriptor = job_model.TaskDescriptor( + task_metadata, task_params, task_resources + ) + + job_resources = job_model.Resources( + accelerator_type=accelerator_type, + image='gcr.io/test/image:latest', + boot_disk_image=boot_disk_image, + install_gpu_drivers=install_gpu_drivers + ) + + return job_model.JobDescriptor( + job_metadata, job_params, job_resources, [task_descriptor] + ) + + def _create_batch_request(self, job_descriptor): + """Create a batch request using the Google Batch provider. + + Args: + job_descriptor: The JobDescriptor to create a request for. + + Returns: + The CreateJobRequest object from the provider. + """ + provider = google_batch.GoogleBatchJobProvider( + dry_run=True, + project='test-project', + location='us-central1' + ) + return provider._create_batch_request(job_descriptor) + + def test_nvidia_accelerator_enables_gpu_options(self): + """Test that nvidia accelerators enable GPU-specific configurations.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100' + ) + request = self._create_batch_request(job_descriptor) + + # Extract the user command runnable (index 3 in the runnables list) + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + + # Verify GPU container options are set + self.assertEqual(user_runnable.container.options, "--gpus all") + + # Verify boot disk uses GPU-compatible image + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, "batch-debian") + + def test_non_nvidia_accelerator_uses_default_options(self): + """Test that non-nvidia accelerators use default configurations.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='tpu-v3' + ) + request = self._create_batch_request(job_descriptor) + + # Extract the user command runnable + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + + # Verify no GPU options are set + self.assertIn(user_runnable.container.options, [None, ""]) + + # Verify default boot disk image is used + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, "") + + def test_no_accelerator_uses_default_options(self): + """Test that jobs without accelerators use default configurations.""" + job_descriptor = self._create_test_job_descriptor(accelerator_type=None) + request = self._create_batch_request(job_descriptor) + + # Extract the user command runnable + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + + # Verify no GPU options are set + self.assertIn(user_runnable.container.options, [None, ""]) + + # Verify default boot disk image is used + instance_policy = request.job.allocation_policy.instances[0].policy + 
self.assertEqual(instance_policy.boot_disk.image, "") + + def test_custom_boot_disk_image_overrides_default(self): + """Test that custom boot_disk_image overrides the default.""" + custom_image = "projects/deeplearning-platform-release/global/images/family/common-gpu" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100', + boot_disk_image=custom_image + ) + request = self._create_batch_request(job_descriptor) + + # Verify custom boot disk image is used instead of batch-debian + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, custom_image) + + # Verify GPU container options are still set + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + self.assertEqual(user_runnable.container.options, "--gpus all") + + def test_install_gpu_drivers_false_disables_driver_installation(self): + """Test that install_gpu_drivers=False disables driver installation.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100', + install_gpu_drivers=False + ) + request = self._create_batch_request(job_descriptor) + + # Verify GPU drivers are not installed + ipt = request.job.allocation_policy.instances[0] + self.assertFalse(ipt.install_gpu_drivers) + + # Verify GPU container options are still set + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + self.assertEqual(user_runnable.container.options, "--gpus all") + + def test_install_gpu_drivers_true_enables_driver_installation(self): + """Test that install_gpu_drivers=True enables driver installation.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100', + install_gpu_drivers=True + ) + request = self._create_batch_request(job_descriptor) + + # Verify GPU drivers are installed + ipt = request.job.allocation_policy.instances[0] + self.assertTrue(ipt.install_gpu_drivers) + + def test_vpc_sc_scenario_custom_image_no_drivers(self): + """Test VPC-SC scenario with custom image and no driver installation.""" + custom_image = "projects/deeplearning-platform-release/global/images/family/common-gpu" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100', + boot_disk_image=custom_image, + install_gpu_drivers=False + ) + request = self._create_batch_request(job_descriptor) + + # Verify custom boot disk image is used + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, custom_image) + + # Verify GPU drivers are not installed + ipt = request.job.allocation_policy.instances[0] + self.assertFalse(ipt.install_gpu_drivers) + + # Verify GPU container options are still set (containers need GPU access) + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + self.assertEqual(user_runnable.container.options, "--gpus all") + + def test_default_install_gpu_drivers_true_for_nvidia(self): + """Test that install_gpu_drivers defaults to True for NVIDIA accelerators.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-t4' + ) + request = self._create_batch_request(job_descriptor) + + # Verify GPU drivers are installed by default + ipt = request.job.allocation_policy.instances[0] + self.assertTrue(ipt.install_gpu_drivers) + + def test_custom_boot_disk_image_without_accelerator(self): + """Test that custom boot_disk_image can be used without accelerators.""" + custom_image = 
"projects/my-project/global/images/my-custom-image" + job_descriptor = self._create_test_job_descriptor( + boot_disk_image=custom_image + ) + request = self._create_batch_request(job_descriptor) + + # Verify custom boot disk image is used + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, custom_image) + + # Verify no GPU options are set + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + self.assertIn(user_runnable.container.options, [None, ""]) + + # Verify GPU drivers are not installed + ipt = request.job.allocation_policy.instances[0] + self.assertFalse(ipt.install_gpu_drivers) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file