From 4bfd7f73817d4d04c71d9f173241ecb7db44c366 Mon Sep 17 00:00:00 2001 From: Peter Su Date: Thu, 20 Nov 2025 16:20:21 +0000 Subject: [PATCH 01/11] porting over changes from verily-src/dsub 0.5.2 to allow dsub to use modern NVIDIA GPUs with a special case for using boot disk images that already have the GPU drivers installed --- README.md | 54 ++-- docs/code.md | 10 + docs/compute_resources.md | 5 +- docs/job_control.md | 12 +- docs/providers/README.md | 10 +- docs/troubleshooting.md | 18 +- dsub/_dsub_version.py | 2 +- dsub/commands/dsub.py | 39 ++- dsub/lib/job_model.py | 11 +- dsub/lib/param_util.py | 3 +- dsub/providers/google_base.py | 5 +- dsub/providers/google_batch.py | 154 +++++++++-- dsub/providers/google_batch_operations.py | 57 +++- dsub/providers/google_utils.py | 10 +- dsub/providers/provider_base.py | 7 +- setup.py | 14 +- .../e2e_accelerator.google-batch.sh | 124 +++++++++ .../e2e_accelerator_vpc_sc.google-batch.sh | 193 ++++++++++++++ test/integration/e2e_command_flag.sh | 1 - test/integration/e2e_ddel.sh | 2 + test/integration/e2e_dstat.sh | 49 ++-- test/integration/e2e_env_tasks.sh | 1 - test/integration/e2e_error.sh | 3 +- test/integration/e2e_io_recursive.sh | 2 +- test/integration/e2e_python.sh | 2 +- test/integration/e2e_python_api.py | 80 ++++-- test/integration/e2e_runtime.sh | 1 - test/integration/script_python.py | 2 +- test/integration/test_setup.sh | 5 + test/integration/test_setup_e2e.py | 5 +- test/integration/test_setup_e2e.sh | 40 +++ test/integration/unit_flags.google-batch.sh | 24 +- test/run_tests.sh | 11 +- test/unit/gpu_test.py | 243 ++++++++++++++++++ 34 files changed, 1004 insertions(+), 195 deletions(-) create mode 100755 test/integration/e2e_accelerator.google-batch.sh create mode 100755 test/integration/e2e_accelerator_vpc_sc.google-batch.sh mode change 100644 => 100755 test/integration/test_setup_e2e.sh create mode 100644 test/unit/gpu_test.py diff --git a/README.md b/README.md index a0f9c2e..4df7b46 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ your shell. #### Install the Google Cloud SDK -While not used directly by `dsub` for the `google-batch` or `google-cls-v2` providers, you are likely to want to install the command line tools found in the [Google +While not used directly by `dsub` for the `google-batch` provider, you are likely to want to install the command line tools found in the [Google Cloud SDK](https://cloud.google.com/sdk/). If you will be using the `local` provider for faster job development, @@ -156,13 +156,10 @@ You'll get quicker turnaround times and won't incur cloud charges using it. ### Getting started on Google Cloud -`dsub` currently supports the [Cloud Life Sciences v2beta](https://cloud.google.com/life-sciences/docs/reference/rest) -API from Google Cloud and is is developing support for the [Batch](https://cloud.google.com/batch/docs/reference/rest) +`dsub` currently supports the [Batch](https://cloud.google.com/batch/docs/reference/rest) API from Google Cloud. -`dsub` supports the v2beta API with the `google-cls-v2` provider. -`google-cls-v2` is the current default provider. `dsub` will be transitioning to -make `google-batch` the default in coming releases. +`google-batch` is the current default provider. The steps for getting started differ slightly as indicated in the steps below: @@ -171,10 +168,6 @@ The steps for getting started differ slightly as indicated in the steps below: 1. 
Enable the APIs: - - For the `v2beta` API (provider: `google-cls-v2`): - - [Enable the Cloud Life Sciences, Storage, and Compute APIs](https://console.cloud.google.com/flows/enableapi?apiid=lifesciences.googleapis.com,storage.googleapis.com,compute.googleapis.com&redirect=https://console.cloud.google.com) - - For the `batch` API (provider: `google-batch`): [Enable the Batch, Storage, and Compute APIs](https://console.cloud.google.com/flows/enableapi?apiid=batch.googleapis.com,storage.googleapis.com,compute.googleapis.com&redirect=https://console.cloud.google.com). @@ -203,20 +196,6 @@ The steps for getting started differ slightly as indicated in the steps below: 1. Run a very simple "Hello World" `dsub` job and wait for completion. - - For the `v2beta` API (provider: `google-cls-v2`): - - dsub \ - --provider google-cls-v2 \ - --project my-cloud-project \ - --regions us-central1 \ - --logging gs://my-bucket/logging/ \ - --output OUT=gs://my-bucket/output/out.txt \ - --command 'echo "Hello World" > "${OUT}"' \ - --wait - - Change `my-cloud-project` to your Google Cloud project, and `my-bucket` to - the bucket you created above. - - For the `batch` API (provider: `google-batch`): dsub \ @@ -247,8 +226,7 @@ To this end, `dsub` provides multiple "backend providers", each of which implements a consistent runtime environment. The current providers are: - local -- google-cls-v2(the default) -- google-batch (*new*) +- google-batch (the default) More details on the runtime environment implemented by the backend providers can be found in [dsub backend providers](https://github.com/DataBiosphere/dsub/blob/main/docs/providers/README.md). @@ -449,7 +427,7 @@ mounting read-only: [Compute Engine Image](https://cloud.google.com/compute/docs/images) that you pre-create. -The `google-cls-v2` and `google-batch` provider support these methods of +The `google-batch` provider supports these methods of providing access to resource data. The `local` provider supports mounting a @@ -457,7 +435,7 @@ local directory in a similar fashion to support your local development. ##### Mounting a Google Cloud Storage bucket -To have the `google-cls-v2` or `google-batch` provider mount a +To have the `google-batch` provider mount a Cloud Storage bucket using [Cloud Storage FUSE](https://cloud.google.com/storage/docs/gcs-fuse), use the `--mount` command line flag: @@ -474,7 +452,7 @@ before using Cloud Storage FUSE. ##### Mounting an existing peristent disk -To have the `google-cls-v2` or `google-batch` provider mount a persistent disk that +To have the `google-batch` provider mount a persistent disk that you have pre-created and populated, use the `--mount` command line flag and the url of the source disk: @@ -482,7 +460,7 @@ url of the source disk: ##### Mounting a persistent disk, created from an image -To have the `google-cls-v2` or `google-batch` provider mount a persistent disk created from an image, +To have the `google-batch` provider mount a persistent disk created from an image, use the `--mount` command line flag and the url of the source image and the size (in GB) of the disk: @@ -513,7 +491,7 @@ path using the environment variable. `dsub` tasks run using the `local` provider will use the resources available on your local machine. -`dsub` tasks run using the `google-cls-v2` or `google-batch` providers can take advantage +`dsub` tasks run using the `google-batch` provider can take advantage of a wide range of CPU, RAM, disk, and hardware accelerator (eg. GPU) options. 
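For example, a GPU-enabled task might be submitted roughly as follows (a sketch only: the project, bucket, and accelerator type are placeholders, and accelerator availability varies by region and zone):

    dsub \
        --provider google-batch \
        --project my-cloud-project \
        --regions us-central1 \
        --logging gs://my-bucket/logging/ \
        --accelerator-type nvidia-tesla-t4 \
        --accelerator-count 1 \
        --command 'nvidia-smi' \
        --wait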
See the [Compute Resources](https://github.com/DataBiosphere/dsub/blob/main/docs/compute_resources.md) @@ -620,14 +598,14 @@ For more details, see [Checking Status and Troubleshooting Jobs](https://github. The `dstat` command displays the status of jobs: - dstat --provider google-cls-v2 --project my-cloud-project + dstat --provider google-batch --project my-cloud-project With no additional arguments, dstat will display a list of *running* jobs for the current `USER`. To display the status of a specific job, use the `--jobs` flag: - dstat --provider google-cls-v2 --project my-cloud-project --jobs job-id + dstat --provider google-batch --project my-cloud-project --jobs job-id For a batch job, the output will list all *running* tasks. @@ -659,7 +637,7 @@ By default, dstat outputs one line per task. If you're using a batch job with many tasks then you may benefit from `--summary`. ``` -$ dstat --provider google-cls-v2 --project my-project --status '*' --summary +$ dstat --provider google-batch --project my-project --status '*' --summary Job Name Status Task Count ------------- ------------- ------------- @@ -680,25 +658,25 @@ Use the `--users` flag to specify other users, or `'*'` for all users. To delete a running job: - ddel --provider google-cls-v2 --project my-cloud-project --jobs job-id + ddel --provider google-batch --project my-cloud-project --jobs job-id If the job is a batch job, all running tasks will be deleted. To delete specific tasks: ddel \ - --provider google-cls-v2 \ + --provider google-batch \ --project my-cloud-project \ --jobs job-id \ --tasks task-id1 task-id2 To delete all running jobs for the current user: - ddel --provider google-cls-v2 --project my-cloud-project --jobs '*' + ddel --provider google-batch --project my-cloud-project --jobs '*' ## Service Accounts and Scope (Google providers only) -When you run the `dsub` command with the `google-cls-v2` or `google-batch` +When you run the `dsub` command with the `google-batch` provider, there are two different sets of credentials to consider: - Account submitting the `pipelines.run()` request to run your command/script on a VM diff --git a/docs/code.md b/docs/code.md index 6debce6..2e465ec 100644 --- a/docs/code.md +++ b/docs/code.md @@ -8,6 +8,7 @@ container in order to run: * --image "Docker image" * --input "path to file in cloud storage" * --input-recursive "path to directory in cloud storage" +* --boot-disk-image "Custom boot disk image to use" The following explains each option and how to choose which to use. @@ -231,3 +232,12 @@ chmod u+x ${SCRIPT_DIR}/*.sh ${SCRIPT_DIR}/script1.sh ${SCRIPT_DIR}/script2.sh ``` + +## --boot-disk-image "Custom boot disk image to use" + +For most uses, the underlying virtual machine image is transparent to your tasks; the provider default is sufficient. However, there are cases where you may want to specify a custom boot disk image. + +When using GPU accelerators, an image with GPU drivers is needed. The `google-batch` provider will automatically select the `batch-debian` image when `--accelerator-type` and `--accelerator-count` are specified. + +If your `dsub` task does not have internet access, it may fail as this image will attempt to update the GPU drivers. 
You may instead pre-build a custom image with the drivers installed, specify it with the `--boot-disk-image`, and set '--install-gpu-drivers' to `false` +Specifically for the `google-batch` provider, information about available images can be found [here](https://cloud.google.com/batch/docs/view-os-images) \ No newline at end of file diff --git a/docs/compute_resources.md b/docs/compute_resources.md index 3598745..6f5bc8d 100644 --- a/docs/compute_resources.md +++ b/docs/compute_resources.md @@ -129,10 +129,9 @@ was assigned.** ### Ensure your job accesses only Google Services -The default `--image` used for `dsub` tasks is `ubuntu:14.04` which is pulled -from Dockerhub. For VMs that do not have a public IP address, set the `--image` +The default `--image` used for `dsub` tasks is `marketplace.gcr.io/google/ubuntu2204` which is pulled +from Artifact Registry. For VMs that do not have a public IP address, set the `--image` flag to a Docker image hosted by -[Google Container Registry](https://cloud.google.com/container-registry/docs) or [Artifact Registry](https://cloud.google.com/artifact-registry/docs). Google provides a set of diff --git a/docs/job_control.md b/docs/job_control.md index 229cfea..d811576 100644 --- a/docs/job_control.md +++ b/docs/job_control.md @@ -61,22 +61,22 @@ dsub ... --after "${JOB_A}" "${JOB_B}" Here is the output of a sample run: ``` -$ JOBID_A=$(dsub --provider google-cls-v2 --project "${MYPROJECT}" --regions us-central1 \ +$ JOBID_A=$(dsub --provider google-batch --project "${MYPROJECT}" --regions us-central1 \ --logging "gs://${MYBUCKET}/logging/" \ --command 'echo "hello from job A"') Job: echo----180924-112256-64 Launched job-id: echo----180924-112256-64 To check the status, run: - dstat --provider google-cls-v2 --project ${MYPROJECT} --jobs 'echo----180924-112256-64' --status '*' + dstat --provider google-batch --project ${MYPROJECT} --jobs 'echo----180924-112256-64' --status '*' To cancel the job, run: - ddel --provider google-cls-v2 --project ${MYPROJECT} --jobs 'echo----180924-112256-64' + ddel --provider google-batch --project ${MYPROJECT} --jobs 'echo----180924-112256-64' $ echo "${JOBID_A}" echo----180924-112256-64 $ JOBID_B=... (similar) -$ JOBID_C=$(dsub --provider google-cls-v2 --project "${MYPROJECT}" --regions us-central1 \ +$ JOBID_C=$(dsub --provider google-batch --project "${MYPROJECT}" --regions us-central1 \ --logging "gs://${MYBUCKET}/logging/" \ --command 'echo "job C"' --after "${JOBID_A}" "${JOBID_B}") Waiting for predecessor jobs to complete... @@ -86,9 +86,9 @@ Waiting for: echo----180924-112259-48. echo----180924-112259-48: SUCCESS Launched job-id: echo----180924-112302-87 To check the status, run: - dstat --provider google-cls-v2 --project ${MYPROJECT} --jobs 'echo----180924-112302-87' --status '*' + dstat --provider google-batch --project ${MYPROJECT} --jobs 'echo----180924-112302-87' --status '*' To cancel the job, run: - ddel --provider google-cls-v2 --project ${MYPROJECT} --jobs 'echo----180924-112302-87' + ddel --provider google-batch --project ${MYPROJECT} --jobs 'echo----180924-112302-87' echo----180924-112302-87 ``` diff --git a/docs/providers/README.md b/docs/providers/README.md index 0d55b5d..c482dc8 100644 --- a/docs/providers/README.md +++ b/docs/providers/README.md @@ -10,8 +10,7 @@ implements a consistent runtime environment. The current supported providers are: - local -- google-cls-v2 (the default) -- google-batch (*new*) +- google-batch (the default) ## Runtime environment @@ -282,7 +281,7 @@ its status is `RUNNING`. 
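As a sketch of how the GPU-related flags described above can be combined (the Docker image and boot disk image paths are placeholders; a boot disk image with pre-installed NVIDIA drivers is assumed):

    dsub \
        --provider google-batch \
        --project my-cloud-project \
        --regions us-central1 \
        --logging gs://my-bucket/logging/ \
        --image my-gpu-enabled-image \
        --accelerator-type nvidia-tesla-t4 \
        --accelerator-count 1 \
        --boot-disk-image projects/my-cloud-project/global/images/my-driver-image \
        --install-gpu-drivers false \
        --command 'nvidia-smi -L' \
        --wait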
#### Logging -The `google-cls-v2` and `google-batch` provider saves 3 log files to Cloud Storage, every 5 minutes +The `google-batch` provider saves 3 log files to Cloud Storage, every 5 minutes to the `--logging` location specified to `dsub`: - `[prefix].log`: log generated by all containers running on the VM @@ -293,7 +292,7 @@ Logging paths and the `[prefix]` are discussed further in [Logging](../logging.m #### Resource requirements -The `google-cls-v2` and `google-batch` providers support many resource-related +The `google-batch` providers support many resource-related flags to configure the Compute Engine VMs that tasks run on, such as `--machine-type` or `--min-cores` and `--min-ram`, as well as `--boot-disk-size` and `--disk-size`. Additional provider-specific parameters are available @@ -311,8 +310,7 @@ large Docker images are used, as such images need to be pulled to the boot disk. #### Provider specific parameters -The following `dsub` parameters are specific to the `google-cls-v2` and -`google-batch` providers: +The following `dsub` parameters are specific to the `google-batch` providers: * [Location resources](https://cloud.google.com/about/locations) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index cf59c20..2851ca4 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -20,7 +20,7 @@ When submitted with no filter arguments, `dstat` shows information for all tasks in the `RUNNING` state belonging to the current user: ``` -$ dstat --provider google-cls-v2 --project my-project +$ dstat --provider google-batch --project my-project Job Name Task Status Last Update -------------- ------ ---------------- ------------------- my-job-name task-3 localizing-files 2017-04-06 16:03:34 @@ -44,7 +44,7 @@ them separately. To check on a specific job, pass the `--jobs` (or `-j`) flag. For example: ``` -$ dstat --provider google-cls-v2 --project my-project --jobs my-job-id +$ dstat --provider google-batch --project my-project --jobs my-job-id Job Name Status Last Update -------------- -------- ------------------- my-job-name Pending 2017-04-11 16:05:35 @@ -59,7 +59,7 @@ value `*` to `dstat`: ``` $ dstat \ - --provider google-cls-v2 \ + --provider google-batch \ --project my-project \ --jobs my-job-id \ --status '*' @@ -75,7 +75,7 @@ Be sure to quote the `*` to prevent shell expansion. To view results for all jobs associated with your user id: ``` -dstat --provider google-cls-v2 --project my-project --status '*' +dstat --provider google-batch --project my-project --status '*' ``` ### Check jobs with my own labels @@ -176,7 +176,7 @@ use the `--age` flag. For example, the following command will return all jobs started in the last day: ``` -./dstat --provider google-cls-v2 --project my-project --status '*' --age 1d +./dstat --provider google-batch --project my-project --status '*' --age 1d ``` The `--age` flags supports the following types of values: @@ -218,7 +218,7 @@ The following examples shows minute-by-minute progression of 3 tasks ``` $ dstat \ - --provider google-cls-v2 \ + --provider google-batch \ --project my-project \ --jobs my-job-id \ --wait --poll-interval 60 @@ -281,7 +281,7 @@ maintain consistency between dsub versions. 
``` $ dstat \ - --provider google-cls-v2 \ + --provider google-batch \ --project my-project \ --jobs my-job-id \ --full @@ -306,7 +306,7 @@ Note the `Internal ID` in this example provides the ``` $ dstat \ - --provider google-cls-v2 \ + --provider google-batch \ --project my-project \ --jobs my-job-id \ --format text \ @@ -320,7 +320,7 @@ my-job-id my-job-name Success 2017-04-11 16: ``` $ dstat \ - --provider google-v2 \ + --provider google-batch \ --project my-project \ --jobs my-job-id \ --format json \ diff --git a/dsub/_dsub_version.py b/dsub/_dsub_version.py index 67f5361..8a1b9cf 100644 --- a/dsub/_dsub_version.py +++ b/dsub/_dsub_version.py @@ -26,4 +26,4 @@ 0.1.3.dev0 -> 0.1.3 -> 0.1.4.dev0 -> ... """ -DSUB_VERSION = '0.5.0' +DSUB_VERSION = '0.5.1.dev0' diff --git a/dsub/commands/dsub.py b/dsub/commands/dsub.py index 73d1973..69bfa3c 100644 --- a/dsub/commands/dsub.py +++ b/dsub/commands/dsub.py @@ -113,7 +113,7 @@ DEFAULT_OUTPUT_LOCAL_PATH = 'output' DEFAULT_MOUNT_LOCAL_PATH = 'mount' -DEFAULT_IMAGE = 'ubuntu:14.04' +DEFAULT_IMAGE = 'marketplace.gcr.io/google/ubuntu2204' class TaskParamAction(argparse.Action): @@ -202,11 +202,12 @@ def _check_nvidia_driver_version(args): ) -def _google_cls_v2_parse_arguments(args): - """Validated google-cls-v2 arguments.""" +def _google_parse_arguments(args): + """Validated google-cls-v2 and google-batch arguments.""" - # For the google-cls-v2 provider, the addition of the "--location" parameter, - # along with a default (us-central1), we can just default everything. + # For the google-cls-v2 and google-batch providers, with the addition of the + # "--location" parameter, along with a default (us-central1), we can just + # default everything. # So we only need to validate that there is not both a region and zone. if (args.zones and args.regions): @@ -216,6 +217,12 @@ def _google_cls_v2_parse_arguments(args): raise ValueError( '--machine-type not supported together with --min-cores or --min-ram.') + if args.provider == 'google-batch': + if args.zones and any('*' in zone for zone in args.zones): + raise ValueError('Wildcard zones not supported for google-batch.') + if args.regions and any('*' in region for region in args.regions): + raise ValueError('Wildcard regions not supported for google-batch.') + _check_private_address(args) _check_nvidia_driver_version(args) @@ -494,6 +501,20 @@ def _parse_arguments(prog, argv): following third-party software onto your job's Compute Engine instances: NVIDIA(R) Tesla(R) drivers and NVIDIA(R) CUDA toolkit. (default: 0)""") + google_common.add_argument( + '--boot-disk-image', + help="""Custom boot disk image to use (e.g., a deeplearning image with + GPU drivers pre-installed). If not specified and an accelerator is + present, the `google-batch` provider defaults to 'batch-debian'. + (default: None)""") + google_common.add_argument( + '--install-gpu-drivers', + type=lambda x: {'true': True, 'false': False}[x.lower()], + default=None, + help="""Whether to install GPU drivers. Defaults to true when an + accelerator is present. Set to false when + using images with pre-installed drivers. Valid values: true, false. 
+ (default: auto-detect)""") google_common.add_argument( '--credentials-file', type=str, @@ -591,8 +612,8 @@ def _parse_arguments(prog, argv): 'local': ['logging'], }, argv) - if args.provider == 'google-cls-v2': - _google_cls_v2_parse_arguments(args) + if args.provider == 'google-cls-v2' or args.provider == 'google-batch': + _google_parse_arguments(args) return args @@ -638,7 +659,9 @@ def _get_job_resources(args): enable_stackdriver_monitoring=args.enable_stackdriver_monitoring, max_retries=args.retries, max_preemptible_attempts=args.preemptible, - block_external_network=args.block_external_network) + block_external_network=args.block_external_network, + boot_disk_image=args.boot_disk_image, + install_gpu_drivers=args.install_gpu_drivers) def _get_job_metadata(provider, user_id, job_name, script, task_ids, diff --git a/dsub/lib/job_model.py b/dsub/lib/job_model.py index a098d47..7b39fde 100644 --- a/dsub/lib/job_model.py +++ b/dsub/lib/job_model.py @@ -445,6 +445,8 @@ class Resources( 'max_retries', 'max_preemptible_attempts', 'block_external_network', + 'boot_disk_image', + 'install_gpu_drivers', ])): """Job resource parameters related to CPUs, memory, and disk. @@ -484,6 +486,8 @@ class Resources( representing always preemtible. block_external_network (bool): Prevents the containers from accessing the external network. + boot_disk_image (str): Custom boot disk image to use + install_gpu_drivers (bool): Whether to install GPU drivers. """ __slots__ = () @@ -515,7 +519,9 @@ def __new__(cls, enable_stackdriver_monitoring=None, max_retries=None, max_preemptible_attempts=None, - block_external_network=None): + block_external_network=None, + boot_disk_image=None, + install_gpu_drivers=None): return super(Resources, cls).__new__(cls, min_cores, min_ram, machine_type, disk_size, disk_type, boot_disk_size, preemptible, image, @@ -525,7 +531,8 @@ def __new__(cls, accelerator_count, nvidia_driver_version, timeout, log_interval, ssh, enable_stackdriver_monitoring, max_retries, max_preemptible_attempts, - block_external_network) + block_external_network, boot_disk_image, + install_gpu_drivers) def ensure_job_params_are_complete(job_params): diff --git a/dsub/lib/param_util.py b/dsub/lib/param_util.py index c5b9dba..064c594 100644 --- a/dsub/lib/param_util.py +++ b/dsub/lib/param_util.py @@ -910,8 +910,7 @@ def age_to_create_time(age, from_time=None): else: # If no unit is given treat the age as seconds from epoch, otherwise apply # the correct time unit. - return dsub_util.replace_timezone( - datetime.datetime.utcfromtimestamp(int(age)), pytz.utc) + return datetime.datetime.fromtimestamp(int(age), tz=pytz.utc) except (ValueError, OverflowError) as e: raise ValueError('Unable to parse age string %s: %s' % (age, e)) diff --git a/dsub/providers/google_base.py b/dsub/providers/google_base.py index 28019c9..6e3426a 100644 --- a/dsub/providers/google_base.py +++ b/dsub/providers/google_base.py @@ -340,7 +340,10 @@ def handle_cancel_response(request_id, response, exception): """Callback for the cancel response.""" del response # unused - if exception: + if exception and isinstance(exception, TypeError): + # TODO: Remove once Batch Python Client is updated. + canceled.append({'name': request_id}) + elif exception: # We don't generally expect any failures here, except possibly trying # to cancel an operation that is already canceled or finished. 
# diff --git a/dsub/providers/google_batch.py b/dsub/providers/google_batch.py index 654539b..172e216 100644 --- a/dsub/providers/google_batch.py +++ b/dsub/providers/google_batch.py @@ -18,7 +18,9 @@ """ import ast +import operator import os +import re import sys import textwrap from typing import Dict, List, Set @@ -92,7 +94,7 @@ # [task_id:task/,runnable_index:] # pylint: disable=anomalous-backslash-in-string -_LOG_FILTER_PYTHON = textwrap.dedent(""" +_LOG_FILTER_PYTHON = textwrap.dedent(r""" import fileinput import glob import re @@ -199,6 +201,52 @@ def copy_log_to_staging(glob_str: str, staging_path: str, filter_str: str = None """) +_EVENT_REGEX_MAP = { + 'scheduled': re.compile('^Job state is set from QUEUED to SCHEDULED'), + 'start': re.compile('^Job state is set from SCHEDULED to RUNNING'), + 'ok': re.compile('^Job state is set from RUNNING to SUCCEEDED'), + 'fail': re.compile('^Job state is set from .+? to FAILED'), + 'cancellation-in-progress': re.compile( + '^Job state is set from .+? to CANCELLATION_IN_PROGRESS' + ), + 'canceled': re.compile('^Job state is set from .+? to CANCELLED'), +} + + +class GoogleBatchEventMap(object): + """Helper for extracing a set of normalized, filtered operation events.""" + + def __init__(self, op: batch_v1.types.Job): + self._op = op + + def get_filtered_normalized_events(self): + """Map and filter the batch API events down to events of interest. + + Returns: + A list of maps containing the normalized, filtered events. + """ + events = {} + for event in google_batch_operations.get_status_events(self._op): + mapped, _ = self._map(event) + name = mapped['name'] + + events[name] = mapped + + return sorted(list(events.values()), key=operator.itemgetter('event-time')) + + def _map(self, event): + """Extract elements from a Batch status event and map to a named event.""" + description = event.description + event_time = event.event_time.rfc3339() + + for name, regex in _EVENT_REGEX_MAP.items(): + match = regex.match(description) + if match: + return {'name': name, 'event-time': event_time}, match + + return {'name': description, 'event-time': event_time}, None + + class GoogleBatchOperation(base.Task): """Task wrapper around a Batch API Job object.""" @@ -296,12 +344,19 @@ def get_field(self, field: str, default: str = None): elif field == 'provider': return _PROVIDER_NAME elif field == 'provider-attributes': - # TODO: This needs to return instance (VM) metadata - value = {} - value['preemptible'] = google_batch_operations.get_preemptible(self._op) + value = { + 'boot-disk-size': google_batch_operations.get_boot_disk_size( + self._op + ), + 'disk-size': google_batch_operations.get_disk_size(self._op), + 'disk-type': google_batch_operations.get_disk_type(self._op), + 'machine-type': google_batch_operations.get_machine_type(self._op), + 'regions': google_batch_operations.get_regions(self._op), + 'zones': google_batch_operations.get_zones(self._op), + 'preemptible': google_batch_operations.get_preemptible(self._op), + } elif field == 'events': - # TODO: This needs to return a list of events - value = [] + value = GoogleBatchEventMap(self._op).get_filtered_normalized_events() elif field == 'script-name': if self._job_descriptor: value = self._job_descriptor.job_metadata.get(field) @@ -321,9 +376,14 @@ def get_field(self, field: str, default: str = None): elif field == 'status': value = self._operation_status() elif field == 'status-message': - value = self._operation_status_message() + msg, _, _ = self._operation_status_message() + value = msg elif field 
== 'status-detail': - value = self._operation_status_message() + # As much detail as we can reasonably get from the operation + msg, _, detail = self._operation_status_message() + if detail: + msg = detail + value = msg else: raise ValueError(f'Unsupported field: "{field}"') @@ -348,7 +408,7 @@ def _operation_status(self): return 'RUNNING' if google_batch_operations.is_success(self._op): return 'SUCCESS' - if google_batch_operations.is_canceled(): + if google_batch_operations.is_canceled(self._op): return 'CANCELED' if google_batch_operations.is_failed(self._op): return 'FAILURE' @@ -360,11 +420,32 @@ def _operation_status(self): ) def _operation_status_message(self): - # TODO: This is intended to grab as much detail as possible - # Currently, just grabbing the description field from the last status_event + """Returns the most relevant status string and failed action. + + This string is meant for display only. + + Returns: + A triple of: + - printable status message + - the action that failed (if any) + - a detail message (if available) + """ + msg = '' + action = None + detail = None status_events = google_batch_operations.get_status_events(self._op) + if not google_batch_operations.is_done(self._op): + msg = 'RUNNING' + elif google_batch_operations.is_success(self._op): + msg = 'SUCCESS' + elif google_batch_operations.is_canceled(self._op): + msg = 'CANCELED' + elif google_batch_operations.is_failed(self._op): + msg = 'FAILURE' + if status_events: - return status_events[-1].description + detail = status_events[-1].description + return msg, action, detail class GoogleBatchBatchHandler(object): @@ -406,7 +487,7 @@ def _batch_handler_def(self): return GoogleBatchBatchHandler def _operations_cancel_api_def(self): - return batch_v1.BatchServiceClient().delete_job + return batch_v1.BatchServiceClient().cancel_job def _get_provisioning_model(self, task_resources): if task_resources.preemptible: @@ -436,10 +517,6 @@ def _get_batch_job_regions(self, regions, zones) -> List[str]: return [f'regions/{self._location}'] return (regions or []) + (zones or []) - def _get_create_time_filters(self, create_time_min, create_time_max): - # TODO: Currently, Batch API does not support filtering by create t. 
- return [] - def _get_logging_env(self, logging_uri, user_project, include_filter_script): """Returns the environment for actions that copy logging files.""" if not logging_uri.endswith('.log'): @@ -621,6 +698,7 @@ def _create_batch_request( entrypoint='/bin/bash', volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'], commands=['-c', continuous_logging_cmd], + options=None ) ) @@ -634,6 +712,7 @@ def _create_batch_request( entrypoint='/bin/bash', volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'], commands=['-c', prepare_command], + options=None ) ) @@ -655,12 +734,15 @@ def _create_batch_request( cp_loop=google_utils.LOCALIZATION_LOOP, ), ], + options=None ) ) user_command_volumes = [f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'] for gcs_volume in self._get_gcs_volumes_for_user_command(mounts): user_command_volumes.append(gcs_volume) + # Add --gpus all option for GPU-enabled containers + container_options = "--gpus all" if job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia') else None runnables.append( # user-command google_batch_operations.build_runnable( @@ -679,6 +761,7 @@ def _create_batch_request( user_script=script_path, ), ], + options=container_options, ) ) @@ -700,6 +783,7 @@ def _create_batch_request( cp_loop=google_utils.DELOCALIZATION_LOOP, ), ], + options=None ) ) @@ -713,6 +797,7 @@ def _create_batch_request( entrypoint='/bin/bash', volumes=[f'{_VOLUME_MOUNT_POINT}:{_DATA_MOUNT_POINT}'], commands=['-c', logging_cmd], + options=None ), ) @@ -720,15 +805,25 @@ def _create_batch_request( # instance type and resources attached to each VM. The AllocationPolicy # describes when, where, and how compute resources should be allocated # for the Job. + boot_disk_size = ( + job_resources.boot_disk_size if job_resources.boot_disk_size else 0 + ) + # Determine boot disk image: use user-specified value, or default to batch-debian for GPU jobs + if job_resources.boot_disk_image: + boot_disk_image = job_resources.boot_disk_image + elif job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia'): + boot_disk_image = "batch-debian" + else: + boot_disk_image = None boot_disk = google_batch_operations.build_persistent_disk( - size_gb=max( - job_resources.boot_disk_size, job_model.LARGE_BOOT_DISK_SIZE - ), + size_gb=max(boot_disk_size, job_model.LARGE_BOOT_DISK_SIZE), disk_type=job_model.DEFAULT_DISK_TYPE, + image=boot_disk_image, ) disk = google_batch_operations.build_persistent_disk( size_gb=job_resources.disk_size, disk_type=job_resources.disk_type or job_model.DEFAULT_DISK_TYPE, + image=None ) attached_disk = google_batch_operations.build_attached_disk( disk=disk, device_name=google_utils.DATA_DISK_NAME @@ -756,11 +851,17 @@ def _create_batch_request( provisioning_model=self._get_provisioning_model(task_resources), ) + # Determine whether to install GPU drivers: use user-specified value, or default to True for GPU jobs + if job_resources.install_gpu_drivers is not None: + install_gpu_drivers = job_resources.install_gpu_drivers + elif job_resources.accelerator_type is not None: + install_gpu_drivers = True + else: + install_gpu_drivers = False + ipt = google_batch_operations.build_instance_policy_or_template( instance_policy=instance_policy, - install_gpu_drivers=True - if job_resources.accelerator_type is not None - else False, + install_gpu_drivers=install_gpu_drivers, ) if job_resources.service_account: @@ -800,7 +901,7 @@ def _create_batch_request( # Bring together the task definition(s) and build the Job request. 
task_spec = google_batch_operations.build_task_spec( - runnables=runnables, volumes=([datadisk_volume] + gcs_volumes) + runnables=runnables, volumes=([datadisk_volume] + gcs_volumes), max_run_duration=job_resources.timeout ) task_group = google_batch_operations.build_task_group( task_spec, task_count=1, task_count_per_node=1 @@ -938,11 +1039,8 @@ def lookup_job_tasks( page_size=0, ): client = batch_v1.BatchServiceClient() - # TODO: Batch API has no 'done' filter like lifesciences API. - # Need to figure out how to filter for jobs that are completed. - empty_statuses = set() ops_filter = self._build_query_filter( - empty_statuses, + statuses, user_ids, job_ids, job_names, diff --git a/dsub/providers/google_batch_operations.py b/dsub/providers/google_batch_operations.py index 5bba01d..2d66b7d 100644 --- a/dsub/providers/google_batch_operations.py +++ b/dsub/providers/google_batch_operations.py @@ -48,6 +48,7 @@ def is_done(op: batch_v1.types.Job) -> bool: return op.status.state in [ batch_v1.types.job.JobStatus.State.SUCCEEDED, batch_v1.types.job.JobStatus.State.FAILED, + batch_v1.types.job.JobStatus.State.CANCELLED, ] @@ -56,10 +57,9 @@ def is_success(op: batch_v1.types.Job) -> bool: return op.status.state == batch_v1.types.job.JobStatus.State.SUCCEEDED -def is_canceled() -> bool: +def is_canceled(op: batch_v1.types.Job) -> bool: """Return whether the operation was canceled by the user.""" - # TODO: Verify if the batch job has a canceled enum - return False + return op.status.state == batch_v1.types.job.JobStatus.State.CANCELLED def is_failed(op: batch_v1.types.Job) -> bool: @@ -109,6 +109,44 @@ def get_preemptible(op: batch_v1.types.Job) -> bool: raise ValueError(f'Invalid provisioning_model value: {pm}') +def get_boot_disk_size(op: batch_v1.types.Job) -> int: + return op.allocation_policy.instances[0].policy.boot_disk.size_gb + + +def get_disk_size(op: batch_v1.types.Job) -> int: + return op.allocation_policy.instances[0].policy.disks[0].new_disk.size_gb + + +def get_disk_type(op: batch_v1.types.Job) -> str: + return op.allocation_policy.instances[0].policy.disks[0].new_disk.type + + +def get_machine_type(op: batch_v1.types.Job) -> str: + return op.allocation_policy.instances[0].policy.machine_type + + +def get_zones(op: batch_v1.types.Job) -> List[str]: + list_of_locations = list(op.allocation_policy.location.allowed_locations) + # Filter to get only zones and remove the prefix + zones = [ + location.replace('zones/', '') + for location in list_of_locations + if location.startswith('zones/') + ] + return zones + + +def get_regions(op: batch_v1.types.Job) -> List[str]: + list_of_locations = list(op.allocation_policy.location.allowed_locations) + # Filter to get only regions and remove the prefix + regions = [ + location.replace('regions/', '') + for location in list_of_locations + if location.startswith('regions/') + ] + return regions + + def build_job( task_groups: List[batch_v1.types.TaskGroup], allocation_policy: batch_v1.types.AllocationPolicy, @@ -126,6 +164,7 @@ def build_job( def build_task_spec( runnables: List[batch_v1.types.task.Runnable], volumes: List[batch_v1.types.Volume], + max_run_duration: str, ) -> batch_v1.types.TaskSpec: """Build a TaskSpec object for a Batch request. 
@@ -139,6 +178,7 @@ def build_task_spec( task_spec = batch_v1.TaskSpec() task_spec.runnables = runnables task_spec.volumes = volumes + task_spec.max_run_duration = max_run_duration return task_spec @@ -171,13 +211,14 @@ def build_task_group( def build_container( - image_uri: str, entrypoint: str, volumes: List[str], commands: List[str] + image_uri: str, entrypoint: str, volumes: List[str], commands: List[str], options: Optional[str] ) -> batch_v1.types.task.Runnable.Container: container = batch_v1.types.task.Runnable.Container() container.image_uri = image_uri container.entrypoint = entrypoint container.commands = commands container.volumes = volumes + container.options = options return container @@ -189,6 +230,7 @@ def build_runnable( entrypoint: str, volumes: List[str], commands: List[str], + options: Optional[str], ) -> batch_v1.types.task.Runnable: """Build a Runnable object for a Batch request. @@ -201,11 +243,12 @@ def build_runnable( entrypoint (str): Docker image entrypoint path volumes (List[str]): List of volume mounts (host_path:container_path) commands (List[str]): Command arguments to pass to the entrypoint + options (str): Container options such as "--gpus all" Returns: An object representing a Runnable """ - container = build_container(image_uri, entrypoint, volumes, commands) + container = build_container(image_uri, entrypoint, volumes, commands, options) runnable = batch_v1.Runnable() runnable.container = container runnable.background = run_in_background @@ -361,11 +404,13 @@ def build_attached_disk( def build_persistent_disk( - size_gb: int, disk_type: str + size_gb: int, disk_type: str, image: Optional[str] ) -> batch_v1.types.AllocationPolicy.Disk: disk = batch_v1.AllocationPolicy.Disk() disk.type = disk_type disk.size_gb = size_gb + if image: + disk.image = image return disk diff --git a/dsub/providers/google_utils.py b/dsub/providers/google_utils.py index 5b4f64d..cea1196 100644 --- a/dsub/providers/google_utils.py +++ b/dsub/providers/google_utils.py @@ -26,10 +26,10 @@ from ..lib import providers_util STATUS_FILTER_MAP = { - 'RUNNING': 'done = false', - 'CANCELED': 'error = 1', - 'FAILURE': 'error > 1', - 'SUCCESS': 'error = 0', + 'RUNNING': 'status.state="RUNNING" OR status.state="QUEUED" OR status.state="SCHEDULED"', + 'CANCELED': 'status.state="CANCELLED"', + 'FAILURE': 'status.state="FAILED"', + 'SUCCESS': 'status.state="SUCCEEDED"', } @@ -54,7 +54,7 @@ def label_filter(label_key, label_value): def create_time_filter(create_time, comparator): """Return a valid createTime filter for operations.list().""" - return 'createTime {} "{}"'.format(comparator, create_time) + return 'createTime {} "{}"'.format(comparator, create_time.isoformat()) # Generate command to create the directories for the dsub user environment diff --git a/dsub/providers/provider_base.py b/dsub/providers/provider_base.py index 814b2e4..8c15dca 100644 --- a/dsub/providers/provider_base.py +++ b/dsub/providers/provider_base.py @@ -81,14 +81,13 @@ def create_parser(prog): parser.add_argument( '--provider', - default='google-cls-v2', - choices=['local', 'google-cls-v2', 'google-batch', 'test-fails'], + default='google-batch', + choices=['local', 'google-batch', 'test-fails'], help="""Job service provider. Valid values are - "google-cls-v2" (Google's Pipelines API v2beta), "google-batch" (Google's Batch API v1alpha1), and "local" (local Docker execution). "test-*" providers are for testing purposes only. 
- (default: google-cls-v2)""", + (default: google-batch)""", metavar='PROVIDER', ) diff --git a/setup.py b/setup.py index 1e06af3..b7fcfba 100644 --- a/setup.py +++ b/setup.py @@ -14,22 +14,22 @@ # dependencies for dsub, ddel, dstat # Pin to known working versions to prevent episodic breakage from library # version mismatches. - # This version list generated: 05/03/2024 + # This version list generated: 09/03/2025 # direct dependencies - 'google-api-python-client>=2.47.0,<=2.131.0', - 'google-auth>=2.6.6,<=2.29.0', - 'google-cloud-batch<=0.17.20', + 'google-api-python-client>=2.47.0,<=2.181.0', + 'google-auth>=2.6.6,<=2.40.3', + 'google-cloud-batch>=0.17.36', 'python-dateutil<=2.9.0', - 'pytz<=2024.1', + 'pytz<=2025.2', 'pyyaml<=6.0.1', 'tenacity<=8.2.3', 'tabulate<=0.9.0', # downstream dependencies 'funcsigs==1.0.2', - 'google-api-core>=2.7.3,<=2.19.0', + 'google-api-core>=2.7.3,<=2.25.1', 'google-auth-httplib2<=0.2.0', 'httplib2<=0.22.0', - 'protobuf>=3.19.0,<=5.26.0', + 'protobuf>=3.19.0,<=6.32.0', 'pyasn1<=0.6.0', 'pyasn1-modules<=0.4.0', 'rsa<=4.9', diff --git a/test/integration/e2e_accelerator.google-batch.sh b/test/integration/e2e_accelerator.google-batch.sh new file mode 100755 index 0000000..9a9a1c6 --- /dev/null +++ b/test/integration/e2e_accelerator.google-batch.sh @@ -0,0 +1,124 @@ +#!/bin/bash + +# Copyright 2025 Verily Life Sciences Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset + +# Test GPU support in Google Batch provider. +# Validates that NVIDIA accelerators trigger: +# 1. --gpus all container option +# 2. batch-debian boot disk image +# 3. Actual GPU access in the running container +# +# Required environment variables: +# DOCKER_IMAGE - Google Artifact Registry image with GPU support +# Example: us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest +# PET_SA_EMAIL - Service account with access to GAR image and GPU resources +# Example: my-service-account@my-project.iam.gserviceaccount.com +# +# Optional environment variables (for VPC-SC or custom networking): +# GPU_NETWORK - Network configuration +# Example: projects/my-project/global/networks/my-network +# GPU_SUBNETWORK - Subnetwork configuration +# Example: projects/my-project/regions/us-central1/subnetworks/my-subnet +# GPU_USE_PRIVATE_ADDRESS - Set to any value to use private address + +readonly SCRIPT_DIR="$(dirname "${0}")" + +# Do standard test setup +source "${SCRIPT_DIR}/test_setup_e2e.sh" + +# Check GPU-specific prerequisites +if [[ -z "${DOCKER_IMAGE:-}" ]]; then + 1>&2 echo "ERROR: DOCKER_IMAGE environment variable is not set." + 1>&2 echo "This test requires a Google Artifact Registry image with GPU support." + 1>&2 echo "Set it with: export DOCKER_IMAGE='REGION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG'" + 1>&2 echo "Example: export DOCKER_IMAGE='us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest'" + exit 1 +fi + +if [[ -z "${PET_SA_EMAIL:-}" ]]; then + 1>&2 echo "ERROR: PET_SA_EMAIL environment variable is not set." 
+ 1>&2 echo "This test requires a service account with access to the GAR image and GPU resources." + 1>&2 echo "Set it with: export PET_SA_EMAIL='my-service-account@my-project.iam.gserviceaccount.com'" + exit 1 +fi + +echo "Launching GPU pipeline with Google Batch provider..." +echo " Using image: ${DOCKER_IMAGE}" +echo " Using service account: ${PET_SA_EMAIL}" + +# Test nvidia accelerator enables GPU features +# Uses DOCKER_IMAGE and PET_SA_EMAIL environment variables (required) +# Optionally uses GPU_NETWORK, GPU_SUBNETWORK, and GPU_USE_PRIVATE_ADDRESS if set +run_dsub \ + --provider 'google-batch' \ + --image "${DOCKER_IMAGE}" \ + --service-account "${PET_SA_EMAIL}" \ + ${GPU_NETWORK:+--network "${GPU_NETWORK}"} \ + ${GPU_SUBNETWORK:+--subnetwork "${GPU_SUBNETWORK}"} \ + ${GPU_USE_PRIVATE_ADDRESS:+--use-private-address} \ + --accelerator-type 'nvidia-tesla-t4' \ + --accelerator-count 1 \ + --env NVIDIA_VISIBLE_DEVICES=all \ + --command '\ + echo "=== GPU Detection Test ===" && \ + nvidia-smi && \ + echo "=== Boot Image Test ===" && \ + cat /etc/os-release | grep "ID=" && \ + echo "=== Container GPU Access Test ===" && \ + nvidia-smi -L' \ + --wait + +echo +echo "Checking GPU detection output..." + +# Check that GPU was detected and accessible +RESULT="$(gsutil cat "${STDOUT_LOG}")" + +# Validate GPU hardware was detected +if ! echo "${RESULT}" | grep -qi "Tesla T4"; then + 1>&2 echo "ERROR: Tesla T4 GPU not detected in nvidia-smi output!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +# Validate GPU memory info is present +if ! echo "${RESULT}" | grep -qi "GPU.*Memory"; then + 1>&2 echo "ERROR: GPU Memory information not found!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +# Validate container has GPU access (nvidia-smi -L should list GPUs) +if ! echo "${RESULT}" | grep -qi "GPU 0:"; then + 1>&2 echo "ERROR: Container does not have GPU access (nvidia-smi -L failed)!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +echo +echo "GPU test output (showing GPU was accessible):" +echo "*****************************" +echo "${RESULT}" +echo "*****************************" +echo "SUCCESS: GPU accelerator test passed!" +echo "- GPU hardware detected" +echo "- Container has GPU access" +echo "- batch-debian image used (implied by successful GPU access)" \ No newline at end of file diff --git a/test/integration/e2e_accelerator_vpc_sc.google-batch.sh b/test/integration/e2e_accelerator_vpc_sc.google-batch.sh new file mode 100755 index 0000000..b798992 --- /dev/null +++ b/test/integration/e2e_accelerator_vpc_sc.google-batch.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +# Copyright 2025 Verily Life Sciences Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test GPU support in VPC-SC environments with Google Batch provider. +# Validates that custom boot disk images and driver installation flags work correctly: +# 1. Custom boot disk image with pre-installed drivers +# 2. 
--install-gpu-drivers false to skip driver downloads +# 3. VPC network configuration for VPC-SC perimeters +# 4. Private IP only mode for VPC-SC compliance +# 5. Actual GPU access in the running container with pre-installed drivers +# +# REQUIRED ENVIRONMENT VARIABLES: +# DOCKER_IMAGE - GPU-enabled container image from Google Artifact Registry +# Example: us-central1-docker.pkg.dev/my-project/my-repo/image:tag +# PET_SA_EMAIL - Service account email with access to GAR image and GPU resources +# Example: my-sa@my-project.iam.gserviceaccount.com +# YOUR_BUCKET - GCS bucket name for test outputs (do NOT include gs:// prefix) +# Example: my-test-bucket +# GPU_NETWORK - VPC network for VPC-SC perimeters +# Example: projects/my-project/global/networks/network +# GPU_SUBNETWORK - VPC subnetwork for VPC-SC perimeters (must match REGIONS) +# Example: projects/my-project/regions/us-west2/subnetworks/subnetwork +# REGIONS - GCP region where the job will run (must match subnetwork region) +# Example: us-west2 +# +# OPTIONAL ENVIRONMENT VARIABLES: +# BOOT_DISK_IMAGE - Custom boot disk image with pre-installed GPU drivers +# Default: projects/${PROJECT_ID}/global/images/deeplearning-driver +# Alternative: projects/deeplearning-platform-release/global/images/family/common-cu121-debian-11-py310 +# +# USAGE: +# export DOCKER_IMAGE='us-central1-docker.pkg.dev/my-project/my-repo/image:tag' +# export PET_SA_EMAIL='my-sa@my-project.iam.gserviceaccount.com' +# export YOUR_BUCKET='my-test-bucket' # Do NOT include gs:// +# export GPU_NETWORK='projects/my-project/global/networks/network' +# export GPU_SUBNETWORK='projects/my-project/regions/us-west2/subnetworks/subnetwork' +# export REGIONS='us-west2' # Must match subnetwork region +# ./test/integration/e2e_accelerator_vpc_sc.google-batch.sh + +set -o errexit +set -o nounset + +readonly SCRIPT_DIR="$(dirname "${0}")" + +# Do standard test setup +source "${SCRIPT_DIR}/test_setup_e2e.sh" + +# Check GPU-specific prerequisites +if [[ -z "${DOCKER_IMAGE:-}" ]]; then + 1>&2 echo "ERROR: DOCKER_IMAGE environment variable is not set." + 1>&2 echo "This test requires a Google Artifact Registry image with GPU support." + 1>&2 echo "Set it with: export DOCKER_IMAGE='REGION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG'" + 1>&2 echo "Example: export DOCKER_IMAGE='us-central1-docker.pkg.dev/my-project/my-repo/parabricks:latest'" + exit 1 +fi + +if [[ -z "${PET_SA_EMAIL:-}" ]]; then + 1>&2 echo "ERROR: PET_SA_EMAIL environment variable is not set." + 1>&2 echo "This test requires a service account with access to the GAR image and GPU resources." + 1>&2 echo "Set it with: export PET_SA_EMAIL='my-service-account@my-project.iam.gserviceaccount.com'" + exit 1 +fi + +if [[ -z "${YOUR_BUCKET:-}" ]]; then + 1>&2 echo "ERROR: YOUR_BUCKET environment variable is not set." + 1>&2 echo "This test requires a GCS bucket for test outputs." + 1>&2 echo "Set it with: export YOUR_BUCKET='my-test-bucket' (do NOT include gs:// prefix)" + exit 1 +fi + +if [[ -z "${GPU_NETWORK:-}" ]]; then + 1>&2 echo "ERROR: GPU_NETWORK environment variable is not set." + 1>&2 echo "This VPC-SC test requires a VPC network configuration." + 1>&2 echo "Set it with: export GPU_NETWORK='projects/\${GOOGLE_CLOUD_PROJECT}/global/networks/network'" + exit 1 +fi + +if [[ -z "${GPU_SUBNETWORK:-}" ]]; then + 1>&2 echo "ERROR: GPU_SUBNETWORK environment variable is not set." + 1>&2 echo "This VPC-SC test requires a VPC subnetwork configuration." 
+ 1>&2 echo "Set it with: export GPU_SUBNETWORK='projects/\${GOOGLE_CLOUD_PROJECT}/regions/us-west2/subnetworks/subnetwork'" + exit 1 +fi + +if [[ -z "${REGIONS:-}" ]]; then + 1>&2 echo "ERROR: REGIONS environment variable is not set." + 1>&2 echo "This VPC-SC test requires specifying the region (must match subnetwork region)." + 1>&2 echo "Set it with: export REGIONS='us-west2'" + exit 1 +fi + +# Optional: Custom boot disk image (defaults to project-specific deeplearning-driver image) +# For VPC-SC environments, this should be an image with pre-installed GPU drivers +if [[ -z "${BOOT_DISK_IMAGE:-}" ]]; then + # Default to custom Deep Learning image in the project + # This assumes you have created a custom image with GPU drivers pre-installed + BOOT_DISK_IMAGE="projects/${PROJECT_ID}/global/images/deeplearning-driver" + echo "Using default boot disk image: ${BOOT_DISK_IMAGE}" +else + echo "Using custom boot disk image: ${BOOT_DISK_IMAGE}" +fi + +echo "Launching GPU pipeline in VPC-SC mode with Google Batch provider..." +echo " Using GAR image: ${DOCKER_IMAGE}" +echo " Using service account: ${PET_SA_EMAIL}" +echo " Using boot disk image: ${BOOT_DISK_IMAGE}" +echo " Using VPC network: ${GPU_NETWORK}" +echo " Using VPC subnetwork: ${GPU_SUBNETWORK}" +echo " Region: ${REGIONS}" +echo " Install GPU drivers: false" +echo " Private IP only: true" + +# Test VPC-SC scenario with custom boot image and no driver installation +# Uses required VPC-SC parameters: GPU_NETWORK, GPU_SUBNETWORK, REGIONS +# Note: Calls dsub directly (not run_dsub) to avoid hardcoded network defaults in test_setup.sh +dsub \ + --provider 'google-batch' \ + --project "${PROJECT_ID}" \ + --regions "${REGIONS}" \ + --logging "${LOGGING}" \ + --image "${DOCKER_IMAGE}" \ + --service-account "${PET_SA_EMAIL}" \ + --network "${GPU_NETWORK}" \ + --subnetwork "${GPU_SUBNETWORK}" \ + --use-private-address \ + --boot-disk-image "${BOOT_DISK_IMAGE}" \ + --boot-disk-size 200 \ + --install-gpu-drivers false \ + --accelerator-type 'nvidia-tesla-t4' \ + --accelerator-count 1 \ + --env NVIDIA_VISIBLE_DEVICES=all \ + --command '\ + echo "=== GPU Detection Test ===" && \ + nvidia-smi && \ + echo "=== Boot Image Test ===" && \ + cat /etc/os-release | grep "ID=" && \ + echo "=== Container GPU Access Test ===" && \ + nvidia-smi -L' \ + --wait + +echo +echo "Checking GPU detection output..." + +# Check that GPU was detected and accessible +RESULT="$(gsutil cat "${STDOUT_LOG}")" + +# Validate GPU hardware was detected +if ! echo "${RESULT}" | grep -qi "Tesla T4"; then + 1>&2 echo "ERROR: Tesla T4 GPU not detected in nvidia-smi output!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +# Validate GPU memory info is present +if ! echo "${RESULT}" | grep -qi "GPU.*Memory"; then + 1>&2 echo "ERROR: GPU Memory information not found!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +# Validate container has GPU access (nvidia-smi -L should list GPUs) +if ! echo "${RESULT}" | grep -qi "GPU 0:"; then + 1>&2 echo "ERROR: Container does not have GPU access (nvidia-smi -L failed)!" + 1>&2 echo "stdout content:" + 1>&2 echo "${RESULT}" + exit 1 +fi + +echo +echo "VPC-SC GPU test output (showing GPU was accessible with pre-installed drivers):" +echo "*****************************" +echo "${RESULT}" +echo "*****************************" +echo "SUCCESS: VPC-SC GPU accelerator test passed!" 
+echo "- GPU hardware detected" +echo "- Container has GPU access" +echo "- Custom boot disk image used: ${BOOT_DISK_IMAGE}" +echo "- GPU drivers pre-installed (driver installation was disabled)" diff --git a/test/integration/e2e_command_flag.sh b/test/integration/e2e_command_flag.sh index 77ef45a..da66abc 100755 --- a/test/integration/e2e_command_flag.sh +++ b/test/integration/e2e_command_flag.sh @@ -32,7 +32,6 @@ echo "Launching pipeline..." BOOT_DISK_SIZE=20 \ run_dsub \ - --image "debian:stable-slim" \ --min-cores "1" \ --min-ram "3.75" \ --env VAR1="VAL1" VAR2="VAL2" VAR3="VAL3" \ diff --git a/test/integration/e2e_ddel.sh b/test/integration/e2e_ddel.sh index 40f1b87..1e24b86 100755 --- a/test/integration/e2e_ddel.sh +++ b/test/integration/e2e_ddel.sh @@ -113,6 +113,8 @@ if ! util::dstat_yaml_job_has_valid_end_time "${DSTAT_OUTPUT}"; then exit 1 fi +echo "dstat output for ${JOB_ID} includes a valid end time." + # Verify that there is a canceled event util::dstat_yaml_assert_field_equal "${DSTAT_OUTPUT}" "[0].events.[-1].name" "canceled" diff --git a/test/integration/e2e_dstat.sh b/test/integration/e2e_dstat.sh index e00344d..7068d38 100755 --- a/test/integration/e2e_dstat.sh +++ b/test/integration/e2e_dstat.sh @@ -65,7 +65,12 @@ function verify_dstat_output() { check_completed_tasks="0 1 2" fi - local expected_events=(start pulling-image localizing-files running-docker delocalizing-files ok) + local expected_events + if [[ "${DSUB_PROVIDER}" == "google-batch" ]]; then + expected_events=(scheduled start ok) + else + expected_events=(start pulling-image localizing-files running-docker delocalizing-files ok) + fi for task in ${check_completed_tasks}; do util::dstat_out_assert_equal_events "${dstat_out}" "[${task}].events" "${expected_events[@]}" done @@ -116,8 +121,12 @@ function verify_dstat_google_provider_fields() { # For simplicity, let's just check when the tasks are complete. - # Check boot disk: expect default of 10 - util::dstat_yaml_assert_field_equal "${dstat_out}" "[${task}].provider-attributes.boot-disk-size" 10 + # Check boot disk: expect default of 10 (30 for google-batch) + local expected_boot_disk_size=10 + if [[ "${DSUB_PROVIDER}" == "google-batch" ]]; then + expected_boot_disk_size=30 + fi + util::dstat_yaml_assert_field_equal "${dstat_out}" "[${task}].provider-attributes.boot-disk-size" "${expected_boot_disk_size}" # Check data disk: expect default of 200, pd-standard util::dstat_yaml_assert_field_equal "${dstat_out}" "[${task}].provider-attributes.disk-size" 200 @@ -130,19 +139,23 @@ function verify_dstat_google_provider_fields() { util::dstat_yaml_assert_boolean_field_equal "${dstat_out}" "[${task}].provider-attributes.preemptible" "false" # Check that instance name is not empty - local instance_name=$(python3 "${SCRIPT_DIR}"/get_data_value.py "yaml" "${dstat_out}" "[${task}].provider-attributes.instance-name") - if [[ -z "${instance_name}" ]]; then - 1>&2 echo " - FAILURE: Instance ${instance_name} for job ${job_name}, task $((task+1)) is empty." - 1>&2 echo "${dstat_out}" - exit 1 - fi + # Instance name is not available through google-batch provider + if [[ "${DSUB_PROVIDER}" != "google-batch" ]]; then + local instance_name=$(python3 "${SCRIPT_DIR}"/get_data_value.py "yaml" "${dstat_out}" "[${task}].provider-attributes.instance-name") + if [[ -z "${instance_name}" ]]; then + 1>&2 echo " - FAILURE: Instance ${instance_name} for job ${job_name}, task $((task+1)) is empty." 
+ 1>&2 echo "${dstat_out}" + exit 1 + fi - # Check zone exists and is expected format - local job_zone=$(python3 "${SCRIPT_DIR}"/get_data_value.py "yaml" "${dstat_out}" "[${task}].provider-attributes.zone") - if ! [[ "${job_zone}" =~ ^[a-z]{1,4}-[a-z]{2,15}[0-9]-[a-z]$ ]]; then - 1>&2 echo " - FAILURE: Zone ${job_zone} for job ${job_name}, task $((task+1)) not valid." - 1>&2 echo "${dstat_out}" - exit 1 + # Check zone exists and is expected format + # Zone is not available through google-batch provider + local job_zone=$(python3 "${SCRIPT_DIR}"/get_data_value.py "yaml" "${dstat_out}" "[${task}].provider-attributes.zone") + if ! [[ "${job_zone}" =~ ^[a-z]{1,4}-[a-z]{2,15}[0-9]-[a-z]$ ]]; then + 1>&2 echo " - FAILURE: Zone ${job_zone} for job ${job_name}, task $((task+1)) not valid." + 1>&2 echo "${dstat_out}" + exit 1 + fi fi done @@ -198,12 +211,6 @@ verify_dstat_output "${DSTAT_OUTPUT}" echo "Checking dstat (by job-name)..." -# For the google provider, sleep briefly to allow the Pipelines v1 -# to set the compute properties, which occurs shortly after pipeline submit. -if [[ "${DSUB_PROVIDER}" == "google" ]]; then - sleep 2 -fi - if ! DSTAT_OUTPUT="$(run_dstat --status 'RUNNING' 'SUCCESS' --full --names "${RUNNING_JOB_NAME_2}" "${RUNNING_JOB_NAME}" "${COMPLETED_JOB_NAME}" --label "test-token=${TEST_TOKEN}")"; then 1>&2 echo "dstat exited with a non-zero exit code!" 1>&2 echo "Output:" diff --git a/test/integration/e2e_env_tasks.sh b/test/integration/e2e_env_tasks.sh index af7a3ac..512a287 100755 --- a/test/integration/e2e_env_tasks.sh +++ b/test/integration/e2e_env_tasks.sh @@ -50,7 +50,6 @@ echo "Launching pipeline..." # VAL4 tests spaces in variable values # VAR6 tests empty variable value run_dsub \ - --image "ubuntu" \ --script "${SCRIPT_DIR}/script_env_test.sh" \ --env VAR1="VAL1" VAR2="VAL2" VAR3="VAL3" \ --env VAR4="VAL4 (four)" \ diff --git a/test/integration/e2e_error.sh b/test/integration/e2e_error.sh index 0126afc..2c11367 100755 --- a/test/integration/e2e_error.sh +++ b/test/integration/e2e_error.sh @@ -28,7 +28,6 @@ echo "Launching pipeline..." set +o errexit JOB_ID="$(run_dsub \ - --image 'debian:stable-slim' \ --name 'e2e-error' \ --command 'idontknowhowtounix' \ --wait)" @@ -44,6 +43,8 @@ declare -a EXPECTED_EVENTS if [[ "${DSUB_PROVIDER}" == "local" ]]; then # The local provider has slightly different events in this error case EXPECTED_EVENTS=(start pulling-image localizing-files running-docker delocalizing-files fail) +elif [[ "${DSUB_PROVIDER}" == "google-batch" ]]; then + EXPECTED_EVENTS=(scheduled start fail) else EXPECTED_EVENTS=(start pulling-image localizing-files running-docker fail) fi diff --git a/test/integration/e2e_io_recursive.sh b/test/integration/e2e_io_recursive.sh index b168db3..3bca94c 100755 --- a/test/integration/e2e_io_recursive.sh +++ b/test/integration/e2e_io_recursive.sh @@ -69,7 +69,7 @@ gsutil -m rsync -r "${LOCAL_INPUTS}" "${INPUTS}/" echo "Launching pipeline..." JOB_ID="$(run_dsub \ - --image "google/cloud-sdk:latest" \ + --image "gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" \ --script "${SCRIPT_DIR}/script_io_recursive.sh" \ --env FILE_CONTENTS="${FILE_CONTENTS}" \ --input INPUT_PATH_SHALLOW="${INPUTS}/shallow/*" \ diff --git a/test/integration/e2e_python.sh b/test/integration/e2e_python.sh index 8fc232d..2082eaa 100755 --- a/test/integration/e2e_python.sh +++ b/test/integration/e2e_python.sh @@ -27,7 +27,7 @@ source "${SCRIPT_DIR}/test_setup_e2e.sh" echo "Launching pipeline..." 
run_dsub \ - --image python \ + --image gcr.io/google-appengine/python \ --script "${SCRIPT_DIR}/script_python.py" \ --wait diff --git a/test/integration/e2e_python_api.py b/test/integration/e2e_python_api.py index 63841ea..7c50a9c 100644 --- a/test/integration/e2e_python_api.py +++ b/test/integration/e2e_python_api.py @@ -15,6 +15,7 @@ from __future__ import print_function +import datetime import sys import time @@ -76,8 +77,12 @@ def dsub_start_job(command, labels['test-name'] = test_setup.TEST_NAME logging = param_util.build_logging_param(test.LOGGING) + zones = ['us-central1-*'] + if test.DSUB_PROVIDER == 'google-batch': + zones = ['us-central1-a'] job_resources = job_model.Resources( - image='ubuntu', logging=logging, zones=['us-central1-*']) + image='ubuntu', logging=logging, zones=zones + ) env_data = {job_model.EnvParam(k, v) for (k, v) in envs.items()} label_data = {job_model.LabelParam(k, v) for (k, v) in labels.items()} @@ -203,6 +208,10 @@ def dstat_check_job_names(err_message, third_job = dstat_get_jobs(job_ids={job3['job-id']})[0] print('Checking jobs') +# Add a small epsilon to the create_time_max to account for floating point +# precision errors. +epsilon = datetime.timedelta(seconds=0.1) + first_ct = first_job['create-time'] second_ct = second_job['create-time'] third_ct = third_job['create-time'] @@ -210,63 +219,88 @@ def dstat_check_job_names(err_message, dstat_check_job_names( 'Get jobs by create_time_min = {}.'.format(first_ct), ['job3', 'job2', 'job1'], - create_time_min=first_ct) + create_time_min=first_ct, +) dstat_check_job_names( - 'Get jobs by create_time_min = {}.'.format(second_ct), ['job3', 'job2'], - create_time_min=second_ct) + 'Get jobs by create_time_min = {}.'.format(second_ct), + ['job3', 'job2'], + create_time_min=second_ct, +) dstat_check_job_names( - 'Get jobs by create_time_min = {}.'.format(third_ct), ['job3'], - create_time_min=third_ct) + 'Get jobs by create_time_min = {}.'.format(third_ct), + ['job3'], + create_time_min=third_ct, +) dstat_check_job_names( - 'Get jobs by create_time_max = {}.'.format(first_ct), ['job1'], - create_time_max=first_ct) + 'Get jobs by create_time_max = {}.'.format(first_ct + epsilon), + ['job1'], + create_time_max=first_ct + epsilon, +) dstat_check_job_names( - 'Get jobs by create_time_max = {}.'.format(second_ct), ['job2', 'job1'], - create_time_max=second_ct) + 'Get jobs by create_time_max = {}.'.format(second_ct + epsilon), + ['job2', 'job1'], + create_time_max=second_ct + epsilon, +) dstat_check_job_names( 'Get jobs by create_time_max = {}.'.format(third_ct), ['job3', 'job2', 'job1'], - create_time_max=third_ct) + create_time_max=third_ct + epsilon, +) dstat_check_job_names( - f'Get jobs by range: create_time_min = {first_ct}, create_time_max = {first_ct}.', + f'Get jobs by range: create_time_min = {first_ct}, create_time_max =' + f' {first_ct + epsilon}.', ['job1'], create_time_min=first_ct, - create_time_max=first_ct) + create_time_max=first_ct + epsilon, +) dstat_check_job_names( - f'Get jobs by range: create_time_min = {second_ct}, create_time_max = {second_ct}.', + f'Get jobs by range: create_time_min = {second_ct}, create_time_max =' + f' {second_ct + epsilon}.', ['job2'], create_time_min=second_ct, - create_time_max=second_ct) + create_time_max=second_ct + epsilon, +) dstat_check_job_names( - f'Get jobs by range: create_time_min = {third_ct}, create_time_max = {third_ct}.', + f'Get jobs by range: create_time_min = {third_ct}, create_time_max =' + f' {third_ct + epsilon}.', ['job3'], create_time_min=third_ct, 
- create_time_max=third_ct) + create_time_max=third_ct + epsilon, +) dstat_check_job_names( 'Get jobs by range: create_time_min = {}, create_time_max = {}.'.format( - first_ct, second_ct), ['job2', 'job1'], + first_ct, second_ct + epsilon + ), + ['job2', 'job1'], create_time_min=first_ct, - create_time_max=second_ct) + create_time_max=second_ct + epsilon, +) dstat_check_job_names( 'Get jobs by range: create_time_min = {}, create_time_max = {}.'.format( - second_ct, third_ct), ['job3', 'job2'], + second_ct, third_ct + epsilon + ), + ['job3', 'job2'], create_time_min=second_ct, - create_time_max=third_ct) + create_time_max=third_ct + epsilon, +) dstat_check_job_names( 'Get jobs by range: create_time_min = {}, create_time_max = {}.'.format( - first_ct, third_ct), ['job3', 'job2', 'job1'], + first_ct, third_ct + epsilon + ), + ['job3', 'job2', 'job1'], create_time_min=first_ct, - create_time_max=third_ct) + create_time_max=third_ct + epsilon, +) print('SUCCESS') diff --git a/test/integration/e2e_runtime.sh b/test/integration/e2e_runtime.sh index 4fb4e5a..fb276b9 100755 --- a/test/integration/e2e_runtime.sh +++ b/test/integration/e2e_runtime.sh @@ -42,7 +42,6 @@ echo "Launching pipeline..." run_dsub \ --project "${PROJECT_ID}" \ --logging "${LOGGING}" \ - --image "debian:stable-slim" \ --name "google_env.sh" \ --command "${COMMAND}" \ --wait diff --git a/test/integration/script_python.py b/test/integration/script_python.py index fff5042..164f3bc 100755 --- a/test/integration/script_python.py +++ b/test/integration/script_python.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python """Minimal Python script.""" from __future__ import print_function diff --git a/test/integration/test_setup.sh b/test/integration/test_setup.sh index 0e7fe14..e60f78a 100644 --- a/test/integration/test_setup.sh +++ b/test/integration/test_setup.sh @@ -89,11 +89,16 @@ function run_dsub() { } function dsub_google-batch() { + local location="${LOCATION:-}" + dsub \ --provider google-batch \ --project "${PROJECT_ID}" \ ${location:+--location "${location}"} \ --logging "${LOGGING_OVERRIDE:-${LOGGING}}" \ + --network "global/networks/default" \ + --subnetwork "regions/us-central1/subnetworks/default" \ + --use-private-address \ "${@}" } diff --git a/test/integration/test_setup_e2e.py b/test/integration/test_setup_e2e.py index d2534a5..8f01a5f 100644 --- a/test/integration/test_setup_e2e.py +++ b/test/integration/test_setup_e2e.py @@ -176,7 +176,10 @@ def dsub_google_batch(dsub_args): "--provider", "google-batch", "--project", PROJECT_ID, "--logging", LOGGING, - "--regions", "us-central1" + "--regions", "us-central1", + "--network", "global/networks/default", + "--subnetwork", "regions/us-central1/subnetworks/default", + "--use-private-address" ] + opt_args + dsub_args) # pyformat: enable diff --git a/test/integration/test_setup_e2e.sh b/test/integration/test_setup_e2e.sh old mode 100644 new mode 100755 index 060f2a1..c5e7cc1 --- a/test/integration/test_setup_e2e.sh +++ b/test/integration/test_setup_e2e.sh @@ -81,6 +81,46 @@ else DSUB_BUCKET_REQUESTER_PAYS="dsub-test-requester-pays-public" fi +# GPU-specific prerequisites (optional, only needed for GPU tests) +if [[ -n "${DOCKER_IMAGE:-}" ]]; then + echo " GAR image for GPU tests: ${DOCKER_IMAGE}" + + # Check if PET_SA_EMAIL is also set + if [[ -z "${PET_SA_EMAIL:-}" ]]; then + 1>&2 echo "WARNING: DOCKER_IMAGE is set but PET_SA_EMAIL is not." + 1>&2 echo "GPU tests require both DOCKER_IMAGE and PET_SA_EMAIL to be set." 
+ else + echo " Service account for GPU tests: ${PET_SA_EMAIL}" + + # Validate that the service account can access the GAR image + # echo " Validating service account access to GAR image..." + + # # Extract the repository from the image path + # # Format: REGION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG + # GAR_REPO=$(echo "${DOCKER_IMAGE}" | sed -E 's|^([^/]+/[^/]+/[^/]+)/.*|\1|') + + # # Check if the service account has permission to pull from this repository + # # We'll use gcloud artifacts docker images describe with impersonation + # if ! gcloud artifacts docker images describe "${DOCKER_IMAGE}" \ + # --impersonate-service-account="${PET_SA_EMAIL}" \ + # --quiet 2>/dev/null; then + # 1>&2 echo "WARNING: Service account ${PET_SA_EMAIL} may not have access to ${DOCKER_IMAGE}" + # 1>&2 echo "Please ensure the service account has 'Artifact Registry Reader' role on the repository." + # 1>&2 echo "You can grant access with:" + # 1>&2 echo " gcloud artifacts repositories add-iam-policy-binding REPO_NAME \\" + # 1>&2 echo " --location=LOCATION \\" + # 1>&2 echo " --member=serviceAccount:${PET_SA_EMAIL} \\" + # 1>&2 echo " --role=roles/artifactregistry.reader" + # else + # echo " ✓ Service account has access to GAR image" + # fi + fi +elif [[ -n "${PET_SA_EMAIL:-}" ]]; then + echo " Service account for GPU tests: ${PET_SA_EMAIL}" + 1>&2 echo "WARNING: PET_SA_EMAIL is set but DOCKER_IMAGE is not." + 1>&2 echo "GPU tests require both DOCKER_IMAGE and PET_SA_EMAIL to be set." +fi + # Set standard LOGGING, INPUTS, and OUTPUTS values readonly TEST_GCS_ROOT="gs://${DSUB_BUCKET}/dsub/sh/${DSUB_PROVIDER}/${TEST_NAME}" readonly TEST_GCS_DOCKER_ROOT="gs/${DSUB_BUCKET}/dsub/sh/${DSUB_PROVIDER}/${TEST_NAME}" diff --git a/test/integration/unit_flags.google-batch.sh b/test/integration/unit_flags.google-batch.sh index b5b58c7..bec418f 100755 --- a/test/integration/unit_flags.google-batch.sh +++ b/test/integration/unit_flags.google-batch.sh @@ -228,7 +228,7 @@ readonly -f test_no_service_account function test_network() { local subtest="${FUNCNAME[0]}" - if call_dsub \ + if DOCKER_IMAGE_OVERRIDE="marketplace.gcr.io/google/debian9" call_dsub \ --command 'echo "${TEST_NAME}"' \ --network 'network-name-foo' \ --subnetwork 'subnetwork-name-foo' \ @@ -311,24 +311,16 @@ function test_region_and_zone() { --zones us-central1-f \ --regions us-central1; then - # Check that the output contains expected values - regions_result=$(grep " allowed_locations: \"regions/" "${TEST_STDERR}" | awk -F\" '{print $2}') - if [[ "${regions_result}" != "regions/us-central1" ]]; then - 1>&2 echo "location was actually ${regions_result}, expected regions/us-central1" - exit 1 - fi - - zones_result=$(grep " allowed_locations: \"zones/" "${TEST_STDERR}" | awk -F\" '{print $2}') - if [[ "${zones_result}" != "zones/us-central1-f" ]]; then - 1>&2 echo "location was actually ${zones_result}, expected zones/us-central1-f" - exit 1 - fi + 1>&2 echo "Both regions and zones specified - not detected" - test_passed "${subtest}" + test_failed "${subtest}" else - 1>&2 echo "Location not used as default region" + assert_output_empty - test_failed "${subtest}" + assert_err_contains \ + "ValueError: At most one of --regions and --zones may be specified" + + test_passed "${subtest}" fi } readonly -f test_neither_region_nor_zone diff --git a/test/run_tests.sh b/test/run_tests.sh index bba73a8..bc355ea 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -225,9 +225,14 @@ function get_test_providers() { e2e_after_fail.py | \ e2e_after_fail.sh | \ 
e2e_command_flag.sh | \ + e2e_ddel.sh | \ + e2e_ddel_tasks.sh | \ + e2e_dstat.sh | \ + e2e_dstat_tasks.sh | \ e2e_dsub_summary.sh | \ e2e_env_list.py | \ e2e_env_tasks.sh | \ + e2e_error.sh | \ e2e_image.sh | \ e2e_input_wildcards.sh | \ e2e_io.sh | \ @@ -236,7 +241,9 @@ function get_test_providers() { e2e_io_mount_bucket.google-cls-v2.sh | \ e2e_io_mount_bucket_requester_pays.google-cls-v2.sh | \ e2e_io_recursive.sh | \ + e2e_io_tasks.py | \ e2e_io_tasks.sh | \ + e2e_labels.sh | \ e2e_logging_content.sh | \ e2e_logging_fail.sh | \ e2e_logging_paths.sh | \ @@ -248,13 +255,15 @@ function get_test_providers() { e2e_non_root.sh | \ e2e_preemptible_retries_fail.google-cls-v2.sh | \ e2e_python.sh | \ + e2e_python_api.py | \ e2e_requester_pays_buckets.sh | \ e2e_retries_success.sh | \ e2e_retries_fail_1.sh | \ e2e_retries_fail_2.sh | \ e2e_runtime.sh | \ e2e_skip.sh | \ - e2e_skip_tasks.sh) + e2e_skip_tasks.sh | \ + e2e_user.google-cls-v2.sh) local all_provider_list="${DSUB_PROVIDER:-local google-cls-v2 google-batch}" ;; *) diff --git a/test/unit/gpu_test.py b/test/unit/gpu_test.py new file mode 100644 index 0000000..0961524 --- /dev/null +++ b/test/unit/gpu_test.py @@ -0,0 +1,243 @@ +# Copyright 2025 Verily Life Sciences Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for GPU support in the Google Batch provider.""" + +import unittest +from dsub.providers import google_batch +from dsub.lib import job_model + + +class TestGPUSupport(unittest.TestCase): + """Test GPU-specific configurations in Google Batch provider.""" + + def _create_test_job_descriptor(self, accelerator_type=None, boot_disk_image=None, install_gpu_drivers=None): + """Create a minimal JobDescriptor for testing. + + Args: + accelerator_type: The accelerator type to use, or None for no accelerator. + boot_disk_image: Custom boot disk image, or None for default. + install_gpu_drivers: Whether to install GPU drivers, or None for default. + + Returns: + A JobDescriptor configured for testing. 
+ """ + job_metadata = { + 'script': job_model.Script('test.sh', 'echo hello'), + 'job-id': 'test-job-id', + 'job-name': 'test-job-name', + 'user-id': 'test-user', + 'user-project': 'test-project', + 'dsub-version': '1-0-0' + } + + job_params = {} + job_model.ensure_job_params_are_complete(job_params) + + task_metadata = {} + task_params = { + 'labels': set(), + 'envs': set(), + 'inputs': set(), + 'outputs': set(), + 'input-recursives': set(), + 'output-recursives': set() + } + + task_resources = job_model.Resources( + logging_path=job_model.LoggingParam( + 'gs://test-bucket/logs.log', 'google-cloud-storage' + ) + ) + task_descriptor = job_model.TaskDescriptor( + task_metadata, task_params, task_resources + ) + + job_resources = job_model.Resources( + accelerator_type=accelerator_type, + image='gcr.io/test/image:latest', + boot_disk_image=boot_disk_image, + install_gpu_drivers=install_gpu_drivers + ) + + return job_model.JobDescriptor( + job_metadata, job_params, job_resources, [task_descriptor] + ) + + def _create_batch_request(self, job_descriptor): + """Create a batch request using the Google Batch provider. + + Args: + job_descriptor: The JobDescriptor to create a request for. + + Returns: + The CreateJobRequest object from the provider. + """ + provider = google_batch.GoogleBatchJobProvider( + dry_run=True, + project='test-project', + location='us-central1' + ) + return provider._create_batch_request(job_descriptor) + + def test_nvidia_accelerator_enables_gpu_options(self): + """Test that nvidia accelerators enable GPU-specific configurations.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100' + ) + request = self._create_batch_request(job_descriptor) + + # Extract the user command runnable (index 3 in the runnables list) + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + + # Verify GPU container options are set + self.assertEqual(user_runnable.container.options, "--gpus all") + + # Verify boot disk uses GPU-compatible image + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, "batch-debian") + + def test_non_nvidia_accelerator_uses_default_options(self): + """Test that non-nvidia accelerators use default configurations.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='tpu-v3' + ) + request = self._create_batch_request(job_descriptor) + + # Extract the user command runnable + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + + # Verify no GPU options are set + self.assertIn(user_runnable.container.options, [None, ""]) + + # Verify default boot disk image is used + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, "") + + def test_no_accelerator_uses_default_options(self): + """Test that jobs without accelerators use default configurations.""" + job_descriptor = self._create_test_job_descriptor(accelerator_type=None) + request = self._create_batch_request(job_descriptor) + + # Extract the user command runnable + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + + # Verify no GPU options are set + self.assertIn(user_runnable.container.options, [None, ""]) + + # Verify default boot disk image is used + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, "") + + def test_custom_boot_disk_image_overrides_default(self): + """Test that custom boot_disk_image 
overrides the default.""" + custom_image = "projects/deeplearning-platform-release/global/images/family/common-gpu" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100', + boot_disk_image=custom_image + ) + request = self._create_batch_request(job_descriptor) + + # Verify custom boot disk image is used instead of batch-debian + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, custom_image) + + # Verify GPU container options are still set + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + self.assertEqual(user_runnable.container.options, "--gpus all") + + def test_install_gpu_drivers_false_disables_driver_installation(self): + """Test that install_gpu_drivers=False disables driver installation.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100', + install_gpu_drivers=False + ) + request = self._create_batch_request(job_descriptor) + + # Verify GPU drivers are not installed + ipt = request.job.allocation_policy.instances[0] + self.assertFalse(ipt.install_gpu_drivers) + + # Verify GPU container options are still set + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + self.assertEqual(user_runnable.container.options, "--gpus all") + + def test_install_gpu_drivers_true_enables_driver_installation(self): + """Test that install_gpu_drivers=True enables driver installation.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100', + install_gpu_drivers=True + ) + request = self._create_batch_request(job_descriptor) + + # Verify GPU drivers are installed + ipt = request.job.allocation_policy.instances[0] + self.assertTrue(ipt.install_gpu_drivers) + + def test_vpc_sc_scenario_custom_image_no_drivers(self): + """Test VPC-SC scenario with custom image and no driver installation.""" + custom_image = "projects/deeplearning-platform-release/global/images/family/common-gpu" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-a100', + boot_disk_image=custom_image, + install_gpu_drivers=False + ) + request = self._create_batch_request(job_descriptor) + + # Verify custom boot disk image is used + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, custom_image) + + # Verify GPU drivers are not installed + ipt = request.job.allocation_policy.instances[0] + self.assertFalse(ipt.install_gpu_drivers) + + # Verify GPU container options are still set (containers need GPU access) + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + self.assertEqual(user_runnable.container.options, "--gpus all") + + def test_default_install_gpu_drivers_true_for_nvidia(self): + """Test that install_gpu_drivers defaults to True for NVIDIA accelerators.""" + job_descriptor = self._create_test_job_descriptor( + accelerator_type='nvidia-tesla-t4' + ) + request = self._create_batch_request(job_descriptor) + + # Verify GPU drivers are installed by default + ipt = request.job.allocation_policy.instances[0] + self.assertTrue(ipt.install_gpu_drivers) + + def test_custom_boot_disk_image_without_accelerator(self): + """Test that custom boot_disk_image can be used without accelerators.""" + custom_image = "projects/my-project/global/images/my-custom-image" + job_descriptor = self._create_test_job_descriptor( + boot_disk_image=custom_image + ) + request = self._create_batch_request(job_descriptor) + + # 
Verify custom boot disk image is used + instance_policy = request.job.allocation_policy.instances[0].policy + self.assertEqual(instance_policy.boot_disk.image, custom_image) + + # Verify no GPU options are set + user_runnable = request.job.task_groups[0].task_spec.runnables[3] + self.assertIn(user_runnable.container.options, [None, ""]) + + # Verify GPU drivers are not installed + ipt = request.job.allocation_policy.instances[0] + self.assertFalse(ipt.install_gpu_drivers) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 58d9330adb9ff544e62838a94fa1b3554179d87b Mon Sep 17 00:00:00 2001 From: Peter Su Date: Wed, 26 Nov 2025 18:42:53 +0000 Subject: [PATCH 02/11] reverting unnecessary change in a datetime format in dsub/lib/param_util.py --- dsub/lib/param_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dsub/lib/param_util.py b/dsub/lib/param_util.py index 064c594..c5b9dba 100644 --- a/dsub/lib/param_util.py +++ b/dsub/lib/param_util.py @@ -910,7 +910,8 @@ def age_to_create_time(age, from_time=None): else: # If no unit is given treat the age as seconds from epoch, otherwise apply # the correct time unit. - return datetime.datetime.fromtimestamp(int(age), tz=pytz.utc) + return dsub_util.replace_timezone( + datetime.datetime.utcfromtimestamp(int(age)), pytz.utc) except (ValueError, OverflowError) as e: raise ValueError('Unable to parse age string %s: %s' % (age, e)) From a7ca7ee3d2b5a38209c91b30799c9d407cf8514f Mon Sep 17 00:00:00 2001 From: Peter Su Date: Wed, 26 Nov 2025 18:44:13 +0000 Subject: [PATCH 03/11] made image required for google_batch_operations.build_persistent_disk, removed if conditional --- dsub/providers/google_batch_operations.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dsub/providers/google_batch_operations.py b/dsub/providers/google_batch_operations.py index 7c0ec35..2536e1d 100644 --- a/dsub/providers/google_batch_operations.py +++ b/dsub/providers/google_batch_operations.py @@ -404,13 +404,12 @@ def build_attached_disk( def build_persistent_disk( - size_gb: int, disk_type: str, image: Optional[str] + size_gb: int, disk_type: str, image: str ) -> batch_v1.types.AllocationPolicy.Disk: disk = batch_v1.AllocationPolicy.Disk() disk.type = disk_type disk.size_gb = size_gb - if image: - disk.image = image + disk.image = image return disk From 571ed98f20bddc7d1bbce208b21eca44ce172161 Mon Sep 17 00:00:00 2001 From: Peter Su Date: Tue, 2 Dec 2025 23:38:08 +0000 Subject: [PATCH 04/11] updated test_setup.sh to respect network, subnetwork, service_account, and location environment variables --- test/integration/test_setup.sh | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/test/integration/test_setup.sh b/test/integration/test_setup.sh index e60f78a..2ff4c4c 100644 --- a/test/integration/test_setup.sh +++ b/test/integration/test_setup.sh @@ -89,16 +89,23 @@ function run_dsub() { } function dsub_google-batch() { - local location="${LOCATION:-}" + # Use REGIONS env var if set, otherwise fall back to LOCATION + local location="${LOCATION:-${REGIONS:-}}" + + # Use environment variables for VPC-SC configuration if set + local network="${GPU_NETWORK:-global/networks/default}" + local subnetwork="${GPU_SUBNETWORK:-regions/us-central1/subnetworks/default}" + local service_account="${PET_SA_EMAIL:-}" dsub \ --provider google-batch \ --project "${PROJECT_ID}" \ ${location:+--location "${location}"} \ --logging "${LOGGING_OVERRIDE:-${LOGGING}}" \ - --network 
"global/networks/default" \ - --subnetwork "regions/us-central1/subnetworks/default" \ + --network "${network}" \ + --subnetwork "${subnetwork}" \ --use-private-address \ + ${service_account:+--service-account "${service_account}"} \ "${@}" } From 4a5a8f64ea133bc868b3e324a7bdb743ce4add5a Mon Sep 17 00:00:00 2001 From: Peter Su Date: Tue, 16 Dec 2025 16:57:07 +0000 Subject: [PATCH 05/11] updating a few integration tests to explicitly set the USER variable to jupyter if unset. test_setup.sh updated to use env variables rather than hardcoded network settings --- test/integration/e2e_logging_paths.sh | 2 +- test/integration/e2e_logging_paths_pattern_tasks.sh | 2 +- test/integration/io_setup.sh | 2 +- test/integration/io_tasks_setup.sh | 2 +- test/integration/test_setup.sh | 3 +++ 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/integration/e2e_logging_paths.sh b/test/integration/e2e_logging_paths.sh index 7cac80c..36be744 100755 --- a/test/integration/e2e_logging_paths.sh +++ b/test/integration/e2e_logging_paths.sh @@ -27,7 +27,7 @@ readonly LOGGING_BASE="$(dirname "${LOGGING}")" declare LOGGING_OVERRIDE readonly JOB_NAME="log-test" -readonly JOB_USER="${USER}" +readonly JOB_USER="${USER:-jupyter}" # Test a basic job with base logging path echo "Subtest #1: Basic logging path" diff --git a/test/integration/e2e_logging_paths_pattern_tasks.sh b/test/integration/e2e_logging_paths_pattern_tasks.sh index 0150fc0..8d41c70 100755 --- a/test/integration/e2e_logging_paths_pattern_tasks.sh +++ b/test/integration/e2e_logging_paths_pattern_tasks.sh @@ -31,7 +31,7 @@ readonly LOGGING_BASE="$(dirname "${LOGGING}")" declare LOGGING_OVERRIDE readonly JOB_NAME=$(logging_paths_tasks_setup::get_job_name) -readonly JOB_USER="${USER}" +readonly JOB_USER="${USER:-jupyter}" # Set up the tasks file logging_paths_tasks_setup::write_tasks_file diff --git a/test/integration/io_setup.sh b/test/integration/io_setup.sh index 79d2a24..191a948 100644 --- a/test/integration/io_setup.sh +++ b/test/integration/io_setup.sh @@ -230,7 +230,7 @@ function io_setup::check_dstat() { local dstat_output=$(run_dstat --status '*' --jobs "${job_id}" --full) echo " Checking user-id" - util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER}" + util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER:-jupyter}" echo " Checking logging" util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].logging" "${LOGGING}" diff --git a/test/integration/io_tasks_setup.sh b/test/integration/io_tasks_setup.sh index f44f867..36336be 100644 --- a/test/integration/io_tasks_setup.sh +++ b/test/integration/io_tasks_setup.sh @@ -119,7 +119,7 @@ function io_tasks_setup::check_dstat() { echo " Check task ${task_id}" echo " Checking user-id" - util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER}" + util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER:-jupyter}" echo " Checking logging" util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].logging" "${LOGGING}/${job_id}.${task_id}.log" diff --git a/test/integration/test_setup.sh b/test/integration/test_setup.sh index 2ff4c4c..6f1e302 100644 --- a/test/integration/test_setup.sh +++ b/test/integration/test_setup.sh @@ -30,6 +30,9 @@ # * Provide functions run_dsub, run_dstat, run_ddel which will call a function # with DSUB_PROVIDER-specific default parameters set. 
+# Set default USER if not already set (needed for Jupyterlab/Docker environments) +export USER="${USER:-jupyter}" + # If the DSUB_PROVIDER is not set, figure it out from the name of the script. # If the script name is ..sh, pull out the provider. # If the script name is .sh, use "local". From df7ad924473415b2a5d28aa9d2a12407c57e61d5 Mon Sep 17 00:00:00 2001 From: Peter Su Date: Tue, 16 Dec 2025 22:51:40 +0000 Subject: [PATCH 06/11] updating test_setup_e2e.py to use env variables instead of hardcoded network and SA values --- test/integration/test_setup_e2e.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/test/integration/test_setup_e2e.py b/test/integration/test_setup_e2e.py index 8f01a5f..898d878 100644 --- a/test/integration/test_setup_e2e.py +++ b/test/integration/test_setup_e2e.py @@ -171,16 +171,27 @@ def dsub_google_batch(dsub_args): if val: opt_args.append(var[1], val) - # pyformat: disable - return dsub_command.call([ + # Use environment variables for VPC-SC configuration if set + network = os.environ.get("GPU_NETWORK", "global/networks/default") + subnetwork = os.environ.get("GPU_SUBNETWORK", + "regions/us-central1/subnetworks/default") + location = os.environ.get("LOCATION", os.environ.get("REGIONS", "us-central1")) + service_account = os.environ.get("PET_SA_EMAIL", "") + + args = [ "--provider", "google-batch", "--project", PROJECT_ID, "--logging", LOGGING, - "--regions", "us-central1", - "--network", "global/networks/default", - "--subnetwork", "regions/us-central1/subnetworks/default", + "--regions", location, + "--network", network, + "--subnetwork", subnetwork, "--use-private-address" - ] + opt_args + dsub_args) + ] + if service_account: + args.extend(["--service-account", service_account]) + + # pyformat: disable + return dsub_command.call(args + opt_args + dsub_args) # pyformat: enable From 086f68d5bdc995439c0cc767f4ea5b23e792009f Mon Sep 17 00:00:00 2001 From: Peter Su Date: Fri, 19 Dec 2025 17:02:26 +0000 Subject: [PATCH 07/11] replaced default USER of jupyter with whoami, define JOB_USER in a few integration test files and use that directly --- test/integration/e2e_logging_paths.sh | 2 +- test/integration/e2e_logging_paths_pattern_tasks.sh | 2 +- test/integration/io_setup.sh | 4 +++- test/integration/io_tasks_setup.sh | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/integration/e2e_logging_paths.sh b/test/integration/e2e_logging_paths.sh index 36be744..e6e8213 100755 --- a/test/integration/e2e_logging_paths.sh +++ b/test/integration/e2e_logging_paths.sh @@ -27,7 +27,7 @@ readonly LOGGING_BASE="$(dirname "${LOGGING}")" declare LOGGING_OVERRIDE readonly JOB_NAME="log-test" -readonly JOB_USER="${USER:-jupyter}" +readonly JOB_USER="${USER:-whoami}" # Test a basic job with base logging path echo "Subtest #1: Basic logging path" diff --git a/test/integration/e2e_logging_paths_pattern_tasks.sh b/test/integration/e2e_logging_paths_pattern_tasks.sh index 8d41c70..421f7c0 100755 --- a/test/integration/e2e_logging_paths_pattern_tasks.sh +++ b/test/integration/e2e_logging_paths_pattern_tasks.sh @@ -31,7 +31,7 @@ readonly LOGGING_BASE="$(dirname "${LOGGING}")" declare LOGGING_OVERRIDE readonly JOB_NAME=$(logging_paths_tasks_setup::get_job_name) -readonly JOB_USER="${USER:-jupyter}" +readonly JOB_USER="${USER:-whoami}" # Set up the tasks file logging_paths_tasks_setup::write_tasks_file diff --git a/test/integration/io_setup.sh b/test/integration/io_setup.sh index 191a948..82fae4f 100644 --- 
a/test/integration/io_setup.sh +++ b/test/integration/io_setup.sh @@ -26,6 +26,8 @@ readonly INPUT_BAM_MD5="4afb9b8908959dbd4e2d5c54bf254c93" readonly REQUESTER_PAYS_INPUT_BAM_FULL_PATH="gs://${DSUB_BUCKET_REQUESTER_PAYS}/${INPUT_BAM_FILE}" readonly REQUESTER_PAYS_POPULATION_FILE_FULL_PATH="gs://${DSUB_BUCKET_REQUESTER_PAYS}/${POPULATION_FILE}" +# Set user variable like in other tests +readonly JOB_USER="${USER:-whoami}" # This is the image we use to test the PD mount feature. # Inject the TEST_TOKEN into the name so that multiple tests can run # concurrently. Since the image test can be run multiple times for one @@ -230,7 +232,7 @@ function io_setup::check_dstat() { local dstat_output=$(run_dstat --status '*' --jobs "${job_id}" --full) echo " Checking user-id" - util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER:-jupyter}" + util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${JOB_USER}" echo " Checking logging" util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].logging" "${LOGGING}" diff --git a/test/integration/io_tasks_setup.sh b/test/integration/io_tasks_setup.sh index 36336be..1a82399 100644 --- a/test/integration/io_tasks_setup.sh +++ b/test/integration/io_tasks_setup.sh @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +readonly JOB_USER="${USER:-whoami}" readonly POPULATION_FILE="gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/20131219.superpopulations.tsv" readonly POPULATION_MD5="68a73f849b82071afe11888bac1aa8a7" @@ -119,7 +120,7 @@ function io_tasks_setup::check_dstat() { echo " Check task ${task_id}" echo " Checking user-id" - util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER:-jupyter}" + util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${JOB_USER}" echo " Checking logging" util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].logging" "${LOGGING}/${job_id}.${task_id}.log" From 34651dd3b27fabf185ad6cc5199e22c40e13004f Mon Sep 17 00:00:00 2001 From: Peter Su Date: Mon, 22 Dec 2025 22:04:36 +0000 Subject: [PATCH 08/11] upping version to 0.5.2 to prepare for merge to main --- dsub/_dsub_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsub/_dsub_version.py b/dsub/_dsub_version.py index 71bbde4..050f045 100644 --- a/dsub/_dsub_version.py +++ b/dsub/_dsub_version.py @@ -27,5 +27,5 @@ """ -DSUB_VERSION = '0.5.2.dev0' +DSUB_VERSION = '0.5.2' From 60c98bf00f30c7d15719c7b2f75220b42a58af23 Mon Sep 17 00:00:00 2001 From: Peter Su Date: Fri, 2 Jan 2026 20:08:43 +0000 Subject: [PATCH 09/11] replaced whoami string with user to get the actual logged in user --- test/integration/e2e_logging_paths.sh | 2 +- test/integration/e2e_logging_paths_pattern_tasks.sh | 2 +- test/integration/io_setup.sh | 2 +- test/integration/io_tasks_setup.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/integration/e2e_logging_paths.sh b/test/integration/e2e_logging_paths.sh index e6e8213..314cfd5 100755 --- a/test/integration/e2e_logging_paths.sh +++ b/test/integration/e2e_logging_paths.sh @@ -27,7 +27,7 @@ readonly LOGGING_BASE="$(dirname "${LOGGING}")" declare LOGGING_OVERRIDE readonly JOB_NAME="log-test" -readonly JOB_USER="${USER:-whoami}" +readonly JOB_USER="${USER:-$(whoami)}" # Test a basic job with base logging path echo "Subtest #1: Basic logging path" diff --git a/test/integration/e2e_logging_paths_pattern_tasks.sh 
b/test/integration/e2e_logging_paths_pattern_tasks.sh index 421f7c0..5524c01 100755 --- a/test/integration/e2e_logging_paths_pattern_tasks.sh +++ b/test/integration/e2e_logging_paths_pattern_tasks.sh @@ -31,7 +31,7 @@ readonly LOGGING_BASE="$(dirname "${LOGGING}")" declare LOGGING_OVERRIDE readonly JOB_NAME=$(logging_paths_tasks_setup::get_job_name) -readonly JOB_USER="${USER:-whoami}" +readonly JOB_USER="${USER:-$(whoami)}" # Set up the tasks file logging_paths_tasks_setup::write_tasks_file diff --git a/test/integration/io_setup.sh b/test/integration/io_setup.sh index 82fae4f..6cf7caf 100644 --- a/test/integration/io_setup.sh +++ b/test/integration/io_setup.sh @@ -27,7 +27,7 @@ readonly REQUESTER_PAYS_INPUT_BAM_FULL_PATH="gs://${DSUB_BUCKET_REQUESTER_PAYS}/ readonly REQUESTER_PAYS_POPULATION_FILE_FULL_PATH="gs://${DSUB_BUCKET_REQUESTER_PAYS}/${POPULATION_FILE}" # Set user variable like in other tests -readonly JOB_USER="${USER:-whoami}" +readonly JOB_USER="${USER:-$(whoami)}" # This is the image we use to test the PD mount feature. # Inject the TEST_TOKEN into the name so that multiple tests can run # concurrently. Since the image test can be run multiple times for one diff --git a/test/integration/io_tasks_setup.sh b/test/integration/io_tasks_setup.sh index 1a82399..9b93627 100644 --- a/test/integration/io_tasks_setup.sh +++ b/test/integration/io_tasks_setup.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -readonly JOB_USER="${USER:-whoami}" +readonly JOB_USER="${USER:-$(whoami)}" readonly POPULATION_FILE="gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/20131219.superpopulations.tsv" readonly POPULATION_MD5="68a73f849b82071afe11888bac1aa8a7" From b101d8a04fe188b91937691a9fbbb12381254b89 Mon Sep 17 00:00:00 2001 From: Peter Su Date: Tue, 6 Jan 2026 17:56:05 +0000 Subject: [PATCH 10/11] updating user from jupyter to user in test/integration/test_setup.sh --- test/integration/test_setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/test_setup.sh b/test/integration/test_setup.sh index 6f1e302..769e0f6 100644 --- a/test/integration/test_setup.sh +++ b/test/integration/test_setup.sh @@ -31,7 +31,7 @@ # with DSUB_PROVIDER-specific default parameters set. # Set default USER if not already set (needed for Jupyterlab/Docker environments) -export USER="${USER:-jupyter}" +export USER="${USER:-$(whoami)}" # If the DSUB_PROVIDER is not set, figure it out from the name of the script. # If the script name is ..sh, pull out the provider. 
From 0dc6d63d57150d26f541462cfabfe3933184f269 Mon Sep 17 00:00:00 2001
From: Peter Su
Date: Thu, 29 Jan 2026 04:19:44 +0000
Subject: [PATCH 11/11] addressing PR comments in google_batch.py: simplify
 the elif/else logic and switch double quotes to single quotes

---
 dsub/providers/google_batch.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/dsub/providers/google_batch.py b/dsub/providers/google_batch.py
index 067abd3..73a35d4 100644
--- a/dsub/providers/google_batch.py
+++ b/dsub/providers/google_batch.py
@@ -742,7 +742,7 @@ def _create_batch_request(
     for gcs_volume in self._get_gcs_volumes_for_user_command(mounts):
       user_command_volumes.append(gcs_volume)
     # Add --gpus all option for GPU-enabled containers
-    container_options = "--gpus all" if job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia') else None
+    container_options = '--gpus all' if job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia') else None
     runnables.append(
         # user-command
         google_batch_operations.build_runnable(
@@ -812,7 +812,7 @@ def _create_batch_request(
     if job_resources.boot_disk_image:
       boot_disk_image = job_resources.boot_disk_image
     elif job_resources.accelerator_type and job_resources.accelerator_type.startswith('nvidia'):
-      boot_disk_image = "batch-debian"
+      boot_disk_image = 'batch-debian'
     else:
       boot_disk_image = None
 
@@ -855,10 +855,8 @@ def _create_batch_request(
     # Determine whether to install GPU drivers: use user-specified value, or default to True for GPU jobs
     if job_resources.install_gpu_drivers is not None:
       install_gpu_drivers = job_resources.install_gpu_drivers
-    elif job_resources.accelerator_type is not None:
-      install_gpu_drivers = True
     else:
-      install_gpu_drivers = False
+      install_gpu_drivers = job_resources.accelerator_type is not None
 
     ipt = google_batch_operations.build_instance_policy_or_template(
         instance_policy=instance_policy,