From 98ef4e0a1afa3bef7dcfdc0d123973c912512fdc Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 4 Jun 2026 21:14:38 +0000 Subject: [PATCH 1/4] feat(http-proxy): add HTTP proxy configuration initialization action and integration tests Introduces a new initialization action `http-proxy/http-proxy.sh` to configure global HTTP/HTTPS proxy settings on Dataproc cluster nodes. Features: - Configures global proxy environment variables (`http_proxy`, `https_proxy`, `no_proxy`) in `/etc/environment` and `/etc/profile.d/proxy.sh`. - Bypasses proxying for Google Cloud APIs and internal GCP domains (e.g., `metadata.google.internal`, `.googleapis.com`, local cluster hostnames) using a robust default `no_proxy` list. - Automatically appends custom bypass hosts from the `no-proxy` metadata. - Configures `gcloud` CLI proxy settings to align with the environment. - Installs the proxy's PEM CA certificate (if provided via `http-proxy-pem-uri`) to the OS, Java, and Conda trust stores. - Configures system package managers (`apt`/`dnf`) and `dirmngr` to fetch packages through the proxy. - Configures `boto.cfg` (used by `gsutil`) to use the proxy. - Ensures idempotency by skipping configuration if already present. Documentation: - Adds `http-proxy/README.md` detailing parameters, usage, and the critical compatibility requirement (`dataproc:dataproc.master.custom.init.actions.mode=RUN_BEFORE_SERVICES`). - Updates root `README.md` to link to the new action. Testing: - Adds integration test `http-proxy/test_http_proxy.py` to verify the action exits cleanly when no proxy metadata is provided. - Registers the test target in the root `BUILD` file. Build Infrastructure: - Adds `MODULE.bazel` with `rules_python` dependency to support Bazel Bzlmod builds. - Updates root `BUILD` file to load python rules explicitly. 
TAG=agy CONV=b274b565-1bd6-43f1-b4db-31f3d89d087b --- BUILD | 15 ++ MODULE.bazel | 1 + README.md | 1 + http-proxy/README.md | 57 ++++++ http-proxy/http-proxy.sh | 365 ++++++++++++++++++++++++++++++++++ http-proxy/test_http_proxy.py | 24 +++ 6 files changed, 463 insertions(+) create mode 100644 MODULE.bazel create mode 100644 http-proxy/README.md create mode 100755 http-proxy/http-proxy.sh create mode 100644 http-proxy/test_http_proxy.py diff --git a/BUILD b/BUILD index 3dd5093ab..e89581955 100644 --- a/BUILD +++ b/BUILD @@ -1,3 +1,5 @@ +load("@rules_python//python:defs.bzl", "py_test", "py_library") + package(default_visibility = ["//visibility:public"]) test_suite( @@ -6,6 +8,7 @@ test_suite( ":test_cloud_sql_proxy", ":test_dr_elephant", ":test_hive_hcatalog", + ":test_http_proxy", ":test_spark_rapids", ":test_starburst_presto", "//alluxio:test_alluxio", @@ -151,3 +154,15 @@ py_test( "@io_abseil_py//absl/testing:parameterized", ], ) + +py_test( + name = "test_http_proxy", + size = "enormous", + srcs = ["http-proxy/test_http_proxy.py"], + data = ["http-proxy/http-proxy.sh"], + local = True, + deps = [ + "//integration_tests:dataproc_test_case", + "@io_abseil_py//absl/testing:parameterized", + ], +) diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 000000000..fc6ec32a1 --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1 @@ +bazel_dep(name = "rules_python", version = "1.7.0") diff --git a/README.md b/README.md index 9e143d414..5154cb003 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ This repository currently offers the following actions for use with Dataproc clu * Configure the environment * Configure a *nice* shell environment * To switch to Python 3, use the conda initialization action + * [HTTP Proxy](http-proxy/README.md) * Connect to Google Cloud Platform services * Install alternate versions of the [Cloud Storage and BigQuery connectors](https://github.com/GoogleCloudPlatform/bigdata-interop/releases). 
[Specific versions](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions) of these connectors come pre-installed on Cloud Dataproc clusters. * Share a [Cloud SQL](https://cloud.google.com/sql/) Hive Metastore, or simply read/write data from Cloud SQL. diff --git a/http-proxy/README.md b/http-proxy/README.md new file mode 100644 index 000000000..59d336315 --- /dev/null +++ b/http-proxy/README.md @@ -0,0 +1,57 @@ +# HTTP Proxy Configuration + +This initialization action configures global HTTP and HTTPS proxy settings on every node in a [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster. + +It is designed to set up proxy environments for clusters running in private networks that must egress through a secure web proxy or gateway. + +## Features + +- Configures global proxy environment variables (`http_proxy`, `https_proxy`, `no_proxy` and their uppercase variants) in `/etc/environment`. +- Persists proxy settings for all shell sessions via `/etc/profile.d/proxy.sh`. +- Bypasses proxying for Google Cloud APIs and internal GCP domains (e.g. `metadata.google.internal`, `169.254.169.254`, `.googleapis.com`, local cluster hostnames) using a robust default `no_proxy` list. +- Automatically appends custom bypass hosts from the `no-proxy` metadata. +- Configures the `gcloud` CLI proxy settings to align with the environment. +- Installs the proxy's PEM CA certificate (if provided) to the OS, Java, and Conda trust stores. +- Configures system package managers (`apt`/`dnf`) and `dirmngr` to fetch packages through the proxy. +- Configures `boto.cfg` (used by `gsutil`) to use the proxy. + +## Parameters + +You configure the proxy settings using VM metadata: + +| Metadata Key | Description | +|---|---| +| `http-proxy` | (Optional) The HTTP proxy in `host:port` form (e.g. `10.0.0.1:8080` or `proxy.example.com:3128`). | +| `https-proxy` | (Optional) The HTTPS proxy in `host:port` form.
| +| `proxy-uri` | (Optional) A unified proxy host and port if HTTP and HTTPS proxies are the same. Used as fallback if `http-proxy` or `https-proxy` are not set. | +| `no-proxy` | (Optional) A comma-separated list of additional hosts/domains that should bypass the proxy. | +| `http-proxy-pem-uri` | (Optional) A Cloud Storage URI (e.g. `gs://my-bucket/proxy_ca.crt`) containing the PEM-encoded CA certificate for the proxy. Required if the proxy inspects SSL traffic. | + +## Usage + +### ⚠️ CRITICAL COMPATIBILITY REQUIREMENT ⚠️ + +For Dataproc internal components (like HDFS NameNode) to successfully initialize and access the Google Cloud metadata server during boot, **this initialization action must run before system services start.** + +You **must** set the following cluster property: +`dataproc:dataproc.master.custom.init.actions.mode=RUN_BEFORE_SERVICES` + +### Example + +Use the `gcloud` command to create a new cluster with this initialization action: + +```bash +PROJECT_ID="my-project-id" +REGION="us-east4" +CLUSTER_NAME="my-proxy-cluster" +PROXY_HOST_PORT="proxy.example.com:3128" +CA_CERT_URI="gs://my-secure-bucket/proxy_ca.crt" + +gcloud dataproc clusters create ${CLUSTER_NAME} \ + --region ${REGION} \ + --initialization-actions gs://dataproc-initialization-actions-${REGION}/http-proxy/http-proxy.sh \ + --properties "dataproc:dataproc.master.custom.init.actions.mode=RUN_BEFORE_SERVICES" \ + --metadata "proxy-uri=${PROXY_HOST_PORT}" \ + --metadata "http-proxy-pem-uri=${CA_CERT_URI}" \ + --metadata "no-proxy=my-onprem-service.corp.internal" +``` diff --git a/http-proxy/http-proxy.sh b/http-proxy/http-proxy.sh new file mode 100755 index 000000000..2c725567e --- /dev/null +++ b/http-proxy/http-proxy.sh @@ -0,0 +1,365 @@
+#!/bin/bash
+#
+# Configures global HTTP/HTTPS proxy settings on a Dataproc node from the VM
+# metadata keys http-proxy, https-proxy, proxy-uri, no-proxy and
+# http-proxy-pem-uri. Exits cleanly without changes when no proxy is given.
+
+set -euo pipefail
+
+# --- Metadata Helpers ---
+
+# Prints the body of a metadata server URL when it answers HTTP 200.
+# Arguments: $1 - full metadata server URL.
+# Returns:   curl's exit status (non-zero when the key does not exist).
+function print_metadata_value() {
+  # Declare first, assign afterwards: "local x=$(cmd)" hides cmd's status and
+  # "local readonly x" would declare a variable literally named "readonly".
+  local tmpfile http_code return_code=0
+  tmpfile=$(mktemp)
+  # Capture stdout and http_code separately
+  http_code=$(curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${1}" -H "Metadata-Flavor: Google" -w '%{http_code}' \
+    -s -o "${tmpfile}" --connect-timeout 5 --max-time 10) || return_code=$?
+  # If the command completed successfully, print the metadata value to stdout.
+  if [[ ${return_code} == 0 && "${http_code}" == "200" ]]; then
+    cat "${tmpfile}"
+  fi
+  rm -f "${tmpfile}"
+  return ${return_code}
+}
+
+# Thin wrapper kept for parity with /usr/share/google/get_metadata_value.
+# Arguments: $1 - full metadata server URL.
+function print_metadata_value_if_exists() {
+  local -r url=$1
+  print_metadata_value "${url}"
+}
+
+# replicates /usr/share/google/get_metadata_value
+# Arguments: $1 - key path below instance/ or project/.
+# Returns:   0 when either the instance or the project value was printed.
+function get_metadata_value() {
+  local -r varname=$1
+  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
+  local return_code=0
+  # Print the instance metadata value.
+  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" || return_code=$?
+  # If the instance doesn't have the value, try the project.
+  if [[ ${return_code} != 0 ]]; then
+    return_code=0
+    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" || return_code=$?
+  fi
+  return ${return_code}
+}
+
+# Prints a custom metadata attribute, or a default when it is not set.
+# Arguments: $1 - attribute name; $2 - default value (optional, empty).
+function get_metadata_attribute() {
+  local -r attribute_name="$1"
+  local -r default_value="${2:-}"
+  # The "||" list already suspends errexit for the lookup, so the caller's
+  # "set -e" state does not have to be toggled here.
+  get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
+}
+# --- End Metadata Helpers ---
+
+# --- OS Detection Helpers ---
+function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; }
+function is_debuntu() { [[ "$(os_id)" == "debian" || "$(os_id)" == "ubuntu" ]] ; }
+function is_rocky() { [[ "$(os_id)" == "rocky" ]] ; }
+# --- End OS Detection Helpers ---
+
+# --- Version Comparison Helpers ---
+function version_le(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; }
+function version_lt(){ [[ "$1" = "$2" ]] && return 1 || version_le "$1" "$2"; }
+# version_ge is required by set_proxy; without it the gcloud proxy
+# configuration was skipped with "command not found".
+function version_ge(){ version_le "$2" "$1"; }
+function version_gt(){ version_lt "$2" "$1"; }
+# --- End Version Comparison Helpers ---
+
+# Runs a command line up to three times, sleeping between attempts.
+# Arguments: the command and its arguments (joined and evaluated as one string).
+function execute_with_retries() {
+  local -r cmd="$*"
+  local retries=3
+  local delay=5
+  local i
+  for ((i = 0; i < retries; i++)); do
+    eval "${cmd}" && return 0
+    echo "Command failed. Retrying in ${delay} seconds..." >&2
+    sleep "${delay}"
+  done
+  echo "Command failed after ${retries} retries: ${cmd}" >&2
+  return 1
+}
+
+# Normalizes a proxy metadata value to "host:port".
+# Accepts "host:port" as well as "scheme://host:port[/]" so that values given
+# with a scheme do not end up as "http://http://host:port".
+# Arguments: $1 - raw metadata value (may be empty).
+function strip_proxy_scheme() {
+  local value="${1:-}"
+  value="${value#*://}"
+  echo -n "${value%/}"
+}
+
+# Applies the proxy configuration to the environment, gcloud, the trust
+# stores, the package manager and dirmngr. Returns 0 without changes when no
+# proxy metadata is present; exits 1 when the proxy cannot be reached.
+function set_proxy(){
+  # Idempotency Check for Proxy
+  if grep -q "http_proxy=" /etc/environment && [[ -n "${http_proxy:-}" ]]; then
+    echo "INFO: Proxy already configured in /etc/environment. Skipping proxy setup portion."
+    return 0
+  fi
+
+  local meta_http_proxy meta_https_proxy meta_proxy_uri
+  meta_http_proxy=$(get_metadata_attribute 'http-proxy' '')
+  meta_https_proxy=$(get_metadata_attribute 'https-proxy' '')
+  meta_proxy_uri=$(get_metadata_attribute 'proxy-uri' '')
+
+  echo "DEBUG: set_proxy: meta_http_proxy='${meta_http_proxy}'"
+  echo "DEBUG: set_proxy: meta_https_proxy='${meta_https_proxy}'"
+  echo "DEBUG: set_proxy: meta_proxy_uri='${meta_proxy_uri}'"
+
+  local http_proxy_val=""
+  local https_proxy_val=""
+
+  # Determine HTTP_PROXY value (":" is treated as "not set")
+  if [[ -n "${meta_http_proxy}" ]] && [[ "${meta_http_proxy}" != ":" ]]; then
+    http_proxy_val="${meta_http_proxy}"
+  elif [[ -n "${meta_proxy_uri}" ]] && [[ "${meta_proxy_uri}" != ":" ]]; then
+    http_proxy_val="${meta_proxy_uri}"
+  fi
+
+  # Determine HTTPS_PROXY value
+  if [[ -n "${meta_https_proxy}" ]] && [[ "${meta_https_proxy}" != ":" ]]; then
+    https_proxy_val="${meta_https_proxy}"
+  elif [[ -n "${meta_proxy_uri}" ]] && [[ "${meta_proxy_uri}" != ":" ]]; then
+    https_proxy_val="${meta_proxy_uri}"
+  fi
+
+  # Everything below prepends "http://" and splits on ":", so reduce the
+  # values to plain host:port first.
+  http_proxy_val=$(strip_proxy_scheme "${http_proxy_val}")
+  https_proxy_val=$(strip_proxy_scheme "${https_proxy_val}")
+
+  if [[ -z "${http_proxy_val}" && -z "${https_proxy_val}" ]]; then
+    echo "DEBUG: set_proxy: No valid proxy metadata found. Skipping proxy config."
+    return 0
+  fi
+
+  local proxy_host="" proxy_port=""
+  if [[ -n "${http_proxy_val}" ]]; then
+    proxy_host="${http_proxy_val%:*}"
+    proxy_port="${http_proxy_val##*:}"
+  fi
+
+  local default_no_proxy_list=(
+    "localhost"
+    "127.0.0.1"
+    "::1"
+    "metadata.google.internal"
+    "169.254.169.254"
+    ".google.com"
+    ".googleapis.com"
+    ".internal"
+  )
+
+  # Add project-specific internal domain
+  local project_id
+  project_id=$(get_metadata_attribute 'project-id' "${PROJECT_ID:-}")
+  if [[ -z "${project_id}" ]]; then
+    # project-id is a built-in project metadata key, not a custom attribute.
+    project_id=$(get_metadata_value 'project-id' || true)
+  fi
+  if [[ -n "${project_id}" ]]; then
+    default_no_proxy_list+=( ".c.${project_id}.internal" )
+  fi
+
+  # Add cluster-specific hostnames
+  local cluster_name
+  cluster_name=$(get_metadata_attribute 'dataproc-cluster-name' '')
+  if [[ -n "${cluster_name}" ]]; then
+    # Add wildcard patterns (supported by some tools like Go/Java)
+    default_no_proxy_list+=( "${cluster_name}-m" "${cluster_name}-m-*" "${cluster_name}-w-*" "${cluster_name}-sw-*" )
+    # Add the master FQDN to ensure bypass for tools like curl/wget. Skipped
+    # without a project id, which would produce "<cluster>-m.c..internal".
+    if [[ -n "${project_id}" ]]; then
+      default_no_proxy_list+=( "${cluster_name}-m.c.${project_id}.internal" )
+    fi
+  fi
+
+  local user_no_proxy
+  user_no_proxy=$(get_metadata_attribute 'no-proxy' '')
+  local user_no_proxy_list=()
+  if [[ -n "${user_no_proxy}" ]]; then
+    IFS=',' read -r -a user_no_proxy_list <<< "${user_no_proxy// /,}"
+  fi
+
+  local combined_no_proxy_list=( "${default_no_proxy_list[@]}" "${user_no_proxy_list[@]}" )
+  local no_proxy
+  no_proxy=$( IFS=',' ; echo "${combined_no_proxy_list[*]}" )
+  export NO_PROXY="${no_proxy}"
+  export no_proxy="${no_proxy}"
+
+  # Export environment variables
+  if [[ -n "${http_proxy_val}" ]]; then
+    export HTTP_PROXY="http://${http_proxy_val}"
+    export http_proxy="http://${http_proxy_val}"
+  fi
+  if [[ -n "${https_proxy_val}" ]]; then
+    export HTTPS_PROXY="http://${https_proxy_val}"
+    export https_proxy="http://${https_proxy_val}"
+  fi
+
+  # Clear existing proxy settings in /etc/environment
+  sed -i -e '/^http_proxy=/d' -e '/^https_proxy=/d' -e '/^no_proxy=/d' \
+    -e '/^HTTP_PROXY=/d' -e '/^HTTPS_PROXY=/d' -e '/^NO_PROXY=/d' /etc/environment
+
+  # Add current proxy environment variables to /etc/environment
+  if [[ -n "${HTTP_PROXY:-}" ]]; then echo "HTTP_PROXY=${HTTP_PROXY}" >> /etc/environment; fi
+  if [[ -n "${http_proxy:-}" ]]; then echo "http_proxy=${http_proxy}" >> /etc/environment; fi
+  if [[ -n "${HTTPS_PROXY:-}" ]]; then echo "HTTPS_PROXY=${HTTPS_PROXY}" >> /etc/environment; fi
+  if [[ -n "${https_proxy:-}" ]]; then echo "https_proxy=${https_proxy}" >> /etc/environment; fi
+  if [[ -n "${NO_PROXY:-}" ]]; then echo "NO_PROXY=${NO_PROXY}" >> /etc/environment; fi
+  if [[ -n "${no_proxy:-}" ]]; then echo "no_proxy=${no_proxy}" >> /etc/environment; fi
+
+  # Persist for all shell sessions
+  local profile_script="/etc/profile.d/proxy.sh"
+  echo "# Proxy settings from Dataproc init action" > "${profile_script}"
+  if [[ -n "${HTTP_PROXY:-}" ]]; then echo "export HTTP_PROXY='${HTTP_PROXY}'" >> "${profile_script}"; fi
+  if [[ -n "${http_proxy:-}" ]]; then echo "export http_proxy='${http_proxy}'" >> "${profile_script}"; fi
+  if [[ -n "${HTTPS_PROXY:-}" ]]; then echo "export HTTPS_PROXY='${HTTPS_PROXY}'" >> "${profile_script}"; fi
+  if [[ -n "${https_proxy:-}" ]]; then echo "export https_proxy='${https_proxy}'" >> "${profile_script}"; fi
+  if [[ -n "${NO_PROXY:-}" ]]; then echo "export NO_PROXY='${NO_PROXY}'" >> "${profile_script}"; fi
+  if [[ -n "${no_proxy:-}" ]]; then echo "export no_proxy='${no_proxy}'" >> "${profile_script}"; fi
+
+  # Source the script to apply settings to the current shell
+  source "${profile_script}"
+
+  # Configure gcloud proxy
+  local gcloud_version
+  local -r min_gcloud_proxy_ver="547.0.0"
+  gcloud_version=$(gcloud version --format="value(google_cloud_sdk)" 2>/dev/null || echo "0.0.0")
+  if version_ge "${gcloud_version}" "${min_gcloud_proxy_ver}"; then
+    if [[ -n "${http_proxy_val}" ]]; then
+      gcloud config set proxy/type http
+      gcloud config set proxy/address "${proxy_host}"
+      gcloud config set proxy/port "${proxy_port}"
+    else
+      gcloud config unset proxy/type
+      gcloud config unset proxy/address
+      gcloud config unset proxy/port
+    fi
+  fi
+
+  # Install the HTTPS proxy's certificate
+  local proxy_ca_pem=""
+  local trusted_pem_path=""
+  METADATA_HTTP_PROXY_PEM_URI="$(get_metadata_attribute http-proxy-pem-uri '')"
+  if [[ -n "${METADATA_HTTP_PROXY_PEM_URI}" ]] ; then
+    if [[ ! "${METADATA_HTTP_PROXY_PEM_URI}" =~ ^gs:// ]] ; then echo "ERROR: http-proxy-pem-uri value must start with gs://" ; exit 1 ; fi
+    echo "DEBUG: set_proxy: Processing http-proxy-pem-uri='${METADATA_HTTP_PROXY_PEM_URI}'"
+    local trusted_pem_dir
+    if is_debuntu ; then
+      trusted_pem_dir="/usr/local/share/ca-certificates"
+      proxy_ca_pem="${trusted_pem_dir}/proxy_ca.crt"
+      mkdir -p "${trusted_pem_dir}"
+      gsutil cp "${METADATA_HTTP_PROXY_PEM_URI}" "${proxy_ca_pem}" || { echo "ERROR: Failed to download proxy CA cert from GCS." ; exit 1 ; }
+      update-ca-certificates
+      trusted_pem_path="/etc/ssl/certs/ca-certificates.crt"
+    elif is_rocky ; then
+      trusted_pem_dir="/etc/pki/ca-trust/source/anchors"
+      proxy_ca_pem="${trusted_pem_dir}/proxy_ca.crt"
+      mkdir -p "${trusted_pem_dir}"
+      gsutil cp "${METADATA_HTTP_PROXY_PEM_URI}" "${proxy_ca_pem}" || { echo "ERROR: Failed to download proxy CA cert from GCS." ; exit 1 ; }
+      update-ca-trust
+      trusted_pem_path="/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem"
+    else
+      echo "WARNING: set_proxy: Unsupported OS '$(os_id)'; proxy CA certificate not installed." >&2
+    fi
+    if [[ -n "${trusted_pem_path}" ]]; then
+      export REQUESTS_CA_BUNDLE="${trusted_pem_path}"
+    fi
+    echo "DEBUG: set_proxy: trusted_pem_path set to '${trusted_pem_path}'"
+
+    # Add to Java/Conda trust stores
+    if [[ -n "${proxy_ca_pem}" && -f "/etc/environment" ]]; then
+      local java_home
+      java_home="$(awk -F= '/^JAVA_HOME=/ {print $2}' /etc/environment)"
+      if [[ -n "${java_home}" && -f "${java_home}/bin/keytool" ]]; then
+        # keytool -import fails when the alias already exists, so import only
+        # when it is missing to keep re-runs from aborting under "set -e".
+        if ! "${java_home}/bin/keytool" -list -cacerts -storepass changeit -alias swp_ca &> /dev/null; then
+          "${java_home}/bin/keytool" -import -cacerts -storepass changeit -noprompt -alias swp_ca -file "${proxy_ca_pem}"
+        fi
+      fi
+    fi
+    if [[ -n "${proxy_ca_pem}" ]] && command -v conda &> /dev/null ; then
+      local conda_cert_file="/opt/conda/default/ssl/cacert.pem"
+      if [[ -f "${conda_cert_file}" ]]; then
+        local ca_subject=$(openssl crl2pkcs7 -nocrl -certfile "${proxy_ca_pem}" | openssl pkcs7 -print_certs -noout | grep ^subject)
+        # Append only when the bundle does not already contain this subject.
+        openssl crl2pkcs7 -nocrl -certfile "${conda_cert_file}" | openssl pkcs7 -print_certs -noout | grep -Fxq "${ca_subject}" || {
+          cat "${proxy_ca_pem}" >> "${conda_cert_file}"
+        }
+      fi
+    fi
+  fi
+
+  if [[ -n "${http_proxy_val}" ]]; then
+
+    echo "DEBUG: set_proxy: Testing TCP connection to proxy ${proxy_host}:${proxy_port}..."
+    if ! nc -zv -w 5 "${proxy_host}" "${proxy_port}"; then
+      echo "ERROR: Failed to establish TCP connection to proxy ${proxy_host}:${proxy_port}."
+      exit 1
+    fi
+
+    echo "DEBUG: set_proxy: Testing external site access via proxy..."
+    local test_url="https://www.google.com"
+    local curl_test_args=()
+    if [[ -n "${trusted_pem_path}" ]]; then
+      curl_test_args+=(--cacert "${trusted_pem_path}")
+    fi
+    if curl "${curl_test_args[@]}" -vL --retry 3 --retry-delay 5 -o /dev/null "${test_url}"; then
+      echo "DEBUG: set_proxy: Successfully fetched ${test_url} via proxy."
+    else
+      echo "ERROR: Failed to fetch ${test_url} via proxy ${HTTP_PROXY}."
+      exit 1
+    fi
+  fi
+
+  # Configure package managers
+  local pkg_proxy_conf_file
+  local effective_proxy="${http_proxy_val:-${https_proxy_val}}"
+  if [[ -z "${effective_proxy}" ]]; then
+    echo "DEBUG: set_proxy: No HTTP or HTTPS proxy set for package managers."
+  elif is_debuntu ; then
+    pkg_proxy_conf_file="/etc/apt/apt.conf.d/99proxy"
+    echo "Acquire::http::Proxy \"http://${effective_proxy}\";" > "${pkg_proxy_conf_file}"
+    echo "Acquire::https::Proxy \"http://${effective_proxy}\";" >> "${pkg_proxy_conf_file}"
+  elif is_rocky ; then
+    pkg_proxy_conf_file="/etc/dnf/dnf.conf"
+    touch "${pkg_proxy_conf_file}"
+    sed -i.bak '/^proxy=/d' "${pkg_proxy_conf_file}"
+    if grep -q "^\[main\]" "${pkg_proxy_conf_file}"; then
+      # The address must be the same pattern grep just matched. Plain -i keeps
+      # the .bak written above as the copy of the original file.
+      sed -i "/^\[main\]/a proxy=http://${effective_proxy}" "${pkg_proxy_conf_file}"
+    else
+      echo -e "[main]\nproxy=http://${effective_proxy}" >> "${pkg_proxy_conf_file}"
+    fi
+  fi
+
+  # Configure dirmngr
+  if is_debuntu ; then
+    # dpkg -s queries the one package instead of grepping the whole listing.
+    if ! dpkg -s dirmngr &> /dev/null; then
+      execute_with_retries apt-get install -y -qq dirmngr
+    fi
+  elif is_rocky ; then
+    if ! rpm -q gnupg2-smime; then
+      execute_with_retries dnf install -y -q gnupg2-smime
+    fi
+  fi
+  mkdir -p /etc/gnupg
+  local dirmngr_conf="/etc/gnupg/dirmngr.conf"
+  touch "${dirmngr_conf}"
+  sed -i.bak '/^http-proxy/d' "${dirmngr_conf}"
+  if [[ -n "${HTTP_PROXY:-}" ]]; then
+    echo "http-proxy ${HTTP_PROXY}" >> "${dirmngr_conf}"
+  fi
+}
+
+# Repairs /etc/boto.cfg (duplicate sections, unexpanded universe_domain) and
+# writes the proxy host/port into its [Boto] section. No-op when the file is
+# absent.
+function repair_boto() {
+  local boto_file="/etc/boto.cfg"
+  if [[ -f "${boto_file}" ]]; then
+    echo "DEBUG: repair_boto: Repairing and deduplicating ${boto_file}" >&2
+
+    # 1. Deduplicate sections (fix for DuplicateSectionError)
+    # Keep only the first occurrence of each section and its variables.
+    # Comments and blank lines are always printed to preserve structure.
+    perl -i -ne '
+      print and next if /^\s*(?:[#;]|$)/;
+      if (/^\[([^\]]+)\]/) {
+        $skip = $seen{$1}++;
+      }
+      print unless $skip;
+    ' "${boto_file}"
+
+    # 2. Fix universe_domain if it is still a variable
+    local universe_domain
+    universe_domain=$(get_metadata_attribute 'universe-domain' 'googleapis.com')
+    # Pass the value through the environment to avoid shell/perl escaping issues
+    UNIVERSE_DOMAIN="${universe_domain}" perl -i -pe 's/\$\{universe_domain\}/$ENV{UNIVERSE_DOMAIN}/g' "${boto_file}"
+    # Also fix cases where it might have been partially expanded to storage.$
+    UNIVERSE_DOMAIN="${universe_domain}" perl -i -pe 's/storage\.\$/storage.$ENV{UNIVERSE_DOMAIN}/g' "${boto_file}"
+
+    # 3. Apply proxy if set
+    local meta_http_proxy meta_proxy_uri effective_proxy
+    meta_http_proxy=$(get_metadata_attribute 'http-proxy' '')
+    meta_proxy_uri=$(get_metadata_attribute 'proxy-uri' '')
+    # Same rules as set_proxy: ":" means "not set", and a scheme is tolerated.
+    if [[ "${meta_http_proxy}" == ":" ]]; then meta_http_proxy="" ; fi
+    if [[ "${meta_proxy_uri}" == ":" ]]; then meta_proxy_uri="" ; fi
+    effective_proxy=$(strip_proxy_scheme "${meta_http_proxy:-${meta_proxy_uri}}")
+
+    if [[ -n "${effective_proxy}" ]]; then
+      local proxy_host="${effective_proxy%:*}"
+      local proxy_port="${effective_proxy##*:}"
+
+      sed -i -e '/^proxy =/d' -e '/^proxy_port =/d' "${boto_file}"
+      if grep -q "^\[Boto\]" "${boto_file}"; then
+        sed -i "/^\[Boto\]/a proxy = ${proxy_host}\nproxy_port = ${proxy_port}" "${boto_file}"
+      else
+        echo -e "\n[Boto]\nproxy = ${proxy_host}\nproxy_port = ${proxy_port}" >> "${boto_file}"
+      fi
+    fi
+    echo "DEBUG: repair_boto: Updated ${boto_file}" >&2
+  fi
+}
+
+# --- Execution ---
+set_proxy
+repair_boto
+echo "DEBUG: http-proxy.sh complete." >&2
diff --git a/http-proxy/test_http_proxy.py b/http-proxy/test_http_proxy.py new file mode 100644 index 000000000..ba2e78442 --- /dev/null +++ b/http-proxy/test_http_proxy.py @@ -0,0 +1,24 @@ +import pkg_resources +from absl.testing import absltest +from absl.testing import parameterized + +from integration_tests.dataproc_test_case import DataprocTestCase + + +class HttpProxyTestCase(DataprocTestCase): + COMPONENT = 'http-proxy' + INIT_ACTIONS = ['http-proxy/http-proxy.sh'] + + @parameterized.parameters( + ("SINGLE",), + ) + def test_http_proxy_skip(self, configuration): + # Test that it exits cleanly when no proxy metadata is provided + self.createCluster( + configuration, + self.INIT_ACTIONS, + timeout_in_minutes=10) + + +if __name__ == '__main__': + absltest.main() From 6a6a5e9cc96aaf71c89ed890fb13e0839e97f48c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 5 Jun 2026 19:07:56 +0000 Subject: [PATCH 2/4] feat(http-proxy): add HTTP proxy configuration initialization action and integration tests Introduces a new initialization action to configure global HTTP/HTTPS proxy settings on Dataproc cluster nodes, along with comprehensive integration tests and build infrastructure updates.
Features: - Configures global proxy environment variables (http_proxy, https_proxy, no_proxy) in /etc/environment and /etc/profile.d/proxy.sh - Bypasses proxying for Google Cloud APIs and internal GCP domains (e.g., metadata.google.internal, .googleapis.com, local cluster hostnames) by default - Automatically appends custom bypass hosts from the no-proxy metadata - Configures gcloud CLI proxy settings to align with the environment - Installs the proxy's PEM CA certificate (if provided via http-proxy-pem-uri) to the OS, Java, and Conda trust stores - Configures system package managers (apt/dnf) and dirmngr to fetch packages through the proxy - Configures boto.cfg (used by gsutil) to use the proxy, with robust section deduplication and universe_domain variable resolution - Validates proxy egress connectivity at boot via nc and curl Documentation: - Adds http-proxy/README.md which details parameters, usage, and the critical compatibility requirement (dataproc:dataproc.master.custom.init.actions.mode=RUN_BEFORE_SERVICES) - Updates the root README.md file to link to the new action Testing and Build Infrastructure: - Adds integration test http-proxy/test_http_proxy.py that verifies the clean exit path (no proxy) and the proxy-enabled path - The proxy-enabled test path automatically detects provisioned Secure Web Proxy (SWP) resources in the project, harvests the CA certificate from Private CA, stages it to GCS, and provisions the cluster in a custom VPC network segment - Registers the new test target :test_http_proxy in the root BUILD file - Fixes python rules loading in integration_tests/BUILD and the root BUILD file - Adds MODULE.bazel with rules_python and abseil-py declarations to support modern Bazel Bzlmod builds - Modifies integration_tests/dataproc_test_case.py to perform the following: * Fix a class instantiation bug in setUpClass * Add network and subnet arguments to createCluster * Clean PYTHONPATH and PYTHONSAFEPATH in subprocess environments, preventing 
Bazel's sandboxed python from breaking external CLI tools (gcloud, gsutil) TAG=agy CONV=b274b565-1bd6-43f1-b4db-31f3d89d087b --- MODULE.bazel | 1 + http-proxy/test_http_proxy.py | 124 ++++++++++++++++++++++++ integration_tests/BUILD | 2 + integration_tests/dataproc_test_case.py | 16 ++- 4 files changed, 141 insertions(+), 2 deletions(-) diff --git a/MODULE.bazel b/MODULE.bazel index fc6ec32a1..a5667bf74 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -1 +1,2 @@ bazel_dep(name = "rules_python", version = "1.7.0") +bazel_dep(name = "abseil-py", version = "2.1.0", repo_name = "io_abseil_py") diff --git a/http-proxy/test_http_proxy.py b/http-proxy/test_http_proxy.py index ba2e78442..e4b9e6f5d 100644 --- a/http-proxy/test_http_proxy.py +++ b/http-proxy/test_http_proxy.py @@ -1,3 +1,6 @@ +import logging +import os +import json import pkg_resources from absl.testing import absltest from absl.testing import parameterized
@@ -8,6 +11,87 @@
 class HttpProxyTestCase(DataprocTestCase):
     COMPONENT = 'http-proxy'
     INIT_ACTIONS = ['http-proxy/http-proxy.sh']
+    # True only when SWP test resources were detected and the CA cert staged.
+    SWP_AVAILABLE = False
+    # GCS URI of the staged SWP root CA certificate (set by setUpClass).
+    CA_CERT_URI = None
+
+    @classmethod
+    def _swp_resource_exists(cls, list_command):
+        """Return True if `list_command` lists a resource named like 'swp'.
+
+        Args:
+            list_command: gcloud "... list" command without location,
+                project or format flags.
+        """
+        ret, stdout, _ = cls.run_command(
+            f"{list_command} --location={cls.REGION} --project={cls.PROJECT} --format='value(name)'"
+        )
+        return ret == 0 and any(
+            'swp' in name for name in stdout.strip().split('\n') if name)
+
+    @classmethod
+    def setUpClass(cls):
+        """Detect Secure Web Proxy (SWP) resources and stage the CA cert.
+
+        SWP_AVAILABLE is set only when a gateway, a CA pool and a certificate
+        are all found and the root CA certificate was uploaded to GCS.
+        Detection is best-effort: any failure leaves SWP_AVAILABLE False so
+        the proxy-enabled test is skipped instead of erroring.
+        """
+        super().setUpClass()
+        cls.SWP_AVAILABLE = False
+        cls.CA_CERT_URI = None
+        try:
+            # all() short-circuits, so later lookups only run when the
+            # earlier resource kinds were found.
+            cls.SWP_AVAILABLE = all(
+                cls._swp_resource_exists(list_command) for list_command in (
+                    "gcloud network-services gateways list",
+                    "gcloud privateca pools list",
+                    "gcloud certificate-manager certificates list",
+                ))
+        except Exception as e:
+            logging.warning("Error checking SWP availability: %s", e)
+
+        if cls.SWP_AVAILABLE:
+            try:
+                cls.CA_CERT_URI = cls.setup_ca_cert()
+            except Exception as e:
+                logging.warning("Failed to setup CA cert: %s", e)
+                cls.SWP_AVAILABLE = False
+
+    @classmethod
+    def setup_ca_cert(cls):
+        """Upload the SWP root CA certificate to GCS and return its URI.
+
+        Raises:
+            RuntimeError: if the CA pool, the root CA or its PEM cannot be
+                retrieved, or the upload to GCS fails.
+        """
+        import tempfile
+
+        # 1. Find a CA pool whose name contains 'swp'
+        ret, stdout, stderr = cls.run_command(
+            f"gcloud privateca pools list --location={cls.REGION} --project={cls.PROJECT} --format='value(name)'"
+        )
+        if ret != 0 or not stdout.strip():
+            raise RuntimeError(f"Failed to list CA pools: {stderr}")
+        swp_pools = [p for p in stdout.strip().split('\n') if 'swp' in p]
+        if not swp_pools:
+            raise RuntimeError("No SWP CA pool found")
+        pool_name = swp_pools[0].split('/')[-1]
+
+        # 2. Find Root CA in this pool
+        ret, stdout, stderr = cls.run_command(
+            f"gcloud privateca roots list --pool={pool_name} --location={cls.REGION} --project={cls.PROJECT} --format='value(name)'"
+        )
+        if ret != 0 or not stdout.strip():
+            raise RuntimeError(
+                f"Failed to list roots in pool {pool_name}: {stderr}")
+        swp_roots = [r for r in stdout.strip().split('\n') if 'swp' in r]
+        if not swp_roots:
+            raise RuntimeError(f"No SWP root CA found in pool {pool_name}")
+        root_name = swp_roots[0].split('/')[-1]
+
+        # 3. Describe Root CA to get PEM
+        ret, stdout, stderr = cls.run_command(
+            f"gcloud privateca roots describe {root_name} --pool={pool_name} --location={cls.REGION} --project={cls.PROJECT} --format='value(pemCaCertificates)'"
+        )
+        if ret != 0 or not stdout.strip():
+            raise RuntimeError(
+                f"Failed to get PEM for CA {root_name}: {stderr}")
+        pem_cert = stdout.strip()
+
+        # 4. Write to a private temp file and upload to GCS. A unique file
+        # avoids clashes between concurrent runs; the trailing newline keeps
+        # the PEM well-formed when it is appended to an existing bundle.
+        gcs_pem_uri = f"{cls.INIT_ACTIONS_REPO}/swp-root-ca.pem"
+        with tempfile.NamedTemporaryFile(
+                "w", suffix=".pem", delete=False) as f:
+            f.write(pem_cert + "\n")
+            local_pem_path = f.name
+        try:
+            ret, stdout, stderr = cls.run_command(
+                f"gsutil cp {local_pem_path} {gcs_pem_uri}")
+        finally:
+            os.remove(local_pem_path)
+        if ret != 0:
+            raise RuntimeError(f"Failed to upload CA cert to GCS: {stderr}")
+
+        return gcs_pem_uri
 
     @parameterized.parameters(
         ("SINGLE",),
@@ -19,6 +103,46 @@ def test_http_proxy_skip(self, configuration):
             self.INIT_ACTIONS,
             timeout_in_minutes=10)
 
+    @parameterized.parameters(
+        ("SINGLE",),
+    )
+    def test_http_proxy_enabled(self, configuration):
+        """Create a cluster behind the SWP and check egress through it."""
+        if not self.SWP_AVAILABLE:
+            self.skipTest("SWP is not available/provisioned in this project")
+
+        real_test_file = os.path.realpath(__file__)
+        real_test_dir = os.path.dirname(real_test_file)
+        env_json_path = os.path.abspath(os.path.join(real_test_dir, "../env.json"))
+
+        with open(env_json_path, "r") as f:
+            env_data = json.load(f)
+
+        cluster_name = env_data["CLUSTER_NAME"]
+        swp_ip = env_data["SWP_IP"]
+        swp_port = env_data["SWP_PORT"]
+        network = f"net-{cluster_name}"
+        subnet = f"subnet-{cluster_name}"
+
+        # The init action expects host:port and prepends the scheme itself.
+        # No `no-proxy` entry is passed: its comma-separated value would be
+        # split by the comma-delimited --metadata flag, and the hosts used
+        # here before (metadata.google.internal, .googleapis.com) are already
+        # in the init action's default bypass list.
+        proxy = f"{swp_ip}:{swp_port}"
+        metadata = (
+            f"http-proxy={proxy},"
+            f"https-proxy={proxy},"
+            f"http-proxy-pem-uri={self.CA_CERT_URI}"
+        )
+
+        self.createCluster(
+            configuration,
+            self.INIT_ACTIONS,
+            metadata=metadata,
+            network=network,
+            subnet=subnet,
+            timeout_in_minutes=15)
+
+        # Verify internet connectivity through proxy
+        self.assert_instance_command(
+            f"{self.getClusterName()}-m",
+            "curl -I -s --connect-timeout 10 https://www.google.com")
+
 
 if __name__ == '__main__':
     absltest.main()
diff --git a/integration_tests/BUILD b/integration_tests/BUILD index 8c3991ce5..f85118bcb 100644 --- a/integration_tests/BUILD +++ b/integration_tests/BUILD @@ -1,3 +1,5 @@ +load("@rules_python//python:defs.bzl", "py_library") + package(default_visibility =
["//visibility:public"]) py_library( diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 69cdc0dc6..683d2f6af 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -67,7 +67,7 @@ def setUpClass(cls): assert cls.PROJECT assert cls.REGION - cls.INIT_ACTIONS_REPO = DataprocTestCase().stage_init_actions( + cls.INIT_ACTIONS_REPO = cls().stage_init_actions( cls.PROJECT) assert cls.COMPONENT @@ -103,7 +103,9 @@ def createCluster(self, master_machine_type=None, worker_machine_type=None, boot_disk_size="50GB", - startup_script=None): + startup_script=None, + network=None, + subnet=None): self.initClusterName(configuration) self.cluster_version = None self.cluster_zone = zone @@ -146,6 +148,10 @@ def createCluster(self, args.append("--properties={}".format(properties)) if metadata: args.append("--metadata={}".format(metadata)) + if network: + args.append("--network={}".format(network)) + if subnet: + args.append("--subnet={}".format(subnet)) if scopes: args.append("--scopes={}".format(scopes)) @@ -367,12 +373,18 @@ def run_command(cmd, timeout_in_minutes=DEFAULT_TIMEOUT): "gcloud beta compute scp --internal-ip ") if ( INTERNAL_IP_SSH and "gcloud compute scp " in cmd) else cmd + env = os.environ.copy() + if "PYTHONPATH" in env: + del env["PYTHONPATH"] + if "PYTHONSAFEPATH" in env: + del env["PYTHONSAFEPATH"] p = subprocess.Popen( cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + env=env, ) timeout = timeout_in_minutes * 60 my_timer = Timer(timeout, lambda process: process.kill(), [p]) From 81c97b7aa8b13a517d25a2bdfd560b65b1191270 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 5 Jun 2026 19:15:40 +0000 Subject: [PATCH 3/4] better escaping --- http-proxy/http-proxy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/http-proxy/http-proxy.sh b/http-proxy/http-proxy.sh index 2c725567e..7d652ed35 100755 --- a/http-proxy/http-proxy.sh +++ b/http-proxy/http-proxy.sh @@ -144,7 +144,7 @@ function set_proxy(){ user_no_proxy=$(get_metadata_attribute 'no-proxy' '') local user_no_proxy_list=() if [[ -n "${user_no_proxy}" ]]; then - IFS=',' read -r -a user_no_proxy_list <<< "${user_no_proxy// /,}" + IFS=',' read -r -a user_no_proxy_list <<< "${user_no_proxy//[[:space:]]/}" fi local combined_no_proxy_list=( "${default_no_proxy_list[@]}" "${user_no_proxy_list[@]}" ) From 5ad78ee18478b9031b26a3eeb36ec8892775dcf9 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 5 Jun 2026 19:18:41 +0000 Subject: [PATCH 4/4] remove ".." ; skip proxy test without env.json --- http-proxy/test_http_proxy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/http-proxy/test_http_proxy.py b/http-proxy/test_http_proxy.py index e4b9e6f5d..b648507dc 100644 --- a/http-proxy/test_http_proxy.py +++ b/http-proxy/test_http_proxy.py @@ -112,7 +112,11 @@ def test_http_proxy_enabled(self, configuration): real_test_file = os.path.realpath(__file__) real_test_dir = os.path.dirname(real_test_file) - env_json_path = os.path.abspath(os.path.join(real_test_dir, "../env.json")) + repo_root_dir = os.path.dirname(real_test_dir) + env_json_path = os.path.abspath(os.path.join(repo_root_dir, "env.json")) + + if not os.path.exists(env_json_path): + self.skipTest(f"env.json not found at {env_json_path}. Skipping enabled proxy test.") with open(env_json_path, "r") as f: env_data = json.load(f)