Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 66 additions & 37 deletions .github/workflows/pypi_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,59 @@ jobs:
run: echo "Release approved, proceeding to build, test and publish MaxText package."

# Builds one Docker image per matrix entry by calling the reusable
# build_and_push_docker_image workflow. Runs only after the release has been
# approved.
build_and_test_maxtext_package:
  name: ${{ matrix.image_name }}
  needs: [release_approval]
  strategy:
    # Let the remaining images finish building even if one of them fails.
    fail-fast: false
    matrix:
      include:
        - device: tpu
          build_mode: stable
          image_name: maxtext_jax_stable
          workflow: pre-training
          dockerfile: maxtext_tpu_dependencies.Dockerfile
        - device: gpu
          build_mode: stable
          image_name: maxtext_gpu_jax_stable
          workflow: pre-training
          dockerfile: maxtext_gpu_dependencies.Dockerfile
        - device: tpu
          build_mode: stable
          image_name: maxtext_post_training_stable
          workflow: post-training
          dockerfile: maxtext_tpu_dependencies.Dockerfile
  uses: ./.github/workflows/build_and_push_docker_image.yml
  with:
    image_name: ${{ matrix.image_name }}
    device: ${{ matrix.device }}
    build_mode: ${{ matrix.build_mode }}
    workflow: ${{ matrix.workflow }}
    dockerfile: ${{ matrix.dockerfile }}
    maxtext_sha: ${{ github.sha }}
    # NOTE(review): get_latest_maxtext_pypi_version is not listed in this
    # job's `needs`, so this expression cannot resolve to that job's output
    # here. Confirm whether version_name should be dropped for candidate
    # builds or sourced from somewhere else.
    version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
  secrets:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}

# Runs one Airflow E2E DAG per matrix entry through the reusable
# run_e2e_tests workflow. Each entry pairs a DAG with the image tagged with
# this workflow run's ID.
run_e2e_tests:
  # The matrix defines dag_id and candidate_image only, so the job is named
  # after the DAG it triggers.
  name: ${{ matrix.dag_id }}
  # The image build job in this workflow is keyed build_and_test_maxtext_package.
  needs: [build_and_test_maxtext_package]
  strategy:
    # Let the other DAG finish even if one of them fails.
    fail-fast: false
    matrix:
      include:
        - dag_id: maxtext_e2e_tpu_pre_training
          candidate_image: "gcr.io/${{ vars.PROJECT_NAME }}/maxtext_jax_stable:${{ github.run_id }}"
        - dag_id: maxtext_e2e_tpu_post_training
          candidate_image: "gcr.io/${{ vars.PROJECT_NAME }}/maxtext_post_training_stable:${{ github.run_id }}"
  uses: ./.github/workflows/run_e2e_tests.yml
  with:
    dag_id: ${{ matrix.dag_id }}
    candidate_image: ${{ matrix.candidate_image }}

publish_maxtext_package_to_pypi:
name: Publish MaxText package to PyPI
needs: [build_and_test_maxtext_package]
needs: [run_e2e_tests]
runs-on: ubuntu-latest
environment: release
steps:
Expand Down Expand Up @@ -89,39 +134,23 @@ jobs:
# Set the output variable for other jobs to consume
echo "version=$latest_version" >> "$GITHUB_OUTPUT"

# Promotes the images that were built for this workflow run: once the package
# has been published to PyPI, the image tagged with this run's ID is given an
# additional tag equal to the released PyPI version. No image is rebuilt here.
promote_release_images:
  name: Promote Release Images - ${{ matrix.image_name }}
  needs: [publish_maxtext_package_to_pypi, get_latest_maxtext_pypi_version]
  runs-on: linux-x86-n2-16-buildkit
  container: google/cloud-sdk:524.0.0
  strategy:
    # Tag the remaining image even if tagging one of them fails.
    fail-fast: false
    matrix:
      # NOTE(review): maxtext_gpu_jax_stable is built by this workflow but is
      # not promoted here - confirm that this is intentional.
      image_name:
        - maxtext_jax_stable
        - maxtext_post_training_stable
  steps:
    - name: Configure Docker
      run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q

    - name: Add tags to Docker image
      shell: bash
      # Expressions are passed through the environment instead of being
      # pasted into the script text, so their values are never parsed as shell.
      env:
        SOURCE_IMAGE: gcr.io/${{ vars.PROJECT_NAME }}/${{ matrix.image_name }}
        CANDIDATE_TAG: ${{ github.run_id }}
        RELEASE_TAG: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
      run: |
        # An empty version would produce the invalid reference "<image>:".
        if [ -z "${RELEASE_TAG}" ]; then
          echo "Error: release version is empty; refusing to tag ${SOURCE_IMAGE}"
          exit 1
        fi
        gcloud container images add-tag "${SOURCE_IMAGE}:${CANDIDATE_TAG}" "${SOURCE_IMAGE}:${RELEASE_TAG}" --quiet
110 changes: 110 additions & 0 deletions .github/workflows/run_e2e_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Reusable workflow: triggers a single Airflow DAG run for a release-candidate
# image and blocks until that run succeeds, fails, or the 4-hour wait expires.
name: MaxText E2E Airflow Tests

on:
  workflow_call:
    inputs:
      dag_id:
        description: 'The ID of the Airflow DAG to trigger for E2E tests'
        required: true
        type: string
      candidate_image:
        description: 'The Docker image URI of the release candidate to test'
        required: true
        type: string

permissions:
  contents: read

jobs:
  e2e_airflow_tests:
    name: E2E Airflow Tests - ${{ inputs.dag_id }}
    runs-on: linux-x86-n2-16-buildkit
    container: google/cloud-sdk:524.0.0
    steps:
      # bash is requested explicitly on every scripted step: run steps in a
      # container job default to sh, whose echo may rewrite backslash escapes
      # that appear inside the JSON bodies handled below.
      - name: Get Airflow URI
        id: info
        shell: bash
        run: |
          AIRFLOW_URI=$(gcloud composer environments describe ml-automation-solutions \
            --location us-central1 \
            --project cloud-ml-auto-solutions \
            --format "value(config.airflowUri)")
          # Stop here rather than let later steps call a URL with no host.
          if [ -z "${AIRFLOW_URI}" ]; then
            echo "Error: could not resolve the Airflow URI"
            exit 1
          fi
          echo "airflow_uri=${AIRFLOW_URI}" >> "$GITHUB_OUTPUT"

      - name: Trigger DAG
        id: trigger
        shell: bash
        # Workflow inputs and contexts reach the script as environment
        # variables so their values are never interpreted as shell syntax.
        env:
          AIRFLOW_URI: ${{ steps.info.outputs.airflow_uri }}
          DAG_ID: ${{ inputs.dag_id }}
          CANDIDATE_IMAGE: ${{ inputs.candidate_image }}
          MAXTEXT_SHA: ${{ github.sha }}
          RUN_ID: ${{ github.run_id }}
        run: |
          IAP_TOKEN=$(gcloud auth print-access-token)

          # json.dumps performs the escaping, so quotes or backslashes in a
          # value cannot corrupt the request body.
          PAYLOAD=$(python3 -c '
          import json, os
          print(json.dumps({"conf": {
              "release_candidate_image": os.environ["CANDIDATE_IMAGE"],
              "maxtext_sha": os.environ["MAXTEXT_SHA"],
              "github_run_id": os.environ["RUN_ID"],
          }}))')

          # "|| true": a transport error is reported through the status check
          # below (curl prints 000) instead of aborting with no message.
          RESPONSE=$(curl -s --max-time 120 -w "\n%{http_code}" -X POST \
            "${AIRFLOW_URI}/api/v1/dags/${DAG_ID}/dagRuns" \
            -H "Authorization: Bearer ${IAP_TOKEN}" \
            -H "Content-Type: application/json" \
            -d "${PAYLOAD}") || true

          # The last line of the output is the status code appended by -w;
          # everything before it is the response body.
          HTTP_STATUS=$(printf '%s\n' "$RESPONSE" | tail -n 1)
          HTTP_STATUS=${HTTP_STATUS:-000}
          BODY=$(printf '%s\n' "$RESPONSE" | sed '$d')
          echo "HTTP status: ${HTTP_STATUS}"
          echo "Response body: ${BODY}"

          if [ "${HTTP_STATUS}" -lt 200 ] || [ "${HTTP_STATUS}" -ge 300 ]; then
            echo "Error: Airflow API returned HTTP ${HTTP_STATUS}"
            exit 1
          fi

          # "or ''" maps a JSON null to an empty string so the check below
          # catches it (a bare .get() would print the text "None").
          DAG_RUN_ID=$(printf '%s\n' "$BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('dag_run_id') or '')")
          if [ -z "${DAG_RUN_ID}" ] || [ "${DAG_RUN_ID}" = "null" ]; then
            echo "Error: could not parse dag_run_id from response"
            exit 1
          fi
          echo "dag_run_id=${DAG_RUN_ID}" >> "$GITHUB_OUTPUT"

      - name: Poll DAG run status
        shell: bash
        env:
          AIRFLOW_URI: ${{ steps.info.outputs.airflow_uri }}
          DAG_ID: ${{ inputs.dag_id }}
          DAG_RUN_ID: ${{ steps.trigger.outputs.dag_run_id }}
        run: |
          # Wait at most 4 hours, checking every 5 minutes.
          MAX_WAIT=14400
          POLL_INTERVAL=300
          ELAPSED=0

          while [ "$ELAPSED" -lt "$MAX_WAIT" ]; do
            # Fetch a fresh token on every iteration of the wait loop.
            IAP_TOKEN=$(gcloud auth print-access-token)

            # "|| true": a transient network failure must reach the retry
            # branch below rather than abort the step under "bash -e".
            RESPONSE=$(curl -s --max-time 120 -w "\n%{http_code}" \
              "${AIRFLOW_URI}/api/v1/dags/${DAG_ID}/dagRuns/${DAG_RUN_ID}" \
              -H "Authorization: Bearer ${IAP_TOKEN}") || true
            HTTP_STATUS=$(printf '%s\n' "$RESPONSE" | tail -n 1)
            HTTP_STATUS=${HTTP_STATUS:-000}
            BODY=$(printf '%s\n' "$RESPONSE" | sed '$d')

            if [ "${HTTP_STATUS}" -lt 200 ] || [ "${HTTP_STATUS}" -ge 300 ]; then
              echo "Warning: Airflow API returned HTTP ${HTTP_STATUS}, will retry. Body: ${BODY}"
              sleep "$POLL_INTERVAL"
              ELAPSED=$((ELAPSED + POLL_INTERVAL))
              continue
            fi

            # A body that is not JSON, or that has no state, yields "unknown"
            # and the loop keeps polling.
            STATE=$(printf '%s\n' "$BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('state') or 'unknown')" 2>/dev/null || echo "unknown")

            if [ "${STATE}" = "unknown" ]; then
              echo "Warning: could not parse state from response body: ${BODY}"
            fi

            echo "DAG run state: ${STATE} (elapsed: ${ELAPSED}s)"

            # Only these states end the wait; any other state keeps polling.
            case "$STATE" in
              success) echo "E2E tests passed."; exit 0 ;;
              failed|upstream_failed) echo "E2E tests FAILED."; exit 1 ;;
            esac

            sleep "$POLL_INTERVAL"
            ELAPSED=$((ELAPSED + POLL_INTERVAL))
          done

          echo "Timeout: E2E tests did not complete within 4 hours."
          exit 1
Loading