diff --git a/.github/workflows/pypi_release.yml b/.github/workflows/pypi_release.yml
index 0eba10c310..89fe8e3219 100644
--- a/.github/workflows/pypi_release.yml
+++ b/.github/workflows/pypi_release.yml
@@ -38,14 +38,62 @@ jobs:
         run: echo "Release approved, proceeding to build, test and publish MaxText package."
 
+  # Calls build_and_push_docker_image.yml once per matrix entry below.
   build_and_test_maxtext_package:
-    name: Build and Test MaxText Package
+    name: ${{ matrix.image_name }}
     needs: [release_approval]
-    uses: ./.github/workflows/build_and_test_maxtext.yml
-    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - device: tpu
+            build_mode: stable
+            image_name: maxtext_jax_stable
+            workflow: pre-training
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
+          - device: gpu
+            build_mode: stable
+            image_name: maxtext_gpu_jax_stable
+            workflow: pre-training
+            dockerfile: maxtext_gpu_dependencies.Dockerfile
+          - device: tpu
+            build_mode: stable
+            image_name: maxtext_post_training_stable
+            workflow: post-training
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
+    uses: ./.github/workflows/build_and_push_docker_image.yml
+    with:
+      image_name: ${{ matrix.image_name }}
+      device: ${{ matrix.device }}
+      build_mode: ${{ matrix.build_mode }}
+      workflow: ${{ matrix.workflow }}
+      dockerfile: ${{ matrix.dockerfile }}
+      maxtext_sha: ${{ github.sha }}
+      # NOTE(review): get_latest_maxtext_pypi_version is not listed in this job's needs,
+      # so this expression is expected to be empty here - confirm the intended tag.
+      version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
+    secrets:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+  # Triggers one Airflow E2E DAG per entry against the image tagged with this run's ID.
+  run_e2e_tests:
+    name: ${{ matrix.dag_id }}
+    needs: [build_and_test_maxtext_package]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - dag_id: maxtext_e2e_tpu_pre_training
+            candidate_image: "gcr.io/${{ vars.PROJECT_NAME }}/maxtext_jax_stable:${{ github.run_id }}"
+          - dag_id: maxtext_e2e_tpu_post_training
+            candidate_image: "gcr.io/${{ vars.PROJECT_NAME }}/maxtext_post_training_stable:${{ github.run_id }}"
+    uses: ./.github/workflows/run_e2e_tests.yml
+    with:
+      dag_id: ${{ matrix.dag_id }}
+      candidate_image: ${{ matrix.candidate_image }}
 
   publish_maxtext_package_to_pypi:
     name: Publish MaxText package to PyPI
-    needs: [build_and_test_maxtext_package]
+    needs: [run_e2e_tests]
     runs-on: ubuntu-latest
     environment: release
     steps:
@@ -89,39 +137,24 @@ jobs:
           # Set the output variable for other jobs to consume
           echo "version=$latest_version" >> "$GITHUB_OUTPUT"
 
-  # This job builds and pushes MaxText stable Docker images for both TPU and GPU devices.
-  # It runs only after a new release is published to PyPI.
-  # Creates docker image for MaxText commit corresponding to the release.
-  upload_maxtext_docker_images:
-    name: ${{ matrix.image_name }}
-    needs: [get_latest_maxtext_pypi_version]
+  # Runs after the PyPI publish and adds the released version as an extra tag on each listed image.
+  promote_release_images:
+    name: Promote Release Images - ${{ matrix.image_name }}
+    needs: [publish_maxtext_package_to_pypi, get_latest_maxtext_pypi_version]
+    runs-on: linux-x86-n2-16-buildkit
+    container: google/cloud-sdk:524.0.0
     strategy:
       fail-fast: false
       matrix:
-        include:
-          - device: tpu
-            build_mode: stable
-            image_name: maxtext_jax_stable
-            workflow: pre-training
-            dockerfile: maxtext_tpu_dependencies.Dockerfile
-          - device: gpu
-            build_mode: stable
-            image_name: maxtext_gpu_jax_stable
-            workflow: pre-training
-            dockerfile: maxtext_gpu_dependencies.Dockerfile
-          - device: tpu
-            build_mode: stable
-            image_name: maxtext_post_training_stable
-            workflow: post-training
-            dockerfile: maxtext_tpu_dependencies.Dockerfile
-    uses: ./.github/workflows/build_and_push_docker_image.yml
-    with:
-      image_name: ${{ matrix.image_name }}
-      device: ${{ matrix.device }}
-      build_mode: ${{ matrix.build_mode }}
-      workflow: ${{ matrix.workflow }}
-      dockerfile: ${{ matrix.dockerfile }}
-      maxtext_sha: ${{ github.sha }}
-      version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
-    secrets:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        image_name:
+          - maxtext_jax_stable
+          - maxtext_post_training_stable
+    steps:
+      - name: Configure Docker
+        run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
+
+      - name: Add tags to Docker image
+        shell: bash
+        run: |
+          SOURCE_IMAGE="gcr.io/${{ vars.PROJECT_NAME }}/${{ matrix.image_name }}"
+          gcloud container images add-tag "${SOURCE_IMAGE}:${{ github.run_id }}" "${SOURCE_IMAGE}:${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}" --quiet
diff --git a/.github/workflows/run_e2e_tests.yml b/.github/workflows/run_e2e_tests.yml
new file mode 100644
index 0000000000..f27b917971
--- /dev/null
+++ b/.github/workflows/run_e2e_tests.yml
@@ -0,0 +1,116 @@
+# Reusable workflow: triggers the given Airflow DAG with the release-candidate image
+# in its run conf, then polls the DAG run until it succeeds, fails, or times out.
+name: MaxText E2E Airflow Tests
+
+on:
+  workflow_call:
+    inputs:
+      dag_id:
+        description: 'The ID of the Airflow DAG to trigger for E2E tests'
+        required: true
+        type: string
+      candidate_image:
+        description: 'The Docker image URI of the release candidate to test'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  e2e_airflow_tests:
+    name: E2E Airflow Tests - ${{ inputs.dag_id }}
+    runs-on: linux-x86-n2-16-buildkit
+    container: google/cloud-sdk:524.0.0
+    steps:
+      - name: Get Airflow URI
+        id: info
+        run: |
+          AIRFLOW_URI=$(gcloud composer environments describe ml-automation-solutions \
+            --location us-central1 \
+            --project cloud-ml-auto-solutions \
+            --format "value(config.airflowUri)")
+          echo "airflow_uri=${AIRFLOW_URI}" >> "$GITHUB_OUTPUT"
+
+      - name: Trigger DAG
+        id: trigger
+        run: |
+          IAP_TOKEN=$(gcloud auth print-access-token)
+
+          CANDIDATE_IMAGE="${{ inputs.candidate_image }}"
+          RESPONSE=$(curl -s -w "\n%{http_code}" -X POST \
+            "${{ steps.info.outputs.airflow_uri }}/api/v1/dags/${{ inputs.dag_id }}/dagRuns" \
+            -H "Authorization: Bearer ${IAP_TOKEN}" \
+            -H "Content-Type: application/json" \
+            -d "{
+              \"conf\": {
+                \"release_candidate_image\": \"${CANDIDATE_IMAGE}\",
+                \"maxtext_sha\": \"${{ github.sha }}\",
+                \"github_run_id\": \"${{ github.run_id }}\"
+              }
+            }")
+
+          # curl -w appends the status code as the last line; everything before it is the body.
+          HTTP_STATUS=$(echo "$RESPONSE" | tail -1)
+          BODY=$(echo "$RESPONSE" | sed '$d')
+          echo "HTTP status: ${HTTP_STATUS}"
+          echo "Response body: ${BODY}"
+
+          if [ "${HTTP_STATUS}" -lt 200 ] || [ "${HTTP_STATUS}" -ge 300 ]; then
+            echo "Error: Airflow API returned HTTP ${HTTP_STATUS}"
+            exit 1
+          fi
+
+          DAG_RUN_ID=$(echo "$BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('dag_run_id',''))")
+          if [ -z "${DAG_RUN_ID}" ] || [ "${DAG_RUN_ID}" = "null" ]; then
+            echo "Error: could not parse dag_run_id from response"
+            exit 1
+          fi
+          echo "dag_run_id=${DAG_RUN_ID}" >> "$GITHUB_OUTPUT"
+
+      - name: Poll DAG run status
+        run: |
+          AIRFLOW_URI="${{ steps.info.outputs.airflow_uri }}"
+          DAG_RUN_ID="${{ steps.trigger.outputs.dag_run_id }}"
+          # Poll every 300 seconds for at most 14400 seconds (4 hours).
+          MAX_WAIT=14400
+          POLL_INTERVAL=300
+          ELAPSED=0
+
+          while [ $ELAPSED -lt $MAX_WAIT ]; do
+            IAP_TOKEN=$(gcloud auth print-access-token)
+
+            RESPONSE=$(curl -s -w "\n%{http_code}" \
+              "${AIRFLOW_URI}/api/v1/dags/${{ inputs.dag_id }}/dagRuns/${DAG_RUN_ID}" \
+              -H "Authorization: Bearer ${IAP_TOKEN}")
+            HTTP_STATUS=$(echo "$RESPONSE" | tail -1)
+            BODY=$(echo "$RESPONSE" | sed '$d')
+
+            # A non-2xx poll response is not fatal: wait one interval and try again.
+            if [ "${HTTP_STATUS}" -lt 200 ] || [ "${HTTP_STATUS}" -ge 300 ]; then
+              echo "Warning: Airflow API returned HTTP ${HTTP_STATUS}, will retry. Body: ${BODY}"
+              sleep $POLL_INTERVAL
+              ELAPSED=$((ELAPSED + POLL_INTERVAL))
+              continue
+            fi
+
+            STATE=$(echo "$BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('state',''))" 2>/dev/null || echo "unknown")
+
+            if [ "${STATE}" = "unknown" ]; then
+              echo "Warning: could not parse state from response body: ${BODY}"
+            fi
+
+            echo "DAG run state: ${STATE} (elapsed: ${ELAPSED}s)"
+
+            # Only the states below end the loop; any other state is polled again.
+            case "$STATE" in
+              success) echo "E2E tests passed."; exit 0 ;;
+              failed|upstream_failed) echo "E2E tests FAILED."; exit 1 ;;
+            esac
+
+            sleep $POLL_INTERVAL
+            ELAPSED=$((ELAPSED + POLL_INTERVAL))
+          done
+
+          echo "Timeout: E2E tests did not complete within 4 hours."
+          exit 1