Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 57 additions & 9 deletions .github/workflows/build_and_test_maxtext.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ jobs:
device_name: X64
cloud_runner: linux-x86-n2-16
image_type: ${{ matrix.image_type }}
pytest_marker: 'cpu_only'
pytest_marker: 'cpu_only and not post_training'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
Expand All @@ -155,7 +155,7 @@ jobs:
device_name: v6e-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
Expand All @@ -175,7 +175,7 @@ jobs:
device_name: v6e-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
Expand All @@ -195,7 +195,7 @@ jobs:
device_name: v6e-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
Expand All @@ -215,7 +215,7 @@ jobs:
device_name: v6e-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
Expand All @@ -236,13 +236,57 @@ jobs:
device_name: a100-40gb-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-a2-48-a100-4gpu
pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
pytest_marker: 'not cpu_only and not tpu_only and not integration_test and not post_training'
xla_python_client_mem_fraction: 0.65
tf_force_gpu_allow_growth: true
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

maxtext_post_training_cpu_unit_tests:
  # Post-training unit tests on a CPU runner. Delegates to the reusable
  # run_tests_against_package workflow, restricting collection to
  # tests/post_training/unit and to tests marked `cpu_only`.
  # NOTE(review): `doc_only_check` is referenced in `if` but is not listed in
  # `needs`; the `needs` context normally exposes only direct dependencies.
  # Confirm this condition evaluates as intended.
  if: needs.doc_only_check.outputs.run_tests == 'true'
  needs: build_and_upload_maxtext_package
  strategy:
    fail-fast: false
    matrix:
      image_type:
        - "py312"
  uses: ./.github/workflows/run_tests_against_package.yml
  with:
    # Package under test and run context.
    maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
    is_scheduled_run: ${{ github.event_name == 'schedule' }}
    # Runner / container selection.
    image_type: ${{ matrix.image_type }}
    cloud_runner: linux-x86-n2-16
    device_type: cpu
    device_name: X64
    container_resource_option: "--privileged"
    # Requirements file installed by the called workflow before tests run.
    extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
    # Test selection.
    pytest_addopts: 'tests/post_training/unit'
    pytest_marker: 'cpu_only'
    # Runtime knobs forwarded to the test environment.
    tf_force_gpu_allow_growth: false
    xla_python_client_mem_fraction: 0.75

maxtext_post_training_tpu_unit_tests:
  # Post-training unit tests on a v6e-4 TPU runner. Delegates to the reusable
  # run_tests_against_package workflow, restricting collection to
  # tests/post_training/unit.
  # NOTE(review): `doc_only_check` is referenced in `if` but is not listed in
  # `needs`; the `needs` context normally exposes only direct dependencies.
  # Confirm this condition evaluates as intended.
  needs: build_and_upload_maxtext_package
  if: needs.doc_only_check.outputs.run_tests == 'true'
  uses: ./.github/workflows/run_tests_against_package.yml
  strategy:
    fail-fast: false
    matrix:
      image_type: ["py312"]
  with:
    device_type: tpu
    device_name: v6e-4
    image_type: ${{ matrix.image_type }}
    cloud_runner: linux-x86-ct6e-180-4tpu
    # Select by exclusion, like the other TPU jobs in this workflow. A bare
    # `tpu_only` selector would skip post-training unit modules that carry only
    # the `post_training` marker (no cpu_only/tpu_only); since every other job
    # now excludes `post_training`, those tests would run in no job at all.
    pytest_marker: 'not cpu_only and not gpu_only'
    pytest_addopts: 'tests/post_training/unit'
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
    container_resource_option: "--privileged"
    is_scheduled_run: ${{ github.event_name == 'schedule' }}
    # Requirements file installed by the called workflow before tests run.
    extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
    maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

maxtext_gpu_integration_tests:
needs: build_and_upload_maxtext_package
if: needs.doc_only_check.outputs.run_tests == 'true'
Expand All @@ -257,7 +301,7 @@ jobs:
device_name: a100-40gb-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-a2-48-a100-4gpu
pytest_marker: 'not cpu_only and not tpu_only and integration_test'
pytest_marker: 'not cpu_only and not tpu_only and integration_test and not post_training'
xla_python_client_mem_fraction: 0.65
tf_force_gpu_allow_growth: true
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
Expand All @@ -266,7 +310,7 @@ jobs:

all_tests_passed:
name: All Required Tests Passed
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
if: always()
runs-on: ubuntu-latest
steps:
Expand All @@ -287,6 +331,8 @@ jobs:
echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}"
echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}"

# Fail only if any job failed or was cancelled (skipped is OK)
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
Expand All @@ -305,6 +351,8 @@ jobs:
NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }}
NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }}
NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_cpu_unit_tests.result }}
NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_tpu_unit_tests.result }}

all_notebooks_passed:
name: All Notebooks Passed
Expand Down Expand Up @@ -337,7 +385,7 @@ jobs:

notify_failure:
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
if: ${{ always() }}
runs-on: ubuntu-latest
permissions:
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/run_tests_against_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ on:
maxtext_sha:
required: true
type: string
# Optional path to a pip requirements file. When non-empty, the
# "Install extra pip deps" step installs it with `uv pip install -r`
# before the tests run; the empty default skips that step.
extra_pip_deps_file:
required: false
type: string
default: ''

permissions:
contents: read
Expand Down Expand Up @@ -96,6 +100,12 @@ jobs:
python3 --version
python3 -m pip freeze
uv pip install pytest-cov
# Installs the caller-supplied requirements file into the test virtualenv.
# Skipped when the `extra_pip_deps_file` input is empty.
- name: Install extra pip deps
  if: inputs.extra_pip_deps_file != ''
  shell: bash
  env:
    # Pass the input through the environment instead of expanding it inside
    # the script text, so its value can never be interpreted as shell syntax.
    EXTRA_PIP_DEPS_FILE: ${{ inputs.extra_pip_deps_file }}
  run: |
    source .venv/bin/activate
    # Quoted so paths containing spaces or glob characters stay one argument.
    uv pip install -r "${EXTRA_PIP_DEPS_FILE}"
- name: Copy test assets files
run : gcloud storage cp gs://maxtext-test-assets/* tests/assets
- name: Run Tests
Expand Down
3 changes: 2 additions & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ testpaths =
python_files = *_test.py *_tests.py
addopts =
-rf --import-mode=importlib --strict-markers
--ignore=tests/integration/grpo_trainer_correctness_test.py
--ignore=tests/post_training/integration/grpo_trainer_correctness_test.py
--ignore=tests/integration/smoke/train_gpu_smoke_test.py
--ignore=tests/integration/smoke/train_int8_smoke_test.py
--ignore=tests/integration/smoke/train_smoke_test.py
Expand Down Expand Up @@ -36,4 +36,5 @@ markers =
e.g., end_to_end tests
external_serving: JetStream / serving / decode server components
external_training: goodput integrations
post_training: marks tests for post-training code paths.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
google-tunix @ https://github.com/google/tunix/archive/336d102fe32ca0edbe42a8f66ff0fd533cebdf52.zip
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-r extra_post_train_base_deps_from_github.txt
google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip
google-tunix @ https://github.com/google/tunix/archive/336d102fe32ca0edbe42a8f66ff0fd533cebdf52.zip
mlperf-logging @ https://github.com/mlcommons/logging/archive/38ab22670527888c8eb7825a4ece176fcc36a95d.zip
tpu-inference @ https://github.com/vllm-project/tpu-inference/archive/0cae84fc9a883ba1bde02d4f07930e6af9e92958.zip
vllm @ git+https://github.com/vllm-project/vllm@ee8a29511fc69e3f0f6291fa6ff1cf6e47f7750d
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

from trl import GRPOConfig, GRPOTrainer

pytestmark = [pytest.mark.external_training] # uses pre-generated checkpoint
pytestmark = [pytest.mark.external_training, pytest.mark.post_training] # uses pre-generated checkpoint


class GRPOTest(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from maxtext/tests/assets/logits_generation/generate_grpo_golden_logits.py

Usage:
pytest tests/integration/grpo_trainer_correctness_test.py
pytest tests/post_training/integration/grpo_trainer_correctness_test.py
"""

import os
Expand Down Expand Up @@ -52,7 +52,7 @@
import transformers

# This test is for serving pathways via offline_engine and maxengine.
pytestmark = [pytest.mark.external_training]
pytestmark = [pytest.mark.external_training, pytest.mark.post_training]


def get_golden_data(config):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

Usage:

pytest tests/integration/sft_trainer_correctness_test.py
pytest tests/post_training/integration/sft_trainer_correctness_test.py
"""

import os.path
Expand All @@ -46,6 +46,8 @@
import pytest
from transformers import AutoTokenizer

pytestmark = [pytest.mark.post_training]


def get_golden_data(model_name):
"""Get the golden data for sft_trainer from maxtext/tests/assets/logits_generation/generate_sft_golden_data.py."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pytest

pytest.importorskip("tunix")
pytestmark = [pytest.mark.tpu_only]
pytestmark = [pytest.mark.tpu_only, pytest.mark.post_training]

import json
import os
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@

"""Data processing tests for distillation."""

import pytest

pytestmark = [pytest.mark.post_training, pytest.mark.cpu_only]

import argparse
import os
import subprocess
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import pytest
from types import SimpleNamespace

pytestmark = [pytest.mark.post_training]

evaluate_rl = pytest.importorskip(
"maxtext.trainers.post_train.rl.evaluate_rl",
reason="tunix (required by evaluate_rl) is not installed GPU",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
# limitations under the License.

"""Data processing tests for SFT."""
import pytest

pytestmark = [pytest.mark.post_training, pytest.mark.cpu_only]

import subprocess
import unittest
import os.path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import pytest

pytest.importorskip("tunix")
pytestmark = [pytest.mark.tpu_only, pytest.mark.external_training]
pytestmark = [pytest.mark.tpu_only, pytest.mark.external_training, pytest.mark.post_training]

import jax

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pytest

pytest.importorskip("tunix")
pytestmark = [pytest.mark.tpu_only]
pytestmark = [pytest.mark.tpu_only, pytest.mark.post_training]

import shutil
import tempfile
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import jax


pytestmark = [pytest.mark.post_training]

# Same as in rl_utils_test.py.
train_rl = pytest.importorskip(
"maxtext.trainers.post_train.rl.train_rl",
Expand Down
Loading