diff --git a/.github/workflows/build_and_test_maxtext.yml b/.github/workflows/build_and_test_maxtext.yml index f3c6c9d0fa..f758f428d5 100644 --- a/.github/workflows/build_and_test_maxtext.yml +++ b/.github/workflows/build_and_test_maxtext.yml @@ -133,7 +133,7 @@ jobs: device_name: X64 cloud_runner: linux-x86-n2-16 image_type: ${{ matrix.image_type }} - pytest_marker: 'cpu_only' + pytest_marker: 'cpu_only and not post_training' xla_python_client_mem_fraction: 0.75 tf_force_gpu_allow_growth: false container_resource_option: "--privileged" @@ -155,7 +155,7 @@ jobs: device_name: v6e-4 image_type: ${{ matrix.image_type }} cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and not integration_test' + pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training' xla_python_client_mem_fraction: 0.75 tf_force_gpu_allow_growth: false container_resource_option: "--privileged" @@ -175,7 +175,7 @@ jobs: device_name: v6e-4 image_type: ${{ matrix.image_type }} cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and integration_test' + pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training' xla_python_client_mem_fraction: 0.75 tf_force_gpu_allow_growth: false container_resource_option: "--privileged" @@ -195,7 +195,7 @@ jobs: device_name: v6e-4 image_type: ${{ matrix.image_type }} cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and not integration_test' + pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training' xla_python_client_mem_fraction: 0.75 tf_force_gpu_allow_growth: false container_resource_option: "--privileged" @@ -215,7 +215,7 @@ jobs: device_name: v6e-4 image_type: ${{ matrix.image_type }} cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and integration_test' + pytest_marker: 'not cpu_only and not gpu_only and integration_test 
and not post_training' xla_python_client_mem_fraction: 0.75 tf_force_gpu_allow_growth: false container_resource_option: "--privileged" @@ -236,13 +236,57 @@ jobs: device_name: a100-40gb-4 image_type: ${{ matrix.image_type }} cloud_runner: linux-x86-a2-48-a100-4gpu - pytest_marker: 'not cpu_only and not tpu_only and not integration_test' + pytest_marker: 'not cpu_only and not tpu_only and not integration_test and not post_training' xla_python_client_mem_fraction: 0.65 tf_force_gpu_allow_growth: true container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" is_scheduled_run: ${{ github.event_name == 'schedule' }} maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} + maxtext_post_training_cpu_unit_tests: + needs: build_and_upload_maxtext_package + if: needs.doc_only_check.outputs.run_tests == 'true' + uses: ./.github/workflows/run_tests_against_package.yml + strategy: + fail-fast: false + matrix: + image_type: ["py312"] + with: + device_type: cpu + device_name: X64 + cloud_runner: linux-x86-n2-16 + image_type: ${{ matrix.image_type }} + pytest_marker: 'cpu_only' + pytest_addopts: 'tests/post_training/unit' + xla_python_client_mem_fraction: 0.75 + tf_force_gpu_allow_growth: false + container_resource_option: "--privileged" + is_scheduled_run: ${{ github.event_name == 'schedule' }} + extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt' + maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} + + maxtext_post_training_tpu_unit_tests: + needs: build_and_upload_maxtext_package + if: needs.doc_only_check.outputs.run_tests == 'true' + uses: ./.github/workflows/run_tests_against_package.yml + strategy: + fail-fast: false + matrix: + image_type: ["py312"] + with: + device_type: tpu + device_name: v6e-4 + image_type: ${{ matrix.image_type }} + cloud_runner: linux-x86-ct6e-180-4tpu + pytest_marker: 'tpu_only' + pytest_addopts: 'tests/post_training/unit' + 
xla_python_client_mem_fraction: 0.75 + tf_force_gpu_allow_growth: false + container_resource_option: "--privileged" + is_scheduled_run: ${{ github.event_name == 'schedule' }} + extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt' + maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} + maxtext_gpu_integration_tests: needs: build_and_upload_maxtext_package if: needs.doc_only_check.outputs.run_tests == 'true' @@ -257,7 +301,7 @@ jobs: device_name: a100-40gb-4 image_type: ${{ matrix.image_type }} cloud_runner: linux-x86-a2-48-a100-4gpu - pytest_marker: 'not cpu_only and not tpu_only and integration_test' + pytest_marker: 'not cpu_only and not tpu_only and integration_test and not post_training' xla_python_client_mem_fraction: 0.65 tf_force_gpu_allow_growth: true container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" @@ -266,7 +310,7 @@ jobs: all_tests_passed: name: All Required Tests Passed - needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests] + needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests] if: always() runs-on: ubuntu-latest steps: @@ -287,6 +331,8 @@ jobs: echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}" echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}" echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}" + echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}" + echo 
"Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}" # Fail only if any job failed or was cancelled (skipped is OK) if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then @@ -305,6 +351,8 @@ jobs: NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }} NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }} NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }} + NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_cpu_unit_tests.result }} + NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_tpu_unit_tests.result }} all_notebooks_passed: name: All Notebooks Passed @@ -337,7 +385,7 @@ jobs: notify_failure: name: Notify failed build # creates an issue or modifies last open existing issue for failed build - needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests] + needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests] if: ${{ always() }} runs-on: ubuntu-latest permissions: diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml index 7211202821..aba1000056 100644 --- a/.github/workflows/run_tests_against_package.yml +++ b/.github/workflows/run_tests_against_package.yml @@ -61,6 +61,10 @@ on: maxtext_sha: required: true type: string + extra_pip_deps_file: + required: false + type: string + 
default: '' permissions: contents: read @@ -96,6 +100,15 @@ jobs: python3 --version python3 -m pip freeze uv pip install pytest-cov + - name: Install extra pip deps + if: inputs.extra_pip_deps_file != '' + shell: bash + env: + # Hand the input to the script through the environment so it is never expanded into the shell source. + EXTRA_PIP_DEPS_FILE: ${{ inputs.extra_pip_deps_file }} + run: | + source .venv/bin/activate + uv pip install -r "${EXTRA_PIP_DEPS_FILE}" - name: Copy test assets files run : gcloud storage cp gs://maxtext-test-assets/* tests/assets - name: Run Tests diff --git a/pytest.ini b/pytest.ini index a2789e273b..185a0157c9 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,7 +5,7 @@ testpaths = python_files = *_test.py *_tests.py addopts = -rf --import-mode=importlib --strict-markers - --ignore=tests/integration/grpo_trainer_correctness_test.py + --ignore=tests/post_training/integration/grpo_trainer_correctness_test.py --ignore=tests/integration/smoke/train_gpu_smoke_test.py --ignore=tests/integration/smoke/train_int8_smoke_test.py --ignore=tests/integration/smoke/train_smoke_test.py @@ -36,4 +36,5 @@ markers = e.g., end_to_end tests external_serving: JetStream / serving / decode server components external_training: goodput integrations + post_training: marks tests for post-training code paths. 
diff --git a/src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt b/src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt new file mode 100644 index 0000000000..5b4b9ce474 --- /dev/null +++ b/src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt @@ -0,0 +1 @@ +google-tunix @ https://github.com/google/tunix/archive/336d102fe32ca0edbe42a8f66ff0fd533cebdf52.zip diff --git a/src/install_maxtext_extra_deps/extra_post_train_deps_from_github.txt b/src/install_maxtext_extra_deps/extra_post_train_deps_from_github.txt index 820ec1235c..b990ed71d7 100644 --- a/src/install_maxtext_extra_deps/extra_post_train_deps_from_github.txt +++ b/src/install_maxtext_extra_deps/extra_post_train_deps_from_github.txt @@ -1,5 +1,5 @@ +-r extra_post_train_base_deps_from_github.txt google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip -google-tunix @ https://github.com/google/tunix/archive/336d102fe32ca0edbe42a8f66ff0fd533cebdf52.zip mlperf-logging @ https://github.com/mlcommons/logging/archive/38ab22670527888c8eb7825a4ece176fcc36a95d.zip tpu-inference @ https://github.com/vllm-project/tpu-inference/archive/0cae84fc9a883ba1bde02d4f07930e6af9e92958.zip vllm @ git+https://github.com/vllm-project/vllm@ee8a29511fc69e3f0f6291fa6ff1cf6e47f7750d diff --git a/tests/integration/grpo_correctness.py b/tests/post_training/integration/grpo_correctness.py similarity index 99% rename from tests/integration/grpo_correctness.py rename to tests/post_training/integration/grpo_correctness.py index bd97b2f319..44a3e28df7 100644 --- a/tests/integration/grpo_correctness.py +++ b/tests/post_training/integration/grpo_correctness.py @@ -35,7 +35,7 @@ from trl import GRPOConfig, GRPOTrainer -pytestmark = [pytest.mark.external_training] # uses pre-generated checkpoint +pytestmark = [pytest.mark.external_training, pytest.mark.post_training] # uses pre-generated checkpoint class 
GRPOTest(unittest.TestCase): diff --git a/tests/integration/grpo_trainer_correctness_test.py b/tests/post_training/integration/grpo_trainer_correctness_test.py similarity index 98% rename from tests/integration/grpo_trainer_correctness_test.py rename to tests/post_training/integration/grpo_trainer_correctness_test.py index d73c510d2f..5994ab4b31 100644 --- a/tests/integration/grpo_trainer_correctness_test.py +++ b/tests/post_training/integration/grpo_trainer_correctness_test.py @@ -22,7 +22,7 @@ from maxtext/tests/assets/logits_generation/generate_grpo_golden_logits.py Usage: - pytest tests/integration/grpo_trainer_correctness_test.py + pytest tests/post_training/integration/grpo_trainer_correctness_test.py """ import os @@ -52,7 +52,7 @@ import transformers # This test is for serving pathways via offline_engine and maxengine. -pytestmark = [pytest.mark.external_training] +pytestmark = [pytest.mark.external_training, pytest.mark.post_training] def get_golden_data(config): diff --git a/tests/integration/sft_trainer_correctness_test.py b/tests/post_training/integration/sft_trainer_correctness_test.py similarity index 98% rename from tests/integration/sft_trainer_correctness_test.py rename to tests/post_training/integration/sft_trainer_correctness_test.py index 789cc0207d..9ed48a0492 100644 --- a/tests/integration/sft_trainer_correctness_test.py +++ b/tests/post_training/integration/sft_trainer_correctness_test.py @@ -21,7 +21,7 @@ Usage: - pytest tests/integration/sft_trainer_correctness_test.py + pytest tests/post_training/integration/sft_trainer_correctness_test.py """ import os.path @@ -46,6 +46,8 @@ import pytest from transformers import AutoTokenizer +pytestmark = [pytest.mark.post_training] + def get_golden_data(model_name): """Get the golden data for sft_trainer from maxtext/tests/assets/logits_generation/generate_sft_golden_data.py.""" diff --git a/tests/unit/distillation_checkpointing_test.py b/tests/post_training/unit/distillation_checkpointing_test.py 
similarity index 98% rename from tests/unit/distillation_checkpointing_test.py rename to tests/post_training/unit/distillation_checkpointing_test.py index 6511e777cd..8d4c8e8ce0 100644 --- a/tests/unit/distillation_checkpointing_test.py +++ b/tests/post_training/unit/distillation_checkpointing_test.py @@ -17,7 +17,7 @@ import pytest pytest.importorskip("tunix") -pytestmark = [pytest.mark.tpu_only] +pytestmark = [pytest.mark.tpu_only, pytest.mark.post_training] import json import os diff --git a/tests/unit/distillation_data_processing_test.py b/tests/post_training/unit/distillation_data_processing_test.py similarity index 98% rename from tests/unit/distillation_data_processing_test.py rename to tests/post_training/unit/distillation_data_processing_test.py index eb8fc96414..837f4eb259 100644 --- a/tests/unit/distillation_data_processing_test.py +++ b/tests/post_training/unit/distillation_data_processing_test.py @@ -14,6 +14,10 @@ """Data processing tests for distillation.""" +import pytest + +pytestmark = [pytest.mark.post_training, pytest.mark.cpu_only] + import argparse import os import subprocess diff --git a/tests/unit/rl_utils_test.py b/tests/post_training/unit/rl_utils_test.py similarity index 99% rename from tests/unit/rl_utils_test.py rename to tests/post_training/unit/rl_utils_test.py index 2fd04f93f6..c8aaa7dd83 100644 --- a/tests/unit/rl_utils_test.py +++ b/tests/post_training/unit/rl_utils_test.py @@ -18,6 +18,8 @@ import pytest from types import SimpleNamespace +pytestmark = [pytest.mark.post_training] + evaluate_rl = pytest.importorskip( "maxtext.trainers.post_train.rl.evaluate_rl", reason="tunix (required by evaluate_rl) is not installed GPU", diff --git a/tests/unit/sft_data_processing_test.py b/tests/post_training/unit/sft_data_processing_test.py similarity index 99% rename from tests/unit/sft_data_processing_test.py rename to tests/post_training/unit/sft_data_processing_test.py index bd8092c12f..1882399b5f 100644 --- 
a/tests/unit/sft_data_processing_test.py +++ b/tests/post_training/unit/sft_data_processing_test.py @@ -13,6 +13,10 @@ # limitations under the License. """Data processing tests for SFT.""" +import pytest + +pytestmark = [pytest.mark.post_training, pytest.mark.cpu_only] + import subprocess import unittest import os.path diff --git a/tests/unit/sft_hooks_test.py b/tests/post_training/unit/sft_hooks_test.py similarity index 99% rename from tests/unit/sft_hooks_test.py rename to tests/post_training/unit/sft_hooks_test.py index 91a498627a..cd83435577 100644 --- a/tests/unit/sft_hooks_test.py +++ b/tests/post_training/unit/sft_hooks_test.py @@ -16,7 +16,7 @@ import pytest pytest.importorskip("tunix") -pytestmark = [pytest.mark.tpu_only, pytest.mark.external_training] +pytestmark = [pytest.mark.tpu_only, pytest.mark.external_training, pytest.mark.post_training] import jax diff --git a/tests/unit/train_distill_test.py b/tests/post_training/unit/train_distill_test.py similarity index 99% rename from tests/unit/train_distill_test.py rename to tests/post_training/unit/train_distill_test.py index 6e84a914af..e059986162 100644 --- a/tests/unit/train_distill_test.py +++ b/tests/post_training/unit/train_distill_test.py @@ -18,7 +18,7 @@ import pytest pytest.importorskip("tunix") -pytestmark = [pytest.mark.tpu_only] +pytestmark = [pytest.mark.tpu_only, pytest.mark.post_training] import shutil import tempfile diff --git a/tests/unit/train_rl_test.py b/tests/post_training/unit/train_rl_test.py similarity index 99% rename from tests/unit/train_rl_test.py rename to tests/post_training/unit/train_rl_test.py index a6d730a9a2..af66d52a98 100644 --- a/tests/unit/train_rl_test.py +++ b/tests/post_training/unit/train_rl_test.py @@ -21,6 +21,8 @@ import jax +pytestmark = [pytest.mark.post_training] + # Same as in rl_utils_test.py. train_rl = pytest.importorskip( "maxtext.trainers.post_train.rl.train_rl",