diff --git a/.github/workflows/_gpu_4cards_case_test.yml b/.github/workflows/_gpu_4cards_case_test.yml
index 9b1455e8c06..e0e8b709623 100644
--- a/.github/workflows/_gpu_4cards_case_test.yml
+++ b/.github/workflows/_gpu_4cards_case_test.yml
@@ -198,6 +198,31 @@ jobs:
           python -m pip install ${fd_wheel_url} --no-deps --target=/workspace/FastDeploy
           export PYTHONPATH=/workspace/FastDeploy/
 
+          mkdir -p /workspace/FastDeploy/run_4_cards_tests_error_logs
+
           export CUDA_VISIBLE_DEVICES=0,1,2,3
-          bash scripts/run_gpu_4cards.sh
+          TEST_EXIT_CODE=0
+          bash scripts/run_gpu_4cards.sh || TEST_EXIT_CODE=8
+          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env
+
+          # Logs are copied only after a test fails, so open up permissions once the run has finished.
+          chmod -R a+rX /workspace/FastDeploy/run_4_cards_tests_error_logs/ 2>/dev/null || true
           '
+          if [ -f FastDeploy/exit_code.env ]; then
+            cat FastDeploy/exit_code.env >> $GITHUB_ENV
+            # GITHUB_ENV only takes effect in later steps; load the value into this shell too.
+            source FastDeploy/exit_code.env
+          fi
+
+          if [ "${TEST_EXIT_CODE:-0}" -eq 8 ]; then
+            exit 1
+          fi
+
+      - name: Upload run_4_cards_tests_error_logs
+        if: ${{ always() && env.TEST_EXIT_CODE == 8 }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: run_4_cards_tests_error_logs
+          path: FastDeploy/run_4_cards_tests_error_logs
+          retention-days: 7
+          if-no-files-found: ignore
diff --git a/scripts/run_gpu_4cards.sh b/scripts/run_gpu_4cards.sh
index 719ec19255c..22dfa52f715 100644
--- a/scripts/run_gpu_4cards.sh
+++ b/scripts/run_gpu_4cards.sh
@@ -39,6 +39,30 @@ for test_file in "${test_files[@]}"; do
         echo "${test_file}" >> "${FAILED_CASE_FILE}"
         FAILED_COUNT=$((FAILED_COUNT + 1))
 
+        # Save logs for failed test case
+        error_base_dir="${REPO_ROOT}/run_4_cards_tests_error_logs"
+        test_folder_name=$(echo "$test_file" | tr '/' '_' | sed 's/\.py$//')
+        error_log_dir="${error_base_dir}/${test_folder_name}"
+        mkdir -p "${error_log_dir}"
+
+        echo "Saving log* to ${error_log_dir}..."
+
+        # Copy all log* directories
+        for log_dir in "${REPO_ROOT}"/log*; do
+            if [ -d "${log_dir}" ]; then
+                cp -r "${log_dir}" "${error_log_dir}/" || true
+            fi
+        done
+
+        # Copy all *.log files
+        for log_file in "${REPO_ROOT}"/*.log; do
+            if [ -f "${log_file}" ]; then
+                cp "${log_file}" "${error_log_dir}/" || true
+            fi
+        done
+
+        echo "*.log saved to ${error_log_dir}"
+
         echo ""
         echo "==================== Dumping Logs ===================="
 
diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/__test_GLM_45_AIR_mtp_tp4.py
similarity index 100%
rename from tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
rename to tests/e2e/4cards_cases/__test_GLM_45_AIR_mtp_tp4.py
diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py b/tests/e2e/4cards_cases/__test_GLM_45_AIR_tp4.py
similarity index 100%
rename from tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
rename to tests/e2e/4cards_cases/__test_GLM_45_AIR_tp4.py
diff --git a/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py b/tests/e2e/4cards_cases/__test_Qwen3_30b_tp4.py
similarity index 100%
rename from tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py
rename to tests/e2e/4cards_cases/__test_Qwen3_30b_tp4.py
diff --git a/tests/e2e/4cards_cases/test_determinism_long.py b/tests/e2e/4cards_cases/__test_determinism_long.py
similarity index 100%
rename from tests/e2e/4cards_cases/test_determinism_long.py
rename to tests/e2e/4cards_cases/__test_determinism_long.py
diff --git a/tests/e2e/4cards_cases/test_ernie_21b_tp1_dp4_mtp.py b/tests/e2e/4cards_cases/__test_ernie_21b_tp1_dp4_mtp.py
similarity index 100%
rename from tests/e2e/4cards_cases/test_ernie_21b_tp1_dp4_mtp.py
rename to tests/e2e/4cards_cases/__test_ernie_21b_tp1_dp4_mtp.py
diff --git a/tests/e2e/4cards_cases/test_vocab_parallel_embedding_deterministic_launch.py b/tests/e2e/4cards_cases/__test_vocab_parallel_embedding_deterministic_launch.py
similarity index 100%
rename from tests/e2e/4cards_cases/test_vocab_parallel_embedding_deterministic_launch.py
rename to tests/e2e/4cards_cases/__test_vocab_parallel_embedding_deterministic_launch.py
diff --git a/tests/e2e/4cards_cases/test_ernie_21b_tp1_dp4.py b/tests/e2e/4cards_cases/test_ernie_21b_tp1_dp4.py
index 4fb178d4582..4fc2794f544 100644
--- a/tests/e2e/4cards_cases/test_ernie_21b_tp1_dp4.py
+++ b/tests/e2e/4cards_cases/test_ernie_21b_tp1_dp4.py
@@ -336,9 +336,9 @@ def test_text_diff(api_url):
 
     base_path = os.getenv("MODEL_PATH")
     if base_path:
-        base_file = os.path.join(base_path, "21b_tp1_dp4_text_baseline.txt")
+        base_file = os.path.join(base_path, "21b_tp1_dp4_text_baseline_debug.txt")
     else:
-        base_file = "21b_tp1_dp4_text_baseline.txt"
+        base_file = "21b_tp1_dp4_text_baseline_debug.txt"
 
     with open(base_file, "r", encoding="utf-8") as f:
         baseline = f.read()