diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index e58cff478..4ca9be53c 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -312,8 +312,15 @@ jobs:
           if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }}
 
       - name: Verify eval scores
-        if: ${{ (success() || failure()) && inputs.eval-only }}
-        run: python3 utils/evals/validate_scores.py
+        if: ${{ (success() || failure()) && (inputs.run-eval || inputs.eval-only) }}
+        run: |
+          expected_concs="${EVAL_CONC}"
+          if [[ -z "${expected_concs}" ]]; then
+            expected_concs="$(
+              printf '%s\n' "${CONC_LIST}" | tr ' ' '\n' | sort -n | tail -1
+            )"
+          fi
+          python3 utils/evals/validate_scores.py --expected-concs "${expected_concs}"
 
       - name: Cleanup eval outputs (post-upload)
         if: ${{ always() && (inputs.run-eval || inputs.eval-only) }}
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index a57e89725..d6d982bd5 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -302,8 +302,8 @@ jobs:
           if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }}
 
       - name: Verify eval scores
-        if: ${{ (success() || failure()) && inputs.eval-only }}
-        run: python3 utils/evals/validate_scores.py
+        if: ${{ (success() || failure()) && (inputs.run-eval || inputs.eval-only) }}
+        run: python3 utils/evals/validate_scores.py --expected-concs "${CONC}"
 
       - name: Cleanup eval outputs (post-upload)
         if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml
index 2aa917622..65be1552a 100644
--- a/.github/workflows/run-sweep.yml
+++ b/.github/workflows/run-sweep.yml
@@ -438,7 +438,8 @@ jobs:
             conc: ${{ matrix.config.conc }}
             spec-decoding: ${{ matrix.config.spec-decoding }}
             disagg: ${{ matrix.config.disagg }}
-            run-eval: ${{ matrix.config.run-eval }}
+            # Accuracy runs have their own eval-only matrices below.
+            run-eval: false
 
     sweep-single-node-8k1k:
         needs: [setup, canary-select, canary-sweep]
diff --git a/.github/workflows/test-changelog-gate.yml b/.github/workflows/test-changelog-gate.yml
index d46a6aaac..950053326 100644
--- a/.github/workflows/test-changelog-gate.yml
+++ b/.github/workflows/test-changelog-gate.yml
@@ -8,19 +8,22 @@ on:
     paths:
       - ".claude/commands/recover-failed-ingest.md"
       - ".github/workflows/benchmark-multinode-tmpl.yml"
+      - ".github/workflows/benchmark-tmpl.yml"
+      - ".github/workflows/collect-evals.yml"
       - ".github/workflows/e2e-tests.yml"
       - ".github/workflows/run-sweep.yml"
       - ".github/workflows/test-changelog-gate.yml"
       - "benchmarks/benchmark_lib.sh"
-      - "benchmarks/multi_node/amd_utils/job.slurm"
+      - "benchmarks/multi_node/amd_utils/**"
+      - "benchmarks/single_node/**"
+      - "runners/launch_*.sh"
       - "utils/find_reusable_sweep_run.py"
       - "utils/test_find_reusable_sweep_run.py"
       - "utils/process_changelog.py"
       - "utils/test_process_changelog.py"
       - "utils/collect_eval_results.py"
       - "utils/test_collect_eval_results.py"
-      - "utils/evals/validate_scores.py"
-      - "utils/evals/test_batched_eval.py"
+      - "utils/evals/**"
       - "utils/prepare_perf_changelog_merge.py"
       - "utils/recover_failed_ingest.py"
       - "utils/changelog_gate_tests/test_prepare_perf_changelog_merge.py"
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 95e063a3d..aed65d8b8 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -752,6 +752,7 @@ _stage_lm_eval_artifacts() {
     local results_dir="$1"
     local eval_conc="$2"
     local moved=0
+    local result_count=0
     local failed=0
     local jf base stem extension target suffix
 
@@ -788,6 +789,9 @@ _stage_lm_eval_artifacts() {
 
         if mv -f "$jf" "$target"; then
             moved=1
+            if [[ "$base" == results*.json ]]; then
+                result_count=$((result_count + 1))
+            fi
         else
             echo "WARN: failed to stage eval artifact ${jf}" >&2
             failed=1
@@ -805,6 +809,10 @@ _stage_lm_eval_artifacts() {
         echo "WARN: no eval artifacts were produced for concurrency ${eval_conc}" >&2
         return 1
     fi
+    if [ "$result_count" -eq 0 ]; then
+        echo "WARN: no results*.json artifact was produced for concurrency ${eval_conc}" >&2
+        return 1
+    fi
     return "$failed"
 }
 
@@ -834,6 +842,12 @@ append_lm_eval_summary() {
     local meta_json
     local metadata_conc="${CONC:-1}"
     local batch_metadata=""
+    local eval_exit_code="${EVAL_RUN_EXIT_CODE:-0}"
+
+    if [[ ! "$eval_exit_code" =~ ^[0-9]+$ ]]; then
+        echo "ERROR: invalid eval exit code '${eval_exit_code}'" >&2
+        return 1
+    fi
 
     if [ -n "$batch_concs" ]; then
         meta_json="./meta_env.json"
@@ -864,8 +878,13 @@ append_lm_eval_summary() {
         meta_json="${out_dir}/meta_env.json"
     fi
 
+    if [[ ! "$metadata_conc" =~ ^[1-9][0-9]*$ ]]; then
+        echo "ERROR: invalid eval metadata concurrency '${metadata_conc}'" >&2
+        return 1
+    fi
+
     # Write minimal meta for collectors that expect it
-    local model_name="${MODEL_NAME:-$MODEL}"
+    local model_name="${MODEL_NAME:-${MODEL:-}}"
     local is_multinode_json="false"
     if [ "${IS_MULTINODE:-false}" = "true" ]; then
         is_multinode_json="true"
@@ -899,7 +918,7 @@ append_lm_eval_summary() {
     local fw="${FRAMEWORK:-}"
     local prec="${PRECISION:-}"
     if [[ -z "$fw" || -z "$prec" ]]; then
-        if [[ -n "${RESULT_FILENAME}" ]]; then
+        if [[ -n "${RESULT_FILENAME:-}" ]]; then
             # Extract the two fields immediately before "_tp"
             # Handles arbitrary underscores in exp_name by matching from the end
             local parsed
@@ -914,12 +933,13 @@ append_lm_eval_summary() {
             fi
         fi
     fi
-    cat > "${meta_json}" <<META
+    if ! cat > "${meta_json}" <<META
 {
   "is_multinode": ${is_multinode_json},
   "framework": "${fw:-unknown}",
   "precision": "${prec:-unknown}",
-  "spec_decoding": "${SPEC_DECODING}",
+  "spec_decoding": "${SPEC_DECODING:-none}",
+  "eval_exit_code": ${eval_exit_code},
   "tp": ${TP:-1},
   "conc": ${metadata_conc},
 ${batch_metadata}  "ep": ${EP_SIZE:-1},
@@ -939,6 +959,16 @@ ${batch_metadata}  "ep": ${EP_SIZE:-1},
   "osl": "${OSL:-0}"
 }
 META
+    then
+        echo "ERROR: failed to write eval metadata to ${meta_json}" >&2
+        return 1
+    fi
+
+    if ! python3 -m json.tool "${meta_json}" >/dev/null; then  # keep parser stderr for diagnosis
+        echo "ERROR: generated invalid eval metadata at ${meta_json}" >&2
+        rm -f "${meta_json}" || true
+        return 1
+    fi
 
     if [ -n "$batch_concs" ]; then
         echo "Prepared batched eval artifacts in: $(pwd)"
@@ -946,26 +976,94 @@ META
     fi
 
     # Move eval artifacts into PWD (no new directories in workspace)
+    local artifact_rc=0
+    local result_count=0
+    local jf base
     if [ -f "${meta_json}" ]; then
-        mv -f "${meta_json}" ./ || echo "WARN: failed to move ${meta_json}" >&2
+        if ! mv -f "${meta_json}" ./; then
+            echo "ERROR: failed to move ${meta_json}" >&2
+            artifact_rc=1
+        fi
     fi
     if [ -d "${out_dir}" ]; then
         while IFS= read -r -d '' jf; do
             base=$(basename "$jf")
             if [ "$base" != "meta_env.json" ]; then
-                mv -f "$jf" ./ || echo "WARN: failed to move ${jf}" >&2
+                if mv -f "$jf" ./; then
+                    if [[ "$base" == results*.json ]]; then
+                        result_count=$((result_count + 1))
+                    fi
+                else
+                    echo "ERROR: failed to move ${jf}" >&2
+                    artifact_rc=1
+                fi
             fi
-        done < <(find "${out_dir}" -type f -name "*.json*" -print0 2>/dev/null)
+        done < <(
+            find "${out_dir}" -type f \
+                \( -name "*.json" -o -name "*.jsonl" \) -print0 2>/dev/null
+        )
+    fi
+    if [ "$result_count" -eq 0 ]; then
+        echo "ERROR: eval produced no results*.json artifact" >&2
+        artifact_rc=1
     fi
 
-    # Best-effort cleanup of the temp directory
-    if [ -n "${out_dir}" ] && [ -d "${out_dir}" ]; then
+    # Preserve the temp directory when staging fails so the caller can inspect it.
+    if [ "$artifact_rc" -eq 0 ] && [ -n "${out_dir}" ] && [ -d "${out_dir}" ]; then
         rm -rf --one-file-system "${out_dir}" || rm -rf "${out_dir}" || true
     fi
 
+    if [ "$artifact_rc" -ne 0 ]; then
+        echo "ERROR: eval artifact staging was incomplete" >&2
+        return "$artifact_rc"
+    fi
+
     echo "Moved eval artifacts to: $(pwd)"
 }
 
+# Copy staged lm-eval artifacts (meta_env.json, results*.json, sample*.jsonl)
+# from the top level of $1 into $2, creating $2 if needed. Returns non-zero
+# when any copy fails, meta_env.json is missing, or no results*.json is copied.
+_copy_lm_eval_artifacts() {
+    local src="$1" dest="$2"
+    local meta_file="${src}/meta_env.json"
+    local artifact rc=0 copied_results=0
+
+    if ! mkdir -p "$dest"; then
+        echo "ERROR: failed to create eval artifact directory ${dest}" >&2
+        return 1
+    fi
+
+    # Keep going after a failure so every available artifact is still copied.
+    if [ ! -f "$meta_file" ]; then
+        echo "ERROR: missing ${meta_file}" >&2
+        rc=1
+    elif ! cp -f "$meta_file" "$dest/"; then
+        echo "ERROR: failed to copy ${meta_file}" >&2
+        rc=1
+    fi
+
+    while IFS= read -r -d '' artifact; do
+        if ! cp -f "$artifact" "$dest/"; then
+            echo "ERROR: failed to copy eval artifact ${artifact}" >&2
+            rc=1
+            continue
+        fi
+        case "${artifact##*/}" in
+            results*.json) copied_results=$((copied_results + 1)) ;;
+        esac
+    done < <(
+        find "$src" -maxdepth 1 -type f \
+            \( -name "results*.json" -o -name "sample*.jsonl" \) -print0 2>/dev/null
+    )
+
+    if [ "$copied_results" -eq 0 ]; then
+        echo "ERROR: no results*.json artifacts found in ${src}" >&2
+        rc=1
+    fi
+    return "$rc"
+}
+
 # ------------------------------
 # Unified eval entrypoint
 # ------------------------------
@@ -989,29 +1087,46 @@ run_eval() {
     unset EVAL_BATCHED_CONCS
     unset EVAL_BATCHED_COMPLETED_CONCS
     unset EVAL_BATCHED_FAILED_CONCS
+    export EVAL_RUN_EXIT_CODE=0
 
     local requested_concs="${EVAL_CONCURRENT_REQUESTS:-}"
+    if [ -z "$requested_concs" ] && [ -n "${CONC:-}" ]; then
+        requested_concs="$CONC"
+    fi
+    requested_concs="${requested_concs//$'\n'/ }"
+
     local eval_concs=()
     if [ -n "$requested_concs" ]; then
         read -r -a eval_concs <<< "$requested_concs"
     fi
 
+    local eval_conc seen_concs=" "
+    for eval_conc in "${eval_concs[@]}"; do
+        if [[ ! "$eval_conc" =~ ^[1-9][0-9]*$ ]]; then
+            echo "ERROR: invalid eval concurrency '${eval_conc}'" >&2
+            export EVAL_RUN_EXIT_CODE=2
+            return 2
+        fi
+        if [[ "$seen_concs" == *" ${eval_conc} "* ]]; then
+            echo "ERROR: duplicate eval concurrency '${eval_conc}'" >&2
+            export EVAL_RUN_EXIT_CODE=2
+            return 2
+        fi
+        seen_concs="${seen_concs}${eval_conc} "
+    done
+
     if [ "${#eval_concs[@]}" -gt 1 ]; then
         if [[ "$framework" != "lm-eval" && "$framework" != "lm_eval" ]]; then
             echo "ERROR: batched eval concurrency is only supported for lm-eval" >&2
-            return 1
+            export EVAL_RUN_EXIT_CODE=2
+            return 2  # match the exported exit code, like the other validation failures
         fi
 
-        local eval_conc results_dir eval_rc stage_rc
+        local results_dir eval_rc stage_rc
         local completed_concs=()
         local failed_concs=()
 
         for eval_conc in "${eval_concs[@]}"; do
-            if [[ ! "$eval_conc" =~ ^[1-9][0-9]*$ ]]; then
-                echo "ERROR: invalid eval concurrency '${eval_conc}'" >&2
-                return 1
-            fi
-
             if ! results_dir=$(mktemp -d /tmp/eval_out-conc"${eval_conc}"-XXXXXX); then
                 echo "ERROR: failed to create eval output directory for concurrency ${eval_conc}" >&2
                 failed_concs+=("$eval_conc")
@@ -1043,8 +1158,13 @@ run_eval() {
         export EVAL_BATCHED_FAILED_CONCS="${failed_concs[*]}"
 
         if [ "${#failed_concs[@]}" -gt 0 ]; then
+            export EVAL_RUN_EXIT_CODE=1
             echo "ERROR: batched eval failed for concurrency: ${failed_concs[*]}" >&2
-            echo "Deferring failure until post-upload score validation preserves all artifacts" >&2
+            if [ "${EVAL_ONLY:-false}" = "true" ]; then
+                echo "Deferring failure until post-upload score validation preserves all artifacts" >&2
+                return 0
+            fi
+            return 1
         fi
         return 0
     fi
@@ -1055,11 +1175,12 @@ run_eval() {
         *)               echo "Unknown framework '${framework}'"; eval_rc=1 ;;
     esac
 
+    export EVAL_RUN_EXIT_CODE="$eval_rc"
     if [ "$eval_rc" -ne 0 ]; then
         echo "ERROR: run_eval failed with exit code $eval_rc" >&2
-        if [ "${EVAL_ONLY}" = "true" ]; then
-            echo "Eval-only mode: failing after artifact collection" >&2
-            return "$eval_rc"
+        if [ "${EVAL_ONLY:-false}" = "true" ]; then
+            echo "Deferring failure until post-upload score validation preserves artifacts" >&2
+            return 0
         fi
     fi
     return $eval_rc
diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh
index 52f81b7d6..3beef4111 100644
--- a/benchmarks/multi_node/amd_utils/env_atom.sh
+++ b/benchmarks/multi_node/amd_utils/env_atom.sh
@@ -34,7 +34,10 @@ export IBDEVICES
 
 export SAFETENSORS_FAST_GPU=1
 export VLLM_LOG_LEVEL=WARNING
-export ATOM_LOG_LEVEL=WARNING
+export ATOM_LOG_LEVEL="${ATOM_LOG_LEVEL:-WARNING}"
+export ATOM_UVICORN_LOG_LEVEL="${ATOM_UVICORN_LOG_LEVEL:-warning}"
+export ATOM_UVICORN_ACCESS_LOG="${ATOM_UVICORN_ACCESS_LOG:-0}"
+export ATOMESH_LOG_LEVEL="${ATOMESH_LOG_LEVEL:-warn}"
 export AITER_LOG_LEVEL=WARNING
 export LOG_LEVEL=WARNING
 export LOGLEVEL=WARNING
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 17f5b4f54..bb55eb847 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -304,6 +304,11 @@ export ENGINE=$ENGINE
 export RUN_EVAL="${RUN_EVAL:-false}"
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 export EVAL_CONC="${EVAL_CONC:-}"
+export EVAL_FRAMEWORK="${EVAL_FRAMEWORK:-}"
+export EVAL_TASKS_DIR="${EVAL_TASKS_DIR:-}"
+export EVAL_MAX_MODEL_LEN="${EVAL_MAX_MODEL_LEN:-}"
+export MODEL="${MODEL:-}"
+export MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
 export FRAMEWORK="${FRAMEWORK:-}"
 export PRECISION="${PRECISION:-}"
 export MODEL_PREFIX="${MODEL_PREFIX:-}"
@@ -354,6 +359,9 @@ DOCKER_ENV_COMMON=(
     -e NODE0_ADDR=\$NODE0_ADDR
     -e MODEL_DIR=/models
     -e MODEL_NAME=\$MODEL_NAME
+    -e MODEL=\$MODEL
+    -e MODEL_PATH=$DOCKER_MODEL_PATH
+    -e MAX_MODEL_LEN=\$MAX_MODEL_LEN
     -e GPUS_PER_NODE=\$GPUS_PER_NODE
     -e xP=\$xP
     -e yD=\$yD
@@ -370,7 +378,9 @@ DOCKER_ENV_COMMON=(
     -e WS_PATH=${WS_PATH}
     -e RUN_EVAL=\$RUN_EVAL
     -e EVAL_ONLY=\$EVAL_ONLY
-    -e EVAL_CONC
+    -e EVAL_FRAMEWORK=\$EVAL_FRAMEWORK
+    -e EVAL_TASKS_DIR=\$EVAL_TASKS_DIR
+    -e EVAL_MAX_MODEL_LEN=\$EVAL_MAX_MODEL_LEN
     -e FRAMEWORK=\$FRAMEWORK
     -e PRECISION=\$PRECISION
     -e MODEL_PREFIX=\$MODEL_PREFIX
@@ -417,6 +427,10 @@ elif [[ "$ENGINE" == "atom-disagg" ]]; then
         -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
         -e EXTRA_SERVER_ARGS=\${EXTRA_SERVER_ARGS:-}
         -e IBDEVICES=${IBDEVICES:-}
+        -e ATOM_LOG_LEVEL=\${ATOM_LOG_LEVEL:-WARNING}
+        -e ATOM_UVICORN_LOG_LEVEL=\${ATOM_UVICORN_LOG_LEVEL:-warning}
+        -e ATOM_UVICORN_ACCESS_LOG=\${ATOM_UVICORN_ACCESS_LOG:-0}
+        -e ATOMESH_LOG_LEVEL=\${ATOMESH_LOG_LEVEL:-warn}
     )
 else
     DOCKER_ENV_ENGINE=(
@@ -585,6 +599,7 @@ fi
     -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
     ${EXTRA_DOCKER_MOUNTS:-} \
     \${RDMA_MOUNTS[@]+"\${RDMA_MOUNTS[@]}"} \
+    -e \"EVAL_CONC=\$EVAL_CONC\" \
     ${DOCKER_ENV_COMMON[*]} \
     ${DOCKER_ENV_ENGINE[*]} \
     --name \"$DOCKER_CONT_NAME\" \
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 957c84d60..601c6de78 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -29,11 +29,11 @@ IPADDRS="${IPADDRS:-localhost}"
 
 # Parallelism
 PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
-PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}"
-PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}"
+PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}"
+PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}"
 DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
-DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
-DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
+DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}"
+DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}"
 
 # ATOM server ports (different from SGLang which uses 8000 for all)
 PREFILL_PORT="${PREFILL_PORT:-8010}"
@@ -63,8 +63,14 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
 # Dependencies and Environment Setup
 # =============================================================================
 
-source $ATOM_WS_PATH/setup_deps.sh
-source $ATOM_WS_PATH/env_atom.sh
+if ! source "$ATOM_WS_PATH/setup_deps.sh"; then
+    echo "ERROR: failed to initialize ATOM dependencies" >&2
+    exit 1
+fi
+if ! source "$ATOM_WS_PATH/env_atom.sh"; then
+    echo "ERROR: failed to initialize ATOM environment" >&2
+    exit 1
+fi
 
 host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}')
 if [[ -z "$host_ip" ]]; then
@@ -103,28 +109,21 @@ done
 echo "Prefill IPs : ${PREFILL_IPS[*]}"
 echo "Decode  IPs : ${DECODE_IPS[*]}"
 
-PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}"
-PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}"
-DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
-DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
-
 PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
-if [ "$PREFILL_ENABLE_DP" = "true" ]; then
-    if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP
-        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
-    else #DPA+TP
-        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
-    fi
-fi 
-
-DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
-if [ "$DECODE_ENABLE_DP" = "true" ]; then
-    if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP
-        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
-    else #DPA+TP
-        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
-    fi
-fi 
+if [[ "$PREFILL_ENABLE_EP" == "true" ]]; then
+    PREFILL_PARALLEL_ARGS+=(--enable-expert-parallel)
+fi
+if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then
+    PREFILL_PARALLEL_ARGS+=(--enable-dp-attention)
+fi
+
+DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP
+if [[ "$DECODE_ENABLE_EP" == "true" ]]; then
+    DECODE_PARALLEL_ARGS+=(--enable-expert-parallel)
+fi
+if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
+    DECODE_PARALLEL_ARGS+=(--enable-dp-attention)
+fi
 
 echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}"
 echo "Decode  Parallel args : ${DECODE_PARALLEL_ARGS[*]}"
@@ -213,7 +212,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         ${DECODE_ARGS} \
         --policy random \
         --backend atom \
-        --log-level info \
+        --log-level ${ATOMESH_LOG_LEVEL} \
         --disable-health-check \
         --disable-circuit-breaker \
         --prometheus-port 29100"
@@ -303,26 +302,33 @@ if [ "$NODE_RANK" -eq 0 ]; then
                 else
                     export TP="${PREFILL_TP_SIZE}"
                     export CONC="${EVAL_CONCURRENT_REQUESTS}"
+                    export EP_SIZE=1
+                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}"
                     export PREFILL_TP="${PREFILL_TP_SIZE}"
                     export PREFILL_EP=1
+                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}"
                     export PREFILL_NUM_WORKERS="${xP}"
                     export DECODE_TP="${DECODE_TP_SIZE}"
                     export DECODE_EP=1
+                    [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}"
                     export DECODE_NUM_WORKERS="${yD}"
+                    export DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                    export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                    export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"
                     export ISL="${BENCH_INPUT_LEN}"
                     export OSL="${BENCH_OUTPUT_LEN}"
 
-                    MODEL_NAME="${MODEL_DIR}/${MODEL_NAME}" append_lm_eval_summary
-
                     EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
-                    mkdir -p "$EVAL_COPY_DIR"
-                    for f in meta_env.json; do
-                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
-                    done
-                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-
-                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                    if ! MODEL_NAME="${MODEL_DIR}/${MODEL_NAME}" append_lm_eval_summary; then
+                        echo "ERROR: failed to finalize eval artifacts" >&2
+                        EVAL_FAILED=1
+                    fi
+                    if ! _copy_lm_eval_artifacts /workspace "$EVAL_COPY_DIR"; then
+                        echo "ERROR: failed to stage eval artifacts in $EVAL_COPY_DIR" >&2
+                        EVAL_FAILED=1
+                    else
+                        echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                    fi
                 fi
             fi
 
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index c28ccab41..0f7fd2fc3 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -597,20 +597,17 @@ if [ "$NODE_RANK" -eq 0 ]; then
                     # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE,
                     # RESULT_FILENAME are already set via Docker -e flags from job.slurm
 
-                    append_lm_eval_summary
-                    # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace
-
-                    # Copy eval artifacts to run_logs for NFS extraction by runner
                     EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
-                    mkdir -p "$EVAL_COPY_DIR"
-                    for f in meta_env.json; do
-                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
-                    done
-                    # Use find for glob patterns to avoid "no match" errors
-                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-
-                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                    if ! append_lm_eval_summary; then
+                        echo "ERROR: failed to finalize eval artifacts" >&2
+                        EVAL_FAILED=1
+                    fi
+                    if ! _copy_lm_eval_artifacts /workspace "$EVAL_COPY_DIR"; then
+                        echo "ERROR: failed to stage eval artifacts in $EVAL_COPY_DIR" >&2
+                        EVAL_FAILED=1
+                    else
+                        echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                    fi
                 fi
             fi
 
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
index d61fe0359..d8871a17e 100755
--- a/benchmarks/multi_node/amd_utils/server_vllm.sh
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -372,17 +372,17 @@ if [ "$NODE_RANK" -eq 0 ]; then
                     export ISL="${BENCH_INPUT_LEN}"
                     export OSL="${BENCH_OUTPUT_LEN}"
 
-                    append_lm_eval_summary
-
                     EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
-                    mkdir -p "$EVAL_COPY_DIR"
-                    for f in meta_env.json; do
-                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
-                    done
-                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-
-                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                    if ! append_lm_eval_summary; then
+                        echo "ERROR: failed to finalize eval artifacts" >&2
+                        EVAL_FAILED=1
+                    fi
+                    if ! _copy_lm_eval_artifacts /workspace "$EVAL_COPY_DIR"; then
+                        echo "ERROR: failed to stage eval artifacts in $EVAL_COPY_DIR" >&2
+                        EVAL_FAILED=1
+                    else
+                        echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                    fi
                 fi
             fi
 
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
index add2e3fa5..b094954f8 100644
--- a/benchmarks/multi_node/amd_utils/setup_deps.sh
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -7,9 +7,10 @@
 #                    (base image: vllm/vllm-openai-rocm:v0.18.0)
 #   sglang-disagg -> SGLang aiter gluon patch + per-model installs
 #                    (base image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-*)
+#   atom-disagg   -> ATOM logging controls + shared aiter/model installs
 #
-# Sourced by server_vllm.sh and server_sglang.sh so PATH / LD_LIBRARY_PATH
-# exports persist. Each patch is idempotent: skipped if already applied.
+# Sourced by the engine server launchers so PATH / LD_LIBRARY_PATH exports
+# persist. Each patch is idempotent: skipped if already applied.
 #
 # Build steps run in subshells to avoid CWD pollution between installers.
 # =============================================================================
@@ -79,6 +80,284 @@ install_amd_quark() {
     _SETUP_INSTALLED+=("amd-quark")
 }
 
+# ---------------------------------------------------------------------------
+# 7. Make the pinned ATOM image honor InferenceX logging environment variables.
+#
+# ATOM's Python logger hardcodes its console handler to INFO, and its
+# programmatic uvicorn.run() call enables INFO/access logs without CLI flags.
+# Engine workers use multiprocessing "spawn", so changing only the parent
+# process logger does not affect them. Upstream ATOM does not currently expose
+# these controls. Patch the installed package once so all processes read the
+# inherited environment:
+#   ATOM_LOG_LEVEL              (default WARNING)
+#   ATOM_UVICORN_LOG_LEVEL      (default warning)
+#   ATOM_UVICORN_ACCESS_LOG     (default 0)
+# atomesh has a native --log-level flag and does not need patching.
+#
+# This patch fails closed: after editing, it executes the patched getLogger()
+# function with a root INFO handler installed and verifies that ATOM INFO is
+# suppressed, WARNING is emitted exactly once, and propagation is disabled. It
+# also executes the patched uvicorn.run() expression with a fake server and
+# verifies the effective level/access-log arguments.
+# ---------------------------------------------------------------------------
+patch_atom_logging_controls() {
+    if ! python3 - <<'PY'
+import ast
+import contextlib
+import importlib.util
+import io
+import logging
+import os
+import sys
+import types
+from pathlib import Path
+
+# Locate the installed ATOM package on disk.
+spec = importlib.util.find_spec("atom")
+if spec is None or not spec.submodule_search_locations:
+    raise RuntimeError("could not locate the installed ATOM package")
+
+atom_dir = Path(next(iter(spec.submodule_search_locations)))
+updates: dict[Path, str] = {}  # path -> rewritten source awaiting write
+
+logger_path = atom_dir / "utils" / "__init__.py"
+logger_src = logger_path.read_text()
+logger_marker = "# InferenceX compatibility: make logging environment-driven."
+if logger_marker not in logger_src:
+    old_logger_level = "        logger.setLevel(logging.DEBUG)"
+    old_handler_level = "        console_handler.setLevel(logging.INFO)"
+    if logger_src.count(old_logger_level) != 1:
+        raise RuntimeError(
+            f"{logger_path}: expected one hardcoded logger DEBUG level"
+        )
+    if logger_src.count(old_handler_level) != 1:
+        raise RuntimeError(
+            f"{logger_path}: expected one hardcoded handler INFO level"
+        )
+    # The injected code resolves os via __import__ so it does not depend on
+    # the target module importing os (same approach as the uvicorn patch).
+    new_logger_level = """        # InferenceX compatibility: make logging environment-driven.
+        _atom_log_level = getattr(
+            logging,
+            __import__("os").getenv("ATOM_LOG_LEVEL", "WARNING").strip().upper(),
+            logging.WARNING,
+        )
+        logger.setLevel(_atom_log_level)
+        # Prevent another framework's root handler from re-emitting ATOM INFO.
+        logger.propagate = False"""
+    new_handler_level = "        console_handler.setLevel(_atom_log_level)"
+    logger_src = logger_src.replace(old_logger_level, new_logger_level, 1)
+    logger_src = logger_src.replace(old_handler_level, new_handler_level, 1)
+    updates[logger_path] = logger_src
+
+api_path = atom_dir / "entrypoints" / "openai" / "api_server.py"
+api_src = api_path.read_text()
+if "ATOM_UVICORN_ACCESS_LOG" not in api_src:
+    old = "    uvicorn.run(app, host=args.host, port=args.server_port)"
+    if api_src.count(old) != 1:
+        raise RuntimeError(f"{api_path}: expected one uvicorn.run() call")
+    new = """    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.server_port,
+        log_level=__import__("os").getenv(
+            "ATOM_UVICORN_LOG_LEVEL", "warning"
+        ).strip().lower(),
+        access_log=__import__("os").getenv(
+            "ATOM_UVICORN_ACCESS_LOG", "0"
+        ).strip().lower() in {"1", "true", "yes", "on"},
+    )"""
+    api_src = api_src.replace(old, new, 1)
+    updates[api_path] = api_src
+
+# Compile every rewritten file before writing any of them, so a syntax error
+# in one patch cannot leave the package half-patched.
+for path, contents in updates.items():
+    compile(contents, str(path), "exec")
+for path, contents in updates.items():
+    path.write_text(contents)
+
+logger_src = logger_path.read_text()
+api_src = api_path.read_text()
+
+for required in (
+    logger_marker,
+    'getenv("ATOM_LOG_LEVEL", "WARNING")',  # matches os.getenv and __import__ forms
+    "logger.setLevel(_atom_log_level)",
+    "console_handler.setLevel(_atom_log_level)",
+    "logger.propagate = False",
+):
+    if required not in logger_src:
+        raise RuntimeError(f"{logger_path}: missing logging control: {required}")
+
+for required in (
+    '"ATOM_UVICORN_LOG_LEVEL", "warning"',
+    '"ATOM_UVICORN_ACCESS_LOG", "0"',
+    "access_log=",
+):
+    if required not in api_src:
+        raise RuntimeError(f"{api_path}: missing logging control: {required}")
+
+
+def verify_logger_behavior() -> None:  # Behavior-test the patched getLogger().
+    tree = ast.parse(logger_src, filename=str(logger_path))
+    functions = [
+        node
+        for node in tree.body  # module-level definitions only
+        if isinstance(node, ast.FunctionDef) and node.name == "getLogger"
+    ]
+    if len(functions) != 1:
+        raise RuntimeError(f"{logger_path}: expected one getLogger() function")
+    # Register stand-in atom modules (presumably imported by getLogger; confirm).
+    envs_module = types.ModuleType("atom.utils.envs")
+    envs_module.ATOM_LOG_MORE = False
+    atom_module = types.ModuleType("atom")
+    atom_module.__path__ = []
+    utils_module = types.ModuleType("atom.utils")
+    utils_module.__path__ = []
+    utils_module.envs = envs_module
+    module_backups = {
+        name: sys.modules.get(name)
+        for name in ("atom", "atom.utils", "atom.utils.envs")
+    }
+    sys.modules["atom"] = atom_module
+    sys.modules["atom.utils"] = utils_module
+    sys.modules["atom.utils.envs"] = envs_module
+    # Snapshot "atom" logger state; attach a capturing handler to the root logger.
+    atom_logger = logging.getLogger("atom")
+    old_handlers = atom_logger.handlers[:]
+    old_level = atom_logger.level
+    old_propagate = atom_logger.propagate
+    root_logger = logging.getLogger()
+    root_stream = io.StringIO()
+    root_handler = logging.StreamHandler(root_stream)
+    old_root_level = root_logger.level
+    root_logger.addHandler(root_handler)
+    root_logger.setLevel(logging.DEBUG)
+    # Pin ATOM_LOG_LEVEL to WARNING and start from a handler-free "atom" logger.
+    old_atom_level = os.environ.get("ATOM_LOG_LEVEL")
+    os.environ["ATOM_LOG_LEVEL"] = "WARNING"
+    atom_logger.handlers.clear()
+    try:
+        namespace = {
+            "logger": atom_logger,
+            "logging": logging,
+            "os": os,
+            "torch": types.SimpleNamespace(  # stand-in exposing only _dynamo.config
+                _dynamo=types.SimpleNamespace(config=types.SimpleNamespace())
+            ),
+        }
+        function_module = ast.Module(body=functions, type_ignores=[])  # getLogger only
+        ast.fix_missing_locations(function_module)
+        exec(compile(function_module, str(logger_path), "exec"), namespace)
+        # Send stderr and the logger's stream handlers to one buffer, then log.
+        atom_stream = io.StringIO()
+        with contextlib.redirect_stderr(atom_stream):
+            configured_logger = namespace["getLogger"]()
+            for handler in configured_logger.handlers:
+                if isinstance(handler, logging.StreamHandler):
+                    handler.setStream(atom_stream)
+            configured_logger.info("INFERENCEX_HIDDEN_ATOM_INFO")
+            configured_logger.warning("INFERENCEX_VISIBLE_ATOM_WARNING")
+        # INFO must be absent, WARNING present once, and the root stream empty.
+        output = atom_stream.getvalue()
+        if "INFERENCEX_HIDDEN_ATOM_INFO" in output:
+            raise RuntimeError("ATOM INFO logging is still enabled")
+        if output.count("INFERENCEX_VISIBLE_ATOM_WARNING") != 1:
+            raise RuntimeError("ATOM WARNING was not emitted exactly once")
+        if root_stream.getvalue():
+            raise RuntimeError("ATOM logs still propagate to the root logger")
+        if configured_logger.getEffectiveLevel() != logging.WARNING:
+            raise RuntimeError("ATOM logger effective level is not WARNING")
+        if configured_logger.propagate:
+            raise RuntimeError("ATOM logger propagation is still enabled")
+    finally:  # restore the loggers, the environment and sys.modules
+        atom_logger.handlers[:] = old_handlers
+        atom_logger.setLevel(old_level)
+        atom_logger.propagate = old_propagate
+        root_logger.removeHandler(root_handler)
+        root_logger.setLevel(old_root_level)
+        if old_atom_level is None:
+            os.environ.pop("ATOM_LOG_LEVEL", None)
+        else:
+            os.environ["ATOM_LOG_LEVEL"] = old_atom_level
+        for name, module in module_backups.items():
+            if module is None:
+                sys.modules.pop(name, None)
+            else:
+                sys.modules[name] = module
+
+
+def verify_uvicorn_behavior() -> None:
+    """Evaluate the patched uvicorn.run() call against a recording stub.
+
+    Raises RuntimeError unless api_src has one uvicorn.run() call that passes
+    log_level="warning" and access_log=False under the pinned environment.
+    """
+    def is_uvicorn_run(node: ast.AST) -> bool:
+        if not isinstance(node, ast.Call):
+            return False
+        target = node.func
+        return (
+            isinstance(target, ast.Attribute)
+            and isinstance(target.value, ast.Name)
+            and target.value.id == "uvicorn"
+            and target.attr == "run"
+        )
+
+    module_ast = ast.parse(api_src, filename=str(api_path))
+    run_calls = [node for node in ast.walk(module_ast) if is_uvicorn_run(node)]
+    if len(run_calls) != 1:
+        raise RuntimeError(f"{api_path}: expected one uvicorn.run() call")
+
+    recorded: dict[str, object] = {}
+
+    def fake_run(*args, **kwargs):
+        recorded["args"] = args
+        recorded["kwargs"] = kwargs
+
+    pinned_env = {"ATOM_UVICORN_LOG_LEVEL": "warning", "ATOM_UVICORN_ACCESS_LOG": "0"}
+    saved_env = {key: os.environ.get(key) for key in pinned_env}
+    os.environ.update(pinned_env)
+    try:
+        call_expr = ast.Expression(body=run_calls[0])
+        ast.fix_missing_locations(call_expr)
+        eval_globals = {
+            "app": object(),
+            "args": types.SimpleNamespace(host="0.0.0.0", server_port=8000),
+            "uvicorn": types.SimpleNamespace(run=fake_run),
+        }
+        eval(compile(call_expr, str(api_path), "eval"), eval_globals)
+        seen_kwargs = recorded.get("kwargs")
+        if not isinstance(seen_kwargs, dict):
+            raise RuntimeError("patched uvicorn.run() was not executed")
+        if seen_kwargs.get("log_level") != "warning":
+            raise RuntimeError("Uvicorn log level is not warning")
+        if seen_kwargs.get("access_log") is not False:
+            raise RuntimeError("Uvicorn access logging is still enabled")
+    finally:
+        for key, previous in saved_env.items():
+            if previous is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = previous
+
+
+verify_logger_behavior()
+verify_uvicorn_behavior()
+
+action = "Patched" if updates else "Verified"
+print(
+    f"[SETUP] {action} ATOM logging controls: "
+    "engine=WARNING propagation=off uvicorn=warning access_log=off"
+)
+PY
+    then
+        return 1
+    fi
+    _SETUP_INSTALLED+=("ATOM-logging-controls")
+}
+
 # ---------------------------------------------------------------------------
 # 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock)
 #    In WRITE mode, save_kv_layer spins forever waiting for the handshake
@@ -739,6 +1018,16 @@ install_transformers_glm5() {
 # Run installers (engine-gated)
 # =============================================================================
 
+if [[ "$ENGINE" == "atom-disagg" ]]; then
+    if ! patch_atom_logging_controls; then
+        echo "[SETUP] ERROR: failed to enable ATOM logging controls" >&2
+        if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then
+            return 1
+        fi
+        exit 1
+    fi
+fi
+
 if [[ "$ENGINE" == "vllm-disagg" ]]; then
     install_recipe_deps
     install_amd_quark
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index fa3d65418..dc30cd4c3 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -129,6 +129,11 @@ export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8}
 export RUN_EVAL="${RUN_EVAL:-false}"
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 export EVAL_CONC="${EVAL_CONC:-}"
+export EVAL_FRAMEWORK="${EVAL_FRAMEWORK:-}"
+export EVAL_TASKS_DIR="${EVAL_TASKS_DIR:-}"
+export EVAL_MAX_MODEL_LEN="${EVAL_MAX_MODEL_LEN:-}"
+export MODEL="${MODEL:-}"
+export MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
 export FRAMEWORK="${FRAMEWORK:-}"
 export PRECISION="${PRECISION:-}"
 export MODEL_PREFIX="${MODEL_PREFIX:-}"
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index 3bd64eb9a..a6bac4d00 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -375,8 +375,13 @@ EOF
             shopt -s nullglob
             for eval_file in "$EVAL_DIR"/*; do
                 [ -f "$eval_file" ] || continue
-                cp "$eval_file" "$GITHUB_WORKSPACE/"
-                echo "Copied eval artifact: $(basename "$eval_file")"
+                eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")"
+                rm -f "$eval_dest"
+                if cp "$eval_file" "$eval_dest"; then
+                    echo "Copied eval artifact: $(basename "$eval_file")"
+                else
+                    echo "WARNING: Failed to copy eval artifact: $(basename "$eval_file")" >&2
+                fi
             done
             shopt -u nullglob
         else
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index a941860c0..d49cc6ad7 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -277,8 +277,13 @@ if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
         shopt -s nullglob
         for eval_file in "$EVAL_DIR"/*; do
             [ -f "$eval_file" ] || continue
-            cp "$eval_file" "$GITHUB_WORKSPACE/"
-            echo "Copied eval artifact: $(basename "$eval_file")"
+            eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")"
+            rm -f "$eval_dest"
+            if cp "$eval_file" "$eval_dest"; then
+                echo "Copied eval artifact: $(basename "$eval_file")"
+            else
+                echo "WARNING: Failed to copy eval artifact: $(basename "$eval_file")" >&2
+            fi
         done
         shopt -u nullglob
     else
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 36c8af203..95fea2dd1 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -500,8 +500,13 @@ if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
         shopt -s nullglob
         for eval_file in "$EVAL_DIR"/*; do
             [ -f "$eval_file" ] || continue
-            cp "$eval_file" "$GITHUB_WORKSPACE/"
-            echo "Copied eval artifact: $(basename "$eval_file")"
+            eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")"
+            rm -f "$eval_dest"
+            if cp "$eval_file" "$eval_dest"; then
+                echo "Copied eval artifact: $(basename "$eval_file")"
+            else
+                echo "WARNING: Failed to copy eval artifact: $(basename "$eval_file")" >&2
+            fi
         done
         shopt -u nullglob
     else
diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh
index 08a5a967a..d4f97e85c 100644
--- a/runners/launch_h100-cr.sh
+++ b/runners/launch_h100-cr.sh
@@ -9,14 +9,63 @@ server_name="bmk-server"
 # the h200 launchers, which have carried SPEC_SUFFIX since #392).
 SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
+DOCKER_ENV_VARS=(
+    HF_TOKEN
+    HF_HUB_CACHE
+    EXP_NAME
+    IMAGE
+    MODEL
+    MODEL_PREFIX
+    TP
+    EP_SIZE
+    DP_ATTENTION
+    CONC
+    MAX_MODEL_LEN
+    ISL
+    OSL
+    FRAMEWORK
+    PRECISION
+    SPEC_DECODING
+    DISAGG
+    RUN_EVAL
+    EVAL_ONLY
+    EVAL_FRAMEWORK
+    EVAL_TASKS_DIR
+    EVAL_MAX_MODEL_LEN
+    OPENAI_API_KEY
+    RUNNER_TYPE
+    RESULT_FILENAME
+    RANDOM_RANGE_RATIO
+    SCENARIO_TYPE
+    SCENARIO_SUBDIR
+    IS_AGENTIC
+    OFFLOADING
+    TOTAL_CPU_DRAM_GB
+    DURATION
+    RESULT_DIR
+    PYTHONDONTWRITEBYTECODE
+    PYTHONPYCACHEPREFIX
+    PROFILE
+    SGLANG_TORCH_PROFILER_DIR
+    VLLM_TORCH_PROFILER_DIR
+    VLLM_RPC_TIMEOUT
+)
+DOCKER_ENV_ARGS=()
+export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/pycache/}"
+for env_name in "${DOCKER_ENV_VARS[@]}"; do
+    DOCKER_ENV_ARGS+=(--env "$env_name")
+done
+
 set -x
-docker run --rm --network=host --name=$server_name \
+docker run --rm --network=host --name="$server_name" \
 --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
--v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \
--e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \
--e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+-v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+-v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
+"${DOCKER_ENV_ARGS[@]}" \
+--env "PORT=$PORT" \
+--env TORCH_CUDA_ARCH_LIST="9.0" \
+--env CUDA_DEVICE_ORDER=PCI_BUS_ID \
+--env CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 --entrypoint=/bin/bash \
-$IMAGE \
-benchmarks/single_node/${SCENARIO_SUBDIR}"${EXP_NAME%%_*}_${PRECISION}_h100${SPEC_SUFFIX}.sh"
+"$IMAGE" \
+"benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h100${SPEC_SUFFIX}.sh"
diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh
index d1bbcef58..2fad08c5c 100644
--- a/runners/launch_h100-dgxc-slurm.sh
+++ b/runners/launch_h100-dgxc-slurm.sh
@@ -262,8 +262,13 @@ EOF
             shopt -s nullglob
             for eval_file in "$EVAL_DIR"/*; do
                 [ -f "$eval_file" ] || continue
-                cp "$eval_file" "$GITHUB_WORKSPACE/"
-                echo "Copied eval artifact: $(basename "$eval_file")"
+                eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")"
+                rm -f "$eval_dest"
+                if cp "$eval_file" "$eval_dest"; then
+                    echo "Copied eval artifact: $(basename "$eval_file")"
+                else
+                    echo "WARNING: Failed to copy eval artifact: $(basename "$eval_file")" >&2
+                fi
             done
             shopt -u nullglob
         else
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 572056956..f51c1908f 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -250,8 +250,13 @@ EOF
             shopt -s nullglob
             for eval_file in "$EVAL_DIR"/*; do
                 [ -f "$eval_file" ] || continue
-                cp "$eval_file" "$GITHUB_WORKSPACE/"
-                echo "Copied eval artifact: $(basename "$eval_file")"
+                eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")"
+                rm -f "$eval_dest"
+                if cp "$eval_file" "$eval_dest"; then
+                    echo "Copied eval artifact: $(basename "$eval_file")"
+                else
+                    echo "WARNING: Failed to copy eval artifact: $(basename "$eval_file")" >&2
+                fi
             done
             shopt -u nullglob
         else
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index acfd4912a..d7200992b 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -164,7 +164,7 @@ PY
     fi
 
     # Extract eval results if eval was requested
-    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+    if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
         # Find eval_results in the slurm job logs directory
         EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1)
         if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then
@@ -172,8 +172,13 @@ PY
             shopt -s nullglob
             for eval_file in "$EVAL_DIR"/*; do
                 [ -f "$eval_file" ] || continue
-                cp "$eval_file" "$GITHUB_WORKSPACE/"
-                echo "Copied eval artifact: $(basename "$eval_file")"
+                eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")"
+                rm -f "$eval_dest"
+                if cp "$eval_file" "$eval_dest"; then
+                    echo "Copied eval artifact: $(basename "$eval_file")"
+                else
+                    echo "WARNING: Failed to copy eval artifact: $(basename "$eval_file")" >&2
+                fi
             done
             shopt -u nullglob
         else
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 194fa4acb..522d5c08e 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -46,6 +46,23 @@ def result_concurrency(path: Path) -> Optional[int]:
     return int(match.group(1)) if match else None
 
 
+def valid_concurrency_list(
+    value: object,
+    *,
+    allow_empty: bool = True,
+) -> bool:
+    """Check that ``value`` is a list of distinct positive ints (no bools)."""
+    if not isinstance(value, list):
+        return False
+    if not (allow_empty or value):
+        return False
+    for item in value:
+        # bool subclasses int, so it has to be rejected explicitly.
+        if isinstance(item, bool) or not isinstance(item, int) or not item > 0:
+            return False
+    return len(set(value)) == len(value)
+
+
 def detect_lm_eval_jsons(d: Path, batched: bool = False) -> List[Path]:
     """Return lm-eval result JSONs from one artifact directory.
 
@@ -99,60 +116,104 @@ def extract_lm_metrics(json_path: Path) -> List[Dict[str, Any]]:
     - Values from results[task][metric,filter]
     """
     data = load_json(json_path) or {}
+    if not isinstance(data, dict):
+        return []
     results = data.get('results', {})
     configs = data.get('configs', {})
 
-    if not results:
+    if not isinstance(results, dict) or not results:
         return []
+    if not isinstance(configs, dict):
+        configs = {}
 
     extracted = []
 
-    for task in results.keys():
-        task_results = results[task]
+    for task, task_results in results.items():
+        if not isinstance(task_results, dict):
+            continue
         task_config = configs.get(task, {})
+        if not isinstance(task_config, dict):
+            task_config = {}
 
         # Base metric: from config's metric_list
         metric_list = task_config.get('metric_list', [])
-        base_metric = metric_list[0]['metric'] if metric_list else 'exact_match'
+        if (
+            isinstance(metric_list, list)
+            and metric_list
+            and isinstance(metric_list[0], dict)
+            and isinstance(metric_list[0].get('metric'), str)
+        ):
+            base_metric = metric_list[0]['metric']
+        else:
+            base_metric = 'exact_match'
 
         # Filters: from config's filter_list
         filter_list = task_config.get('filter_list', [])
+        if not isinstance(filter_list, list):
+            filter_list = []
 
         strict_val, strict_se = None, None
         flex_val, flex_se = None, None
         accuracy_val, accuracy_se = None, None
 
         # Helper to get value/stderr pair for filtered metrics
-        def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]:
-            val_key = f"{base_metric},{filter_name}"
-            se_key = f"{base_metric}_stderr,{filter_name}"
+        def get_val_se(
+            filter_name: Optional[str],
+        ) -> Tuple[Optional[float], Optional[float]]:
+            suffix = f",{filter_name}" if filter_name else ""
+            val_key = f"{base_metric}{suffix}"
+            se_key = f"{base_metric}_stderr{suffix}"
             return task_results.get(val_key), task_results.get(se_key)
 
         # Extract metrics based on filter_list
         if not filter_list:
-            # No filters - check for accuracy or use base metric
-            if 'acc' in task_results:
-                accuracy_val = task_results.get('acc')
-                accuracy_se = task_results.get('acc_stderr')
+            value, stderr = get_val_se('none')
+            if value is None:
+                value, stderr = get_val_se(None)
+            if base_metric in {'acc', 'accuracy'}:
+                accuracy_val, accuracy_se = value, stderr
             else:
-                strict_val = task_results.get(base_metric)
-                strict_se = task_results.get(f"{base_metric}_stderr")
+                strict_val, strict_se = value, stderr
         else:
             # Extract metrics for each filter
-            for f in filter_list:
-                fname = f['name']
-                if 'strict' in fname:
-                    strict_val, strict_se = get_val_se(fname)
-                elif 'flex' in fname or 'extract' in fname:
-                    flex_val, flex_se = get_val_se(fname)
+            for filter_config in filter_list:
+                if not isinstance(filter_config, dict):
+                    continue
+                filter_name = filter_config.get('name')
+                if not isinstance(filter_name, str):
+                    continue
+                value, stderr = get_val_se(filter_name)
+                normalized_name = filter_name.lower()
+                if 'strict' in normalized_name:
+                    strict_val, strict_se = value, stderr
+                elif (
+                    'flex' in normalized_name
+                    or 'extract' in normalized_name
+                ):
+                    flex_val, flex_se = value, stderr
+                elif base_metric in {'acc', 'accuracy'}:
+                    accuracy_val, accuracy_se = value, stderr
+                elif strict_val is None:
+                    strict_val, strict_se = value, stderr
 
         # N-samples (effective count)
-        n_eff = data.get('n-samples', {}).get(task, {}).get('effective')
+        sample_counts = data.get('n-samples', {})
+        if not isinstance(sample_counts, dict):
+            sample_counts = {}
+        task_counts = sample_counts.get(task, {})
+        n_eff = (
+            task_counts.get('effective')
+            if isinstance(task_counts, dict)
+            else None
+        )
 
         # Model name
+        metadata = task_config.get('metadata', {})
+        if not isinstance(metadata, dict):
+            metadata = {}
         model = (
             data.get('model_name')
-            or task_config.get('metadata', {}).get('model')
+            or metadata.get('model')
         )
 
         extracted.append({
@@ -264,6 +325,10 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
         row['score'] = m.get('strict')
         row['score_name'] = 'em_strict'
         row['score_se'] = m.get('strict_se')
+    elif m.get('flex') is not None:
+        row['score'] = m.get('flex')
+        row['score_name'] = 'em_flexible'
+        row['score_se'] = m.get('flex_se')
     elif m.get('accuracy') is not None:
         row['score'] = m.get('accuracy')
         row['score_name'] = 'accuracy'
@@ -281,15 +346,48 @@ def collect_eval_rows(root: Path) -> List[Dict[str, Any]]:
     rows: List[Dict[str, Any]] = []
     for d in find_eval_sets(root):
         meta = load_json(d / 'meta_env.json') or {}
+        eval_exit_code = meta.get('eval_exit_code')
+        if (
+            not isinstance(eval_exit_code, int)
+            or isinstance(eval_exit_code, bool)
+            or eval_exit_code != 0
+        ):
+            continue
+
         batch_concs = meta.get('eval_concs')
-        batched = isinstance(batch_concs, list)
+        if 'eval_concs' in meta and not valid_concurrency_list(
+            batch_concs,
+            allow_empty=False,
+        ):
+            continue
+        batched = valid_concurrency_list(batch_concs, allow_empty=False)
         allowed_concs: Optional[set[int]] = None
         if batched:
-            completed_concs = meta.get('completed_eval_concs', batch_concs)
-            if isinstance(completed_concs, list):
-                allowed_concs = {as_int(conc, -1) for conc in completed_concs}
+            completed_concs = meta.get('completed_eval_concs')
+            failed_concs = meta.get('failed_eval_concs')
+            if not (
+                valid_concurrency_list(completed_concs)
+                and valid_concurrency_list(failed_concs)
+            ):
+                continue
+            expected_set = set(batch_concs)
+            completed_set = set(completed_concs)
+            failed_set = set(failed_concs)
+            if (
+                completed_set != expected_set
+                or failed_set
+                or completed_set & failed_set
+            ):
+                continue
+            allowed_concs = completed_set
+
+        lm_paths = detect_lm_eval_jsons(d, batched=batched)
+        if batched and {
+            result_concurrency(path) for path in lm_paths
+        } != allowed_concs:
+            continue
 
-        for lm_path in detect_lm_eval_jsons(d, batched=batched):
+        for lm_path in lm_paths:
             row_meta = meta
             if batched:
                 conc = result_concurrency(lm_path)
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 41247cf07..e18dc6bcc 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -90,9 +90,11 @@ Multi-node evals support two hardware paths:
 - `lm-eval` runner (`benchmarks/lm_eval.py`) is invoked by `do_sweep.py` as a post/eval-only step and sources InferenceX's `benchmark_lib.sh` from the mounted workspace (`/infmax-workspace`)
 - Eval artifacts written to `/logs/eval_results/` inside the container, collected by launch scripts
 - NVIDIA Slurm launch scripts always collect server logs for debugging but skip benchmark result collection when `EVAL_ONLY=true`
-- Env vars threaded: `RUN_EVAL`, `EVAL_ONLY`, `IS_MULTINODE`, `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP/EP/NUM_WORKERS/DP_ATTN`, `DECODE_TP/EP/NUM_WORKERS/DP_ATTN`, `MODEL_NAME`, `EVAL_CONC`
+- Env vars threaded: `RUN_EVAL`, `EVAL_ONLY`, `IS_MULTINODE`, `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP/EP/NUM_WORKERS/DP_ATTN`, `DECODE_TP/EP/NUM_WORKERS/DP_ATTN`, `MODEL_NAME`, `MODEL_PATH`, `MAX_MODEL_LEN`, `EVAL_CONC`, `EVAL_FRAMEWORK`, `EVAL_TASKS_DIR`, `EVAL_MAX_MODEL_LEN`
 
-For multi-node `all-evals`, `EVAL_CONC` is a space-separated list. When it contains multiple values, `run_eval` runs those concurrency points sequentially against the same live engine, stages each result with a `_concN` filename suffix, and records expected/completed/failed points in `meta_env.json`.
+For multi-node `all-evals`, `EVAL_CONC` is a space-separated list. When it contains multiple values, `run_eval` runs those concurrency points sequentially against the same live engine, stages each result with a `_concN` filename suffix, and records expected/completed/failed points in `meta_env.json`. The workflow passes its requested list independently to score validation, so missing metadata, missing concurrency results, result files without a checked score, and scores below threshold all fail the `Verify eval scores` step.
+
+AMD ATOM jobs set engine, Uvicorn, and atomesh logging to warning level by default through `ATOM_LOG_LEVEL`, `ATOM_UVICORN_LOG_LEVEL`, and `ATOMESH_LOG_LEVEL`; Uvicorn access logs are off by default (`ATOM_UVICORN_ACCESS_LOG=0`). Of these controls, only atomesh's `--log-level` flag currently exists upstream. The pinned ATOM image hardcodes its Python logger and `uvicorn.run()` defaults, so `setup_deps.sh` patches the remaining InferenceX compatibility controls into the installed package and behavior-tests them before serving. Startup fails if ATOM INFO messages can still reach either the ATOM handler or the root logger, or if Uvicorn access logging remains enabled. For verbose troubleshooting, set the levels to `INFO`/`info` and `ATOM_UVICORN_ACCESS_LOG=1`.
 
 ### Workflow structure
 - `e2e-tests.yml`: `test-sweep-evals` (single-node) and `test-sweep-multi-node-evals` (multi-node)
@@ -129,7 +131,7 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 
 | Field | Description |
 |-------|-------------|
-| `score` | Primary metric (exact match for GSM8K) |
+| `score` | Primary task metric (`em_strict`, then `em_flexible`, then accuracy) |
 | `em_strict` | Strict exact match (requires `####` format) |
 | `em_flexible` | Flexible extraction (looser number matching) |
 | `n_eff` | Number of samples evaluated |
@@ -148,7 +150,7 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval; a space-separated list enables sequential batched evals against one live engine |
 
 ### Score validation
-`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. Runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
+`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. The workflow supplies the requested concurrencies through `--expected-concs`, and validation requires readable `meta_env.json` metadata consistent with that request plus at least one checked score in every result file. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
 
 ### Adding a new eval task
 
diff --git a/utils/evals/test_batched_eval.py b/utils/evals/test_batched_eval.py
index f1ebb6b64..b5db22907 100644
--- a/utils/evals/test_batched_eval.py
+++ b/utils/evals/test_batched_eval.py
@@ -2,6 +2,7 @@
 
 import json
 import os
+import re
 import subprocess
 import sys
 from pathlib import Path
@@ -93,6 +94,7 @@ def test_batched_eval_runs_every_concurrency_and_stages_results(
     assert meta["eval_concs"] == [1, 4, 8]
     assert meta["completed_eval_concs"] == [1, 4, 8]
     assert meta["failed_eval_concs"] == []
+    assert meta["eval_exit_code"] == 0
     assert sorted(path.name for path in tmp_path.glob("results*.json")) == [
         "results_test_conc1.json",
         "results_test_conc4.json",
@@ -101,6 +103,7 @@ def test_batched_eval_runs_every_concurrency_and_stages_results(
     assert validate_batch_manifest(
         str(tmp_path / "meta_env.json"),
         [str(path) for path in tmp_path.glob("results*.json")],
+        expected_concs=[1, 4, 8],
     ) == []
 
 
@@ -111,9 +114,11 @@ def test_batched_eval_preserves_partial_results_and_records_failure(
 
     assert meta["completed_eval_concs"] == [1, 8]
     assert meta["failed_eval_concs"] == [4]
+    assert meta["eval_exit_code"] == 1
     errors = validate_batch_manifest(
         str(tmp_path / "meta_env.json"),
         [str(path) for path in tmp_path.glob("results*.json")],
+        expected_concs=[1, 4, 8],
     )
     assert any("failed for concurrency: 4" in error for error in errors)
     assert any("missing completed concurrency: 4" in error for error in errors)
@@ -131,7 +136,34 @@ def test_batched_eval_requires_a_valid_manifest(tmp_path: Path) -> None:
     assert any("unavailable or invalid" in error for error in errors)
 
 
-def test_validate_scores_warns_when_batch_status_metadata_is_unreadable(
+def test_batched_eval_requires_a_result_for_every_workflow_concurrency(
+    tmp_path: Path,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(
+        json.dumps({
+            "eval_exit_code": 0,
+            "eval_concs": [1, 4, 8],
+            "completed_eval_concs": [1, 4, 8],
+            "failed_eval_concs": [],
+        })
+    )
+    result_files = []
+    for conc in (1, 8):
+        result_path = tmp_path / f"results_test_conc{conc}.json"
+        result_path.write_text('{"results": {}}')
+        result_files.append(str(result_path))
+
+    errors = validate_batch_manifest(
+        str(meta_path),
+        result_files,
+        expected_concs=[1, 4, 8],
+    )
+
+    assert any("missing result files for concurrency: 4" in error for error in errors)
+
+
+def test_validate_scores_fails_when_workflow_batch_metadata_is_unreadable(
     tmp_path: Path,
     monkeypatch,
     capsys,
@@ -157,26 +189,1090 @@ def test_validate_scores_warns_when_batch_status_metadata_is_unreadable(
             str(meta_path),
             "--results-glob",
             str(result_path),
+            "--expected-concs",
+            "1 4 8",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    captured = capsys.readouterr()
+    assert "meta_env.json is unavailable or invalid" in captured.err
+
+
+def test_validate_scores_rejects_single_result_for_workflow_batch(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "conc": 8,
+        "infmax_model_prefix": "test",
+    }))
+    result_path = tmp_path / "results_test.json"
+    result_path.write_text(
+        json.dumps({
+            "results": {
+                "custom_eval": {
+                    "exact_match,strict-match": 1.0,
+                },
+            },
+        })
+    )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(result_path),
+            "--expected-concs",
+            "1 4 8",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    captured = capsys.readouterr()
+    assert "workflow requested multiple eval concurrencies" in captured.err
+    assert "result lacks a concurrency suffix" in captured.err
+    assert "missing result files for concurrency: 1, 4, 8" in captured.err
+
+
+def test_validate_scores_fails_when_any_concurrency_is_below_threshold(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(
+        json.dumps({
+            "eval_exit_code": 0,
+            "eval_concs": [1, 4],
+            "completed_eval_concs": [1, 4],
+            "failed_eval_concs": [],
+            "infmax_model_prefix": "test",
+        })
+    )
+    for conc, score in ((1, 0.9), (4, 0.8)):
+        (tmp_path / f"results_test_conc{conc}.json").write_text(
+            json.dumps({
+                "results": {
+                    "custom_eval": {
+                        "exact_match,strict-match": score,
+                    },
+                },
+            })
+        )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(tmp_path / "results*.json"),
+            "--expected-concs",
+            "1 4",
+            "--min-score",
+            "0.85",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    captured = capsys.readouterr()
+    assert "results_test_conc4.json" in captured.err
+    assert "0.8000 (< 0.85 from min-score)" in captured.err
+
+
+def test_validate_scores_accepts_complete_batch_above_threshold(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(
+        json.dumps({
+            "eval_exit_code": 0,
+            "eval_concs": [1, 4],
+            "completed_eval_concs": [1, 4],
+            "failed_eval_concs": [],
+            "infmax_model_prefix": "test",
+        })
+    )
+    for conc, score in ((1, 0.9), (4, 0.86)):
+        (tmp_path / f"results_test_conc{conc}.json").write_text(
+            json.dumps({
+                "results": {
+                    "custom_eval": {
+                        "exact_match,strict-match": score,
+                    },
+                },
+            })
+        )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(tmp_path / "results*.json"),
+            "--expected-concs",
+            "1 4",
+            "--min-score",
+            "0.85",
+        ],
+    )
+
+    assert validate_scores_main() == 0
+
+
+def test_validate_scores_accepts_single_concurrency_above_threshold(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "conc": 4,
+        "infmax_model_prefix": "test",
+    }))
+    result_path = tmp_path / "results_test.json"
+    result_path.write_text(
+        json.dumps({
+            "results": {
+                "custom_eval": {
+                    "exact_match,strict-match": 0.9,
+                },
+            },
+        })
+    )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(result_path),
+            "--expected-concs",
+            "4",
+            "--min-score",
+            "0.85",
         ],
     )
 
     assert validate_scores_main() == 0
+
+
+def test_validate_scores_fails_when_a_concurrency_has_no_score_metric(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(
+        json.dumps({
+            "eval_exit_code": 0,
+            "eval_concs": [1, 4],
+            "completed_eval_concs": [1, 4],
+            "failed_eval_concs": [],
+            "infmax_model_prefix": "test",
+        })
+    )
+    (tmp_path / "results_test_conc1.json").write_text(
+        json.dumps({
+            "results": {
+                "custom_eval": {
+                    "exact_match,strict-match": 0.9,
+                },
+            },
+        })
+    )
+    (tmp_path / "results_test_conc4.json").write_text(
+        json.dumps({"results": {"custom_eval": {"alias": "custom"}}})
+    )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(tmp_path / "results*.json"),
+            "--expected-concs",
+            "1 4",
+            "--min-score",
+            "0.85",
+        ],
+    )
+
+    assert validate_scores_main() == 1
     captured = capsys.readouterr()
     assert (
-        "WARN: could not inspect eval metadata for batched concurrency status"
+        "results_test_conc4.json has no numeric metrics matching prefix"
         in captured.err
     )
 
 
+def test_validate_scores_fails_when_one_task_has_no_score_metric(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "conc": 4,
+        "infmax_model_prefix": "test",
+    }))
+    result_path = tmp_path / "results_test.json"
+    result_path.write_text(json.dumps({
+        "results": {
+            "valid_eval": {
+                "exact_match,strict-match": 0.9,
+            },
+            "missing_metric_eval": {
+                "alias": "missing_metric_eval",
+            },
+        },
+    }))
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(result_path),
+            "--expected-concs",
+            "4",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    assert (
+        "missing_metric_eval has no metric matching prefix"
+        in capsys.readouterr().err
+    )
+
+
+def test_validate_scores_fails_for_non_numeric_score(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "conc": 4,
+        "infmax_model_prefix": "test",
+    }))
+    result_path = tmp_path / "results_test.json"
+    result_path.write_text(json.dumps({
+        "results": {
+            "custom_eval": {
+                "exact_match,strict-match": "0.99",
+            },
+        },
+    }))
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(result_path),
+            "--expected-concs",
+            "4",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    assert "has non-numeric value '0.99'" in capsys.readouterr().err
+
+
+def test_validate_scores_fails_for_non_finite_score(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "conc": 4,
+        "infmax_model_prefix": "test",
+    }))
+    result_path = tmp_path / "results_test.json"
+    result_path.write_text(
+        '{"results":{"custom_eval":{"exact_match,strict-match":NaN}}}'
+    )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(result_path),
+            "--expected-concs",
+            "4",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    captured = capsys.readouterr()
+    assert "exact_match,strict-match is not finite" in captured.err
+
+
+def test_validate_scores_fails_for_out_of_range_score(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "conc": 4,
+        "infmax_model_prefix": "test",
+    }))
+    result_path = tmp_path / "results_test.json"
+    result_path.write_text(json.dumps({
+        "results": {
+            "custom_eval": {
+                "exact_match,strict-match": 1.01,
+            },
+        },
+    }))
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(result_path),
+            "--expected-concs",
+            "4",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    assert "is outside [0, 1]" in capsys.readouterr().err
+
+
+def test_validate_scores_requires_consistent_metrics_across_concurrencies(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "eval_concs": [1, 4],
+        "completed_eval_concs": [1, 4],
+        "failed_eval_concs": [],
+        "infmax_model_prefix": "test",
+    }))
+    for conc, task in ((1, "custom_eval"), (4, "different_eval")):
+        (tmp_path / f"results_test_conc{conc}.json").write_text(json.dumps({
+            "results": {
+                task: {
+                    "exact_match,strict-match": 0.9,
+                },
+            },
+        }))
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(tmp_path / "results*.json"),
+            "--expected-concs",
+            "1 4",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    captured = capsys.readouterr()
+    assert "is missing metrics present in results_test_conc1.json" in captured.err
+    assert (
+        "has unexpected metrics compared with results_test_conc1.json"
+        in captured.err
+    )
+
+
+def test_single_eval_failure_is_staged_and_fails_validation(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    benchmark_lib = (
+        Path(__file__).resolve().parents[2] / "benchmarks" / "benchmark_lib.sh"
+    )
+    env = {
+        **os.environ,
+        "BENCHMARK_LIB": str(benchmark_lib),
+    }
+    script = r'''
+source "$BENCHMARK_LIB"
+
+run_lm_eval() {
+    local results_dir
+    results_dir=$(mktemp -d)
+    export EVAL_RESULT_DIR="$results_dir"
+    mkdir -p "$results_dir/nested"
+    printf '%s' \
+        '{"results":{"custom_eval":{"exact_match,strict-match":0.99}}}' \
+        > "$results_dir/nested/results_test.json"
+    return 7
+}
+
+export EVAL_CONCURRENT_REQUESTS=4
+export EVAL_MAX_MODEL_LEN=4096
+export EVAL_ONLY=true
+export MODEL=test-model
+export MODEL_NAME=test-model
+export MODEL_PREFIX=test
+export RUNNER_TYPE=h100
+export FRAMEWORK=vllm
+export PRECISION=fp8
+export SPEC_DECODING=none
+export IS_MULTINODE=false
+export ISL=8192
+export OSL=1024
+export TP=8
+export EP_SIZE=1
+export CONC=4
+
+run_eval --framework lm-eval --port 8888
+append_lm_eval_summary
+'''
+    subprocess.run(
+        ["bash", "-c", script],
+        cwd=tmp_path,
+        env=env,
+        check=True,
+        text=True,
+        capture_output=True,
+    )
+
+    meta_path = tmp_path / "meta_env.json"
+    assert json.loads(meta_path.read_text())["eval_exit_code"] == 7
+    result_path = tmp_path / "results_test.json"
+    assert result_path.is_file()
+
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(result_path),
+            "--expected-concs",
+            "4",
+            "--min-score",
+            "0.85",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    assert "eval command failed with exit code 7" in capsys.readouterr().err
+
+
+def test_append_lm_eval_summary_reports_artifact_move_failure(
+    tmp_path: Path,
+) -> None:
+    benchmark_lib = (
+        Path(__file__).resolve().parents[2] / "benchmarks" / "benchmark_lib.sh"
+    )
+    env = {
+        **os.environ,
+        "BENCHMARK_LIB": str(benchmark_lib),
+    }
+    script = r'''
+source "$BENCHMARK_LIB"
+
+export EVAL_RESULT_DIR="$PWD/eval-output"
+mkdir -p "$EVAL_RESULT_DIR/nested"
+printf '%s' \
+    '{"results":{"custom_eval":{"exact_match,strict-match":0.99}}}' \
+    > "$EVAL_RESULT_DIR/nested/results_test.json"
+
+export EVAL_RUN_EXIT_CODE=0
+export MODEL=test-model
+export MODEL_PREFIX=test
+export RUNNER_TYPE=h100
+export FRAMEWORK=vllm
+export PRECISION=fp8
+export SPEC_DECODING=none
+export IS_MULTINODE=false
+export ISL=8192
+export OSL=1024
+export TP=8
+export EP_SIZE=1
+export CONC=4
+
+mv() {
+    local arg
+    for arg in "$@"; do
+        if [[ "$arg" == *results_test.json ]]; then
+            return 1
+        fi
+    done
+    command mv "$@"
+}
+
+if append_lm_eval_summary; then
+    exit 9
+fi
+test -f "$EVAL_RESULT_DIR/nested/results_test.json"
+test -f "$PWD/meta_env.json"
+'''
+    completed = subprocess.run(
+        ["bash", "-c", script],
+        cwd=tmp_path,
+        env=env,
+        check=True,
+        text=True,
+        capture_output=True,
+    )
+
+    assert "eval artifact staging was incomplete" in completed.stderr
+
+
+def test_run_eval_rejects_duplicate_concurrency_before_execution(
+    tmp_path: Path,
+) -> None:
+    benchmark_lib = (
+        Path(__file__).resolve().parents[2] / "benchmarks" / "benchmark_lib.sh"
+    )
+    trace_path = tmp_path / "ran_eval"
+    env = {
+        **os.environ,
+        "BENCHMARK_LIB": str(benchmark_lib),
+        "TRACE_PATH": str(trace_path),
+    }
+    script = r'''
+source "$BENCHMARK_LIB"
+run_lm_eval() {
+    touch "$TRACE_PATH"
+}
+export EVAL_CONCURRENT_REQUESTS="4 4"
+export EVAL_MAX_MODEL_LEN=4096
+export MODEL=test-model
+if run_eval --framework lm-eval --port 8888; then
+    exit 9
+fi
+test "$EVAL_RUN_EXIT_CODE" -eq 2
+'''
+
+    subprocess.run(
+        ["bash", "-c", script],
+        cwd=tmp_path,
+        env=env,
+        check=True,
+        text=True,
+        capture_output=True,
+    )
+    assert not trace_path.exists()
+
+
+def test_validate_scores_rejects_invalid_threshold_config(
+    tmp_path: Path,
+    monkeypatch,
+    capsys,
+) -> None:
+    thresholds_path = tmp_path / "thresholds.json"
+    thresholds_path.write_text(
+        json.dumps({"default": {"custom_eval": "0.9"}})
+    )
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "conc": 4,
+        "infmax_model_prefix": "test",
+    }))
+    result_path = tmp_path / "results_test.json"
+    result_path.write_text(json.dumps({
+        "results": {
+            "custom_eval": {
+                "exact_match,strict-match": 0.99,
+            },
+        },
+    }))
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "validate_scores.py",
+            "--thresholds",
+            str(thresholds_path),
+            "--meta-env",
+            str(meta_path),
+            "--results-glob",
+            str(result_path),
+            "--expected-concs",
+            "4",
+        ],
+    )
+
+    assert validate_scores_main() == 1
+    assert "must be a finite number between 0 and 1" in capsys.readouterr().err
+
+
+def test_batched_eval_rejects_duplicate_result_files(tmp_path: Path) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_exit_code": 0,
+        "eval_concs": [4],
+        "completed_eval_concs": [4],
+        "failed_eval_concs": [],
+    }))
+    result_files = []
+    for name in ("results_test_conc4.json", "results_test_conc4_2.json"):
+        result_path = tmp_path / name
+        result_path.write_text('{"results": {}}')
+        result_files.append(str(result_path))
+
+    errors = validate_batch_manifest(
+        str(meta_path),
+        result_files,
+        expected_concs=[4],
+    )
+
+    assert any(
+        "duplicate result files for concurrency: 4" in error
+        for error in errors
+    )
+
+
 def test_amd_multinode_container_inherits_eval_concurrency_list() -> None:
-    job_slurm = (
+    amd_utils = (
         Path(__file__).resolve().parents[2]
         / "benchmarks"
         / "multi_node"
         / "amd_utils"
-        / "job.slurm"
     )
+    job_slurm = amd_utils / "job.slurm"
     contents = job_slurm.read_text()
+    submit_contents = (amd_utils / "submit.sh").read_text()
+
+    assert r'-e \"EVAL_CONC=\$EVAL_CONC\"' in contents
+    assert "\n    -e EVAL_CONC\n" not in contents
+    for env_name in (
+        "RUN_EVAL",
+        "EVAL_ONLY",
+        "EVAL_FRAMEWORK",
+        "EVAL_TASKS_DIR",
+        "EVAL_MAX_MODEL_LEN",
+        "MODEL",
+        "MAX_MODEL_LEN",
+        "FRAMEWORK",
+        "PRECISION",
+        "MODEL_PREFIX",
+        "RUNNER_TYPE",
+        "RESULT_FILENAME",
+        "SPEC_DECODING",
+        "PREFILL_TP_SIZE",
+        "PREFILL_ENABLE_EP",
+        "PREFILL_ENABLE_DP",
+        "DECODE_TP_SIZE",
+        "DECODE_ENABLE_EP",
+        "DECODE_ENABLE_DP",
+        "IS_MULTINODE",
+    ):
+        assert rf"-e {env_name}=\${env_name}" in contents
+    assert "-e MODEL_PATH=$DOCKER_MODEL_PATH" in contents
+    for env_name in (
+        "EVAL_CONC",
+        "EVAL_FRAMEWORK",
+        "EVAL_TASKS_DIR",
+        "EVAL_MAX_MODEL_LEN",
+        "MODEL",
+        "MAX_MODEL_LEN",
+    ):
+        export_line = f'export {env_name}="${{{env_name}:-}}"'
+        assert export_line in contents
+        assert export_line in submit_contents
+
+
+def test_direct_docker_launcher_forwards_workflow_metadata() -> None:
+    launcher = (
+        Path(__file__).resolve().parents[2] / "runners" / "launch_h100-cr.sh"
+    ).read_text()
+
+    for env_name in (
+        "IMAGE",
+        "MODEL_PREFIX",
+        "FRAMEWORK",
+        "PRECISION",
+        "EP_SIZE",
+        "DP_ATTENTION",
+        "SPEC_DECODING",
+        "DISAGG",
+        "RUN_EVAL",
+        "EVAL_ONLY",
+        "EVAL_FRAMEWORK",
+        "EVAL_TASKS_DIR",
+        "EVAL_MAX_MODEL_LEN",
+        "OPENAI_API_KEY",
+        "RUNNER_TYPE",
+        "RESULT_FILENAME",
+        "SCENARIO_TYPE",
+        "SCENARIO_SUBDIR",
+        "IS_AGENTIC",
+        "OFFLOADING",
+        "TOTAL_CPU_DRAM_GB",
+        "DURATION",
+        "RESULT_DIR",
+        "PYTHONPYCACHEPREFIX",
+    ):
+        assert f"    {env_name}\n" in launcher
+
+
+def test_multinode_launchers_replace_container_owned_eval_artifacts() -> None:
+    runners_dir = Path(__file__).resolve().parents[2] / "runners"
+    launchers = (
+        "launch_b200-dgxc.sh",
+        "launch_b300-nv.sh",
+        "launch_gb200-nv.sh",
+        "launch_gb300-nv.sh",
+        "launch_h100-dgxc-slurm.sh",
+        "launch_h200-dgxc-slurm.sh",
+        "launch_mi355x-amds.sh",
+    )
+
+    for launcher_name in launchers:
+        contents = (runners_dir / launcher_name).read_text()
+        assert 'eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")"' in contents
+        assert 'rm -f "$eval_dest"' in contents
+        assert 'if cp "$eval_file" "$eval_dest"; then' in contents
+        assert 'cp "$eval_file" "$GITHUB_WORKSPACE/"' not in contents
+
+    amd_launcher = (runners_dir / "launch_mi355x-amds.sh").read_text()
+    assert (
+        '[[ "${RUN_EVAL:-false}" == "true" || '
+        '"${EVAL_ONLY:-false}" == "true" ]]'
+        in amd_launcher
+    )
+
+
+def test_atom_launcher_uses_and_records_requested_parallelism() -> None:
+    server = (
+        Path(__file__).resolve().parents[2]
+        / "benchmarks"
+        / "multi_node"
+        / "amd_utils"
+        / "server_atom.sh"
+    ).read_text()
+
+    assert 'DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE")' in server
+    assert '[[ "$PREFILL_ENABLE_EP" == "true" ]]' in server
+    assert '[[ "$PREFILL_ENABLE_DP" == "true" ]]' in server
+    assert '[[ "$DECODE_ENABLE_EP" == "true" ]]' in server
+    assert '[[ "$DECODE_ENABLE_DP" == "true" ]]' in server
+    for metadata_assignment in (
+        'export EP_SIZE=1',
+        'export PREFILL_EP=1',
+        'export DECODE_EP=1',
+        'export DP_ATTENTION="${PREFILL_ENABLE_DP}"',
+        'export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"',
+        'export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"',
+    ):
+        assert metadata_assignment in server
+
+
+def test_eval_workflows_pass_their_requested_concurrencies_to_validation() -> None:
+    repo_root = Path(__file__).resolve().parents[2]
+    single_workflow = (
+        repo_root / ".github" / "workflows" / "benchmark-tmpl.yml"
+    ).read_text()
+    multinode_workflow = (
+        repo_root / ".github" / "workflows" / "benchmark-multinode-tmpl.yml"
+    ).read_text()
+    run_sweep = (
+        repo_root / ".github" / "workflows" / "run-sweep.yml"
+    ).read_text()
+
+    assert '--expected-concs "${CONC}"' in single_workflow
+    assert 'expected_concs="${EVAL_CONC}"' in multinode_workflow
+    assert '--expected-concs "${expected_concs}"' in multinode_workflow
+    verify_condition = (
+        "(success() || failure()) && "
+        "(inputs.run-eval || inputs.eval-only)"
+    )
+    assert verify_condition in single_workflow
+    assert verify_condition in multinode_workflow
+    single_inputs = run_sweep.split("&single-node-inputs", 1)[1].split(
+        "sweep-single-node-8k1k",
+        1,
+    )[0]
+    assert "run-eval: false" in single_inputs
+    assert "run-eval: ${{ matrix.config.run-eval }}" not in run_sweep
+
+
+def test_eval_regression_tests_run_for_every_inspected_runtime_path() -> None:
+    workflow = (
+        Path(__file__).resolve().parents[2]
+        / ".github"
+        / "workflows"
+        / "test-changelog-gate.yml"
+    ).read_text()
+
+    for watched_path in (
+        '".github/workflows/benchmark-tmpl.yml"',
+        '".github/workflows/benchmark-multinode-tmpl.yml"',
+        '"benchmarks/benchmark_lib.sh"',
+        '"benchmarks/multi_node/amd_utils/**"',
+        '"benchmarks/single_node/**"',
+        '"runners/launch_*.sh"',
+        '"utils/evals/**"',
+    ):
+        assert watched_path in workflow
+    assert "utils/evals/test_batched_eval.py" in workflow
+
+
+def test_eval_scripts_in_audited_scope_use_shared_finalization() -> None:
+    repo_root = Path(__file__).resolve().parents[2]
+    benchmark_lib = (repo_root / "benchmarks" / "benchmark_lib.sh").resolve()
+    ignored_scripts = {
+        "dsv4_fp4_mi355x_atom_mtp.sh",
+        "dsv4_fp4_mi355x_sglang_mtp.sh",
+        "dsv4_fp4_mi355x_vllm_mtp.sh",
+    }
+    source_pattern = re.compile(
+        r'source "\$\(dirname "\$0"\)/([^"]*benchmark_lib\.sh)"'
+    )
+    failures = []
+
+    for script in sorted(
+        (repo_root / "benchmarks" / "single_node" / "fixed_seq_len").rglob(
+            "*.sh"
+        )
+    ):
+        if "deprecated" in script.parts or script.name in ignored_scripts:
+            continue
+        contents = script.read_text()
+        if "run_eval --framework" not in contents:
+            continue
+        if "append_lm_eval_summary" not in contents:
+            failures.append(f"{script}: missing append_lm_eval_summary")
+        match = source_pattern.search(contents)
+        if match is None:
+            failures.append(f"{script}: missing relative benchmark_lib source")
+            continue
+        resolved = (script.parent / match.group(1)).resolve()
+        if resolved != benchmark_lib:
+            failures.append(f"{script}: resolves benchmark_lib to {resolved}")
+
+    assert not failures, "\n".join(failures)
+
+
+def test_slurm_container_launchers_export_the_workflow_environment() -> None:
+    runners_dir = Path(__file__).resolve().parents[2] / "runners"
+    checked = []
+
+    for launcher in sorted(runners_dir.glob("launch_*.sh")):
+        contents = launcher.read_text()
+        if "--container-image=" not in contents:
+            continue
+        checked.append(launcher.name)
+        assert "--export=ALL" in contents, (
+            f"{launcher.name} starts a Slurm container without exporting "
+            "the workflow environment"
+        )
+
+    assert checked
+
+
+def test_srtctl_launchers_export_the_eval_workspace() -> None:
+    runners_dir = Path(__file__).resolve().parents[2] / "runners"
+    launchers = (
+        "launch_b200-dgxc.sh",
+        "launch_b300-nv.sh",
+        "launch_gb200-nv.sh",
+        "launch_gb300-nv.sh",
+        "launch_h100-dgxc-slurm.sh",
+        "launch_h200-dgxc-slurm.sh",
+    )
+
+    for launcher_name in launchers:
+        contents = (runners_dir / launcher_name).read_text()
+        assert 'export EVAL_ONLY="${EVAL_ONLY:-false}"' in contents
+        assert 'export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"' in contents
+        assert "srtctl apply" in contents
+
+
+def test_atom_logging_uses_env_and_native_router_flag_without_stream_filter() -> None:
+    """ATOM scripts must take log levels from env defaults, with no stream filter."""
+    repo_root = Path(__file__).resolve().parents[2]
+    amd_utils = repo_root.joinpath("benchmarks", "multi_node", "amd_utils")
+    script_names = ("env_atom.sh", "job.slurm", "server_atom.sh", "setup_deps.sh")
+    env_text, job_text, server_text, setup_text = (
+        (amd_utils / script_name).read_text() for script_name in script_names
+    )
+    for env_default in (
+        'ATOM_LOG_LEVEL="${ATOM_LOG_LEVEL:-WARNING}"',
+        'ATOM_UVICORN_LOG_LEVEL="${ATOM_UVICORN_LOG_LEVEL:-warning}"',
+        'ATOM_UVICORN_ACCESS_LOG="${ATOM_UVICORN_ACCESS_LOG:-0}"',
+        'ATOMESH_LOG_LEVEL="${ATOMESH_LOG_LEVEL:-warn}"',
+    ):
+        assert env_default in env_text
+    for container_flag in (
+        r"-e ATOM_LOG_LEVEL=\${ATOM_LOG_LEVEL:-WARNING}",
+        r"-e ATOM_UVICORN_LOG_LEVEL=\${ATOM_UVICORN_LOG_LEVEL:-warning}",
+        r"-e ATOM_UVICORN_ACCESS_LOG=\${ATOM_UVICORN_ACCESS_LOG:-0}",
+        r"-e ATOMESH_LOG_LEVEL=\${ATOMESH_LOG_LEVEL:-warn}",
+    ):
+        assert container_flag in job_text
+    assert "--log-level ${ATOMESH_LOG_LEVEL}" in server_text
+    assert "filter_atom_logs.sh" not in server_text
+    for patch_marker in (
+        'os.getenv("ATOM_LOG_LEVEL", "WARNING")',
+        "logger.setLevel(_atom_log_level)",
+        "console_handler.setLevel(_atom_log_level)",
+        "logger.propagate = False",
+        "ATOM_UVICORN_ACCESS_LOG",
+        'access_log=__import__("os").getenv(',
+        "verify_logger_behavior()",
+        "verify_uvicorn_behavior()",
+    ):
+        assert patch_marker in setup_text
+    assert not (amd_utils / "filter_atom_logs.sh").exists()
+
+
+def test_atom_server_aborts_when_setup_or_environment_fails(tmp_path: Path) -> None:
+    """server_atom.sh must exit 1 with a specific error when a helper script fails."""
+    server = Path(__file__).resolve().parents[2].joinpath(
+        "benchmarks", "multi_node", "amd_utils", "server_atom.sh"
+    )
+
+    for failing_script, expected_error in (
+        ("setup_deps.sh", "failed to initialize ATOM dependencies"),
+        ("env_atom.sh", "failed to initialize ATOM environment"),
+    ):
+        # One workspace per case: both helpers succeed by default, then the one
+        # under test is overwritten so that it returns status 42.
+        atom_workspace = tmp_path / failing_script
+        atom_workspace.mkdir()
+        (atom_workspace / "setup_deps.sh").write_text("return 0\n")
+        (atom_workspace / "env_atom.sh").write_text("return 0\n")
+        (atom_workspace / failing_script).write_text("return 42\n")
+
+        result = subprocess.run(
+            ["bash", str(server)],
+            env={**os.environ, "ATOM_WS_PATH": str(atom_workspace)},
+            text=True,
+            capture_output=True,
+            # Bound the run: if the abort regresses, the script must not be able
+            # to hang the test suite.
+            timeout=60,
+        )
+
+        # Name the case and show stderr so a failure is diagnosable from CI logs.
+        assert result.returncode == 1, (failing_script, result.stderr)
+        assert expected_error in result.stderr, failing_script
+
+
+def test_atom_logging_patch_is_idempotent(tmp_path: Path) -> None:
+    """Applying patch_atom_logging_controls twice must patch each site only once."""
+    setup_deps = Path(__file__).resolve().parents[2].joinpath(
+        "benchmarks", "multi_node", "amd_utils", "setup_deps.sh"
+    )
+    # Minimal stand-in for the atom package, created under tmp_path (on PYTHONPATH).
+    atom_dir = tmp_path / "atom"
+    logger_path = atom_dir / "utils" / "__init__.py"
+    api_path = atom_dir / "entrypoints" / "openai" / "api_server.py"
+    for module_path in (logger_path, api_path):
+        module_path.parent.mkdir(parents=True)
+    (atom_dir / "__init__.py").write_text("")
+    logger_path.write_text(
+        "import logging\n"
+        "import os\n\n"
+        'logger = logging.getLogger("atom")\n\n'
+        "def getLogger():\n"
+        "    global logger\n"
+        "    if not logger.handlers:\n"
+        "        logger.setLevel(logging.DEBUG)\n"
+        "        console_handler = logging.StreamHandler()\n"
+        "        console_handler.setLevel(logging.INFO)\n"
+        "        logger.addHandler(console_handler)\n"
+        "    return logger\n\n"
+        "logger = getLogger()\n"
+    )
+    api_path.write_text(
+        "def main(app, args, uvicorn):\n"
+        "    uvicorn.run(app, host=args.host, port=args.server_port)\n"
+    )
+    # Extract only the function under test from setup_deps.sh and call it twice.
+    script = r'''
+_SETUP_INSTALLED=()
+eval "$(sed -n '/^patch_atom_logging_controls()/,/^}/p' "$SETUP_DEPS")"
+patch_atom_logging_controls
+patch_atom_logging_controls
+'''
+    patch_env = {**os.environ, "PYTHONPATH": str(tmp_path)}
+    patch_env["SETUP_DEPS"] = str(setup_deps)
+    completed = subprocess.run(
+        ["bash", "-c", script],
+        env=patch_env,
+        check=True,
+        text=True,
+        capture_output=True,
+    )
+    patched_logger = logger_path.read_text()
+    patched_api = api_path.read_text()
+    for marker in (
+        'os.getenv("ATOM_LOG_LEVEL"',
+        "logger.setLevel(_atom_log_level)",
+        "console_handler.setLevel(_atom_log_level)",
+        "logger.propagate = False",
+    ):
+        assert patched_logger.count(marker) == 1
+    for marker in ("ATOM_UVICORN_LOG_LEVEL", "ATOM_UVICORN_ACCESS_LOG"):
+        assert patched_api.count(marker) == 1
+    summary = "engine=WARNING propagation=off uvicorn=warning access_log=off"
+    assert summary in completed.stdout
+    # The twice-patched modules must still compile as Python.
+    subprocess.run(
+        ["python3", "-m", "py_compile", str(logger_path), str(api_path)],
+        check=True,
+        text=True,
+        capture_output=True,
+    )
+
+
+def test_amd_servers_fail_when_eval_artifact_finalization_fails() -> None:
+    """Each AMD server script must contain the eval-artifact failure handling."""
+    amd_utils = Path(__file__).resolve().parents[2].joinpath(
+        "benchmarks", "multi_node", "amd_utils"
+    )
+    required_snippets = (
+        "append_lm_eval_summary; then",
+        'if ! _copy_lm_eval_artifacts /workspace "$EVAL_COPY_DIR"; then',
+        'echo "ERROR: failed to finalize eval artifacts"',
+        'echo "ERROR: failed to stage eval artifacts',
+    )
 
-    assert "-e EVAL_CONC\n" in contents
-    assert r"-e EVAL_CONC=\$EVAL_CONC" not in contents
+    for server_name in ("server_atom.sh", "server_sglang.sh", "server_vllm.sh"):
+        contents = (amd_utils / server_name).read_text()
+        # Checked in listed order, so the first missing snippet is the one reported.
+        for snippet in required_snippets:
+            assert snippet in contents
diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py
index f74ed267f..760f89294 100644
--- a/utils/evals/validate_scores.py
+++ b/utils/evals/validate_scores.py
@@ -28,13 +28,16 @@
 
 Usage:
     python3 utils/evals/validate_scores.py
+    python3 utils/evals/validate_scores.py --expected-concs "1 2 4 8"
     python3 utils/evals/validate_scores.py --thresholds my_thresholds.json
     python3 utils/evals/validate_scores.py --model-prefix dsv4
     python3 utils/evals/validate_scores.py --min-score 0.90  # flat fallback
 """
 import argparse
+from collections import Counter
 import glob
 import json
+import math
 import os
 import re
 import sys
@@ -43,6 +46,28 @@
 CONC_SUFFIX_RE = re.compile(r"_conc(\d+)(?:_\d+)?\.json$")
 
 
+def _validate_threshold_map(values: object, location: str) -> dict[str, float]:
+    """Validate one task-to-threshold mapping; raise ValueError on any bad entry."""
+    if not isinstance(values, dict):
+        raise ValueError(f"{location} must be a JSON object")
+
+    normalized = {}
+    for task, value in values.items():
+        if not isinstance(task, str) or not task:
+            raise ValueError(f"{location} contains an invalid task name")
+        # Range check rejects NaN/inf too; math.isfinite() overflows on huge ints.
+        if (
+            not isinstance(value, (int, float))
+            or isinstance(value, bool)
+            or not 0 <= value <= 1
+        ):
+            raise ValueError(
+                f"{location}.{task} must be a finite number between 0 and 1"
+            )
+        normalized[task] = float(value)
+    return normalized
+
+
 def load_config(path: str) -> dict:
     """Load thresholds config, normalized to {"default": {...}, "models": {...}}.
 
@@ -56,8 +81,34 @@ def load_config(path: str) -> dict:
         raise ValueError("thresholds config must be a JSON object")
     if "default" not in cfg and "models" not in cfg:
         # Legacy flat format: the whole object is the per-task default.
-        return {"default": cfg, "models": {}}
-    return {"default": cfg.get("default", {}), "models": cfg.get("models", {})}
+        return {
+            "default": _validate_threshold_map(cfg, "default"),
+            "models": {},
+        }
+
+    unknown_keys = sorted(set(cfg) - {"default", "models"})
+    if unknown_keys:
+        raise ValueError(
+            "thresholds config contains unsupported keys: "
+            + ", ".join(unknown_keys)
+        )
+
+    models = cfg.get("models", {})
+    if not isinstance(models, dict):
+        raise ValueError("models must be a JSON object")
+    normalized_models = {}
+    for prefix, thresholds in models.items():
+        if not isinstance(prefix, str) or not prefix:
+            raise ValueError("models contains an invalid model prefix")
+        normalized_models[prefix] = _validate_threshold_map(
+            thresholds,
+            f"models.{prefix}",
+        )
+
+    return {
+        "default": _validate_threshold_map(cfg.get("default", {}), "default"),
+        "models": normalized_models,
+    }
 
 
 def detect_model_prefix(meta_env_path: str, override: str | None) -> str | None:
@@ -67,7 +118,7 @@ def detect_model_prefix(meta_env_path: str, override: str | None) -> str | None:
     try:
         with open(meta_env_path) as f:
             prefix = json.load(f).get("infmax_model_prefix")
-        if prefix and prefix != "unknown":
+        if isinstance(prefix, str) and prefix and prefix != "unknown":
             return prefix
     except (json.JSONDecodeError, OSError, AttributeError):
         pass
@@ -88,68 +139,197 @@ def resolve_threshold(config: dict, prefix: str | None, task: str, fallback: flo
     return fallback, "min-score"
 
 
+def parse_expected_concs(raw_value: str | None) -> list[int] | None:
+    """Turn the workflow's space-separated concurrency string into unique ints."""
+    if raw_value is None:
+        return None
+    tokens = raw_value.split()
+    if not tokens:
+        raise ValueError("expected concurrency list is empty")
+
+    # Digits only, no sign and no leading zero, so int() below yields positives.
+    if any(re.fullmatch(r"[1-9][0-9]*", token) is None for token in tokens):
+        raise ValueError(
+            "expected concurrencies must be positive integers separated by spaces"
+        )
+    concs = list(map(int, tokens))
+    if len(concs) != len(set(concs)):
+        raise ValueError("expected concurrency list contains duplicates")
+    return concs
+
+
+def _is_valid_conc_list(
+    values: object,
+    *,
+    allow_empty: bool = True,
+) -> bool:
+    """Check that ``values`` is a list of distinct positive ints (bools excluded)."""
+    if not isinstance(values, list):
+        return False
+    if not values:
+        return bool(allow_empty)
+    for value in values:
+        # bool is an int subclass, so it has to be rejected explicitly.
+        if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+            return False
+    return len(values) == len(set(values))
+
+
 def validate_batch_manifest(
     meta_env_path: str,
     result_files: list[str],
+    expected_concs: list[int] | None = None,
 ) -> list[str]:
-    """Validate that a batched eval produced every requested concurrency."""
+    """Validate that eval artifacts cover every workflow-requested concurrency."""
+    errors = []
+    if expected_concs is not None and not _is_valid_conc_list(
+        expected_concs,
+        allow_empty=False,
+    ):
+        return ["workflow supplied an invalid expected concurrency list"]
+    if not result_files:
+        errors.append("eval produced no result files")
+
     try:
         with open(meta_env_path) as f:
             meta = json.load(f)
-    except (json.JSONDecodeError, OSError) as exc:
-        if any(
+        if not isinstance(meta, dict):
+            raise ValueError("metadata root must be a JSON object")
+    except (json.JSONDecodeError, OSError, ValueError) as exc:
+        if expected_concs is not None or any(
             CONC_SUFFIX_RE.search(Path(result_file).name)
             for result_file in result_files
         ):
-            return [
-                "batched eval result files exist but "
-                f"{meta_env_path} is unavailable or invalid: {exc}"
-            ]
-        return []
-
-    if "eval_concs" not in meta:
-        return []
-
-    expected = meta.get("eval_concs")
-    completed = meta.get("completed_eval_concs")
-    failed = meta.get("failed_eval_concs")
-    if not all(isinstance(values, list) for values in (expected, completed, failed)):
-        return ["batched eval metadata must contain list-valued concurrency fields"]
-    if not all(
-        isinstance(value, int) and value > 0
-        for values in (expected, completed, failed)
-        for value in values
-    ):
-        return ["batched eval metadata contains an invalid concurrency"]
+            errors.append(
+                f"eval metadata {meta_env_path} is unavailable or invalid: {exc}"
+            )
+        return errors
 
-    errors = []
-    expected_set = set(expected)
-    completed_set = set(completed)
-    failed_set = set(failed)
-    if len(expected_set) != len(expected):
-        errors.append("batched eval metadata contains duplicate expected concurrencies")
-    if len(completed_set) != len(completed):
-        errors.append("batched eval metadata contains duplicate completed concurrencies")
-    if failed_set:
+    eval_exit_code = meta.get("eval_exit_code")
+    if expected_concs is not None and (
+        not isinstance(eval_exit_code, int)
+        or isinstance(eval_exit_code, bool)
+        or eval_exit_code < 0
+    ):
         errors.append(
-            "batched eval failed for concurrency: "
-            + ", ".join(str(value) for value in sorted(failed_set))
+            "eval metadata must contain a non-negative integer eval_exit_code"
         )
-    if completed_set != expected_set:
-        missing = sorted(expected_set - completed_set)
-        unexpected = sorted(completed_set - expected_set)
-        if missing:
+    elif "eval_exit_code" in meta and (
+        not isinstance(eval_exit_code, int)
+        or isinstance(eval_exit_code, bool)
+        or eval_exit_code < 0
+    ):
+        errors.append("eval metadata contains an invalid eval_exit_code")
+    elif isinstance(eval_exit_code, int) and eval_exit_code != 0:
+        errors.append(f"eval command failed with exit code {eval_exit_code}")
+
+    metadata_expected = meta.get("eval_concs")
+    metadata_is_batched = "eval_concs" in meta
+    if expected_concs is None and not metadata_is_batched:
+        if any(
+            CONC_SUFFIX_RE.search(Path(result_file).name)
+            for result_file in result_files
+        ):
             errors.append(
-                "batched eval is missing completed concurrency: "
-                + ", ".join(str(value) for value in missing)
+                "concurrency-suffixed eval results exist but batched metadata is missing"
             )
-        if unexpected:
+        return errors
+
+    if expected_concs is not None and len(expected_concs) == 1 and not metadata_is_batched:
+        metadata_conc = meta.get("conc")
+        if (
+            not isinstance(metadata_conc, int)
+            or isinstance(metadata_conc, bool)
+            or metadata_conc != expected_concs[0]
+        ):
+            errors.append(
+                "eval metadata concurrency "
+                f"{metadata_conc!r} does not match workflow request "
+                f"{expected_concs[0]}"
+            )
+        if len(result_files) != 1:
+            errors.append(
+                "non-batched eval must produce exactly one result file; "
+                f"found {len(result_files)}"
+            )
+        suffixed_results = [
+            result_file
+            for result_file in result_files
+            if CONC_SUFFIX_RE.search(Path(result_file).name)
+        ]
+        if suffixed_results:
+            errors.append(
+                "non-batched eval produced concurrency-suffixed result files"
+            )
+        return errors
+
+    if not metadata_is_batched:
+        errors.append(
+            "workflow requested multiple eval concurrencies but batched metadata is missing"
+        )
+        expected_set = set(expected_concs or [])
+    else:
+        completed = meta.get("completed_eval_concs")
+        failed = meta.get("failed_eval_concs")
+        if not (
+            _is_valid_conc_list(metadata_expected, allow_empty=False)
+            and _is_valid_conc_list(completed)
+            and _is_valid_conc_list(failed)
+        ):
             errors.append(
-                "batched eval completed unexpected concurrency: "
-                + ", ".join(str(value) for value in unexpected)
+                "batched eval metadata must contain unique, positive-integer "
+                "concurrency lists"
             )
+            expected_set = set(expected_concs or [])
+        else:
+            metadata_expected_set = set(metadata_expected)
+            expected_set = set(expected_concs or metadata_expected)
+            completed_set = set(completed)
+            failed_set = set(failed)
+            overlap = sorted(completed_set & failed_set)
+            if overlap:
+                errors.append(
+                    "batched eval metadata marks concurrency as both completed "
+                    "and failed: "
+                    + ", ".join(str(value) for value in overlap)
+                )
+
+            if (
+                expected_concs is not None
+                and metadata_expected_set != expected_set
+            ):
+                missing = sorted(expected_set - metadata_expected_set)
+                unexpected = sorted(metadata_expected_set - expected_set)
+                if missing:
+                    errors.append(
+                        "batched eval metadata is missing workflow concurrency: "
+                        + ", ".join(str(value) for value in missing)
+                    )
+                if unexpected:
+                    errors.append(
+                        "batched eval metadata has unexpected concurrency: "
+                        + ", ".join(str(value) for value in unexpected)
+                    )
+            if failed_set:
+                errors.append(
+                    "batched eval failed for concurrency: "
+                    + ", ".join(str(value) for value in sorted(failed_set))
+                )
+            if completed_set != expected_set:
+                missing = sorted(expected_set - completed_set)
+                unexpected = sorted(completed_set - expected_set)
+                if missing:
+                    errors.append(
+                        "batched eval is missing completed concurrency: "
+                        + ", ".join(str(value) for value in missing)
+                    )
+                if unexpected:
+                    errors.append(
+                        "batched eval completed unexpected concurrency: "
+                        + ", ".join(str(value) for value in unexpected)
+                    )
 
-    actual_concs = set()
+    actual_conc_counts = Counter()
     for result_file in result_files:
         match = CONC_SUFFIX_RE.search(Path(result_file).name)
         if match is None:
@@ -157,8 +337,18 @@ def validate_batch_manifest(
                 f"batched eval result lacks a concurrency suffix: {result_file}"
             )
             continue
-        actual_concs.add(int(match.group(1)))
+        actual_conc_counts[int(match.group(1))] += 1
 
+    duplicate_results = sorted(
+        conc for conc, count in actual_conc_counts.items() if count > 1
+    )
+    if duplicate_results:
+        errors.append(
+            "batched eval has duplicate result files for concurrency: "
+            + ", ".join(str(value) for value in duplicate_results)
+        )
+
+    actual_concs = set(actual_conc_counts)
     missing_results = sorted(expected_set - actual_concs)
     unexpected_results = sorted(actual_concs - expected_set)
     if missing_results:
@@ -200,21 +390,43 @@ def main() -> int:
         "--results-glob", default="results*.json",
         help="Glob pattern for result files (default: 'results*.json')",
     )
+    parser.add_argument(
+        "--expected-concs",
+        default=None,
+        help=(
+            "Space-separated concurrencies requested by the workflow. When set, "
+            "metadata and result coverage must match exactly."
+        ),
+    )
     args = parser.parse_args()
 
+    try:
+        expected_concs = parse_expected_concs(args.expected_concs)
+    except ValueError as exc:
+        print(f"FAIL: {exc}", file=sys.stderr)
+        return 1
+
+    if not math.isfinite(args.min_score) or not 0 <= args.min_score <= 1:
+        print(
+            "FAIL: --min-score must be a finite number between 0 and 1",
+            file=sys.stderr,
+        )
+        return 1
+    if not args.metric_prefix:
+        print("FAIL: --metric-prefix must not be empty", file=sys.stderr)
+        return 1
+
     # Load thresholds config
-    config = {"default": {}, "models": {}}
-    thresholds_path = args.thresholds
-    if thresholds_path is None:
-        default_path = Path(__file__).parent / "thresholds.json"
-        if default_path.exists():
-            thresholds_path = str(default_path)
-    if thresholds_path:
-        try:
-            config = load_config(thresholds_path)
-            print(f"Loaded thresholds from {thresholds_path}")
-        except (json.JSONDecodeError, OSError, ValueError) as e:
-            print(f"WARN: could not load thresholds from {thresholds_path}: {e}", file=sys.stderr)
+    thresholds_path = args.thresholds or str(Path(__file__).parent / "thresholds.json")
+    try:
+        config = load_config(thresholds_path)
+        print(f"Loaded thresholds from {thresholds_path}")
+    except (json.JSONDecodeError, OSError, ValueError) as e:
+        print(
+            f"FAIL: could not load thresholds from {thresholds_path}: {e}",
+            file=sys.stderr,
+        )
+        return 1
 
     # Identify the model so per-model thresholds can apply
     prefix = detect_model_prefix(args.meta_env, args.model_prefix)
@@ -228,45 +440,146 @@ def main() -> int:
     failed = False
     checked = 0
     result_files = sorted(glob.glob(args.results_glob))
+    expected_metric_set: set[tuple[str, str]] | None = None
+    expected_metric_source: str | None = None
 
-    manifest_errors = validate_batch_manifest(args.meta_env, result_files)
+    manifest_errors = validate_batch_manifest(
+        args.meta_env,
+        result_files,
+        expected_concs=expected_concs,
+    )
     for error in manifest_errors:
         print(f"FAIL: {error}", file=sys.stderr)
         failed = True
     if not manifest_errors:
-        try:
-            with open(args.meta_env) as f:
-                if "eval_concs" in json.load(f):
-                    print("PASS: batched eval produced every requested concurrency")
-        except (json.JSONDecodeError, OSError) as exc:
+        if expected_concs is not None:
             print(
-                "WARN: could not inspect eval metadata for batched concurrency "
-                f"status: {exc}",
-                file=sys.stderr,
+                "PASS: eval produced every requested concurrency: "
+                + ", ".join(str(value) for value in expected_concs)
             )
+        else:
+            try:
+                with open(args.meta_env) as f:
+                    if "eval_concs" in json.load(f):
+                        print("PASS: batched eval produced every requested concurrency")
+            except (json.JSONDecodeError, OSError) as exc:
+                print(
+                    "WARN: could not inspect eval metadata for batched concurrency "
+                    f"status: {exc}",
+                    file=sys.stderr,
+                )
 
     for f in result_files:
-        with open(f) as fh:
-            data = json.load(fh)
-        for task, metrics in data.get("results", {}).items():
+        try:
+            with open(f) as fh:
+                data = json.load(fh)
+        except (json.JSONDecodeError, OSError) as exc:
+            print(f"FAIL: could not read eval result {f}: {exc}", file=sys.stderr)
+            failed = True
+            continue
+
+        file_checked = 0
+        file_metric_set: set[tuple[str, str]] = set()
+        results = data.get("results", {}) if isinstance(data, dict) else {}
+        if not isinstance(results, dict):
+            results = {}
+        for task, metrics in results.items():
+            if not isinstance(metrics, dict):
+                print(
+                    f"FAIL: {Path(f).name}: {task} result is not a JSON object",
+                    file=sys.stderr,
+                )
+                failed = True
+                continue
             min_score, source = resolve_threshold(config, prefix, task, args.min_score)
+            task_has_metric = False
+            task_checked = 0
             for name, val in metrics.items():
                 if not name.startswith(args.metric_prefix) or "stderr" in name:
                     continue
-                if not isinstance(val, (int, float)):
+                task_has_metric = True
+                file_metric_set.add((task, name))
+                if not isinstance(val, (int, float)) or isinstance(val, bool):
+                    print(
+                        f"FAIL: {Path(f).name}: {task} {name} has non-numeric "
+                        f"value {val!r}",
+                        file=sys.stderr,
+                    )
+                    failed = True
                     continue
                 checked += 1
-                if val < min_score:
+                file_checked += 1
+                task_checked += 1
+                if not math.isfinite(val):
+                    print(
+                        f"FAIL: {Path(f).name}: {task} {name} is not finite",
+                        file=sys.stderr,
+                    )
+                    failed = True
+                elif not 0 <= val <= 1:
                     print(
-                        f"FAIL: {task} {name} = {val:.4f} (< {min_score} from {source})",
+                        f"FAIL: {Path(f).name}: {task} {name} = {val:.4f} "
+                        "is outside [0, 1]",
+                        file=sys.stderr,
+                    )
+                    failed = True
+                elif val < min_score:
+                    print(
+                        f"FAIL: {Path(f).name}: {task} {name} = {val:.4f} "
+                        f"(< {min_score} from {source})",
                         file=sys.stderr,
                     )
                     failed = True
                 else:
-                    print(f"PASS: {task} {name} = {val:.4f} (>= {min_score} from {source})")
+                    print(
+                        f"PASS: {Path(f).name}: {task} {name} = {val:.4f} "
+                        f"(>= {min_score} from {source})"
+                    )
+            if not task_has_metric:
+                print(
+                    f"FAIL: {Path(f).name}: {task} has no metric matching "
+                    f"prefix {args.metric_prefix!r}",
+                    file=sys.stderr,
+                )
+                failed = True
+            elif task_checked == 0:
+                failed = True
+        if file_checked == 0:
+            print(
+                f"FAIL: {Path(f).name} has no numeric metrics matching "
+                f"prefix {args.metric_prefix!r}",
+                file=sys.stderr,
+            )
+            failed = True
+        elif expected_metric_set is None:
+            expected_metric_set = file_metric_set
+            expected_metric_source = Path(f).name
+        elif file_metric_set != expected_metric_set:
+            missing_metrics = sorted(expected_metric_set - file_metric_set)
+            unexpected_metrics = sorted(file_metric_set - expected_metric_set)
+            if missing_metrics:
+                print(
+                    f"FAIL: {Path(f).name} is missing metrics present in "
+                    f"{expected_metric_source}: "
+                    + ", ".join(f"{task}/{metric}" for task, metric in missing_metrics),
+                    file=sys.stderr,
+                )
+            if unexpected_metrics:
+                print(
+                    f"FAIL: {Path(f).name} has unexpected metrics compared with "
+                    f"{expected_metric_source}: "
+                    + ", ".join(
+                        f"{task}/{metric}" for task, metric in unexpected_metrics
+                    ),
+                    file=sys.stderr,
+                )
+            failed = True
 
     if checked == 0:
-        print("WARN: no metrics matched prefix '{}'".format(args.metric_prefix), file=sys.stderr)
+        print(
+            "FAIL: no metrics matched prefix '{}'".format(args.metric_prefix),
+            file=sys.stderr,
+        )
 
     return 1 if (failed or checked == 0) else 0
 
diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py
index 0f5bc4424..53d70fece 100644
--- a/utils/matrix_logic/test_validation.py
+++ b/utils/matrix_logic/test_validation.py
@@ -276,11 +276,16 @@ def test_valid_entry(self, valid_single_node_matrix_entry):
         assert entry.conc == 4
         assert entry.framework == "sglang"
 
-    def test_conc_as_list(self, valid_single_node_matrix_entry):
-        """Conc can be a list of integers."""
+    def test_conc_as_list_is_rejected(self, valid_single_node_matrix_entry):
+        """Single-node workflow entries require one concurrency integer."""
         valid_single_node_matrix_entry["conc"] = [4, 8, 16, 32, 64]
-        entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry)
-        assert entry.conc == [4, 8, 16, 32, 64]
+        with pytest.raises(ValueError):  # pydantic's ValidationError is a ValueError
+            SingleNodeMatrixEntry(**valid_single_node_matrix_entry)
+
+    def test_eval_only_requires_run_eval(self, valid_single_node_matrix_entry):
+        valid_single_node_matrix_entry["eval-only"] = True
+        with pytest.raises(ValueError):  # assumes the fixture has run-eval false
+            SingleNodeMatrixEntry(**valid_single_node_matrix_entry)
 
     def test_spec_decoding_values(self, valid_single_node_matrix_entry):
         """Spec decoding should accept valid literal values."""
@@ -389,12 +394,34 @@ def test_all_eval_concurrency_batch_marker(
         self,
         valid_multinode_matrix_entry,
     ):
+        valid_multinode_matrix_entry["run-eval"] = True
         valid_multinode_matrix_entry["eval-all-concs"] = True
 
         entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry)
 
         assert entry.eval_all_concs is True
 
+    def test_eval_conc_must_be_in_conc_list(
+        self,
+        valid_multinode_matrix_entry,
+    ):
+        """eval-conc must be rejected when it is not one of the entry's conc values."""
+        valid_multinode_matrix_entry["run-eval"] = True
+        valid_multinode_matrix_entry["eval-conc"] = 64  # presumably not in fixture conc
+        with pytest.raises(ValueError):  # pydantic's ValidationError is a ValueError
+            MultiNodeMatrixEntry(**valid_multinode_matrix_entry)
+
+    def test_eval_all_concs_rejects_eval_conc(
+        self,
+        valid_multinode_matrix_entry,
+    ):
+        """eval-all-concs and an explicit eval-conc must not be accepted together."""
+        valid_multinode_matrix_entry["run-eval"] = True
+        valid_multinode_matrix_entry["eval-all-concs"] = True
+        valid_multinode_matrix_entry["eval-conc"] = 2150
+        with pytest.raises(ValueError):  # pydantic's ValidationError is a ValueError
+            MultiNodeMatrixEntry(**valid_multinode_matrix_entry)
+
     def test_conc_must_be_list(self, valid_multinode_matrix_entry):
         """Conc must be a list for multinode."""
         valid_multinode_matrix_entry["conc"] = 2150  # Single int, not list
diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py
index 2b7bf3baf..4a140d6f0 100644
--- a/utils/matrix_logic/validation.py
+++ b/utils/matrix_logic/validation.py
@@ -96,13 +96,19 @@ class SingleNodeMatrixEntry(BaseModel):
     tp: int
     ep: int
     dp_attn: bool = Field(alias=Fields.DP_ATTN.value)
-    conc: Union[int, List[int]]
+    conc: int = Field(gt=0)
     max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value)
     exp_name: str = Field(alias=Fields.EXP_NAME.value)
     disagg: bool
     run_eval: bool = Field(alias=Fields.RUN_EVAL.value)
     eval_only: bool = Field(alias=Fields.EVAL_ONLY.value, default=False)
 
+    @model_validator(mode='after')
+    def validate_eval_fields(self):  # rejects eval-only unless run-eval is also set
+        if self.eval_only and not self.run_eval:
+            raise ValueError("eval-only requires run-eval=true")
+        return self
+
 
 class WorkerConfig(BaseModel):
     """Pydantic model for validating worker configuration in multinode entries."""
@@ -134,7 +140,7 @@ class MultiNodeMatrixEntry(BaseModel):
     osl: int
     prefill: WorkerConfig
     decode: WorkerConfig
-    conc: List[int]
+    conc: List[int] = Field(min_length=1)
     max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value)
     exp_name: str = Field(alias=Fields.EXP_NAME.value)
     disagg: bool
@@ -145,6 +151,26 @@ class MultiNodeMatrixEntry(BaseModel):
         default=False, alias=Fields.EVAL_ALL_CONCS.value
     )
 
+    @model_validator(mode='after')
+    def validate_eval_fields(self):
+        """Cross-check conc values and eval flags, raising on conflict."""
+        pins_conc = self.eval_conc is not None
+        if any(value <= 0 for value in self.conc):
+            raise ValueError("conc entries must be greater than 0")
+        if len(self.conc) != len(set(self.conc)):
+            raise ValueError("conc entries must be unique")
+        if self.eval_only and not self.run_eval:
+            raise ValueError("eval-only requires run-eval=true")
+        if pins_conc and not self.run_eval:
+            raise ValueError("eval-conc requires run-eval=true")
+        if pins_conc and self.eval_conc not in self.conc:
+            raise ValueError("eval-conc must be present in conc")
+        if self.eval_all_concs and not self.run_eval:
+            raise ValueError("eval-all-concs requires run-eval=true")
+        if self.eval_all_concs and pins_conc:
+            raise ValueError("eval-all-concs cannot be combined with eval-conc")
+        return self
+
 
 class SingleNodeAgenticMatrixEntry(BaseModel):
     """Pydantic model for validating single-node agentic coding matrix entries."""
@@ -273,6 +299,10 @@ def _validate_conc_fields(self):
             raise ValueError(
                 f"Input '{Fields.CONC_LIST.value}' entries must be greater than 0."
             )
+        if len(set(self.conc_list)) != len(self.conc_list):
+            raise ValueError(
+                f"Input '{Fields.CONC_LIST.value}' entries must be unique."
+            )
 
     return self
 
diff --git a/utils/test_collect_eval_results.py b/utils/test_collect_eval_results.py
index 019bbdf12..f66b795aa 100644
--- a/utils/test_collect_eval_results.py
+++ b/utils/test_collect_eval_results.py
@@ -3,7 +3,11 @@
 import json
 from pathlib import Path
 
-from collect_eval_results import build_row, collect_eval_rows
+from collect_eval_results import (
+    build_row,
+    collect_eval_rows,
+    extract_lm_metrics,
+)
 
 
 def test_build_row_preserves_sequence_lengths() -> None:
@@ -23,6 +27,52 @@ def test_build_row_preserves_sequence_lengths() -> None:
     assert row["osl"] == 1024
 
 
+def test_build_row_uses_flexible_metric_as_primary_score() -> None:
+    """A result carrying only flex metrics is scored as ``em_flexible``."""
+    meta = {
+        "infmax_model_prefix": "test",
+        "hw": "h100",
+        "framework": "vllm",
+        "precision": "fp8",
+    }
+    result = {
+        "task": "gpqa_diamond_cot_n_shot",
+        "flex": 0.42,
+        "flex_se": 0.02,
+    }
+
+    row = build_row(meta, result)
+
+    expected = {"score": 0.42, "score_name": "em_flexible", "score_se": 0.02}
+    assert {key: row[key] for key in expected} == expected
+
+
+def test_extract_lm_metrics_supports_default_none_filter(
+    tmp_path: Path,
+) -> None:
+    """Extract accuracy when metric keys use the ``,none`` filter suffix.
+
+    The task config lists ``acc`` as its metric with an empty ``filter_list``,
+    and the results payload stores the values as ``acc,none`` and
+    ``acc_stderr,none``.
+    """
+    task_results = {"acc,none": 0.75, "acc_stderr,none": 0.03}
+    task_config = {"metric_list": [{"metric": "acc"}], "filter_list": []}
+    payload = {
+        "lm_eval_version": "0.4.9",
+        "results": {"multiple_choice": task_results},
+        "configs": {"multiple_choice": task_config},
+    }
+    result_path = tmp_path / "results_accuracy.json"
+    result_path.write_text(json.dumps(payload))
+
+    metrics = extract_lm_metrics(result_path)
+
+    first_metric = metrics[0]
+    assert first_metric["accuracy"] == 0.75
+    assert first_metric["accuracy_se"] == 0.03
+
+
 def _write_lm_eval_result(path: Path, score: float) -> None:
     path.write_text(json.dumps({
         "lm_eval_version": "0.4.0",
@@ -63,6 +113,7 @@ def test_collect_eval_rows_expands_batched_concurrencies(
         "decode_tp": 8,
         "decode_ep": 1,
         "decode_num_workers": 2,
+        "eval_exit_code": 0,
         "eval_concs": [4, 16],
         "completed_eval_concs": [4, 16],
         "failed_eval_concs": [],
@@ -83,13 +134,14 @@ def test_collect_eval_rows_expands_batched_concurrencies(
     assert [row["score"] for row in rows] == [0.90, 0.91]
 
 
-def test_collect_eval_rows_ignores_failed_batch_points(
+def test_collect_eval_rows_rejects_failed_batch(
     tmp_path: Path,
 ) -> None:
     artifact_dir = tmp_path / "eval_batch"
     artifact_dir.mkdir()
     (artifact_dir / "meta_env.json").write_text(json.dumps({
         "is_multinode": True,
+        "eval_exit_code": 1,
         "eval_concs": [4, 16],
         "completed_eval_concs": [4],
         "failed_eval_concs": [16],
@@ -106,4 +158,59 @@ def test_collect_eval_rows_ignores_failed_batch_points(
 
     rows = collect_eval_rows(tmp_path)
 
-    assert [row["conc"] for row in rows] == [4]
+    assert rows == []
+
+
+def test_collect_eval_rows_ignores_failed_single_eval(tmp_path: Path) -> None:
+    """A non-zero eval_exit_code yields no rows even with a result file."""
+    failed_dir = tmp_path / "eval_failed"
+    failed_dir.mkdir()
+    meta = {"is_multinode": False, "eval_exit_code": 7, "conc": 4}
+    (failed_dir / "meta_env.json").write_text(json.dumps(meta))
+    _write_lm_eval_result(failed_dir / "results_test.json", 0.99)
+
+    rows = collect_eval_rows(tmp_path)
+
+    assert rows == []
+
+
+def test_collect_eval_rows_rejects_inconsistent_batch_metadata(
+    tmp_path: Path,
+) -> None:
+    """Conflicting completed/failed lists reject a batch with all results."""
+    artifact_dir = tmp_path / "eval_batch"
+    artifact_dir.mkdir()
+    (artifact_dir / "meta_env.json").write_text(json.dumps({
+        "is_multinode": True,
+        "eval_exit_code": 0,
+        "eval_concs": [4, 16],
+        "completed_eval_concs": [4, 16],
+        "failed_eval_concs": [16],
+        "conc": 4,
+    }))
+    for conc in (4, 16):
+        result_name = f"results_test_conc{conc}.json"
+        _write_lm_eval_result(artifact_dir / result_name, 0.90)
+
+    assert collect_eval_rows(tmp_path) == []
+
+
+def test_collect_eval_rows_rejects_incomplete_batch_results(
+    tmp_path: Path,
+) -> None:
+    """A batch missing one declared result file contributes no rows."""
+    batch_dir = tmp_path / "eval_batch"
+    batch_dir.mkdir()
+    meta = {
+        "is_multinode": True,
+        "eval_exit_code": 0,
+        "eval_concs": [4, 16],
+        "completed_eval_concs": [4, 16],
+        "failed_eval_concs": [],
+        "conc": 4,
+    }
+    (batch_dir / "meta_env.json").write_text(json.dumps(meta))
+    # Conc 16 is reported as completed, but only conc 4 has a result file.
+    _write_lm_eval_result(batch_dir / "results_test_conc4.json", 0.90)
+
+    assert collect_eval_rows(tmp_path) == []
diff --git a/utils/test_process_agentic_result.py b/utils/test_process_agentic_result.py
index 38477b62a..e21e96d61 100644
--- a/utils/test_process_agentic_result.py
+++ b/utils/test_process_agentic_result.py
@@ -21,6 +21,8 @@
 
 import pytest
 
+from process_agentic_result import _HF_DATASET
+
 
 PROCESSOR = Path(__file__).resolve().parent / "process_agentic_result.py"
 
@@ -405,7 +407,8 @@ def test_processor_loads_traces_jsonl_for_theoretical_cache(tmp_path: Path):
     # Build a fake HF cache with traces.jsonl matching the conv_ids the
     # fixture references (trace-A, trace-B).
     hf_cache = tmp_path / "_hf"
-    snapshot = hf_cache / "datasets--semianalysisai--cc-traces-weka-042026" / "snapshots" / "abc"
+    org, name = _HF_DATASET.split("/", 1)
+    snapshot = hf_cache / f"datasets--{org}--{name}" / "snapshots" / "abc"
     snapshot.mkdir(parents=True)
     # Real corpus uses the ``out`` alias (Pydantic's external name for
     # output_length). Mix both to verify the loader accepts either.
diff --git a/utils/test_validate_reusable_sweep_artifacts.py b/utils/test_validate_reusable_sweep_artifacts.py
index 318724e4c..be10a5295 100644
--- a/utils/test_validate_reusable_sweep_artifacts.py
+++ b/utils/test_validate_reusable_sweep_artifacts.py
@@ -72,6 +72,8 @@ def single_eval_result(
         "dp_attention": False,
         "conc": conc,
         "task": "gsm8k",
+        "score": 0.95,
+        "score_name": "em_strict",
     }
 
 
@@ -83,6 +85,7 @@ def single_eval_meta(
 ) -> dict:
     row = single_eval_result(conc, runner, isl, osl)
     row["infmax_model_prefix"] = row.pop("model_prefix")
+    row["eval_exit_code"] = 0
     return row
 
 
@@ -100,6 +103,9 @@ def write_raw_eval_artifact(
     (artifact_dir / "meta_env.json").write_text(
         json.dumps(single_eval_meta(conc, logical_runner, isl, osl))
     )
+    (artifact_dir / "results_test.json").write_text(
+        json.dumps({"lm_eval_version": "0.4.0", "results": {}})
+    )
 
 
 def multinode_eval_entry(concs: list[int]) -> dict:
@@ -149,6 +155,8 @@ def multinode_eval_result(conc: int) -> dict:
         "decode_num_workers": 2,
         "conc": conc,
         "task": "gsm8k",
+        "score": 0.95,
+        "score_name": "em_strict",
     }
 
 
@@ -163,7 +171,12 @@ def write_raw_batched_eval_artifact(
     meta["eval_concs"] = concs
     meta["completed_eval_concs"] = concs
     meta["failed_eval_concs"] = []
+    meta["eval_exit_code"] = 0
     (artifact_dir / "meta_env.json").write_text(json.dumps(meta))
+    for conc in concs:
+        (artifact_dir / f"results_test_conc{conc}.json").write_text(
+            json.dumps({"lm_eval_version": "0.4.0", "results": {}})
+        )
 
 
 def single_fixed_entry(conc: int) -> dict:
@@ -439,6 +452,97 @@ def test_eval_validation_expands_one_batched_multinode_artifact(
     assert validate_eval_artifacts(tmp_path, expected) == []
 
 
+def test_eval_validation_rejects_failed_raw_eval_status(tmp_path: Path) -> None:
+    """A raw artifact recording a non-zero eval_exit_code is reported."""
+    config = {"evals": [single_eval_entry(32)], "multinode_evals": []}
+    write_eval_aggregate(tmp_path, [single_eval_result(32)])
+    write_raw_eval_artifact(tmp_path, 32)
+    meta_path = next(tmp_path.glob("eval_*/meta_env.json"))
+    patched = {**json.loads(meta_path.read_text()), "eval_exit_code": 7}
+    meta_path.write_text(json.dumps(patched))
+
+    errors = validate_eval_artifacts(tmp_path, expected_eval_keys(config))
+
+    assert any("eval_exit_code 7" in message for message in errors)
+
+
+def test_eval_validation_rejects_incomplete_batch_metadata(
+    tmp_path: Path,
+) -> None:
+    """A manifest listing a failed concurrency is reported as incomplete."""
+    concs = [4, 16]
+    config = {
+        "evals": [],
+        "multinode_evals": [multinode_eval_entry(concs)],
+    }
+    aggregate_rows = [multinode_eval_result(conc) for conc in concs]
+    write_eval_aggregate(tmp_path, aggregate_rows)
+    write_raw_batched_eval_artifact(tmp_path, concs)
+    # Rewrite the manifest so conc 16 is marked failed instead of completed.
+    meta_path = tmp_path / "eval_gptoss_8k1k_batch" / "meta_env.json"
+    meta = json.loads(meta_path.read_text())
+    meta.update(completed_eval_concs=[4], failed_eval_concs=[16])
+    meta_path.write_text(json.dumps(meta))
+
+    errors = validate_eval_artifacts(tmp_path, expected_eval_keys(config))
+
+    expected = "incomplete batched eval results"
+    assert any(expected in message for message in errors)
+
+
+def test_eval_validation_rejects_missing_raw_batch_result(
+    tmp_path: Path,
+) -> None:
+    """Report a batched artifact that lacks one declared result file.
+
+    The manifest still claims both concurrencies completed.
+    """
+    concs = [4, 16]
+    config = {
+        "evals": [],
+        "multinode_evals": [multinode_eval_entry(concs)],
+    }
+    aggregate_rows = [multinode_eval_result(conc) for conc in concs]
+    write_eval_aggregate(tmp_path, aggregate_rows)
+    write_raw_batched_eval_artifact(tmp_path, concs)
+    # Delete the conc-16 result after the helper has written both files.
+    missing_result = (
+        tmp_path / "eval_gptoss_8k1k_batch" / "results_test_conc16.json"
+    )
+    missing_result.unlink()
+
+    errors = validate_eval_artifacts(tmp_path, expected_eval_keys(config))
+
+    expected = "missing result files for concurrency: 16"
+    assert any(expected in message for message in errors)
+
+
+def test_eval_validation_rejects_missing_aggregate_score(tmp_path: Path) -> None:
+    """An aggregate row whose score is None is reported as invalid."""
+    config = {"evals": [single_eval_entry(32)], "multinode_evals": []}
+    scoreless_row = {**single_eval_result(32), "score": None}
+    write_eval_aggregate(tmp_path, [scoreless_row])
+    write_raw_eval_artifact(tmp_path, 32)
+
+    errors = validate_eval_artifacts(tmp_path, expected_eval_keys(config))
+
+    assert any("contains invalid score None" in message for message in errors)
+
+
+def test_eval_validation_rejects_below_threshold_aggregate_score(
+    tmp_path: Path,
+) -> None:
+    """A 0.80 aggregate score is reported as below the configured 0.91."""
+    config = {"evals": [single_eval_entry(32)], "multinode_evals": []}
+    low_row = {**single_eval_result(32), "score": 0.80}
+    write_eval_aggregate(tmp_path, [low_row])
+    write_raw_eval_artifact(tmp_path, 32)
+
+    errors = validate_eval_artifacts(tmp_path, expected_eval_keys(config))
+
+    assert any("is below 0.91 from models.gptoss" in error for error in errors)
+
+
 def test_eval_aggregate_validation_is_exact(tmp_path: Path) -> None:
     config = {
         "evals": [single_eval_entry(32)],
diff --git a/utils/validate_reusable_sweep_artifacts.py b/utils/validate_reusable_sweep_artifacts.py
index bbd64d174..ff97d923f 100644
--- a/utils/validate_reusable_sweep_artifacts.py
+++ b/utils/validate_reusable_sweep_artifacts.py
@@ -6,13 +6,23 @@
 import argparse
 import csv
 import json
+import math
 import sys
 from collections import Counter
 from pathlib import Path
 from typing import Any, Iterable
 
+from evals.validate_scores import (
+    load_config,
+    resolve_threshold,
+    validate_batch_manifest,
+)
+
 
 FIXED_SEQ_KEYS = ("1k1k", "8k1k")
+EVAL_THRESHOLD_CONFIG = load_config(
+    str(Path(__file__).resolve().parent / "evals" / "thresholds.json")
+)
 
 
 def as_bool(value: Any) -> bool:
@@ -30,6 +40,23 @@ def as_int(value: Any, default: int = 0) -> int:
         return default
 
 
+def valid_concurrency_list(
+    value: object,
+    *,
+    allow_empty: bool = True,
+) -> bool:
+    """Tell whether ``value`` is a list of distinct positive integers.
+
+    Bools are rejected; an empty list passes only when ``allow_empty``.
+    """
+    if not isinstance(value, list) or not (allow_empty or value):
+        return False
+    for item in value:
+        if not isinstance(item, int) or isinstance(item, bool) or item <= 0:
+            return False
+    return len(set(value)) == len(value)
+
+
 def load_json(path: Path) -> Any:
     """Load a JSON file."""
     with open(path) as handle:
@@ -512,6 +539,40 @@ def eval_key(row: dict[str, Any]) -> tuple[Any, ...]:
     )
 
 
+def validate_eval_score(path: Path, row: dict[str, Any]) -> list[str]:
+    """Check the primary score of one aggregate eval row.
+
+    Returns a single-message list when the row has no task, when its score
+    is not a finite non-boolean number within [0, 1], or when the score is
+    below the threshold resolved for its model prefix and task; otherwise
+    returns an empty list.
+    """
+    task, score = row.get("task"), row.get("score")
+    if not (isinstance(task, str) and task):
+        return [f"eval aggregate {path.name!r} contains a row without a task"]
+    is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
+    if not (is_number and math.isfinite(score) and 0 <= score <= 1):
+        return [
+            f"eval aggregate {path.name!r} contains invalid score "
+            f"{score!r} for task {task!r}"
+        ]
+
+    model_prefix = row.get("model_prefix", row.get("infmax_model_prefix"))
+    if not (isinstance(model_prefix, str) and model_prefix):
+        model_prefix = None
+    # NOTE(review): 0.85 looks like the default threshold used when the
+    # config has no match - confirm against resolve_threshold.
+    minimum, origin = resolve_threshold(
+        EVAL_THRESHOLD_CONFIG, model_prefix, task, 0.85
+    )
+    if score < minimum:
+        return [
+            f"eval aggregate {path.name!r} score {score:.4f} for task "
+            f"{task!r} is below {minimum} from {origin}"
+        ]
+    return []
+
+
 def raw_eval_artifact_dirs(artifacts_dir: Path) -> list[Path]:
     """Return raw eval result artifacts, excluding aggregate and debug artifacts."""
     return sorted(
@@ -552,19 +613,87 @@ def raw_eval_key_rows(
                 "meta_env.json"
             )
             continue
-        eval_concs = meta.get("completed_eval_concs")
-        if isinstance(meta.get("eval_concs"), list):
-            if not isinstance(eval_concs, list):
+
+        eval_exit_code = meta.get("eval_exit_code")
+        if (
+            not isinstance(eval_exit_code, int)
+            or isinstance(eval_exit_code, bool)
+            or eval_exit_code != 0
+        ):
+            errors.append(
+                f"raw eval artifact {artifact_dir.name!r} records failed "
+                f"eval_exit_code {eval_exit_code!r}"
+            )
+            continue
+
+        metadata_concs = meta.get("eval_concs")
+        if "eval_concs" in meta:
+            completed_concs = meta.get("completed_eval_concs")
+            failed_concs = meta.get("failed_eval_concs")
+            if not (
+                valid_concurrency_list(metadata_concs, allow_empty=False)
+                and valid_concurrency_list(completed_concs)
+                and valid_concurrency_list(failed_concs)
+            ):
                 errors.append(
                     f"raw eval artifact {artifact_dir.name!r} has invalid "
                     "batched concurrency metadata"
                 )
                 continue
+
+            metadata_set = set(metadata_concs)
+            completed_set = set(completed_concs)
+            failed_set = set(failed_concs)
+            if failed_set or completed_set != metadata_set:
+                errors.append(
+                    f"raw eval artifact {artifact_dir.name!r} has incomplete "
+                    "batched eval results"
+                )
+                continue
+            manifest_errors = validate_batch_manifest(
+                str(meta_path),
+                [
+                    str(result_path)
+                    for result_path in artifact_dir.glob("results*.json")
+                ],
+                expected_concs=metadata_concs,
+            )
+            if manifest_errors:
+                errors.extend(
+                    f"raw eval artifact {artifact_dir.name!r}: {error}"
+                    for error in manifest_errors
+                )
+                continue
             rows.extend(
                 eval_key({**meta, "conc": eval_conc})
-                for eval_conc in eval_concs
+                for eval_conc in completed_concs
             )
         else:
+            conc = meta.get("conc")
+            if (
+                not isinstance(conc, int)
+                or isinstance(conc, bool)
+                or conc <= 0
+            ):
+                errors.append(
+                    f"raw eval artifact {artifact_dir.name!r} has invalid "
+                    f"concurrency {conc!r}"
+                )
+                continue
+            manifest_errors = validate_batch_manifest(
+                str(meta_path),
+                [
+                    str(result_path)
+                    for result_path in artifact_dir.glob("results*.json")
+                ],
+                expected_concs=[conc],
+            )
+            if manifest_errors:
+                errors.extend(
+                    f"raw eval artifact {artifact_dir.name!r}: {error}"
+                    for error in manifest_errors
+                )
+                continue
             rows.append(eval_key(meta))
     return rows, errors
 
@@ -597,6 +726,9 @@ def validate_eval_artifacts(
                         for row in data
                         if isinstance(row, dict)
                     )
+                    for row in data:
+                        if isinstance(row, dict):
+                            errors.extend(validate_eval_score(path, row))
             if row_count == 0:
                 errors.append("eval_results_all contains no rows")
             errors.extend(