diff --git a/baselines/InternVLA-A1/README.md b/baselines/InternVLA-A1/README.md index b851f49..957302b 100644 --- a/baselines/InternVLA-A1/README.md +++ b/baselines/InternVLA-A1/README.md @@ -27,6 +27,13 @@ Then update the checkpoint path used by `eval_pjsim.sh` (or pass it via script a ## 3) Run evaluation +Please install the genmanip-client SDK into the InternVLA-A1 environment. +``` +# install the EBench evaluation client +git clone https://github.com/InternRobotics/genmanip-client.git +cd genmanip-client && pip install -e .[full_numpy2] && cd - +``` + From this directory, run: ```bash @@ -35,7 +42,60 @@ bash eval_pjsim.sh The script will launch `inference.py` and start evaluation. +## 4) Quick start: single worker + +```bash +python inference.py \ + --ckpt_path /path/to/checkpoints/EBench-Generalist-InternVLA-A1 \ + --url "$BASE_URL" \ + --run_id "$RUN_ID" \ + --token "$TOKEN" \ + --worker_ids 0 +``` + +`--url` should point to the remote EBench evaluation server. If `--url` +is empty, `inference.py` falls back to `http://{host}:{port}`. + +## 5) Multi-worker / multi-host: `run_internvla_eval.sh` + +`scripts/run_internvla_eval.sh` launches one `inference.py` process per +worker id, writes each worker's stdout/stderr to a separate log file, and +forwards SIGINT/SIGTERM to all children for clean shutdown. + +```bash +CKPT_PATH=/path/to/checkpoints/EBench-Generalist-InternVLA-A1 \ +BASE_URL=https://your-ebench-server.example.com \ +RUN_ID=my_run_2026_04_29 \ +TOKEN=$EBENCH_TOKEN \ +WORKER_IDS=0,1,2,3 \ +GPU_IDS=0,1,2,3 \ +LOG_DIR=log_dir/internvla-a1-generalist \ +bash /path/to/EBench/scripts/run_internvla_eval.sh +``` + +## 6) CLI reference (`inference.py`) + +| Flag | Default | Description | +| ---- | ------- | ----------- | +| `--ckpt_path` | _req._ | Local InternVLA-A1 checkpoint directory. Must contain the policy config, weights, and `stats.json`. 
| +| `--stats_key` | `lift2` | Key inside `stats.json` used for state/action normalization statistics. | +| `--resize_size` | `224` | Image resize target used by `ResizeImagesWithPadFn`. | +| `--image_history_interval` | `15` | Interval used to pick the historical frame paired with the current frame. | +| `--action_mode` | `delta` | `delta`: add predicted joint deltas to the current joint state. `abs`: use predicted actions directly. | +| `--dtype` | `float32` | Policy inference dtype. Supported by the script: `float32` or `bfloat16`. | +| `--decode_image` | `False` | Passed through to `policy.predict_action_chunk(..., decode_image=...)`. Usually leave disabled for evaluation. | +| `--infer_horizon` | `50` | Number of predicted action steps kept from the model output before final truncation. | +| `--action_horizon_size` | `30` | Number of action steps deployed from each planned chunk. | +| `--worker_ids` | `0` | Comma-separated worker ids handled by this process, e.g. `0,1,2`. For multi-process launch, use one id per process. | +| `--url` | `""` | Full EBench evaluation server URL. Overrides `--host` and `--port` when set. | +| `--host` | `0.0.0.0` | Host used to build `http://{host}:{port}` when `--url` is not set. | +| `--port` | `8087` | Port used to build `http://{host}:{port}` when `--url` is not set. | +| `--run_id` | `""` | Identifier of the evaluation run; shared across all workers and hosts for the same run. | +| `--token` | `""` | Auth token issued by the EBench server. | + + ## Notes - Ensure your environment variables in `eval_pjsim.sh` are set correctly (for example `HF_HOME`). +- The first run may require downloading checkpoints and tokenizer files from Hugging Face. - If you are using offline mode, keep `HF_HUB_OFFLINE=1` and `TRANSFORMERS_OFFLINE=1` only after required files are fully downloaded. 
diff --git a/baselines/InternVLA-A1/eval_pjsim.sh b/baselines/InternVLA-A1/eval_pjsim.sh index cb94c3a..d64b837 100644 --- a/baselines/InternVLA-A1/eval_pjsim.sh +++ b/baselines/InternVLA-A1/eval_pjsim.sh @@ -1,7 +1,7 @@ export LD_LIBRARY_PATH="/opt/libjpeg-turbo/lib64:/opt/libjpeg-turbo/lib:${LD_LIBRARY_PATH-}" -export GENMANIP_RESULT_DIR=./evaluation/genmanip -export HF_HOME=/your/hf_home -export HF_HUB_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 +export GENMANIP_RESULT_DIR=./client_results +export HF_HOME=./hf_home +export HF_HUB_OFFLINE=0 +export TRANSFORMERS_OFFLINE=0 -python inference.py --ckpt_path ./checkpoints/EBench-Generalist-InternVLA-A1 --worker_ids 0 \ No newline at end of file +python inference.py --ckpt_path your/path/to/checkpoints/EBench-Generalist-InternVLA-A1 --worker_ids 0 \ No newline at end of file diff --git a/baselines/InternVLA-A1/inference.py b/baselines/InternVLA-A1/inference.py index aedb5da..c2fc21f 100644 --- a/baselines/InternVLA-A1/inference.py +++ b/baselines/InternVLA-A1/inference.py @@ -78,6 +78,9 @@ class InferenceArgs: worker_ids: str = "0" # Comma-separated worker IDs, e.g. "0,1,2" host: str = "0.0.0.0" port: int = 8087 + url: str = "" # Optional custom URL for EvalClient connection, overrides host and port if provided. 
+ token: str = "" + run_id: str = "" class QwenA1PolicyWrapper: @@ -124,9 +127,9 @@ def get_action(self, obs: dict) -> dict: predicted_rel_base = action[16:] target_base_abs = self.chunk_start_base + predicted_rel_base base_motion = target_base_abs - curr_base - output["action"] = joints_gripper + output["action"] = joints_gripper.tolist() output['is_rel'] = False - output["base_motion"] = base_motion + output["base_motion"] = base_motion.tolist() output['base_is_rel'] = True output["control_type"] = "joint_position" return output @@ -143,9 +146,9 @@ def get_action_chunk(self, obs: dict) -> list[dict]: joints_gripper = self._pack_action_fields(action) predicted_rel_base = action[16:] target_base_abs = curr_base + predicted_rel_base - action_step["action"] = joints_gripper + action_step["action"] = joints_gripper.tolist() action_step['is_rel'] = False - action_step["base_motion"] = target_base_abs + action_step["base_motion"] = target_base_abs.tolist() action_step['base_is_rel'] = False action_step["control_type"] = "joint_position" action_chunk.append(action_step) @@ -200,7 +203,14 @@ def _plan_action(self, obs: dict) -> None: image = sample[key].permute(0, 3, 1, 2) sample[key] = image - sample = self.input_transforms(sample) + for transform in self.input_transforms.transforms: + if isinstance(transform, Qwen3_VLProcessorTransformFn): + sample.update({ + f"{OBS_IMAGES}.image0_mask": torch.tensor([True]).cuda(), + f"{OBS_IMAGES}.image1_mask": torch.tensor([True]).cuda(), + f"{OBS_IMAGES}.image2_mask": torch.tensor([True]).cuda(), + }) + sample = transform(sample) inputs = {} for key in sample.keys(): @@ -247,9 +257,15 @@ def _plan_action(self, obs: dict) -> None: if not worker_ids: raise ValueError("`worker_ids` must contain at least one valid worker id.") - base_url = f"http://{args.host}:{args.port}" + if args.url: + base_url = args.url + else: + base_url = f"http://{args.host}:{args.port}" + client = EvalClient( - base_url, + base_url=base_url, + 
token=args.token, + run_id=args.run_id, worker_ids=worker_ids ) policy_list = [QwenA1PolicyWrapper(args) for _ in worker_ids] @@ -263,7 +279,20 @@ def _plan_action(self, obs: dict) -> None: worker_id: policy.get_action(obs[worker_id]) for worker_id, policy in zip(worker_ids, policy_list) } - obs, done = client.step(action) + try: + obs, done = client.step(action) + except Exception as exc: + print(f"[warn] EvalClient step failed: {exc}", flush=True) + client.close() + client = EvalClient( + base_url=base_url, + token=args.token, + run_id=args.run_id, + worker_ids=worker_ids + ) + obs = client.reset() + policy.reset() + break if done: break diff --git a/baselines/X-VLA/README.md b/baselines/X-VLA/README.md index 968a551..9a17695 100644 --- a/baselines/X-VLA/README.md +++ b/baselines/X-VLA/README.md @@ -39,7 +39,7 @@ pip install -r EBench/baselines/X-VLA/requirements.txt # install the EBench evaluation client git clone https://github.com/InternRobotics/genmanip-client.git -cd genmanip-client && pip install -e . && cd - +cd genmanip-client && pip install -e .[full_numpy1] && cd - ``` > `transformers<=4.51.3` is pinned because X-VLA's custom modeling code diff --git a/baselines/openpi/README.md b/baselines/openpi/README.md index 5be6479..32bee4e 100644 --- a/baselines/openpi/README.md +++ b/baselines/openpi/README.md @@ -6,13 +6,19 @@ This guide describes the minimal workflow for evaluating the post-trained OpenPI ### 1. Install OpenPI -Install the official OpenPI repository located at: +Install the official OpenPI repository located at: `baselines/openpi/third_party/openpi`. +Please refer to the official OpenPI README for detailed setup instructions. -```bash -baselines/openpi/third_party/openpi +i.e. ``` +cd baselines/openpi/third_party/openpi +GIT_LFS_SKIP_SMUDGE=1 uv sync +GIT_LFS_SKIP_SMUDGE=1 uv pip install -e . -Please refer to the official OpenPI README for detailed setup instructions. 
+# install genmanip client for ebench evaluation +cd path/to/genmanip-client +uv pip install -e .[full_numpy1] +``` ### 2. Add EBench-Specific Files @@ -25,17 +31,8 @@ baselines/openpi/scripts/ Layer these files on top of the official OpenPI codebase. This can be done by either updating `PYTHONPATH` or copying the files to the corresponding locations in the OpenPI repository. -### 3. Configure Evaluation Settings - -Before running evaluation, modify the configuration file: - -```bash -scripts/launch_pi_onlineeval.sh -``` -Please make sure that the model path, dataset path, environment settings, and output directory are correctly specified. - -### 4. Download EBench Post-Trained Models +### 3. Download EBench Post-Trained Models The post-trained OpenPI models on EBench are available at: @@ -54,4 +51,6 @@ Launch the evaluation with: ```bash bash scripts/launch_pi_onlineeval.sh -``` \ No newline at end of file +``` + +Please make sure that the model path, dataset path, environment settings, and output directory are correctly specified. 
\ No newline at end of file diff --git a/baselines/openpi/scripts/pi_eval_client_online.py b/baselines/openpi/scripts/pi_eval_client_online.py index 7366fe0..04f524f 100644 --- a/baselines/openpi/scripts/pi_eval_client_online.py +++ b/baselines/openpi/scripts/pi_eval_client_online.py @@ -110,13 +110,13 @@ def parse_args(): obs = eval_client.reset() eval_finished = False while not eval_finished: - - if obs[ids]["obs"]["reset"]: - pass action_chunk = pi0_client.get_action(obs) - obs, eval_finished = eval_client.step(action_chunk) + try: + obs, eval_finished = eval_client.step(action_chunk) + except Exception as e: + eval_client.close() + eval_client = EvalClient(base_url=base_url, worker_ids=worker_ids, run_id=run_id, token=token) + obs = eval_client.reset() finally: eval_client.close() - - \ No newline at end of file diff --git a/scripts/run_xvla_eval.sh b/scripts/run_xvla_eval.sh index 9de8157..a753f90 100755 --- a/scripts/run_xvla_eval.sh +++ b/scripts/run_xvla_eval.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Usage: MODEL_PATH=... BASE_URL=... RUN_ID=... TOKEN=... \ -# WORKER_IDS=0,1,2,3 ./EBench/scripts/run_xvla_eval.sh +# WORKER_IDS=0,1,2,3 GPU_IDS=0,1,2,3 ./EBench/scripts/run_xvla_eval.sh # Multi-host: share RUN_ID across hosts, give each host a disjoint WORKER_IDS slice. 
set -uo pipefail @@ -11,6 +11,9 @@ STEP_MODE="${STEP_MODE:-step}" mkdir -p "$LOG_DIR" echo "[run_xvla_eval] logs -> $LOG_DIR" echo "[run_xvla_eval] step_mode=$STEP_MODE" +if [[ -n "${GPU_IDS:-}" ]]; then + echo "[run_xvla_eval] gpu_ids=$GPU_IDS" +fi pids=() @@ -45,11 +48,20 @@ cleanup() { trap 'cleanup 130' INT TERM trap 'cleanup "$?"' EXIT +worker_ids=(${WORKER_IDS//,/ }) +gpu_ids=(${GPU_IDS:-}) +if [[ -n "${GPU_IDS:-}" ]]; then + gpu_ids=(${GPU_IDS//,/ }) +fi - -for wid in ${WORKER_IDS//,/ }; do +for i in "${!worker_ids[@]}"; do + wid="${worker_ids[$i]}" log="$LOG_DIR/worker_${wid}.log" - python -u "$RUN_PY" \ + env_args=() + if (( ${#gpu_ids[@]} )); then + env_args=("CUDA_VISIBLE_DEVICES=${gpu_ids[$((i % ${#gpu_ids[@]}))]}") + fi + env "${env_args[@]}" python -u "$RUN_PY" \ --model_path "$MODEL_PATH" \ --base_url "$BASE_URL" \ --run_id "$RUN_ID" \ @@ -58,7 +70,11 @@ for wid in ${WORKER_IDS//,/ }; do --step_mode "$STEP_MODE" \ > "$log" 2>&1 & pids+=($!) - echo "[run_xvla_eval] worker=$wid pid=$! log=$log" + if (( ${#gpu_ids[@]} )); then + echo "[run_xvla_eval] worker=$wid gpu=${gpu_ids[$((i % ${#gpu_ids[@]}))]} pid=$! log=$log" + else + echo "[run_xvla_eval] worker=$wid pid=$! log=$log" + fi done rc=0