From 777ac0d74f15345b59b23cbc113d9831d714a68e Mon Sep 17 00:00:00 2001 From: jzh26 <226629529+jzh26@users.noreply.github.com> Date: Thu, 11 Jun 2026 08:48:20 +0000 Subject: [PATCH] Fix conversation loading logic in UltraChat dataset Previously, only the first user prompt was extracted from each example, discarding all subsequent turns. UltraChat stores full multi-turn conversations in the "messages" field, so switching to that field preserves the complete dialogue rather than truncating to a single user message. Signed-off-by: jzh26 <226629529+jzh26@users.noreply.github.com> --- examples/dataset/make_dataset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/dataset/make_dataset.py b/examples/dataset/make_dataset.py index 3cf1b98095d..dc4b9d91f95 100644 --- a/examples/dataset/make_dataset.py +++ b/examples/dataset/make_dataset.py @@ -276,14 +276,14 @@ async def _load_ultrachat_conversations( ds = ds.shuffle(seed=42) yield len(ds) for i in range(len(ds)): - prompt = ds[i]["prompt"].strip() prompt_id = ds[i]["prompt_id"].strip() - if prompt: - msgs = [{"role": "user", "content": prompt}] - if not prompt_id: - prompt_id = id_for_conversation(msgs) - prompt_id = f"ultrachat-{split_name}-{prompt_id}" - yield {"conversation_id": prompt_id, "conversations": msgs} + msgs = ds[i]["messages"] + if not msgs: + continue + if not prompt_id: + prompt_id = id_for_conversation(msgs) + prompt_id = f"ultrachat-{split_name}-{prompt_id}" + yield {"conversation_id": prompt_id, "conversations": msgs} logger.info(f"Finished loading UltraChat {split_name} conversations.")