From 2d590a7eb840f3e5edc43c85d6ae88752d9223d1 Mon Sep 17 00:00:00 2001
From: Dipannita Shaw
Date: Wed, 11 Mar 2026 16:26:16 -0700
Subject: [PATCH] Update user docs to drop config file path

---
 PREFLIGHT.md | 8 ++++----
 docs/guides/checkpointing_solutions/convert_checkpoint.md | 4 ++--
 docs/run_maxtext/run_maxtext_localhost.md | 8 ++++----
 docs/run_maxtext/run_maxtext_single_host_gpu.md | 2 +-
 docs/run_maxtext/run_maxtext_via_multihost_job.md | 2 +-
 docs/run_maxtext/run_maxtext_via_multihost_runner.md | 2 +-
 docs/run_maxtext/run_maxtext_via_pathways.md | 4 ++--
 docs/run_maxtext/run_maxtext_via_xpk.md | 4 ++--
 docs/tutorials/first_run.md | 8 ++++----
 docs/tutorials/posttraining/full_finetuning.md | 1 -
 docs/tutorials/posttraining/knowledge_distillation.md | 6 +++---
 docs/tutorials/posttraining/multimodal.md | 2 +-
 docs/tutorials/posttraining/rl.md | 4 ++--
 docs/tutorials/posttraining/rl_on_multi_host.md | 4 ++--
 docs/tutorials/posttraining/sft.md | 2 +-
 docs/tutorials/posttraining/sft_on_multi_host.md | 4 ++--
 docs/tutorials/pretraining.md | 6 +++---
 .../experimental/agent/ckpt_conversion_agent/README.md | 2 +-
 src/maxtext/inference/mlperf/README.md | 2 +-
 19 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/PREFLIGHT.md b/PREFLIGHT.md
index 0ec1875eec..495e8d87fa 100644
--- a/PREFLIGHT.md
+++ b/PREFLIGHT.md
@@ -7,12 +7,12 @@ Before you run ML workload on Multihost with GCE or GKE, simply apply `bash pref
 Here is an example for GCE:
 ```
-bash preflight.sh PLATFORM=GCE && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?}
+bash preflight.sh PLATFORM=GCE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
 ```
 Here is an example for GKE:
 ```
-bash preflight.sh PLATFORM=GKE && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?}
+bash preflight.sh PLATFORM=GKE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
 ```
 # Optimization 2: Numa binding (You can only apply this to v4 and v5p)
@@ -22,14 +22,14 @@
 For GCE, [preflight.sh](https://github.com/google/maxtext/blob/main/preflight.sh) will help you install `numactl` dependency, so you can use it directly, here is an example:
 ```
-bash preflight.sh PLATFORM=GCE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?}
+bash preflight.sh PLATFORM=GCE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
 ```
 For GKE, `numactl` should be built into your docker image from [maxtext_tpu_dependencies.Dockerfile](https://github.com/google/maxtext/blob/main/src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile), so you can use it directly if you built the maxtext docker image. Here is an example
 ```
-bash preflight.sh PLATFORM=GKE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?}
+bash preflight.sh PLATFORM=GKE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
 ```
 1. `numactl`: This is the command-line tool used for controlling NUMA policy for processes or shared memory. It's particularly useful on multi-socket systems where memory locality can impact performance.
diff --git a/docs/guides/checkpointing_solutions/convert_checkpoint.md b/docs/guides/checkpointing_solutions/convert_checkpoint.md
index 6c606fb813..f8be6194e3 100644
--- a/docs/guides/checkpointing_solutions/convert_checkpoint.md
+++ b/docs/guides/checkpointing_solutions/convert_checkpoint.md
@@ -70,7 +70,7 @@ Finally, run below command to complete the conversion
 # Optional: If run out of disk space when downloading HuggingFace safetensors,
 # customize your "HF_HOME" to redirect the cache to a larger or mounted disk (e.g., on a TPU VM).
 # export HF_HOME="/dev/shm/huggingface_tmp"
-python3 -m maxtext.checkpoint_conversion.to_maxtext maxtext/configs/base.yml \
+python3 -m maxtext.checkpoint_conversion.to_maxtext \
 model_name=${MODEL_NAME?} \
 hf_access_token=${HF_TOKEN?} \
 base_output_directory=${MODEL_CHECKPOINT_DIRECTORY?} \
@@ -108,7 +108,7 @@ Use the `to_huggingface.py` script to convert a MaxText checkpoint into the Hugg
 The following command converts a MaxText checkpoint and saves it locally, to GCS, or uploads it directly to the Hugging Face Hub.
 ```bash
-python3 -m maxtext.checkpoint_conversion.to_huggingface src/maxtext/configs/base.yml \
+python3 -m maxtext.checkpoint_conversion.to_huggingface \
 model_name= \
 load_parameters_path= \
 base_output_directory= \
diff --git a/docs/run_maxtext/run_maxtext_localhost.md b/docs/run_maxtext/run_maxtext_localhost.md
index 5f7b428f16..4695d5237f 100644
--- a/docs/run_maxtext/run_maxtext_localhost.md
+++ b/docs/run_maxtext/run_maxtext_localhost.md
@@ -58,7 +58,7 @@ bash tools/setup/setup.sh DEVICE={tpu|gpu}
 After the installation is complete, run a short training job using synthetic data to confirm everything is working correctly. This command trains a model for just 10 steps. Remember to replace `$YOUR_JOB_NAME` with a unique name for your run and `gs://` with the path to the GCS bucket you configured in the prerequisites.
 ```bash
-python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 run_name=${YOUR_JOB_NAME?} \
 base_output_directory=gs:// \
 dataset_type=synthetic \
@@ -72,7 +72,7 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
 To demonstrate model output, run the following command:
 ```bash
-python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
+python3 -m maxtext.inference.decode \
 run_name=${YOUR_JOB_NAME?} \
 base_output_directory=gs:// \
 per_device_batch_size=1
@@ -92,7 +92,7 @@ To use a pre-configured model for TPUs, you override the `model_name` parameter,
 llama3-8b (TPU)
 ```bash
-python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 model_name=llama3-8b \
 run_name=${YOUR_JOB_NAME?} \
 base_output_directory=gs:// \
@@ -106,7 +106,7 @@ python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \
 qwen3-4b (TPU)
 ```bash
-python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 model_name=qwen3-4b \
 run_name=${YOUR_JOB_NAME?} \
 base_output_directory=gs:// \
diff --git a/docs/run_maxtext/run_maxtext_single_host_gpu.md b/docs/run_maxtext/run_maxtext_single_host_gpu.md
index 51d0c36b1b..94204cd428 100644
--- a/docs/run_maxtext/run_maxtext_single_host_gpu.md
+++ b/docs/run_maxtext/run_maxtext_single_host_gpu.md
@@ -148,7 +148,7 @@ Hardware: GPU
 ```
 ```bash
-python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=gpu01 base_output_directory=/deps/output \
+python3 -m maxtext.trainers.pre_train.train run_name=gpu01 base_output_directory=/deps/output \
 dataset_type=synthetic enable_checkpointing=True steps=10 attention=cudnn_flash_te scan_layers=False \
 use_iota_embed=True hardware=gpu per_device_batch_size=12
 ```
diff --git a/docs/run_maxtext/run_maxtext_via_multihost_job.md b/docs/run_maxtext/run_maxtext_via_multihost_job.md
index a364a25365..74e8858a71 100644
--- a/docs/run_maxtext/run_maxtext_via_multihost_job.md
+++ b/docs/run_maxtext/run_maxtext_via_multihost_job.md
@@ -68,7 +68,7 @@ The `multihost_job.py` script:
 ```sh
 RUN_NAME=${YOUR_JOB_NAME?} # You may set this to any unique name for a fresh run.
- python3 multihost_job.py --NUM_SLICES=${NODE_COUNT?} --RUN_NAME=${RUN_NAME?} --BUCKET_NAME=${BUCKET_NAME?} --CQR_EXTRA_ARGS="--reserved" --COMMAND="bash tools/setup/setup.sh && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${RUN_NAME?}"
+ python3 multihost_job.py --NUM_SLICES=${NODE_COUNT?} --RUN_NAME=${RUN_NAME?} --BUCKET_NAME=${BUCKET_NAME?} --CQR_EXTRA_ARGS="--reserved" --COMMAND="bash tools/setup/setup.sh && python3 -m maxtext.trainers.pre_train.train run_name=${RUN_NAME?}"
 ```
 We tell `multihost_job` to target the `reserved` pool by by including `--reserved` as extra arguments to the CQR request, but you may instead target the `on-demand` pool by removing the `--CQR_EXTRA_ARGS` flag (on-demand is default), or the pre-emptible pool with `--CQR_EXTRA_ARGS="--best-effort"`, which may be necessary if your reservation is full.
diff --git a/docs/run_maxtext/run_maxtext_via_multihost_runner.md b/docs/run_maxtext/run_maxtext_via_multihost_runner.md
index 13688a8065..99a31a1f29 100644
--- a/docs/run_maxtext/run_maxtext_via_multihost_runner.md
+++ b/docs/run_maxtext/run_maxtext_via_multihost_runner.md
@@ -106,7 +106,7 @@ Although there are several steps below, most are for the initial setup. Once set
 Set config values for `base_output_directory` and `dataset_path` in `configs/base.yml` if not set already.
 ```
- python3 multihost_runner.py --TPU_PREFIX=${TPU_PREFIX?} --COMMAND="python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${RUN_NAME?}"
+ python3 multihost_runner.py --TPU_PREFIX=${TPU_PREFIX?} --COMMAND="python3 -m maxtext.trainers.pre_train.train run_name=${RUN_NAME?}"
 ```
 If you are running the `multihost_runner.py` script from a TPUVM, you will need to set `--INTERNAL_IP=true`.
diff --git a/docs/run_maxtext/run_maxtext_via_pathways.md b/docs/run_maxtext/run_maxtext_via_pathways.md
index 5166d47d44..9e954e5c8e 100644
--- a/docs/run_maxtext/run_maxtext_via_pathways.md
+++ b/docs/run_maxtext/run_maxtext_via_pathways.md
@@ -96,7 +96,7 @@ xpk workload create-pathways \
 --project=${PROJECT?} \
 --zone=${ZONE?} \
 --docker-image=${DOCKER_IMAGE?} \
- --command="python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
+ --command="python3 -m maxtext.trainers.pre_train.train \
 base_output_directory=gs://${BUCKET_NAME?} \
 per_device_batch_size=1 \
 enable_checkpointing=false \
@@ -154,7 +154,7 @@ export JAX_PLATFORMS=proxy
 export JAX_BACKEND_TARGET=grpc://127.0.0.1:29000
 # Run the training script
-python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 base_output_directory=gs://${BUCKET_NAME?} \
 per_device_batch_size=1 \
 enable_checkpointing=false \
diff --git a/docs/run_maxtext/run_maxtext_via_xpk.md b/docs/run_maxtext/run_maxtext_via_xpk.md
index 5493add8eb..8d142ef9dc 100644
--- a/docs/run_maxtext/run_maxtext_via_xpk.md
+++ b/docs/run_maxtext/run_maxtext_via_xpk.md
@@ -187,7 +187,7 @@ For instance, to run a job across **four TPU slices**, you would change `--num-s
 --base-docker-image maxtext_base_image\
 --tpu-type v5litepod-256\
 --num-slices 1\
- --command "python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${USER}-tpu-job base_output_directory=${BASE_OUTPUT_DIR?} dataset_path=${DATASET_PATH?} steps=100"
+ --command "python3 -m maxtext.trainers.pre_train.train run_name=${USER}-tpu-job base_output_directory=${BASE_OUTPUT_DIR?} dataset_path=${DATASET_PATH?} steps=100"
 ```
 - **On your GPU cluster:**
@@ -199,7 +199,7 @@ For instance, to run a job across **four TPU slices**, you would change `--num-s
 --base-docker-image maxtext_base_image\
 --device-type h100-80gb-8\
 --num-nodes 2\
- --command "python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${USER}-gpu-job base_output_directory=${BASE_OUTPUT_DIR?} dataset_path=${DATASET_PATH?} steps=100"
+ --command "python3 -m maxtext.trainers.pre_train.train run_name=${USER}-gpu-job base_output_directory=${BASE_OUTPUT_DIR?} dataset_path=${DATASET_PATH?} steps=100"
 ```
 ______________________________________________________________________
diff --git a/docs/tutorials/first_run.md b/docs/tutorials/first_run.md
index 776b408c91..ae0bae76d2 100644
--- a/docs/tutorials/first_run.md
+++ b/docs/tutorials/first_run.md
@@ -49,7 +49,7 @@ pre-commit install
 4. After installation completes, run training on synthetic data with the following command:
 ```sh
-python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 run_name=${YOUR_JOB_NAME?} \
 base_output_directory=gs:// \
 dataset_type=synthetic \
@@ -61,7 +61,7 @@ Optional: If you want to try training on a Hugging Face dataset, see [Data Input
 5. To demonstrate model output, run the following command:
 ```sh
-python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
+python3 -m maxtext.inference.decode \
 run_name=${YOUR_JOB_NAME?} \
 base_output_directory=gs:// \
 per_device_batch_size=1
@@ -83,7 +83,7 @@ You can use [demo_decoding.ipynb](https://github.com/AI-Hypercomputer/maxtext/bl
 2. After installation is complete, run training with the following command on synthetic data:
 ```sh
-python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 run_name=${YOUR_JOB_NAME?} \
 base_output_directory=gs:// \
 dataset_type=synthetic \
@@ -93,7 +93,7 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
 3. To demonstrate model output, run the following command:
 ```sh
-python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
+python3 -m maxtext.inference.decode \
 run_name=${YOUR_JOB_NAME?} \
 base_output_directory=gs:// \
 per_device_batch_size=1
diff --git a/docs/tutorials/posttraining/full_finetuning.md b/docs/tutorials/posttraining/full_finetuning.md
index 9455505402..45f6e9eb5b 100644
--- a/docs/tutorials/posttraining/full_finetuning.md
+++ b/docs/tutorials/posttraining/full_finetuning.md
@@ -101,7 +101,6 @@ Below is a sample training script.
 ```sh
 python3 -m maxtext.trainers.pre_train.train \
- src/maxtext/configs/base.yml \
 run_name=${RUN_NAME?} \
 base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
 load_parameters_path=${MODEL_CKPT_PATH?} \
diff --git a/docs/tutorials/posttraining/knowledge_distillation.md b/docs/tutorials/posttraining/knowledge_distillation.md
index 849eb74f84..8a068c66c7 100644
--- a/docs/tutorials/posttraining/knowledge_distillation.md
+++ b/docs/tutorials/posttraining/knowledge_distillation.md
@@ -132,7 +132,7 @@ python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
 export PRE_TRAINED_MODEL_CKPT_DIRECTORY=${BASE_DIRECTORY?}/llama3.1-8b-ckpt
 # Convert to MaxText format
-python3 -m maxtext.checkpoint_conversion.to_maxtext src/maxtext/configs/base.yml \
+python3 -m maxtext.checkpoint_conversion.to_maxtext \
 model_name=llama3.1-8b \
 hf_access_token=${HF_TOKEN?} \
 base_output_directory=${PRE_TRAINED_MODEL_CKPT_DIRECTORY?} \
@@ -170,7 +170,7 @@ You can now fine-tune your smaller student model using supervised fine-tuning te
 Example command to run fine-tuning on a TPU v6e-8:
 ```bash
-python3 -m maxtext.trainers.post_train.sft.train_sft_deprecated src/maxtext/configs/post_train/sft.yml \
+python3 -m maxtext.trainers.post_train.sft.train_sft_deprecated \
 run_name=${RUN_NAME?} \
 base_output_directory=${BASE_DIRECTORY?}/distillation/qwen3-32b-distill-llama3.1-8b \
 tokenizer_path=meta-llama/Llama-3.1-8B-Instruct tokenizer_type=huggingface \
@@ -209,7 +209,7 @@ largest_dir="${sorted_dirs[-1]}"
 FINE_TUNED_MODEL_CKPT_PATH=${CHECKPOINTS_PATH?}/${largest_dir}/model_params
 # Fine-tune student model on original dataset
-python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml \
+python3 -m maxtext.trainers.post_train.sft.train_sft \
 run_name=${RUN_NAME?}_stage2 \
 base_output_directory=${BASE_DIRECTORY?}/distillation/qwen3-32b-distill-llama3.1-8b \
 tokenizer_path=meta-llama/Llama-3.1-8B-Instruct tokenizer_type=huggingface \
diff --git a/docs/tutorials/posttraining/multimodal.md b/docs/tutorials/posttraining/multimodal.md
index df658b88d2..0e867f55b5 100644
--- a/docs/tutorials/posttraining/multimodal.md
+++ b/docs/tutorials/posttraining/multimodal.md
@@ -38,7 +38,7 @@ Then use this command to convert an unscanned checkpoint from HuggingFace to Max
 ```shell
 export HF_ACCESS_TOKEN=hf_...
 export MAXTEXT_CKPT_GCS_PATH=gs://...
-python -m maxtext.checkpoint_conversion.to_maxtext maxtext/configs/base.yml \
+python -m maxtext.checkpoint_conversion.to_maxtext \
 model_name=gemma3-4b \
 hf_access_token=${HF_ACCESS_TOKEN?} \
 base_output_directory=${MAXTEXT_CKPT_GCS_PATH?} \
diff --git a/docs/tutorials/posttraining/rl.md b/docs/tutorials/posttraining/rl.md
index 7f2c366c26..ce95543915 100644
--- a/docs/tutorials/posttraining/rl.md
+++ b/docs/tutorials/posttraining/rl.md
@@ -133,7 +133,7 @@ export MAXTEXT_CKPT_PATH= # e.g., gs://my-bucke
 Run the following command for GRPO:
 ```
-python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
+python3 -m maxtext.trainers.post_train.rl.train_rl \
 model_name=${MODEL?} \
 tokenizer_path=${TOKENIZER?} \
 load_parameters_path=${MAXTEXT_CKPT_PATH?} \
@@ -157,7 +157,7 @@ The overview of what this run will do is as follows:
 Run the following command for GSPO:
 ```
-python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
+python3 -m maxtext.trainers.post_train.rl.train_rl \
 model_name=${MODEL?} \
 tokenizer_path=${TOKENIZER?} \
 load_parameters_path=${MAXTEXT_CKPT_PATH?} \
diff --git a/docs/tutorials/posttraining/rl_on_multi_host.md b/docs/tutorials/posttraining/rl_on_multi_host.md
index e90dba517e..52e302e4ec 100644
--- a/docs/tutorials/posttraining/rl_on_multi_host.md
+++ b/docs/tutorials/posttraining/rl_on_multi_host.md
@@ -196,7 +196,7 @@ xpk workload create-pathways --workload ${WORKLOAD?} \
 --tpu-type=${TPU_TYPE?} --num-slices=1 \
 --project=${PROJECT_ID?} --priority=high \
 --command "HF_TOKEN=${HF_TOKEN?} TF_CPP_MIN_LOG_LEVEL=0 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \
-python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
+python3 -m maxtext.trainers.post_train.rl.train_rl \
 model_name=${MODEL?} \
 tokenizer_path=${TOKENIZER?} \
 load_parameters_path=${MAXTEXT_CKPT_PATH?} \
@@ -213,7 +213,7 @@ xpk workload create-pathways --workload ${WORKLOAD?} \
 --tpu-type=${TPU_TYPE?} --num-slices=1 \
 --project=${PROJECT_ID?} --priority=high \
 --command "HF_TOKEN=${HF_TOKEN?} TF_CPP_MIN_LOG_LEVEL=0 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \
-python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
+python3 -m maxtext.trainers.post_train.rl.train_rl \
 model_name=${MODEL?} \
 tokenizer_path=${TOKENIZER?} \
 load_parameters_path=${MAXTEXT_CKPT_PATH?} \
diff --git a/docs/tutorials/posttraining/sft.md b/docs/tutorials/posttraining/sft.md
index cb3ff85baf..27e4efd28b 100644
--- a/docs/tutorials/posttraining/sft.md
+++ b/docs/tutorials/posttraining/sft.md
@@ -88,7 +88,7 @@ export PRE_TRAINED_MODEL_CKPT_PATH= # e.g., gs:
 Now you are ready to run SFT using the following command:
 ```sh
-python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml \
+python3 -m maxtext.trainers.post_train.sft.train_sft \
 run_name=${RUN_NAME?} \
 base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
 model_name=${PRE_TRAINED_MODEL?} \
diff --git a/docs/tutorials/posttraining/sft_on_multi_host.md b/docs/tutorials/posttraining/sft_on_multi_host.md
index 55273d5475..0884fa267c 100644
--- a/docs/tutorials/posttraining/sft_on_multi_host.md
+++ b/docs/tutorials/posttraining/sft_on_multi_host.md
@@ -143,7 +143,7 @@ xpk workload create \
 --workload=${WORKLOAD_NAME?} \
 --tpu-type=${TPU_TYPE?} \
 --num-slices=${TPU_SLICE?} \
---command "python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml run_name=${WORKLOAD_NAME?} base_output_directory=${OUTPUT_PATH?} model_name=${MODEL_NAME?} load_parameters_path=${MODEL_CHECKPOINT_PATH?} hf_access_token=${HF_TOKEN?} tokenizer_path=${TOKENIZER_PATH?} per_device_batch_size=1 steps=${STEPS?} profiler=xplane hf_path=${DATASET_NAME?} train_split=${TRAIN_SPLIT?} train_data_columns=${TRAIN_DATA_COLUMNS?}"
+--command "python3 -m maxtext.trainers.post_train.sft.train_sft run_name=${WORKLOAD_NAME?} base_output_directory=${OUTPUT_PATH?} model_name=${MODEL_NAME?} load_parameters_path=${MODEL_CHECKPOINT_PATH?} hf_access_token=${HF_TOKEN?} tokenizer_path=${TOKENIZER_PATH?} per_device_batch_size=1 steps=${STEPS?} profiler=xplane hf_path=${DATASET_NAME?} train_split=${TRAIN_SPLIT?} train_data_columns=${TRAIN_DATA_COLUMNS?}"
 ```
 Once the fine-tuning is completed, you can access your model checkpoints at `$OUTPUT_PATH/$WORKLOAD_NAME/checkpoints`.
@@ -159,7 +159,7 @@ xpk workload create-pathways \
 --workload=${WORKLOAD_NAME?} \
 --tpu-type=${TPU_TYPE?} \
 --num-slices=${TPU_SLICE?} \
---command="JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE=1 python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml run_name=${WORKLOAD_NAME?} base_output_directory=${OUTPUT_PATH?} model_name=${MODEL_NAME?} load_parameters_path=${MODEL_CHECKPOINT_PATH?} hf_access_token=${HF_TOKEN?} tokenizer_path=${TOKENIZER_PATH?} per_device_batch_size=1 steps=${STEPS?} profiler=xplane checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False enable_single_controller=True"
+--command="JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE=1 python3 -m maxtext.trainers.post_train.sft.train_sft run_name=${WORKLOAD_NAME?} base_output_directory=${OUTPUT_PATH?} model_name=${MODEL_NAME?} load_parameters_path=${MODEL_CHECKPOINT_PATH?} hf_access_token=${HF_TOKEN?} tokenizer_path=${TOKENIZER_PATH?} per_device_batch_size=1 steps=${STEPS?} profiler=xplane checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False enable_single_controller=True"
 ```
 Once the fine-tuning is completed, you can access your model checkpoints at `$OUTPUT_PATH/$WORKLOAD_NAME/checkpoints`.
diff --git a/docs/tutorials/pretraining.md b/docs/tutorials/pretraining.md
index 146d465103..fb9049a5ce 100644
--- a/docs/tutorials/pretraining.md
+++ b/docs/tutorials/pretraining.md
@@ -35,7 +35,7 @@ We can use this **command** for pretraining:
 ```bash
 # replace base_output_directory with your bucket
-python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 base_output_directory=gs://runner-maxtext-logs run_name=demo \
 model_name=deepseek2-16b per_device_batch_size=1 steps=10 max_target_length=2048 enable_checkpointing=false \
 dataset_type=hf hf_path=allenai/c4 hf_data_dir=en train_split=train \
@@ -102,7 +102,7 @@ This **command** shows pretraining with Grain pipeline, along with evaluation:
 ```bash
 # replace DATASET_GCS_BUCKET and base_output_directory with your buckets
-python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 base_output_directory=gs://runner-maxtext-logs run_name=demo \
 model_name=deepseek2-16b per_device_batch_size=1 steps=10 max_target_length=2048 enable_checkpointing=false \
 dataset_type=grain grain_file_type=arrayrecord grain_train_files=/tmp/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record* grain_worker_count=2 \
@@ -139,7 +139,7 @@ This **command** shows pretraining with TFDS pipeline, along with evaluation:
 ```bash
 # replace base_output_directory and dataset_path with your buckets
-python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 base_output_directory=gs://runner-maxtext-logs run_name=demo \
 model_name=deepseek2-16b per_device_batch_size=1 steps=10 max_target_length=2048 enable_checkpointing=false \
 dataset_type=tfds dataset_path=gs://maxtext-dataset dataset_name='c4/en:3.0.1' train_split=train \
diff --git a/src/maxtext/experimental/agent/ckpt_conversion_agent/README.md b/src/maxtext/experimental/agent/ckpt_conversion_agent/README.md
index 28b3bac036..44cd63656a 100644
--- a/src/maxtext/experimental/agent/ckpt_conversion_agent/README.md
+++ b/src/maxtext/experimental/agent/ckpt_conversion_agent/README.md
@@ -66,7 +66,7 @@ If a ground-truth version isn't available, you'll need to debug the conversion m
 3. After the conversion is done, run a decode to check the correctness of the generated code. Example command:
 ```bash
-python3 -m maxtext.inference.decode src/maxtext/configs/base.yml model_name=gemma3-4b tokenizer_path=src/maxtext/assets/tokenizers/tokenizer.gemma3 \
+python3 -m maxtext.inference.decode model_name=gemma3-4b tokenizer_path=src/maxtext/assets/tokenizers/tokenizer.gemma3 \
 load_parameters_path= per_device_batch_size=1 run_name=ht_test \
 max_prefill_predict_length=8 max_target_length=16 steps=1 async_checkpointing=false scan_layers=true \
 prompt='I love to' attention='dot_product'
diff --git a/src/maxtext/inference/mlperf/README.md b/src/maxtext/inference/mlperf/README.md
index e6fc9d1d80..0d553c852a 100644
--- a/src/maxtext/inference/mlperf/README.md
+++ b/src/maxtext/inference/mlperf/README.md
@@ -100,7 +100,7 @@ export SAVE_QUANT_PARAMS_PATH=gs://${USER}-bkt/quantized/llama2-70b-chat
 # other tokenizers under src/maxtext/assets/ directory.
 export TOKENIZER_PATH="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets}}"'/tokenizer.llama2'
 cd maxtext && \
-python3 -m maxtext.inference.decode src/maxtext/configs/base.yml tokenizer_path=${TOKENIZER_PATH?} load_parameters_path=${LOAD_PARAMS_PATH?} max_prefill_predict_length=1024 max_target_length=2048 model_name=llama2-70b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=-1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=11 attention=dot_product quantization=int8 save_quantized_params_path=${SAVE_QUANT_PARAMS_PATH?}
+python3 -m maxtext.inference.decode tokenizer_path=${TOKENIZER_PATH?} load_parameters_path=${LOAD_PARAMS_PATH?} max_prefill_predict_length=1024 max_target_length=2048 model_name=llama2-70b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=-1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=11 attention=dot_product quantization=int8 save_quantized_params_path=${SAVE_QUANT_PARAMS_PATH?}
 ```
 Your checkpoint is generated at `$SAVE_QUANT_PARAMS_PATH`. This is used to set `load_parameters_path` param below in `MAXENGINE_ARGS` env variable.