-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathphase1_tokenize_dataset.sh
More file actions
executable file
·36 lines (35 loc) · 1.33 KB
/
phase1_tokenize_dataset.sh
File metadata and controls
executable file
·36 lines (35 loc) · 1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env bash
#
# phase1_tokenize_dataset.sh — run phase1_training.py with --tokenize_only.
#
# Passes the phase-1 train/val/test JSONL files plus the full phase-1
# hyperparameter set to phase1_training.py.
# NOTE(review): --tokenize_only presumably makes the Python script tokenize the
# splits and stop before training — confirm in phase1_training.py.
#
# Usage:
#   ./phase1_tokenize_dataset.sh [extra phase1_training.py flags...]
#
# Extra flags are appended after the defaults below. NOTE(review): whether a
# repeated flag overrides the default depends on the Python script's parser
# (argparse keeps the last occurrence) — verify before relying on it.
#
# All paths are relative to the current directory, so run this from the
# directory that contains phase1_training.py and training_data/.

# Catch misspelled variable names in future edits. -e is not needed: the python
# call is the last command, so its status is already the script's exit status.
set -u

# The arguments live in an array so that single flags can be commented out or
# enabled in place; that is not possible inside a backslash-continued command.
tokenize_args=(
  # Mode
  --tokenize_only

  # Model and data locations
  --model_name "TinyLLaMA"
  --train_file training_data/phase1/train.jsonl
  --val_file training_data/phase1/val.jsonl
  --test_file training_data/phase1/test.jsonl
  --tokenized_data_path training_data/phase1
  --output_dir ./training_output/tinyllama_phase1
  --base_model_cache_dir ./hf_cache

  # Sequence length, batching and optimizer schedule
  --max_seq_length 1024
  --num_train_epochs 3
  --per_device_train_batch_size 2
  --per_device_eval_batch_size 4
  --gradient_accumulation_steps 4
  --learning_rate 2e-4
  --weight_decay 0.01
  --warmup_ratio 0.03
  --lr_scheduler_type "cosine"
  --optim "paged_adamw_8bit"

  # LoRA settings
  --lora_r 16
  --lora_alpha 32
  --lora_dropout 0.05

  # Logging, checkpointing and evaluation cadence
  --logging_steps 10
  --save_steps 200
  --eval_steps 100
  --save_total_limit 3
  --report_to "tensorboard"
  --seed 42

  # Quantization
  --load_in_4bit
  # Uncomment if your setup supports it AND you are NOT using --load_in_4bit
  # (usually one or the other):
  # --use_flash_attention_2

  # Per-split sample limits. Lower these to work on a subset of the tokenized
  # data, e.g. --max_train_samples 10000 and --max_eval_samples 1000.
  --max_train_samples 600000
  --max_eval_samples 200000
  --max_test_samples 200000
)

# Defaults first, caller-supplied flags last.
python phase1_training.py "${tokenize_args[@]}" "$@"