File tree 2 files changed +6
-6
lines changed
3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development
2 files changed +6
-6
lines changed Original file line number Diff line number Diff line change @@ -25,10 +25,9 @@ tokenizer:
25
25
26
26
# Dataset
27
27
dataset :
28
- _component_ : torchtune.datasets.wiki_text
29
- train_on_input : True
28
+ _component_ : torchtune.datasets.wikitext_dataset
30
29
seed : null
31
- shuffle : True
30
+ shuffle : False
32
31
33
32
# Model Arguments
34
33
model :
@@ -75,8 +74,8 @@ checkpointer:
75
74
resume_from_checkpoint : False
76
75
77
76
# Fine-tuning arguments
78
- batch_size : 2
79
- epochs : 3
77
+ batch_size : 1
78
+ epochs : 1
80
79
81
80
optimizer :
82
81
_component_ : torch.optim.AdamW
@@ -95,6 +94,7 @@ device: cuda
95
94
# Memory management
96
95
enable_activation_checkpointing : True
97
96
memory_efficient_fsdp_wrap : True
97
+ fsdp_cpu_offload : True
98
98
99
99
# Reduced precision
100
100
dtype : bf16
Original file line number Diff line number Diff line change @@ -77,7 +77,7 @@ declare -a TORCHRUN_ARGS=(
77
77
--rdzv_endpoint=$(hostname)
78
78
)
79
79
declare -a TRAIN_ARGS=(
80
- --config ${PWD}/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed.yaml
80
+ --config ${PWD}/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml
81
81
tokenizer.path=${MODEL_PATH}/${HF_MODEL}/original/tokenizer.model
82
82
checkpointer.checkpoint_dir=${MODEL_PATH}/${HF_MODEL}
83
83
checkpointer.output_dir=${MODEL_PATH}/${HF_MODEL}-tuned
You can’t perform that action at this time.
0 commit comments