# finetune_model.py
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import os
import glob
import json
# Model and data paths
model_name = "meta-llama/CodeLlama-13b-Instruct-hf"
training_data_dir = "./training_data" # Directory containing your JSON files
output_dir = "./finetuned-model"
# Find all JSON files in the directory
json_files = glob.glob(os.path.join(training_data_dir, "*.json"))
print(f"Found {len(json_files)} JSON files in {training_data_dir}")
# Combine all JSON files into a single JSONL file for training
temp_jsonl_file = os.path.join(training_data_dir, "combined_training_data.jsonl")
with open(temp_jsonl_file, "w", encoding="utf-8") as outfile:
    for json_file in json_files:
        try:
            with open(json_file, "r", encoding="utf-8") as infile:
                data = json.load(infile)

            # Convert each record to a chat-style training example.
            if "source" in data and data["source"] == "AI_Comparison":
                # Comparison data: choose which response to train on. This
                # script defaults to "other_ai_response"; you may want to add
                # logic that picks the better response per record.
                training_example = {
                    "messages": [
                        {"role": "user", "content": data["instruction"]},
                        {"role": "assistant", "content": data["other_ai_response"]},
                    ]
                }
            else:
                # Regular instruction/response examples
                training_example = {
                    "messages": [
                        {"role": "user", "content": data["instruction"]},
                        {"role": "assistant", "content": data["response"]},
                    ]
                }

            # Write as JSONL (one JSON object per line)
            outfile.write(json.dumps(training_example) + "\n")
            print(f"Processed: {json_file}")
        except Exception as e:
            print(f"Error processing {json_file}: {e}")
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LLaMA-family tokenizers ship without a pad token; set one so that
# padding="max_length" in the tokenization step below does not raise an error.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
)
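# With per_device_train_batch_size=4 and gradient_accumulation_steps=4, the
# effective batch size is 16 examples per device per optimizer step.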
# Load the combined JSONL dataset
train_dataset = load_dataset("json", data_files=temp_jsonl_file)["train"]
print(f"Dataset loaded with {len(train_dataset)} examples")
# Tokenize the dataset
def tokenize_function(examples):
    # Flatten the chat messages from the JSONL file into plain text
    prompts = []
    for message_list in examples["messages"]:
        # Simple prompt format: "user: ...\nassistant: ...\n"
        formatted_text = ""
        for message in message_list:
            role = message["role"]
            content = message["content"]
            formatted_text += f"{role}: {content}\n"
        prompts.append(formatted_text)
    return tokenizer(prompts, truncation=True, padding="max_length", max_length=1024)
# Apply tokenization, dropping the raw "messages" column so only model
# inputs (input_ids, attention_mask) remain
tokenized_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=train_dataset.column_names
)
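# Note: the tokenized dataset has no "labels" column; the data collator
# passed to the Trainer below copies input_ids into labels so the causal
# LM loss can be computed.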
# Initialize the Trainer; DataCollatorForLanguageModeling with mlm=False
# builds causal-LM labels from input_ids, without which training would fail
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
# Start training
trainer.train()
# Save the final model
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Optionally, clean up the temporary JSONL file
# os.remove(temp_jsonl_file)
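# Quick smoke test after training (a minimal sketch; the prompt text and
# generation settings are illustrative assumptions, not tuned values):
# from transformers import pipeline
# pipe = pipeline("text-generation", model=output_dir, tokenizer=output_dir)
# print(pipe("user: Write a hello world in Python\nassistant:",
#            max_new_tokens=64)[0]["generated_text"])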