likert.py
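
# Evaluate an assistant's answers with an LLM judge on a 1-4 Likert scale using limin_bench.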
import asyncio
from limin import ModelConfiguration
from limin_bench import (
Dataset,
LikertJudge,
generate_evaluation_run_likert,
generate_model_run,
)
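
# Dataset of prompts the assistant will answer.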
dataset = Dataset(
    rows=[
        "What is the capital of France?",
        "What is the capital of Germany?",
    ]
)
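
# System prompt for the assistant under evaluation.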
assistant_system_prompt = """
You are a helpful assistant.
Answer the user's questions factually and correctly.
"""
judge_system_prompt = """
You are an LLM acting as a judge.
You will be given a conversation between a user and an assistant.
You will then judge whether the assistant's answers are factually correct or not.
Return a number between 1 and 4 where:
- 1 means the answer is strongly incorrect
- 2 means the answer is incorrect
- 3 means the answer is correct
- 4 means the answer is strongly correct
"""
likert_judge = LikertJudge(
    model_configuration=ModelConfiguration(model="gpt-4o"),
    system_prompt=judge_system_prompt,
)


async def main():
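    # Run the assistant over every prompt in the dataset.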
    model_run = await generate_model_run(
        dataset=dataset,
        system_prompt=assistant_system_prompt,
        model_configuration=ModelConfiguration(model="gpt-4o"),
    )
print("Full model run:")
print(model_run.model_dump_json())
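
    # Have the Likert judge score each of the assistant's answers.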
    evaluation_run = await generate_evaluation_run_likert(
        model_run=model_run, likert_judge=likert_judge, structured=True
    )
    print("Full evaluation run:")
    print(evaluation_run.model_dump_json())
    print("Average score:")
    print(evaluation_run.avg)


if __name__ == "__main__":
    asyncio.run(main())