import json

from evaluate import load
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.spice.spice import Spice

ACC = 0          # best answer accuracy seen so far
best_result = 0  # iteration (epoch) that achieved it
total_epoch = 30
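
# Evaluate the generated "<answer> because <explanation>" captions saved after each
# training epoch, and report the epoch with the best answer accuracy.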
for iter in range(total_epoch):
    nlx_result_path = 'results/unf_captions_full_{}.json'.format(iter)
    with open(nlx_result_path, 'r') as i:
        result_json = json.load(i)
    with open('nle_data/VQA-X/vqaX_test.json', 'r') as j:
        test_json = json.load(j)
    with open('nle_data/VQA-X/cocoEval_test.json', 'r') as k:
        unf_references = json.load(k)
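
    # result_json    : model outputs ("<answer> because <explanation>" captions)
    # test_json      : VQA-X test annotations (ground-truth answers and explanations)
    # unf_references : reference explanations in COCO-caption-eval format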
    # Separate each entry of 'result_json' into an answer and an explanation.
    # The predicted and reference captions must match the format below:
    #   predictions = {image_id: [predicted_caption1, predicted_caption2, ...]}
    #   references  = {image_id: [reference_caption1, reference_caption2, ...]}
    error = 0
    result_answer = []
    unf_predictions = {}
    for result in result_json:
        image_ids = result["image_id"]
        sentence = result['caption']
        if ' because ' in sentence:
            # split "<answer> because <explanation>" at the first ' because '
            words = sentence.split(' because ', 1)
            result_answer.append(words[0])
            unf_predictions[str(image_ids)] = [words[1]]
        else:
            # malformed output with no ' because ': record an empty answer/explanation
            error += 1
            result_answer.append('')
            unf_predictions[str(image_ids)] = ['']
    print('number of error outputs in answer:', error)
    # unfiltered metric scores (all test images, no answer filtering)
    print('unf metric score')
    for evaluator in [Cider(), Bleu(), Meteor(), Rouge()]:  # Spice()
        score, scores = evaluator.compute_score(unf_references, unf_predictions)
        print(str(evaluator), score)
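
    # Filtered evaluation: an explanation is only scored when the predicted answer
    # matches one of the ground-truth answers for that question; all other samples
    # are dropped from the filtered reference/prediction dicts.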
    # unfiltered BERTScore
    # Copy the dicts so the unfiltered references/predictions are not modified
    # when filtered-out entries are deleted below.
    f_references, f_predictions = unf_references.copy(), unf_predictions.copy()
    bertscore = load("bertscore")
    i = 0
    score = 0
    precision = 0
    explanations = list(unf_predictions.values())
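
    # Walk over the test questions in order; indexing with i assumes result_json
    # lists the samples in the same order as test_json. For each question, tally
    # the annotators' answers and check whether the predicted answer matches any.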
    for key, value in test_json.items():
        # count how often each ground-truth answer was given by the annotators
        ans_dic = {}
        for answers in value['answers']:
            if answers['answer'] in ans_dic:
                ans_dic[answers['answer']] += 1
            else:
                ans_dic[answers['answer']] = 1
        # answer = max(ans_dic, key=lambda k: ans_dic[k])
        if result_answer[i] in ans_dic.keys():
            precision += 1
            references = value['explanation']
            # repeat the single predicted explanation so it is scored against every reference
            prediction = [explanations[i][0] for k in range(len(references))]
            bert_score = bertscore.compute(predictions=prediction, references=references, lang="en")
            # average the BERTScore F1 over all reference explanations
            s = 0
            for b in bert_score['f1']:
                s += float(b)
            score += s / len(bert_score['f1'])
            ## alternatively, take the maximum instead of the average:
            # score += max(bert_score['precision'])
        else:
            # wrong answer: drop this sample from the filtered dicts
            del f_references[key]
            del f_predictions[key]
        i += 1
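
    # score / precision     : mean BERTScore F1 over correctly answered samples
    # precision / total_num : answer accuracy over the whole test set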
    total_num = len(result_json)
    print('precision:', precision)
    print("'f1': ", score / precision, " 'precision': ", precision / total_num)
    print("number of f_references: ", len(f_references))
    # metric scores on the filtered (correctly answered) subset
    print('filtered metric score')
    for evaluator in [Cider(), Bleu(), Meteor(), Rouge()]:  # Spice()
        score, scores = evaluator.compute_score(f_references, f_predictions)
        print(str(evaluator), score)
    # keep track of the iteration (epoch) with the best answer accuracy so far
    if ACC < precision / total_num:
        ACC = precision / total_num
        best_result = iter
    print('best_result iteration is ', best_result)