JohnnyBoy00 committed on
Commit
b116166
1 Parent(s): 0df4b7b

Upload evaluation.py

Files changed (1)
  1. evaluation.py +215 -0
evaluation.py ADDED
@@ -0,0 +1,215 @@
import numpy as np
import torch

from evaluate import load as load_metric

from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

MAX_TARGET_LENGTH = 128

# load evaluation metrics
sacrebleu = load_metric('sacrebleu')
rouge = load_metric('rouge')
meteor = load_metric('meteor')
bertscore = load_metric('bertscore')

# use gpu if it's available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def flatten_list(l):
    """
    Utility function to convert a list of lists into a flattened list

    Params:
        l (list of lists): list to be flattened
    Returns:
        A flattened list with the elements of the original list
    """
    return [item for sublist in l for item in sublist]

def parse_float(value):
    """
    Utility function to parse a string into a float

    Params:
        value (string): value to be converted to float
    Returns:
        The float representation of the given string, or -1 if the string could
        not be converted to a float
    """
    try:
        float_value = float(value)
        return float_value
    except ValueError:
        return -1

def extract_scores(predictions):
    """
    Utility function to extract the scores from the predictions of the model

    Params:
        predictions (list): complete model predictions
    Returns:
        scores (list): extracted scores from the model's predictions
    """
    scores = []
    # iterate through predictions and try to extract the predicted score;
    # if the score could not be extracted, set it to -1
    for pred in predictions:
        try:
            score_string = pred.split('Feedback:', 1)[0].strip()
            score = parse_float(score_string)
        except IndexError:
            try:
                score_string = pred.split(' ', 1)[0].strip()
                score = parse_float(score_string)
            except IndexError:
                score = -1
        scores.append(score)

    return scores

def extract_feedback(predictions):
    """
    Utility function to extract the feedback from the predictions of the model

    Params:
        predictions (list): complete model predictions
    Returns:
        feedback (list): extracted feedback from the model's predictions
    """
    feedback = []
    # iterate through predictions and try to extract the predicted feedback
    for pred in predictions:
        try:
            fb = pred.split(':', 1)[1]
        except IndexError:
            try:
                fb = pred.split(' ', 1)[1]
            except IndexError:
                fb = pred
        feedback.append(fb.strip())

    return feedback

def compute_mse(predictions, labels):
    """
    Utility function to compute the mean squared error of the
    score predictions in relation to the golden label scores

    Params:
        predictions (list): model score predictions
        labels (list): golden label scores
    Returns:
        (float, int): mse of valid samples and number of invalid samples
    """
    # get indexes of valid score predictions
    # (i.e., where the score is greater than zero)
    idx = np.where(np.array(predictions) > 0)

    # get size of the golden labels list and of
    # the valid predictions array
    labels_size = len(labels)
    valid_predictions_size = idx[0].size

    # only compute mse if valid score predictions were generated,
    # otherwise set mse to 1
    if valid_predictions_size > 0:
        # calculate mse from labels and predictions
        valid_predictions = np.array(predictions)[idx]
        score_labels = np.array(labels)[idx]
        mse = mean_squared_error(score_labels, valid_predictions)

        # cap mse at 1
        if mse > 1:
            return 1, labels_size - valid_predictions_size

        # return computed mse and number of invalid samples
        return mse, labels_size - valid_predictions_size
    else:
        return 1, labels_size - valid_predictions_size

def compute_metrics(predictions, labels):
    """
    Compute evaluation metrics from the predictions of the model

    Params:
        predictions (list): complete model predictions
        labels (list): golden labels (previously tokenized)
    Returns:
        results (dict): dictionary with the computed evaluation metrics
    """
    # extract predicted feedback and scores from the model's predictions
    predicted_feedback = extract_feedback(predictions)
    predicted_scores = extract_scores(predictions)

    # extract reference feedback and scores from the golden labels
    reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
    reference_scores = [float(x.split('Feedback:', 1)[0].strip()) for x in labels]

    # compute HF metrics
    sacrebleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
    rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
    meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
    bert_score = bertscore.compute(
        predictions=predicted_feedback,
        references=reference_feedback,
        lang='de',
        model_type='bert-base-multilingual-cased',
        rescale_with_baseline=True)

    # compute mse of score predictions
    mse, _ = compute_mse(predicted_scores, reference_scores)

    results = {
        'sacrebleu': sacrebleu_score,
        'rouge': rouge_score,
        'meteor': meteor_score,
        'bert_score': np.array(bert_score['f1']).mean().item(),
        'mse': mse
    }

    return results

def evaluate(model, tokenizer, dataloader):
    """
    Evaluate the model on the given dataset

    Params:
        model (PreTrainedModel): seq2seq model
        tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
        dataloader (torch DataLoader): dataloader of the dataset to be used for evaluation
    Returns:
        results (dict): dictionary with the computed evaluation metrics
        predictions (list): list of the decoded predictions of the model
    """
    decoded_preds, decoded_labels = [], []

    model.eval()
    # iterate through batches in the dataloader
    for batch in tqdm(dataloader):
        with torch.no_grad():
            batch = {k: v.to(device) for k, v in batch.items()}
            # generate tokens from batch
            generated_tokens = model.generate(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=MAX_TARGET_LENGTH
            )
            # get golden labels from batch
            labels_batch = batch['labels']

            # decode model predictions and golden labels
            decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)

            decoded_preds.append(decoded_preds_batch)
            decoded_labels.append(decoded_labels_batch)

    # convert predictions and golden labels into flattened lists
    predictions = flatten_list(decoded_preds)
    labels = flatten_list(decoded_labels)

    # compute metrics based on predictions and golden labels
    results = compute_metrics(predictions, labels)

    return results, predictions
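
For context, the following is a minimal usage sketch of this module's evaluate() function; it is not part of the commit. It assumes a fine-tuned seq2seq checkpoint and reference strings in the "<score> Feedback: <text>" format that compute_metrics() parses; the checkpoint name, example texts, sequence lengths, and batch size are placeholders.

# usage_sketch.py -- hypothetical example, not part of the uploaded file
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from evaluation import device, evaluate

# placeholder checkpoint: substitute the actual fine-tuned feedback model
MODEL_NAME = 'google/mt5-small'
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# toy evaluation pairs; targets follow the '<score> Feedback: <text>' convention
# that extract_scores(), extract_feedback() and compute_metrics() expect
source_texts = ['Frage: ... Antwort des Studenten: ...',
                'Frage: ... Antwort des Studenten: ...']
target_texts = ['1.0 Feedback: Die Antwort ist vollständig korrekt.',
                '0.5 Feedback: Die Antwort ist nur teilweise richtig.']

# tokenize inputs and targets; labels are padded with the regular pad token so
# that tokenizer.batch_decode(..., skip_special_tokens=True) recovers the text
model_inputs = tokenizer(source_texts, max_length=MAX_INPUT_LENGTH,
                         padding='max_length', truncation=True, return_tensors='pt')
label_ids = tokenizer(text_target=target_texts, max_length=MAX_TARGET_LENGTH,
                      padding='max_length', truncation=True, return_tensors='pt').input_ids

# build a list of per-sample dicts; the default collate_fn stacks them into the
# {'input_ids', 'attention_mask', 'labels'} batches that evaluate() iterates over
dataset = [{'input_ids': model_inputs['input_ids'][i],
            'attention_mask': model_inputs['attention_mask'][i],
            'labels': label_ids[i]}
           for i in range(len(source_texts))]
dataloader = DataLoader(dataset, batch_size=2)

results, predictions = evaluate(model, tokenizer, dataloader)
print(results)      # dict with sacrebleu, rouge, meteor, bert_score and mse
print(predictions)  # decoded model outputs, one string per evaluation sample

Note that evaluate() moves each batch to device but not the model, so the caller is responsible for model.to(device), and the label tensors must be directly decodable (i.e., padded with the tokenizer's pad token rather than -100).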