# eval.py
from collections import Counter, defaultdict
from pprint import pprint
import argparse

import numpy as np  # needed by eval_gqa (np.round / np.asarray)
import pandas as pd

from eval_videomme import eval_your_results  # exposed to the CLI dispatcher below
from util import *  # provides load_json

def eval_qa_egoschema(data):
    num_valids = 0
    num_corrects = 0
    for uid, el in data.items():
        if el['pred'] == -1:
            num_corrects += 0.2  # random guess over the 5 answer options
            continue
        num_valids += 1
        if el['truth'] == el['pred']:
            num_corrects += 1

    stat = {
        'num_total': len(data),
        'num_valids': num_valids,
        'num_corrects': num_corrects,
        'acc': num_corrects / len(data),
    }
    pprint(stat)

    stat['data'] = data
    return stat
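
# Example of the expected prediction format (illustrative; uids and indices are
# made up): 'pred' / 'truth' are answer indices, and pred == -1 marks a model
# output that could not be parsed into a choice.
#   data = {'uid_a': {'pred': 2, 'truth': 2}, 'uid_b': {'pred': -1, 'truth': 0}}
#   eval_qa_egoschema(data)  # acc = (1 + 0.2) / 2 = 0.6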

def eval_qa_egoschema_from_file(fp):
    data = load_json(fp)
    if 'data' in data:
        data = data['data']
    return eval_qa_egoschema(data)

def eval_qa_nextqa(anno_file_path, preds):
    '''
    This function was adapted from https://github.com/doc-doc/NExT-QA/blob/main/eval_mc.py
    '''
    map_name = {'CW': 'Why', 'CH': 'How', 'TN': 'Bef&Aft', 'TC': 'When', 'DC': 'Cnt', 'DL': 'Loc', 'DO': 'Other', 'C': 'Acc_C', 'T': 'Acc_T', 'D': 'Acc_D'}
    sample_list = pd.read_csv(anno_file_path)
    group = {'CW': [], 'CH': [], 'TN': [], 'TC': [], 'DC': [], 'DL': [], 'DO': []}
    for _, row in sample_list.iterrows():
        qns_id = str(row['video']) + '_' + str(row['qid'])
        if qns_id not in preds:
            continue
        qtype = str(row['type'])
        # (combine temporal qns of previous and next as 'TN')
        if qtype == 'TP':
            qtype = 'TN'
        group[qtype].append(qns_id)

    group_acc = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0}
    group_cnt = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0}
    overall_acc = {'C': 0, 'T': 0, 'D': 0}
    overall_cnt = {'C': 0, 'T': 0, 'D': 0}
    all_acc = 0
    all_cnt = 0
    for qtype, qns_ids in group.items():
        cnt = 0
        acc = 0
        for qid in qns_ids:
            cnt += 1
            answer = preds[qid]['truth']
            pred = preds[qid]['pred']
            if pred == -1:
                acc += 0.2  # random guess over the 5 answer options
            elif answer == pred:
                acc += 1
        group_cnt[qtype] = cnt
        group_acc[qtype] += acc
        overall_acc[qtype[0]] += acc
        overall_cnt[qtype[0]] += cnt
        all_acc += acc
        all_cnt += cnt

    for qtype, value in overall_acc.items():
        group_acc[qtype] = value
        group_cnt[qtype] = overall_cnt[qtype]

    stat = {}
    for qtype in group_acc:
        print(map_name[qtype], end='\t')
    print('')
    for qtype, acc in group_acc.items():
        if group_cnt[qtype] == 0:
            stat[qtype] = 0
            print('{:.2f}'.format(0), end='\t')
        else:
            stat[qtype] = acc * 100.0 / group_cnt[qtype]
            print('{:.2f}'.format(acc * 100.0 / group_cnt[qtype]), end='\t')
    stat['Acc'] = all_acc * 100.0 / all_cnt
    print('')
    print('Acc: {:.2f}'.format(all_acc * 100.0 / all_cnt))

    stat['data'] = preds
    return stat
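
# Note: the annotation CSV is expected to provide 'video', 'qid', and 'type'
# columns (NExT-QA question-type codes CW, CH, TN, TC, TP, DC, DL, DO), and
# predictions are keyed as f'{video}_{qid}'; this is inferred from the
# accesses above.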

def eval_qa_nextqa_from_file(anno_file_path, pred_file_path):
    data = load_json(pred_file_path)
    if 'data' in data:
        data = data['data']
    return eval_qa_nextqa(anno_file_path, data)

def eval_sum(data):
    num_words_ls = []
    for example in data.values():
        summarization = example['response']
        # count words after stripping basic punctuation; split() also drops empty tokens
        num_words = len(summarization.replace('.', ' ').replace(',', ' ').replace('\n', ' ').split())
        num_words_ls.append(num_words)
    num_words_series = pd.Series(num_words_ls)

    stat = {
        'min': float(num_words_series.min()),
        'max': float(num_words_series.max()),
        'mean': float(num_words_series.mean()),
        'std': float(num_words_series.std()),
    }

    stat['data'] = data
    sum_data = {uid: el['response'] for uid, el in data.items()}
    return stat, sum_data

def eval_gqa(gt_ground_path, pred_ground_raw, pred_qa_path=None, subset=None, gs=False):
    '''
    This function is adapted from https://github.com/doc-doc/NExT-GQA/blob/main/code/TempGQA/eval_ground.py
    '''
    def get_tIoU(loc, span):
        # degenerate (zero-length) prediction: IoP is 1 if it falls inside the ground truth
        if span[0] == span[-1]:
            if loc[0] <= span[0] and span[0] <= loc[1]:
                return 0, 1
            else:
                return 0, 0

        span_u = (min(loc[0], span[0]), max(loc[-1], span[-1]))
        span_i = (max(loc[0], span[0]), min(loc[-1], span[-1]))
        dis_i = (span_i[1] - span_i[0])
        if span_u[1] > span_u[0]:
            IoU = dis_i / (span_u[1] - span_u[0])
        else:
            IoU = 0.0
        if span[-1] > span[0]:
            IoP = dis_i / (span[-1] - span[0])
        else:
            IoP = 0.0
        return IoU, IoP
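
    # Worked example (illustrative): loc = [10.0, 20.0], span = [12.0, 18.0]
    # gives intersection 18 - 12 = 6 and union 20 - 10 = 10, so tIoU = 0.6;
    # tIoP normalizes by the predicted span instead: 6 / (18 - 12) = 1.0.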

    def get_tIoU_multi(loc, spans):
        overlap = 0
        loc_start, loc_end = loc[0], loc[1]
        for [span_start, span_end] in spans:
            if span_start > span_end:
                span_start, span_end = span_end, span_start
            # if span_end < loc_start or span_start > loc_end:
            #     continue
            overlap_start, overlap_end = max(span_start, loc_start), min(span_end, loc_end)
            if overlap_start >= overlap_end:
                continue
            overlap += abs(overlap_end - overlap_start)
        gt = loc[1] - loc[0]
        pred = sum(map(lambda x: abs(x[1] - x[0]), spans))
        IoU = overlap / (gt + pred - overlap)
        IoP = overlap / pred
        return IoU, IoP
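
    # Worked example (illustrative): loc = [10, 20], spans = [[12, 15], [18, 25]]
    # gives overlap 3 + 2 = 5, gt = 10, pred = 3 + 7 = 10, so
    # IoU = 5 / (10 + 10 - 5) = 1/3 and IoP = 5 / 10 = 0.5.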

    gt_ground = load_json(gt_ground_path)
    pred_qa = load_json(pred_qa_path) if pred_qa_path else None
    if pred_qa:
        if 'data' in pred_qa:
            pred_qa = pred_qa['data']
    pred_ground = {quid: el['pred'] for quid, el in pred_ground_raw.items()}

    mIoU, mIoP = 0, 0
    cnt, cqt = 0, 0
    crt3, crt5 = 0, 0
    crtp3, crtp5 = 0, 0
    for vid, anno in gt_ground.items():
        for qid, locs in anno['location'].items():
            if f'{vid}_{qid}' not in pred_ground:
                # print(vid, qid)
                continue
            if subset is not None:
                # Non-Blind and Non-Sig QA subset
                if f'{vid}_{qid}' not in subset:
                    continue
            max_tIoU, max_tIoP = 0, 0
            for loc in locs:
                span = pred_ground[f'{vid}_{qid}']
                # Gaussian (gs) predictions are fractions of the video, so scale them by its duration
                if gs:
                    span = np.round(np.asarray(span) * anno['duration'], 1)
                span = span[0]  # evaluate the first predicted window
                tIoU, tIoP = get_tIoU(loc, span)
                # span = [span[0]]
                # tIoU, tIoP = get_tIoU_multi(loc, span)
                if tIoU > max_tIoU:
                    max_tIoU = tIoU
                if tIoP > max_tIoP:
                    max_tIoP = tIoP
            if max_tIoP >= 0.3:
                crtp3 += 1
                if max_tIoP >= 0.5:
                    crtp5 += 1
                    kid = f'{vid}_{qid}'
                    if pred_qa:
                        if pred_qa[kid]['truth'] == pred_qa[kid]['pred']:
                            cqt += 1
                            # print(kid)
            if max_tIoU >= 0.3:
                crt3 += 1
                if max_tIoU >= 0.5:
                    crt5 += 1
                    # if pred_qa:
                    #     if pred_qa[kid]['answer'] == pred_qa[kid]['prediction']:
                    #         print(kid)
            cnt += 1
            mIoU += max_tIoU
            mIoP += max_tIoP

    mIoU = mIoU / cnt * 100
    mIoP = mIoP / cnt * 100
    print('{:.1f} \t {:.1f}\t {:.1f}\t {:.1f} \t {:.1f} \t {:.1f} \t {:.1f}'.format(
        cqt * 1.0 / cnt * 100, mIoP, crtp3 * 1.0 / cnt * 100, crtp5 * 1.0 / cnt * 100,
        mIoU, crt3 * 1.0 / cnt * 100, crt5 * 1.0 / cnt * 100))

    stat = {
        'Acc_GQA': cqt * 1.0 / cnt * 100,
        'mIoP': mIoP,
        'TIoP_0.3': crtp3 * 1.0 / cnt * 100,
        'TIoP_0.5': crtp5 * 1.0 / cnt * 100,
        'mIoU': mIoU,
        'TIoU_0.3': crt3 * 1.0 / cnt * 100,
        'TIoU_0.5': crt5 * 1.0 / cnt * 100,
    }
    stat['data'] = pred_ground_raw
    return stat

def eval_gqa_from_file(gt_ground_path, pred_ground_path, pred_qa_path=None):
    pred_ground = load_json(pred_ground_path)
    if 'data' in pred_ground:
        pred_ground = pred_ground['data']
    return eval_gqa(gt_ground_path, pred_ground, pred_qa_path=pred_qa_path)

def eval_egoschema_cats(data_path, cats_path):
    data = load_json(data_path)
    if 'data' in data:
        data = data['data']
    cats = load_json(cats_path)
    cats = {el[1]: el[-1] for el in cats}  # uid --> [cat0, cat1, ...]

    def eval(preds):
        num_corrects = defaultdict(int)  # q_type --> int
        num_total = defaultdict(int)  # q_type --> int
        for uid, info in preds.items():
            q_type_list = info['type']
            pred = info['pred']
            truth = info['truth']
            for q_type in q_type_list:
                if pred == -1:
                    num_corrects[q_type] += 0.2
                else:
                    num_corrects[q_type] += (pred == truth)
                num_total[q_type] += 1
        accs = {k: num_corrects[k] / num_total[k] for k in num_corrects}
        acc_all = sum(num_corrects.values()) / sum(num_total.values())
        return accs, acc_all

    for k, v in cats.items():
        for el in v:
            if el not in [1, 2, 3, 4, 5]:
                print('question category not found: ', k)

    # category stat
    id_to_name = {
        1: 'Purpose/Goal Identification',
        2: 'Character Interaction',
        3: 'Tools and Materials Usage',
        4: 'Key Action/Moment Detection',
        5: 'Action Sequence Analysis',
    }
    arr = sum(list(cats.values()), [])
    stat = Counter(arr).most_common()
    print('Category Statistics:')
    for q_type, count in stat:
        print(f"{id_to_name[q_type]}: {count / len(cats) * 100:.1f}")
    print()

    # eval
    preds = {uid: {'pred': uid_info['pred'], 'truth': uid_info['truth'], 'type': cats[uid]} for uid, uid_info in data.items() if uid in cats}
    accs, acc_all = eval(preds)
    accs = sorted(accs.items())
    print('Evaluation:')
    for k, v in accs:
        print(f"{id_to_name[k]}: {v * 100:.1f}")
    print()
    print(f"all: {acc_all * 100:.1f}")

def eval_qa_videomme(data):
    num_valids = 0
    num_corrects = 0
    for uid, el in data.items():
        if el['pred'] == -1:
            num_corrects += 0.25  # random guess over the 4 answer options
            continue
        num_valids += 1
        if el['truth'] == el['pred']:
            num_corrects += 1

    stat = {
        'num_total': len(data),
        'num_valids': num_valids,
        'num_corrects': num_corrects,
        'acc': num_corrects / len(data),
    }
    pprint(stat)

    stat['data'] = data
    return stat

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--function",
        required=True,
        type=str,
    )
    args, unknown = parser.parse_known_args()
    # leftover CLI tokens are consumed as alternating keyword names and values
    function_arg_names = unknown[0::2]
    function_arg_values = unknown[1::2]
    function_args = {function_arg_names[i]: function_arg_values[i] for i in range(len(function_arg_names))}
    print()
    globals()[args.function](**function_args)
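
# Illustration of the kwarg pairing above: the second usage example below,
#   python eval.py --function eval_qa_nextqa_from_file anno_file_path data/nextgqa/test.csv pred_file_path output/nextgqa/gpt4_llava.json
# is dispatched as
#   eval_qa_nextqa_from_file(anno_file_path='data/nextgqa/test.csv', pred_file_path='output/nextgqa/gpt4_llava.json')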

'''
Usage examples:

python eval.py \
    --function eval_egoschema_cats \
    data_path output/egoschema/standard_qa.json \
    cats_path data/egoschema/categories.json

python eval.py \
    --function eval_qa_nextqa_from_file \
    anno_file_path data/nextgqa/test.csv \
    pred_file_path output/nextgqa/gpt4_llava.json

python eval.py \
    --function eval_gqa_from_file \
    gt_ground_path data/nextgqa/gsub_test.json \
    pred_qa_path output/nextgqa/gpt4_llava.json \
    pred_ground_path output/nextgqa/gpt4_llava_grounding.json

python eval.py \
    --function eval_egoschema_cats \
    data_path output/egoschema/1106.json \
    cats_path data/egoschema/categories.json
'''