forked from lancopku/simNet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKarpathySplit.py
54 lines (37 loc) · 1.34 KB
/
KarpathySplit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# coding: utf-8
# # Karpathy Split for MS-COCO Dataset
#
# Merges the COCO 2014 train and val caption annotation files, shuffles the
# combined image list with a fixed seed, and writes three new annotation
# files: `num_val` images for validation, `num_test` images for test, and all
# remaining images for training.
import json
from random import shuffle, seed

# Fixed seed so the shuffle below (and therefore the split) is reproducible.
seed(123)

num_val = 5000
num_test = 5000

# Context managers guarantee the input files are closed once parsed instead
# of relying on garbage collection to release the handles.
with open('./data/annotations/captions_val2014.json', 'r') as f:
    val = json.load(f)
with open('./data/annotations/captions_train2014.json', 'r') as f:
    train = json.load(f)

# Merge together
imgs = val['images'] + train['images']
annots = val['annotations'] + train['annotations']
shuffle(imgs)

# Split into val, test, train
dataset = {}
dataset['val'] = imgs[:num_val]
dataset['test'] = imgs[num_val: num_val + num_test]
dataset['train'] = imgs[num_val + num_test:]

# Group by image ids: image_id -> list of annotation records for that image
itoa = {}
for a in annots:
    itoa.setdefault(a['image_id'], []).append(a)

json_data = {}
# The 'info' and 'licenses' sections are taken from the train file and
# reused for every output subset.
info = train['info']
licenses = train['licenses']

split = ['test', 'val', 'train']

for subset in split:
    json_data[subset] = {
        'type': 'caption',
        'info': info,
        'licenses': licenses,
        'images': [],
        'annotations': [],
    }

    for img in dataset[subset]:
        # A KeyError here means the image has no annotation record at all.
        anns = itoa[img['id']]
        json_data[subset]['images'].append(img)
        json_data[subset]['annotations'].extend(anns)

    # Close the output file explicitly so the JSON is fully flushed to disk
    # before the next subset is processed (and before the script exits).
    with open('./data/annotations/karpathy_split_' + subset + '.json', 'w') as f:
        json.dump(json_data[subset], f)