forked from vijayvee/video-captioning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_videocap.py
128 lines (118 loc) · 6.88 KB
/
test_videocap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/python
import numpy as np
import tensorflow as tf
from utils import *
import sys
#GLOBAL VARIABLE INITIALIZATIONS TO BUILD MODEL
# These module-level values are read directly by build_model() (and by the
# test loop under __main__) when the graph is constructed.
n_steps = 80  # time steps per video and per caption: second dim of the Input_Video / GT_Caption placeholders
hidden_dim = 500  # size of both LSTM cells, of the frame projection output and of each word embedding
frame_dim = 4096  # length of the per-frame feature vector (last dim of the Input_Video placeholder)
batch_size = 1  # first dim of every placeholder; the test loop decodes one video at a time
vocab_size = len(word2id)  # word2id is not defined in this file -- presumably provided by `from utils import *`
bias_init_vector = get_bias_vector()  # initial value of the output bias b_H2vocab; helper presumably comes from utils
def build_model():
    """Construct the video-captioning graph in the current default graph.

    The graph consists of:
      * a linear projection of each frame feature to ``hidden_dim``,
      * a word-embedding matrix of shape ``[vocab_size, hidden_dim]``,
      * an LSTM over the (zero-padded) projected frames,
      * a second LSTM over the (zero-padded) caption embeddings concatenated
        with the first LSTM's outputs,
      * a linear projection of the second LSTM's outputs to ``vocab_size``,
      * a masked, mean-normalised softmax cross-entropy loss.

    Sizes come from the module-level ``n_steps``, ``hidden_dim``,
    ``frame_dim``, ``batch_size``, ``vocab_size`` and ``bias_init_vector``.

    Returns:
        A 6-tuple ``(video, caption, caption_mask, output_logits, loss,
        dropout_prob)`` where the first three and the last are placeholders,
        ``output_logits`` has shape ``[batch_size*(n_steps-1), vocab_size]``
        and ``loss`` is a scalar tensor.
    """
    def _uniform_init():
        # A fresh U(-0.08, 0.08) initializer for each weight matrix.
        return tf.random_uniform_initializer(minval=-0.08, maxval=0.08)

    print("Network config: \nN_Steps: {}\nHidden_dim:{}\nFrame_dim:{}\nBatch_size:{}\nVocab_size:{}\n".format(
        n_steps, hidden_dim, frame_dim, batch_size, vocab_size))

    # Inputs: one batch of frame features, caption word ids, caption mask,
    # plus the dropout keep-probability (fed as 1.0 at test time).
    video_ph = tf.placeholder(tf.float32,
                              shape=[batch_size, n_steps, frame_dim],
                              name='Input_Video')
    caption_ph = tf.placeholder(tf.int32,
                                shape=[batch_size, n_steps],
                                name='GT_Caption')
    mask_ph = tf.placeholder(tf.float32,
                             shape=[batch_size, n_steps],
                             name='Caption_Mask')
    keep_prob_ph = tf.placeholder(tf.float32, name='Dropout_Keep_Probability')

    # Frame feature -> hidden_dim projection.
    with tf.variable_scope('Im2Cap'):
        frame_w = tf.get_variable(name='W_im2cap',
                                  shape=[frame_dim, hidden_dim],
                                  initializer=_uniform_init())
        frame_b = tf.get_variable(name='b_im2cap',
                                  shape=[hidden_dim],
                                  initializer=tf.constant_initializer(0.0))

    # Hidden state -> vocabulary projection; bias starts from bias_init_vector.
    with tf.variable_scope('Hid2Vocab'):
        vocab_w = tf.get_variable(name='W_H2vocab',
                                  shape=[hidden_dim, vocab_size],
                                  initializer=_uniform_init())
        vocab_b = tf.Variable(name='b_H2vocab',
                              initial_value=bias_init_vector.astype(np.float32))

    # Word embedding table.
    with tf.variable_scope('Word_Vectors'):
        embeddings = tf.get_variable(name='Word_embedding',
                                     shape=[vocab_size, hidden_dim],
                                     initializer=_uniform_init())
    print("Created weights")

    # Two LSTM cells, each wrapped with output dropout: one consumes the
    # video, the other produces the caption.
    with tf.variable_scope('LSTM_Video', reuse=None):
        video_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_dim)
        video_cell = tf.nn.rnn_cell.DropoutWrapper(video_cell,
                                                   output_keep_prob=keep_prob_ph)
    with tf.variable_scope('LSTM_Caption', reuse=None):
        caption_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_dim)
        caption_cell = tf.nn.rnn_cell.DropoutWrapper(caption_cell,
                                                     output_keep_prob=keep_prob_ph)

    # Video branch input: project every frame, then append n_steps-1 zero
    # steps so the sequence is 2*n_steps-1 long.
    flat_frames = tf.reshape(video_ph, [-1, frame_dim])
    flat_frames_dropped = tf.nn.dropout(flat_frames, keep_prob=keep_prob_ph)
    flat_frame_emb = tf.nn.xw_plus_b(flat_frames_dropped, frame_w, frame_b)
    frame_emb = tf.reshape(flat_frame_emb, [batch_size, n_steps, hidden_dim])
    video_tail_pad = tf.zeros([batch_size, n_steps - 1, hidden_dim])
    video_seq = tf.concat([frame_emb, video_tail_pad], 1)
    print("Video_input: {}".format(video_seq.get_shape()))

    # Run the video LSTM over all 2*n_steps-1 steps.
    with tf.variable_scope('LSTM_Video'):
        video_out, video_state = tf.nn.dynamic_rnn(video_cell, video_seq,
                                                   dtype=tf.float32)
    print("Video_output: {}".format(video_out.get_shape()))

    # Caption branch input: n_steps zero steps followed by the embeddings of
    # the first n_steps-1 caption words, concatenated feature-wise with the
    # video LSTM outputs.
    caption_head_pad = tf.zeros([batch_size, n_steps, hidden_dim])
    word_vectors = tf.nn.embedding_lookup(embeddings,
                                          caption_ph[:, 0:n_steps - 1])
    word_vectors_dropped = tf.nn.dropout(word_vectors, keep_prob=keep_prob_ph)
    padded_words = tf.concat([caption_head_pad, word_vectors_dropped], 1)
    caption_seq = tf.concat([padded_words, video_out], 2)
    print("Caption_input: {}".format(caption_seq.get_shape()))

    # Run the caption LSTM over all 2*n_steps-1 steps.
    with tf.variable_scope('LSTM_Caption'):
        caption_out, caption_state = tf.nn.dynamic_rnn(caption_cell, caption_seq,
                                                       dtype=tf.float32)
    print("Caption_output: {}".format(caption_out.get_shape()))

    # Only the last n_steps-1 outputs correspond to caption words; project
    # them to vocabulary logits.
    decoded_states = caption_out[:, n_steps:, :]
    flat_states = tf.reshape(decoded_states, [-1, hidden_dim])
    flat_states_dropped = tf.nn.dropout(flat_states, keep_prob=keep_prob_ph)
    logits = tf.nn.xw_plus_b(flat_states_dropped, vocab_w, vocab_b)

    # Targets are the caption shifted left by one word; the mask is shifted
    # the same way and used to average the loss over unmasked positions only.
    targets = tf.reshape(caption_ph[:, 1:], [-1])
    target_mask = tf.reshape(mask_ph[:, 1:], [-1])
    per_word_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=targets)
    masked_word_loss = per_word_loss * target_mask
    mean_loss = tf.reduce_sum(masked_word_loss) / tf.reduce_sum(target_mask)

    return video_ph, caption_ph, mask_ph, logits, mean_loss, keep_prob_ph
def main():
    """Interactively caption validation videos with a restored checkpoint.

    Builds the graph with build_model(), restores the trained weights, then
    loops: fetch one validation video, greedily decode a caption one word at
    a time (printing the partial caption after every step), print the
    ground-truth caption, optionally play the video, and ask whether to run
    another test.
    """
    # Checkpoint prefix to restore from; set to '' to run with freshly
    # initialised weights instead.
    ckpt_path = './S2VT_Dyn_10_0.0001_300_46000.ckpt'

    with tf.Graph().as_default():
        learning_rate = 0.00001
        video, caption, caption_mask, output_logits, loss, dropout_prob = build_model()
        # The optimizer is never stepped in this script.
        # NOTE(review): it appears to be built only so that the graph's
        # variable set matches the training checkpoint -- confirm whether it
        # can be dropped.
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
        saver = tf.train.Saver()

        with tf.Session() as sess:
            if ckpt_path:
                # Restore straight into the variables created by
                # build_model(). The previous code called
                # tf.train.import_meta_graph() here, which imports a second
                # copy of the graph into one that is already built; the
                # tensors fetched below belong to the locally built graph, so
                # the local Saver is the one that must do the restore.
                saver.restore(sess, ckpt_path)
                print("Restored model")
            else:
                sess.run(tf.global_variables_initializer())

            while True:
                vid, caption_GT, _, video_urls = fetch_data_batch_val(1)
                # Start from a caption holding only <BOS>; n_steps matches the
                # second dimension of the GT_Caption placeholder.
                caps, caps_mask = convert_caption(['<BOS>'], word2id, n_steps)

                # The logits cover n_steps-1 positions and caps has n_steps
                # slots, so position i+1 is only writable for i < n_steps-1.
                # Iterating up to n_steps (as before) indexed out of bounds
                # whenever no <EOS> was produced.
                for i in range(n_steps - 1):
                    o_l = sess.run(output_logits,
                                   feed_dict={video: vid,
                                              caption: caps,
                                              caption_mask: caps_mask,
                                              dropout_prob: 1.0})
                    out_logits = o_l.reshape([batch_size, n_steps - 1, vocab_size])
                    output_captions = np.argmax(out_logits, 2)
                    # Feed the most likely word at step i back in as the next
                    # input word.
                    next_word = output_captions[0][i]
                    caps[0][i + 1] = next_word
                    print_in_english(caps)
                    if id2word[next_word] == '<EOS>':
                        break

                print('............................\nGT Caption:\n')
                print_in_english(caption_GT)

                # raw_input: this script targets Python 2.
                play_video = raw_input('Should I play the video? ')
                if play_video.lower() == 'y':
                    playVideo(video_urls)
                test_again = raw_input('Want another test run? ')
                if test_again.lower() == 'n':
                    break


if __name__ == "__main__":
    main()