forked from YuanEZhou/show-attend-and-tell
Commit
Showing 18 changed files with 1,698 additions and 0 deletions.
Empty file.
Binary file not shown.
@@ -0,0 +1,71 @@
import cPickle as pickle
import os
import sys
sys.path.append('/mnt/zye/coco-caption')
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor


def score(ref, hypo):
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    return final_scores


def evaluate(data_path='/mnt/zye/show-attend-and-tell/data', split='test', get_scores=False):
    reference_path = os.path.join(data_path, "%s/%s.references.pkl" % (split, split))
    candidate_path = os.path.join(data_path, "%s/%s.candidate.captions.pkl" % (split, split))

    # load caption data
    with open(reference_path, 'rb') as f:
        ref = pickle.load(f)
    with open(candidate_path, 'rb') as f:
        cand = pickle.load(f)

    # build the hypothesis dictionary expected by the scorers
    hypo = {}
    for i, caption in enumerate(cand):
        hypo[i] = [caption]

    # compute evaluation metrics (BLEU, METEOR, ROUGE-L, CIDEr)
    final_scores = score(ref, hypo)

    # print out scores
    print 'Bleu_1:\t', final_scores['Bleu_1']
    print 'Bleu_2:\t', final_scores['Bleu_2']
    print 'Bleu_3:\t', final_scores['Bleu_3']
    print 'Bleu_4:\t', final_scores['Bleu_4']
    print 'METEOR:\t', final_scores['METEOR']
    print 'ROUGE_L:', final_scores['ROUGE_L']
    print 'CIDEr:\t', final_scores['CIDEr']

    if get_scores:
        return final_scores


if __name__ == '__main__':
    evaluate()
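For reference, a minimal sketch (not part of the commit) of the dictionary layout that score() above consumes: both arguments map an arbitrary example id to a list of plain caption strings, with matching keys on both sides. The ids and captions below are invented examples, and the metrics still require the coco-caption package on sys.path as above.

# Hedged usage sketch for score(); the ids and captions are invented examples.
ref = {
    0: ['a man riding a horse on the beach', 'a person rides a horse near the ocean'],
    1: ['two dogs are playing in the grass'],
}
hypo = {
    0: ['a man is riding a horse'],
    1: ['two dogs playing in a field'],
}
print(score(ref, hypo))  # e.g. {'Bleu_1': ..., 'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ...}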
Binary file not shown.
@@ -0,0 +1,230 @@
# =========================================================================================
# Implementation of "Show, Attend and Tell: Neural Caption Generator With Visual Attention".
# Notation used below:
# N is the batch size.
# L is the spatial size of the feature map (196).
# D is the dimension of each image feature vector (512).
# T is the number of time steps, which equals caption length - 1 (16).
# V is the vocabulary size (about 10000).
# M is the dimension of the word embedding (default is 512).
# H is the dimension of the LSTM hidden state (default is 1024).
# =========================================================================================

from __future__ import division

import tensorflow as tf
# import numpy as np
# from core.utils import load_coco_data
# from core.vggnet import Vgg19


class CaptionGenerator(object):
    def __init__(self, word_to_idx, dim_feature=[196, 512], dim_embed=512, dim_hidden=1024, n_time_step=16,
                 prev2out=True, ctx2out=True, alpha_c=0.0, selector=True, dropout=True):
        """
        Args:
            word_to_idx: word-to-index mapping dictionary.
            dim_feature: (optional) Dimension of vggnet19 conv5_3 feature vectors.
            dim_embed: (optional) Dimension of word embedding.
            dim_hidden: (optional) Dimension of all hidden states.
            n_time_step: (optional) Time step size of LSTM.
            prev2out: (optional) Feed the previously generated word into the output layer. (see Eq (7) for explanation)
            ctx2out: (optional) Feed the context vector into the output layer. (see Eq (7) for explanation)
            alpha_c: (optional) Doubly stochastic regularization coefficient. (see Section (4.2.1) for explanation)
            selector: (optional) Gating scalar for the context vector. (see Section (4.2.1) for explanation)
            dropout: (optional) If True, a dropout layer is added.
        """

        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.iteritems()}
        self.prev2out = prev2out
        self.ctx2out = ctx2out
        self.alpha_c = alpha_c
        self.selector = selector
        self.dropout = dropout
        self.V = len(word_to_idx)
        self.L = dim_feature[0]
        self.D = dim_feature[1]
        self.M = dim_embed
        self.H = dim_hidden
        self.T = n_time_step
        self._start = word_to_idx['<START>']
        self._null = word_to_idx['<NULL>']

        self.weight_initializer = tf.contrib.layers.xavier_initializer()
        self.const_initializer = tf.constant_initializer(0.0)
        self.emb_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)

        # Placeholders for features and captions
        self.features = tf.placeholder(tf.float32, [None, self.L, self.D])
        self.captions = tf.placeholder(tf.int32, [None, self.T + 1])

    def _get_initial_lstm(self, features):
        with tf.variable_scope('initial_lstm'):
            features_mean = tf.reduce_mean(features, 1)

            w_h = tf.get_variable('w_h', [self.D, self.H], initializer=self.weight_initializer)
            b_h = tf.get_variable('b_h', [self.H], initializer=self.const_initializer)
            h = tf.nn.tanh(tf.matmul(features_mean, w_h) + b_h)

            w_c = tf.get_variable('w_c', [self.D, self.H], initializer=self.weight_initializer)
            b_c = tf.get_variable('b_c', [self.H], initializer=self.const_initializer)
            c = tf.nn.tanh(tf.matmul(features_mean, w_c) + b_c)
            return c, h

    def _word_embedding(self, inputs, reuse=False):
        with tf.variable_scope('word_embedding', reuse=reuse):
            w = tf.get_variable('w', [self.V, self.M], initializer=self.emb_initializer)
            x = tf.nn.embedding_lookup(w, inputs, name='word_vector')  # (N, T, M) or (N, M)
            return x

    def _project_features(self, features):
        with tf.variable_scope('project_features'):
            w = tf.get_variable('w', [self.D, self.D], initializer=self.weight_initializer)
            features_flat = tf.reshape(features, [-1, self.D])
            features_proj = tf.matmul(features_flat, w)
            features_proj = tf.reshape(features_proj, [-1, self.L, self.D])
            return features_proj

    def _attention_layer(self, features, features_proj, h, reuse=False):
        with tf.variable_scope('attention_layer', reuse=reuse):
            w = tf.get_variable('w', [self.H, self.D], initializer=self.weight_initializer)
            b = tf.get_variable('b', [self.D], initializer=self.const_initializer)
            w_att = tf.get_variable('w_att', [self.D, 1], initializer=self.weight_initializer)

            h_att = tf.nn.relu(features_proj + tf.expand_dims(tf.matmul(h, w), 1) + b)  # (N, L, D)
            out_att = tf.reshape(tf.matmul(tf.reshape(h_att, [-1, self.D]), w_att), [-1, self.L])  # (N, L)
            alpha = tf.nn.softmax(out_att)
            context = tf.reduce_sum(features * tf.expand_dims(alpha, 2), 1, name='context')  # (N, D)
            return context, alpha

    def _selector(self, context, h, reuse=False):
        with tf.variable_scope('selector', reuse=reuse):
            w = tf.get_variable('w', [self.H, 1], initializer=self.weight_initializer)
            b = tf.get_variable('b', [1], initializer=self.const_initializer)
            beta = tf.nn.sigmoid(tf.matmul(h, w) + b, 'beta')  # (N, 1)
            context = tf.multiply(beta, context, name='selected_context')
            return context, beta

    def _decode_lstm(self, x, h, context, dropout=False, reuse=False):
        # This fork's change: weight tying. Reuse the (transposed) word-embedding matrix
        # as the output projection instead of a separate w_out variable.
        with tf.variable_scope('word_embedding', reuse=True):
            w = tf.get_variable('w', [self.V, self.M], initializer=self.emb_initializer)
            w_out = tf.transpose(w)
        #####################################
        with tf.variable_scope('logits', reuse=reuse):
            w_h = tf.get_variable('w_h', [self.H, self.M], initializer=self.weight_initializer)
            b_h = tf.get_variable('b_h', [self.M], initializer=self.const_initializer)
            # w_out = tf.get_variable('w_out', [self.M, self.V], initializer=self.weight_initializer)
            b_out = tf.get_variable('b_out', [self.V], initializer=self.const_initializer)

            if dropout:
                h = tf.nn.dropout(h, 0.5)
            h_logits = tf.matmul(h, w_h) + b_h

            if self.ctx2out:
                w_ctx2out = tf.get_variable('w_ctx2out', [self.D, self.M], initializer=self.weight_initializer)
                h_logits += tf.matmul(context, w_ctx2out)

            if self.prev2out:
                h_logits += x
            h_logits = tf.nn.tanh(h_logits)

            if dropout:
                h_logits = tf.nn.dropout(h_logits, 0.5)
            out_logits = tf.matmul(h_logits, w_out) + b_out
            return out_logits

    def _batch_norm(self, x, mode='train', name=None):
        return tf.contrib.layers.batch_norm(inputs=x,
                                            decay=0.95,
                                            center=True,
                                            scale=True,
                                            is_training=(mode == 'train'),
                                            updates_collections=None,
                                            scope=(name + 'batch_norm'))

    def build_model(self):
        features = self.features
        captions = self.captions
        batch_size = tf.shape(features)[0]

        captions_in = captions[:, :self.T]
        captions_out = captions[:, 1:]
        mask = tf.to_float(tf.not_equal(captions_out, self._null))

        # batch normalize feature vectors
        features = self._batch_norm(features, mode='train', name='conv_features')

        c, h = self._get_initial_lstm(features=features)
        x = self._word_embedding(inputs=captions_in)
        features_proj = self._project_features(features=features)

        loss = 0.0
        alpha_list = []
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.H)

        for t in range(self.T):
            context, alpha = self._attention_layer(features, features_proj, h, reuse=(t != 0))
            alpha_list.append(alpha)

            if self.selector:
                context, beta = self._selector(context, h, reuse=(t != 0))

            with tf.variable_scope('lstm', reuse=(t != 0)):
                _, (c, h) = lstm_cell(inputs=tf.concat([x[:, t, :], context], 1), state=[c, h])

            logits = self._decode_lstm(x[:, t, :], h, context, dropout=self.dropout, reuse=(t != 0))
            loss += tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=captions_out[:, t]) * mask[:, t])

        if self.alpha_c > 0:
            alphas = tf.transpose(tf.stack(alpha_list), (1, 0, 2)) * tf.expand_dims(mask, axis=2)  # (N, T, L)
            alphas_all = tf.reduce_sum(alphas, 1)  # (N, L)
            alpha_reg = self.alpha_c * tf.reduce_sum((16. / 196 - alphas_all) ** 2)
            loss += alpha_reg

        return loss / tf.to_float(batch_size)

    def build_sampler(self, max_len=20):
        features = self.features

        # batch normalize feature vectors
        features = self._batch_norm(features, mode='test', name='conv_features')

        c, h = self._get_initial_lstm(features=features)
        features_proj = self._project_features(features=features)

        sampled_word_list = []
        alpha_list = []
        beta_list = []
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.H)

        for t in range(max_len):
            if t == 0:
                x = self._word_embedding(inputs=tf.fill([tf.shape(features)[0]], self._start))
            else:
                x = self._word_embedding(inputs=sampled_word, reuse=True)

            context, alpha = self._attention_layer(features, features_proj, h, reuse=(t != 0))
            alpha_list.append(alpha)

            if self.selector:
                context, beta = self._selector(context, h, reuse=(t != 0))
                beta_list.append(beta)

            with tf.variable_scope('lstm', reuse=(t != 0)):
                _, (c, h) = lstm_cell(inputs=tf.concat([x, context], 1), state=[c, h])

            logits = self._decode_lstm(x, h, context, reuse=(t != 0))
            sampled_word = tf.argmax(logits, 1)
            sampled_word_list.append(sampled_word)

        alphas = tf.transpose(tf.stack(alpha_list), (1, 0, 2))  # (N, T, L)
        betas = tf.transpose(tf.squeeze(beta_list), (1, 0))  # (N, T)
        sampled_captions = tf.transpose(tf.stack(sampled_word_list), (1, 0))  # (N, max_len)
        return alphas, betas, sampled_captions
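For orientation, a minimal sketch (not part of this commit) of how the class above is typically wired up for training and greedy sampling under TF 1.x. The toy vocabulary, dummy inputs, and Adam optimizer are illustrative assumptions; the upstream repo drives this from its own solver script.

# Hedged sketch: build the training loss, reuse the same variables for the sampler,
# and run one training step plus one sampling pass on random dummy data.
import numpy as np
import tensorflow as tf

word_to_idx = {'<NULL>': 0, '<START>': 1, 'a': 2, 'man': 3}  # toy vocabulary (assumption)
model = CaptionGenerator(word_to_idx, dim_feature=[196, 512], dim_embed=512,
                         dim_hidden=1024, n_time_step=16, alpha_c=1.0)

loss = model.build_model()
# Re-enter the root scope with reuse so the sampler shares the variables just created
# (roughly the reuse pattern the upstream solver uses).
with tf.variable_scope(tf.get_variable_scope()):
    tf.get_variable_scope().reuse_variables()
    alphas, betas, sampled_captions = model.build_sampler(max_len=20)

train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

# Dummy batch: 4 images with (196, 512) conv5_3 features and 17-token captions.
feats = np.random.randn(4, 196, 512).astype(np.float32)
caps = np.random.randint(0, len(word_to_idx), size=(4, 17)).astype(np.int32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss_val = sess.run([train_op, loss], {model.features: feats, model.captions: caps})
    word_ids = sess.run(sampled_captions, {model.features: feats})  # (4, 20) sampled word ids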
Binary file not shown.
@@ -0,0 +1,75 @@
# Scratch script: quick TensorFlow experiments with LSTM cells, variable collections,
# FIFO queues, and variable-scope reuse (earlier experiments left commented out).
import tensorflow as tf
import numpy as np
tf.set_random_seed(1234)
# x=np.array([[[1,1,1,1],[2,2,2,2],[3,3,3,3]],
#             [[4,4,4,4],[5,5,5,5],[6,6,6,6]]],dtype=np.float32)  # 2*3*4
# context=np.array([[9,9,9,9],
#                   [8,8,8,8]],dtype=np.float32)  # 2*4
# c=tf.constant([[0,0,0,0,0],
#                [0,0,0,0,0]],dtype=tf.float32)
# h=tf.constant([[1,1,1,1,1],
#                [1,1,1,1,1]],dtype=tf.float32)
# lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=5)
# _, (c, h) = lstm_cell(inputs=tf.concat([x[:,1,:], context],1), state=[c, h])

# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     print tf.get_variable_scope().trainable_variables()
#     c, h = sess.run([c, h])
#     print h

#####################################################
# with tf.variable_scope('scope1'):
#     a=tf.get_variable('a',[1])
#     tf.get_variable_scope().reuse_variables()
#     a2 = tf.get_variable('a', [1])
#     assert a==a2
# tf.get_variable_scope().reuse_variables()
# with tf.variable_scope('scope1'):
#     a3 = tf.get_variable('a', [1])

# with tf.variable_scope('scope2'):
#     b=tf.get_variable('b',[2])
# a=tf.Variable(1,trainable=False,name='a',collections=[tf.contrib.framework.model_variable])
# b=tf.Variable(2,trainable=True,name='b')
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     print tf.GraphKeys.GLOBAL_VARIABLES
#     print tf.global_variables()
#     print tf.GraphKeys.TRAINABLE_VARIABLES
#     print tf.trainable_variables()
#     print tf.get_collection(tf.GraphKeys.MODEL_VARIABLES)
#
# dummy_input=tf.random_normal([3])
# dummy_input=tf.Print(dummy_input,data=[dummy_input],message='dummy inputs have been created:')
# q=tf.FIFOQueue(capacity=3,dtypes=tf.float32)
# enqueue_op=q.enqueue_many(dummy_input)
# date=q.dequeue()
# date=tf.Print(date,data=[q.size()],message='items are left in q:')
# fg=date+1
# with tf.Session() as sess:
#     sess.run(enqueue_op)
#     sess.run(fg)
#     sess.run(fg)
#     sess.run(fg)
#     sess.run(fg)
#     print 'here!'

# Active experiment: re-entering a scope with reuse=True returns the same variable.
with tf.variable_scope('scope1'):
    a = tf.get_variable('a', [1])
with tf.variable_scope('scope1', reuse=True):
    b = tf.get_variable('a', [1])
assert a == b
# a_t=tf.transpose(a)
# with tf.variable_scope('scope2'):
#     b=tf.Variable([[1,2]],name='b')
#     c=a_t+b

# with tf.variable_scope('scope2'):
#     b=tf.Variable([2],name='b')
# with tf.variable_scope('scope1'):
#     c=tf.Variable([1],name='a')
#
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print a
    print b
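The variable-sharing behaviour checked above is the same mechanism `_decode_lstm` in the model file relies on for its transposed-embedding output projection: re-entering the `word_embedding` scope with reuse=True hands back the very same embedding matrix. A minimal sketch under the same TF 1.x assumptions (the shapes are illustrative):

import tensorflow as tf

V, M = 10000, 512  # illustrative vocabulary and embedding sizes
with tf.variable_scope('word_embedding'):
    w = tf.get_variable('w', [V, M])          # embedding matrix, created once
with tf.variable_scope('word_embedding', reuse=True):
    w_shared = tf.get_variable('w', [V, M])   # same underlying variable, no new parameters
w_out = tf.transpose(w_shared)                # (M, V) output projection tied to the embedding
assert w is w_shared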