Skip to content


add file
Browse files Browse the repository at this point in the history
  • Loading branch information
YuanEZhou committed Sep 2, 2017
1 parent 50b6883 commit e0391b9
Show file tree
Hide file tree
Showing 18 changed files with 1,698 additions and 0 deletions.
Empty file added core/
Empty file.
Binary file added core/__init__.pyc
Binary file not shown.
71 changes: 71 additions & 0 deletions core/
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import cPickle as pickle
import os
import sys
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor

def score(ref, hypo):
    """Run every caption metric on the candidates and collect the results.

    Args:
        ref: reference captions; handed unchanged to each scorer's
            ``compute_score``.
        hypo: candidate captions; handed unchanged to each scorer's
            ``compute_score``.

    Returns:
        dict mapping a metric name ('Bleu_1'..'Bleu_4', 'METEOR', 'ROUGE_L',
        'CIDEr') to the first value returned by the matching scorer.
    """
    # Each entry pairs a scorer with the name(s) of the value(s) it reports.
    # NOTE(review): the entries of this list were missing from this copy of the
    # file; they are rebuilt from the scorer imports and from the keys that
    # evaluate() prints -- confirm the Bleu order (4) against the original.
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
    ]

    final_scores = {}
    for scorer, method in scorers:
        # compute_score returns a pair; only its first element is kept
        # (presumably the corpus-level score -- verify against pycocoevalcap).
        value, _ = scorer.compute_score(ref, hypo)
        if isinstance(value, list):
            # One value per metric name (the Bleu_1..Bleu_4 case).
            final_scores.update(zip(method, value))
        else:
            # Single-valued metric: `method` is a plain string key here.
            final_scores[method] = value

    return final_scores

def evaluate(data_path='/mnt/zye/show-attend-and-tell/data', split='test', get_scores=False):
reference_path = os.path.join(data_path, "%s/%s.references.pkl" %(split, split))
candidate_path = os.path.join(data_path, "%s/%s.candidate.captions.pkl" %(split, split))

# load caption data
with open(reference_path, 'rb') as f:
ref = pickle.load(f)
with open(candidate_path, 'rb') as f:
cand = pickle.load(f)

# make dictionary
hypo = {}
for i, caption in enumerate(cand):
hypo[i] = [caption]

# compute bleu score
final_scores = score(ref, hypo)

# print out scores
print 'Bleu_1:\t',final_scores['Bleu_1']
print 'Bleu_2:\t',final_scores['Bleu_2']
print 'Bleu_3:\t',final_scores['Bleu_3']
print 'Bleu_4:\t',final_scores['Bleu_4']
print 'METEOR:\t',final_scores['METEOR']
print 'ROUGE_L:',final_scores['ROUGE_L']
print 'CIDEr:\t',final_scores['CIDEr']

if get_scores:
return final_scores

if __name__ == '__main__':
    # NOTE(review): the guarded body is absent from this copy of the file.
    # Running the evaluation with its defaults is presumed to be the intent
    # -- confirm against the author's original.
    evaluate()
Binary file added core/bleu.pyc
Binary file not shown.
230 changes: 230 additions & 0 deletions core/
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
# =========================================================================================
# Implementation of "Show, Attend and Tell: Neural Caption Generator With Visual Attention".
# There are some notations.
# N is batch size.
# L is spatial size of feature vector (196).
# D is dimension of image feature vector (512).
# T is the number of time steps, which is equal to caption's length-1 (16).
# V is vocabulary size (about 10000).
# M is dimension of word vector which is embedding size (default is 512).
# H is dimension of hidden state (default is 1024).
# =========================================================================================

from __future__ import division

import tensorflow as tf
# import numpy as np
# from core.utils import load_coco_data
# from core.vggnet import Vgg19

class CaptionGenerator(object):
    """Attention-based caption generator ("Show, Attend and Tell").

    The constructor only stores hyper-parameters and creates the two input
    placeholders. ``build_model`` adds the training loss to the graph and
    ``build_sampler`` adds a greedy (argmax) decoder.
    """

    def __init__(self, word_to_idx, dim_feature=(196, 512), dim_embed=512, dim_hidden=1024, n_time_step=16,
                 prev2out=True, ctx2out=True, alpha_c=0.0, selector=True, dropout=True):
        """
        Args:
            word_to_idx: word-to-index mapping dictionary.
            dim_feature: (optional) Dimension of vggnet19 conv5_3 feature vectors.
            dim_embed: (optional) Dimension of word embedding.
            dim_hidden: (optional) Dimension of all hidden state.
            n_time_step: (optional) Time step size of LSTM.
            prev2out: (optional) previously generated word to hidden state. (see Eq (7) for explanation)
            ctx2out: (optional) context to hidden state (see Eq (7) for explanation)
            alpha_c: (optional) Doubly stochastic regularization coefficient. (see Section (4.2.1) for explanation)
            selector: (optional) gating scalar for context vector. (see Section (4.2.1) for explanation)
            dropout: (optional) If true then dropout layer is added.
        """
        self.word_to_idx = word_to_idx
        # items() instead of iteritems(): same mapping, not tied to Python 2.
        self.idx_to_word = {i: w for w, i in word_to_idx.items()}
        self.prev2out = prev2out
        self.ctx2out = ctx2out
        self.alpha_c = alpha_c
        self.selector = selector
        self.dropout = dropout
        self.V = len(word_to_idx)
        self.L = dim_feature[0]
        self.D = dim_feature[1]
        self.M = dim_embed
        self.H = dim_hidden
        self.T = n_time_step
        self._start = word_to_idx['<START>']
        self._null = word_to_idx['<NULL>']

        self.weight_initializer = tf.contrib.layers.xavier_initializer()
        self.const_initializer = tf.constant_initializer(0.0)
        self.emb_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)

        # Place holder for features and captions
        self.features = tf.placeholder(tf.float32, [None, self.L, self.D])
        self.captions = tf.placeholder(tf.int32, [None, self.T + 1])

    def _get_initial_lstm(self, features):
        """Return the initial LSTM state (c, h), each (N, H), from the mean feature vector."""
        with tf.variable_scope('initial_lstm'):
            features_mean = tf.reduce_mean(features, 1)  # (N, D)

            w_h = tf.get_variable('w_h', [self.D, self.H], initializer=self.weight_initializer)
            b_h = tf.get_variable('b_h', [self.H], initializer=self.const_initializer)
            h = tf.nn.tanh(tf.matmul(features_mean, w_h) + b_h)

            w_c = tf.get_variable('w_c', [self.D, self.H], initializer=self.weight_initializer)
            b_c = tf.get_variable('b_c', [self.H], initializer=self.const_initializer)
            c = tf.nn.tanh(tf.matmul(features_mean, w_c) + b_c)
            return c, h

    def _word_embedding(self, inputs, reuse=False):
        """Look up the embedding rows of the word indices in `inputs`."""
        with tf.variable_scope('word_embedding', reuse=reuse):
            w = tf.get_variable('w', [self.V, self.M], initializer=self.emb_initializer)
            x = tf.nn.embedding_lookup(w, inputs, name='word_vector')  # (N, T, M) or (N, M)
            return x

    def _project_features(self, features):
        """Apply one shared (D, D) linear map to every feature vector; returns (N, L, D)."""
        with tf.variable_scope('project_features'):
            w = tf.get_variable('w', [self.D, self.D], initializer=self.weight_initializer)
            features_flat = tf.reshape(features, [-1, self.D])
            features_proj = tf.matmul(features_flat, w)
            features_proj = tf.reshape(features_proj, [-1, self.L, self.D])
            return features_proj

    def _attention_layer(self, features, features_proj, h, reuse=False):
        """Compute softmax attention weights over the L locations and the weighted context.

        Returns:
            (context, alpha) with shapes (N, D) and (N, L).
        """
        with tf.variable_scope('attention_layer', reuse=reuse):
            w = tf.get_variable('w', [self.H, self.D], initializer=self.weight_initializer)
            b = tf.get_variable('b', [self.D], initializer=self.const_initializer)
            w_att = tf.get_variable('w_att', [self.D, 1], initializer=self.weight_initializer)

            h_att = tf.nn.relu(features_proj + tf.expand_dims(tf.matmul(h, w), 1) + b)  # (N, L, D)
            out_att = tf.reshape(tf.matmul(tf.reshape(h_att, [-1, self.D]), w_att), [-1, self.L])  # (N, L)
            alpha = tf.nn.softmax(out_att)
            context = tf.reduce_sum(features * tf.expand_dims(alpha, 2), 1, name='context')  # (N, D)
            return context, alpha

    def _selector(self, context, h, reuse=False):
        """Scale the context by a sigmoid gate computed from `h`; returns (context, beta)."""
        with tf.variable_scope('selector', reuse=reuse):
            w = tf.get_variable('w', [self.H, 1], initializer=self.weight_initializer)
            b = tf.get_variable('b', [1], initializer=self.const_initializer)
            beta = tf.nn.sigmoid(tf.matmul(h, w) + b, 'beta')  # (N, 1)
            context = tf.multiply(beta, context, name='selected_context')
            return context, beta

    def _decode_lstm(self, x, h, context, dropout=False, reuse=False):
        """Map the LSTM output (plus optional context / previous word) to vocabulary logits (N, V)."""
        # This is my change (Transpose shared): the output projection reuses the
        # word-embedding matrix instead of owning a separate w_out variable, so
        # the 'word_embedding' scope must already contain 'w' when this runs.
        with tf.variable_scope('word_embedding', reuse=True):
            w = tf.get_variable('w', [self.V, self.M], initializer=self.emb_initializer)
        with tf.variable_scope('logits', reuse=reuse):
            w_h = tf.get_variable('w_h', [self.H, self.M], initializer=self.weight_initializer)
            b_h = tf.get_variable('b_h', [self.M], initializer=self.const_initializer)
            # w_out = tf.get_variable('w_out', [self.M, self.V], initializer=self.weight_initializer)
            # NOTE(review): the definition of w_out was missing from this copy of
            # the file; the transpose gives the (M, V) shape of the commented-out
            # variable above and matches the "Transpose shared" note -- confirm.
            w_out = tf.transpose(w)
            b_out = tf.get_variable('b_out', [self.V], initializer=self.const_initializer)

            if dropout:
                h = tf.nn.dropout(h, 0.5)
            h_logits = tf.matmul(h, w_h) + b_h

            if self.ctx2out:
                w_ctx2out = tf.get_variable('w_ctx2out', [self.D, self.M], initializer=self.weight_initializer)
                h_logits += tf.matmul(context, w_ctx2out)

            if self.prev2out:
                h_logits += x
            h_logits = tf.nn.tanh(h_logits)

            if dropout:
                h_logits = tf.nn.dropout(h_logits, 0.5)
            out_logits = tf.matmul(h_logits, w_out) + b_out
            return out_logits

    def _batch_norm(self, x, mode='train', name=None):
        """Batch-normalize `x`; `mode` selects training or inference behaviour."""
        # NOTE(review): every keyword after `inputs` was missing from this copy
        # of the file and is filled in from the upstream show-attend-and-tell
        # implementation -- confirm against the author's original.
        scope = None if name is None else name + 'batch_norm'
        return tf.contrib.layers.batch_norm(inputs=x,
                                            decay=0.95,
                                            center=True,
                                            scale=True,
                                            is_training=(mode == 'train'),
                                            updates_collections=None,
                                            scope=scope)

    def build_model(self):
        """Add the training graph and return the masked cross-entropy loss divided by the batch size."""
        features = self.features
        captions = self.captions
        batch_size = tf.shape(features)[0]

        # Inputs are the first T tokens, targets the same sequence shifted by one.
        captions_in = captions[:, :self.T]
        captions_out = captions[:, 1:]
        # 1.0 where the target is a real word, 0.0 where it is <NULL>.
        mask = tf.to_float(tf.not_equal(captions_out, self._null))

        # batch normalize feature vectors
        features = self._batch_norm(features, mode='train', name='conv_features')

        c, h = self._get_initial_lstm(features=features)
        x = self._word_embedding(inputs=captions_in)
        features_proj = self._project_features(features=features)

        loss = 0.0
        alpha_list = []
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.H)

        for t in range(self.T):
            context, alpha = self._attention_layer(features, features_proj, h, reuse=(t != 0))
            # Kept for the attention regularizer below (was never recorded before,
            # so tf.stack received an empty list whenever alpha_c > 0).
            alpha_list.append(alpha)

            if self.selector:
                context, beta = self._selector(context, h, reuse=(t != 0))

            with tf.variable_scope('lstm', reuse=(t != 0)):
                _, (c, h) = lstm_cell(inputs=tf.concat([x[:, t, :], context], 1), state=[c, h])

            logits = self._decode_lstm(x[:, t, :], h, context, dropout=self.dropout, reuse=(t != 0))
            loss += tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=captions_out[:, t]) * mask[:, t])

        if self.alpha_c > 0:
            alphas = tf.transpose(tf.stack(alpha_list), (1, 0, 2)) * tf.expand_dims(mask, axis=2)  # (N, T, L)
            alphas_all = tf.reduce_sum(alphas, 1)  # (N, L)
            # Target attention mass per location is T / L (16/196 with the defaults).
            alpha_reg = self.alpha_c * tf.reduce_sum((float(self.T) / self.L - alphas_all) ** 2)
            loss += alpha_reg

        return loss / tf.to_float(batch_size)

    def build_sampler(self, max_len=20):
        """Add a greedy decoder to the graph.

        Args:
            max_len: number of words generated per caption.

        Returns:
            (alphas, betas, sampled_captions) with shapes (N, max_len, L),
            (N, max_len) and (N, max_len).
        """
        features = self.features

        # batch normalize feature vectors
        features = self._batch_norm(features, mode='test', name='conv_features')

        c, h = self._get_initial_lstm(features=features)
        features_proj = self._project_features(features=features)

        sampled_word_list = []
        alpha_list = []
        beta_list = []
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.H)

        for t in range(max_len):
            if t == 0:
                # First step: feed <START> for every item of the batch.
                x = self._word_embedding(inputs=tf.fill([tf.shape(features)[0]], self._start))
            else:
                # Later steps: feed the word chosen at the previous step.
                x = self._word_embedding(inputs=sampled_word, reuse=True)

            context, alpha = self._attention_layer(features, features_proj, h, reuse=(t != 0))
            alpha_list.append(alpha)

            if self.selector:
                context, beta = self._selector(context, h, reuse=(t != 0))
                beta_list.append(beta)

            with tf.variable_scope('lstm', reuse=(t != 0)):
                _, (c, h) = lstm_cell(inputs=tf.concat([x, context], 1), state=[c, h])

            logits = self._decode_lstm(x, h, context, reuse=(t != 0))
            sampled_word = tf.argmax(logits, 1)
            sampled_word_list.append(sampled_word)

        alphas = tf.transpose(tf.stack(alpha_list), (1, 0, 2))  # (N, T, L)
        # Squeeze only the trailing size-1 axis so a batch of one keeps its batch axis.
        # NOTE(review): beta_list is empty when selector is False, so this line
        # cannot build in that configuration -- decide what betas should be then.
        betas = tf.transpose(tf.squeeze(tf.stack(beta_list), axis=[2]), (1, 0))  # (N, T)
        sampled_captions = tf.transpose(tf.stack(sampled_word_list), (1, 0))  # (N, max_len)
        return alphas, betas, sampled_captions

Binary file added core/model.pyc
Binary file not shown.
75 changes: 75 additions & 0 deletions core/
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import tensorflow as tf
import numpy as np
# x=np.array([[[1,1,1,1],[2,2,2,2],[3,3,3,3]],
# [[4,4,4,4],[5,5,5,5],[6,6,6,6]]],dtype=np.float32)#2*3*4
# context=np.array([[9,9,9,9],
# [8,8,8,8]],dtype=np.float32)#2*4
# c=tf.constant([[0,0,0,0,0],
# [0,0,0,0,0]],dtype=tf.float32)
# h=tf.constant([[1,1,1,1,1],
# [1,1,1,1,1]],dtype=tf.float32)
# lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=5)
# _, (c, h) = lstm_cell(inputs=tf.concat([x[:,1,:], context],1), state=[c, h])

# with tf.Session() as sess:
# print tf.get_variable_scope().trainable_variables()
# c ,[c,h])
# print h

# with tf.variable_scope('scope1'):
# a=tf.get_variable('a',[1])
# tf.get_variable_scope().reuse_variables()
# a2 = tf.get_variable('a', [1])
# assert a==a2
# tf.get_variable_scope().reuse_variables()
# with tf.variable_scope('scope1'):
# a3 = tf.get_variable('a', [1])

# with tf.variable_scope('scope2'):
# b=tf.get_variable('b',[2])
# a=tf.Variable(1,trainable=False,name='a',collections=[tf.contrib.framework.model_variable])
# b=tf.Variable(2,trainable=True,name='b')
# with tf.Session() as sess:
# print tf.GraphKeys.GLOBAL_VARIABLES
# print tf.global_variables()
# print tf.GraphKeys.TRAINABLE_VARIABLES
# print tf.trainable_variables()
# print tf.get_collection(tf.GraphKeys.MODEL_VARIABLES)
# dummy_input=tf.random_normal([3])
# dummy_input=tf.Print(dummy_input,data=[dummy_input],message='dummy inputs have been created:')
# q=tf.FIFOQueue(capacity=3,dtypes=tf.float32)
# enqueue_op=q.enqueue_many(dummy_input)
# date=q.dequeue()
# date=tf.Print(date,data=[q.size()],message='items are left in q:')
# fg=date+1
# with tf.Session() as sess:
# print 'here!'
# Scratch check: get_variable under reuse=True must hand back the variable
# that was created earlier under the same scope and name.
with tf.variable_scope('scope1'):
    # NOTE(review): this creation line was missing from this copy of the file;
    # it is rebuilt from the reuse lookup and the assert below -- confirm.
    a = tf.get_variable('a', [1])
with tf.variable_scope('scope1', reuse=True):
    b = tf.get_variable('a', [1])
# Identity, not value equality, is what reuse promises.
assert a is b
# a_t=tf.transpose(a)
# with tf.variable_scope('scope2'):
# b=tf.Variable([[1,2]],name='b')
# c=a_t+b

# with tf.variable_scope('scope2'):
# b=tf.Variable([2],name='b')
# with tf.variable_scope('scope1'):
# c=tf.Variable([1],name='a')
with tf.Session() as sess:
    # Single-argument print calls behave the same on Python 2 and 3.
    print(a)
    print(b)

0 comments on commit e0391b9

Please sign in to comment.