forked from YuanEZhou/show-attend-and-tell
Commit
Showing 18 changed files with 1,698 additions and 0 deletions.
Empty file.
Binary file not shown.
@@ -0,0 +1,71 @@
import cPickle as pickle
import os
import sys
sys.path.append('/mnt/zye/coco-caption')
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor


def score(ref, hypo):
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    return final_scores


def evaluate(data_path='/mnt/zye/show-attend-and-tell/data', split='test', get_scores=False):
    reference_path = os.path.join(data_path, "%s/%s.references.pkl" % (split, split))
    candidate_path = os.path.join(data_path, "%s/%s.candidate.captions.pkl" % (split, split))

    # load caption data
    with open(reference_path, 'rb') as f:
        ref = pickle.load(f)
    with open(candidate_path, 'rb') as f:
        cand = pickle.load(f)

    # build the hypothesis dictionary expected by the scorers
    hypo = {}
    for i, caption in enumerate(cand):
        hypo[i] = [caption]

    # compute evaluation metrics (BLEU, METEOR, ROUGE-L, CIDEr)
    final_scores = score(ref, hypo)

    # print out scores
    print 'Bleu_1:\t', final_scores['Bleu_1']
    print 'Bleu_2:\t', final_scores['Bleu_2']
    print 'Bleu_3:\t', final_scores['Bleu_3']
    print 'Bleu_4:\t', final_scores['Bleu_4']
    print 'METEOR:\t', final_scores['METEOR']
    print 'ROUGE_L:', final_scores['ROUGE_L']
    print 'CIDEr:\t', final_scores['CIDEr']

    if get_scores:
        return final_scores


if __name__ == '__main__':
    evaluate()
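For reference, a minimal sketch (not part of the commit) of the dictionary layout that score() above consumes: both arguments map an arbitrary example id to a list of plain caption strings, with matching keys on both sides. The ids and captions below are invented examples, and the metrics still require the coco-caption package on sys.path as above.

# Hedged usage sketch for score(); the ids and captions are invented examples.
ref = {
    0: ['a man riding a horse on the beach', 'a person rides a horse near the ocean'],
    1: ['two dogs are playing in the grass'],
}
hypo = {
    0: ['a man is riding a horse'],
    1: ['two dogs playing in a field'],
}
print(score(ref, hypo))  # e.g. {'Bleu_1': ..., 'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ...}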
Binary file not shown.
@@ -0,0 +1,230 @@
# =========================================================================================
# Implementation of "Show, Attend and Tell: Neural Caption Generator With Visual Attention".
# Notation used below:
# N is the batch size.
# L is the spatial size of the feature map (196).
# D is the dimension of each image feature vector (512).
# T is the number of time steps, which equals caption length - 1 (16).
# V is the vocabulary size (about 10000).
# M is the dimension of the word embedding (default is 512).
# H is the dimension of the LSTM hidden state (default is 1024).
# =========================================================================================

from __future__ import division

import tensorflow as tf
# import numpy as np
# from core.utils import load_coco_data
# from core.vggnet import Vgg19


class CaptionGenerator(object):
    def __init__(self, word_to_idx, dim_feature=[196, 512], dim_embed=512, dim_hidden=1024, n_time_step=16,
                 prev2out=True, ctx2out=True, alpha_c=0.0, selector=True, dropout=True):
        """
        Args:
            word_to_idx: word-to-index mapping dictionary.
            dim_feature: (optional) Dimension of vggnet19 conv5_3 feature vectors.
            dim_embed: (optional) Dimension of word embedding.
            dim_hidden: (optional) Dimension of all hidden states.
            n_time_step: (optional) Time step size of LSTM.
            prev2out: (optional) Feed the previously generated word into the output layer. (see Eq (7) for explanation)
            ctx2out: (optional) Feed the context vector into the output layer. (see Eq (7) for explanation)
            alpha_c: (optional) Doubly stochastic regularization coefficient. (see Section (4.2.1) for explanation)
            selector: (optional) Gating scalar for the context vector. (see Section (4.2.1) for explanation)
            dropout: (optional) If True, a dropout layer is added.
        """

        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.iteritems()}
        self.prev2out = prev2out
        self.ctx2out = ctx2out
        self.alpha_c = alpha_c
        self.selector = selector
        self.dropout = dropout
        self.V = len(word_to_idx)
        self.L = dim_feature[0]
        self.D = dim_feature[1]
        self.M = dim_embed
        self.H = dim_hidden
        self.T = n_time_step
        self._start = word_to_idx['<START>']
        self._null = word_to_idx['<NULL>']

        self.weight_initializer = tf.contrib.layers.xavier_initializer()
        self.const_initializer = tf.constant_initializer(0.0)
        self.emb_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)

        # Placeholders for features and captions
        self.features = tf.placeholder(tf.float32, [None, self.L, self.D])
        self.captions = tf.placeholder(tf.int32, [None, self.T + 1])

    def _get_initial_lstm(self, features):
        with tf.variable_scope('initial_lstm'):
            features_mean = tf.reduce_mean(features, 1)

            w_h = tf.get_variable('w_h', [self.D, self.H], initializer=self.weight_initializer)
            b_h = tf.get_variable('b_h', [self.H], initializer=self.const_initializer)
            h = tf.nn.tanh(tf.matmul(features_mean, w_h) + b_h)

            w_c = tf.get_variable('w_c', [self.D, self.H], initializer=self.weight_initializer)
            b_c = tf.get_variable('b_c', [self.H], initializer=self.const_initializer)
            c = tf.nn.tanh(tf.matmul(features_mean, w_c) + b_c)
            return c, h

    def _word_embedding(self, inputs, reuse=False):
        with tf.variable_scope('word_embedding', reuse=reuse):
            w = tf.get_variable('w', [self.V, self.M], initializer=self.emb_initializer)
            x = tf.nn.embedding_lookup(w, inputs, name='word_vector')  # (N, T, M) or (N, M)
            return x

    def _project_features(self, features):
        with tf.variable_scope('project_features'):
            w = tf.get_variable('w', [self.D, self.D], initializer=self.weight_initializer)
            features_flat = tf.reshape(features, [-1, self.D])
            features_proj = tf.matmul(features_flat, w)
            features_proj = tf.reshape(features_proj, [-1, self.L, self.D])
            return features_proj

    def _attention_layer(self, features, features_proj, h, reuse=False):
        with tf.variable_scope('attention_layer', reuse=reuse):
            w = tf.get_variable('w', [self.H, self.D], initializer=self.weight_initializer)
            b = tf.get_variable('b', [self.D], initializer=self.const_initializer)
            w_att = tf.get_variable('w_att', [self.D, 1], initializer=self.weight_initializer)

            h_att = tf.nn.relu(features_proj + tf.expand_dims(tf.matmul(h, w), 1) + b)  # (N, L, D)
            out_att = tf.reshape(tf.matmul(tf.reshape(h_att, [-1, self.D]), w_att), [-1, self.L])  # (N, L)
            alpha = tf.nn.softmax(out_att)
            context = tf.reduce_sum(features * tf.expand_dims(alpha, 2), 1, name='context')  # (N, D)
            return context, alpha

    def _selector(self, context, h, reuse=False):
        with tf.variable_scope('selector', reuse=reuse):
            w = tf.get_variable('w', [self.H, 1], initializer=self.weight_initializer)
            b = tf.get_variable('b', [1], initializer=self.const_initializer)
            beta = tf.nn.sigmoid(tf.matmul(h, w) + b, 'beta')  # (N, 1)
            context = tf.multiply(beta, context, name='selected_context')
            return context, beta

    def _decode_lstm(self, x, h, context, dropout=False, reuse=False):
        # This fork's change: weight tying. Reuse the (transposed) word-embedding matrix
        # as the output projection instead of a separate w_out variable.
        with tf.variable_scope('word_embedding', reuse=True):
            w = tf.get_variable('w', [self.V, self.M], initializer=self.emb_initializer)
            w_out = tf.transpose(w)
        #####################################
        with tf.variable_scope('logits', reuse=reuse):
            w_h = tf.get_variable('w_h', [self.H, self.M], initializer=self.weight_initializer)
            b_h = tf.get_variable('b_h', [self.M], initializer=self.const_initializer)
            # w_out = tf.get_variable('w_out', [self.M, self.V], initializer=self.weight_initializer)
            b_out = tf.get_variable('b_out', [self.V], initializer=self.const_initializer)

            if dropout:
                h = tf.nn.dropout(h, 0.5)
            h_logits = tf.matmul(h, w_h) + b_h

            if self.ctx2out:
                w_ctx2out = tf.get_variable('w_ctx2out', [self.D, self.M], initializer=self.weight_initializer)
                h_logits += tf.matmul(context, w_ctx2out)

            if self.prev2out:
                h_logits += x
            h_logits = tf.nn.tanh(h_logits)

            if dropout:
                h_logits = tf.nn.dropout(h_logits, 0.5)
            out_logits = tf.matmul(h_logits, w_out) + b_out
            return out_logits

    def _batch_norm(self, x, mode='train', name=None):
        return tf.contrib.layers.batch_norm(inputs=x,
                                            decay=0.95,
                                            center=True,
                                            scale=True,
                                            is_training=(mode == 'train'),
                                            updates_collections=None,
                                            scope=(name + 'batch_norm'))

    def build_model(self):
        features = self.features
        captions = self.captions
        batch_size = tf.shape(features)[0]

        captions_in = captions[:, :self.T]
        captions_out = captions[:, 1:]
        mask = tf.to_float(tf.not_equal(captions_out, self._null))

        # batch normalize feature vectors
        features = self._batch_norm(features, mode='train', name='conv_features')

        c, h = self._get_initial_lstm(features=features)
        x = self._word_embedding(inputs=captions_in)
        features_proj = self._project_features(features=features)

        loss = 0.0
        alpha_list = []
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.H)

        for t in range(self.T):
            context, alpha = self._attention_layer(features, features_proj, h, reuse=(t != 0))
            alpha_list.append(alpha)

            if self.selector:
                context, beta = self._selector(context, h, reuse=(t != 0))

            with tf.variable_scope('lstm', reuse=(t != 0)):
                _, (c, h) = lstm_cell(inputs=tf.concat([x[:, t, :], context], 1), state=[c, h])

            logits = self._decode_lstm(x[:, t, :], h, context, dropout=self.dropout, reuse=(t != 0))
            loss += tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=captions_out[:, t]) * mask[:, t])

        if self.alpha_c > 0:
            alphas = tf.transpose(tf.stack(alpha_list), (1, 0, 2)) * tf.expand_dims(mask, axis=2)  # (N, T, L)
            alphas_all = tf.reduce_sum(alphas, 1)  # (N, L)
            alpha_reg = self.alpha_c * tf.reduce_sum((16. / 196 - alphas_all) ** 2)
            loss += alpha_reg

        return loss / tf.to_float(batch_size)

    def build_sampler(self, max_len=20):
        features = self.features

        # batch normalize feature vectors
        features = self._batch_norm(features, mode='test', name='conv_features')

        c, h = self._get_initial_lstm(features=features)
        features_proj = self._project_features(features=features)

        sampled_word_list = []
        alpha_list = []
        beta_list = []
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.H)

        for t in range(max_len):
            if t == 0:
                x = self._word_embedding(inputs=tf.fill([tf.shape(features)[0]], self._start))
            else:
                x = self._word_embedding(inputs=sampled_word, reuse=True)

            context, alpha = self._attention_layer(features, features_proj, h, reuse=(t != 0))
            alpha_list.append(alpha)

            if self.selector:
                context, beta = self._selector(context, h, reuse=(t != 0))
                beta_list.append(beta)

            with tf.variable_scope('lstm', reuse=(t != 0)):
                _, (c, h) = lstm_cell(inputs=tf.concat([x, context], 1), state=[c, h])

            logits = self._decode_lstm(x, h, context, reuse=(t != 0))
            sampled_word = tf.argmax(logits, 1)
            sampled_word_list.append(sampled_word)

        alphas = tf.transpose(tf.stack(alpha_list), (1, 0, 2))  # (N, T, L)
        betas = tf.transpose(tf.squeeze(beta_list), (1, 0))  # (N, T)
        sampled_captions = tf.transpose(tf.stack(sampled_word_list), (1, 0))  # (N, max_len)
        return alphas, betas, sampled_captions
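For orientation, a minimal sketch (not part of this commit) of how the class above is typically wired up for training and greedy sampling under TF 1.x. The toy vocabulary, dummy inputs, and Adam optimizer are illustrative assumptions; the upstream repo drives this from its own solver script.

# Hedged sketch: build the training loss, reuse the same variables for the sampler,
# and run one training step plus one sampling pass on random dummy data.
import numpy as np
import tensorflow as tf

word_to_idx = {'<NULL>': 0, '<START>': 1, 'a': 2, 'man': 3}  # toy vocabulary (assumption)
model = CaptionGenerator(word_to_idx, dim_feature=[196, 512], dim_embed=512,
                         dim_hidden=1024, n_time_step=16, alpha_c=1.0)

loss = model.build_model()
# Re-enter the root scope with reuse so the sampler shares the variables just created
# (roughly the reuse pattern the upstream solver uses).
with tf.variable_scope(tf.get_variable_scope()):
    tf.get_variable_scope().reuse_variables()
    alphas, betas, sampled_captions = model.build_sampler(max_len=20)

train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

# Dummy batch: 4 images with (196, 512) conv5_3 features and 17-token captions.
feats = np.random.randn(4, 196, 512).astype(np.float32)
caps = np.random.randint(0, len(word_to_idx), size=(4, 17)).astype(np.int32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss_val = sess.run([train_op, loss], {model.features: feats, model.captions: caps})
    word_ids = sess.run(sampled_captions, {model.features: feats})  # (4, 20) sampled word ids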
Binary file not shown.
@@ -0,0 +1,75 @@
# Scratch script: quick TensorFlow experiments with LSTM cells, variable collections,
# FIFO queues, and variable-scope reuse (earlier experiments left commented out).
import tensorflow as tf
import numpy as np
tf.set_random_seed(1234)
# x=np.array([[[1,1,1,1],[2,2,2,2],[3,3,3,3]],
#             [[4,4,4,4],[5,5,5,5],[6,6,6,6]]],dtype=np.float32)  # 2*3*4
# context=np.array([[9,9,9,9],
#                   [8,8,8,8]],dtype=np.float32)  # 2*4
# c=tf.constant([[0,0,0,0,0],
#                [0,0,0,0,0]],dtype=tf.float32)
# h=tf.constant([[1,1,1,1,1],
#                [1,1,1,1,1]],dtype=tf.float32)
# lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=5)
# _, (c, h) = lstm_cell(inputs=tf.concat([x[:,1,:], context],1), state=[c, h])

# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     print tf.get_variable_scope().trainable_variables()
#     c, h = sess.run([c, h])
#     print h

#####################################################
# with tf.variable_scope('scope1'):
#     a=tf.get_variable('a',[1])
#     tf.get_variable_scope().reuse_variables()
#     a2 = tf.get_variable('a', [1])
#     assert a==a2
# tf.get_variable_scope().reuse_variables()
# with tf.variable_scope('scope1'):
#     a3 = tf.get_variable('a', [1])

# with tf.variable_scope('scope2'):
#     b=tf.get_variable('b',[2])
# a=tf.Variable(1,trainable=False,name='a',collections=[tf.contrib.framework.model_variable])
# b=tf.Variable(2,trainable=True,name='b')
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     print tf.GraphKeys.GLOBAL_VARIABLES
#     print tf.global_variables()
#     print tf.GraphKeys.TRAINABLE_VARIABLES
#     print tf.trainable_variables()
#     print tf.get_collection(tf.GraphKeys.MODEL_VARIABLES)
#
# dummy_input=tf.random_normal([3])
# dummy_input=tf.Print(dummy_input,data=[dummy_input],message='dummy inputs have been created:')
# q=tf.FIFOQueue(capacity=3,dtypes=tf.float32)
# enqueue_op=q.enqueue_many(dummy_input)
# date=q.dequeue()
# date=tf.Print(date,data=[q.size()],message='items are left in q:')
# fg=date+1
# with tf.Session() as sess:
#     sess.run(enqueue_op)
#     sess.run(fg)
#     sess.run(fg)
#     sess.run(fg)
#     sess.run(fg)
#     print 'here!'

# Active experiment: re-entering a scope with reuse=True returns the same variable.
with tf.variable_scope('scope1'):
    a = tf.get_variable('a', [1])
with tf.variable_scope('scope1', reuse=True):
    b = tf.get_variable('a', [1])
assert a == b
# a_t=tf.transpose(a)
# with tf.variable_scope('scope2'):
#     b=tf.Variable([[1,2]],name='b')
#     c=a_t+b

# with tf.variable_scope('scope2'):
#     b=tf.Variable([2],name='b')
# with tf.variable_scope('scope1'):
#     c=tf.Variable([1],name='a')
#
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print a
    print b
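The variable-sharing behaviour checked above is the same mechanism `_decode_lstm` in the model file relies on for its transposed-embedding output projection: re-entering the `word_embedding` scope with reuse=True hands back the very same embedding matrix. A minimal sketch under the same TF 1.x assumptions (the shapes are illustrative):

import tensorflow as tf

V, M = 10000, 512  # illustrative vocabulary and embedding sizes
with tf.variable_scope('word_embedding'):
    w = tf.get_variable('w', [V, M])          # embedding matrix, created once
with tf.variable_scope('word_embedding', reuse=True):
    w_shared = tf.get_variable('w', [V, M])   # same underlying variable, no new parameters
w_out = tf.transpose(w_shared)                # (M, V) output projection tied to the embedding
assert w is w_shared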