# -*- coding: utf-8 -*-
"""word2vec model resample to agree length
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1wET4XbXm2s03ocHwk_9wdjn_YS56Pe4k
#WORD2VEC
"""
from google.colab import drive
drive.mount('/content/drive')
!wget https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_stances.csv -P /content/drive/MyDrive/Georgia\ Tech/Clubs/Big\ Data/Datasets
!wget https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_bodies.csv -P /content/drive/MyDrive/Georgia\ Tech/Clubs/Big\ Data/Datasets
!wget https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/competition_test_bodies.csv -P /content/drive/MyDrive/Georgia\ Tech/Clubs/Big\ Data/Datasets
!wget https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/competition_test_stances.csv -P /content/drive/MyDrive/Georgia\ Tech/Clubs/Big\ Data/Datasets
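# NOTE: create_embeddings() below loads GoogleNews-vectors-negative300.bin from
# PATH, but that file is not fetched by the wget cells above. One way to obtain
# equivalent vectors is gensim's downloader (a sketch; downloads roughly 1.7 GB
# and returns a KeyedVectors object directly):
# import gensim.downloader as api
# word2vec_model = api.load('word2vec-google-news-300')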
import pandas
import numpy
import tensorflow as tf
from keras_preprocessing.text import Tokenizer
from gensim.models import KeyedVectors
from keras_preprocessing.sequence import pad_sequences
from keras import Sequential, Model
from keras.layers import Conv1D, Dropout, Dense, Embedding, MaxPooling1D, Concatenate, Flatten, Input
from sklearn.utils import resample
PATH = '/content/drive/MyDrive/Georgia Tech/Clubs/Big Data/Datasets/'
RANDOM_SEED = 42
numpy.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
import matplotlib.pyplot as plt
# load the data set from the competition test csv files
def load_test_data():
    # create Pandas dataframes from the two csv files
    test_bodies = pandas.read_csv(PATH + "competition_test_bodies.csv", encoding='utf-8')
    test_headlines = pandas.read_csv(PATH + "competition_test_stances.csv", encoding='utf-8')
    # merge the csv files on Body ID
    test_data_set = pandas.merge(test_bodies, test_headlines, how='left', on='Body ID')
    stances = {
        'Stance': {
            'agree': 0,
            'disagree': 1,
            'discuss': 2,
            'unrelated': 3,
        }
    }
    test_data_set.replace(stances, inplace=True)
    print(test_data_set)
    return test_data_set
# load the data set from the train csv files
def load_train_data():
    # create Pandas dataframes from the two csv files
    train_bodies = pandas.read_csv(PATH + "train_bodies.csv", encoding='utf-8')
    train_headlines = pandas.read_csv(PATH + "train_stances.csv", encoding='utf-8')
    # merge the csv files on Body ID
    train_data_set = pandas.merge(train_bodies, train_headlines, how='left', on='Body ID')
    stances = {
        'Stance': {
            'agree': 0,
            'disagree': 1,
            'discuss': 2,
            'unrelated': 3,
        }
    }
    train_data_set.replace(stances, inplace=True)
    print(train_data_set)
    print(train_data_set['Stance'].value_counts())
    # class counts: 3 (unrelated) - 36545, 2 (discuss) - 8909, 0 (agree) - 3678, 1 (disagree) - 840
    # resample toward 8909 (the discuss count) rather than 3678: 36545 is far too
    # many and 840 far too few to use as the common size
    data_length = 8909
    unrelated_downsampled = resample(train_data_set.loc[train_data_set['Stance'] == 3], replace=False, n_samples=data_length, random_state=RANDOM_SEED)
    discuss_downsampled = resample(train_data_set.loc[train_data_set['Stance'] == 2], replace=False, n_samples=data_length, random_state=RANDOM_SEED)
    # agree is kept at its natural size (3678); upsampling it is left commented out
    # agree_upsampled = resample(train_data_set.loc[train_data_set['Stance'] == 0], replace=True, n_samples=data_length, random_state=RANDOM_SEED)
    agree = train_data_set.loc[train_data_set['Stance'] == 0]
    disagree_upsampled = resample(train_data_set.loc[train_data_set['Stance'] == 1], replace=True, n_samples=data_length, random_state=RANDOM_SEED)
    all_resampled = [unrelated_downsampled, discuss_downsampled, agree, disagree_upsampled]
    result = pandas.concat(all_resampled)
    return result
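# A quick sanity check of the resampled balance (a sketch; with the counts above
# this should print 8909 for stances 1, 2, and 3 and 3678 for stance 0):
# balanced = load_train_data()
# print(balanced['Stance'].value_counts())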
def prepare_data(data_set, length=None):
    # tokenize the data set
    bodies_tokenizer, headlines_tokenizer = (Tokenizer(), Tokenizer())
    # find the max length of each column, unless fixed lengths were passed in
    if not length:
        bodies_max_length = data_set['articleBody'].map(lambda x: len(x.split())).max()
        headlines_max_length = data_set['Headline'].map(lambda x: len(x.split())).max()
    else:
        bodies_max_length = length[0]
        headlines_max_length = length[1]
    # fit the tokenizer on the data set
    bodies_tokenizer.fit_on_texts(data_set['articleBody'])
    headlines_tokenizer.fit_on_texts(data_set['Headline'])
    # convert the texts to sequences of word indices
    bodies_sequences = bodies_tokenizer.texts_to_sequences(data_set['articleBody'])
    headlines_sequences = headlines_tokenizer.texts_to_sequences(data_set['Headline'])
    # pad (or truncate) every sequence to the max length
    bodies_sequences = pad_sequences(bodies_sequences, maxlen=bodies_max_length, padding='post', truncating='post')
    headlines_sequences = pad_sequences(headlines_sequences, maxlen=headlines_max_length, padding='post', truncating='post')
    return bodies_sequences, headlines_sequences, bodies_tokenizer.word_index, headlines_tokenizer.word_index, data_set['Stance']
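# Example usage (a sketch; some_data_set stands for any dataframe with
# 'articleBody', 'Headline', and 'Stance' columns). The returned arrays are
# integer word-index matrices, one row per example:
# bodies, heads, b_index, h_index, y = prepare_data(some_data_set)
# print(bodies.shape)  # (n_examples, bodies_max_length)
# print(heads.shape)   # (n_examples, headlines_max_length)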
def create_embeddings(bodies_word_index, headlines_word_index):
    # load the pretrained 300-dimensional Google News word2vec vectors
    word2vec_model = KeyedVectors.load_word2vec_format(PATH + "GoogleNews-vectors-negative300.bin", binary=True)

    def get_vector(word):
        if word in word2vec_model:
            return word2vec_model[word]
        return None

    # save the vector for each word to the matrix; row 0 is reserved for padding,
    # and words missing from word2vec stay as zero vectors
    bodies_embeddings_matrix = numpy.zeros((len(bodies_word_index) + 1, 300))
    for word, i in bodies_word_index.items():
        embedding_vector = get_vector(word)
        if embedding_vector is not None:
            bodies_embeddings_matrix[i] = embedding_vector
    headlines_embeddings_matrix = numpy.zeros((len(headlines_word_index) + 1, 300))
    for word, i in headlines_word_index.items():
        embedding_vector = get_vector(word)
        if embedding_vector is not None:
            headlines_embeddings_matrix[i] = embedding_vector
    return bodies_embeddings_matrix, headlines_embeddings_matrix
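# A coverage check is a useful follow-up here (a sketch): rows left all-zero in
# the matrix correspond to vocabulary words that had no word2vec vector:
# covered = (numpy.abs(bodies_embeddings_matrix).sum(axis=1) != 0).mean()
# print(f"word2vec coverage of body vocabulary: {covered:.1%}")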
if __name__ == '__main__':
    train_data = load_train_data()
    # alternative balancing strategies, kept for reference:
    # train_data = train_data[train_data['Stance'] != 3]
    # g = train_data.groupby('Stance')
    # train_data = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))
    test_data = load_test_data()
    # f = test_data.groupby('Stance')
    # test_data = f.apply(lambda x: x.sample(f.size().min()).reset_index(drop=True))
    # test_data = test_data[test_data['Stance'] != 3]
    bodies_sequences, headlines_sequences, bodies_word_index, headlines_word_index, stances = prepare_data(train_data)
    # pad the test set to the train set's sequence lengths; note that prepare_data
    # fits a fresh Tokenizer per call, so the test word indices do not share the
    # train vocabulary
    test_bodies_sequences, test_headlines_sequences, test_bodies_word_index, test_headlines_word_index, test_stances = prepare_data(test_data, [bodies_sequences.shape[1], headlines_sequences.shape[1]])
    bodies_embeddings_matrix, headlines_embeddings_matrix = create_embeddings(bodies_word_index=bodies_word_index, headlines_word_index=headlines_word_index)
    bodies_vocab_size, headlines_vocab_size = len(bodies_word_index), len(headlines_word_index)
    def create_model(embedding_matrix, vocab_size, input_length):
        model = Sequential()
        # frozen pretrained word2vec embeddings feed a stack of Conv1D blocks
        model.add(Embedding(vocab_size + 1, 300, weights=[embedding_matrix], trainable=False, input_length=input_length))
        model.add(Conv1D(256, 3, activation='relu'))
        model.add(Dropout(0.5))
        model.add(MaxPooling1D(pool_size=2, padding="same"))
        model.add(Conv1D(256, 3, activation='relu'))
        model.add(Dropout(0.5))
        model.add(MaxPooling1D(pool_size=2, padding="same"))
        model.add(Conv1D(512, 3, activation='relu'))
        model.add(Dropout(0.5))
        model.add(MaxPooling1D(pool_size=2, padding="same"))
        model.add(Conv1D(512, 3, activation='relu'))
        model.add(Dropout(0.5))
        model.add(MaxPooling1D(pool_size=2, padding="same"))
        # a deeper variant for long inputs, kept for reference:
        # if input_length >= 512:
        #     model.add(Conv1D(512, 3, activation='relu'))
        #     model.add(Dropout(0.5))
        #     model.add(MaxPooling1D(pool_size=2, padding="same"))
        model.add(Flatten())
        return model
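    # To sanity-check a branch before merging (a sketch):
    # print(bodies_model.output_shape)  # (None, n_flattened_features)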
    bodies_model = create_model(embedding_matrix=bodies_embeddings_matrix, vocab_size=bodies_vocab_size, input_length=bodies_sequences.shape[1])
    headlines_model = create_model(embedding_matrix=headlines_embeddings_matrix, vocab_size=headlines_vocab_size, input_length=headlines_sequences.shape[1])
    print(bodies_vocab_size)
    print(headlines_vocab_size)
    # bodies_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # print(bodies_model.summary())
    # headlines_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # print(headlines_model.summary())
    # merge the two CNN branches and classify with a dense head
    finalModel = Concatenate()([bodies_model.output, headlines_model.output])
    finalModel = Flatten()(finalModel)
    finalModel = Dense(1024, activation='relu')(finalModel)
    finalModel = Dense(1024, activation='relu')(finalModel)
    finalModel = Dense(1024, activation='relu')(finalModel)
    finalModel = Dense(4, activation='softmax')(finalModel)
    # one-hot encoding of the four stances (0,1,2,3):
    # 0: [1,0,0,0]
    # 1: [0,1,0,0]
    # 2: [0,0,1,0]
    # 3: [0,0,0,1]
    model = Model(inputs=[bodies_model.input, headlines_model.input], outputs=finalModel)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    from keras.utils.vis_utils import plot_model
    plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
    # print(headlines_sequences[4].size)
    from keras.utils import to_categorical
    print(bodies_sequences.shape)
    print(headlines_sequences.shape)
    print(stances)
    onehot_stances = to_categorical(stances)
    print(onehot_stances)
    model.fit([bodies_sequences, headlines_sequences],
              onehot_stances,
              batch_size=16,
              epochs=100,
              validation_split=0.05,
              shuffle=True,
              )
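    # With 100 epochs on a small resampled set, overfitting is a real risk; an
    # EarlyStopping callback is one standard mitigation (a sketch, not part of
    # the original run, using the standard Keras callback API):
    # from keras.callbacks import EarlyStopping
    # early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    # ...then pass callbacks=[early_stop] to model.fit above.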
    # writes the trained model into the Datasets directory
    model.save(PATH)
    test_onehot_stances = to_categorical(test_stances)
    print(len(bodies_sequences))
    print(len(test_bodies_sequences), test_headlines_sequences[0], test_onehot_stances[0])
    model.evaluate([test_bodies_sequences, test_headlines_sequences], test_onehot_stances)
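    # Overall accuracy hides per-class behavior on an imbalanced test set; a
    # per-class breakdown is a useful follow-up (a sketch using sklearn, which
    # is already a dependency of this script):
    # from sklearn.metrics import classification_report
    # predictions = model.predict([test_bodies_sequences, test_headlines_sequences])
    # print(classification_report(test_stances, numpy.argmax(predictions, axis=1),
    #                             target_names=['agree', 'disagree', 'discuss', 'unrelated']))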
    import pandas as pd
    import numpy as np

    # run a single headline/body pair through the trained model
    def test(headline, body):
        data = {'Headline': [headline], 'articleBody': [body], 'Stance': [None]}
        df = pd.DataFrame.from_dict(data)
        # [2243, 40] matches the body/headline sequence lengths used at train time
        bodies_sequences, headlines_sequences, bodies_word_index, headlines_word_index, stances = prepare_data(df, [2243, 40])
        stances = {
            0: "agree",
            1: "disagree",
            2: "discuss",
            3: "unrelated"
        }
        prediction = model.predict([bodies_sequences, headlines_sequences])
        print(prediction)
        print(stances[np.argmax(prediction)])

    test("Pope Francis loves Donald Trump", '''Pope Francis hates Donald Trump''')