Commit b67c241 · 0 parents · Showing 12 changed files with 1,589 additions and 0 deletions.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Text Summarization using Connected Dominating Set \n", | ||
"\n", | ||
"### STEP 1 : Data cleaning ( removing stop words, non letter characters, turning to lower case letters )\n", | ||
"### STEP 2 : Sentence vector representation\n", | ||
"### STEP 3 : Graph formation where edges formed using cosine similarity between sentences\n", | ||
"### STEP 4 : Finding minimum Connected Dominating Set and outputting the summary " | ||
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initial Phase\n",
"### Importing Libraries and Reading Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n", | ||
"import networkx as nx\n", | ||
"import networkx.algorithms.approximation as nxaa\n", | ||
"from collections import OrderedDict, deque\n", | ||
"import copy\n", | ||
"import operator\n", | ||
"\n", | ||
"from nltk.corpus import stopwords\n", | ||
"from nltk.cluster.util import cosine_distance\n", | ||
"import numpy as np\n", | ||
"import pandas\n", | ||
"import nltk\n", | ||
"import re\n", | ||
"from nltk.corpus import stopwords\n", | ||
"from nltk.tokenize import sent_tokenize\n", | ||
"import networkx as nx\n", | ||
"from sklearn.metrics.pairwise import cosine_similarity" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"### reading data file\n", | ||
"\n", | ||
"df = pandas.read_csv('Downloads/tennis_articles_v4.csv')" | ||
] | ||
}, | ||
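{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick optional sanity check (not in the original notebook): the cells below rely on the CSV having an `article_text` column, so a brief look at the loaded frame confirms that before cleaning."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### optional sanity check: inspect the loaded articles\n",
"\n",
"print(df.shape)  # number of articles and columns\n",
"print(df.columns.tolist())  # should include 'article_text'\n",
"df.head()"
]
},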
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 1 : Data Cleaning\n",
"### Cleaning sentences by removing non-alphabetic characters and converting to lower case"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### cleaning sentences: removing non-alphabet characters and converting to lower case\n",
"\n",
"text = \"\"\n",
"d = {}  # maps each cleaned sentence back to its original sentence\n",
"for a in df['article_text']:\n",
"    text += a\n",
"sentences = sent_tokenize(text)\n",
"clean_sentences = []\n",
"for s in sentences:\n",
"    temp = re.sub(\"[^a-zA-Z]\", \" \", s)\n",
"    temp = temp.lower()\n",
"    clean_sentences.append(temp)\n",
"    d[temp] = s\n",
"# print clean_sentences"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Removing Stop Words"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### define a function for removing stop words, using NLTK's list of English stop words\n",
"\n",
"stop_words = stopwords.words('english')\n",
"def rem_stop(s):\n",
"    var = \"\"\n",
"    words = nltk.word_tokenize(s)\n",
"    for w in words:\n",
"        if w not in stop_words:\n",
"            var += w + \" \"\n",
"    return var"
]
},
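{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small illustrative check (not part of the original notebook): `rem_stop` keeps only tokens that are not in NLTK's stop-word list, so a short made-up sentence shows what the cleaned text looks like before vectorization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### quick illustration of rem_stop on a sample sentence (hypothetical example text)\n",
"\n",
"sample = \"this is a simple example of a sentence with many stop words\"\n",
"print(rem_stop(sample))  # stop words such as 'this', 'is', 'a', 'of', 'with' should be removed"
]
},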
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### remove the stop words using the function defined above\n",
"\n",
"clean_to_original = {}  # maps each stop-word-free sentence to the original sentence (renamed from `dict` to avoid shadowing the built-in)\n",
"clean = []\n",
"for s in clean_sentences:\n",
"    temp = rem_stop(s)\n",
"    clean.append(temp)\n",
"    clean_to_original[temp] = d[s]\n",
"# print clean"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### load the pre-trained word2vec vectors (Google News) via Gensim\n",
"\n",
"from gensim.models import KeyedVectors\n",
"filename = 'Downloads/GoogleNews-vectors-negative300.bin'\n",
"model = KeyedVectors.load_word2vec_format(filename, binary=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 2 : Sentence Vector Generation\n",
"### Sentence vectors are created by averaging pre-trained word2vec embeddings loaded via Gensim"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### create sentence vectors by averaging the word vectors of each cleaned sentence\n",
"\n",
"word_embeddings = {}\n",
"words = list(model.vocab)  # KeyedVectors exposes .vocab directly in gensim 3.x (gensim 4+ uses .index_to_key)\n",
"for a in words:\n",
"    word_embeddings[a] = model[a]\n",
"\n",
"sentence_vectors = []\n",
"for i in clean:\n",
"    if len(i) != 0:\n",
"        # average the 300-dimensional word vectors; out-of-vocabulary words fall back to zero vectors\n",
"        v = sum([word_embeddings.get(w, np.zeros((300,))) for w in i.split()]) / (len(i.split()) + 0.001)\n",
"    else:\n",
"        v = np.zeros((300,))\n",
"    sentence_vectors.append(v)"
]
},
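{
"cell_type": "markdown",
"metadata": {},
"source": [
"In symbols, the cell above computes (a restatement of the code, with $\\epsilon = 0.001$ as the small constant used there):\n",
"\n",
"$$v_s = \\frac{\\sum_{w \\in s} e_w}{|s| + \\epsilon}$$\n",
"\n",
"where $e_w$ is the 300-dimensional word2vec embedding of word $w$ (a zero vector for out-of-vocabulary words) and $|s|$ is the number of tokens in the cleaned sentence $s$."
]
},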
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 3 : Graph Formation\n",
"### A graph is formed where sentences are the nodes and edges are weighted by the cosine similarity between the sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### build the pairwise sentence-similarity matrix and construct the graph with networkx\n",
"\n",
"sentence_similarity_matrix = np.zeros([len(sentences), len(sentences)])\n",
"for i in range(len(sentences)):\n",
"    for j in range(len(sentences)):\n",
"        if i != j:\n",
"            sentence_similarity_matrix[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 300), sentence_vectors[j].reshape(1, 300))[0, 0]\n",
"\n",
"sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)"
]
},
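{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: `nx.from_numpy_array` keeps every nonzero similarity as an edge, so the graph above is essentially complete and a dominating set can be very small. An optional variant, not in the original notebook and using an arbitrary example threshold of 0.5, is sketched below: it keeps only edges whose cosine similarity exceeds the threshold, which makes the connected dominating set in STEP 4 more selective. It is shown for illustration only and is not used by the following cells unless `sentence_similarity_graph` is reassigned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### optional sketch: prune weak edges before the CDS step (threshold value is an assumption)\n",
"\n",
"threshold = 0.5  # example value; tune per corpus\n",
"pruned_matrix = np.where(sentence_similarity_matrix > threshold, sentence_similarity_matrix, 0.0)\n",
"pruned_graph = nx.from_numpy_array(pruned_matrix)\n",
"print(pruned_graph.number_of_edges())  # compare against the complete graph's edge count\n",
"# to use it downstream, make sure it is still connected and then reassign:\n",
"# if nx.is_connected(pruned_graph):\n",
"#     sentence_similarity_graph = pruned_graph"
]
},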
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 4 : Finding a minimum Connected Dominating Set and Outputting the summary\n",
"### A minimum Connected Dominating Set is found using a Greedy Approach, which can be summarized in the following 3 steps:\n",
"### 1. Initialization :\n",
"### Take the node with maximum degree as the starting node\n",
"### Enqueue the neighbors of the starting node into the queue Q in descending order of degree\n",
"### Maintain Q centrally as a priority queue to decide whether an element becomes part of the CDS\n",
"### 2. CDS Calculation :\n",
"### Pop a node u from Q and check whether the graph is still connected after removing u\n",
"### If it is, drop u; otherwise keep u in the CDS\n",
"### Add those neighbors of u that were never inserted into Q to the priority queue\n",
"### 3. Result Verification :\n",
"### Verify that the set is Dominating and Connected\n",
"### Output the Result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Initialization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### find a minimum connected dominating set using a greedy approach\n",
"\n",
"G = sentence_similarity_graph  # the sentence graph built in STEP 3\n",
"assert nx.is_connected(G)\n",
"\n",
"G2 = copy.deepcopy(G)\n",
"\n",
"# Step 1: initialization\n",
"# take the node with maximum degree as the starting node\n",
"starting_node = max(dict(G2.degree()).items(), key=operator.itemgetter(1))[0]\n",
"fixed_nodes = {starting_node}\n",
"\n",
"# enqueue the neighbors of the starting node into Q in descending order of degree\n",
"neighbor_nodes = G2.neighbors(starting_node)\n",
"neighbor_nodes_sorted = list(OrderedDict(sorted(dict(G2.degree(neighbor_nodes)).items(), key=operator.itemgetter(1), reverse=True)).keys())\n",
"\n",
"# Q is maintained centrally as a priority queue to decide whether an element becomes part of the CDS\n",
"priority_queue = deque(neighbor_nodes_sorted)\n",
"inserted_set = set(neighbor_nodes_sorted + [starting_node])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. CDS Calculation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Step 2: calculate the CDS\n",
"while priority_queue:\n",
"    u = priority_queue.pop()\n",
"\n",
"    # check if the graph is still connected after removing u\n",
"    rest_graph = copy.deepcopy(G2)\n",
"    rest_graph.remove_node(u)\n",
"\n",
"    if nx.is_connected(rest_graph):\n",
"        G2.remove_node(u)\n",
"    else:  # removing u would disconnect the graph, so keep it in the CDS\n",
"        fixed_nodes.add(u)\n",
"\n",
"        # add those neighbors of u that were never inserted into Q to the priority queue\n",
"        inserted_neighbors = set(G2.neighbors(u)) - inserted_set\n",
"        inserted_neighbors_sorted = OrderedDict(sorted(dict(G2.degree(inserted_neighbors)).items(), key=operator.itemgetter(1), reverse=True)).keys()\n",
"\n",
"        priority_queue.extend(inserted_neighbors_sorted)\n",
"        inserted_set.update(inserted_neighbors_sorted)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. Result Verification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Step 3: verify the result\n",
"assert nx.is_dominating_set(G, fixed_nodes) and nx.is_connected(G.subgraph(fixed_nodes))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Outputting the set formed in the previous step as the summary (the set holds the indices of the selected sentences; see the sketch after the next cell for mapping them back to text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# indices of the sentences selected for the summary\n",
"print fixed_nodes"
]
},
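{
"cell_type": "markdown",
"metadata": {},
"source": [
"`fixed_nodes` holds graph-node indices, which correspond to positions in the `sentences` list built in STEP 1 (`nx.from_numpy_array` numbers nodes 0..n-1 in matrix order). A minimal sketch of turning those indices into readable summary text, under that assumption, is shown below; it is not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### sketch: map the selected node indices back to the original sentences\n",
"\n",
"summary_sentences = [sentences[i] for i in sorted(fixed_nodes)]\n",
"summary = \" \".join(summary_sentences)\n",
"print(summary)"
]
}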
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.17"
}
},
"nbformat": 4,
"nbformat_minor": 2
}