Commit b67c241 · 0 parents · Showing 12 changed files with 1,589 additions and 0 deletions.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Text Summarization using Connected Dominating Set \n", | ||
"\n", | ||
"### STEP 1 : Data cleaning ( removing stop words, non letter characters, turning to lower case letters )\n", | ||
"### STEP 2 : Sentence vector representation\n", | ||
"### STEP 3 : Graph formation where edges formed using cosine similarity between sentences\n", | ||
"### STEP 4 : Finding minimum Connected Dominating Set and outputting the summary " | ||
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initial Phase\n",
"### Importing Libraries and Reading Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n", | ||
"import networkx as nx\n", | ||
"import networkx.algorithms.approximation as nxaa\n", | ||
"from collections import OrderedDict, deque\n", | ||
"import copy\n", | ||
"import operator\n", | ||
"\n", | ||
"from nltk.corpus import stopwords\n", | ||
"from nltk.cluster.util import cosine_distance\n", | ||
"import numpy as np\n", | ||
"import pandas\n", | ||
"import nltk\n", | ||
"import re\n", | ||
"from nltk.corpus import stopwords\n", | ||
"from nltk.tokenize import sent_tokenize\n", | ||
"import networkx as nx\n", | ||
"from sklearn.metrics.pairwise import cosine_similarity" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"### reading data file\n", | ||
"\n", | ||
"df = pandas.read_csv('Downloads/tennis_articles_v4.csv')" | ||
] | ||
}, | ||
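{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick optional sanity check (not in the original notebook): the cells below rely on the CSV having an `article_text` column, so a brief look at the loaded frame confirms that before cleaning."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### optional sanity check: inspect the loaded articles\n",
"\n",
"print(df.shape)  # number of articles and columns\n",
"print(df.columns.tolist())  # should include 'article_text'\n",
"df.head()"
]
},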
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 1 : Data Cleaning\n",
"### Cleaning sentences by removing non-alphabetic characters and converting to lower case"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### cleaning sentences: removing non-alphabet characters and converting to lower case\n",
"\n",
"text = \"\"\n",
"d = {}  # maps each cleaned sentence back to its original sentence\n",
"for a in df['article_text']:\n",
"    text += a\n",
"sentences = sent_tokenize(text)\n",
"clean_sentences = []\n",
"for s in sentences:\n",
"    temp = re.sub(\"[^a-zA-Z]\", \" \", s)\n",
"    temp = temp.lower()\n",
"    clean_sentences.append(temp)\n",
"    d[temp] = s\n",
"# print clean_sentences"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Removing Stop Words"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### define a function for removing stop words, using NLTK's list of English stop words\n",
"\n",
"stop_words = stopwords.words('english')\n",
"def rem_stop(s):\n",
"    var = \"\"\n",
"    words = nltk.word_tokenize(s)\n",
"    for w in words:\n",
"        if w not in stop_words:\n",
"            var += w + \" \"\n",
"    return var"
]
},
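{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small illustrative check (not part of the original notebook): `rem_stop` keeps only tokens that are not in NLTK's stop-word list, so a short made-up sentence shows what the cleaned text looks like before vectorization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### quick illustration of rem_stop on a sample sentence (hypothetical example text)\n",
"\n",
"sample = \"this is a simple example of a sentence with many stop words\"\n",
"print(rem_stop(sample))  # stop words such as 'this', 'is', 'a', 'of', 'with' should be removed"
]
},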
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### remove the stop words using the function defined above\n",
"\n",
"clean_to_original = {}  # maps each stop-word-free sentence to the original sentence (renamed from `dict` to avoid shadowing the built-in)\n",
"clean = []\n",
"for s in clean_sentences:\n",
"    temp = rem_stop(s)\n",
"    clean.append(temp)\n",
"    clean_to_original[temp] = d[s]\n",
"# print clean"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### load the pre-trained word2vec vectors (Google News) via Gensim\n",
"\n",
"from gensim.models import KeyedVectors\n",
"filename = 'Downloads/GoogleNews-vectors-negative300.bin'\n",
"model = KeyedVectors.load_word2vec_format(filename, binary=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 2 : Sentence Vector Generation\n",
"### Sentence vectors are created by averaging pre-trained word2vec embeddings loaded via Gensim"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### create sentence vectors by averaging the word vectors of each cleaned sentence\n",
"\n",
"word_embeddings = {}\n",
"words = list(model.vocab)  # KeyedVectors exposes .vocab directly in gensim 3.x (gensim 4+ uses .index_to_key)\n",
"for a in words:\n",
"    word_embeddings[a] = model[a]\n",
"\n",
"sentence_vectors = []\n",
"for i in clean:\n",
"    if len(i) != 0:\n",
"        # average the 300-dimensional word vectors; out-of-vocabulary words fall back to zero vectors\n",
"        v = sum([word_embeddings.get(w, np.zeros((300,))) for w in i.split()]) / (len(i.split()) + 0.001)\n",
"    else:\n",
"        v = np.zeros((300,))\n",
"    sentence_vectors.append(v)"
]
},
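{
"cell_type": "markdown",
"metadata": {},
"source": [
"In symbols, the cell above computes (a restatement of the code, with $\\epsilon = 0.001$ as the small constant used there):\n",
"\n",
"$$v_s = \\frac{\\sum_{w \\in s} e_w}{|s| + \\epsilon}$$\n",
"\n",
"where $e_w$ is the 300-dimensional word2vec embedding of word $w$ (a zero vector for out-of-vocabulary words) and $|s|$ is the number of tokens in the cleaned sentence $s$."
]
},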
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 3 : Graph Formation\n",
"### A graph is formed where sentences are the nodes and edges are weighted by the cosine similarity between the sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### build the pairwise sentence-similarity matrix and construct the graph with networkx\n",
"\n",
"sentence_similarity_matrix = np.zeros([len(sentences), len(sentences)])\n",
"for i in range(len(sentences)):\n",
"    for j in range(len(sentences)):\n",
"        if i != j:\n",
"            sentence_similarity_matrix[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 300), sentence_vectors[j].reshape(1, 300))[0, 0]\n",
"\n",
"sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)"
]
},
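{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: `nx.from_numpy_array` keeps every nonzero similarity as an edge, so the graph above is essentially complete and a dominating set can be very small. An optional variant, not in the original notebook and using an arbitrary example threshold of 0.5, is sketched below: it keeps only edges whose cosine similarity exceeds the threshold, which makes the connected dominating set in STEP 4 more selective. It is shown for illustration only and is not used by the following cells unless `sentence_similarity_graph` is reassigned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### optional sketch: prune weak edges before the CDS step (threshold value is an assumption)\n",
"\n",
"threshold = 0.5  # example value; tune per corpus\n",
"pruned_matrix = np.where(sentence_similarity_matrix > threshold, sentence_similarity_matrix, 0.0)\n",
"pruned_graph = nx.from_numpy_array(pruned_matrix)\n",
"print(pruned_graph.number_of_edges())  # compare against the complete graph's edge count\n",
"# to use it downstream, make sure it is still connected and then reassign:\n",
"# if nx.is_connected(pruned_graph):\n",
"#     sentence_similarity_graph = pruned_graph"
]
},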
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 4 : Finding a minimum Connected Dominating Set and Outputting the summary\n",
"### A minimum Connected Dominating Set is found using a Greedy Approach, which can be summarized in the following 3 steps:\n",
"### 1. Initialization :\n",
"### Take the node with maximum degree as the starting node\n",
"### Enqueue the neighbors of the starting node into the queue Q in descending order of degree\n",
"### Maintain Q centrally as a priority queue to decide whether an element becomes part of the CDS\n",
"### 2. CDS Calculation :\n",
"### Pop a node u from Q and check whether the graph is still connected after removing u\n",
"### If it is, drop u; otherwise keep u in the CDS\n",
"### Add those neighbors of u that were never inserted into Q to the priority queue\n",
"### 3. Result Verification :\n",
"### Verify that the set is Dominating and Connected\n",
"### Output the Result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Initialization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### find a minimum connected dominating set using a greedy approach\n",
"\n",
"G = sentence_similarity_graph  # the sentence graph built in STEP 3\n",
"assert nx.is_connected(G)\n",
"\n",
"G2 = copy.deepcopy(G)\n",
"\n",
"# Step 1: initialization\n",
"# take the node with maximum degree as the starting node\n",
"starting_node = max(dict(G2.degree()).items(), key=operator.itemgetter(1))[0]\n",
"fixed_nodes = {starting_node}\n",
"\n",
"# enqueue the neighbors of the starting node into Q in descending order of degree\n",
"neighbor_nodes = G2.neighbors(starting_node)\n",
"neighbor_nodes_sorted = list(OrderedDict(sorted(dict(G2.degree(neighbor_nodes)).items(), key=operator.itemgetter(1), reverse=True)).keys())\n",
"\n",
"# Q is maintained centrally as a priority queue to decide whether an element becomes part of the CDS\n",
"priority_queue = deque(neighbor_nodes_sorted)\n",
"inserted_set = set(neighbor_nodes_sorted + [starting_node])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. CDS Calculation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Step 2: calculate the CDS\n",
"while priority_queue:\n",
"    u = priority_queue.pop()\n",
"\n",
"    # check if the graph is still connected after removing u\n",
"    rest_graph = copy.deepcopy(G2)\n",
"    rest_graph.remove_node(u)\n",
"\n",
"    if nx.is_connected(rest_graph):\n",
"        G2.remove_node(u)\n",
"    else:  # removing u would disconnect the graph, so keep it in the CDS\n",
"        fixed_nodes.add(u)\n",
"\n",
"        # add those neighbors of u that were never inserted into Q to the priority queue\n",
"        inserted_neighbors = set(G2.neighbors(u)) - inserted_set\n",
"        inserted_neighbors_sorted = OrderedDict(sorted(dict(G2.degree(inserted_neighbors)).items(), key=operator.itemgetter(1), reverse=True)).keys()\n",
"\n",
"        priority_queue.extend(inserted_neighbors_sorted)\n",
"        inserted_set.update(inserted_neighbors_sorted)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. Result Verification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Step 3: verify the result\n",
"assert nx.is_dominating_set(G, fixed_nodes) and nx.is_connected(G.subgraph(fixed_nodes))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Outputting the set formed in the previous step as the summary (the set holds the indices of the selected sentences; see the sketch after the next cell for mapping them back to text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# indices of the sentences selected for the summary\n",
"print fixed_nodes"
]
},
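{
"cell_type": "markdown",
"metadata": {},
"source": [
"`fixed_nodes` holds graph-node indices, which correspond to positions in the `sentences` list built in STEP 1 (`nx.from_numpy_array` numbers nodes 0..n-1 in matrix order). A minimal sketch of turning those indices into readable summary text, under that assumption, is shown below; it is not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### sketch: map the selected node indices back to the original sentences\n",
"\n",
"summary_sentences = [sentences[i] for i in sorted(fixed_nodes)]\n",
"summary = \" \".join(summary_sentences)\n",
"print(summary)"
]
}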
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.17"
}
},
"nbformat": 4,
"nbformat_minor": 2
}