-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 6f7fef6
Showing
7 changed files
with
15,338 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.xlsx |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# CSV to N-grams demo (+ `pyinstaller` example) | ||
|
||
A simple demo script I created for creating N grams. | ||
|
||
Included an example `input.csv`. If you want to use your own data, you will want to make sure you are using the same format (`.csv` file, single column, no header). | ||
|
||
Once libraries installed from `requirements.txt`, you can either run: | ||
- `demo_script.py` | ||
- `demo_ngrams.ipynb` - this notebook shows the step-by-step outputs from the main script. | ||
|
||
It's also packageable into a single executable with `pyinstaller`. Just run: | ||
|
||
``` | ||
pyinstaller --onefile --add-data "input.csv:." demo_script.py | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pyinstaller --onefile --add-data "input.csv:." demo_script.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,310 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 55, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import re\n", | ||
"import xlsxwriter\n", | ||
"\n", | ||
"import nltk\n", | ||
"from nltk.util import ngrams\n", | ||
"from nltk import FreqDist" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 56, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"[nltk_data] Downloading package punkt to /Users/kurt/nltk_data...\n", | ||
"[nltk_data] Package punkt is already up-to-date!\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"True" | ||
] | ||
}, | ||
"execution_count": 56, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Download for tokenize functionality\n", | ||
"\n", | ||
"nltk.download('punkt')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 57, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"0 @VirginAmerica What @dhepburn said.\n", | ||
"1 @VirginAmerica plus you've added commercials t...\n", | ||
"2 @VirginAmerica I didn't today... Must mean I n...\n", | ||
"3 @VirginAmerica it's really aggressive to blast...\n", | ||
"4 @VirginAmerica and it's a really big bad thing...\n", | ||
"Name: 0, dtype: object" | ||
] | ||
}, | ||
"execution_count": 57, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Import csv\n", | ||
"df = pd.read_csv(\"input.csv\", header=None)\n", | ||
"\n", | ||
"# Get Series\n", | ||
"tweet_list = df[0]\n", | ||
"tweet_list.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 58, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'VirginAmerica its really aggressive to blast obnoxious entertainment in your guests faces and they have little recourse'" | ||
] | ||
}, | ||
"execution_count": 58, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Clean tweets - removes any HTML and punctuation\n", | ||
"\n", | ||
"clean_tweets = []\n", | ||
"\n", | ||
"for t in tweet_list:\n", | ||
" t = re.sub(r'(<br />)|(<br>)|(\\\\n)|(\\\\r)|-', \" \", str(t))\n", | ||
" t = re.sub(r'[\\.,()+…?!\"/:@\\'“]', \"\", str(t))\n", | ||
" t = re.sub(r'(&)', \"and\", str(t))\n", | ||
" t = \" \".join(t.split())\n", | ||
" clean_tweets.append(t)\n", | ||
" \n", | ||
"clean_tweets[3]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 59, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Join into one string and change to lower case\n", | ||
"# Tokenize \n", | ||
"\n", | ||
"all_tweets = \" \".join(clean_tweets).lower()\n", | ||
"token_tweets = nltk.word_tokenize(all_tweets)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 60, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create ngrams and frequency distributions\n", | ||
"\n", | ||
"tweet_sgrams = ngrams(token_tweets, 1)\n", | ||
"tweet_bigrams = ngrams(token_tweets, 2)\n", | ||
"tweet_trigrams = ngrams(token_tweets, 3)\n", | ||
"tweet_quadgrams = ngrams(token_tweets, 4)\n", | ||
"tweet_pentgrams = ngrams(token_tweets, 5)\n", | ||
"\n", | ||
"tweet_fds = FreqDist(tweet_sgrams)\n", | ||
"tweet_fdb = FreqDist(tweet_bigrams)\n", | ||
"tweet_fdt = FreqDist(tweet_trigrams)\n", | ||
"tweet_fdq = FreqDist(tweet_quadgrams)\n", | ||
"tweet_fdp = FreqDist(tweet_pentgrams)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 61, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Convert to DF\n", | ||
"\n", | ||
"df_tweet_sgram = pd.DataFrame.from_dict(tweet_fds, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n", | ||
"df_tweet_bigram = pd.DataFrame.from_dict(tweet_fdb, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n", | ||
"df_tweet_trigram = pd.DataFrame.from_dict(tweet_fdt, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n", | ||
"df_tweet_quadgram = pd.DataFrame.from_dict(tweet_fdq, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n", | ||
"df_tweet_pentgram = pd.DataFrame.from_dict(tweet_fdp, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 62, | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>index</th>\n", | ||
" <th>freq</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>116501</th>\n", | ||
" <td>(our, fleets, on, fleek)</td>\n", | ||
" <td>144</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>116500</th>\n", | ||
" <td>(jetblue, our, fleets, on)</td>\n", | ||
" <td>137</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>36291</th>\n", | ||
" <td>(been, on, hold, for)</td>\n", | ||
" <td>113</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>19079</th>\n", | ||
" <td>(ive, been, on, hold)</td>\n", | ||
" <td>59</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>117481</th>\n", | ||
" <td>(rt, jetblue, our, fleets)</td>\n", | ||
" <td>50</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2908</th>\n", | ||
" <td>(thank, you, for, the)</td>\n", | ||
" <td>50</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>10877</th>\n", | ||
" <td>(flight, was, cancelled, flightled)</td>\n", | ||
" <td>48</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>24983</th>\n", | ||
" <td>(on, hold, for, over)</td>\n", | ||
" <td>43</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>14420</th>\n", | ||
" <td>(for, over, an, hour)</td>\n", | ||
" <td>41</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3229</th>\n", | ||
" <td>(cancelled, flightled, my, flight)</td>\n", | ||
" <td>30</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" index freq\n", | ||
"116501 (our, fleets, on, fleek) 144\n", | ||
"116500 (jetblue, our, fleets, on) 137\n", | ||
"36291 (been, on, hold, for) 113\n", | ||
"19079 (ive, been, on, hold) 59\n", | ||
"117481 (rt, jetblue, our, fleets) 50\n", | ||
"2908 (thank, you, for, the) 50\n", | ||
"10877 (flight, was, cancelled, flightled) 48\n", | ||
"24983 (on, hold, for, over) 43\n", | ||
"14420 (for, over, an, hour) 41\n", | ||
"3229 (cancelled, flightled, my, flight) 30" | ||
] | ||
}, | ||
"execution_count": 62, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df_tweet_quadgram.head(10)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 64, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create a Pandas Excel writer using XlsxWriter as the engine.\n", | ||
"writer = pd.ExcelWriter('ngrams_output.xlsx', engine='xlsxwriter')\n", | ||
"\n", | ||
"# Write each dataframe to a different worksheet.\n", | ||
"df_tweet_sgram.to_excel(writer, sheet_name='Book-Single')\n", | ||
"df_tweet_bigram.to_excel(writer, sheet_name='Book-Bigram')\n", | ||
"df_tweet_trigram.to_excel(writer, sheet_name='Book-Trigram')\n", | ||
"df_tweet_quadgram.to_excel(writer, sheet_name='Book-Quadgram')\n", | ||
"df_tweet_pentgram.to_excel(writer, sheet_name='Book-Pentgram')\n", | ||
"\n", | ||
"# Close the Pandas Excel writer and output the Excel file.\n", | ||
"writer.save()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.