Commit

initial commit
kurt213 committed Feb 11, 2022
0 parents · commit 6f7fef6
Showing 7 changed files with 15,338 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.xlsx
15 changes: 15 additions & 0 deletions README.MD
@@ -0,0 +1,15 @@
# CSV to N-grams demo (+ `pyinstaller` example)

A simple demo script I created for generating N-grams from a CSV of text.

An example `input.csv` is included. If you want to use your own data, make sure it follows the same format (a `.csv` file with a single column and no header).
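
As a quick sanity check, your file should load the same way the bundled one does in `demo_ngrams.ipynb`; a minimal sketch, with `your_data.csv` as a placeholder filename:

```
import pandas as pd

# single column, no header: column 0 holds one text entry per row
df = pd.read_csv("your_data.csv", header=None)
print(df[0].head())
```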

Once the libraries from `requirements.txt` are installed (`pip install -r requirements.txt`), you can run either:
- `demo_script.py`
- `demo_ngrams.ipynb` - this notebook shows the step-by-step outputs from the main script.

The script can also be packaged into a single executable with `pyinstaller`. Just run:

```
pyinstaller --onefile --add-data "input.csv:." demo_script.py
```
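
A note on the `--add-data` flag: a onefile executable unpacks bundled data files to a temporary directory at runtime, so the script needs to resolve `input.csv` relative to `sys._MEIPASS` rather than the current directory. A minimal sketch of that lookup (the `resource_path` helper is illustrative; `demo_script.py` may handle this differently):

```
import os
import sys
import pandas as pd

def resource_path(relative_path):
    # Inside a PyInstaller onefile bundle, files added with --add-data are
    # unpacked to a temp dir exposed as sys._MEIPASS; when running the plain
    # script, fall back to the current directory instead.
    base = getattr(sys, "_MEIPASS", os.path.abspath("."))
    return os.path.join(base, relative_path)

df = pd.read_csv(resource_path("input.csv"), header=None)
```
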
1 change: 1 addition & 0 deletions create_exe.txt
@@ -0,0 +1 @@
pyinstaller --onefile --add-data "input.csv:." demo_script.py
310 changes: 310 additions & 0 deletions demo_ngrams.ipynb
@@ -0,0 +1,310 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"import xlsxwriter\n",
"\n",
"import nltk\n",
"from nltk.util import ngrams\n",
"from nltk import FreqDist"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /Users/kurt/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download for tokenize functionality\n",
"\n",
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 @VirginAmerica What @dhepburn said.\n",
"1 @VirginAmerica plus you've added commercials t...\n",
"2 @VirginAmerica I didn't today... Must mean I n...\n",
"3 @VirginAmerica it's really aggressive to blast...\n",
"4 @VirginAmerica and it's a really big bad thing...\n",
"Name: 0, dtype: object"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Import csv\n",
"df = pd.read_csv(\"input.csv\", header=None)\n",
"\n",
"# Get Series\n",
"tweet_list = df[0]\n",
"tweet_list.head()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'VirginAmerica its really aggressive to blast obnoxious entertainment in your guests faces and they have little recourse'"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Clean tweets - removes any HTML and punctuation\n",
"\n",
"clean_tweets = []\n",
"\n",
"for t in tweet_list:\n",
" t = re.sub(r'(<br />)|(<br>)|(\\\\n)|(\\\\r)|-', \" \", str(t))\n",
" t = re.sub(r'[\\.,()+…?!\"/:@\\'“]', \"\", str(t))\n",
" t = re.sub(r'(&amp;)', \"and\", str(t))\n",
" t = \" \".join(t.split())\n",
" clean_tweets.append(t)\n",
" \n",
"clean_tweets[3]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"# Join into one string and change to lower case\n",
"# Tokenize \n",
"\n",
"all_tweets = \" \".join(clean_tweets).lower()\n",
"token_tweets = nltk.word_tokenize(all_tweets)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Create ngrams and frequency distributions\n",
"\n",
"tweet_sgrams = ngrams(token_tweets, 1)\n",
"tweet_bigrams = ngrams(token_tweets, 2)\n",
"tweet_trigrams = ngrams(token_tweets, 3)\n",
"tweet_quadgrams = ngrams(token_tweets, 4)\n",
"tweet_pentgrams = ngrams(token_tweets, 5)\n",
"\n",
"tweet_fds = FreqDist(tweet_sgrams)\n",
"tweet_fdb = FreqDist(tweet_bigrams)\n",
"tweet_fdt = FreqDist(tweet_trigrams)\n",
"tweet_fdq = FreqDist(tweet_quadgrams)\n",
"tweet_fdp = FreqDist(tweet_pentgrams)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# Convert to DF\n",
"\n",
"df_tweet_sgram = pd.DataFrame.from_dict(tweet_fds, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n",
"df_tweet_bigram = pd.DataFrame.from_dict(tweet_fdb, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n",
"df_tweet_trigram = pd.DataFrame.from_dict(tweet_fdt, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n",
"df_tweet_quadgram = pd.DataFrame.from_dict(tweet_fdq, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n",
"df_tweet_pentgram = pd.DataFrame.from_dict(tweet_fdp, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>freq</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>116501</th>\n",
" <td>(our, fleets, on, fleek)</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116500</th>\n",
" <td>(jetblue, our, fleets, on)</td>\n",
" <td>137</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36291</th>\n",
" <td>(been, on, hold, for)</td>\n",
" <td>113</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19079</th>\n",
" <td>(ive, been, on, hold)</td>\n",
" <td>59</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117481</th>\n",
" <td>(rt, jetblue, our, fleets)</td>\n",
" <td>50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2908</th>\n",
" <td>(thank, you, for, the)</td>\n",
" <td>50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10877</th>\n",
" <td>(flight, was, cancelled, flightled)</td>\n",
" <td>48</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24983</th>\n",
" <td>(on, hold, for, over)</td>\n",
" <td>43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14420</th>\n",
" <td>(for, over, an, hour)</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3229</th>\n",
" <td>(cancelled, flightled, my, flight)</td>\n",
" <td>30</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" index freq\n",
"116501 (our, fleets, on, fleek) 144\n",
"116500 (jetblue, our, fleets, on) 137\n",
"36291 (been, on, hold, for) 113\n",
"19079 (ive, been, on, hold) 59\n",
"117481 (rt, jetblue, our, fleets) 50\n",
"2908 (thank, you, for, the) 50\n",
"10877 (flight, was, cancelled, flightled) 48\n",
"24983 (on, hold, for, over) 43\n",
"14420 (for, over, an, hour) 41\n",
"3229 (cancelled, flightled, my, flight) 30"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_tweet_quadgram.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Create a Pandas Excel writer using XlsxWriter as the engine.\n",
"writer = pd.ExcelWriter('ngrams_output.xlsx', engine='xlsxwriter')\n",
"\n",
"# Write each dataframe to a different worksheet.\n",
"df_tweet_sgram.to_excel(writer, sheet_name='Book-Single')\n",
"df_tweet_bigram.to_excel(writer, sheet_name='Book-Bigram')\n",
"df_tweet_trigram.to_excel(writer, sheet_name='Book-Trigram')\n",
"df_tweet_quadgram.to_excel(writer, sheet_name='Book-Quadgram')\n",
"df_tweet_pentgram.to_excel(writer, sheet_name='Book-Pentgram')\n",
"\n",
"# Close the Pandas Excel writer and output the Excel file.\n",
"writer.save()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
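
For anyone skimming the notebook above, the core of the pipeline (clean, tokenize, count n-grams, tabulate) condenses to a helper along the lines below. This is an illustrative sketch, not part of the committed code, and it assumes the NLTK `punkt` data has already been downloaded as in the notebook's second cell.

```
import pandas as pd
import nltk
from nltk.util import ngrams
from nltk import FreqDist

def ngram_table(texts, n):
    """Return a DataFrame of n-gram frequencies, most common first.

    `texts` is an iterable of already-cleaned strings (like `clean_tweets`
    in the notebook) and `n` is the n-gram order (1 to 5 above).
    """
    tokens = nltk.word_tokenize(" ".join(texts).lower())
    freqs = FreqDist(ngrams(tokens, n))  # dict of n-gram tuple -> count
    return (pd.DataFrame.from_dict(freqs, orient="index", columns=["freq"])
              .reset_index()
              .sort_values(by="freq", ascending=False))
```

Called as `ngram_table(clean_tweets, 4).head(10)`, it should reproduce the quadgram table shown above (up to the ordering of ties).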