initial commit

kurt213 · Feb 11, 2022 · 6f7fef6 · 6f7fef6
commit 6f7fef6
Show file tree

Hide file tree

Showing 7 changed files with 15,338 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.xlsx
diff --git a/README.MD b/README.MD
@@ -0,0 +1,15 @@
+# CSV to N-grams demo (+ `pyinstaller` example)
+
+A simple demo script I created for creating N grams.
+
+Included an example `input.csv`. If you want to use your own data, you will want to make sure you are using the same format (`.csv` file, single column, no header).
+
+Once libraries installed from `requirements.txt`, you can either run:
+- `demo_script.py`
+- `demo_ngrams.ipynb` - this notebook shows the step-by-step outputs from the main script. 
+
+It's also packageable into a single executable with `pyinstaller`. Just run:
+
+```
+pyinstaller --onefile --add-data "input.csv:." demo_script.py
+```
diff --git a/create_exe.txt b/create_exe.txt
@@ -0,0 +1 @@
+pyinstaller --onefile --add-data "input.csv:." demo_script.py
diff --git a/demo_ngrams.ipynb b/demo_ngrams.ipynb
@@ -0,0 +1,310 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import re\n",
+    "import xlsxwriter\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.util import ngrams\n",
+    "from nltk import FreqDist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /Users/kurt/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 56,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Download for tokenize functionality\n",
+    "\n",
+    "nltk.download('punkt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0                  @VirginAmerica What @dhepburn said.\n",
+       "1    @VirginAmerica plus you've added commercials t...\n",
+       "2    @VirginAmerica I didn't today... Must mean I n...\n",
+       "3    @VirginAmerica it's really aggressive to blast...\n",
+       "4    @VirginAmerica and it's a really big bad thing...\n",
+       "Name: 0, dtype: object"
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Import csv\n",
+    "df = pd.read_csv(\"input.csv\", header=None)\n",
+    "\n",
+    "# Get Series\n",
+    "tweet_list = df[0]\n",
+    "tweet_list.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'VirginAmerica its really aggressive to blast obnoxious entertainment in your guests faces and they have little recourse'"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Clean tweets - removes any HTML and punctuation\n",
+    "\n",
+    "clean_tweets = []\n",
+    "\n",
+    "for t in tweet_list:\n",
+    "    t = re.sub(r'(<br />)|(<br>)|(\\\\n)|(\\\\r)|-', \" \", str(t))\n",
+    "    t = re.sub(r'[\\.,()+…?!\"/:@\\'“]', \"\", str(t))\n",
+    "    t = re.sub(r'(&amp;)', \"and\", str(t))\n",
+    "    t = \" \".join(t.split())\n",
+    "    clean_tweets.append(t)\n",
+    "    \n",
+    "clean_tweets[3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Join into one string and change to lower case\n",
+    "# Tokenize \n",
+    "\n",
+    "all_tweets = \" \".join(clean_tweets).lower()\n",
+    "token_tweets = nltk.word_tokenize(all_tweets)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create ngrams and frequency distributions\n",
+    "\n",
+    "tweet_sgrams = ngrams(token_tweets, 1)\n",
+    "tweet_bigrams = ngrams(token_tweets, 2)\n",
+    "tweet_trigrams = ngrams(token_tweets, 3)\n",
+    "tweet_quadgrams = ngrams(token_tweets, 4)\n",
+    "tweet_pentgrams = ngrams(token_tweets, 5)\n",
+    "\n",
+    "tweet_fds = FreqDist(tweet_sgrams)\n",
+    "tweet_fdb = FreqDist(tweet_bigrams)\n",
+    "tweet_fdt = FreqDist(tweet_trigrams)\n",
+    "tweet_fdq = FreqDist(tweet_quadgrams)\n",
+    "tweet_fdp = FreqDist(tweet_pentgrams)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert to DF\n",
+    "\n",
+    "df_tweet_sgram = pd.DataFrame.from_dict(tweet_fds, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n",
+    "df_tweet_bigram = pd.DataFrame.from_dict(tweet_fdb, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n",
+    "df_tweet_trigram = pd.DataFrame.from_dict(tweet_fdt, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n",
+    "df_tweet_quadgram = pd.DataFrame.from_dict(tweet_fdq, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)\n",
+    "df_tweet_pentgram = pd.DataFrame.from_dict(tweet_fdp, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>index</th>\n",
+       "      <th>freq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>116501</th>\n",
+       "      <td>(our, fleets, on, fleek)</td>\n",
+       "      <td>144</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>116500</th>\n",
+       "      <td>(jetblue, our, fleets, on)</td>\n",
+       "      <td>137</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36291</th>\n",
+       "      <td>(been, on, hold, for)</td>\n",
+       "      <td>113</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19079</th>\n",
+       "      <td>(ive, been, on, hold)</td>\n",
+       "      <td>59</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>117481</th>\n",
+       "      <td>(rt, jetblue, our, fleets)</td>\n",
+       "      <td>50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2908</th>\n",
+       "      <td>(thank, you, for, the)</td>\n",
+       "      <td>50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10877</th>\n",
+       "      <td>(flight, was, cancelled, flightled)</td>\n",
+       "      <td>48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24983</th>\n",
+       "      <td>(on, hold, for, over)</td>\n",
+       "      <td>43</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14420</th>\n",
+       "      <td>(for, over, an, hour)</td>\n",
+       "      <td>41</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3229</th>\n",
+       "      <td>(cancelled, flightled, my, flight)</td>\n",
+       "      <td>30</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                      index  freq\n",
+       "116501             (our, fleets, on, fleek)   144\n",
+       "116500           (jetblue, our, fleets, on)   137\n",
+       "36291                 (been, on, hold, for)   113\n",
+       "19079                 (ive, been, on, hold)    59\n",
+       "117481           (rt, jetblue, our, fleets)    50\n",
+       "2908                 (thank, you, for, the)    50\n",
+       "10877   (flight, was, cancelled, flightled)    48\n",
+       "24983                 (on, hold, for, over)    43\n",
+       "14420                 (for, over, an, hour)    41\n",
+       "3229     (cancelled, flightled, my, flight)    30"
+      ]
+     },
+     "execution_count": 62,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_tweet_quadgram.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a Pandas Excel writer using XlsxWriter as the engine.\n",
+    "writer = pd.ExcelWriter('ngrams_output.xlsx', engine='xlsxwriter')\n",
+    "\n",
+    "# Write each dataframe to a different worksheet.\n",
+    "df_tweet_sgram.to_excel(writer, sheet_name='Book-Single')\n",
+    "df_tweet_bigram.to_excel(writer, sheet_name='Book-Bigram')\n",
+    "df_tweet_trigram.to_excel(writer, sheet_name='Book-Trigram')\n",
+    "df_tweet_quadgram.to_excel(writer, sheet_name='Book-Quadgram')\n",
+    "df_tweet_pentgram.to_excel(writer, sheet_name='Book-Pentgram')\n",
+    "\n",
+    "# Close the Pandas Excel writer and output the Excel file.\n",
+    "writer.save()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		pyinstaller --onefile --add-data "input.csv:." demo_script.py