removed stemming, replaced with lemmatization

privacyrespected · Jun 28, 2022 · 573ddcc · 573ddcc
1 parent 8a4afa1
commit 573ddcc
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 22 deletions.
diff --git a/experiments/SL/stemming.ipynb b/experiments/SL/stemming.ipynb
@@ -62,9 +62,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "article\n",
+      "friendship\n",
+      "study\n",
+      "phone\n"
+     ]
+    }
+   ],
    "source": [
     "from nltk import WordNetLemmatizer\n",
     "lemmatizer = WordNetLemmatizer()\n",

diff --git a/experiments/implementation.ipynb b/experiments/implementation.ipynb
@@ -57,20 +57,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Original\n",
-      ", what is the definition of photosynthesi\n",
-      "stemmed\n",
-      ", what is the definition of photosynthesis\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "#this is the first part of the NLP\n",
     "from nltk.stem import PorterStemmer\n",
@@ -113,9 +102,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['', 'what', 'is', 'the', 'definition', 'of', 'photosynthesis']\n",
+      "\n",
+      "what\n",
+      "is\n",
+      "the\n",
+      "definit\n",
+      "of\n",
+      "photosynthes\n"
+     ]
+    }
+   ],
    "source": [
     "#this is the first part of the NLP\n",
     "from string import punctuation\n",
@@ -138,17 +142,84 @@
     "        query=query.replace(x,\"\")\n",
     "    else:\n",
     "        continue\n",
+    "for x in punctuation:\n",
+    "    if x in query:\n",
+    "        query=query.replace(x,\"\")\n",
+    "    else:\n",
+    "        continue\n",
     "#break string\n",
     "new=query.split(\" \")\n",
     "print(new)\n",
     "# PorterStemmer\n",
     "porter = PorterStemmer()\n",
     "# LancasterStemmer\n",
     "lancaster = LancasterStemmer()\n",
-    "print(\"Original\")\n",
-    "print(porter.stem(query))\n",
-    "print(\"stemmed\")\n",
-    "print(lancaster.stem(query))"
+    "for x in new:\n",
+    "    print(lancaster.stem(x))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Stemming does not look usable here: it cuts words down to non-words such as `definit` and `photosynthes`, so the results are nonsensical. We should therefore continue the trial with lemmatization instead."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "what\n",
+      "is\n",
+      "the\n",
+      "definition\n",
+      "of\n",
+      "photosynthesis\n"
+     ]
+    }
+   ],
+   "source": [
+    "from nltk import WordNetLemmatizer\n",
+    "lemmatizer = WordNetLemmatizer()\n",
+    "query=\"Alpha, what is the definitions of photosynthesis\"\n",
+    "query=query.lower()\n",
+    "extrawords=[\n",
+    "    \"alpha\",\n",
+    "    \"hello\"\n",
+    "]\n",
+    "punctuation=[ #can be replaced with chatterbot corpus maybe?\n",
+    "    \",\",\n",
+    "    \".\",\n",
+    "    \"?\",\n",
+    "    \"!\"\n",
+    "]\n",
+    "for x in extrawords:\n",
+    "    if x in query:\n",
+    "        query=query.replace(x,\"\")\n",
+    "    else:\n",
+    "        continue\n",
+    "for x in punctuation:\n",
+    "    if x in query:\n",
+    "        query=query.replace(x,\"\")\n",
+    "    else:\n",
+    "        continue\n",
+    "#break string\n",
+    "words=query.split(\" \")\n",
+    "for word in words:\n",
+    "    print(lemmatizer.lemmatize(word))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As shown, the query comes out a lot cleaner in this scenario. Now we just need to join the split words back into one single string for easier processing."
    ]
   }
  ],

diff --git a/experiments/speechrecog.ipynb b/experiments/speechrecog.ipynb
@@ -137,7 +137,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Task: Explore potential in implementation"
+    "Task: Explore potential in implementation"
    ]
   }
  ],