grouped counts

kritic3 · Feb 27, 2024 · 81da9e7 · 81da9e7
1 parent 706c34c
commit 81da9e7
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 30 deletions.
diff --git a/Visualization.ipynb b/Visualization.ipynb
@@ -65,14 +65,14 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7f3739a81544457f83a9a2ed8af84a6f",
+       "model_id": "73ef2ff24bf3487d9d37babc6a017293",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
        "FigureWidget({\n",
        "    'data': [{'type': 'bar',\n",
-       "              'uid': '2ce7fbcc-e9e3-4325-8183-a7fc0a5ac065',\n",
+       "              'uid': '2415e166-8646-4d5c-aaff-1ed05e5ab8a4',\n",
        "              'width': 0.3,\n",
        "              'x': [wait for data],\n",
        "              'y': [0]}],\n",
@@ -126,14 +126,14 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6a72cfa2d14948d6aa06f910b56aa37c",
+       "model_id": "5c2a6e505e264d30bc8e664ca55a861e",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
        "FigureWidget({\n",
        "    'data': [{'type': 'bar',\n",
-       "              'uid': '0b3a1870-8ac1-4fe5-a0ad-fe371d4991b0',\n",
+       "              'uid': '122f28f0-bce8-483e-a563-9bc93485faba',\n",
        "              'width': 0.3,\n",
        "              'x': [wait for data],\n",
        "              'y': [0]}],\n",
@@ -186,14 +186,14 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7848fc3da5214ed9a2b38bf187320bb4",
+       "model_id": "0a2e6ed5c3834e2a911d5a613fe28f73",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
        "FigureWidget({\n",
        "    'data': [{'type': 'bar',\n",
-       "              'uid': 'bf590879-f13b-461d-b716-6d22b56bc524',\n",
+       "              'uid': '9e4d769a-a514-49ef-bf40-ae3cc93122f1',\n",
        "              'width': 0.3,\n",
        "              'x': [wait for data],\n",
        "              'y': [0]}],\n",
@@ -246,14 +246,14 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "74c38533a7894261b3367d9b64ae5629",
+       "model_id": "8878ab396738438aa0a2c0b5f1e57259",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
        "FigureWidget({\n",
        "    'data': [{'type': 'bar',\n",
-       "              'uid': '8eeb94e4-737f-4d3f-9ade-d7edef1a8ec7',\n",
+       "              'uid': 'abb0f4cb-ce3a-4755-9f09-e7452374ff47',\n",
        "              'width': 0.3,\n",
        "              'x': [wait for data],\n",
        "              'y': [0]}],\n",
@@ -280,20 +280,7 @@
    "execution_count": 11,
    "id": "b91fd8d7-3e8a-423b-a7db-4340e1580beb",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "IndexError",
-     "evalue": "list assignment index out of range",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[11], line 5\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m df_slice \u001b[38;5;129;01min\u001b[39;00m df_list:\n\u001b[1;32m      4\u001b[0m     time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m0.5\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m     \u001b[43mgroup_by_sum_ola\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess_slice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_slice\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/CS598-MP1-OLA/ola.py:190\u001b[0m, in \u001b[0;36mGroupBySumOla.process_slice\u001b[0;34m(self, df_slice)\u001b[0m\n\u001b[1;32m    187\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    188\u001b[0m         \u001b[38;5;66;03m# Initialize running sums\u001b[39;00m\n\u001b[1;32m    189\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msums[group] \u001b[38;5;241m=\u001b[39m sums\n\u001b[0;32m--> 190\u001b[0m         \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimates\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msums[group] \u001b[38;5;241m*\u001b[39m multiplier\n\u001b[1;32m    192\u001b[0m \u001b[38;5;66;03m# Update the plot\u001b[39;00m\n\u001b[1;32m    193\u001b[0m \u001b[38;5;66;03m# hint: self.update_widget(*list of groups*, *list of estimated grouped sums of sum_col*)\u001b[39;00m\n\u001b[1;32m    194\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupdate_widget(\u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimates\u001b[38;5;241m.\u001b[39mkeys()), \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimates\u001b[38;5;241m.\u001b[39mvalues()))\n",
-      "\u001b[0;31mIndexError\u001b[0m: list assignment index out of range"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Incrementally process the slices. The plot should update during the processing.\n",
     "group_by_sum_ola = GroupBySumOla(group_by_sum_widget, len(df), \"shop_id\", \"item_cnt_day\")\n",
@@ -319,14 +306,14 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "90e50760c93b49af8ae2427895c9a267",
+       "model_id": "aa09c720018840b19013b6be5b16be57",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
        "FigureWidget({\n",
        "    'data': [{'type': 'bar',\n",
-       "              'uid': '47b369fb-5eb1-4fff-9174-e6d212f42ea6',\n",
+       "              'uid': 'b5937a9c-4501-44b6-a582-fe83f473566d',\n",
        "              'width': 0.3,\n",
        "              'x': [wait for data],\n",
        "              'y': [0]}],\n",
@@ -361,7 +348,20 @@
    "execution_count": 13,
    "id": "dce377e4-5615-4407-915d-fd13a7099f71",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'GroupByCountOla' object has no attribute 'mean_col'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[13], line 5\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m df_slice \u001b[38;5;129;01min\u001b[39;00m df_list:\n\u001b[1;32m      4\u001b[0m     time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m0.5\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m     \u001b[43mgroup_by_count_ola\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess_slice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_slice\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/CS598-MP1-OLA/ola.py:222\u001b[0m, in \u001b[0;36mGroupByCountOla.process_slice\u001b[0;34m(self, df_slice)\u001b[0m\n\u001b[1;32m    218\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    219\u001b[0m \u001b[38;5;124;03m    Update the running grouped counts with a dataframe slice.\u001b[39;00m\n\u001b[1;32m    220\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    221\u001b[0m \u001b[38;5;66;03m# Implement me!\u001b[39;00m\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcount \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m df_slice\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroupby_col)\u001b[38;5;241m.\u001b[39mcount()[\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean_col\u001b[49m]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m    223\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindexes \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m df_slice\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroupby_col)\u001b[38;5;241m.\u001b[39mcount()[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmean_col]\u001b[38;5;241m.\u001b[39mindex\n\u001b[1;32m    225\u001b[0m \u001b[38;5;66;03m# Update the plot\u001b[39;00m\n\u001b[1;32m    226\u001b[0m \u001b[38;5;66;03m# hint: self.update_widget(*list of groups*, *list of estimated group counts of count_col*)\u001b[39;00m\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'GroupByCountOla' object has no attribute 'mean_col'"
+     ]
+    }
+   ],
    "source": [
     "# Incrementally process the slices. The plot should update during the processing.\n",
     "group_by_count_ola = GroupByCountOla(group_by_count_widget, len(df), \"shop_id\", \"item_cnt_day\")\n",

diff --git a/ola.py b/ola.py
@@ -211,20 +211,39 @@ def __init__(self, widget: go.FigureWidget, original_df_num_rows: int, groupby_c
         self.count_col = count_col
 
         # Put any other bookkeeping class variables you need here...
-        self.count = []
-        self.indexes = []
+        self.counts = {}
+        self.seen_rows = 0
+        self.estimates = {}
 
     def process_slice(self, df_slice: pd.DataFrame) -> None:
         """
             Update the running grouped counts with a dataframe slice.
         """
         # Implement me!
-        self.count += df_slice.groupby(self.groupby_col).count()[self.mean_col].values
-        self.indexes += df_slice.groupby(self.groupby_col).count()[self.mean_col].index
+        grouped_counts = df_slice.groupby(self.groupby_col)[self.count_col].count()
+
+        self.seen_rows += len(df_slice)  
+
+        # Scale-up factor: the inverse of the fraction of the dataframe seen so far
+        multiplier  = 1 / (self.seen_rows / self.original_df_num_rows)
+
+
+        # Update running counts for each group
+        for group, counts in grouped_counts.items():
+            if group in self.counts:
+                # Update running counts
+                self.counts[group] += counts
+                self.estimates[group] = self.counts[group] * multiplier
+
+            else:
+                # Initialize running counts
+                self.counts[group] = counts
+                self.estimates[group] = self.counts[group] * multiplier
+
 
         # Update the plot
         # hint: self.update_widget(*list of groups*, *list of estimated group counts of count_col*)
-        self.update_widget(self.indexes, self.count)
+        self.update_widget(list(self.estimates.keys()), list(self.estimates.values()))
 
 
 class FilterDistinctOla(OLA):