From 81da9e7363b9d9aa66f26cbb31ade92623adecd5 Mon Sep 17 00:00:00 2001 From: Kriti Chandak Date: Tue, 27 Feb 2024 16:27:02 -0600 Subject: [PATCH] grouped counts --- Visualization.ipynb | 50 ++++++++++++++++++++++----------------------- ola.py | 29 +++++++++++++++++++++----- 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/Visualization.ipynb b/Visualization.ipynb index a62cdc2..ae75e6f 100644 --- a/Visualization.ipynb +++ b/Visualization.ipynb @@ -65,14 +65,14 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7f3739a81544457f83a9a2ed8af84a6f", + "model_id": "73ef2ff24bf3487d9d37babc6a017293", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FigureWidget({\n", " 'data': [{'type': 'bar',\n", - " 'uid': '2ce7fbcc-e9e3-4325-8183-a7fc0a5ac065',\n", + " 'uid': '2415e166-8646-4d5c-aaff-1ed05e5ab8a4',\n", " 'width': 0.3,\n", " 'x': [wait for data],\n", " 'y': [0]}],\n", @@ -126,14 +126,14 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6a72cfa2d14948d6aa06f910b56aa37c", + "model_id": "5c2a6e505e264d30bc8e664ca55a861e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FigureWidget({\n", " 'data': [{'type': 'bar',\n", - " 'uid': '0b3a1870-8ac1-4fe5-a0ad-fe371d4991b0',\n", + " 'uid': '122f28f0-bce8-483e-a563-9bc93485faba',\n", " 'width': 0.3,\n", " 'x': [wait for data],\n", " 'y': [0]}],\n", @@ -186,14 +186,14 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7848fc3da5214ed9a2b38bf187320bb4", + "model_id": "0a2e6ed5c3834e2a911d5a613fe28f73", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FigureWidget({\n", " 'data': [{'type': 'bar',\n", - " 'uid': 'bf590879-f13b-461d-b716-6d22b56bc524',\n", + " 'uid': '9e4d769a-a514-49ef-bf40-ae3cc93122f1',\n", " 'width': 0.3,\n", " 'x': [wait for data],\n", " 'y': [0]}],\n", @@ -246,14 +246,14 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "74c38533a7894261b3367d9b64ae5629", + "model_id": "8878ab396738438aa0a2c0b5f1e57259", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FigureWidget({\n", " 'data': [{'type': 'bar',\n", - " 'uid': '8eeb94e4-737f-4d3f-9ade-d7edef1a8ec7',\n", + " 'uid': 'abb0f4cb-ce3a-4755-9f09-e7452374ff47',\n", " 'width': 0.3,\n", " 'x': [wait for data],\n", " 'y': [0]}],\n", @@ -280,20 +280,7 @@ "execution_count": 11, "id": "b91fd8d7-3e8a-423b-a7db-4340e1580beb", "metadata": {}, - "outputs": [ - { - "ename": "IndexError", - "evalue": "list assignment index out of range", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 5\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m df_slice \u001b[38;5;129;01min\u001b[39;00m df_list:\n\u001b[1;32m 4\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m0.5\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[43mgroup_by_sum_ola\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess_slice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_slice\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/CS598-MP1-OLA/ola.py:190\u001b[0m, in \u001b[0;36mGroupBySumOla.process_slice\u001b[0;34m(self, df_slice)\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 188\u001b[0m \u001b[38;5;66;03m# Initialize running sums\u001b[39;00m\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msums[group] \u001b[38;5;241m=\u001b[39m sums\n\u001b[0;32m--> 190\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimates\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msums[group] \u001b[38;5;241m*\u001b[39m multiplier\n\u001b[1;32m 192\u001b[0m \u001b[38;5;66;03m# Update the plot\u001b[39;00m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;66;03m# hint: self.update_widget(*list of groups*, *list of estimated grouped sums of sum_col*)\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupdate_widget(\u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimates\u001b[38;5;241m.\u001b[39mkeys()), \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimates\u001b[38;5;241m.\u001b[39mvalues()))\n", - "\u001b[0;31mIndexError\u001b[0m: list assignment index out of range" - ] - } - ], + "outputs": [], "source": [ "# Incrementally process the slices. The plot should update during the processing.\n", "group_by_sum_ola = GroupBySumOla(group_by_sum_widget, len(df), \"shop_id\", \"item_cnt_day\")\n", @@ -319,14 +306,14 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "90e50760c93b49af8ae2427895c9a267", + "model_id": "aa09c720018840b19013b6be5b16be57", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FigureWidget({\n", " 'data': [{'type': 'bar',\n", - " 'uid': '47b369fb-5eb1-4fff-9174-e6d212f42ea6',\n", + " 'uid': 'b5937a9c-4501-44b6-a582-fe83f473566d',\n", " 'width': 0.3,\n", " 'x': [wait for data],\n", " 'y': [0]}],\n", @@ -361,7 +348,20 @@ "execution_count": 13, "id": "dce377e4-5615-4407-915d-fd13a7099f71", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'GroupByCountOla' object has no attribute 'mean_col'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 5\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m df_slice \u001b[38;5;129;01min\u001b[39;00m df_list:\n\u001b[1;32m 4\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m0.5\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[43mgroup_by_count_ola\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess_slice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_slice\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/CS598-MP1-OLA/ola.py:222\u001b[0m, in \u001b[0;36mGroupByCountOla.process_slice\u001b[0;34m(self, df_slice)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 219\u001b[0m \u001b[38;5;124;03m Update the running grouped counts with a dataframe slice.\u001b[39;00m\n\u001b[1;32m 220\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;66;03m# Implement me!\u001b[39;00m\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcount \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m df_slice\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroupby_col)\u001b[38;5;241m.\u001b[39mcount()[\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean_col\u001b[49m]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindexes \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m df_slice\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroupby_col)\u001b[38;5;241m.\u001b[39mcount()[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmean_col]\u001b[38;5;241m.\u001b[39mindex\n\u001b[1;32m 225\u001b[0m \u001b[38;5;66;03m# Update the plot\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;66;03m# hint: self.update_widget(*list of groups*, *list of estimated group counts of count_col*)\u001b[39;00m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'GroupByCountOla' object has no attribute 'mean_col'" + ] + } + ], "source": [ "# Incrementally process the slices. The plot should update during the processing.\n", "group_by_count_ola = GroupByCountOla(group_by_count_widget, len(df), \"shop_id\", \"item_cnt_day\")\n", diff --git a/ola.py b/ola.py index febc6cc..088de32 100644 --- a/ola.py +++ b/ola.py @@ -211,20 +211,39 @@ def __init__(self, widget: go.FigureWidget, original_df_num_rows: int, groupby_c self.count_col = count_col # Put any other bookkeeping class variables you need here... - self.count = [] - self.indexes = [] + self.counts = {} + self.seen_rows = 0 + self.estimates = {} def process_slice(self, df_slice: pd.DataFrame) -> None: """ Update the running grouped counts with a dataframe slice. """ # Implement me! - self.count += df_slice.groupby(self.groupby_col).count()[self.mean_col].values - self.indexes += df_slice.groupby(self.groupby_col).count()[self.mean_col].index + grouped_counts = df_slice.groupby(self.groupby_col)[self.count_col].count() + + self.seen_rows += len(df_slice) + + # Calculate the percentage of the dataframe seen + multiplier = 1 / (self.seen_rows / self.original_df_num_rows) + + + # Update running sums for each group + for group, counts in grouped_counts.items(): + if group in self.counts: + # Update running sums + self.counts[group] += counts + self.estimates[group] = self.counts[group] * multiplier + + else: + # Initialize running sums + self.counts[group] = counts + self.estimates[group] = self.counts[group] * multiplier + # Update the plot # hint: self.update_widget(*list of groups*, *list of estimated group counts of count_col*) - self.update_widget(self.indexes, self.count) + self.update_widget(list(self.estimates.keys()), list(self.estimates.values())) class FilterDistinctOla(OLA):