Skip to content

Commit

Permalink
grouped counts
Browse files Browse the repository at this point in the history
  • Loading branch information
Kriti Chandak committed Feb 27, 2024
1 parent 706c34c commit 81da9e7
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 30 deletions.
50 changes: 25 additions & 25 deletions Visualization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,14 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7f3739a81544457f83a9a2ed8af84a6f",
"model_id": "73ef2ff24bf3487d9d37babc6a017293",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FigureWidget({\n",
" 'data': [{'type': 'bar',\n",
" 'uid': '2ce7fbcc-e9e3-4325-8183-a7fc0a5ac065',\n",
" 'uid': '2415e166-8646-4d5c-aaff-1ed05e5ab8a4',\n",
" 'width': 0.3,\n",
" 'x': [wait for data],\n",
" 'y': [0]}],\n",
Expand Down Expand Up @@ -126,14 +126,14 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6a72cfa2d14948d6aa06f910b56aa37c",
"model_id": "5c2a6e505e264d30bc8e664ca55a861e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FigureWidget({\n",
" 'data': [{'type': 'bar',\n",
" 'uid': '0b3a1870-8ac1-4fe5-a0ad-fe371d4991b0',\n",
" 'uid': '122f28f0-bce8-483e-a563-9bc93485faba',\n",
" 'width': 0.3,\n",
" 'x': [wait for data],\n",
" 'y': [0]}],\n",
Expand Down Expand Up @@ -186,14 +186,14 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7848fc3da5214ed9a2b38bf187320bb4",
"model_id": "0a2e6ed5c3834e2a911d5a613fe28f73",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FigureWidget({\n",
" 'data': [{'type': 'bar',\n",
" 'uid': 'bf590879-f13b-461d-b716-6d22b56bc524',\n",
" 'uid': '9e4d769a-a514-49ef-bf40-ae3cc93122f1',\n",
" 'width': 0.3,\n",
" 'x': [wait for data],\n",
" 'y': [0]}],\n",
Expand Down Expand Up @@ -246,14 +246,14 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "74c38533a7894261b3367d9b64ae5629",
"model_id": "8878ab396738438aa0a2c0b5f1e57259",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FigureWidget({\n",
" 'data': [{'type': 'bar',\n",
" 'uid': '8eeb94e4-737f-4d3f-9ade-d7edef1a8ec7',\n",
" 'uid': 'abb0f4cb-ce3a-4755-9f09-e7452374ff47',\n",
" 'width': 0.3,\n",
" 'x': [wait for data],\n",
" 'y': [0]}],\n",
Expand All @@ -280,20 +280,7 @@
"execution_count": 11,
"id": "b91fd8d7-3e8a-423b-a7db-4340e1580beb",
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "list assignment index out of range",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[11], line 5\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m df_slice \u001b[38;5;129;01min\u001b[39;00m df_list:\n\u001b[1;32m 4\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m0.5\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[43mgroup_by_sum_ola\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess_slice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_slice\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/CS598-MP1-OLA/ola.py:190\u001b[0m, in \u001b[0;36mGroupBySumOla.process_slice\u001b[0;34m(self, df_slice)\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 188\u001b[0m \u001b[38;5;66;03m# Initialize running sums\u001b[39;00m\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msums[group] \u001b[38;5;241m=\u001b[39m sums\n\u001b[0;32m--> 190\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimates\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msums[group] \u001b[38;5;241m*\u001b[39m multiplier\n\u001b[1;32m 192\u001b[0m \u001b[38;5;66;03m# Update the plot\u001b[39;00m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;66;03m# hint: self.update_widget(*list of groups*, *list of estimated grouped sums of sum_col*)\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupdate_widget(\u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimates\u001b[38;5;241m.\u001b[39mkeys()), \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimates\u001b[38;5;241m.\u001b[39mvalues()))\n",
"\u001b[0;31mIndexError\u001b[0m: list assignment index out of range"
]
}
],
"outputs": [],
"source": [
"# Incrementally process the slices. The plot should update during the processing.\n",
"group_by_sum_ola = GroupBySumOla(group_by_sum_widget, len(df), \"shop_id\", \"item_cnt_day\")\n",
Expand All @@ -319,14 +306,14 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "90e50760c93b49af8ae2427895c9a267",
"model_id": "aa09c720018840b19013b6be5b16be57",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FigureWidget({\n",
" 'data': [{'type': 'bar',\n",
" 'uid': '47b369fb-5eb1-4fff-9174-e6d212f42ea6',\n",
" 'uid': 'b5937a9c-4501-44b6-a582-fe83f473566d',\n",
" 'width': 0.3,\n",
" 'x': [wait for data],\n",
" 'y': [0]}],\n",
Expand Down Expand Up @@ -361,7 +348,20 @@
"execution_count": 13,
"id": "dce377e4-5615-4407-915d-fd13a7099f71",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "AttributeError",
"evalue": "'GroupByCountOla' object has no attribute 'mean_col'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[13], line 5\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m df_slice \u001b[38;5;129;01min\u001b[39;00m df_list:\n\u001b[1;32m 4\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m0.5\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[43mgroup_by_count_ola\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess_slice\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_slice\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/CS598-MP1-OLA/ola.py:222\u001b[0m, in \u001b[0;36mGroupByCountOla.process_slice\u001b[0;34m(self, df_slice)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 219\u001b[0m \u001b[38;5;124;03m Update the running grouped counts with a dataframe slice.\u001b[39;00m\n\u001b[1;32m 220\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;66;03m# Implement me!\u001b[39;00m\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcount \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m df_slice\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroupby_col)\u001b[38;5;241m.\u001b[39mcount()[\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean_col\u001b[49m]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 223\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindexes \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m df_slice\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroupby_col)\u001b[38;5;241m.\u001b[39mcount()[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmean_col]\u001b[38;5;241m.\u001b[39mindex\n\u001b[1;32m 225\u001b[0m \u001b[38;5;66;03m# Update the plot\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;66;03m# hint: self.update_widget(*list of groups*, *list of estimated group counts of count_col*)\u001b[39;00m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'GroupByCountOla' object has no attribute 'mean_col'"
]
}
],
"source": [
"# Incrementally process the slices. The plot should update during the processing.\n",
"group_by_count_ola = GroupByCountOla(group_by_count_widget, len(df), \"shop_id\", \"item_cnt_day\")\n",
Expand Down
29 changes: 24 additions & 5 deletions ola.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,20 +211,39 @@ def __init__(self, widget: go.FigureWidget, original_df_num_rows: int, groupby_c
self.count_col = count_col

# Put any other bookkeeping class variables you need here...
self.count = []
self.indexes = []
self.counts = {}
self.seen_rows = 0
self.estimates = {}

def process_slice(self, df_slice: pd.DataFrame) -> None:
    """
    Update the running grouped counts with a dataframe slice.

    Counts the non-null ``self.count_col`` values per ``self.groupby_col``
    group in ``df_slice``, adds them to the running per-group totals in
    ``self.counts``, scales every running total up to an estimate for the
    full dataframe, and pushes the estimates to the plot.

    Args:
        df_slice: The next chunk of rows from the original dataframe. It
            must contain the ``self.groupby_col`` and ``self.count_col``
            columns.
    """
    slice_counts = df_slice.groupby(self.groupby_col)[self.count_col].count()

    self.seen_rows += len(df_slice)
    if self.seen_rows == 0:
        # No rows seen so far (only empty slices): there is nothing to
        # estimate, and the scale factor below would divide by zero.
        return

    # Scale factor from "rows seen so far" to the whole dataframe, i.e. the
    # inverse of the fraction of the dataframe processed.
    multiplier = self.original_df_num_rows / self.seen_rows

    # Fold this slice's per-group counts into the running totals.
    for group, count in slice_counts.items():
        self.counts[group] = self.counts.get(group, 0) + count

    # Re-scale EVERY known group, not just the ones in this slice: the
    # multiplier changes with each slice, so a group that is absent from the
    # current slice would otherwise keep a stale, over-scaled estimate.
    for group, running_count in self.counts.items():
        self.estimates[group] = running_count * multiplier

    # Update the plot
    # hint: self.update_widget(*list of groups*, *list of estimated group counts of count_col*)
    self.update_widget(list(self.estimates.keys()), list(self.estimates.values()))


class FilterDistinctOla(OLA):
Expand Down

0 comments on commit 81da9e7

Please sign in to comment.