Skip to content

Commit

Permalink
Update cells CSV file
Browse files Browse the repository at this point in the history
  • Loading branch information
s2t2 committed Dec 14, 2023
1 parent 6963f2c commit a036e69
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 25 deletions.
16 changes: 16 additions & 0 deletions app/document_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,22 @@
FIG_SHOW = bool(os.getenv("FIG_SHOW", default="true") == "true")


# TODO: consider moving this printing helper into a separate utilities module.
def print_docs(docs, meta=False):
    """Print a truncated preview of each document's content.

    For every document this prints the first 50 characters of
    ``page_content``, a literal ``...`` separator, and the last 25
    characters, all on one line. The two slices are taken independently,
    so content shorter than 75 characters appears (partly) in both halves.

    Args:
        docs: Iterable of objects exposing a sliceable ``page_content``
            attribute and, when ``meta`` is true, a ``metadata`` attribute.
        meta: When true, also print each document's ``metadata`` on the
            line following its preview.
    """
    for doc in docs:
        content = doc.page_content
        print(content[0:50], "...", content[-25:])
        if meta:
            print(doc.metadata)

# TODO: consider moving this printing helper into a separate utilities module.
def print_rows(rows):
    """Print a truncated preview of the ``page_content`` of each row.

    For every row this prints the first 50 characters of the row's
    ``"page_content"`` value, a literal ``...`` separator, and the last 25
    characters, all on one line.

    Args:
        rows: Object whose ``iterrows()`` yields ``(index, row)`` pairs where
            ``row["page_content"]`` is sliceable text (e.g. a pandas
            DataFrame with a "page_content" column).
    """
    for _, row in rows.iterrows():
        content = row["page_content"]
        print(content[0:50], "...", content[-25:])



#class Cell(Document):
# #def metadata(self):
# # meta = super().metadata
Expand Down
14 changes: 4 additions & 10 deletions app/jobs/starter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
from warnings import filterwarnings
filterwarnings("ignore")

from app import DATA_DIRPATH
from app.prompts import STUDENT_QUERY
from app.submissions_manager import SubmissionsManager
from app.document_processor import DocumentProcessor #, print_docs
from app.document_processor import DocumentProcessor, print_docs

#from pandas import pivot_table

from pandas import pivot_table

def print_docs(docs, meta=False):
    """Print a truncated preview of each document's content.

    For every document this prints the first 50 characters of
    ``page_content``, a literal ``...`` separator, and the last 25
    characters, all on one line.

    Args:
        docs: Iterable of objects exposing a sliceable ``page_content``
            attribute and, when ``meta`` is true, a ``metadata`` attribute.
        meta: When true, also print each document's ``metadata`` on the
            line following its preview.
    """
    for doc in docs:
        content = doc.page_content
        print(content[0:50], "...", content[-25:])
        if meta:
            print(doc.metadata)

def print_relevant_cells(cells):
total_length = 0
Expand Down Expand Up @@ -65,7 +60,6 @@ def print_relevant_cells(cells):
if keep_going != "Y":
exit()

STUDENT_QUERY = "What is the student's name? What is their GW ID?"
print("QUERY:", STUDENT_QUERY)

relevant_docs = dp.text_retriever.get_relevant_documents(STUDENT_QUERY)
Expand Down
55 changes: 40 additions & 15 deletions app/jobs/submissions.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from warnings import filterwarnings
filterwarnings("ignore")

import os
from pandas import DataFrame, merge
import plotly.express as px

from app import DATA_DIRPATH
from app.submissions_manager import SubmissionsManager
from app.document_processor import DocumentProcessor, print_docs
from app.document_processor import DocumentProcessor, print_docs, print_rows
from app.colors import CELL_COLORS_MAP


EMPTY_CODE_CELL = "'code' cell: '[]'"
EMPTY_TEXT_CELL = "'markdown' cell: '[]'"

Expand Down Expand Up @@ -44,13 +47,13 @@
}
records.append(record)
all_cells += dp.cells
print(len(records))
print(len(all_cells))
print("NOTEBOOKS:", len(records))
print("CELLS:", len(all_cells))

notebooks_df = DataFrame(records)
notebooks_df.index = notebooks_df["notebook"]
notebooks_df.drop(columns=["notebook"], inplace=True)
#notebooks_df.to_csv("hw_4_notebooks.csv")
notebooks_df.to_csv(os.path.join(DATA_DIRPATH, "notebooks.csv"), index=False)
#notebooks_df.head()

chart_df = notebooks_df.copy()
Expand All @@ -65,31 +68,55 @@

cells_df = DataFrame([cell.metadata for cell in all_cells])

# ... DUPLICATED CONTENT ANALYSIS

cells_df['dup_content'] = cells_df.duplicated(subset='page_content', keep=False)
print("------")
print(cells_df["dup_content"].value_counts())

starter_rows = cells_df[ cells_df["filename"].str.contains("STARTER") ]
# ... STARTER CONTENT DIFFING (~30% of cells are the same as starter cells)

#starter_rows = cells_df[ cells_df["filename"].str.contains("STARTER") ]
starter_rows = cells_df[ cells_df["filename"] == starter_dp.filename ]
cells_df = merge(cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter'))
cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True)
cells_df["starter_content"] = cells_df['starter_cell_id'].notna()
print("------")
print(cells_df["starter_content"].value_counts())

# ... EMPTY / BLANK CONTENT FILTERING
cells_df["is_empty"] = cells_df["page_content"].map(lambda page_content: True if page_content.strip() in [EMPTY_CODE_CELL, EMPTY_TEXT_CELL] else False)
print("------")
print(cells_df["is_empty"].value_counts())

print("------")
print("DUPLICATE NON-STARTER NON-BLANK CELLS:")
#dup_rows = cells_df[ (cells_df["starter_content"] == False) & (cells_df["dup_content"] == True) & (cells_df["is_empty"] == False)]
dup_rows = cells_df[ (cells_df["starter_content"] == False) & (cells_df["dup_content"] == True) & (cells_df["is_empty"] == False)].sort_values(by="page_content")
#for row in dup_rows:
# print(row)
print_rows(dup_rows)

cells_df.to_csv(os.path.join(DATA_DIRPATH, "all_cells.csv"), index=False)



# PLOTTING...

print("NON-STARTER DUP CELLS:")
nonstarter_dups = cells_df[ (cells_df["dup_content"] == True) & (cells_df["starter_content"] == False) ]
for i, row in nonstarter_dups.iterrows():
if row["page_content"].strip() not in [EMPTY_CODE_CELL, EMPTY_TEXT_CELL]:
print("----")
#print(row["filename"][0:25], row["cell_id"])
print(row["page_content"][0:250])
#print("NON-STARTER DUP CELLS:")
#nonstarter_dups = cells_df[ (cells_df["dup_content"] == True) & (cells_df["starter_content"] == False) ]
#for i, row in nonstarter_dups.iterrows():
# if row["page_content"].strip() not in [EMPTY_CODE_CELL, EMPTY_TEXT_CELL]:
# print("----")
# #print(row["filename"][0:25], row["cell_id"])
# print(row["page_content"][0:250])

#cells_df.to_csv("cells.csv", index=False)
#print(all_cells_df.shape)
#all_cells_df.head()

chart_df = cells_df.copy()
chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K
print(len(chart_df))
fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500,
title="Cell Lengths (All Submissions)",
hover_data=["page_content"], facet_row="cell_type",
Expand All @@ -104,7 +131,6 @@
chart_df = cells_df.copy()
chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K
chart_df = chart_df[chart_df["starter_content"] == False]
print(len(chart_df))
fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500,
title="Non-Starter Cell Lengths (All Submissions)",
hover_data=["page_content"], facet_row="cell_type",
Expand All @@ -119,7 +145,6 @@
chart_df = cells_df.copy()
chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K
chart_df = chart_df[chart_df["dup_content"] == False]
print(len(chart_df))
fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500,
title="Unique Cell Lengths (All Submissions)",
hover_data=["page_content"], facet_row="cell_type",
Expand Down
6 changes: 6 additions & 0 deletions app/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@





STUDENT_QUERY = "What is the student's name? What is their GW ID?"

0 comments on commit a036e69

Please sign in to comment.