Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updating graphs #5

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/scripts/update_cases.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from urllib.error import HTTPError
import pandas as pd
import datetime
Expand Down Expand Up @@ -25,7 +26,7 @@ def download_sd_cases():
DataFrame detailing the daily number of cases in San Diego.
"""
def _append_population( dataframe ):
pop_loc = "resources/zip_pop.csv"
pop_loc = os.path.abspath("../../resources/zip_pop.csv")
pop = pd.read_csv( pop_loc, usecols=["Zip", "Total Population"], thousands=",", dtype={"Zip" : str, "Total Population" : int } )
dataframe = dataframe.merge( pop, left_on="ziptext", right_on="Zip", how="left" )
dataframe = dataframe.drop( columns=["Zip"] )
Expand All @@ -42,14 +43,14 @@ def _add_missing_cases( entry, start ):
return entry

# First, we load current dataset of cases
sd = pd.read_csv( "resources/cases.csv", parse_dates=["updatedate"] )
sd = pd.read_csv(os.path.abspath("../../resources/cases.csv"), parse_dates=["updatedate"] )
sd = sd.loc[~sd["ziptext"].isna()]
sd = sd.loc[sd["ziptext"]!="None"]
sd["ziptext"] = sd["ziptext"].astype(int).astype(str)

# Next we load the diff, an offset that helps reconcile differences between datasets. Not perfect and we
# still see a big leap when we switched.
diff = pd.read_csv("resources/cases-zip-diff.csv", index_col="zipcode", dtype={"zipcode" : str, "diff" : float})
diff = pd.read_csv(os.path.abspath("../../resources/cases-zip-diff.csv"), index_col="zipcode", dtype={"zipcode" : str, "diff" : float})
diff = diff["diff"]

# We load the latest cumulative cases from the Tableau dashboard
Expand Down Expand Up @@ -157,4 +158,4 @@ def download_cases():

if __name__ == "__main__":
    # Refresh the San Diego cases dataset and write it back into the repo.
    # NOTE(review): "../../resources" is resolved against the current working
    # directory, so this script only works when launched from .github/scripts/.
    # Anchoring on __file__ would be more robust — confirm the CI runner's CWD.
    cases = download_cases()
    cases.to_csv( os.path.abspath( "../../resources/cases.csv" ), index=False )
15 changes: 8 additions & 7 deletions .github/scripts/update_growth_rates.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
Expand All @@ -9,8 +10,8 @@
from scipy.special import expit, logit
import pickle

SEQS_LOCATION = "resources/sequences.csv"
VOC_LOCATION = "resources/voc.txt"
SEQS_LOCATION = os.path.abspath("../../resources/sequences.csv")
VOC_LOCATION = os.path.abspath("../../resources/voc.txt")

aliasor = Aliasor()

Expand All @@ -37,7 +38,7 @@ def load_sequences():
def collapse_lineage( entry : str, accepted_lineages: set[str] ):
    """Collapse a Pango lineage name one level toward an accepted lineage.

    Parameters
    ----------
    entry : str
        Pango lineage name, e.g. "BA.5.2".
    accepted_lineages : set[str]
        Lineage names that should be reported as-is.

    Returns
    -------
    str
        ``entry`` unchanged if it is accepted or has no parent separator;
        otherwise a re-aliased lineage (two-letter-alias case) or ``entry``
        with its last sub-lineage component dropped ("B.1.1.7" -> "B.1.1").
    """
    # Accepted lineages, and names with no "." (nothing to collapse), pass through.
    if entry in accepted_lineages or "." not in entry:
        return entry
    # Two-letter alias followed by a single numeric suffix (e.g. "BQ.1").
    # Fixed regex: raw string with an escaped dot and "\d+" restored — the
    # mangled "[A-Z]{2}.\\+$" only ever matched a literal "+", and the
    # unescaped "." matched any character.
    # Presumably this re-expresses the lineage relative to the "BA" alias
    # before collapsing — confirm against Aliasor's semantics.
    elif re.match( r"[A-Z]{2}\.\d+$", entry ):
        return aliasor.partial_compress( aliasor.uncompress( entry ), accepted_aliases=["BA"] )
    return ".".join( entry.split( "." )[:-1] )

Expand Down Expand Up @@ -83,9 +84,9 @@ def calculate_growth_rate( results ):
return coeff

def dump_model_names( model, collapsed_names, output_dir=None ):
    """Persist the fitted clinical model and the lineage-collapse mapping.

    Parameters
    ----------
    model
        Fitted model object; pickled to ``<output_dir>/clinical.model``.
    collapsed_names : dict
        Mapping of lineage -> collapsed lineage, written as a two-column
        CSV to ``<output_dir>/collapsed_names.csv``.
    output_dir : str, optional
        Destination directory. Defaults to the repository ``resources/``
        directory resolved relative to the current working directory,
        matching the previous hard-coded behavior.
    """
    if output_dir is None:
        # NOTE(review): CWD-relative; only correct when run from .github/scripts/.
        output_dir = os.path.abspath( "../../resources" )
    with open( os.path.join( output_dir, "clinical.model" ), "wb" ) as model_file:
        pickle.dump( model, model_file )
    with open( os.path.join( output_dir, "collapsed_names.csv" ), "w" ) as cn:
        cn.write( "lineage,collapsed_lineage\n" )
        # Plain loop instead of a list comprehension used only for side effects.
        for lineage, collapsed in collapsed_names.items():
            cn.write( f"{lineage},{collapsed}\n" )

Expand Down Expand Up @@ -214,5 +215,5 @@ def calculate_growth_rates():

if __name__ == "__main__":
    # Compute lineage growth rates and write both the filtered and full tables.
    # Renamed the second result from "all" to avoid shadowing the builtin.
    # NOTE(review): "../../resources" is CWD-relative; only correct when the
    # script is launched from .github/scripts/.
    growth_rates_filtered, growth_rates_all = calculate_growth_rates()
    growth_rates_filtered.to_csv( os.path.abspath( "../../resources/growth_rates.csv" ), index=False )
    growth_rates_all.to_csv( os.path.abspath( "../../resources/growth_rates_all.csv" ), index=False )
14 changes: 8 additions & 6 deletions .github/scripts/update_seqs.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
import pandas as pd
from epiweeks import Week
import datetime

path = os.path.abspath('../../resources/excite_providers.csv')
def load_excite_providers( csv_path=None ):
    """Load the EXCITE provider lookup table.

    Parameters
    ----------
    csv_path : str, optional
        Location of the providers CSV. Defaults to the module-level ``path``
        (resources/excite_providers.csv), preserving the previous behavior.

    Returns
    -------
    dict
        Mapping of sequence ``ID`` -> ``provider``.
    """
    if csv_path is None:
        csv_path = path
    excite = pd.read_csv( csv_path )
    excite = excite.set_index( "ID" )
    return excite["provider"].to_dict()

def load_file_as_list( loc ):
with open( loc, "r" ) as open_file:
Expand Down Expand Up @@ -66,7 +68,7 @@ def download_search():
md["sequencer"] = "Andersen Lab"
md.loc[md["originating_lab"]=="UCSD EXCITE Lab","sequencer"] = "UCSD EXCITE Lab"
md.loc[md["authors"]=="Helix","sequencer"] = "Helix"
md.loc[md["ID"].isin( load_file_as_list( "resources/sdphl_sequences.txt" ) ),"sequencer"] = "SD County Public Health Laboratory"
md.loc[md["ID"].isin( load_file_as_list( os.path.abspath('../../resources/sdphl_sequences.txt' )) ),"sequencer"] = "SD County Public Health Laboratory"
md.loc[md['ID'].str.startswith( "CA-SDCPHL-" ),"sequencer"] = "SD County Public Health Laboratory"

md["provider"] = md["originating_lab"]
Expand Down Expand Up @@ -94,7 +96,7 @@ def download_search():
md["num"] = md["ID"].str.extract( "SEARCH-([0-9]+)" )
md.loc[md["num"].isna(),"num"] = md["ID"]

md = md.merge( pango, left_on="num", right_on="num", how="left", validate="one_to_one" )
md = md.merge( pango, left_on="num", right_on="num", how="left")

# Filter sequences which failed lineage calling. These sequences are likely incomplete/erroneous.
md = md.loc[~md["lineage"].isin( ["None", "Unassigned"] )]
Expand All @@ -105,4 +107,4 @@ def download_search():

if __name__ == "__main__":
    # Download the latest SEARCH metadata and overwrite the sequences table.
    # BUG FIX: the diff wrote the output to ``path``, which points at
    # resources/excite_providers.csv — that would clobber the provider lookup
    # table with sequence metadata. Restore the sequences.csv destination
    # (this is also the file update_growth_rates.py reads as SEQS_LOCATION).
    seqs_md = download_search()
    seqs_md.to_csv( os.path.abspath( "../../resources/sequences.csv" ), index=False )
2 changes: 1 addition & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@
)

if __name__ == '__main__':
    # Start the Dash development server; port 8051 avoids the default 8050
    # presumably to dodge a clash with another local service — confirm.
    app.run_server( debug=False, port=8051 )
Binary file added resources/clinical.model
Binary file not shown.
Loading