Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

missing taxonomy python notebook to script #245

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions tools/missing_taxonomy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env python
"""Find golden-dataset sources whose Fritz classifications are missing
ancestor labels implied by the taxonomy hierarchies.

Converted from the missing-taxonomy notebook; run as a script with
``python missing_taxonomy.py -parquet ... -mapper ...`` or directly via
``./missing_taxonomy.py`` thanks to the shebang line above.
"""
import argparse
import os

import numpy as np
import pandas as pd
import scope_phenom  # https://github.com/bfhealy/scope-phenomenology
import tdtax

from scope.utils import read_parquet

# Phenomenological and sitewide taxonomy trees searched by the reverse
# lookups below.
phenom = scope_phenom.taxonomy
sitewide = tdtax.taxonomy

# Recursive functions to perform a reverse lookup in taxonomy dictionaries

def trace_path(dct, key):
    """Depth-first reverse lookup of *key* in a nested dict/list taxonomy.

    Returns the list of dict keys and stringified list indices leading to
    the first occurrence of *key*, ending with *key* itself, or None when
    *key* does not appear anywhere in the structure.
    """
    if dct == key:
        return [dct]
    if isinstance(dct, dict):
        for name, child in dct.items():
            tail = trace_path(child, key)
            if tail:
                return [name] + tail
    elif isinstance(dct, list):
        for pos, child in enumerate(dct):
            tail = trace_path(child, key)
            if tail:
                return [str(pos)] + tail
    return None

def get_class_path(trail, dct, key):
    """Convert a trace_path *trail* into the chain of 'class' names.

    Consumes *trail* two entries at a time (a dict key followed by a list
    index), descending through *dct* and collecting each node's 'class'
    value until *key* is reached.
    """
    node = dct[trail[0]][int(trail[1])]
    name = node['class']
    if name == key:
        return [name]
    return [name] + get_class_path(trail[2:], node, key)

def missing_taxonomy(parquet, mapper, output="golden_missing_labels.csv"):
    """Flag golden-dataset sources that carry a classification but lack
    one or more of its taxonomy ancestors.

    Parameters
    ----------
    parquet : str
        Path (relative to this script) of the golden dataset downloaded
        from Fritz, in parquet format.
    mapper : str
        Path (relative to this script) of the JSON mapper between
        training-set labels and Fritz labels.
    output : str, optional
        Filename (relative to this script) for the output CSV listing
        sources with missing ancestor labels.

    Writes a CSV with one row per affected obj_id and a 'missing_descr'
    column describing each missing ancestor. Returns None.
    """
    # Read in golden dataset (downloaded from Fritz) and mapper.
    parquet_path = os.path.join(os.path.dirname(__file__), parquet)
    mapper_path = os.path.join(os.path.dirname(__file__), mapper)
    output_path = os.path.join(os.path.dirname(__file__), output)

    gold_new = read_parquet(parquet_path)
    golden_dataset_mapper = pd.read_json(mapper_path)
    gold_map = golden_dataset_mapper.copy()
    # Drop training-set labels that have no Fritz counterpart.
    none_cols = gold_map.loc['fritz_label'] == 'None'
    gold_map = gold_map.drop(columns=none_cols.index[none_cols.values])

    # Manipulate golden_dataset_mapper to flip keys and values so it can
    # be indexed by Fritz label.
    gold_map = gold_map.transpose()
    gold_map.index.rename('trainingset_label', inplace=True)
    gold_map = gold_map.reset_index(drop=False).set_index('fritz_label')
    gold_dict = gold_map.transpose().to_dict()

    # Take label columns from the mapper keys rather than a hard-coded
    # column range (previously columns 1:54), so the script adapts when
    # the taxonomy changes. The per-value mapper lookup below already
    # requires every label column to be a mapper key.
    label_cols = [c for c in gold_new.columns if c in golden_dataset_mapper.columns]
    labels_gold = gold_new.set_index('obj_id')[label_cols]

    classes = labels_gold.columns.values.tolist()

    missing_df = pd.DataFrame(
        {
            'obj_id': labels_gold.index,
            'missing_descr': np.zeros(len(labels_gold), dtype=str),
        }
    ).set_index('obj_id')

    cnt = 0
    for index, row in labels_gold.iterrows():
        nonzero_vals = row[row > 0].index.values
        for value in nonzero_vals:
            mapped_c = golden_dataset_mapper[value]['fritz_label']
            # The label may live in either taxonomy; trace_path returns
            # None when absent, making get_class_path raise TypeError, so
            # fall back from the sitewide to the phenomenological tree.
            try:
                trail = trace_path(sitewide, mapped_c)
                class_path = get_class_path(trail, sitewide, mapped_c)
            except TypeError:
                trail = trace_path(phenom, mapped_c)
                class_path = get_class_path(trail, phenom, mapped_c)
            for item in class_path:
                if (item != mapped_c) and (item in classes):
                    mapped_item = gold_dict[item]['trainingset_label']
                    if labels_gold.loc[index, mapped_item] == 0:
                        cnt += 1
                        print(cnt, index, value, 'missing', mapped_item, ';')
                        missing_df.loc[index, 'missing_descr'] = (
                            missing_df.loc[index, 'missing_descr']
                            + f"{value} missing {mapped_item};"
                        )
        # Trim the trailing semicolon from this row's description.
        missing_df.loc[index, 'missing_descr'] = missing_df.loc[index, 'missing_descr'][:-1]

    missing_df[missing_df['missing_descr'] != ''].reset_index().to_csv(output_path, index=False)
    return None

if __name__ == "__main__":
    # NOTE(review): the notebook's -merge_features / -features_catalog
    # arguments were never used by this script and have been removed.
    parser = argparse.ArgumentParser()
    parser.add_argument("-parquet", type=str, help="path to parquet")
    parser.add_argument("-mapper", type=str, help="path to mapper")

    args = parser.parse_args()

    missing_taxonomy(args.parquet, args.mapper)