Skip to content

Commit

Permalink
2nd try
Browse files Browse the repository at this point in the history
  • Loading branch information
namsaraeva committed Apr 22, 2024
1 parent 312d6f8 commit 626c549
Showing 1 changed file with 12 additions and 15 deletions.
27 changes: 12 additions & 15 deletions src/sparcscore/ml/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ class HDF5SingleCellDatasetRegression(Dataset):

def __init__(self,
dir_list: list[str],
target_values: list[float],
target_col: list[int],
root_dir: str,
max_level: int = 5,
transform = None,
Expand All @@ -215,7 +215,7 @@ def __init__(self,
select_channel = None):

self.root_dir = root_dir
self.target_values = target_values
self.target_col = target_col
self.dir_list = dir_list
self.transform = transform
self.select_channel = select_channel
Expand All @@ -225,9 +225,10 @@ def __init__(self,
# scan all directories in dir_list
for i, directory in enumerate(dir_list):
path = os.path.join(self.root_dir, directory) # get full path
current_target = self.target_values[i] # get target value

current_target = self.target_col[i] # get target value

filetype = directory.split(".")[-1] # get filetype

if filetype in self.HDF_FILETYPES:
self.add_hdf_to_index(current_target, directory) # check if "directory" is a path to specific hdf5 and add to index
else:
Expand All @@ -238,16 +239,19 @@ def __init__(self,
self.stats() # print dataset stats at the end


def add_hdf_to_index(self, target_col, path):
    """Open one HDF5 file and register its single-cell entries in the dataset index.

    Parameters
    ----------
    target_col : int
        Column of the ``single_cell_index_labelled`` dataset that holds the
        regression target for each cell in this file.
    path : str
        Path to the HDF5 file.

    Missing labels (empty strings) are converted to NaN before the float cast.
    Files that cannot be read or lack the expected datasets are skipped
    silently, so a directory scan is best-effort.
    """
    try:
        input_hdf = h5py.File(path, 'r')  # read hdf5 file
        index_handle = input_hdf.get('single_cell_index')  # get index handle

        # read the label column as strings, mark missing values, cast to float
        current_target = input_hdf.get('single_cell_index_labelled').asstr()[:, target_col]
        current_target[current_target == ''] = np.nan  # replace empty strings with nan
        current_target = current_target.astype(float)  # convert to float

        handle_id = len(self.handle_list)  # get handle id
        self.handle_list.append(input_hdf.get('single_cell_data'))  # append data handle

        # NOTE(review): the original appended the whole per-file target array for
        # every row; each index entry should carry the scalar target of its own
        # cell (row i of the index aligns with row i of the labelled dataset —
        # confirm against __getitem__).
        for i, row in enumerate(index_handle):
            self.data_locator.append([current_target[i], handle_id] + list(row))  # append data locator
    except Exception:
        # Best-effort: unreadable/malformed files are skipped, not fatal.
        # `except Exception` keeps that behaviour without also swallowing
        # KeyboardInterrupt/SystemExit the way the bare `except:` did.
        return

Expand All @@ -263,25 +267,18 @@ def scan_directory(self, path, current_target, levels_left):
self.add_hdf_to_index(current_target, os.path.join(path, file)) # add hdf5 files to index if filetype is supported

for subdirectory in current_level_directories: # recursively scan subdirectories
self.scan_directory(subdirectory, current_target, levels_left-1)
self.scan_directory(subdirectory, current_target, levels_left - 1)
else:
return

def stats(self):
    """Print summary statistics of the regression targets in the index.

    Reports sample count, mean, median, standard deviation, minimum and
    maximum of the target values collected in ``self.data_locator``.
    With an empty index only the count is printed (``np.min``/``np.max``
    raise on empty arrays).
    """
    targets = [info[0] for info in self.data_locator]

    targets = np.array(targets, dtype=float)

    if targets.size == 0:
        # guard: np.min/np.max raise ValueError and np.mean warns on empty input
        print(f"Total samples: {len(targets)}")
        return

    # NOTE(review): missing labels are stored as NaN upstream, which makes
    # these plain aggregations NaN; np.nanmean etc. may be intended — confirm.
    mean_target = np.mean(targets)
    median_target = np.median(targets)
    std_target = np.std(targets)
    min_target = np.min(targets)
    max_target = np.max(targets)

    print(f"Total samples: {len(targets)}")
    print(f"Mean of target values: {mean_target:.2f}")
    print(f"Median of target values: {median_target:.2f}")
    print(f"SD of targets: {std_target:.2f}")
    print(f"Min target: {min_target:.2f}")
    print(f"Max target: {max_target:.2f}")

Expand Down

0 comments on commit 626c549

Please sign in to comment.