From 6cc46caa80c9f4d86042a274c39cf5e93d0ebbf5 Mon Sep 17 00:00:00 2001 From: Laurence Jackson Date: Sat, 14 Jan 2023 19:55:54 +0000 Subject: [PATCH 1/3] adds validate_data option --- mlops/data/tools/tools.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mlops/data/tools/tools.py b/mlops/data/tools/tools.py index 74a5bc8..51e9e95 100644 --- a/mlops/data/tools/tools.py +++ b/mlops/data/tools/tools.py @@ -18,20 +18,22 @@ class DataBuilderXNAT: def __init__(self, xnat_configuration: dict, actions: list = None, flatten_output=True, test_batch: int = -1, - num_workers: int = 1): + num_workers: int = 1, validate_data=True): self.xnat_configuration = xnat_configuration self.actions = actions self.flatten_output = flatten_output self.test_batch = test_batch self.missing_data_log = [] self.num_workers = num_workers + self.validate_returns = validate_returns self.dataset = [] def fetch_data(self): - loop = asyncio.get_event_loop() - future = asyncio.ensure_future(self.start_async_process()) - loop.run_until_complete(future) + # loop = asyncio.get_event_loop() + # future = asyncio.ensure_future(self.start_async_process()) + # loop.run_until_complete(future) + asyncio.run(self.start_async_process()) async def start_async_process(self): with ThreadPoolExecutor(max_workers=self.num_workers) as executor: @@ -45,8 +47,6 @@ async def start_async_process(self): logger.info(f"Collecting XNAT project: {self.xnat_configuration['project']}") project = session.projects[self.xnat_configuration["project"]] - dataset = [] - if 0 < self.test_batch < len(project.subjects): from random import sample project_subjects = sample(project.subjects[:], self.test_batch) @@ -66,7 +66,8 @@ async def start_async_process(self): pass # remove any items where not all actions returned a value - self.dataset = [item for item in self.dataset if len(item['data']) == len(self.actions)] + if self.validate_data: + self.dataset = [item for item in self.dataset if len(item['data']) >= len(self.actions)] def process_subject(self, project, subject_i): subject = project.subjects.data[subject_i.id] From 5f1ef8c10f9973046ad85162f21a441f71792d4a Mon Sep 17 00:00:00 2001 From: Laurence Jackson Date: Sat, 14 Jan 2023 19:58:57 +0000 Subject: [PATCH 2/3] Update tools.py --- mlops/data/tools/tools.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mlops/data/tools/tools.py b/mlops/data/tools/tools.py index 51e9e95..2a8776f 100644 --- a/mlops/data/tools/tools.py +++ b/mlops/data/tools/tools.py @@ -25,14 +25,11 @@ def __init__(self, xnat_configuration: dict, actions: list = None, flatten_outpu self.test_batch = test_batch self.missing_data_log = [] self.num_workers = num_workers - self.validate_returns = validate_returns + self.validate_data = validate_data self.dataset = [] def fetch_data(self): - # loop = asyncio.get_event_loop() - # future = asyncio.ensure_future(self.start_async_process()) - # loop.run_until_complete(future) asyncio.run(self.start_async_process()) async def start_async_process(self): From 7125c66456f15e8b3027988a45cc9694ceb58733 Mon Sep 17 00:00:00 2001 From: Laurence Jackson Date: Tue, 17 Jan 2023 13:37:48 +0000 Subject: [PATCH 3/3] Update tools.py --- mlops/data/tools/tools.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mlops/data/tools/tools.py b/mlops/data/tools/tools.py index 2a8776f..76ba78e 100644 --- a/mlops/data/tools/tools.py +++ b/mlops/data/tools/tools.py @@ -83,21 +83,21 @@ def process_subject(self, project, subject_i): # logger.debug(f"Running action: {action.__name__} on {subject.id}") xnat_obj = action(project.subjects[subject.id]) - if type(xnat_obj) == list: - if len(xnat_obj) == 0: - self.missing_data_log.append({'subject_id': subject_i.id, - 'action_data': subject_i.label, - 'failed_action': action}) - logger.warn(f'No data found for {subject_i}: action {action} removing sample') - raise Exception - + if xnat_obj is None or type(xnat_obj) == list and len(xnat_obj) == 0: + self.missing_data_log.append({'subject_id': subject_i.id, + 'action_data': subject_i.label, + 'failed_action': action}) + logger.warn(f'No data found for {subject_i}: action {action} removing sample') + raise Exception + + elif type(xnat_obj) == list: for obj in xnat_obj: action_data.append({'source_action': action.__name__, 'action_data': obj.uri, 'data_type': 'xnat_uri', 'data_label': data_label}) - elif type(xnat_obj) == str: + else: action_data.append({'source_action': action.__name__, 'action_data': xnat_obj, 'data_type': 'value',