Skip to content

Commit

Permalink
-> file paths
Browse files Browse the repository at this point in the history
  • Loading branch information
s-paquette committed Dec 12, 2024
1 parent 3fd39b2 commit b61bab2
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 20 deletions.
38 changes: 20 additions & 18 deletions cohorts/metadata_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import time
from time import sleep
import re
from projects.models import Program, Project, DataSource, DataVersion, Attribute, Attribute_Tooltips, DataSetType
from projects.models import Program, Project, DataSource, DataVersion, Attribute, Attribute_Tooltips, DataSetType, CgcDataVersion
from metadata_utils import sql_age_by_ranges, sql_bmi_by_ranges, sql_simple_days_by_ranges, sql_simple_number_by_200, sql_year_by_ranges, MOLECULAR_CATEGORIES
from solr_helpers import query_solr_and_format_result, build_solr_facets, build_solr_query
from google_helpers.bigquery.bq_support import BigQuerySupport
Expand Down Expand Up @@ -934,38 +934,40 @@ def get_paths_by_uuid(uuids):

query_base = """
SELECT file_node_id, file_name_key, index_file_name_key
FROM `{bq_project}.{bq_dataset}.{table_name}`
FROM `{table_clause}`
WHERE {where_clause}
"""

uuid_filters = {'file_node_id': uuids}

where_clause = BigQuerySupport.build_bq_filter_and_params(uuid_filters)

tables = [{'table': y.data_table, 'dataset': y.bq_dataset} for x in Program.get_public_programs() for y in x.get_data_tables()]
version = CgcDataVersion.objects.filter(active=True)
sources = version.get_data_sources(source_type=DataSource.BIGQUERY, data_type=DataSetType.FILE_DATA)

query = """ UNION DISTINCT """.join(
[query_base.format(
bq_project=settings.BIGQUERY_DATA_PROJECT_ID,
bq_dataset=table['dataset'],
table_name=table['table'].lower(),
table_clause=source.name,
where_clause=where_clause['filter_string']
) for table in tables]
) for source in sources]
)

results = BigQuerySupport.execute_query_and_fetch_results(query, where_clause['parameters'])

if results:
for row in results:
item = {
'file_node_id': row.get("file_node_id"),
'gcs_path': row.get("gcs_path")
}
if row.get("index_file_path", None) and len(row.get("index_file_path")) > 1:
item['index_file_path'] = row.get("index_file_path")

col_idx = {}
for idx, col in enumerate(results['schema']):
if col.name == 'file_name_key':
col_idx[idx] = 'file_path'
elif col.name == 'index_file_name_key':
col_idx[idx] = 'index_file_path'
else:
col_idx[idx] = col.name
for row in results['rows']:
item = {}
for idx, val in enumerate(row):
item[col_idx[idx]] = val if val is not None else "N/A"
paths.append(item)

not_found = [x for x in uuids if x not in [x['file_node_id'] for x in paths]]
not_found = [x for x in uuids if x not in set([x['file_node_id'] for x in paths])]

return paths, not_found
5 changes: 3 additions & 2 deletions projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ def __str__(self):
class CgcDataVersionQuerySet(models.QuerySet):

# Return all the data sources corresponding to this queryset
def get_data_sources(self, source_type=None, active=None, current=None, aggregate_level=None):
def get_data_sources(self, source_type=None, active=None, current=None, aggregate_level=None, data_type=None):
sources = None
cgcdvs = self.all()
source_qs = Q()
Expand All @@ -416,6 +416,8 @@ def get_data_sources(self, source_type=None, active=None, current=None, aggregat
sources = sources | versions.get_data_sources()
if source_type:
source_qs &= Q(source_type=source_type)
if data_type:
source_qs &= Q(datasettypes__data_type=data_type)
if aggregate_level:
aggregate_level = aggregate_level if isinstance(aggregate_level, list) else [aggregate_level]
source_qs &= Q(aggregate_level__in=aggregate_level)
Expand Down Expand Up @@ -462,7 +464,6 @@ def get_data_sources(self, active=None, source_type=None, aggregate_level=None):

return versions.get_data_sources(source_type=source_type, aggregate_level=aggregate_level).distinct()


def get_display(self):
return self.__str__()

Expand Down

0 comments on commit b61bab2

Please sign in to comment.