From 0289922e1437418e619bb104cd24c5aba0db6305 Mon Sep 17 00:00:00 2001 From: ricardo-lourenco Date: Fri, 23 Feb 2024 17:21:42 +0100 Subject: [PATCH 1/4] Update manifest script to filter susie prefixes --- 2_generate_manifest.py | 27 +++++++++++++++++++++++---- run_coloc_pipeline_opt.sh | 5 +++-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/2_generate_manifest.py b/2_generate_manifest.py index 49cc93d..0a5b5fa 100644 --- a/2_generate_manifest.py +++ b/2_generate_manifest.py @@ -13,9 +13,11 @@ import pandas as pd import yaml import numpy as np +import argparse def main(): + args = parse_args() # Load config with open('/coloc/configs/config.yaml') as config_input: config = yaml.load(config_input, Loader=yaml.FullLoader) @@ -34,6 +36,9 @@ def main(): if config['custom_studies']: custom_studies = pd.read_parquet(config['custom_studies'], columns=['study_id']).study_id.unique() + # SUSIE prefixes regex pattern (x|y|z|...) + susie_prefixes = args.susie_prefixes + susie_pattern = '(' + '|'.join(susie_prefixes.split(";")) + ')' # Out path patterns data_out = '/output' @@ -48,7 +53,6 @@ def construct_left_right_hive_partition_dirs(rec): dirs.append(side + '_' + col + '=' + str(rec.get(side + '_' + ocol, None))) return os.path.join(*dirs) - manifest = [] # Write manifest file os.makedirs(os.path.dirname(out_manifest), exist_ok=True) with gzip.open(out_manifest, 'w') as out_h: @@ -89,9 +93,9 @@ def construct_left_right_hive_partition_dirs(rec): out_record['{}_sumstats'.format(side)] = public_sumstats.format(type=study_type, study_id=study_id) - # If FinnGen, then don't specify LD, as we won't do conditioning + # If SUSIE, then don't specify LD, as we won't do conditioning ld_path = ukb_ld_path.format(chrom=in_record['{}_lead_chrom'.format(side)]) - if re.match('FINNGEN', study_id): + if (susie_prefixes != '') and (re.match(rf'{susie_pattern}', study_id)): ld_path = None out_record['{}_ld'.format(side)] = ld_path @@ -131,7 +135,22 @@ def construct_left_right_hive_partition_dirs(rec): return 0 +def parse_args(): + ''' Load command line args + ''' + parser = argparse.ArgumentParser() + + parser.add_argument('--susie_prefixes', + help=('List of SuSiE prefixes'), + metavar='', + type=str, + const='', + nargs='?', + required=True + ) + + args = parser.parse_args() + return args if __name__ == '__main__': - main() diff --git a/run_coloc_pipeline_opt.sh b/run_coloc_pipeline_opt.sh index 53bcfd9..bda71b5 100644 --- a/run_coloc_pipeline_opt.sh +++ b/run_coloc_pipeline_opt.sh @@ -3,7 +3,8 @@ set -euo pipefail NCORES=$1 -# export PYSPARK_SUBMIT_ARGS=$2 +SUSIE=$2 +# export PYSPARK_SUBMIT_ARGS=$3 #export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell" echo "Running on $NCORES cores" @@ -33,7 +34,7 @@ time /bin/bash 1_find_overlaps.sh # 10 min last run # output: # - /configs/manifest_unfiltered.json.gz echo [$(date +"%Y-%m-%d %H:%M:%S")] Running script: 2_generate_manifest.py -time python 2_generate_manifest.py # ~24 min last run +time python 2_generate_manifest.py --susie_prefixes $SUSIE # ~24 min last run #cp /configs/manifest_unfiltered.json.gz /configs/manifest_unfiltered.all.json.gz From 045f864132d8cca69fd3620a268d176da95b1180 Mon Sep 17 00:00:00 2001 From: ricardo-lourenco Date: Tue, 27 Feb 2024 15:29:39 +0100 Subject: [PATCH 2/4] Revert deleted manifest variable to make PR consistent --- 2_generate_manifest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/2_generate_manifest.py b/2_generate_manifest.py index 0a5b5fa..bfa6931 100644 --- a/2_generate_manifest.py +++ b/2_generate_manifest.py @@ -53,6 +53,7 @@ def construct_left_right_hive_partition_dirs(rec): dirs.append(side + '_' + col + '=' + str(rec.get(side + '_' + ocol, None))) return os.path.join(*dirs) + manifest = [] # Write manifest file os.makedirs(os.path.dirname(out_manifest), exist_ok=True) with gzip.open(out_manifest, 'w') as out_h: From d9dbb6e126dee9910da306517eb220e3bd1c7d31 Mon Sep 17 00:00:00 2001 From: ricardo-lourenco Date: Tue, 5 Mar 2024 16:15:09 +0100 Subject: [PATCH 3/4] Update susie regex --- 2_generate_manifest.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/2_generate_manifest.py b/2_generate_manifest.py index bfa6931..be79e65 100644 --- a/2_generate_manifest.py +++ b/2_generate_manifest.py @@ -36,9 +36,8 @@ def main(): if config['custom_studies']: custom_studies = pd.read_parquet(config['custom_studies'], columns=['study_id']).study_id.unique() - # SUSIE prefixes regex pattern (x|y|z|...) - susie_prefixes = args.susie_prefixes - susie_pattern = '(' + '|'.join(susie_prefixes.split(";")) + ')' + # SuSiE prefixes regex pattern -> (x|y|z|...) + susie_pattern = '(' + '|'.join(args.susie_prefixes.split(";")) + ')' # Out path patterns data_out = '/output' @@ -94,9 +93,9 @@ def construct_left_right_hive_partition_dirs(rec): out_record['{}_sumstats'.format(side)] = public_sumstats.format(type=study_type, study_id=study_id) - # If SUSIE, then don't specify LD, as we won't do conditioning + # If SuSiE, then don't specify LD, as we won't do conditioning ld_path = ukb_ld_path.format(chrom=in_record['{}_lead_chrom'.format(side)]) - if (susie_prefixes != '') and (re.match(rf'{susie_pattern}', study_id)): + if re.match(rf'{susie_pattern}', study_id): ld_path = None out_record['{}_ld'.format(side)] = ld_path From e20308380d5081bc7d461630218b2b631bf8d047 Mon Sep 17 00:00:00 2001 From: ricardo-lourenco Date: Wed, 6 Mar 2024 09:53:34 +0100 Subject: [PATCH 4/4] Update --susie_prefixes argument help message --- 2_generate_manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/2_generate_manifest.py b/2_generate_manifest.py index be79e65..dfabef3 100644 --- a/2_generate_manifest.py +++ b/2_generate_manifest.py @@ -141,7 +141,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--susie_prefixes', - help=('List of SuSiE prefixes'), + help=('List of semicolon-separated SuSiE prefixes (i.e. "x;y;z")'), metavar='', type=str, const='',