diff --git a/2_generate_manifest.py b/2_generate_manifest.py index 49cc93d..dfabef3 100644 --- a/2_generate_manifest.py +++ b/2_generate_manifest.py @@ -13,9 +13,11 @@ import pandas as pd import yaml import numpy as np +import argparse def main(): + args = parse_args() # Load config with open('/coloc/configs/config.yaml') as config_input: config = yaml.load(config_input, Loader=yaml.FullLoader) @@ -34,6 +36,8 @@ def main(): if config['custom_studies']: custom_studies = pd.read_parquet(config['custom_studies'], columns=['study_id']).study_id.unique() + # SuSiE prefixes regex pattern -> (x|y|z|...) + susie_pattern = '(' + '|'.join(args.susie_prefixes.split(";")) + ')' # Out path patterns data_out = '/output' @@ -89,9 +93,9 @@ def construct_left_right_hive_partition_dirs(rec): out_record['{}_sumstats'.format(side)] = public_sumstats.format(type=study_type, study_id=study_id) - # If FinnGen, then don't specify LD, as we won't do conditioning + # If SuSiE, then don't specify LD, as we won't do conditioning ld_path = ukb_ld_path.format(chrom=in_record['{}_lead_chrom'.format(side)]) - if re.match('FINNGEN', study_id): + if re.match(rf'{susie_pattern}', study_id): ld_path = None out_record['{}_ld'.format(side)] = ld_path @@ -131,7 +135,22 @@ def construct_left_right_hive_partition_dirs(rec): return 0 +def parse_args(): + ''' Load command line args + ''' + parser = argparse.ArgumentParser() + + parser.add_argument('--susie_prefixes', + help=('List of semicolon-separated SuSiE prefixes (i.e. "x;y;z")'), + metavar='', + type=str, + const='', + nargs='?', + required=True + ) + + args = parser.parse_args() + return args if __name__ == '__main__': - main() diff --git a/run_coloc_pipeline_opt.sh b/run_coloc_pipeline_opt.sh index 53bcfd9..bda71b5 100644 --- a/run_coloc_pipeline_opt.sh +++ b/run_coloc_pipeline_opt.sh @@ -3,7 +3,8 @@ set -euo pipefail NCORES=$1 -# export PYSPARK_SUBMIT_ARGS=$2 +SUSIE=$2 +# export PYSPARK_SUBMIT_ARGS=$3 #export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell" echo "Running on $NCORES cores" @@ -33,7 +34,7 @@ time /bin/bash 1_find_overlaps.sh # 10 min last run # output: # - /configs/manifest_unfiltered.json.gz echo [$(date +"%Y-%m-%d %H:%M:%S")] Running script: 2_generate_manifest.py -time python 2_generate_manifest.py # ~24 min last run +time python 2_generate_manifest.py --susie_prefixes $SUSIE # ~24 min last run #cp /configs/manifest_unfiltered.json.gz /configs/manifest_unfiltered.all.json.gz