diff --git a/code/pecotmr_integration/twas.ipynb b/code/pecotmr_integration/twas.ipynb index dd002e465..85d75461a 100644 --- a/code/pecotmr_integration/twas.ipynb +++ b/code/pecotmr_integration/twas.ipynb @@ -197,6 +197,52 @@ "```" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here using `--region-name` we focus the analysis on 2 blocks: format as `chr_start_stop`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "sos run xqtl-protocol/code/pecotmr_integration/twas.ipynb quantile_twas \\\n", + " --cwd /output/ --name test \\\n", + " --gwas_meta_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/GWAS/gwas_meta.tsv \\\n", + " --ld_meta_data /mnt/vast/hpc/csg/data_public/20240409_ADSP_LD_matrix/ld_meta_file.tsv \\\n", + " --region_name chr11_84267999_86714492 chr7_54681006_57314931 \\\n", + " --xqtl_meta_data /home/al4225/project/quantile_twas/quantile_twas_analysis/test_data/small_region_gene_meta_data_test.tsv\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is also possible to analyze a selected list of regions using option `--regions`. The 3 columns(chr, start, stop) of this file will be used for the block list to analyze. Here for example use the same list of regions as we used for LD block:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "```\n", + "sos run xqtl-protocol/code/pecotmr_integration/twas.ipynb quantile_twas \\\n", + " --cwd /output/ --name test \\\n", + " --gwas_meta_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/GWAS/gwas_meta.tsv \\\n", + " --ld_meta_data /mnt/vast/hpc/csg/data_public/20240409_ADSP_LD_matrix/ld_meta_file.tsv \\\n", + " --regions /mnt/vast/hpc/csg/cl4215/mrmash/workflow/pipeline_data/EUR_LD_blocks.bed \\\n", + " --xqtl_meta_data /home/al4225/project/quantile_twas/quantile_twas_analysis/test_data/small_region_gene_meta_data_test.tsv\n", + "```" + ] + }, { "cell_type": "code", "execution_count": null, @@ -215,6 +261,9 @@ "parameter: gwas_data = []\n", "parameter: column_mapping = []\n", "parameter: regions = path()\n", + "# Optional: if a region name is provided \n", + "# the analysis would be focused on the union of provides region list and region names\n", + "parameter: region_name = []\n", "parameter: name = f\"{xqtl_meta_data:bn}.{gwas_meta_data:bn}\"\n", "parameter: container = ''\n", "import re\n", @@ -295,7 +344,7 @@ " if missing_columns:\n", " raise ValueError(f\"Missing required columns: {', '.join(missing_columns)}\")\n", "\n", - "def extract_regional_data(gwas_meta_data, xqtl_meta_data, regions, gwas_name, gwas_data, column_mapping):\n", + "def extract_regional_data(gwas_meta_data, xqtl_meta_data, regions, region_name, gwas_name, gwas_data, column_mapping):\n", " \"\"\"\n", " Extracts data from GWAS and xQTL metadata files and additional GWAS data provided. \n", "\n", @@ -334,12 +383,30 @@ " gwas_dict = OrderedDict()\n", " \n", " # Reading LD regions info\n", - " regions_df = pd.read_csv(regions, sep=\"\\t\",skipinitialspace=True)\n", - " regions_df.columns = [col.strip() for col in regions_df.columns] # Strip spaces from column names before\n", - " regions_df['chr'] = regions_df['chr'].str.strip()\n", - " check_required_columns(regions_df, required_ld_columns)\n", + " # Initialize empty DataFrame for regions\n", + " regions_df = pd.DataFrame(columns=['chr', 'start', 'stop'])\n", + "\n", + " # Check if regions file exists and read it\n", + " if os.path.isfile(regions):\n", + " file_regions_df = pd.read_csv(regions, sep=\"\\t\", skipinitialspace=True)\n", + " file_regions_df.columns = [col.strip() for col in file_regions_df.columns] # Strip spaces from column names\n", + " file_regions_df['chr'] = file_regions_df['chr'].str.strip()\n", + " check_required_columns(file_regions_df, required_ld_columns)\n", + " regions_df = pd.concat([regions_df, file_regions_df])\n", + " # Process region_name if provided: \n", + " # fomat: region_name = [\"chr1_16103_2888443\", \"chr1_4320284_5853833\"]\n", + " if len(region_name) > 0:\n", + " # Split region_name entries into chr, start, and stop columns\n", + " extra_regions = [name.split(\"_\") for name in region_name]\n", + " extra_regions_df = pd.DataFrame(extra_regions, columns=['chr', 'start', 'stop'])\n", + " extra_regions_df['start'] = extra_regions_df['start'].astype(int)\n", + " extra_regions_df['stop'] = extra_regions_df['stop'].astype(int)\n", + " # Add extra regions to regions_df\n", + " regions_df = pd.concat([regions_df, extra_regions_df])\n", + " # Remove duplicates and reset index\n", + " regions_df = regions_df.drop_duplicates().reset_index(drop=True)\n", " regions_dict = OrderedDict()\n", - " \n", + "\n", " # Reading the xQTL weight metadata file\n", " xqtl_df = pd.read_csv(xqtl_meta_data, sep=\" \")\n", " check_required_columns(xqtl_df, required_xqtl_columns)\n", @@ -398,7 +465,7 @@ " return gwas_dict, xqtl_dict, regions_dict\n", "\n", "\n", - "gwas_dict, xqtl_dict, regions_dict = extract_regional_data(gwas_meta_data, xqtl_meta_data,regions,gwas_name, gwas_data, column_mapping)\n", + "gwas_dict, xqtl_dict, regions_dict = extract_regional_data(gwas_meta_data, xqtl_meta_data,regions,region_name,gwas_name, gwas_data, column_mapping)\n", "regional_data = dict([(\"GWAS\", gwas_dict), (\"xQTL\", xqtl_dict), (\"Regions\", regions_dict)])\n", "\n", "\n", @@ -684,21 +751,20 @@ " quantile_twas = TRUE, \n", " output_twas_data = ${\"TRUE\" if save_ctwas_data else \"FALSE\"} )\n", "\n", - " # Merging with xQTL meta-data\n", + " # Merging with xQTL meta-data \n", " message(\"Merging twas_result with xqtl_meta_df...\") \n", " twas_results_db$twas_result <- merge(twas_results_db$twas_result, xqtl_meta_df[, c(\"region_id\",\"TSS\",\"start\",\"end\")], by.x=\"molecular_id\", by.y=\"region_id\")\n", - " \n", - " message(\"Preview of merged twas_result:\")\n", " twas_results_db$twas_result <- unique(twas_results_db$twas_result)\n", " fwrite(twas_results_db$twas_result[, c(2, 1, (ncol(twas_results_db$twas_result)-2):ncol(twas_results_db$twas_result), 3:(ncol(twas_results_db$twas_result)-3))], file = ${_output[0]:r}, sep = \"\\t\", compress = \"gzip\")\n", "\n", " # Step 3: reformat for follow up cTWAS analysis\n", " if (${\"TRUE\" if save_ctwas_data else \"FALSE\"}) {\n", - " saveRDS(twas_results_db$twas_data, \"${_output[0]:nnn}.twas_data.rds\", compress='xz')\n", + " saveRDS(twas_results_db$twas_data, \"${_output[0]:nnn}.quantile_twas_data.rds\", compress='xz')\n", " }\n", " if (${\"TRUE\" if save_mr_result else \"FALSE\"}) {\n", - " fwrite(twas_results_db$mr_result, file = \"${_output[0]:nnn}.mr_result.tsv.gz\", sep = \"\\t\", compress = \"gzip\")\n", - " }" + " fwrite(twas_results_db$mr_result, file = \"${_output[0]:nnn}.quantile_mr_result.tsv.gz\", sep = \"\\t\", compress = \"gzip\")\n", + " }\n", + " message(\"quantile twas analysis is completed in this block.\") " ] } ],