From 7283cb0a1c4f45dd9547e5a67de43b4e8861264e Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 10:44:16 +0000 Subject: [PATCH 1/8] merge blacklist fix, dump config fix, assertion to check labels.tsv x selected/ x scNOVA input lists, labels at later stage to prevent working on modified list of cells, other minor fixes --- watchdog_pipeline/watchdog_pipeline.py | 841 +++++++++++++++++++++---- 1 file changed, 720 insertions(+), 121 deletions(-) diff --git a/watchdog_pipeline/watchdog_pipeline.py b/watchdog_pipeline/watchdog_pipeline.py index d3df33c6..c0f259f9 100644 --- a/watchdog_pipeline/watchdog_pipeline.py +++ b/watchdog_pipeline/watchdog_pipeline.py @@ -1,5 +1,7 @@ +import sqlite3 import time import os, sys, glob, subprocess, re +import requests from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler from datetime import datetime @@ -7,7 +9,10 @@ import json import pandas as pd import threading - +import re +from collections import Counter +from pathlib import Path +import pika # RabbitMQ os.makedirs("watchdog/logs", exist_ok=True) @@ -16,7 +21,9 @@ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[ - logging.FileHandler("watchdog/logs/watchdog_ashleys.log"), # File handler to log to a file + logging.FileHandler( + "watchdog/logs/watchdog_ashleys.log" + ), # File handler to log to a file logging.StreamHandler(), # Stream handler to log to the console ], ) @@ -24,17 +31,37 @@ # Set the path you want to watch path_to_watch = sys.argv[1] +dry_run = sys.argv[2] +report_only = sys.argv[3] + -data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS_DEV" -publishdir_location = "/g/korbel/weber/TMP/WORKFLOW_RESULTS_DEV" -# publishdir_location = "/g/korbel/WORKFLOW_RESULTS" +data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS" +# publishdir_location = "/g/korbel/weber/TMP/WORKFLOW_RESULTS_DEV" +publishdir_location = "/g/korbel/WORKFLOW_RESULTS" genecore_prefix = path_to_watch -profile_slurm = ["--profile", "workflow/snakemake_profiles/HPC/slurm_EMBL/"] -profile_dry_run = ["--profile", "workflow/snakemake_profiles/local/conda/", "-c", "1"] +# profile_slurm = ["--profile", "../snakemake_profiles/HPC/dev/slurm_legacy_conda/"] +profile_slurm = [ + "--profile", + "/g/korbel2/weber/workspace/snakemake_profiles/HPC/slurm_EMBL/", +] +profile_dry_run = [ + "--profile", + "workflow/snakemake_profiles/local/conda_singularity/", + "-c", + "1", +] dry_run_options = ["-n", "-q"] # snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_latest/bin/snakemake" -snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_panoptesfix/bin/snakemake" +snakemake_binary = ( + "/g/korbel2/weber/miniconda3/envs/snakemake_panoptesfix/bin/snakemake" +) +# Panoptes +pipeline = "ashleys-qc-pipeline" +my_env = os.environ.copy() +snakemake_binary_folder = "/".join(snakemake_binary.split("/")[:-1]) +my_env["PATH"] = f"{snakemake_binary_folder}:{my_env['PATH']}" +working_directory = "/g/korbel2/weber/workspace/mosaicatcher-update" # plates_processing_status = pd.read_csv("watchdog/processing_status.json", sep="\t") # print(plates_processing_status) @@ -47,27 +74,571 @@ def on_created(self, event): logging.info(f"Directory {event.src_path} has been created!") self.process_new_directory(event.src_path) + def extract_samples_names(self, l, directory_path): + samples = list() + prefixes = list() + plate_types = list() + + pattern = re.compile(r"_lane1(.*?)(iTRU|PE20)(.*?)([A-H]?)(\d{2})(?:_1_|_2_)") + + # First pass: Count occurrences of each 
sample_name + file_counts_per_sample = Counter() + for file_path in l: + match = pattern.search(file_path) + if match: + sample_name = match.group(1) + file_counts_per_sample[sample_name] += 1 + + # Second pass: Process files and determine plate type per sample + for j, file_path in enumerate(sorted(l)): + match = pattern.search(file_path) + if match: + sample_name = match.group(1) + file_count = file_counts_per_sample[sample_name] + + # Determine plate type using modulo 96 operation + if file_count % 96 != 0: + raise ValueError( + f"Invalid file count for sample {sample_name} with file count {file_count}. Must be a multiple of 96." + ) + plate_type = int(file_count / 2) + + if (j + 1) % file_count == 0: + prefixes.append(match.group(2)) + plate = directory_path.split("/")[-1] + samples.append(sample_name) + plate_types.append(plate_type) + + return prefixes, samples, plate_types + + def check_date(self, plate): + from datetime import datetime, timedelta + + date_str = "-".join(plate.split("-")[:-1]) + date_format = "%Y-%m-%d" + folder_date = datetime.strptime(date_str, date_format) + + # Calculate the date that is 6 months before today + six_months_ago = datetime.now() - timedelta( + days=3 * 30 + ) # This assumes an average of 30 days in a month + # print(plate, six_months_ago, folder_date > six_months_ago) + # Compare dates + return folder_date > six_months_ago + + @staticmethod + def load_from_json(filename: str): + """Load the data from the JSON file.""" + try: + with open(filename, "r") as file: + data = json.load(file) + return data + except (FileNotFoundError, json.JSONDecodeError): + # If the file does not exist or there's an error in reading it, + # return an empty dictionary or other default value + return {} + + @staticmethod + def update_timestamps(directory): + """ + Update the access and modification times of all files in the given directory and its subdirectories. 
+ + :param directory: Path to the directory + """ + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith(".fastq.gz"): + continue + try: + file_path = Path(root) / file + current_time = time.time() + os.utime(file_path, (current_time, current_time)) + logging.info(f"Updated timestamp for: {file_path}") + except FileNotFoundError: + logging.info(f"File not found: {file_path}") + + # Example usage + # directory_path = "/path/to/your/directory" + # update_timestamps(directory_path) + + def consume_last_message_from_rabbitmq(self, json_backup_filename=str, queue=str): + pika_connection = pika.BlockingConnection( + pika.ConnectionParameters(host="localhost") + ) + channel = pika_connection.channel() + + # Fetch the message without auto acknowledgment + method_frame, header_frame, body = channel.basic_get( + queue=queue, auto_ack=False + ) + + if method_frame: + # Extract the timestamp from the header frame + if header_frame.timestamp: + timestamp = header_frame.timestamp + human_readable_timestamp = datetime.fromtimestamp( + timestamp / 1000.0 + ).strftime("%Y-%m-%d %H:%M:%S") + + else: + timestamp = None + # Convert timestamp to human-readable format if necessary + + # # Acknowledge the message after processing + # channel.basic_ack(delivery_tag=method_frame.delivery_tag) + pika_connection.close() + data = json.loads(body.decode("utf-8")) + print(data) + # if data dict is empty + if not data: + print("EXITING") + sys.exit("RabbitMQ queue NOT empty but message is") + # print("Loading from JSON file...") + # data_json = self.load_from_json(filename=json_backup_filename) + # file_timestamp = os.path.getmtime(json_backup_filename) + # file_timestamp = datetime.fromtimestamp(file_timestamp).strftime( + # "%Y-%m-%d %H:%M:%S" + # ) + # return data_json, file_timestamp + else: + print("RabbitMQ queue NOT empty and message is NOT empty") + print(data) + return data, human_readable_timestamp + + else: + if os.path.exists(json_backup_filename): + pika_connection.close() + print("No message available, RabbitMQ queue is empty") + print("Loading from JSON file...") + data_json = self.load_from_json(filename=json_backup_filename) + file_timestamp = os.path.getmtime(json_backup_filename) + file_timestamp = datetime.fromtimestamp(file_timestamp).strftime( + "%Y-%m-%d %H:%M:%S" + ) + + return data_json, file_timestamp + else: + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + return {"workflows": []}, current_time + + # Function to get all workflows + @staticmethod + def get_workflows(): + url = "http://localhost:8058/api/workflows" + response = requests.get(url) + if response.status_code == 200: + return response.json() + else: + raise Exception( + "Failed to fetch data: Status code {}".format(response.status_code) + ) + + # Function to find a workflow ID by name + @staticmethod + def find_workflow_id_by_name(workflows, name): + for workflow in workflows.get("workflows", []): + if workflow["name"] == name: + return workflow + return None + def check_unprocessed_folder(self): + connection = sqlite3.connect( + "/g/korbel2/weber/workspace/strandscape/.panoptes.db" + ) + + # Get the list of processed plates from rabbitmq + message = self.consume_last_message_from_rabbitmq( + json_backup_filename="watchdog/processing_status.json", queue="data_queue" + ) + unwanted = ["._.DS_Store", ".DS_Store", "config"] - list_runs_processed = sorted([e for e in os.listdir(data_location) if e not in unwanted]) - total_list_runs = sorted([e for e in os.listdir(path_to_watch) if e not in 
unwanted]) - unprocessed_plates = set(total_list_runs).difference(list_runs_processed) - print(list_runs_processed) - print(total_list_runs) - print(unprocessed_plates) - # for plate in ["2023-07-10-HLGVJAFX5"]: - for plate in unprocessed_plates: - # if plate not in plates_processing_status["plate"].values.tolist(): - # plates_processing_status_plate_dict = collections.defaultdict(dict) - nb_txt_gz_files = len(glob.glob(f"{path_to_watch}/{plate}/*.txt.gz")) - # if nb_txt_gz_files == 576: - # if (nb_txt_gz_files % 192) == 0: - print(f"PROCESSING {path_to_watch}/{plate}") - self.process_new_directory(f"{path_to_watch}/{plate}") - # else: - # print(f"Not possible to process {path_to_watch}/{plate}, containing {nb_txt_gz_files} txt.gz files") - - def process_new_directory(self, directory_path): + list_runs_processed = sorted( + [e for e in os.listdir(data_location) if e not in unwanted] + ) + + total_list_runs = sorted( + [e for e in os.listdir(path_to_watch) if e not in unwanted] + ) + # unprocessed_plates = sorted(list(set(total_list_runs).difference(list_runs_processed))) + unprocessed_plates = list() + # workflows_data = self.get_workflows() + workflows_data = message[0] + last_message_timestamp = message[1] + print(last_message_timestamp) + last_message_timestamp = datetime.strptime( + last_message_timestamp, "%Y-%m-%d %H:%M:%S" + ).strftime("%Y-%m-%d %H:%M:%S.%f") + + # last_message_timestamp = last_message_timestamp + + main_df = list() + if workflows_data: + for plate in total_list_runs: + # print(plate) + if plate.split("-")[0][:2] == "20": + # if plate.split("-")[0] == "2023": + # if plate.startswith("2023-11-09"): + # if plate == "2021-02-17-HM7LYAFX2": + # if plate == "2020-06-22-H5YMMAFX2": + directory_path = f"{path_to_watch}/{plate}" + prefixes, samples, plate_types = self.extract_samples_names( + glob.glob(f"{path_to_watch}/{plate}/*.txt.gz"), directory_path + ) + # print(prefixes, samples, plate_types) + if len(set(prefixes)) == 1: + # print(plate) + # if self.check_date(plate): + + # print(plate) + for sample_name, plate_type in zip(samples, plate_types): + if sample_name not in [ + "PDAC60590", + "PDAC60590MNI", + "DXR30hMaja", + "DXR42hMaja", + "GM19705", + ]: + run_id = f"{pipeline}--{plate}--{sample_name}" + workflow_id = self.find_workflow_id_by_name( + workflows_data, run_id + ) + + report = False + labels = False + multiqc_scratch = False + multiqc_scratch_timestamp = None + remaining_days = None + + if os.path.isfile( + f"{publishdir_location}/{plate}/{sample_name}/cell_selection/labels.tsv" + ): + labels = True + + if os.path.isfile( + f"{publishdir_location}/{plate}/{sample_name}/reports/{sample_name}_{pipeline}_report.zip" + ): + report = True + + if os.path.isfile( + f"{data_location}/{plate}/{sample_name}/multiqc/multiqc_report/multiqc_report.html" + ): + multiqc_scratch = True + multiqc_scratch_timestamp = os.path.getmtime( + f"{data_location}/{plate}/{sample_name}/multiqc/multiqc_report/multiqc_report.html" + ) + # to datetime and then strfmtime + multiqc_scratch_timestamp = datetime.fromtimestamp( + multiqc_scratch_timestamp + ) + # computing remaning days to reach 5 months between multiqc_scratch_timestamp and now + remaining_days = ( + datetime.now() - multiqc_scratch_timestamp + ).days + remaining_days = 150 - remaining_days + + multiqc_scratch_timestamp = ( + multiqc_scratch_timestamp.strftime("%Y-%m-%d") + ) + + if not workflow_id: + workflow_id = { + "id": "None", + "status": "None", + "started_at": last_message_timestamp, + "completed_at": 
last_message_timestamp, + "jobs_done": "None", + "jobs_total": "None", + } + else: + workflow_id["started_at"] = datetime.strptime( + workflow_id["started_at"], + "%a, %d %b %Y %H:%M:%S GMT", + ).strftime("%Y-%m-%d %H:%M:%S.%f") + + if workflow_id["completed_at"] is not None: + workflow_id["completed_at"] = datetime.strptime( + workflow_id["completed_at"], + "%a, %d %b %Y %H:%M:%S GMT", + ).strftime("%Y-%m-%d %H:%M:%S.%f") + + # turn the print into a dict + tmp_d = { + "panoptes_id": workflow_id["id"], + "plate": plate, + "sample": sample_name, + "report": report, + "labels": labels, + "multiqc_scratch": multiqc_scratch, + "multiqc_scratch_timestamp": multiqc_scratch_timestamp, + "remaining_days": remaining_days, + "status": workflow_id["status"], + "prefix": list(prefixes)[0], + "plate_type": plate_type, + "started_at": workflow_id["started_at"], + "completed_at": workflow_id["completed_at"], + "jobs_done": workflow_id["jobs_done"], + "jobs_total": workflow_id["jobs_total"], + } + main_df.append(tmp_d) + pd.options.display.max_rows = 999 + pd.options.display.max_colwidth = 30 + # pd.options.display.max_columns = 50 + main_df = pd.DataFrame(main_df) + # main_df.loc[(main_df["labels"] == True) & (main_df["report"] == True), "real_status"] = "Completed" + main_df.loc[ + (main_df["labels"] == True) & (main_df["report"] == False), + "real_status", + ] = "Report missing" + main_df.loc[ + (main_df["labels"] == False) & (main_df["report"] == True), + "real_status", + ] = "Error" + main_df.loc[ + (main_df["labels"] == False) & (main_df["report"] == False), + "real_status", + ] = "To process" + main_df.loc[ + (main_df["labels"] == True) + & (main_df["report"] == True) + & (main_df["status"] == "None"), + "real_status", + ] = "Error" + main_df.loc[ + (main_df["labels"] == True) + & (main_df["report"] == True) + & (main_df["status"] == "Running"), + "real_status", + ] = "Running" + main_df.loc[ + (main_df["labels"] == True) + & (main_df["report"] == True) + & (main_df["status"] == "Done"), + "real_status", + ] = "Completed" + main_df["real_status"] = main_df["real_status"].fillna( + "Error (to investigate))" + ) + + print(main_df) + + dry_run_db = False + + if dry_run_db is False: + cursor = connection.cursor() + + assert ( + main_df.loc[ + (main_df["labels"] == False) & (main_df["report"] == True) + ].shape[0] + == 0 + ), "Error in table, samples have report done without the completion of the pipeline" + + logging.info( + "Correcting status of plates with report.zip and labels.tsv" + ) + + for row in main_df.loc[ + (main_df["labels"] == True) + & (main_df["report"] == True) + & (main_df["status"] != "Done") + ].to_dict("records"): + logging.info(row) + panoptes_entry = f"{pipeline}--{row['plate']}--{row['sample']}" + workflow_id = row["panoptes_id"] + + # if workflow_id != "None": + # command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + # subprocess.run(command, shell=True, check=True) + + panoptes_data = [ + e for e in workflows_data["workflows"] if e["id"] == workflow_id + ] + + if panoptes_data: + panoptes_data = panoptes_data[0] + if "completed_at" not in panoptes_data: + panoptes_data["completed_at"] = last_message_timestamp + + command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + subprocess.run(command, shell=True, check=True) + + else: + logging.info( + "Panoptes data not found for workflow entry: %s", row + ) + panoptes_data = { + "started_at": 
last_message_timestamp, + "completed_at": last_message_timestamp, + "jobs_done": "1", + "jobs_total": "1", + } + + print(row) + + cursor.execute( + """ + INSERT INTO workflows (name, status, done, total, started_at, completed_at) + VALUES (?, ?, ?, ?, ?, ?) + """, + ( + panoptes_entry, + "Done", + panoptes_data["jobs_done"], + panoptes_data["jobs_total"], + panoptes_data["started_at"], + panoptes_data["completed_at"], + ), + ) + connection.commit() + + logging.info( + "Processing plates without labels.tsv or outdated without report.zip" + ) + + for row in main_df.loc[ + (main_df["labels"] == False) & (main_df["report"] == False) + ].to_dict("records"): + logging.info(row) + + # panoptes = True if row["status"] == "None" else False + panoptes = True + + if dry_run == "False": + if row["panoptes_id"] != "None": + workflow_id = row["panoptes_id"] + panoptes_data = [ + e + for e in workflows_data["workflows"] + if e["id"] == workflow_id + ] + command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + subprocess.run(command, shell=True, check=True) + + self.process_new_directory( + "/".join([path_to_watch, row["plate"]]), + row["prefix"], + row["sample"], + row["plate_type"], + report_only=False, + panoptes=panoptes, + ) + + logging.info( + "Processing plates not present anymore on scratch and without report.zip" + ) + + for row in main_df.loc[ + # (main_df["multiqc_scratch"] == False) + (main_df["multiqc_scratch"] == False) + & (main_df["report"] == False) + ].to_dict("records"): + logging.info(row) + + # panoptes = True if row["status"] == "None" else False + panoptes = True + + if dry_run == "False": + if row["panoptes_id"] != "None": + workflow_id = row["panoptes_id"] + panoptes_data = [ + e + for e in workflows_data["workflows"] + if e["id"] == workflow_id + ] + command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + subprocess.run(command, shell=True, check=True) + + self.process_new_directory( + "/".join([path_to_watch, row["plate"]]), + row["prefix"], + row["sample"], + row["plate_type"], + report_only=False, + panoptes=panoptes, + ) + + logging.info( + "Processing plates without report.zip but with labels.tsv and still on scratch" + ) + + for row in main_df.loc[ + (main_df["labels"] == True) + & (main_df["multiqc_scratch"] == True) + & (main_df["remaining_days"] > 2) + & (main_df["report"] == False) + ].to_dict("records"): + logging.info(row) + + # panoptes = True if row["status"] == "None" else False + panoptes = False + panoptes_entry = f"{pipeline}--{row['plate']}--{row['sample']}" + + if dry_run == "False": + self.process_new_directory( + "/".join([path_to_watch, row["plate"]]), + row["prefix"], + row["sample"], + row["plate_type"], + report_only=True, + panoptes=panoptes, + ) + + if row["panoptes_id"] != "None": + workflow_id = row["panoptes_id"] + panoptes_data = [ + e + for e in workflows_data["workflows"] + if e["id"] == workflow_id + ][0] + + if panoptes_data: + command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + subprocess.run(command, shell=True, check=True) + + cursor.execute( + """ + INSERT INTO workflows (name, status, done, total, started_at, completed_at) + VALUES (?, ?, ?, ?, ?, ?) 
+ """, + ( + panoptes_entry, + "Done", + panoptes_data["jobs_done"], + panoptes_data["jobs_total"], + panoptes_data["started_at"], + panoptes_data["completed_at"], + ), + ) + connection.commit() + + else: + logging.info( + "Panoptes data not found for workflow entry: %s", row + ) + + logging.info( + "Updating /scratch files timestamps that are close to 6 months" + ) + + for row in main_df.loc[ + (main_df["labels"] == True) + & (main_df["multiqc_scratch"] == True) + & (main_df["remaining_days"] < 10) + ].to_dict("records"): + logging.info(row) + self.update_timestamps( + f"{data_location}/{row['plate']}/{row['sample']}" + ) + + def process_new_directory( + self, + directory_path, + prefix, + sample_name, + plate_type, + report_only=False, + panoptes=False, + ): """Process the new directory, check for .txt.gz files and execute snakemake command if conditions are met.""" # Poll the directory until 576 files appear or a timeout is reached @@ -80,67 +651,33 @@ def process_new_directory(self, directory_path): num_files = len(txt_gz_files) # # If the desired number of files is found or timeout is reached, break the loop - # if (num_files % 192) == 0 or time.time() - start_time > timeout: + # if time.time() - start_time > timeout: # break - # # Sleep for a while before the next poll + # # # # # Sleep for a while before the next poll # time.sleep(5) # Sleep for 5 seconds # Process the found .txt.gz files - self.process_txt_gz_files(directory_path, txt_gz_files, num_files) - - def process_txt_gz_files(self, directory_path, txt_gz_files, num_files): - """Process the found .txt.gz files and execute snakemake command if conditions are met.""" - - if (num_files % 192) == 0: - logging.info(f"The new directory contains exactly {num_files} .txt.gz files.") - self.execute_snakemake(directory_path, txt_gz_files) - - else: - logging.info(f"The new directory contains {str(num_files)} .txt.gz files, not 576.") - - def execute_snakemake(self, directory_path, txt_gz_files): - """Execute the snakemake command based on the found prefixes.""" - pattern = re.compile(r"_lane1(.*?)(iTRU|PE20)(.*?)([A-H]?)(\d{2})(?:_1_|_2_)") - prefixes = list() - - for file_path in sorted(txt_gz_files): - match = pattern.search(file_path) - # print(file_path, match) - if match: - prefix = match.group(2) - # print(sample_name) - # prefix = match.group(2) + match.group(4) + match.group(5) # Concatenate the prefix, optional letter, and two digits - prefixes.append(prefix) - # indexes.add(index) - # pattern = re.compile(r"(iTRU|PE20)\d{3}") - # prefixes = set() - # - # for file_path in txt_gz_files: - # match = pattern.search(file_path) - # print(file_path) - # if match: - # prefix = match.group()[:4] # Get the first 4 characters, which is the prefix - # prefixes.add(prefix) - - if len(set(prefixes)) > 1: - logging.info("Multiple different prefixes found: %s", prefixes) - elif prefixes: - for j, file_path in enumerate(sorted(txt_gz_files)): - if (j + 1) % 192 == 0: - match = pattern.search(file_path) - sample_name = match.group(1) - cell = f"{sample_name}{prefixes[0]}{match.group(3)}{match.group(4)}96" - # print(file_path, j, match, sample_name, cell) - # print([match.group(i) for i in range(6)]) - # self.execute_command(directory_path, prefixes[0], sample_name) - - # Debug/dev purpose - target a specific file - self.execute_command(directory_path, prefixes[0], sample_name, cell) - else: - logging.info("No match found in any file.") + # self.process_txt_gz_files(directory_path, txt_gz_files, num_files) + self.execute_command( + 
directory_path, + prefix, + sample_name, + plate_type, + report_only=report_only, + panoptes=panoptes, + ) - def execute_command(self, directory_path, prefix, sample, cell=None): + def execute_command( + self, + directory_path, + prefix, + sample, + plate_type, + report_only=False, + cell=None, + panoptes=False, + ): """Execute the command.""" # Change directory and run the snakemake command @@ -150,12 +687,15 @@ def execute_command(self, directory_path, prefix, sample, cell=None): f"{snakemake_binary}", "-s", "workflow/Snakefile", + "--set-resources", + "ashleys_mark_duplicates:partition=bigmem", "--config", "genecore=True", f"genecore_prefix={genecore_prefix}", f"genecore_date_folder={date_folder}", f"genecore_regex_element={prefix}", f'samples_to_process="[{sample}]"', + f"plate_type={plate_type}", "multistep_normalisation=True", "MultiQC=True", "split_qc_plot=False", @@ -165,6 +705,7 @@ def execute_command(self, directory_path, prefix, sample, cell=None): "ashleys_pipeline_only=True", "ashleys_pipeline=True", "--nolock", + "--rerun-incomplete", "--rerun-triggers", "mtime", ] @@ -177,32 +718,62 @@ def execute_command(self, directory_path, prefix, sample, cell=None): "--force", ] - logging.info("Running command: %s", " ".join(cmd + profile_dry_run + dry_run_options)) + if report_only is False: + logging.info( + "Running command: %s", " ".join(cmd + profile_dry_run + dry_run_options) + ) + + process = subprocess.Popen( + cmd + profile_dry_run + dry_run_options, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + cwd=working_directory, + env=my_env, + ) + + # Variable to store the penultimate line + penultimate_line = "" + + # Read the output line by line in real-time + for line in iter(process.stdout.readline, ""): + logging.info(line.strip()) # log line in real-time + if line.strip(): # If line is not blank + penultimate_line = line.strip() + + # Wait for the subprocess to finish + process.wait() + logging.info("Return code: %s", process.returncode) + dryrun_check = True if (str(process.returncode) == str(0)) else False - process = subprocess.Popen( - cmd + profile_dry_run + dry_run_options, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True - ) - - # Variable to store the penultimate line - penultimate_line = "" - - # Read the output line by line in real-time - for line in iter(process.stdout.readline, ""): - logging.info(line.strip()) # log line in real-time - if line.strip(): # If line is not blank - penultimate_line = line.strip() - - # Wait for the subprocess to finish - process.wait() - logging.info("Return code: %s", process.returncode) - - # Check the penultimate line - if str(process.returncode) == str(0): - self.run_second_command(cmd, profile_slurm, data_location, date_folder, sample, cell) + else: + dryrun_check = True + + if dryrun_check is True: + self.run_second_command( + cmd, + profile_slurm, + data_location, + date_folder, + sample, + report_only, + cell, + panoptes, + ) else: logging.info("\nThe output is not as expected.") - def run_second_command(self, cmd, profile_slurm, data_location, date_folder, sample, cell=None): + def run_second_command( + self, + cmd, + profile_slurm, + data_location, + date_folder, + sample, + report_only=False, + cell=None, + panoptes=False, + ): """Run the second command and write the output to a log file.""" report_location = f"{publishdir_location}/{date_folder}/{sample}/reports/{sample}_ashleys-qc-pipeline_report.zip" @@ -213,9 +784,6 @@ def run_second_command(self, cmd, 
profile_slurm, data_location, date_folder, sam "/g/korbel2/weber/workspace/mosaicatcher-update/workflow/report/custom-stylesheet.css", ] - # Panoptes - pipeline = "ashleys-qc-pipeline" - wms_monitor_options = "http://127.0.0.1:8058" run_id = f"{pipeline}--{date_folder}--{sample}" wms_monitor_renaming_option = f"name={run_id}" @@ -229,9 +797,6 @@ def run_second_command(self, cmd, profile_slurm, data_location, date_folder, sam # print(cmd + profile_slurm + report_options) - logging.info("\nThe output is as expected.") - logging.info("Running command: %s", " ".join(cmd + wms_monitor_args + profile_dry_run)) - os.makedirs("watchdog/logs/per-run", exist_ok=True) # Get the current date and time @@ -240,24 +805,58 @@ def run_second_command(self, cmd, profile_slurm, data_location, date_folder, sam # Convert it to a string current_time = now.strftime("%Y%m%d%H%M%S") - with open(f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}.log", "w") as f: - process2 = subprocess.Popen(cmd + wms_monitor_args + profile_dry_run, stdout=f, stderr=f, universal_newlines=True) - # process2 = subprocess.Popen(cmd + profile_slurm, stdout=f, stderr=f, universal_newlines=True) - process2.wait() - - logging.info("Return code: %s", process2.returncode) + if panoptes is True: + final_cmd = cmd + wms_monitor_args + profile_slurm + else: + final_cmd = (cmd + profile_slurm,) + + if report_only is False: + logging.info("\nThe output is as expected.") + # logging.info("Running command: %s", " ".join(cmd + profile_slurm)) + + logging.info( + "Running command: %s", " ".join(cmd + wms_monitor_args + profile_slurm) + ) + + with open( + f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}.log", + "w", + ) as f: + # process2 = subprocess.Popen(cmd + wms_monitor_args + profile_dry_run, stdout=f, stderr=f, universal_newlines=True, cwd=working_directory, env=my_env) + process2 = subprocess.Popen( + final_cmd, + stdout=f, + stderr=f, + universal_newlines=True, + cwd=working_directory, + env=my_env, + ) + process2.wait() + + logging.info("Return code: %s", process2.returncode) logging.info("Generating ashleys report.") os.makedirs(os.path.dirname(report_location), exist_ok=True) # os.makedirs(f"{publishdir_location}/{date_folder}/{sample}/reports/", exist_ok=True) - logging.info("Running command: %s", " ".join(cmd + profile_slurm + report_options)) + logging.info( + "Running command: %s", " ".join(cmd + profile_dry_run + report_options) + ) # Change the permissions of the new directory # subprocess.run(["chmod", "-R", "777", f"{data_location}/{date_folder}"]) - with open(f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}_report.log", "w") as f: - print(cmd + profile_slurm + report_options) - process2 = subprocess.Popen(cmd + profile_dry_run + report_options, stdout=f, stderr=f, universal_newlines=True) - # process2 = subprocess.Popen(cmd + profile_slurm + report_options, stdout=f, stderr=f, universal_newlines=True) + with open( + f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}_report.log", + "w", + ) as f: + process2 = subprocess.Popen( + cmd + profile_dry_run + report_options, + stdout=f, + stderr=f, + universal_newlines=True, + cwd=working_directory, + env=my_env, + ) + # process2 = subprocess.Popen(cmd + profile_slurm + report_options, stdout=f, stderr=f, universal_newlines=True, cwd=working_directory, env=my_env) process2.wait() logging.info("Return code: %s", process2.returncode) From dd1102329e60eeddd092b37dfd9c1dfc93b834c7 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 
Dec 2023 10:44:35 +0000 Subject: [PATCH 2/8] merge blacklist fix, dump config fix, assertion to check labels.tsv x selected/ x scNOVA input lists, labels at later stage to prevent working on modified list of cells, other minor fixes --- .gitignore | 4 +- afac/update_timestamps.py | 25 + config/config.yaml | 3 + config/config_metadata.yaml | 6 + .../Dockerfile-2.2.2.dockerfile | 227 +++++++++ .../Dockerfile-2.2.3.dockerfile | 299 ++++++++++++ .../add_T2T_part_to_Dockerfile.sh | 35 ++ watchdog_pipeline/watchdog_pipeline.py | 13 +- workflow/Snakefile | 26 +- workflow/envs/scNOVA/scNOVA_DL.yaml | 1 + workflow/rules/aggregate_fct.smk | 2 +- workflow/rules/common.smk | 450 +++++------------- workflow/rules/count.smk | 3 +- workflow/rules/plots.smk | 7 +- workflow/rules/regenotyping.smk | 1 + workflow/rules/scNOVA.smk | 19 + workflow/rules/utils.smk | 15 + .../scripts/normalization/merge-blacklist.py | 3 +- .../scNOVA_scripts/assert_list_of_cells.py | 57 +++ workflow/scripts/utils/dump_config.py | 40 +- 20 files changed, 852 insertions(+), 384 deletions(-) create mode 100644 afac/update_timestamps.py create mode 100644 github-actions-runner/Dockerfile-2.2.2.dockerfile create mode 100644 github-actions-runner/Dockerfile-2.2.3.dockerfile create mode 100644 github-actions-runner/add_T2T_part_to_Dockerfile.sh create mode 100644 workflow/scripts/scNOVA_scripts/assert_list_of_cells.py diff --git a/.gitignore b/.gitignore index 406d336a..a140b637 100644 --- a/.gitignore +++ b/.gitignore @@ -218,4 +218,6 @@ LOGS_DEV/ # scTRIP multiplot workflow/scripts/plotting/scTRIP_multiplot/scTRIPmultiplot -workflow/config/scTRIP_multiplot.ok \ No newline at end of file +workflow/config/scTRIP_multiplot.ok +args.output +scNOVA_env_costea.yaml diff --git a/afac/update_timestamps.py b/afac/update_timestamps.py new file mode 100644 index 00000000..84cb551a --- /dev/null +++ b/afac/update_timestamps.py @@ -0,0 +1,25 @@ +import os, sys +import time +from pathlib import Path + + +def update_timestamps(directory): + """ + Update the access and modification times of all files in the given directory and its subdirectories. 
+ + :param directory: Path to the directory + """ + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith(".fastq.gz"): + continue + file_path = Path(root) / file + current_time = time.time() + print(file_path) + os.utime(file_path, (current_time, current_time)) + print(f"Updated timestamp for: {file_path}") + + +# Example usage +directory_path = sys.argv[1] +update_timestamps(directory_path) diff --git a/config/config.yaml b/config/config.yaml index 5a3b5098..3809643d 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -14,6 +14,9 @@ email: "" # List of samples to process if multiple are specified samples_to_process: [] +# Plate size +plate_size: 96 + # -------------------------------------------------------- # Data location & I/O # -------------------------------------------------------- diff --git a/config/config_metadata.yaml b/config/config_metadata.yaml index 97c1af02..bff78ba9 100644 --- a/config/config_metadata.yaml +++ b/config/config_metadata.yaml @@ -135,3 +135,9 @@ use_strandscape_labels:: required: False default: False lint_check: False +plate_size:: + desc: "Plate size used for the sequencing (96/384)" + type: int + required: True + default: 96 + lint_check: False diff --git a/github-actions-runner/Dockerfile-2.2.2.dockerfile b/github-actions-runner/Dockerfile-2.2.2.dockerfile new file mode 100644 index 00000000..06f3ea66 --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.2.dockerfile @@ -0,0 +1,227 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL io.github.snakemake.conda_env_hash="77eaa388d65d5205b87324fb0adb89561bc0e532a328995990a1d580aeb894ae" + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: 
https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p /conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # 
SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba clean --all -y diff --git a/github-actions-runner/Dockerfile-2.2.3.dockerfile b/github-actions-runner/Dockerfile-2.2.3.dockerfile new file mode 100644 index 00000000..aa4d1c42 --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.3.dockerfile @@ -0,0 +1,299 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb9a77747c17f62707ea2f67a667" + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p 
/conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p /conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Conda environment: +# source: 
workflow/envs/scNOVA/scNOVA_DL.yaml +# prefix: /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +# name: scNOVA_DL +# channels: +# - conda-forge +# - anaconda +# dependencies: +# - tensorflow=1.15.0 +# - scikit-learn=0.21.3 +# - python=3.7.4 +# - matplotlib=3.1.1 +# - pandas=0.25.3 +# - h5py=2.10.0 +# - numpy +# # scNOVA archive +# - unzip +# # Fix +RUN mkdir -p /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +COPY workflow/envs/scNOVA/scNOVA_DL.yaml /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_R.yaml +# prefix: /conda-envs/193f60d48796dd17eb847ea689b863a9 +# name: scNOVA +# channels: +# - bioconda +# - conda-forge +# - r +# dependencies: +# - bioconductor-deseq2=1.30.0 +# - r-matrixstats=0.58.0 +# - r-pheatmap=1.0.12 +# - r-gplots=3.1.1 +# - r-umap=0.2.7.0 +# - r-rtsne=0.15 +# - r-factoextra=1.0.7 +# - r-pracma=2.3.3 +# - bioconductor-chromvar=1.12.0 +# - r-nabor=0.5.0 +# - bioconductor-motifmatchr=1.12.0 +# - bioconductor-bsgenome.hsapiens.ucsc.hg38=1.4.3 +# - bioconductor-jaspar2016=1.18.0 +# - r-codetools=0.2_18 +# - r-fitdistrplus +# - r-doparallel +# - r-foreach +RUN mkdir -p /conda-envs/193f60d48796dd17eb847ea689b863a9 +COPY workflow/envs/scNOVA/scNOVA_R.yaml /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml +# prefix: /conda-envs/ca9641251a8cb0057003875ad776c49f +# name: scNOVA_bioinfo_tools +# channels: +# - conda-forge +# - bioconda +# - anaconda +# dependencies: +# - samtools +# - biobambam +# - bedtools +RUN mkdir -p /conda-envs/ca9641251a8cb0057003875ad776c49f +COPY workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba env create --prefix /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 --file /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml && \ + mamba env create --prefix /conda-envs/193f60d48796dd17eb847ea689b863a9 --file /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml && \ + mamba env create --prefix /conda-envs/ca9641251a8cb0057003875ad776c49f --file /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml && \ + mamba clean --all -y +# CUSTOM PART +RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/ +COPY /workflow/scripts/utils/install_R_package.R /conda-envs/ +RUN chmod -R 0777 
/conda-envs/598c87b6c764d05e0c66953cc67f2931/lib/R/library && /conda-envs/598c87b6c764d05e0c66953cc67f2931/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz diff --git a/github-actions-runner/add_T2T_part_to_Dockerfile.sh b/github-actions-runner/add_T2T_part_to_Dockerfile.sh new file mode 100644 index 00000000..7c631edd --- /dev/null +++ b/github-actions-runner/add_T2T_part_to_Dockerfile.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Check if a Dockerfile path is provided +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +DOCKERFILE=$1 + +# Check if the Dockerfile exists +if [ ! -f "$DOCKERFILE" ]; then + echo "Dockerfile not found: $DOCKERFILE" + exit 1 +fi + +# Extract the R environment variable +Renv=$(grep -P "\/rtools.*environment\.yaml" "$DOCKERFILE" | sed "s/\//\t/g" | cut -f 5) + +# Check if Renv is extracted +if [ -z "$Renv" ]; then + echo "R environment variable not found in the Dockerfile." + exit 1 +fi + +# Append custom steps to the Dockerfile +{ + echo '\n' + echo "# CUSTOM PART" + echo "RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/" + echo "COPY /workflow/scripts/utils/install_R_package.R /conda-envs/" + echo "RUN chmod -R 0777 /conda-envs/$Renv/lib/R/library && /conda-envs/$Renv/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz" +} >>"$DOCKERFILE" + +echo "Custom steps added to $DOCKERFILE" diff --git a/watchdog_pipeline/watchdog_pipeline.py b/watchdog_pipeline/watchdog_pipeline.py index c0f259f9..4c1a6614 100644 --- a/watchdog_pipeline/watchdog_pipeline.py +++ b/watchdog_pipeline/watchdog_pipeline.py @@ -46,7 +46,8 @@ ] profile_dry_run = [ "--profile", - "workflow/snakemake_profiles/local/conda_singularity/", + "workflow/snakemake_profiles/local/conda/", + # "workflow/snakemake_profiles/local/conda_singularity/", "-c", "1", ] @@ -272,7 +273,7 @@ def check_unprocessed_folder(self): # last_message_timestamp = last_message_timestamp main_df = list() - if workflows_data: + if len(workflows_data) > 0: for plate in total_list_runs: # print(plate) if plate.split("-")[0][:2] == "20": @@ -383,6 +384,7 @@ def check_unprocessed_folder(self): pd.options.display.max_rows = 999 pd.options.display.max_colwidth = 30 # pd.options.display.max_columns = 50 + main_df = pd.DataFrame(main_df) # main_df.loc[(main_df["labels"] == True) & (main_df["report"] == True), "real_status"] = "Completed" main_df.loc[ @@ -418,7 +420,7 @@ def check_unprocessed_folder(self): main_df["real_status"] = main_df["real_status"].fillna( "Error (to investigate))" ) - + print(workflows_data["workflows"]) print(main_df) dry_run_db = False @@ -454,6 +456,9 @@ def check_unprocessed_folder(self): e for e in workflows_data["workflows"] if e["id"] == workflow_id ] + print(panoptes_entry) + print(panoptes_data) + if panoptes_data: panoptes_data = panoptes_data[0] if "completed_at" not in panoptes_data: @@ -530,7 +535,7 @@ def check_unprocessed_folder(self): for row in main_df.loc[ # (main_df["multiqc_scratch"] == False) (main_df["multiqc_scratch"] == False) - & (main_df["report"] == False) + # & (main_df["report"] == False) ].to_dict("records"): logging.info(row) diff --git a/workflow/Snakefile b/workflow/Snakefile index 5acf31fe..4262bd44 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,19 +19,29 @@ if config["ashleys_pipeline"] is True: module ashleys_qc: snakefile: - github( - "friendsofstrandseq/ashleys-qc-pipeline", - 
path="workflow/Snakefile", - tag=str(config["ashleys_pipeline_version"]), - ) + "../../ashleys-qc-pipeline/workflow/Snakefile" + # github( + # "friendsofstrandseq/ashleys-qc-pipeline", + # path="workflow/Snakefile", + # tag=str(config["ashleys_pipeline_version"]), + # ) config: config use rule * from ashleys_qc as ashleys_* - localrules: - ashleys_genecore_symlink, - symlink_selected_bam, + if config["ashleys_pipeline_only"] is True: + + localrules: + ashleys_genecore_symlink, + ashleys_symlink_selected_bam, + + else: + + localrules: + ashleys_genecore_symlink, + ashleys_symlink_selected_bam, + symlink_selected_bam, else: diff --git a/workflow/envs/scNOVA/scNOVA_DL.yaml b/workflow/envs/scNOVA/scNOVA_DL.yaml index 8530fdf8..775c36d8 100644 --- a/workflow/envs/scNOVA/scNOVA_DL.yaml +++ b/workflow/envs/scNOVA/scNOVA_DL.yaml @@ -12,3 +12,4 @@ dependencies: - numpy # scNOVA archive - unzip + # Fix diff --git a/workflow/rules/aggregate_fct.smk b/workflow/rules/aggregate_fct.smk index 278d45b9..5de9c6e1 100644 --- a/workflow/rules/aggregate_fct.smk +++ b/workflow/rules/aggregate_fct.smk @@ -169,7 +169,7 @@ def aggregate_cells_scTRIP_multiplot(wildcards): cell_list = df.cell.tolist() return expand( - "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png", + "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.pdf", folder=config["data_location"], sample=wildcards.sample, cell=cell_list, diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 4c42e1b8..7af4b6b4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -11,6 +11,12 @@ import os, sys os.environ["LC_CTYPE"] = "C" +# print(config["data_location"]) + +if config["ashleys_pipeline"] is True and config["genecore"] is True: + config["data_location"] = "/".join(config["data_location"].split("/")[:-1]) + + envvars: "LC_CTYPE", @@ -131,6 +137,9 @@ class HandleInput: genecore=False, genecore_path=str, ): + # print(input_path) + # print(genecore_path) + # print("\n") if genecore is False: df_config_files = self.handle_input_data(thisdir=input_path, bam=bam) elif genecore is True: @@ -154,56 +163,69 @@ class HandleInput: Returns: _type_: _description_ """ - complete_df_list = list() + from pprint import pprint + from collections import Counter - # List of folders/files to not consider (restrict to samples only) - l = sorted( - [ - e - for e in os.listdir( - "{genecore_prefix}/{date_folder}".format( - genecore_prefix=config["genecore_prefix"], - date_folder=config["genecore_date_folder"], - ) - ) - if e.endswith(".txt.gz") - ] + directory_path = f"{config['genecore_prefix']}/{config['genecore_date_folder']}" + + l = sorted([e for e in os.listdir(directory_path) if e.endswith(".txt.gz")]) + + complete_df_list = list() + # print(thisdir) + genecore_prefix = config["genecore_prefix"] + date_folder = config["genecore_date_folder"] + # print(f"{genecore_prefix}/{date_folder}") + + # Pattern to extract sample name and index + pattern = re.compile(r"(.*_lane1)(.*?)(iTRU|PE20)(.*?)(\d{2})(?:_1_|_2_)") + + samples = list() + prefixes = list() + indexes = list() + plate_types = list() + d_master = collections.defaultdict( + lambda: { + "indexes": set(), + "file_prefix": "", + "plate_type": "", + "index_pattern": "", + } ) - # print(l) - # Create a list of files to process for each sample - d_master = collections.defaultdict(dict) - sub_l = list() - for j, e in enumerate(l): - sub_l.append(e) - if (j + 1) % 192 == 0: - common_element = findstem(sub_l) - l_elems = common_element.split("lane1") - # print(sub_l) - # 
print(common_element) - # print(l_elems) - # print(l_elems[1].split("{regex_element}".format(regex_element=config["genecore_regex_element"])) - prefix = l_elems[0] - # technician_name = l_elems[0].split("_")[-2] - sample = l_elems[1].split( - "{regex_element}".format( - regex_element=config["genecore_regex_element"] - ) - )[0] - index = l_elems[1].split( - "{regex_element}".format( - regex_element=config["genecore_regex_element"] + + # First pass: Count occurrences of each sample_name + file_counts_per_sample = Counter() + for file_path in l: + match = pattern.search(file_path) + if match: + sample_name = match.group(2) + file_counts_per_sample[sample_name] += 1 + + # Second pass: Process files and determine plate type per sample + for j, file_path in enumerate(sorted(l)): + match = pattern.search(file_path) + if match: + sample_name = match.group(2) + index = match.group(4) + indexes.append(index) + d_master[sample_name]["indexes"].add(index) + file_count = file_counts_per_sample[sample_name] + + # Determine plate type using modulo 96 operation + if file_count % 96 != 0: + raise ValueError( + f"Invalid file count for sample {sample_name} with file count {file_count}. Must be a multiple of 96." ) - )[1] - # pe_index = common_element[-1] - sub_l = list() - - d_master[sample]["prefix"] = prefix - # d_master[sample]["technician_name"] = technician_name - d_master[sample]["index"] = index - d_master[sample]["common_element"] = common_element - # from pprint import pprint - # pprint(d_master) - # exit() + plate_type = int(file_count / 2) + + if (j + 1) % file_count == 0: + prefixes.append(match.group(3)) + d_master[sample_name]["file_prefix"] = match.group(1) + d_master[sample_name]["index_pattern"] = match.group(3) + plate = directory_path.split("/")[-1] + samples.append(sample_name) + plate_types.append(plate_type) + d_master[sample_name]["plate_type"] = plate_type + samples_to_process = ( config["samples_to_process"] if len(config["samples_to_process"]) > 0 @@ -220,8 +242,8 @@ class HandleInput: "{data_location}/{sample}/fastq/{sample}{regex_element}{index}{cell_nb}.{pair}.fastq.gz", data_location=config["data_location"], sample=sample, - regex_element=config["genecore_regex_element"], - index=d_master[sample]["index"], + regex_element=d_master[sample]["index_pattern"], + index=d_master[sample]["indexes"], cell_nb=[str(e).zfill(2) for e in list(range(1, 97))], pair=["1", "2"], ) @@ -229,7 +251,8 @@ class HandleInput: if sample in samples_to_process ] genecore_list = [sub_e for e in genecore_list for sub_e in e] - # pprint(genecore_list) + # pprint(d_master) + complete_df_list = list() for sample in d_master: @@ -248,11 +271,12 @@ class HandleInput: df["Full_path"] = df[["Folder", "File"]].apply( lambda r: f"{r['Folder']}/{r['File']}.fastq.gz", axis=1 ) + df["Genecore_path"] = df["File"].apply( - lambda r: f"{config['genecore_prefix']}/{config['genecore_date_folder']}/{d_master[sample]['prefix']}lane1{r.replace('.', '_')}_sequence.txt.gz" + lambda r: f"{config['genecore_prefix']}/{config['genecore_date_folder']}/{d_master[sample]['file_prefix']}{r.replace('.', '_')}_sequence.txt.gz" ) df["Genecore_file"] = df["File"].apply( - lambda r: f"{d_master[sample]['prefix']}lane1{r.replace('.', '_')}" + lambda r: f"{d_master[sample]['file_prefix']}{r.replace('.', '_')}" ) df["Genecore_file"] = df["Genecore_file"].apply( lambda r: "_".join(r.split("_")[:-1]) @@ -375,12 +399,18 @@ def findstem(arr): # Create configuration file with samples +# print("config['data_location']") +# 
print(config["data_location"]) + c = HandleInput( input_path=config["data_location"], - genecore_path="{genecore_prefix}/{genecore_date_folder}".format( + genecore_path="{genecore_prefix}".format( genecore_prefix=config["genecore_prefix"], - genecore_date_folder=config["genecore_date_folder"], ), + # genecore_path="{genecore_prefix}/{genecore_date_folder}".format( + # genecore_prefix=config["genecore_prefix"], + # genecore_date_folder=config["genecore_date_folder"], + # ), output_path="{data_location}/config/config_df.tsv".format( data_location=config["data_location"] ), @@ -532,8 +562,12 @@ def onsuccess_fct(log): log, "SUCCESS", config, config_metadata ) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - SUCCESS" {} < {}'.format( - config["version"], config["data_location"], config["email"], log_path_new + 'mail -s "[smk-wf-catalog/mosaicatcher-pipeline] v{} - [{}--{}] - SUCCESS" {} < {}'.format( + config["version"], + config["data_location"].split("/")[-1], + ";".join(samples), + config["email"], + log_path_new, ) ) @@ -546,8 +580,12 @@ def onerror_fct(log): log, "ERROR", config, config_metadata ) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - ERRROR" {} < {}'.format( - config["version"], config["data_location"], config["email"], log_path_new + 'mail -s "[smk-wf-catalog/mosaicatcher-pipeline] v{} - [{}--{}] - ERROR" {} < {}'.format( + config["version"], + config["data_location"].split("/")[-1], + ";".join(samples), + config["email"], + log_path_new, ) ) @@ -564,308 +602,26 @@ def get_scnova_final_output(wildcards): # abbreviate_names = False l = [ - # expand( - # "{folder}/{sample}/scNOVA_input_user/{clone}_sv_calls_all_print.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # clone=clones[wildcards.sample], - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_orientation_CN_correct0.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_input_user/sv_calls_all_print_CREs.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_sort_geneid.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_chr_length_{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_chr_length_{sample}_sc.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # 
"{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab_final.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{sample}_{clone}_orientation_norm_qc.pdf", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_orientation_norm.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab_final.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{sample}_{clone}_Resid_orientation_qc.pdf", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_Resid_orientation.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_all_orientation_norm_var_GC_CpG_RT_T_comb3_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Expression_all_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_all_TSS_matrix_woM_all_RT_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train80_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train40_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train20_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train5_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train80_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # 
"{folder}/{sample}/scNOVA_result_CNN/DNN_train40_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train20_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train5_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_plots/Result_scNOVA_plots_{sample}.pdf", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/result_PLSDA_{sample}.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_plots/Result_scNOVA_plots_{sample}_alternative_PLSDA.pdf", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort_num.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort_num_sort_for_chromVAR.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W1.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W2.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C2.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W.bam.bai", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C.bam.bai", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleA/result.H1.bam", - # 
folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleB/result.H2.bam", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_input_user/strandphaser_output_copy.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_haplo/Deeptool_DHS_2kb_H1H2.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_haplo/Deeptool_DHS_2kb_H1H2_sort.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_result_haplo/Deeptool_Genebody_H1H2.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_haplo/Deeptool_Genebody_H1H2_sort.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), ] l = [sub_e for e in l for sub_e in e] return l @@ -1096,16 +852,26 @@ def get_all_plots(wildcards): ), ) - # Run summary section + # Config section l_outputs.extend( expand( - "{folder}/{sample}/config/run_summary.txt", + "{folder}/{sample}/config/config.yaml", folder=config["data_location"], sample=wildcards.sample, ), ) + # Run summary section + + # l_outputs.extend( + # expand( + # "{folder}/{sample}/config/run_summary.txt", + # folder=config["data_location"], + # sample=wildcards.sample, + # ), + # ) + # from pprint import pprint # pprint(l_outputs) return l_outputs diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk index 080d64b7..f1a0e74e 100755 --- a/workflow/rules/count.smk +++ b/workflow/rules/count.smk @@ -136,6 +136,7 @@ rule symlink_selected_bam: rule remove_unselected_bam: input: + labels="{folder}/{sample}/cell_selection/labels.tsv", bam=unselected_input_bam, bai=unselected_input_bai, output: @@ -196,7 +197,7 @@ if ( "../envs/mc_base.yaml" shell: """ - workflow/scripts/normalization/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size {params.window} > {output.merged} 2>> {log} + workflow/scripts/normalization/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size {params.window} --output {output.merged} """ else: diff --git a/workflow/rules/plots.smk b/workflow/rules/plots.smk index 1acc1e55..221c8610 100644 --- a/workflow/rules/plots.smk +++ b/workflow/rules/plots.smk @@ -17,7 +17,7 @@ if config["ashleys_pipeline"] is False: # "{folder}/{sample}/plots/counts/CountComplete.raw.pdf", report( "{folder}/{sample}/plots/counts/CountComplete.raw.pdf", - category="Mosaic Counts", + category="Mosaic counts", subcategory="{sample}", labels={"Cell": "ALL", "Type": "raw"}, ), @@ -40,7 +40,7 @@ rule divide_pdf: report( "{folder}/{sample}/plots/counts_raw/{cell}.{i, \d+}.pdf", caption="../report/mosaic_counts.rst", - category="Mosaic counts", + category="Mosaic counts cellwise", subcategory="{sample}", labels={"Cell": "{cell}", "Nb": "{i}", "Type": "raw"}, ), @@ -306,7 +306,7 @@ rule scTRIP_multiplot: sv_counts="{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", output: figure=report( - "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png", + 
"{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.pdf", category="scTRIP multiplot", subcategory="{sample}", labels={"Cell": "{cell}", "Chrom": "{chrom}"}, @@ -315,6 +315,7 @@ rule scTRIP_multiplot: "{folder}/log/scTRIP_multiplot/{sample}/{cell}/{chrom}.log", conda: "../envs/rtools.yaml" + container: None resources: mem_mb=get_mem_mb, shell: diff --git a/workflow/rules/regenotyping.smk b/workflow/rules/regenotyping.smk index ebb451df..2bfae7b0 100644 --- a/workflow/rules/regenotyping.smk +++ b/workflow/rules/regenotyping.smk @@ -6,6 +6,7 @@ rule mergeBams: check=remove_unselected_fct, bam=selected_input_bam, bai=selected_input_bai, + labels="{folder}/{sample}/cell_selection/labels.tsv", output: temp("{folder}/{sample}/merged_bam/merged.raw.bam"), log: diff --git a/workflow/rules/scNOVA.smk b/workflow/rules/scNOVA.smk index 04c108d2..9f6c7c5b 100755 --- a/workflow/rules/scNOVA.smk +++ b/workflow/rules/scNOVA.smk @@ -1,8 +1,24 @@ +rule assert_list_of_cells: + input: + labels="{folder}/{sample}/cell_selection/labels.tsv", + subclone_list="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", + selected_cells="{folder}/{sample}/selected/", + output: + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + log: + "{folder}/{sample}/log/assert_list_of_cells.log", + conda: + "../envs/mc_base.yaml" + script: + "../scripts/scNOVA_scripts/assert_list_of_cells.py" + + rule filter_sv_calls: log: "{folder}/{sample}/log/filter_sv_calls/{sample}.log", input: "{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/sv_calls.tsv", conda: @@ -147,6 +163,7 @@ rule remove_dup: None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark.bam", + assert_list_of_cells="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: bam_uniq="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bam_metrix="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono.metrix_dup.txt", @@ -272,6 +289,7 @@ rule filter_input_subclonality: None input: "{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/input_subclonality_{clone}.txt", conda: @@ -973,6 +991,7 @@ rule split_bam_WC: None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: bam_header="{folder}/{sample}/scNOVA_bam_modified/{cell}.header_WC.sam", bam_C1="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 39353b67..4eaf8464 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -139,3 +139,18 @@ rule samtools_faindex: mem_mb=get_mem_mb_heavy, shell: "samtools faidx {input}" + + +rule save_config: + input: + "config/config.yaml", + output: + "{folder}/{sample}/config/config.yaml", + log: + "{folder}/log/save_config/{sample}.log", + conda: + "../envs/mc_base.yaml" + resources: + mem_mb=get_mem_mb, + script: + "../scripts/utils/dump_config.py" diff --git a/workflow/scripts/normalization/merge-blacklist.py b/workflow/scripts/normalization/merge-blacklist.py index 9a484eec..750d3966 100755 --- a/workflow/scripts/normalization/merge-blacklist.py +++ b/workflow/scripts/normalization/merge-blacklist.py @@ 
-16,6 +16,7 @@ def main(): type=int, help="If the distance between two blacklisted intervals is below this threshold, they are merged.", ) + parser.add_argument("--output", default=None, help="Output file name") parser.add_argument( "--whitelist", default=None, help="TSV file with intervals to be removed from the blacklist (columns: chrom, start, end)." ) @@ -71,7 +72,7 @@ def main(): print("White listing: Removed", additional_whitelist, "bp of sequence for blacklist", file=sys.stderr) - norm_table.to_csv(sys.stdout, index=False, sep="\t") + norm_table.to_csv(args.output, index=False, sep="\t") ## Identify "complex" intervals # segments = calls.groupby(by=['chrom','start','end']).sv_call_name.agg({'is_complex':partial(is_complex, ignore_haplotypes=args.ignore_haplotypes, min_cell_count=args.min_cell_count)}).reset_index().sort_values(['chrom','start','end']) diff --git a/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py b/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py new file mode 100644 index 00000000..651fb7c6 --- /dev/null +++ b/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py @@ -0,0 +1,57 @@ +import pandas as pd +import os + + +def main(labels_file, subclone_file, selected_folder, output_file): + # Read labels.tsv + labels_df = pd.read_csv(labels_file, sep="\t") + labels_cells = set( + labels_df["cell"].str.replace(".sort.mdup.bam", "").values.tolist() + ) + + # Read input_subclonality.txt + input_subclonality = pd.read_csv(subclone_file, sep="\t") + subclone_cells = set(input_subclonality["Filename"].values.tolist()) + + # List files in selected/ folder and process filenames + selected_cells = set( + file.replace(".sort.mdup.bam", "") + for file in os.listdir(selected_folder) + if file.endswith(".sort.mdup.bam") + ) + + # Compare sets + if labels_cells == subclone_cells == selected_cells: + result = "PASS: All cell lists match." + else: + result = "FAIL: Cell lists do not match." 
+ + # Logging details of the mismatch + with open(output_file, "w") as output: + output.write("Labels cells: {}\n".format(labels_cells)) + output.write("Subclone cells: {}\n".format(subclone_cells)) + output.write("Selected cells: {}\n".format(selected_cells)) + output.write("Discrepancy details:\n") + output.write( + "In labels but not in subclone: {}\n".format(labels_cells - subclone_cells) + ) + output.write( + "In subclone but not in labels: {}\n".format(subclone_cells - labels_cells) + ) + output.write( + "In labels but not in selected: {}\n".format(labels_cells - selected_cells) + ) + output.write( + "In selected but not in labels: {}\n".format(selected_cells - labels_cells) + ) + output.write(result) + + +if __name__ == "__main__": + # Extracting Snakemake input variables + labels_file = snakemake.input.labels + subclone_file = snakemake.input.subclone_list + selected_folder = snakemake.input.selected_cells + output_file = snakemake.output[0] + + main(labels_file, subclone_file, selected_folder, output_file) diff --git a/workflow/scripts/utils/dump_config.py b/workflow/scripts/utils/dump_config.py index 4701706a..6b299ee6 100644 --- a/workflow/scripts/utils/dump_config.py +++ b/workflow/scripts/utils/dump_config.py @@ -1,28 +1,22 @@ -import json -import time +import yaml -timestamp = time.strftime("%Y%m%d-%H%M%S") -configured_samples = [] -for key in config.keys(): - if not key.startswith("sample_description"): - continue - sample = key.split("_", 2)[-1] - configured_samples.append(sample) +def update_config(input_file, output_file): + # Load the existing config file + with open(input_file, "r") as file: + flat_file_config = yaml.safe_load(file) -if configured_samples: - second_dump = "config_{}_{}.json".format(timestamp, "_".join(sorted(configured_samples))) -else: - second_dump = "config_{}.json".format(timestamp) + # Update the config with Snakemake parameters + for key, value in snakemake.config.items(): + flat_file_config[key] = value -with open(output[0], "w") as fake: - _ = fake.write(second_dump + "\n(Full configuration dump)") + # Save the updated config to the output file + with open(output_file, "w") as file: + yaml.dump(flat_file_config, file) -with open(second_dump, "w") as dump: - json.dump( - config, - dump, - ensure_ascii=True, - indent=2, - sort_keys=True, - ) + +if __name__ == "__main__": + input_config = snakemake.input[0] + output_config = snakemake.output[0] + + update_config(input_config, output_config) From 07c7548bf35ed199d068f4089fef40d0e03fa002 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 14:00:43 +0000 Subject: [PATCH 3/8] Linting, fmt, config update --- config/config.yaml | 4 ++-- workflow/Snakefile | 12 ++++++------ workflow/rules/common.smk | 4 +++- workflow/rules/plots.smk | 3 ++- workflow/rules/setup.smk | 6 ++++-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 3809643d..017cd8bd 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,10 +3,10 @@ # -------------------------------------------------------- # MosaiCatcher version -version: 2.2.2 +version: 2.2.3 # Ashleys-QC pipeline version -ashleys_pipeline_version: 2.2.2 +ashleys_pipeline_version: 2.2.3 # Email for notifications about the pipeline's status email: "" diff --git a/workflow/Snakefile b/workflow/Snakefile index 4262bd44..652acaaa 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,12 +19,12 @@ if config["ashleys_pipeline"] is True: module ashleys_qc: snakefile: - 
"../../ashleys-qc-pipeline/workflow/Snakefile" - # github( - # "friendsofstrandseq/ashleys-qc-pipeline", - # path="workflow/Snakefile", - # tag=str(config["ashleys_pipeline_version"]), - # ) + # "../../ashleys-qc-pipeline/workflow/Snakefile" + github( + "friendsofstrandseq/ashleys-qc-pipeline", + path="workflow/Snakefile", + tag=str(config["ashleys_pipeline_version"]), + ) config: config diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7af4b6b4..8acedf9f 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -14,7 +14,9 @@ os.environ["LC_CTYPE"] = "C" # print(config["data_location"]) if config["ashleys_pipeline"] is True and config["genecore"] is True: - config["data_location"] = "/".join(config["data_location"].split("/")[:-1]) + config["data_location"] = config["abs_path"].join( + config["data_location"].split("/")[:-1] + ) envvars: diff --git a/workflow/rules/plots.smk b/workflow/rules/plots.smk index 221c8610..23d35f3a 100644 --- a/workflow/rules/plots.smk +++ b/workflow/rules/plots.smk @@ -315,7 +315,8 @@ rule scTRIP_multiplot: "{folder}/log/scTRIP_multiplot/{sample}/{cell}/{chrom}.log", conda: "../envs/rtools.yaml" - container: None + container: + None resources: mem_mb=get_mem_mb, shell: diff --git a/workflow/rules/setup.smk b/workflow/rules/setup.smk index e59889ec..93ed6847 100644 --- a/workflow/rules/setup.smk +++ b/workflow/rules/setup.smk @@ -20,8 +20,10 @@ rule install_BSgenome_package: params: selected_package=lambda wc, input: "BSgenome.{}.UCSC.{}".format( "Mmusculus" if config["reference"] == "mm10" else "Hsapiens", - config["reference"] - ) if config["reference"] in ["hg38", "hg19", "mm10"] else input.package, + config["reference"], + ) + if config["reference"] in ["hg38", "hg19", "mm10"] + else input.package, conda: "../envs/rtools.yaml" resources: From 264c7ee5da7d13c340c04675d7429e4908b0b76a Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 15:01:55 +0000 Subject: [PATCH 4/8] Small fix --- watchdog_pipeline/watchdog_pipeline.py | 17 +++++++++++++---- workflow/rules/count.smk | 2 +- .../scripts/normalization/merge-blacklist.py | 1 + 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/watchdog_pipeline/watchdog_pipeline.py b/watchdog_pipeline/watchdog_pipeline.py index 4c1a6614..ea647898 100644 --- a/watchdog_pipeline/watchdog_pipeline.py +++ b/watchdog_pipeline/watchdog_pipeline.py @@ -39,11 +39,14 @@ # publishdir_location = "/g/korbel/weber/TMP/WORKFLOW_RESULTS_DEV" publishdir_location = "/g/korbel/WORKFLOW_RESULTS" genecore_prefix = path_to_watch -# profile_slurm = ["--profile", "../snakemake_profiles/HPC/dev/slurm_legacy_conda/"] profile_slurm = [ "--profile", - "/g/korbel2/weber/workspace/snakemake_profiles/HPC/slurm_EMBL/", + "/g/korbel2/weber/workspace/snakemake_profiles/HPC/dev/slurm_legacy_conda/", ] +# profile_slurm = [ +# "--profile", +# "/g/korbel2/weber/workspace/snakemake_profiles/HPC/slurm_EMBL/", +# ] profile_dry_run = [ "--profile", "workflow/snakemake_profiles/local/conda/", @@ -297,7 +300,9 @@ def check_unprocessed_folder(self): "PDAC60590MNI", "DXR30hMaja", "DXR42hMaja", - "GM19705", + # "GM19705", + "OrgxDoxocx02", + "GM20355x01", ]: run_id = f"{pipeline}--{plate}--{sample_name}" workflow_id = self.find_workflow_id_by_name( @@ -458,6 +463,10 @@ def check_unprocessed_folder(self): print(panoptes_entry) print(panoptes_data) + if workflow_id: + assert ( + len(panoptes_data) > 0 + ), "Data issue between pika & panoptes" if panoptes_data: panoptes_data = panoptes_data[0] @@ -693,7 +702,7 
@@ def execute_command( "-s", "workflow/Snakefile", "--set-resources", - "ashleys_mark_duplicates:partition=bigmem", + "ashleys_mark_duplicates:constraint='milan\|rome'", "--config", "genecore=True", f"genecore_prefix={genecore_prefix}", diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk index f1a0e74e..7eee69bb 100755 --- a/workflow/rules/count.smk +++ b/workflow/rules/count.smk @@ -215,7 +215,7 @@ else: "../envs/mc_base.yaml" shell: """ - workflow/scripts/normalization/merge-blacklist.py --merge_distance 500000 {input.norm} > {output.merged} 2> {log} + cp {input.norm} {ouput.merged} """ diff --git a/workflow/scripts/normalization/merge-blacklist.py b/workflow/scripts/normalization/merge-blacklist.py index 750d3966..998c9650 100755 --- a/workflow/scripts/normalization/merge-blacklist.py +++ b/workflow/scripts/normalization/merge-blacklist.py @@ -70,6 +70,7 @@ def main(): norm_table.loc[[i], "class"] = "good" additional_whitelist += row.end - row.start + print("White listing: Removed", additional_whitelist, "bp of sequence for blacklist", file=sys.stderr) norm_table.to_csv(args.output, index=False, sep="\t") From aecfa781e76da1249c928674d84c4817c257e14a Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 15:04:32 +0000 Subject: [PATCH 5/8] Small fix --- workflow/rules/count.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk index 7eee69bb..3cc8fdc2 100755 --- a/workflow/rules/count.smk +++ b/workflow/rules/count.smk @@ -215,7 +215,7 @@ else: "../envs/mc_base.yaml" shell: """ - cp {input.norm} {ouput.merged} + cp {input.norm} {output.merged} """ From b0a5a11b0d6ebd1445e9ddc1311b41d61dbb186c Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 15:32:07 +0000 Subject: [PATCH 6/8] Update dockerfile --- github-actions-runner/Dockerfile-2.2.3.dockerfile | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/github-actions-runner/Dockerfile-2.2.3.dockerfile b/github-actions-runner/Dockerfile-2.2.3.dockerfile index aa4d1c42..89d51957 100644 --- a/github-actions-runner/Dockerfile-2.2.3.dockerfile +++ b/github-actions-runner/Dockerfile-2.2.3.dockerfile @@ -5,7 +5,7 @@ LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb # Step 1: Retrieve conda environments # Conda environment: -# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml +# source: https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_base.yaml # prefix: /conda-envs/87c04f5d115eff742eca84455513deba # name: ashleys_base # channels: @@ -27,10 +27,10 @@ LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb # # Fix sklearn update # - scikit-learn=1.2.2 RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba -COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml +ADD https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml # Conda environment: -# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml +# source: https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_rtools.yaml # prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 # name: rtools # channels: @@ -83,7 +83,7 @@ COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d # # SOLVE R lib issue # - 
r-stringi=1.7.12 RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 -COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml +ADD https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml # Conda environment: # source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml @@ -293,7 +293,3 @@ RUN mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --fil mamba env create --prefix /conda-envs/193f60d48796dd17eb847ea689b863a9 --file /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml && \ mamba env create --prefix /conda-envs/ca9641251a8cb0057003875ad776c49f --file /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml && \ mamba clean --all -y -# CUSTOM PART -RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/ -COPY /workflow/scripts/utils/install_R_package.R /conda-envs/ -RUN chmod -R 0777 /conda-envs/598c87b6c764d05e0c66953cc67f2931/lib/R/library && /conda-envs/598c87b6c764d05e0c66953cc67f2931/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz From ec910ddc0622caa2eb87a7c2b2c3c88f875ddd47 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 5 Dec 2023 10:39:39 +0000 Subject: [PATCH 7/8] Minor fixes for scNOVA (2.2.4) --- .gitignore | 2 + config/config.yaml | 2 +- workflow/rules/external_data.smk | 4 +- workflow/rules/scNOVA.smk | 232 +++++++++--------- .../filter_input_subclonality.py | 4 +- .../scripts/scNOVA_scripts/filter_sv_calls.py | 2 +- 6 files changed, 124 insertions(+), 122 deletions(-) diff --git a/.gitignore b/.gitignore index a140b637..ccdccfe3 100644 --- a/.gitignore +++ b/.gitignore @@ -221,3 +221,5 @@ workflow/scripts/plotting/scTRIP_multiplot/scTRIPmultiplot workflow/config/scTRIP_multiplot.ok args.output scNOVA_env_costea.yaml +.keras/keras.json +hs_err_pid2227945.log diff --git a/config/config.yaml b/config/config.yaml index 017cd8bd..2abbf67a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,7 +3,7 @@ # -------------------------------------------------------- # MosaiCatcher version -version: 2.2.3 +version: 2.2.4 # Ashleys-QC pipeline version ashleys_pipeline_version: 2.2.3 diff --git a/workflow/rules/external_data.smk b/workflow/rules/external_data.smk index 71ed4d63..634ac216 100644 --- a/workflow/rules/external_data.smk +++ b/workflow/rules/external_data.smk @@ -173,8 +173,8 @@ rule download_scnova_data: touch("log/config/dl_arbigent_mappability_track.ok"), conda: "../envs/scNOVA/scNOVA_DL.yaml" - container: - None + # container: + # None shell: """ directory="workflow/data/ref_genomes/" diff --git a/workflow/rules/scNOVA.smk b/workflow/rules/scNOVA.smk index 9f6c7c5b..bba0f075 100755 --- a/workflow/rules/scNOVA.smk +++ b/workflow/rules/scNOVA.smk @@ -17,8 +17,8 @@ rule filter_sv_calls: log: "{folder}/{sample}/log/filter_sv_calls/{sample}.log", input: - "{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", - "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + sv="{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", + assertion="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/sv_calls.tsv", conda: @@ -28,8 +28,8 @@ rule filter_sv_calls: rule scNOVA_final_results: - container: - None + 
# container: + # None input: get_scnova_final_output, output: @@ -43,8 +43,8 @@ rule scNOVA_final_results: rule generate_CN_for_CNN: - container: - None + # container: + # None input: mosaiclassifier_final_results="{folder}/{sample}/plots/final_results/{sample}.txt", subclone="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", @@ -70,8 +70,8 @@ rule generate_CN_for_CNN: rule generate_CN_for_chromVAR: - container: - None + # container: + # None input: TSS_matrix="workflow/data/scNOVA/utils/Strand_seq_matrix_TSS_for_SVM.txt", TES_matrix="workflow/data/scNOVA/utils/Strand_seq_matrix_TES_for_SVM.txt", @@ -97,8 +97,8 @@ rule generate_CN_for_chromVAR: rule remove_low_quality_reads: - container: - None + # container: + # None input: bam="{folder}/{sample}/selected/{cell}.sort.mdup.bam", output: @@ -112,16 +112,16 @@ rule remove_low_quality_reads: mem_mb=get_mem_mb, shell: """ - samtools view -H {input} > {output.bam_header} - samtools view -F 2304 {input.bam} | awk -f workflow/scripts/scNOVA_scripts/awk_1st.awk | cat {output.bam_header} - | samtools view -Sb - > {output.bam_pre} + samtools view -H {input} > {output.bam_header} + samtools view -F 2304 {input.bam} | awk -f workflow/scripts/scNOVA_scripts/awk_1st.awk | cat {output.bam_header} - | samtools view -Sb - > {output.bam_pre} """ rule sort_bam: log: "{folder}/{sample}/log/sort_bam/{cell}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono.bam", output: @@ -140,8 +140,8 @@ rule sort_bam: rule index_num1: log: "{folder}/{sample}/log/index_num1/{cell}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark.bam", output: @@ -159,8 +159,8 @@ rule index_num1: rule remove_dup: log: "{folder}/{sample}/log/remove_dup/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark.bam", assert_list_of_cells="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", @@ -180,8 +180,8 @@ rule remove_dup: rule index_num2: log: "{folder}/{sample}/log/index_num2/{cell}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", output: @@ -199,8 +199,8 @@ rule index_num2: rule count_reads_split: log: "{folder}/{sample}/log/count_reads_split/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bai="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.bai", @@ -221,8 +221,8 @@ rule count_reads_split: rule count_reads_split_aggr: log: "{folder}/{sample}/log/count_reads_split_aggr.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_split/{cell}.tab", @@ -244,8 +244,8 @@ rule count_reads_split_aggr: rule count_sort_by_coordinate: log: "{folder}/{sample}/log/count_sort_by_coordinate/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/{sample}.tab", output: @@ -261,8 +261,8 @@ rule count_sort_by_coordinate: rule count_sort_annotate_geneid: log: "{folder}/{sample}/log/count_sort_annotate_geneid/{sample}.log", - container: - None + # container: + # None input: count_table="{folder}/{sample}/scNOVA_result/{sample}_sort.txt", GB_matrix="workflow/data/scNOVA/utils/Strand_seq_matrix_Genebody_for_SCDE.txt", @@ -278,18 +278,18 @@ 
rule count_sort_annotate_geneid: mem_mb=get_mem_mb, shell: """ - Rscript {params.count_sort_annotate_geneid} {input.count_table} {input.GB_matrix} {output} + Rscript {params.count_sort_annotate_geneid} {input.count_table} {input.GB_matrix} {output} """ rule filter_input_subclonality: log: "{folder}/{sample}/log/filter_input_subclonality/{clone}.log", - container: - None + # container: + # None input: - "{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", - "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + subclonality="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", + assertion="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/input_subclonality_{clone}.txt", conda: @@ -301,8 +301,6 @@ rule filter_input_subclonality: rule merge_bam_clones: log: "{folder}/{sample}/log/merge_bam_clones/{clone}.log", - container: - None input: bam=lambda wc: expand( "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", @@ -324,6 +322,8 @@ rule merge_bam_clones: line="{folder}/{sample}/scNOVA_input_user/{clone}_line.txt", conda: "../envs/scNOVA/scNOVA_bioinfo_tools.yaml" + # container: + # None resources: mem_mb=get_mem_mb, shell: @@ -335,8 +335,8 @@ rule merge_bam_clones: rule count_reads_for_DNN: log: "{folder}/{sample}/log/count_reads_for_DNN/{clone}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam", bai="{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam.bai", @@ -357,8 +357,8 @@ rule count_reads_for_DNN: rule count_reads_for_DNN_aggr: log: "{folder}/{sample}/log/count_reads_for_DNN_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_for_DNN/Deeptool_Genes_for_CNN_{clone}.tab", @@ -380,8 +380,8 @@ rule count_reads_for_DNN_aggr: rule count_reads_for_DNN_sc: log: "{folder}/{sample}/log/count_reads_for_DNN_sc/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bai="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.bai", @@ -402,8 +402,8 @@ rule count_reads_for_DNN_sc: rule count_reads_for_DNN_sc_aggr: log: "{folder}/{sample}/log/count_reads_for_DNN_sc_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_for_DNN_sc/Deeptool_Genes_for_CNN_{cell}.tab", @@ -425,8 +425,8 @@ rule count_reads_for_DNN_sc_aggr: rule count_reads_chr_length: log: "{folder}/{sample}/log/count_reads_chr_length/{clone}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam", bai="{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam.bai", @@ -449,8 +449,8 @@ rule count_reads_chr_length: rule count_reads_chr_length_aggr: log: "{folder}/{sample}/log/count_reads_chr_length_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_chr_length/Deeptool_chr_length_{clone}.tab", @@ -472,8 +472,8 @@ rule count_reads_chr_length_aggr: rule count_reads_chr_length_sc: log: "{folder}/{sample}/log/count_reads_chr_length_sc/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", 
bai="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.bai", @@ -494,8 +494,8 @@ rule count_reads_chr_length_sc: rule count_reads_chr_length_sc_aggr: log: "{folder}/{sample}/log/count_reads_chr_length_sc_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_chr_length_sc/Deeptool_chr_length_{cell}.tab", @@ -517,8 +517,8 @@ rule count_reads_chr_length_sc_aggr: rule count_reads_for_DNN_sort: log: "{folder}/{sample}/log/count_reads_for_DNN_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}.tab", output: @@ -534,8 +534,8 @@ rule count_reads_for_DNN_sort: rule count_reads_for_DNN_sort_lab: log: "{folder}/{sample}/log/count_reads_for_DNN_sort_lab/{sample}.log", - container: - None + # container: + # None input: count_reads_sort="{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort.txt", Ref_bed="workflow/data/scNOVA/utils/bin_Genes_for_CNN_num_sort.txt", @@ -557,8 +557,8 @@ rule count_reads_for_DNN_sort_lab: rule count_reads_for_DNN_sort_label_sort: log: "{folder}/{sample}/log/count_reads_for_DNN_sort_label_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab.txt", output: @@ -574,8 +574,8 @@ rule count_reads_for_DNN_sort_label_sort: rule count_reads_for_DNN_normalization: log: "{folder}/{sample}/log/count_reads_for_DNN_normalization/{clone}.log", - container: - None + # container: + # None input: count_reads_chr_length="{folder}/{sample}/scNOVA_result/Deeptool_chr_length_{sample}.tab", count_reads_sort_label="{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab.txt", @@ -605,8 +605,8 @@ rule count_reads_for_DNN_normalization: rule count_reads_for_DNN_sc_sort: log: "{folder}/{sample}/log/count_reads_for_DNN_sc_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc.tab", output: @@ -622,8 +622,8 @@ rule count_reads_for_DNN_sc_sort: rule count_reads_for_DNN_sc_sort_lab: log: "{folder}/{sample}/log/count_reads_for_DNN_sc_sort_lab/{sample}.log", - container: - None + # container: + # None input: count_reads_sort="{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort.txt", Ref_bed="workflow/data/scNOVA/utils/bin_Genes_for_CNN_num_sort.txt", @@ -645,8 +645,8 @@ rule count_reads_for_DNN_sc_sort_lab: rule count_reads_for_DNN_sc_sort_label_sort: log: "{folder}/{sample}/log/count_reads_for_DNN_sc_sort_label_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab.txt", output: @@ -662,8 +662,8 @@ rule count_reads_for_DNN_sc_sort_label_sort: rule generate_feature_sc_var: log: "{folder}/{sample}/log/generate_feature_sc_var/{clone}.log", - container: - None + # container: + # None input: subclone_list="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", count_reads_sc_sort="{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab_final.txt", @@ -693,8 +693,8 @@ rule generate_feature_sc_var: rule combine_features: log: "{folder}/{sample}/log/combine_features/{clone}.log", - container: - None + # container: + # None input: TSS_matrix="workflow/data/scNOVA/utils/Strand_seq_matrix_TSS_for_SVM.txt", 
table_GC_imput="workflow/data/scNOVA/utils/Features_reshape_GC_orientation_impute.txt", @@ -726,8 +726,8 @@ rule combine_features: rule infer_expressed_genes_split: log: "{folder}/{sample}/log/infer_expressed_genes_split/{clone}_{chrom}_{i}.log", - container: - None + # container: + # None input: features="{folder}/{sample}/scNOVA_result/Features_reshape_all_orientation_norm_var_GC_CpG_RT_T_comb3_{clone}.txt", TSS_annot="{folder}/{sample}/scNOVA_result/Features_reshape_all_TSS_matrix_woM_all_RT_{clone}.txt", @@ -744,8 +744,8 @@ rule infer_expressed_genes_split: rule gather_infer_expressed_genes_split: log: "{folder}/{sample}/log/gather_infer_expressed_genes_split/{clone}_{i}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result_CNN/{chrom}/DNN_train{i}_output_ypred_{clone}.csv", @@ -766,8 +766,8 @@ rule gather_infer_expressed_genes_split: rule aggr_models_touch: log: "{folder}/{sample}/log/aggr_models_touch/{clone}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result_CNN/DNN_train{i}_output_ypred_{clone}.csv", @@ -783,8 +783,8 @@ rule aggr_models_touch: rule annot_expressed_genes: log: "{folder}/{sample}/log/annot_expressed_genes/{clone}.log", - container: - None + # container: + # None input: TSS_annot="{folder}/{sample}/scNOVA_result/Features_reshape_all_TSS_matrix_woM_all_RT_{clone}.txt", train80="{folder}/{sample}/scNOVA_result_CNN/DNN_train80_output_ypred_{clone}.csv", @@ -814,8 +814,8 @@ rule annot_expressed_genes: rule infer_differential_gene_expression: log: "{folder}/{sample}/log/infer_differential_gene_expression/{sample}.log", - container: - None + # container: + # None input: Genebody_NO="{folder}/{sample}/scNOVA_result/{sample}_sort.txt", clonality="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", @@ -847,8 +847,8 @@ rule infer_differential_gene_expression: rule infer_differential_gene_expression_alt: log: "{folder}/{sample}/log/infer_differential_gene_expression_alt/{sample}.log", - container: - None + # container: + # None input: Genebody_NO="{folder}/{sample}/scNOVA_result/{sample}_sort.txt", clonality="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", @@ -874,15 +874,15 @@ rule infer_differential_gene_expression_alt: time="10:00:00", shell: """ - Rscript {params.infer_diff_gene_expression_alt} {input.Genebody_NO} {input.clonality} {input.TSS_matrix} {input.GB_matrix} {input.CNN_result1} {input.CNN_result2} {input.input_matrix} {output.result_table} {output.result_plot} {input.final_result} + Rscript {params.infer_diff_gene_expression_alt} {input.Genebody_NO} {input.clonality} {input.TSS_matrix} {input.GB_matrix} {input.CNN_result1} {input.CNN_result2} {input.input_matrix} {output.result_table} {output.result_plot} {input.final_result} """ rule count_reads_CREs: log: "{folder}/{sample}/log/count_reads_CREs/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bai="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.bai", @@ -905,8 +905,8 @@ rule count_reads_CREs: rule count_reads_CREs_aggr: log: "{folder}/{sample}/log/count_reads_CREs_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_CREs/{cell}_CREs_2kb.tab", @@ -928,8 +928,8 @@ rule count_reads_CREs_aggr: rule count_sort_by_coordinate_CREs: log: 
"{folder}/{sample}/log/count_sort_by_coordinate_CREs/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb.tab", output: @@ -945,8 +945,8 @@ rule count_sort_by_coordinate_CREs: rule count_sort_annotate_chrid_CREs: log: "{folder}/{sample}/log/count_sort_annotate_chrid_CREs/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort.txt", output: @@ -963,15 +963,15 @@ rule count_sort_annotate_chrid_CREs: mem_mb=get_mem_mb, shell: """ - Rscript {params.count_sort_annotate_chrid_CREs} {input} {output} + Rscript {params.count_sort_annotate_chrid_CREs} {input} {output} """ rule count_sort_annotate_chrid_CREs_sort: log: "{folder}/{sample}/log/count_sort_annotate_chrid_CREs_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort_num.txt", output: @@ -987,11 +987,11 @@ rule count_sort_annotate_chrid_CREs_sort: rule split_bam_WC: log: "{folder}/{sample}/log/split_bam_WC/{cell}.log", - container: - None + # container: + # None input: - "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", - "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", + assertion="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: bam_header="{folder}/{sample}/scNOVA_bam_modified/{cell}.header_WC.sam", bam_C1="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", @@ -1004,19 +1004,19 @@ rule split_bam_WC: mem_mb=get_mem_mb, shell: """ - samtools view -H {input} > {output.bam_header} - samtools view -f 99 {input} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_C1} - samtools view -f 147 {input} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_C2} - samtools view -f 83 {input} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_W1} - samtools view -f 163 {input} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_W2} + samtools view -H {input.bam} > {output.bam_header} + samtools view -f 99 {input.bam} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_C1} + samtools view -f 147 {input.bam} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_C2} + samtools view -f 83 {input.bam} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_W1} + samtools view -f 163 {input.bam} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_W2} """ rule split_bam_WC_merge: log: "{folder}/{sample}/log/split_bam_WC_merge/{cell}.log", - container: - None + # container: + # None input: bam_C1="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", bam_C2="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C2.bam", @@ -1039,8 +1039,8 @@ rule split_bam_WC_merge: rule split_bam_WC_index: log: "{folder}/{sample}/log/split_bam_WC_index/{cell}.log", - container: - None + # container: + # None input: bam_C="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C.bam", bam_W="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W.bam", @@ -1061,8 +1061,6 @@ rule split_bam_WC_index: rule perl_split_sc: log: "{folder}/{sample}/log/perl_split_sc/{sample}.log", - container: - None input: strandphaser_output="{folder}/{sample}/strandphaser/strandphaser_phased_haps_merged.txt", 
bam_C_ind=lambda wc: expand( @@ -1085,6 +1083,8 @@ rule perl_split_sc: "{folder}/{sample}/log/perl_split_sc.log", conda: "../envs/scNOVA/scNOVA_bioinfo_tools.yaml" + # container: + # None resources: mem_mb=get_mem_mb, shell: @@ -1096,8 +1096,8 @@ rule perl_split_sc: rule count_reads_CREs_haplo: log: "{folder}/{sample}/log/count_reads_CREs_haplo/{sample}.log", - container: - None + # container: + # None input: bam1="{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleA/result.H1.bam", bam2="{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleB/result.H2.bam", @@ -1118,8 +1118,8 @@ rule count_reads_CREs_haplo: rule count_reads_CREs_haplo_sort_by_coordinate: log: "{folder}/{sample}/log/count_reads_CREs_haplo_sort_by_coordinate/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result_haplo/Deeptool_DHS_2kb_H1H2.tab", output: @@ -1135,8 +1135,8 @@ rule count_reads_CREs_haplo_sort_by_coordinate: rule count_reads_genebody_haplo: log: "{folder}/{sample}/log/count_reads_genebody_haplo/{sample}.log", - container: - None + # container: + # None input: bam1="{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleA/result.H1.bam", bam2="{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleB/result.H2.bam", @@ -1159,8 +1159,8 @@ rule count_reads_genebody_haplo: rule count_reads_genebody_haplo_sort_by_coordinate_genebody: log: "{folder}/{sample}/log/count_reads_genebody_haplo_sort_by_coordinate_genebody/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result_haplo/Deeptool_Genebody_H1H2.tab", output: diff --git a/workflow/scripts/scNOVA_scripts/filter_input_subclonality.py b/workflow/scripts/scNOVA_scripts/filter_input_subclonality.py index 6693de13..2bd08ae3 100644 --- a/workflow/scripts/scNOVA_scripts/filter_input_subclonality.py +++ b/workflow/scripts/scNOVA_scripts/filter_input_subclonality.py @@ -1,6 +1,6 @@ import pandas as pd -df = pd.read_csv(snakemake.input[0], sep="\t") +df = pd.read_csv(snakemake.input.subclonality, sep="\t") df.loc[df["Subclonality"] == snakemake.wildcards.clone].to_csv( snakemake.output[0], sep="\t", index=False -) \ No newline at end of file +) diff --git a/workflow/scripts/scNOVA_scripts/filter_sv_calls.py b/workflow/scripts/scNOVA_scripts/filter_sv_calls.py index 730058ee..19ba300f 100644 --- a/workflow/scripts/scNOVA_scripts/filter_sv_calls.py +++ b/workflow/scripts/scNOVA_scripts/filter_sv_calls.py @@ -1,4 +1,4 @@ import pandas as pd -df = pd.read_csv(snakemake.input[0], sep="\t") +df = pd.read_csv(snakemake.input.sv, sep="\t") df.loc[df["chrom"] != "chrY"].to_csv(snakemake.output[0], sep="\t", index=False) From 1d8c6889665cc1fe5bd796d896c0f62e41acf8c3 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 5 Dec 2023 10:44:02 +0000 Subject: [PATCH 8/8] Dockerfile --- .../Dockerfile-2.2.4.dockerfile | 300 ++++++++++++++++++ .../add_T2T_part_to_Dockerfile.sh | 1 - 2 files changed, 300 insertions(+), 1 deletion(-) create mode 100644 github-actions-runner/Dockerfile-2.2.4.dockerfile diff --git a/github-actions-runner/Dockerfile-2.2.4.dockerfile b/github-actions-runner/Dockerfile-2.2.4.dockerfile new file mode 100644 index 00000000..f03d13ea --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.4.dockerfile @@ -0,0 +1,300 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb9a77747c17f62707ea2f67a667" + +# Step 1: Retrieve conda environments + +# Conda 
environment: +# source: https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +ADD https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +ADD https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p 
/conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_DL.yaml +# prefix: /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +# name: scNOVA_DL +# channels: +# - conda-forge +# - anaconda +# dependencies: +# - tensorflow=1.15.0 +# - scikit-learn=0.21.3 +# - python=3.7.4 +# - matplotlib=3.1.1 +# - pandas=0.25.3 +# - h5py=2.10.0 +# - numpy +# # scNOVA archive +# - unzip +# # Fix +RUN mkdir -p /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +COPY workflow/envs/scNOVA/scNOVA_DL.yaml /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_R.yaml +# prefix: /conda-envs/193f60d48796dd17eb847ea689b863a9 +# name: scNOVA +# channels: +# - bioconda +# - conda-forge +# - r +# dependencies: +# - bioconductor-deseq2=1.30.0 +# - r-matrixstats=0.58.0 +# - r-pheatmap=1.0.12 +# - r-gplots=3.1.1 +# - r-umap=0.2.7.0 +# - r-rtsne=0.15 +# - r-factoextra=1.0.7 +# - r-pracma=2.3.3 +# - bioconductor-chromvar=1.12.0 +# - r-nabor=0.5.0 +# - bioconductor-motifmatchr=1.12.0 +# - bioconductor-bsgenome.hsapiens.ucsc.hg38=1.4.3 +# - bioconductor-jaspar2016=1.18.0 +# - r-codetools=0.2_18 +# - r-fitdistrplus +# - r-doparallel +# - r-foreach +RUN mkdir -p /conda-envs/193f60d48796dd17eb847ea689b863a9 +COPY workflow/envs/scNOVA/scNOVA_R.yaml /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml +# prefix: /conda-envs/ca9641251a8cb0057003875ad776c49f +# name: scNOVA_bioinfo_tools +# channels: +# - conda-forge +# - bioconda +# - anaconda +# dependencies: +# - samtools +# - biobambam +# - bedtools +RUN mkdir -p 
/conda-envs/ca9641251a8cb0057003875ad776c49f +COPY workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba env create --prefix /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 --file /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml && \ + mamba env create --prefix /conda-envs/193f60d48796dd17eb847ea689b863a9 --file /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml && \ + mamba env create --prefix /conda-envs/ca9641251a8cb0057003875ad776c49f --file /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml && \ + mamba clean --all -y + +# CUSTOM PART +RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/ +COPY /workflow/scripts/utils/install_R_package.R /conda-envs/ +RUN chmod -R 0777 /conda-envs/598c87b6c764d05e0c66953cc67f2931/lib/R/library && /conda-envs/598c87b6c764d05e0c66953cc67f2931/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz diff --git a/github-actions-runner/add_T2T_part_to_Dockerfile.sh b/github-actions-runner/add_T2T_part_to_Dockerfile.sh index 7c631edd..3d7fcc85 100644 --- a/github-actions-runner/add_T2T_part_to_Dockerfile.sh +++ b/github-actions-runner/add_T2T_part_to_Dockerfile.sh @@ -25,7 +25,6 @@ fi # Append custom steps to the Dockerfile { - echo '\n' echo "# CUSTOM PART" echo "RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/" echo "COPY /workflow/scripts/utils/install_R_package.R /conda-envs/"
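
The split_bam_WC rule patched above separates each cell's BAM into C (Crick) and W (Watson) read sets purely by SAM flag value (99, 147, 83, 163) before the per-strand merge and indexing steps (split_bam_WC_merge, split_bam_WC_index). A minimal pysam sketch of the same selection logic, with placeholder file names; the rule itself shells out to samtools and writes the header to a separate SAM first, which pysam's template argument makes unnecessary here:

    import pysam

    # Flag values used by the rule (`samtools view -f <flag>` keeps reads with at
    # least these bits set):
    #   99 = paired, proper pair, mate reverse, first in pair   -> C1
    #  147 = paired, proper pair, read reverse, second in pair  -> C2
    #   83 = paired, proper pair, read reverse, first in pair   -> W1
    #  163 = paired, proper pair, mate reverse, second in pair  -> W2
    FLAG_TO_SPLIT = {99: "C1", 147: "C2", 83: "W1", 163: "W2"}

    with pysam.AlignmentFile("cell.sc_pre_mono_sort_for_mark_uniq.bam", "rb") as bam:
        # template=bam copies the input header into each output BAM
        outputs = {
            name: pysam.AlignmentFile(f"cell.{name}.bam", "wb", template=bam)
            for name in FLAG_TO_SPLIT.values()
        }
        for read in bam:
            for flag, name in FLAG_TO_SPLIT.items():
                # same semantics as `samtools view -f`: all bits of <flag> must be set
                if read.flag & flag == flag:
                    outputs[name].write(read)
        for out in outputs.values():
            out.close()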
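
The last two script hunks (filter_input_subclonality.py, filter_sv_calls.py) switch from positional to named access on the snakemake object (snakemake.input[0] becoming snakemake.input.subclonality and snakemake.input.sv), which presumes the corresponding rules declare their inputs with matching keywords. A minimal sketch of that pairing, with a hypothetical rule name and paths; only the keyword-to-attribute correspondence is the point:

    rule filter_sv_calls:
        input:
            sv="{folder}/{sample}/sv_calls/sv_calls_stringent.tsv",  # hypothetical path
        output:
            "{folder}/{sample}/sv_calls/sv_calls_stringent_noYchr.tsv",
        script:
            "../scripts/scNOVA_scripts/filter_sv_calls.py"

    # Inside filter_sv_calls.py, the injected `snakemake` object resolves the keyword:
    # snakemake.input.sv is the path bound to `sv=` above, so reordering or adding
    # rule inputs no longer silently changes what the script reads.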