From 7283cb0a1c4f45dd9547e5a67de43b4e8861264e Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 10:44:16 +0000 Subject: [PATCH 1/8] merge blacklist fix, dump config fix, assertion to check labels.tsv x selected/ x scNOVA input lists, labels at later stage to prevent working on modified list of cells, other minor fixes --- watchdog_pipeline/watchdog_pipeline.py | 841 +++++++++++++++++++++---- 1 file changed, 720 insertions(+), 121 deletions(-) diff --git a/watchdog_pipeline/watchdog_pipeline.py b/watchdog_pipeline/watchdog_pipeline.py index d3df33c6..c0f259f9 100644 --- a/watchdog_pipeline/watchdog_pipeline.py +++ b/watchdog_pipeline/watchdog_pipeline.py @@ -1,5 +1,7 @@ +import sqlite3 import time import os, sys, glob, subprocess, re +import requests from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler from datetime import datetime @@ -7,7 +9,10 @@ import json import pandas as pd import threading - +import re +from collections import Counter +from pathlib import Path +import pika # RabbitMQ os.makedirs("watchdog/logs", exist_ok=True) @@ -16,7 +21,9 @@ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[ - logging.FileHandler("watchdog/logs/watchdog_ashleys.log"), # File handler to log to a file + logging.FileHandler( + "watchdog/logs/watchdog_ashleys.log" + ), # File handler to log to a file logging.StreamHandler(), # Stream handler to log to the console ], ) @@ -24,17 +31,37 @@ # Set the path you want to watch path_to_watch = sys.argv[1] +dry_run = sys.argv[2] +report_only = sys.argv[3] + -data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS_DEV" -publishdir_location = "/g/korbel/weber/TMP/WORKFLOW_RESULTS_DEV" -# publishdir_location = "/g/korbel/WORKFLOW_RESULTS" +data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS" +# publishdir_location = "/g/korbel/weber/TMP/WORKFLOW_RESULTS_DEV" +publishdir_location = "/g/korbel/WORKFLOW_RESULTS" genecore_prefix = path_to_watch -profile_slurm = ["--profile", "workflow/snakemake_profiles/HPC/slurm_EMBL/"] -profile_dry_run = ["--profile", "workflow/snakemake_profiles/local/conda/", "-c", "1"] +# profile_slurm = ["--profile", "../snakemake_profiles/HPC/dev/slurm_legacy_conda/"] +profile_slurm = [ + "--profile", + "/g/korbel2/weber/workspace/snakemake_profiles/HPC/slurm_EMBL/", +] +profile_dry_run = [ + "--profile", + "workflow/snakemake_profiles/local/conda_singularity/", + "-c", + "1", +] dry_run_options = ["-n", "-q"] # snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_latest/bin/snakemake" -snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_panoptesfix/bin/snakemake" +snakemake_binary = ( + "/g/korbel2/weber/miniconda3/envs/snakemake_panoptesfix/bin/snakemake" +) +# Panoptes +pipeline = "ashleys-qc-pipeline" +my_env = os.environ.copy() +snakemake_binary_folder = "/".join(snakemake_binary.split("/")[:-1]) +my_env["PATH"] = f"{snakemake_binary_folder}:{my_env['PATH']}" +working_directory = "/g/korbel2/weber/workspace/mosaicatcher-update" # plates_processing_status = pd.read_csv("watchdog/processing_status.json", sep="\t") # print(plates_processing_status) @@ -47,27 +74,571 @@ def on_created(self, event): logging.info(f"Directory {event.src_path} has been created!") self.process_new_directory(event.src_path) + def extract_samples_names(self, l, directory_path): + samples = list() + prefixes = list() + plate_types = list() + + pattern = re.compile(r"_lane1(.*?)(iTRU|PE20)(.*?)([A-H]?)(\d{2})(?:_1_|_2_)") + + # First pass: Count occurrences of each 
sample_name + file_counts_per_sample = Counter() + for file_path in l: + match = pattern.search(file_path) + if match: + sample_name = match.group(1) + file_counts_per_sample[sample_name] += 1 + + # Second pass: Process files and determine plate type per sample + for j, file_path in enumerate(sorted(l)): + match = pattern.search(file_path) + if match: + sample_name = match.group(1) + file_count = file_counts_per_sample[sample_name] + + # Determine plate type using modulo 96 operation + if file_count % 96 != 0: + raise ValueError( + f"Invalid file count for sample {sample_name} with file count {file_count}. Must be a multiple of 96." + ) + plate_type = int(file_count / 2) + + if (j + 1) % file_count == 0: + prefixes.append(match.group(2)) + plate = directory_path.split("/")[-1] + samples.append(sample_name) + plate_types.append(plate_type) + + return prefixes, samples, plate_types + + def check_date(self, plate): + from datetime import datetime, timedelta + + date_str = "-".join(plate.split("-")[:-1]) + date_format = "%Y-%m-%d" + folder_date = datetime.strptime(date_str, date_format) + + # Calculate the date that is 6 months before today + six_months_ago = datetime.now() - timedelta( + days=3 * 30 + ) # This assumes an average of 30 days in a month + # print(plate, six_months_ago, folder_date > six_months_ago) + # Compare dates + return folder_date > six_months_ago + + @staticmethod + def load_from_json(filename: str): + """Load the data from the JSON file.""" + try: + with open(filename, "r") as file: + data = json.load(file) + return data + except (FileNotFoundError, json.JSONDecodeError): + # If the file does not exist or there's an error in reading it, + # return an empty dictionary or other default value + return {} + + @staticmethod + def update_timestamps(directory): + """ + Update the access and modification times of all files in the given directory and its subdirectories. 
+ + :param directory: Path to the directory + """ + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith(".fastq.gz"): + continue + try: + file_path = Path(root) / file + current_time = time.time() + os.utime(file_path, (current_time, current_time)) + logging.info(f"Updated timestamp for: {file_path}") + except FileNotFoundError: + logging.info(f"File not found: {file_path}") + + # Example usage + # directory_path = "/path/to/your/directory" + # update_timestamps(directory_path) + + def consume_last_message_from_rabbitmq(self, json_backup_filename=str, queue=str): + pika_connection = pika.BlockingConnection( + pika.ConnectionParameters(host="localhost") + ) + channel = pika_connection.channel() + + # Fetch the message without auto acknowledgment + method_frame, header_frame, body = channel.basic_get( + queue=queue, auto_ack=False + ) + + if method_frame: + # Extract the timestamp from the header frame + if header_frame.timestamp: + timestamp = header_frame.timestamp + human_readable_timestamp = datetime.fromtimestamp( + timestamp / 1000.0 + ).strftime("%Y-%m-%d %H:%M:%S") + + else: + timestamp = None + # Convert timestamp to human-readable format if necessary + + # # Acknowledge the message after processing + # channel.basic_ack(delivery_tag=method_frame.delivery_tag) + pika_connection.close() + data = json.loads(body.decode("utf-8")) + print(data) + # if data dict is empty + if not data: + print("EXITING") + sys.exit("RabbitMQ queue NOT empty but message is") + # print("Loading from JSON file...") + # data_json = self.load_from_json(filename=json_backup_filename) + # file_timestamp = os.path.getmtime(json_backup_filename) + # file_timestamp = datetime.fromtimestamp(file_timestamp).strftime( + # "%Y-%m-%d %H:%M:%S" + # ) + # return data_json, file_timestamp + else: + print("RabbitMQ queue NOT empty and message is NOT empty") + print(data) + return data, human_readable_timestamp + + else: + if os.path.exists(json_backup_filename): + pika_connection.close() + print("No message available, RabbitMQ queue is empty") + print("Loading from JSON file...") + data_json = self.load_from_json(filename=json_backup_filename) + file_timestamp = os.path.getmtime(json_backup_filename) + file_timestamp = datetime.fromtimestamp(file_timestamp).strftime( + "%Y-%m-%d %H:%M:%S" + ) + + return data_json, file_timestamp + else: + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + return {"workflows": []}, current_time + + # Function to get all workflows + @staticmethod + def get_workflows(): + url = "http://localhost:8058/api/workflows" + response = requests.get(url) + if response.status_code == 200: + return response.json() + else: + raise Exception( + "Failed to fetch data: Status code {}".format(response.status_code) + ) + + # Function to find a workflow ID by name + @staticmethod + def find_workflow_id_by_name(workflows, name): + for workflow in workflows.get("workflows", []): + if workflow["name"] == name: + return workflow + return None + def check_unprocessed_folder(self): + connection = sqlite3.connect( + "/g/korbel2/weber/workspace/strandscape/.panoptes.db" + ) + + # Get the list of processed plates from rabbitmq + message = self.consume_last_message_from_rabbitmq( + json_backup_filename="watchdog/processing_status.json", queue="data_queue" + ) + unwanted = ["._.DS_Store", ".DS_Store", "config"] - list_runs_processed = sorted([e for e in os.listdir(data_location) if e not in unwanted]) - total_list_runs = sorted([e for e in os.listdir(path_to_watch) if e not in 
unwanted]) - unprocessed_plates = set(total_list_runs).difference(list_runs_processed) - print(list_runs_processed) - print(total_list_runs) - print(unprocessed_plates) - # for plate in ["2023-07-10-HLGVJAFX5"]: - for plate in unprocessed_plates: - # if plate not in plates_processing_status["plate"].values.tolist(): - # plates_processing_status_plate_dict = collections.defaultdict(dict) - nb_txt_gz_files = len(glob.glob(f"{path_to_watch}/{plate}/*.txt.gz")) - # if nb_txt_gz_files == 576: - # if (nb_txt_gz_files % 192) == 0: - print(f"PROCESSING {path_to_watch}/{plate}") - self.process_new_directory(f"{path_to_watch}/{plate}") - # else: - # print(f"Not possible to process {path_to_watch}/{plate}, containing {nb_txt_gz_files} txt.gz files") - - def process_new_directory(self, directory_path): + list_runs_processed = sorted( + [e for e in os.listdir(data_location) if e not in unwanted] + ) + + total_list_runs = sorted( + [e for e in os.listdir(path_to_watch) if e not in unwanted] + ) + # unprocessed_plates = sorted(list(set(total_list_runs).difference(list_runs_processed))) + unprocessed_plates = list() + # workflows_data = self.get_workflows() + workflows_data = message[0] + last_message_timestamp = message[1] + print(last_message_timestamp) + last_message_timestamp = datetime.strptime( + last_message_timestamp, "%Y-%m-%d %H:%M:%S" + ).strftime("%Y-%m-%d %H:%M:%S.%f") + + # last_message_timestamp = last_message_timestamp + + main_df = list() + if workflows_data: + for plate in total_list_runs: + # print(plate) + if plate.split("-")[0][:2] == "20": + # if plate.split("-")[0] == "2023": + # if plate.startswith("2023-11-09"): + # if plate == "2021-02-17-HM7LYAFX2": + # if plate == "2020-06-22-H5YMMAFX2": + directory_path = f"{path_to_watch}/{plate}" + prefixes, samples, plate_types = self.extract_samples_names( + glob.glob(f"{path_to_watch}/{plate}/*.txt.gz"), directory_path + ) + # print(prefixes, samples, plate_types) + if len(set(prefixes)) == 1: + # print(plate) + # if self.check_date(plate): + + # print(plate) + for sample_name, plate_type in zip(samples, plate_types): + if sample_name not in [ + "PDAC60590", + "PDAC60590MNI", + "DXR30hMaja", + "DXR42hMaja", + "GM19705", + ]: + run_id = f"{pipeline}--{plate}--{sample_name}" + workflow_id = self.find_workflow_id_by_name( + workflows_data, run_id + ) + + report = False + labels = False + multiqc_scratch = False + multiqc_scratch_timestamp = None + remaining_days = None + + if os.path.isfile( + f"{publishdir_location}/{plate}/{sample_name}/cell_selection/labels.tsv" + ): + labels = True + + if os.path.isfile( + f"{publishdir_location}/{plate}/{sample_name}/reports/{sample_name}_{pipeline}_report.zip" + ): + report = True + + if os.path.isfile( + f"{data_location}/{plate}/{sample_name}/multiqc/multiqc_report/multiqc_report.html" + ): + multiqc_scratch = True + multiqc_scratch_timestamp = os.path.getmtime( + f"{data_location}/{plate}/{sample_name}/multiqc/multiqc_report/multiqc_report.html" + ) + # to datetime and then strfmtime + multiqc_scratch_timestamp = datetime.fromtimestamp( + multiqc_scratch_timestamp + ) + # computing remaning days to reach 5 months between multiqc_scratch_timestamp and now + remaining_days = ( + datetime.now() - multiqc_scratch_timestamp + ).days + remaining_days = 150 - remaining_days + + multiqc_scratch_timestamp = ( + multiqc_scratch_timestamp.strftime("%Y-%m-%d") + ) + + if not workflow_id: + workflow_id = { + "id": "None", + "status": "None", + "started_at": last_message_timestamp, + "completed_at": 
last_message_timestamp, + "jobs_done": "None", + "jobs_total": "None", + } + else: + workflow_id["started_at"] = datetime.strptime( + workflow_id["started_at"], + "%a, %d %b %Y %H:%M:%S GMT", + ).strftime("%Y-%m-%d %H:%M:%S.%f") + + if workflow_id["completed_at"] is not None: + workflow_id["completed_at"] = datetime.strptime( + workflow_id["completed_at"], + "%a, %d %b %Y %H:%M:%S GMT", + ).strftime("%Y-%m-%d %H:%M:%S.%f") + + # turn the print into a dict + tmp_d = { + "panoptes_id": workflow_id["id"], + "plate": plate, + "sample": sample_name, + "report": report, + "labels": labels, + "multiqc_scratch": multiqc_scratch, + "multiqc_scratch_timestamp": multiqc_scratch_timestamp, + "remaining_days": remaining_days, + "status": workflow_id["status"], + "prefix": list(prefixes)[0], + "plate_type": plate_type, + "started_at": workflow_id["started_at"], + "completed_at": workflow_id["completed_at"], + "jobs_done": workflow_id["jobs_done"], + "jobs_total": workflow_id["jobs_total"], + } + main_df.append(tmp_d) + pd.options.display.max_rows = 999 + pd.options.display.max_colwidth = 30 + # pd.options.display.max_columns = 50 + main_df = pd.DataFrame(main_df) + # main_df.loc[(main_df["labels"] == True) & (main_df["report"] == True), "real_status"] = "Completed" + main_df.loc[ + (main_df["labels"] == True) & (main_df["report"] == False), + "real_status", + ] = "Report missing" + main_df.loc[ + (main_df["labels"] == False) & (main_df["report"] == True), + "real_status", + ] = "Error" + main_df.loc[ + (main_df["labels"] == False) & (main_df["report"] == False), + "real_status", + ] = "To process" + main_df.loc[ + (main_df["labels"] == True) + & (main_df["report"] == True) + & (main_df["status"] == "None"), + "real_status", + ] = "Error" + main_df.loc[ + (main_df["labels"] == True) + & (main_df["report"] == True) + & (main_df["status"] == "Running"), + "real_status", + ] = "Running" + main_df.loc[ + (main_df["labels"] == True) + & (main_df["report"] == True) + & (main_df["status"] == "Done"), + "real_status", + ] = "Completed" + main_df["real_status"] = main_df["real_status"].fillna( + "Error (to investigate))" + ) + + print(main_df) + + dry_run_db = False + + if dry_run_db is False: + cursor = connection.cursor() + + assert ( + main_df.loc[ + (main_df["labels"] == False) & (main_df["report"] == True) + ].shape[0] + == 0 + ), "Error in table, samples have report done without the completion of the pipeline" + + logging.info( + "Correcting status of plates with report.zip and labels.tsv" + ) + + for row in main_df.loc[ + (main_df["labels"] == True) + & (main_df["report"] == True) + & (main_df["status"] != "Done") + ].to_dict("records"): + logging.info(row) + panoptes_entry = f"{pipeline}--{row['plate']}--{row['sample']}" + workflow_id = row["panoptes_id"] + + # if workflow_id != "None": + # command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + # subprocess.run(command, shell=True, check=True) + + panoptes_data = [ + e for e in workflows_data["workflows"] if e["id"] == workflow_id + ] + + if panoptes_data: + panoptes_data = panoptes_data[0] + if "completed_at" not in panoptes_data: + panoptes_data["completed_at"] = last_message_timestamp + + command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + subprocess.run(command, shell=True, check=True) + + else: + logging.info( + "Panoptes data not found for workflow entry: %s", row + ) + panoptes_data = { + "started_at": 
last_message_timestamp, + "completed_at": last_message_timestamp, + "jobs_done": "1", + "jobs_total": "1", + } + + print(row) + + cursor.execute( + """ + INSERT INTO workflows (name, status, done, total, started_at, completed_at) + VALUES (?, ?, ?, ?, ?, ?) + """, + ( + panoptes_entry, + "Done", + panoptes_data["jobs_done"], + panoptes_data["jobs_total"], + panoptes_data["started_at"], + panoptes_data["completed_at"], + ), + ) + connection.commit() + + logging.info( + "Processing plates without labels.tsv or outdated without report.zip" + ) + + for row in main_df.loc[ + (main_df["labels"] == False) & (main_df["report"] == False) + ].to_dict("records"): + logging.info(row) + + # panoptes = True if row["status"] == "None" else False + panoptes = True + + if dry_run == "False": + if row["panoptes_id"] != "None": + workflow_id = row["panoptes_id"] + panoptes_data = [ + e + for e in workflows_data["workflows"] + if e["id"] == workflow_id + ] + command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + subprocess.run(command, shell=True, check=True) + + self.process_new_directory( + "/".join([path_to_watch, row["plate"]]), + row["prefix"], + row["sample"], + row["plate_type"], + report_only=False, + panoptes=panoptes, + ) + + logging.info( + "Processing plates not present anymore on scratch and without report.zip" + ) + + for row in main_df.loc[ + # (main_df["multiqc_scratch"] == False) + (main_df["multiqc_scratch"] == False) + & (main_df["report"] == False) + ].to_dict("records"): + logging.info(row) + + # panoptes = True if row["status"] == "None" else False + panoptes = True + + if dry_run == "False": + if row["panoptes_id"] != "None": + workflow_id = row["panoptes_id"] + panoptes_data = [ + e + for e in workflows_data["workflows"] + if e["id"] == workflow_id + ] + command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + subprocess.run(command, shell=True, check=True) + + self.process_new_directory( + "/".join([path_to_watch, row["plate"]]), + row["prefix"], + row["sample"], + row["plate_type"], + report_only=False, + panoptes=panoptes, + ) + + logging.info( + "Processing plates without report.zip but with labels.tsv and still on scratch" + ) + + for row in main_df.loc[ + (main_df["labels"] == True) + & (main_df["multiqc_scratch"] == True) + & (main_df["remaining_days"] > 2) + & (main_df["report"] == False) + ].to_dict("records"): + logging.info(row) + + # panoptes = True if row["status"] == "None" else False + panoptes = False + panoptes_entry = f"{pipeline}--{row['plate']}--{row['sample']}" + + if dry_run == "False": + self.process_new_directory( + "/".join([path_to_watch, row["plate"]]), + row["prefix"], + row["sample"], + row["plate_type"], + report_only=True, + panoptes=panoptes, + ) + + if row["panoptes_id"] != "None": + workflow_id = row["panoptes_id"] + panoptes_data = [ + e + for e in workflows_data["workflows"] + if e["id"] == workflow_id + ][0] + + if panoptes_data: + command = f'sqlite3 /g/korbel2/weber/workspace/strandscape/.panoptes.db "DELETE FROM workflows WHERE id={workflow_id};"' + subprocess.run(command, shell=True, check=True) + + cursor.execute( + """ + INSERT INTO workflows (name, status, done, total, started_at, completed_at) + VALUES (?, ?, ?, ?, ?, ?) 
+ """, + ( + panoptes_entry, + "Done", + panoptes_data["jobs_done"], + panoptes_data["jobs_total"], + panoptes_data["started_at"], + panoptes_data["completed_at"], + ), + ) + connection.commit() + + else: + logging.info( + "Panoptes data not found for workflow entry: %s", row + ) + + logging.info( + "Updating /scratch files timestamps that are close to 6 months" + ) + + for row in main_df.loc[ + (main_df["labels"] == True) + & (main_df["multiqc_scratch"] == True) + & (main_df["remaining_days"] < 10) + ].to_dict("records"): + logging.info(row) + self.update_timestamps( + f"{data_location}/{row['plate']}/{row['sample']}" + ) + + def process_new_directory( + self, + directory_path, + prefix, + sample_name, + plate_type, + report_only=False, + panoptes=False, + ): """Process the new directory, check for .txt.gz files and execute snakemake command if conditions are met.""" # Poll the directory until 576 files appear or a timeout is reached @@ -80,67 +651,33 @@ def process_new_directory(self, directory_path): num_files = len(txt_gz_files) # # If the desired number of files is found or timeout is reached, break the loop - # if (num_files % 192) == 0 or time.time() - start_time > timeout: + # if time.time() - start_time > timeout: # break - # # Sleep for a while before the next poll + # # # # # Sleep for a while before the next poll # time.sleep(5) # Sleep for 5 seconds # Process the found .txt.gz files - self.process_txt_gz_files(directory_path, txt_gz_files, num_files) - - def process_txt_gz_files(self, directory_path, txt_gz_files, num_files): - """Process the found .txt.gz files and execute snakemake command if conditions are met.""" - - if (num_files % 192) == 0: - logging.info(f"The new directory contains exactly {num_files} .txt.gz files.") - self.execute_snakemake(directory_path, txt_gz_files) - - else: - logging.info(f"The new directory contains {str(num_files)} .txt.gz files, not 576.") - - def execute_snakemake(self, directory_path, txt_gz_files): - """Execute the snakemake command based on the found prefixes.""" - pattern = re.compile(r"_lane1(.*?)(iTRU|PE20)(.*?)([A-H]?)(\d{2})(?:_1_|_2_)") - prefixes = list() - - for file_path in sorted(txt_gz_files): - match = pattern.search(file_path) - # print(file_path, match) - if match: - prefix = match.group(2) - # print(sample_name) - # prefix = match.group(2) + match.group(4) + match.group(5) # Concatenate the prefix, optional letter, and two digits - prefixes.append(prefix) - # indexes.add(index) - # pattern = re.compile(r"(iTRU|PE20)\d{3}") - # prefixes = set() - # - # for file_path in txt_gz_files: - # match = pattern.search(file_path) - # print(file_path) - # if match: - # prefix = match.group()[:4] # Get the first 4 characters, which is the prefix - # prefixes.add(prefix) - - if len(set(prefixes)) > 1: - logging.info("Multiple different prefixes found: %s", prefixes) - elif prefixes: - for j, file_path in enumerate(sorted(txt_gz_files)): - if (j + 1) % 192 == 0: - match = pattern.search(file_path) - sample_name = match.group(1) - cell = f"{sample_name}{prefixes[0]}{match.group(3)}{match.group(4)}96" - # print(file_path, j, match, sample_name, cell) - # print([match.group(i) for i in range(6)]) - # self.execute_command(directory_path, prefixes[0], sample_name) - - # Debug/dev purpose - target a specific file - self.execute_command(directory_path, prefixes[0], sample_name, cell) - else: - logging.info("No match found in any file.") + # self.process_txt_gz_files(directory_path, txt_gz_files, num_files) + self.execute_command( + 
directory_path, + prefix, + sample_name, + plate_type, + report_only=report_only, + panoptes=panoptes, + ) - def execute_command(self, directory_path, prefix, sample, cell=None): + def execute_command( + self, + directory_path, + prefix, + sample, + plate_type, + report_only=False, + cell=None, + panoptes=False, + ): """Execute the command.""" # Change directory and run the snakemake command @@ -150,12 +687,15 @@ def execute_command(self, directory_path, prefix, sample, cell=None): f"{snakemake_binary}", "-s", "workflow/Snakefile", + "--set-resources", + "ashleys_mark_duplicates:partition=bigmem", "--config", "genecore=True", f"genecore_prefix={genecore_prefix}", f"genecore_date_folder={date_folder}", f"genecore_regex_element={prefix}", f'samples_to_process="[{sample}]"', + f"plate_type={plate_type}", "multistep_normalisation=True", "MultiQC=True", "split_qc_plot=False", @@ -165,6 +705,7 @@ def execute_command(self, directory_path, prefix, sample, cell=None): "ashleys_pipeline_only=True", "ashleys_pipeline=True", "--nolock", + "--rerun-incomplete", "--rerun-triggers", "mtime", ] @@ -177,32 +718,62 @@ def execute_command(self, directory_path, prefix, sample, cell=None): "--force", ] - logging.info("Running command: %s", " ".join(cmd + profile_dry_run + dry_run_options)) + if report_only is False: + logging.info( + "Running command: %s", " ".join(cmd + profile_dry_run + dry_run_options) + ) + + process = subprocess.Popen( + cmd + profile_dry_run + dry_run_options, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + cwd=working_directory, + env=my_env, + ) + + # Variable to store the penultimate line + penultimate_line = "" + + # Read the output line by line in real-time + for line in iter(process.stdout.readline, ""): + logging.info(line.strip()) # log line in real-time + if line.strip(): # If line is not blank + penultimate_line = line.strip() + + # Wait for the subprocess to finish + process.wait() + logging.info("Return code: %s", process.returncode) + dryrun_check = True if (str(process.returncode) == str(0)) else False - process = subprocess.Popen( - cmd + profile_dry_run + dry_run_options, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True - ) - - # Variable to store the penultimate line - penultimate_line = "" - - # Read the output line by line in real-time - for line in iter(process.stdout.readline, ""): - logging.info(line.strip()) # log line in real-time - if line.strip(): # If line is not blank - penultimate_line = line.strip() - - # Wait for the subprocess to finish - process.wait() - logging.info("Return code: %s", process.returncode) - - # Check the penultimate line - if str(process.returncode) == str(0): - self.run_second_command(cmd, profile_slurm, data_location, date_folder, sample, cell) + else: + dryrun_check = True + + if dryrun_check is True: + self.run_second_command( + cmd, + profile_slurm, + data_location, + date_folder, + sample, + report_only, + cell, + panoptes, + ) else: logging.info("\nThe output is not as expected.") - def run_second_command(self, cmd, profile_slurm, data_location, date_folder, sample, cell=None): + def run_second_command( + self, + cmd, + profile_slurm, + data_location, + date_folder, + sample, + report_only=False, + cell=None, + panoptes=False, + ): """Run the second command and write the output to a log file.""" report_location = f"{publishdir_location}/{date_folder}/{sample}/reports/{sample}_ashleys-qc-pipeline_report.zip" @@ -213,9 +784,6 @@ def run_second_command(self, cmd, 
profile_slurm, data_location, date_folder, sam "/g/korbel2/weber/workspace/mosaicatcher-update/workflow/report/custom-stylesheet.css", ] - # Panoptes - pipeline = "ashleys-qc-pipeline" - wms_monitor_options = "http://127.0.0.1:8058" run_id = f"{pipeline}--{date_folder}--{sample}" wms_monitor_renaming_option = f"name={run_id}" @@ -229,9 +797,6 @@ def run_second_command(self, cmd, profile_slurm, data_location, date_folder, sam # print(cmd + profile_slurm + report_options) - logging.info("\nThe output is as expected.") - logging.info("Running command: %s", " ".join(cmd + wms_monitor_args + profile_dry_run)) - os.makedirs("watchdog/logs/per-run", exist_ok=True) # Get the current date and time @@ -240,24 +805,58 @@ def run_second_command(self, cmd, profile_slurm, data_location, date_folder, sam # Convert it to a string current_time = now.strftime("%Y%m%d%H%M%S") - with open(f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}.log", "w") as f: - process2 = subprocess.Popen(cmd + wms_monitor_args + profile_dry_run, stdout=f, stderr=f, universal_newlines=True) - # process2 = subprocess.Popen(cmd + profile_slurm, stdout=f, stderr=f, universal_newlines=True) - process2.wait() - - logging.info("Return code: %s", process2.returncode) + if panoptes is True: + final_cmd = cmd + wms_monitor_args + profile_slurm + else: + final_cmd = (cmd + profile_slurm,) + + if report_only is False: + logging.info("\nThe output is as expected.") + # logging.info("Running command: %s", " ".join(cmd + profile_slurm)) + + logging.info( + "Running command: %s", " ".join(cmd + wms_monitor_args + profile_slurm) + ) + + with open( + f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}.log", + "w", + ) as f: + # process2 = subprocess.Popen(cmd + wms_monitor_args + profile_dry_run, stdout=f, stderr=f, universal_newlines=True, cwd=working_directory, env=my_env) + process2 = subprocess.Popen( + final_cmd, + stdout=f, + stderr=f, + universal_newlines=True, + cwd=working_directory, + env=my_env, + ) + process2.wait() + + logging.info("Return code: %s", process2.returncode) logging.info("Generating ashleys report.") os.makedirs(os.path.dirname(report_location), exist_ok=True) # os.makedirs(f"{publishdir_location}/{date_folder}/{sample}/reports/", exist_ok=True) - logging.info("Running command: %s", " ".join(cmd + profile_slurm + report_options)) + logging.info( + "Running command: %s", " ".join(cmd + profile_dry_run + report_options) + ) # Change the permissions of the new directory # subprocess.run(["chmod", "-R", "777", f"{data_location}/{date_folder}"]) - with open(f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}_report.log", "w") as f: - print(cmd + profile_slurm + report_options) - process2 = subprocess.Popen(cmd + profile_dry_run + report_options, stdout=f, stderr=f, universal_newlines=True) - # process2 = subprocess.Popen(cmd + profile_slurm + report_options, stdout=f, stderr=f, universal_newlines=True) + with open( + f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}_report.log", + "w", + ) as f: + process2 = subprocess.Popen( + cmd + profile_dry_run + report_options, + stdout=f, + stderr=f, + universal_newlines=True, + cwd=working_directory, + env=my_env, + ) + # process2 = subprocess.Popen(cmd + profile_slurm + report_options, stdout=f, stderr=f, universal_newlines=True, cwd=working_directory, env=my_env) process2.wait() logging.info("Return code: %s", process2.returncode) From dd1102329e60eeddd092b37dfd9c1dfc93b834c7 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 
Dec 2023 10:44:35 +0000 Subject: [PATCH 2/8] merge blacklist fix, dump config fix, assertion to check labels.tsv x selected/ x scNOVA input lists, labels at later stage to prevent working on modified list of cells, other minor fixes --- .gitignore | 4 +- afac/update_timestamps.py | 25 + config/config.yaml | 3 + config/config_metadata.yaml | 6 + .../Dockerfile-2.2.2.dockerfile | 227 +++++++++ .../Dockerfile-2.2.3.dockerfile | 299 ++++++++++++ .../add_T2T_part_to_Dockerfile.sh | 35 ++ watchdog_pipeline/watchdog_pipeline.py | 13 +- workflow/Snakefile | 26 +- workflow/envs/scNOVA/scNOVA_DL.yaml | 1 + workflow/rules/aggregate_fct.smk | 2 +- workflow/rules/common.smk | 450 +++++------------- workflow/rules/count.smk | 3 +- workflow/rules/plots.smk | 7 +- workflow/rules/regenotyping.smk | 1 + workflow/rules/scNOVA.smk | 19 + workflow/rules/utils.smk | 15 + .../scripts/normalization/merge-blacklist.py | 3 +- .../scNOVA_scripts/assert_list_of_cells.py | 57 +++ workflow/scripts/utils/dump_config.py | 40 +- 20 files changed, 852 insertions(+), 384 deletions(-) create mode 100644 afac/update_timestamps.py create mode 100644 github-actions-runner/Dockerfile-2.2.2.dockerfile create mode 100644 github-actions-runner/Dockerfile-2.2.3.dockerfile create mode 100644 github-actions-runner/add_T2T_part_to_Dockerfile.sh create mode 100644 workflow/scripts/scNOVA_scripts/assert_list_of_cells.py diff --git a/.gitignore b/.gitignore index 406d336a..a140b637 100644 --- a/.gitignore +++ b/.gitignore @@ -218,4 +218,6 @@ LOGS_DEV/ # scTRIP multiplot workflow/scripts/plotting/scTRIP_multiplot/scTRIPmultiplot -workflow/config/scTRIP_multiplot.ok \ No newline at end of file +workflow/config/scTRIP_multiplot.ok +args.output +scNOVA_env_costea.yaml diff --git a/afac/update_timestamps.py b/afac/update_timestamps.py new file mode 100644 index 00000000..84cb551a --- /dev/null +++ b/afac/update_timestamps.py @@ -0,0 +1,25 @@ +import os, sys +import time +from pathlib import Path + + +def update_timestamps(directory): + """ + Update the access and modification times of all files in the given directory and its subdirectories. 
+ + :param directory: Path to the directory + """ + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith(".fastq.gz"): + continue + file_path = Path(root) / file + current_time = time.time() + print(file_path) + os.utime(file_path, (current_time, current_time)) + print(f"Updated timestamp for: {file_path}") + + +# Example usage +directory_path = sys.argv[1] +update_timestamps(directory_path) diff --git a/config/config.yaml b/config/config.yaml index 5a3b5098..3809643d 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -14,6 +14,9 @@ email: "" # List of samples to process if multiple are specified samples_to_process: [] +# Plate size +plate_size: 96 + # -------------------------------------------------------- # Data location & I/O # -------------------------------------------------------- diff --git a/config/config_metadata.yaml b/config/config_metadata.yaml index 97c1af02..bff78ba9 100644 --- a/config/config_metadata.yaml +++ b/config/config_metadata.yaml @@ -135,3 +135,9 @@ use_strandscape_labels:: required: False default: False lint_check: False +plate_size:: + desc: "Plate size used for the sequencing (96/384)" + type: int + required: True + default: 96 + lint_check: False diff --git a/github-actions-runner/Dockerfile-2.2.2.dockerfile b/github-actions-runner/Dockerfile-2.2.2.dockerfile new file mode 100644 index 00000000..06f3ea66 --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.2.dockerfile @@ -0,0 +1,227 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL io.github.snakemake.conda_env_hash="77eaa388d65d5205b87324fb0adb89561bc0e532a328995990a1d580aeb894ae" + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: 
https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +ADD https://raw.githubusercontent.com/friendsofstrandseq/ashleys-qc-pipeline/2.2.2/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p /conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # 
SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba clean --all -y diff --git a/github-actions-runner/Dockerfile-2.2.3.dockerfile b/github-actions-runner/Dockerfile-2.2.3.dockerfile new file mode 100644 index 00000000..aa4d1c42 --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.3.dockerfile @@ -0,0 +1,299 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb9a77747c17f62707ea2f67a667" + +# Step 1: Retrieve conda environments + +# Conda environment: +# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p 
/conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p /conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Conda environment: +# source: 
workflow/envs/scNOVA/scNOVA_DL.yaml +# prefix: /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +# name: scNOVA_DL +# channels: +# - conda-forge +# - anaconda +# dependencies: +# - tensorflow=1.15.0 +# - scikit-learn=0.21.3 +# - python=3.7.4 +# - matplotlib=3.1.1 +# - pandas=0.25.3 +# - h5py=2.10.0 +# - numpy +# # scNOVA archive +# - unzip +# # Fix +RUN mkdir -p /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +COPY workflow/envs/scNOVA/scNOVA_DL.yaml /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_R.yaml +# prefix: /conda-envs/193f60d48796dd17eb847ea689b863a9 +# name: scNOVA +# channels: +# - bioconda +# - conda-forge +# - r +# dependencies: +# - bioconductor-deseq2=1.30.0 +# - r-matrixstats=0.58.0 +# - r-pheatmap=1.0.12 +# - r-gplots=3.1.1 +# - r-umap=0.2.7.0 +# - r-rtsne=0.15 +# - r-factoextra=1.0.7 +# - r-pracma=2.3.3 +# - bioconductor-chromvar=1.12.0 +# - r-nabor=0.5.0 +# - bioconductor-motifmatchr=1.12.0 +# - bioconductor-bsgenome.hsapiens.ucsc.hg38=1.4.3 +# - bioconductor-jaspar2016=1.18.0 +# - r-codetools=0.2_18 +# - r-fitdistrplus +# - r-doparallel +# - r-foreach +RUN mkdir -p /conda-envs/193f60d48796dd17eb847ea689b863a9 +COPY workflow/envs/scNOVA/scNOVA_R.yaml /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml +# prefix: /conda-envs/ca9641251a8cb0057003875ad776c49f +# name: scNOVA_bioinfo_tools +# channels: +# - conda-forge +# - bioconda +# - anaconda +# dependencies: +# - samtools +# - biobambam +# - bedtools +RUN mkdir -p /conda-envs/ca9641251a8cb0057003875ad776c49f +COPY workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba env create --prefix /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 --file /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml && \ + mamba env create --prefix /conda-envs/193f60d48796dd17eb847ea689b863a9 --file /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml && \ + mamba env create --prefix /conda-envs/ca9641251a8cb0057003875ad776c49f --file /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml && \ + mamba clean --all -y +# CUSTOM PART +RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/ +COPY /workflow/scripts/utils/install_R_package.R /conda-envs/ +RUN chmod -R 0777 
/conda-envs/598c87b6c764d05e0c66953cc67f2931/lib/R/library && /conda-envs/598c87b6c764d05e0c66953cc67f2931/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz diff --git a/github-actions-runner/add_T2T_part_to_Dockerfile.sh b/github-actions-runner/add_T2T_part_to_Dockerfile.sh new file mode 100644 index 00000000..7c631edd --- /dev/null +++ b/github-actions-runner/add_T2T_part_to_Dockerfile.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Check if a Dockerfile path is provided +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +DOCKERFILE=$1 + +# Check if the Dockerfile exists +if [ ! -f "$DOCKERFILE" ]; then + echo "Dockerfile not found: $DOCKERFILE" + exit 1 +fi + +# Extract the R environment variable +Renv=$(grep -P "\/rtools.*environment\.yaml" "$DOCKERFILE" | sed "s/\//\t/g" | cut -f 5) + +# Check if Renv is extracted +if [ -z "$Renv" ]; then + echo "R environment variable not found in the Dockerfile." + exit 1 +fi + +# Append custom steps to the Dockerfile +{ + echo '\n' + echo "# CUSTOM PART" + echo "RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/" + echo "COPY /workflow/scripts/utils/install_R_package.R /conda-envs/" + echo "RUN chmod -R 0777 /conda-envs/$Renv/lib/R/library && /conda-envs/$Renv/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz" +} >>"$DOCKERFILE" + +echo "Custom steps added to $DOCKERFILE" diff --git a/watchdog_pipeline/watchdog_pipeline.py b/watchdog_pipeline/watchdog_pipeline.py index c0f259f9..4c1a6614 100644 --- a/watchdog_pipeline/watchdog_pipeline.py +++ b/watchdog_pipeline/watchdog_pipeline.py @@ -46,7 +46,8 @@ ] profile_dry_run = [ "--profile", - "workflow/snakemake_profiles/local/conda_singularity/", + "workflow/snakemake_profiles/local/conda/", + # "workflow/snakemake_profiles/local/conda_singularity/", "-c", "1", ] @@ -272,7 +273,7 @@ def check_unprocessed_folder(self): # last_message_timestamp = last_message_timestamp main_df = list() - if workflows_data: + if len(workflows_data) > 0: for plate in total_list_runs: # print(plate) if plate.split("-")[0][:2] == "20": @@ -383,6 +384,7 @@ def check_unprocessed_folder(self): pd.options.display.max_rows = 999 pd.options.display.max_colwidth = 30 # pd.options.display.max_columns = 50 + main_df = pd.DataFrame(main_df) # main_df.loc[(main_df["labels"] == True) & (main_df["report"] == True), "real_status"] = "Completed" main_df.loc[ @@ -418,7 +420,7 @@ def check_unprocessed_folder(self): main_df["real_status"] = main_df["real_status"].fillna( "Error (to investigate))" ) - + print(workflows_data["workflows"]) print(main_df) dry_run_db = False @@ -454,6 +456,9 @@ def check_unprocessed_folder(self): e for e in workflows_data["workflows"] if e["id"] == workflow_id ] + print(panoptes_entry) + print(panoptes_data) + if panoptes_data: panoptes_data = panoptes_data[0] if "completed_at" not in panoptes_data: @@ -530,7 +535,7 @@ def check_unprocessed_folder(self): for row in main_df.loc[ # (main_df["multiqc_scratch"] == False) (main_df["multiqc_scratch"] == False) - & (main_df["report"] == False) + # & (main_df["report"] == False) ].to_dict("records"): logging.info(row) diff --git a/workflow/Snakefile b/workflow/Snakefile index 5acf31fe..4262bd44 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,19 +19,29 @@ if config["ashleys_pipeline"] is True: module ashleys_qc: snakefile: - github( - "friendsofstrandseq/ashleys-qc-pipeline", - 
path="workflow/Snakefile", - tag=str(config["ashleys_pipeline_version"]), - ) + "../../ashleys-qc-pipeline/workflow/Snakefile" + # github( + # "friendsofstrandseq/ashleys-qc-pipeline", + # path="workflow/Snakefile", + # tag=str(config["ashleys_pipeline_version"]), + # ) config: config use rule * from ashleys_qc as ashleys_* - localrules: - ashleys_genecore_symlink, - symlink_selected_bam, + if config["ashleys_pipeline_only"] is True: + + localrules: + ashleys_genecore_symlink, + ashleys_symlink_selected_bam, + + else: + + localrules: + ashleys_genecore_symlink, + ashleys_symlink_selected_bam, + symlink_selected_bam, else: diff --git a/workflow/envs/scNOVA/scNOVA_DL.yaml b/workflow/envs/scNOVA/scNOVA_DL.yaml index 8530fdf8..775c36d8 100644 --- a/workflow/envs/scNOVA/scNOVA_DL.yaml +++ b/workflow/envs/scNOVA/scNOVA_DL.yaml @@ -12,3 +12,4 @@ dependencies: - numpy # scNOVA archive - unzip + # Fix diff --git a/workflow/rules/aggregate_fct.smk b/workflow/rules/aggregate_fct.smk index 278d45b9..5de9c6e1 100644 --- a/workflow/rules/aggregate_fct.smk +++ b/workflow/rules/aggregate_fct.smk @@ -169,7 +169,7 @@ def aggregate_cells_scTRIP_multiplot(wildcards): cell_list = df.cell.tolist() return expand( - "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png", + "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.pdf", folder=config["data_location"], sample=wildcards.sample, cell=cell_list, diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 4c42e1b8..7af4b6b4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -11,6 +11,12 @@ import os, sys os.environ["LC_CTYPE"] = "C" +# print(config["data_location"]) + +if config["ashleys_pipeline"] is True and config["genecore"] is True: + config["data_location"] = "/".join(config["data_location"].split("/")[:-1]) + + envvars: "LC_CTYPE", @@ -131,6 +137,9 @@ class HandleInput: genecore=False, genecore_path=str, ): + # print(input_path) + # print(genecore_path) + # print("\n") if genecore is False: df_config_files = self.handle_input_data(thisdir=input_path, bam=bam) elif genecore is True: @@ -154,56 +163,69 @@ class HandleInput: Returns: _type_: _description_ """ - complete_df_list = list() + from pprint import pprint + from collections import Counter - # List of folders/files to not consider (restrict to samples only) - l = sorted( - [ - e - for e in os.listdir( - "{genecore_prefix}/{date_folder}".format( - genecore_prefix=config["genecore_prefix"], - date_folder=config["genecore_date_folder"], - ) - ) - if e.endswith(".txt.gz") - ] + directory_path = f"{config['genecore_prefix']}/{config['genecore_date_folder']}" + + l = sorted([e for e in os.listdir(directory_path) if e.endswith(".txt.gz")]) + + complete_df_list = list() + # print(thisdir) + genecore_prefix = config["genecore_prefix"] + date_folder = config["genecore_date_folder"] + # print(f"{genecore_prefix}/{date_folder}") + + # Pattern to extract sample name and index + pattern = re.compile(r"(.*_lane1)(.*?)(iTRU|PE20)(.*?)(\d{2})(?:_1_|_2_)") + + samples = list() + prefixes = list() + indexes = list() + plate_types = list() + d_master = collections.defaultdict( + lambda: { + "indexes": set(), + "file_prefix": "", + "plate_type": "", + "index_pattern": "", + } ) - # print(l) - # Create a list of files to process for each sample - d_master = collections.defaultdict(dict) - sub_l = list() - for j, e in enumerate(l): - sub_l.append(e) - if (j + 1) % 192 == 0: - common_element = findstem(sub_l) - l_elems = common_element.split("lane1") - # print(sub_l) - # 
print(common_element) - # print(l_elems) - # print(l_elems[1].split("{regex_element}".format(regex_element=config["genecore_regex_element"])) - prefix = l_elems[0] - # technician_name = l_elems[0].split("_")[-2] - sample = l_elems[1].split( - "{regex_element}".format( - regex_element=config["genecore_regex_element"] - ) - )[0] - index = l_elems[1].split( - "{regex_element}".format( - regex_element=config["genecore_regex_element"] + + # First pass: Count occurrences of each sample_name + file_counts_per_sample = Counter() + for file_path in l: + match = pattern.search(file_path) + if match: + sample_name = match.group(2) + file_counts_per_sample[sample_name] += 1 + + # Second pass: Process files and determine plate type per sample + for j, file_path in enumerate(sorted(l)): + match = pattern.search(file_path) + if match: + sample_name = match.group(2) + index = match.group(4) + indexes.append(index) + d_master[sample_name]["indexes"].add(index) + file_count = file_counts_per_sample[sample_name] + + # Determine plate type using modulo 96 operation + if file_count % 96 != 0: + raise ValueError( + f"Invalid file count for sample {sample_name} with file count {file_count}. Must be a multiple of 96." ) - )[1] - # pe_index = common_element[-1] - sub_l = list() - - d_master[sample]["prefix"] = prefix - # d_master[sample]["technician_name"] = technician_name - d_master[sample]["index"] = index - d_master[sample]["common_element"] = common_element - # from pprint import pprint - # pprint(d_master) - # exit() + plate_type = int(file_count / 2) + + if (j + 1) % file_count == 0: + prefixes.append(match.group(3)) + d_master[sample_name]["file_prefix"] = match.group(1) + d_master[sample_name]["index_pattern"] = match.group(3) + plate = directory_path.split("/")[-1] + samples.append(sample_name) + plate_types.append(plate_type) + d_master[sample_name]["plate_type"] = plate_type + samples_to_process = ( config["samples_to_process"] if len(config["samples_to_process"]) > 0 @@ -220,8 +242,8 @@ class HandleInput: "{data_location}/{sample}/fastq/{sample}{regex_element}{index}{cell_nb}.{pair}.fastq.gz", data_location=config["data_location"], sample=sample, - regex_element=config["genecore_regex_element"], - index=d_master[sample]["index"], + regex_element=d_master[sample]["index_pattern"], + index=d_master[sample]["indexes"], cell_nb=[str(e).zfill(2) for e in list(range(1, 97))], pair=["1", "2"], ) @@ -229,7 +251,8 @@ class HandleInput: if sample in samples_to_process ] genecore_list = [sub_e for e in genecore_list for sub_e in e] - # pprint(genecore_list) + # pprint(d_master) + complete_df_list = list() for sample in d_master: @@ -248,11 +271,12 @@ class HandleInput: df["Full_path"] = df[["Folder", "File"]].apply( lambda r: f"{r['Folder']}/{r['File']}.fastq.gz", axis=1 ) + df["Genecore_path"] = df["File"].apply( - lambda r: f"{config['genecore_prefix']}/{config['genecore_date_folder']}/{d_master[sample]['prefix']}lane1{r.replace('.', '_')}_sequence.txt.gz" + lambda r: f"{config['genecore_prefix']}/{config['genecore_date_folder']}/{d_master[sample]['file_prefix']}{r.replace('.', '_')}_sequence.txt.gz" ) df["Genecore_file"] = df["File"].apply( - lambda r: f"{d_master[sample]['prefix']}lane1{r.replace('.', '_')}" + lambda r: f"{d_master[sample]['file_prefix']}{r.replace('.', '_')}" ) df["Genecore_file"] = df["Genecore_file"].apply( lambda r: "_".join(r.split("_")[:-1]) @@ -375,12 +399,18 @@ def findstem(arr): # Create configuration file with samples +# print("config['data_location']") +# 
print(config["data_location"]) + c = HandleInput( input_path=config["data_location"], - genecore_path="{genecore_prefix}/{genecore_date_folder}".format( + genecore_path="{genecore_prefix}".format( genecore_prefix=config["genecore_prefix"], - genecore_date_folder=config["genecore_date_folder"], ), + # genecore_path="{genecore_prefix}/{genecore_date_folder}".format( + # genecore_prefix=config["genecore_prefix"], + # genecore_date_folder=config["genecore_date_folder"], + # ), output_path="{data_location}/config/config_df.tsv".format( data_location=config["data_location"] ), @@ -532,8 +562,12 @@ def onsuccess_fct(log): log, "SUCCESS", config, config_metadata ) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - SUCCESS" {} < {}'.format( - config["version"], config["data_location"], config["email"], log_path_new + 'mail -s "[smk-wf-catalog/mosaicatcher-pipeline] v{} - [{}--{}] - SUCCESS" {} < {}'.format( + config["version"], + config["data_location"].split("/")[-1], + ";".join(samples), + config["email"], + log_path_new, ) ) @@ -546,8 +580,12 @@ def onerror_fct(log): log, "ERROR", config, config_metadata ) shell( - 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - ERRROR" {} < {}'.format( - config["version"], config["data_location"], config["email"], log_path_new + 'mail -s "[smk-wf-catalog/mosaicatcher-pipeline] v{} - [{}--{}] - ERROR" {} < {}'.format( + config["version"], + config["data_location"].split("/")[-1], + ";".join(samples), + config["email"], + log_path_new, ) ) @@ -564,308 +602,26 @@ def get_scnova_final_output(wildcards): # abbreviate_names = False l = [ - # expand( - # "{folder}/{sample}/scNOVA_input_user/{clone}_sv_calls_all_print.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # clone=clones[wildcards.sample], - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_orientation_CN_correct0.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_input_user/sv_calls_all_print_CREs.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_sort_geneid.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_chr_length_{sample}.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_chr_length_{sample}_sc.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # 
"{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab_final.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{sample}_{clone}_orientation_norm_qc.pdf", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_orientation_norm.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab_final.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{sample}_{clone}_Resid_orientation_qc.pdf", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_{clone}_Resid_orientation.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_all_orientation_norm_var_GC_CpG_RT_T_comb3_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Expression_all_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/Features_reshape_all_TSS_matrix_woM_all_RT_{clone}.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train80_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train40_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train20_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train5_output_ypred_{clone}.csv", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train80_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # 
"{folder}/{sample}/scNOVA_result_CNN/DNN_train40_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train20_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_CNN/DNN_train5_output_ypred_{clone}_annot.txt", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_plots/Result_scNOVA_plots_{sample}.pdf", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/result_PLSDA_{sample}.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_plots/Result_scNOVA_plots_{sample}_alternative_PLSDA.pdf", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort_num.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort_num_sort_for_chromVAR.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W1.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W2.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C2.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C.bam", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W.bam.bai", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C.bam.bai", - # cell=cell_per_sample[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleA/result.H1.bam", - # 
folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleB/result.H2.bam", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_input_user/strandphaser_output_copy.txt", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), - # expand( - # "{folder}/{sample}/scNOVA_result_haplo/Deeptool_DHS_2kb_H1H2.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_haplo/Deeptool_DHS_2kb_H1H2_sort.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_result_haplo/Deeptool_Genebody_H1H2.tab", - # folder=config["data_location"], - # sample=wildcards.sample, - # ), expand( "{folder}/{sample}/scNOVA_result_haplo/Deeptool_Genebody_H1H2_sort.txt", folder=config["data_location"], sample=wildcards.sample, ), - # expand( - # "{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam", - # clone=clones[wildcards.sample], - # folder=config["data_location"], - # sample=wildcards.sample, - # ), ] l = [sub_e for e in l for sub_e in e] return l @@ -1096,16 +852,26 @@ def get_all_plots(wildcards): ), ) - # Run summary section + # Config section l_outputs.extend( expand( - "{folder}/{sample}/config/run_summary.txt", + "{folder}/{sample}/config/config.yaml", folder=config["data_location"], sample=wildcards.sample, ), ) + # Run summary section + + # l_outputs.extend( + # expand( + # "{folder}/{sample}/config/run_summary.txt", + # folder=config["data_location"], + # sample=wildcards.sample, + # ), + # ) + # from pprint import pprint # pprint(l_outputs) return l_outputs diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk index 080d64b7..f1a0e74e 100755 --- a/workflow/rules/count.smk +++ b/workflow/rules/count.smk @@ -136,6 +136,7 @@ rule symlink_selected_bam: rule remove_unselected_bam: input: + labels="{folder}/{sample}/cell_selection/labels.tsv", bam=unselected_input_bam, bai=unselected_input_bai, output: @@ -196,7 +197,7 @@ if ( "../envs/mc_base.yaml" shell: """ - workflow/scripts/normalization/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size {params.window} > {output.merged} 2>> {log} + workflow/scripts/normalization/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size {params.window} --output {output.merged} """ else: diff --git a/workflow/rules/plots.smk b/workflow/rules/plots.smk index 1acc1e55..221c8610 100644 --- a/workflow/rules/plots.smk +++ b/workflow/rules/plots.smk @@ -17,7 +17,7 @@ if config["ashleys_pipeline"] is False: # "{folder}/{sample}/plots/counts/CountComplete.raw.pdf", report( "{folder}/{sample}/plots/counts/CountComplete.raw.pdf", - category="Mosaic Counts", + category="Mosaic counts", subcategory="{sample}", labels={"Cell": "ALL", "Type": "raw"}, ), @@ -40,7 +40,7 @@ rule divide_pdf: report( "{folder}/{sample}/plots/counts_raw/{cell}.{i, \d+}.pdf", caption="../report/mosaic_counts.rst", - category="Mosaic counts", + category="Mosaic counts cellwise", subcategory="{sample}", labels={"Cell": "{cell}", "Nb": "{i}", "Type": "raw"}, ), @@ -306,7 +306,7 @@ rule scTRIP_multiplot: sv_counts="{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", output: figure=report( - "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png", + 
"{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.pdf", category="scTRIP multiplot", subcategory="{sample}", labels={"Cell": "{cell}", "Chrom": "{chrom}"}, @@ -315,6 +315,7 @@ rule scTRIP_multiplot: "{folder}/log/scTRIP_multiplot/{sample}/{cell}/{chrom}.log", conda: "../envs/rtools.yaml" + container: None resources: mem_mb=get_mem_mb, shell: diff --git a/workflow/rules/regenotyping.smk b/workflow/rules/regenotyping.smk index ebb451df..2bfae7b0 100644 --- a/workflow/rules/regenotyping.smk +++ b/workflow/rules/regenotyping.smk @@ -6,6 +6,7 @@ rule mergeBams: check=remove_unselected_fct, bam=selected_input_bam, bai=selected_input_bai, + labels="{folder}/{sample}/cell_selection/labels.tsv", output: temp("{folder}/{sample}/merged_bam/merged.raw.bam"), log: diff --git a/workflow/rules/scNOVA.smk b/workflow/rules/scNOVA.smk index 04c108d2..9f6c7c5b 100755 --- a/workflow/rules/scNOVA.smk +++ b/workflow/rules/scNOVA.smk @@ -1,8 +1,24 @@ +rule assert_list_of_cells: + input: + labels="{folder}/{sample}/cell_selection/labels.tsv", + subclone_list="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", + selected_cells="{folder}/{sample}/selected/", + output: + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + log: + "{folder}/{sample}/log/assert_list_of_cells.log", + conda: + "../envs/mc_base.yaml" + script: + "../scripts/scNOVA_scripts/assert_list_of_cells.py" + + rule filter_sv_calls: log: "{folder}/{sample}/log/filter_sv_calls/{sample}.log", input: "{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/sv_calls.tsv", conda: @@ -147,6 +163,7 @@ rule remove_dup: None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark.bam", + assert_list_of_cells="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: bam_uniq="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bam_metrix="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono.metrix_dup.txt", @@ -272,6 +289,7 @@ rule filter_input_subclonality: None input: "{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/input_subclonality_{clone}.txt", conda: @@ -973,6 +991,7 @@ rule split_bam_WC: None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", + "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: bam_header="{folder}/{sample}/scNOVA_bam_modified/{cell}.header_WC.sam", bam_C1="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 39353b67..4eaf8464 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -139,3 +139,18 @@ rule samtools_faindex: mem_mb=get_mem_mb_heavy, shell: "samtools faidx {input}" + + +rule save_config: + input: + "config/config.yaml", + output: + "{folder}/{sample}/config/config.yaml", + log: + "{folder}/log/save_config/{sample}.log", + conda: + "../envs/mc_base.yaml" + resources: + mem_mb=get_mem_mb, + script: + "../scripts/utils/dump_config.py" diff --git a/workflow/scripts/normalization/merge-blacklist.py b/workflow/scripts/normalization/merge-blacklist.py index 9a484eec..750d3966 100755 --- a/workflow/scripts/normalization/merge-blacklist.py +++ b/workflow/scripts/normalization/merge-blacklist.py @@ 
-16,6 +16,7 @@ def main(): type=int, help="If the distance between two blacklisted intervals is below this threshold, they are merged.", ) + parser.add_argument("--output", default=None, help="Output file name") parser.add_argument( "--whitelist", default=None, help="TSV file with intervals to be removed from the blacklist (columns: chrom, start, end)." ) @@ -71,7 +72,7 @@ def main(): print("White listing: Removed", additional_whitelist, "bp of sequence for blacklist", file=sys.stderr) - norm_table.to_csv(sys.stdout, index=False, sep="\t") + norm_table.to_csv(args.output, index=False, sep="\t") ## Identify "complex" intervals # segments = calls.groupby(by=['chrom','start','end']).sv_call_name.agg({'is_complex':partial(is_complex, ignore_haplotypes=args.ignore_haplotypes, min_cell_count=args.min_cell_count)}).reset_index().sort_values(['chrom','start','end']) diff --git a/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py b/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py new file mode 100644 index 00000000..651fb7c6 --- /dev/null +++ b/workflow/scripts/scNOVA_scripts/assert_list_of_cells.py @@ -0,0 +1,57 @@ +import pandas as pd +import os + + +def main(labels_file, subclone_file, selected_folder, output_file): + # Read labels.tsv + labels_df = pd.read_csv(labels_file, sep="\t") + labels_cells = set( + labels_df["cell"].str.replace(".sort.mdup.bam", "").values.tolist() + ) + + # Read input_subclonality.txt + input_subclonality = pd.read_csv(subclone_file, sep="\t") + subclone_cells = set(input_subclonality["Filename"].values.tolist()) + + # List files in selected/ folder and process filenames + selected_cells = set( + file.replace(".sort.mdup.bam", "") + for file in os.listdir(selected_folder) + if file.endswith(".sort.mdup.bam") + ) + + # Compare sets + if labels_cells == subclone_cells == selected_cells: + result = "PASS: All cell lists match." + else: + result = "FAIL: Cell lists do not match." 
+ + # Logging details of the mismatch + with open(output_file, "w") as output: + output.write("Labels cells: {}\n".format(labels_cells)) + output.write("Subclone cells: {}\n".format(subclone_cells)) + output.write("Selected cells: {}\n".format(selected_cells)) + output.write("Discrepancy details:\n") + output.write( + "In labels but not in subclone: {}\n".format(labels_cells - subclone_cells) + ) + output.write( + "In subclone but not in labels: {}\n".format(subclone_cells - labels_cells) + ) + output.write( + "In labels but not in selected: {}\n".format(labels_cells - selected_cells) + ) + output.write( + "In selected but not in labels: {}\n".format(selected_cells - labels_cells) + ) + output.write(result) + + +if __name__ == "__main__": + # Extracting Snakemake input variables + labels_file = snakemake.input.labels + subclone_file = snakemake.input.subclone_list + selected_folder = snakemake.input.selected_cells + output_file = snakemake.output[0] + + main(labels_file, subclone_file, selected_folder, output_file) diff --git a/workflow/scripts/utils/dump_config.py b/workflow/scripts/utils/dump_config.py index 4701706a..6b299ee6 100644 --- a/workflow/scripts/utils/dump_config.py +++ b/workflow/scripts/utils/dump_config.py @@ -1,28 +1,22 @@ -import json -import time +import yaml -timestamp = time.strftime("%Y%m%d-%H%M%S") -configured_samples = [] -for key in config.keys(): - if not key.startswith("sample_description"): - continue - sample = key.split("_", 2)[-1] - configured_samples.append(sample) +def update_config(input_file, output_file): + # Load the existing config file + with open(input_file, "r") as file: + flat_file_config = yaml.safe_load(file) -if configured_samples: - second_dump = "config_{}_{}.json".format(timestamp, "_".join(sorted(configured_samples))) -else: - second_dump = "config_{}.json".format(timestamp) + # Update the config with Snakemake parameters + for key, value in snakemake.config.items(): + flat_file_config[key] = value -with open(output[0], "w") as fake: - _ = fake.write(second_dump + "\n(Full configuration dump)") + # Save the updated config to the output file + with open(output_file, "w") as file: + yaml.dump(flat_file_config, file) -with open(second_dump, "w") as dump: - json.dump( - config, - dump, - ensure_ascii=True, - indent=2, - sort_keys=True, - ) + +if __name__ == "__main__": + input_config = snakemake.input[0] + output_config = snakemake.output[0] + + update_config(input_config, output_config) From 07c7548bf35ed199d068f4089fef40d0e03fa002 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 14:00:43 +0000 Subject: [PATCH 3/8] Linting, fmt, config update --- config/config.yaml | 4 ++-- workflow/Snakefile | 12 ++++++------ workflow/rules/common.smk | 4 +++- workflow/rules/plots.smk | 3 ++- workflow/rules/setup.smk | 6 ++++-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 3809643d..017cd8bd 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,10 +3,10 @@ # -------------------------------------------------------- # MosaiCatcher version -version: 2.2.2 +version: 2.2.3 # Ashleys-QC pipeline version -ashleys_pipeline_version: 2.2.2 +ashleys_pipeline_version: 2.2.3 # Email for notifications about the pipeline's status email: "" diff --git a/workflow/Snakefile b/workflow/Snakefile index 4262bd44..652acaaa 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,12 +19,12 @@ if config["ashleys_pipeline"] is True: module ashleys_qc: snakefile: - 
"../../ashleys-qc-pipeline/workflow/Snakefile" - # github( - # "friendsofstrandseq/ashleys-qc-pipeline", - # path="workflow/Snakefile", - # tag=str(config["ashleys_pipeline_version"]), - # ) + # "../../ashleys-qc-pipeline/workflow/Snakefile" + github( + "friendsofstrandseq/ashleys-qc-pipeline", + path="workflow/Snakefile", + tag=str(config["ashleys_pipeline_version"]), + ) config: config diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7af4b6b4..8acedf9f 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -14,7 +14,9 @@ os.environ["LC_CTYPE"] = "C" # print(config["data_location"]) if config["ashleys_pipeline"] is True and config["genecore"] is True: - config["data_location"] = "/".join(config["data_location"].split("/")[:-1]) + config["data_location"] = config["abs_path"].join( + config["data_location"].split("/")[:-1] + ) envvars: diff --git a/workflow/rules/plots.smk b/workflow/rules/plots.smk index 221c8610..23d35f3a 100644 --- a/workflow/rules/plots.smk +++ b/workflow/rules/plots.smk @@ -315,7 +315,8 @@ rule scTRIP_multiplot: "{folder}/log/scTRIP_multiplot/{sample}/{cell}/{chrom}.log", conda: "../envs/rtools.yaml" - container: None + container: + None resources: mem_mb=get_mem_mb, shell: diff --git a/workflow/rules/setup.smk b/workflow/rules/setup.smk index e59889ec..93ed6847 100644 --- a/workflow/rules/setup.smk +++ b/workflow/rules/setup.smk @@ -20,8 +20,10 @@ rule install_BSgenome_package: params: selected_package=lambda wc, input: "BSgenome.{}.UCSC.{}".format( "Mmusculus" if config["reference"] == "mm10" else "Hsapiens", - config["reference"] - ) if config["reference"] in ["hg38", "hg19", "mm10"] else input.package, + config["reference"], + ) + if config["reference"] in ["hg38", "hg19", "mm10"] + else input.package, conda: "../envs/rtools.yaml" resources: From 264c7ee5da7d13c340c04675d7429e4908b0b76a Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 15:01:55 +0000 Subject: [PATCH 4/8] Small fix --- watchdog_pipeline/watchdog_pipeline.py | 17 +++++++++++++---- workflow/rules/count.smk | 2 +- .../scripts/normalization/merge-blacklist.py | 1 + 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/watchdog_pipeline/watchdog_pipeline.py b/watchdog_pipeline/watchdog_pipeline.py index 4c1a6614..ea647898 100644 --- a/watchdog_pipeline/watchdog_pipeline.py +++ b/watchdog_pipeline/watchdog_pipeline.py @@ -39,11 +39,14 @@ # publishdir_location = "/g/korbel/weber/TMP/WORKFLOW_RESULTS_DEV" publishdir_location = "/g/korbel/WORKFLOW_RESULTS" genecore_prefix = path_to_watch -# profile_slurm = ["--profile", "../snakemake_profiles/HPC/dev/slurm_legacy_conda/"] profile_slurm = [ "--profile", - "/g/korbel2/weber/workspace/snakemake_profiles/HPC/slurm_EMBL/", + "/g/korbel2/weber/workspace/snakemake_profiles/HPC/dev/slurm_legacy_conda/", ] +# profile_slurm = [ +# "--profile", +# "/g/korbel2/weber/workspace/snakemake_profiles/HPC/slurm_EMBL/", +# ] profile_dry_run = [ "--profile", "workflow/snakemake_profiles/local/conda/", @@ -297,7 +300,9 @@ def check_unprocessed_folder(self): "PDAC60590MNI", "DXR30hMaja", "DXR42hMaja", - "GM19705", + # "GM19705", + "OrgxDoxocx02", + "GM20355x01", ]: run_id = f"{pipeline}--{plate}--{sample_name}" workflow_id = self.find_workflow_id_by_name( @@ -458,6 +463,10 @@ def check_unprocessed_folder(self): print(panoptes_entry) print(panoptes_data) + if workflow_id: + assert ( + len(panoptes_data) > 0 + ), "Data issue between pika & panoptes" if panoptes_data: panoptes_data = panoptes_data[0] @@ -693,7 +702,7 
@@ def execute_command( "-s", "workflow/Snakefile", "--set-resources", - "ashleys_mark_duplicates:partition=bigmem", + "ashleys_mark_duplicates:constraint='milan\|rome'", "--config", "genecore=True", f"genecore_prefix={genecore_prefix}", diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk index f1a0e74e..7eee69bb 100755 --- a/workflow/rules/count.smk +++ b/workflow/rules/count.smk @@ -215,7 +215,7 @@ else: "../envs/mc_base.yaml" shell: """ - workflow/scripts/normalization/merge-blacklist.py --merge_distance 500000 {input.norm} > {output.merged} 2> {log} + cp {input.norm} {ouput.merged} """ diff --git a/workflow/scripts/normalization/merge-blacklist.py b/workflow/scripts/normalization/merge-blacklist.py index 750d3966..998c9650 100755 --- a/workflow/scripts/normalization/merge-blacklist.py +++ b/workflow/scripts/normalization/merge-blacklist.py @@ -70,6 +70,7 @@ def main(): norm_table.loc[[i], "class"] = "good" additional_whitelist += row.end - row.start + print("White listing: Removed", additional_whitelist, "bp of sequence for blacklist", file=sys.stderr) norm_table.to_csv(args.output, index=False, sep="\t") From aecfa781e76da1249c928674d84c4817c257e14a Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 15:04:32 +0000 Subject: [PATCH 5/8] Small fix --- workflow/rules/count.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk index 7eee69bb..3cc8fdc2 100755 --- a/workflow/rules/count.smk +++ b/workflow/rules/count.smk @@ -215,7 +215,7 @@ else: "../envs/mc_base.yaml" shell: """ - cp {input.norm} {ouput.merged} + cp {input.norm} {output.merged} """ From b0a5a11b0d6ebd1445e9ddc1311b41d61dbb186c Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Mon, 4 Dec 2023 15:32:07 +0000 Subject: [PATCH 6/8] Update dockerfile --- github-actions-runner/Dockerfile-2.2.3.dockerfile | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/github-actions-runner/Dockerfile-2.2.3.dockerfile b/github-actions-runner/Dockerfile-2.2.3.dockerfile index aa4d1c42..89d51957 100644 --- a/github-actions-runner/Dockerfile-2.2.3.dockerfile +++ b/github-actions-runner/Dockerfile-2.2.3.dockerfile @@ -5,7 +5,7 @@ LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb # Step 1: Retrieve conda environments # Conda environment: -# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml +# source: https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_base.yaml # prefix: /conda-envs/87c04f5d115eff742eca84455513deba # name: ashleys_base # channels: @@ -27,10 +27,10 @@ LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb # # Fix sklearn update # - scikit-learn=1.2.2 RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba -COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml +ADD https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml # Conda environment: -# source: ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml +# source: https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_rtools.yaml # prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 # name: rtools # channels: @@ -83,7 +83,7 @@ COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d # # SOLVE R lib issue # - 
r-stringi=1.7.12 RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 -COPY ../ashleys-qc-pipeline/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml +ADD https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml # Conda environment: # source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml @@ -293,7 +293,3 @@ RUN mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --fil mamba env create --prefix /conda-envs/193f60d48796dd17eb847ea689b863a9 --file /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml && \ mamba env create --prefix /conda-envs/ca9641251a8cb0057003875ad776c49f --file /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml && \ mamba clean --all -y -# CUSTOM PART -RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/ -COPY /workflow/scripts/utils/install_R_package.R /conda-envs/ -RUN chmod -R 0777 /conda-envs/598c87b6c764d05e0c66953cc67f2931/lib/R/library && /conda-envs/598c87b6c764d05e0c66953cc67f2931/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz From ec910ddc0622caa2eb87a7c2b2c3c88f875ddd47 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 5 Dec 2023 10:39:39 +0000 Subject: [PATCH 7/8] Minor fixes for scNOVA (2.2.4) --- .gitignore | 2 + config/config.yaml | 2 +- workflow/rules/external_data.smk | 4 +- workflow/rules/scNOVA.smk | 232 +++++++++--------- .../filter_input_subclonality.py | 4 +- .../scripts/scNOVA_scripts/filter_sv_calls.py | 2 +- 6 files changed, 124 insertions(+), 122 deletions(-) diff --git a/.gitignore b/.gitignore index a140b637..ccdccfe3 100644 --- a/.gitignore +++ b/.gitignore @@ -221,3 +221,5 @@ workflow/scripts/plotting/scTRIP_multiplot/scTRIPmultiplot workflow/config/scTRIP_multiplot.ok args.output scNOVA_env_costea.yaml +.keras/keras.json +hs_err_pid2227945.log diff --git a/config/config.yaml b/config/config.yaml index 017cd8bd..2abbf67a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,7 +3,7 @@ # -------------------------------------------------------- # MosaiCatcher version -version: 2.2.3 +version: 2.2.4 # Ashleys-QC pipeline version ashleys_pipeline_version: 2.2.3 diff --git a/workflow/rules/external_data.smk b/workflow/rules/external_data.smk index 71ed4d63..634ac216 100644 --- a/workflow/rules/external_data.smk +++ b/workflow/rules/external_data.smk @@ -173,8 +173,8 @@ rule download_scnova_data: touch("log/config/dl_arbigent_mappability_track.ok"), conda: "../envs/scNOVA/scNOVA_DL.yaml" - container: - None + # container: + # None shell: """ directory="workflow/data/ref_genomes/" diff --git a/workflow/rules/scNOVA.smk b/workflow/rules/scNOVA.smk index 9f6c7c5b..bba0f075 100755 --- a/workflow/rules/scNOVA.smk +++ b/workflow/rules/scNOVA.smk @@ -17,8 +17,8 @@ rule filter_sv_calls: log: "{folder}/{sample}/log/filter_sv_calls/{sample}.log", input: - "{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", - "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + sv="{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv", + assertion="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/sv_calls.tsv", conda: @@ -28,8 +28,8 @@ rule filter_sv_calls: rule scNOVA_final_results: - container: - None + 
# container: + # None input: get_scnova_final_output, output: @@ -43,8 +43,8 @@ rule scNOVA_final_results: rule generate_CN_for_CNN: - container: - None + # container: + # None input: mosaiclassifier_final_results="{folder}/{sample}/plots/final_results/{sample}.txt", subclone="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", @@ -70,8 +70,8 @@ rule generate_CN_for_CNN: rule generate_CN_for_chromVAR: - container: - None + # container: + # None input: TSS_matrix="workflow/data/scNOVA/utils/Strand_seq_matrix_TSS_for_SVM.txt", TES_matrix="workflow/data/scNOVA/utils/Strand_seq_matrix_TES_for_SVM.txt", @@ -97,8 +97,8 @@ rule generate_CN_for_chromVAR: rule remove_low_quality_reads: - container: - None + # container: + # None input: bam="{folder}/{sample}/selected/{cell}.sort.mdup.bam", output: @@ -112,16 +112,16 @@ rule remove_low_quality_reads: mem_mb=get_mem_mb, shell: """ - samtools view -H {input} > {output.bam_header} - samtools view -F 2304 {input.bam} | awk -f workflow/scripts/scNOVA_scripts/awk_1st.awk | cat {output.bam_header} - | samtools view -Sb - > {output.bam_pre} + samtools view -H {input} > {output.bam_header} + samtools view -F 2304 {input.bam} | awk -f workflow/scripts/scNOVA_scripts/awk_1st.awk | cat {output.bam_header} - | samtools view -Sb - > {output.bam_pre} """ rule sort_bam: log: "{folder}/{sample}/log/sort_bam/{cell}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono.bam", output: @@ -140,8 +140,8 @@ rule sort_bam: rule index_num1: log: "{folder}/{sample}/log/index_num1/{cell}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark.bam", output: @@ -159,8 +159,8 @@ rule index_num1: rule remove_dup: log: "{folder}/{sample}/log/remove_dup/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark.bam", assert_list_of_cells="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", @@ -180,8 +180,8 @@ rule remove_dup: rule index_num2: log: "{folder}/{sample}/log/index_num2/{cell}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", output: @@ -199,8 +199,8 @@ rule index_num2: rule count_reads_split: log: "{folder}/{sample}/log/count_reads_split/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bai="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.bai", @@ -221,8 +221,8 @@ rule count_reads_split: rule count_reads_split_aggr: log: "{folder}/{sample}/log/count_reads_split_aggr.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_split/{cell}.tab", @@ -244,8 +244,8 @@ rule count_reads_split_aggr: rule count_sort_by_coordinate: log: "{folder}/{sample}/log/count_sort_by_coordinate/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/{sample}.tab", output: @@ -261,8 +261,8 @@ rule count_sort_by_coordinate: rule count_sort_annotate_geneid: log: "{folder}/{sample}/log/count_sort_annotate_geneid/{sample}.log", - container: - None + # container: + # None input: count_table="{folder}/{sample}/scNOVA_result/{sample}_sort.txt", GB_matrix="workflow/data/scNOVA/utils/Strand_seq_matrix_Genebody_for_SCDE.txt", @@ -278,18 +278,18 @@ 
rule count_sort_annotate_geneid: mem_mb=get_mem_mb, shell: """ - Rscript {params.count_sort_annotate_geneid} {input.count_table} {input.GB_matrix} {output} + Rscript {params.count_sort_annotate_geneid} {input.count_table} {input.GB_matrix} {output} """ rule filter_input_subclonality: log: "{folder}/{sample}/log/filter_input_subclonality/{clone}.log", - container: - None + # container: + # None input: - "{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", - "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + subclonality="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", + assertion="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: "{folder}/{sample}/scNOVA_input_user/input_subclonality_{clone}.txt", conda: @@ -301,8 +301,6 @@ rule filter_input_subclonality: rule merge_bam_clones: log: "{folder}/{sample}/log/merge_bam_clones/{clone}.log", - container: - None input: bam=lambda wc: expand( "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", @@ -324,6 +322,8 @@ rule merge_bam_clones: line="{folder}/{sample}/scNOVA_input_user/{clone}_line.txt", conda: "../envs/scNOVA/scNOVA_bioinfo_tools.yaml" + # container: + # None resources: mem_mb=get_mem_mb, shell: @@ -335,8 +335,8 @@ rule merge_bam_clones: rule count_reads_for_DNN: log: "{folder}/{sample}/log/count_reads_for_DNN/{clone}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam", bai="{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam.bai", @@ -357,8 +357,8 @@ rule count_reads_for_DNN: rule count_reads_for_DNN_aggr: log: "{folder}/{sample}/log/count_reads_for_DNN_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_for_DNN/Deeptool_Genes_for_CNN_{clone}.tab", @@ -380,8 +380,8 @@ rule count_reads_for_DNN_aggr: rule count_reads_for_DNN_sc: log: "{folder}/{sample}/log/count_reads_for_DNN_sc/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bai="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.bai", @@ -402,8 +402,8 @@ rule count_reads_for_DNN_sc: rule count_reads_for_DNN_sc_aggr: log: "{folder}/{sample}/log/count_reads_for_DNN_sc_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_for_DNN_sc/Deeptool_Genes_for_CNN_{cell}.tab", @@ -425,8 +425,8 @@ rule count_reads_for_DNN_sc_aggr: rule count_reads_chr_length: log: "{folder}/{sample}/log/count_reads_chr_length/{clone}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam", bai="{folder}/{sample}/scNOVA_bam_merge/{clone}.merge.bam.bai", @@ -449,8 +449,8 @@ rule count_reads_chr_length: rule count_reads_chr_length_aggr: log: "{folder}/{sample}/log/count_reads_chr_length_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_chr_length/Deeptool_chr_length_{clone}.tab", @@ -472,8 +472,8 @@ rule count_reads_chr_length_aggr: rule count_reads_chr_length_sc: log: "{folder}/{sample}/log/count_reads_chr_length_sc/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", 
bai="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.bai", @@ -494,8 +494,8 @@ rule count_reads_chr_length_sc: rule count_reads_chr_length_sc_aggr: log: "{folder}/{sample}/log/count_reads_chr_length_sc_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_chr_length_sc/Deeptool_chr_length_{cell}.tab", @@ -517,8 +517,8 @@ rule count_reads_chr_length_sc_aggr: rule count_reads_for_DNN_sort: log: "{folder}/{sample}/log/count_reads_for_DNN_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}.tab", output: @@ -534,8 +534,8 @@ rule count_reads_for_DNN_sort: rule count_reads_for_DNN_sort_lab: log: "{folder}/{sample}/log/count_reads_for_DNN_sort_lab/{sample}.log", - container: - None + # container: + # None input: count_reads_sort="{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort.txt", Ref_bed="workflow/data/scNOVA/utils/bin_Genes_for_CNN_num_sort.txt", @@ -557,8 +557,8 @@ rule count_reads_for_DNN_sort_lab: rule count_reads_for_DNN_sort_label_sort: log: "{folder}/{sample}/log/count_reads_for_DNN_sort_label_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab.txt", output: @@ -574,8 +574,8 @@ rule count_reads_for_DNN_sort_label_sort: rule count_reads_for_DNN_normalization: log: "{folder}/{sample}/log/count_reads_for_DNN_normalization/{clone}.log", - container: - None + # container: + # None input: count_reads_chr_length="{folder}/{sample}/scNOVA_result/Deeptool_chr_length_{sample}.tab", count_reads_sort_label="{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sort_lab.txt", @@ -605,8 +605,8 @@ rule count_reads_for_DNN_normalization: rule count_reads_for_DNN_sc_sort: log: "{folder}/{sample}/log/count_reads_for_DNN_sc_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc.tab", output: @@ -622,8 +622,8 @@ rule count_reads_for_DNN_sc_sort: rule count_reads_for_DNN_sc_sort_lab: log: "{folder}/{sample}/log/count_reads_for_DNN_sc_sort_lab/{sample}.log", - container: - None + # container: + # None input: count_reads_sort="{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort.txt", Ref_bed="workflow/data/scNOVA/utils/bin_Genes_for_CNN_num_sort.txt", @@ -645,8 +645,8 @@ rule count_reads_for_DNN_sc_sort_lab: rule count_reads_for_DNN_sc_sort_label_sort: log: "{folder}/{sample}/log/count_reads_for_DNN_sc_sort_label_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab.txt", output: @@ -662,8 +662,8 @@ rule count_reads_for_DNN_sc_sort_label_sort: rule generate_feature_sc_var: log: "{folder}/{sample}/log/generate_feature_sc_var/{clone}.log", - container: - None + # container: + # None input: subclone_list="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", count_reads_sc_sort="{folder}/{sample}/scNOVA_result/Deeptool_Genes_for_CNN_{sample}_sc_sort_lab_final.txt", @@ -693,8 +693,8 @@ rule generate_feature_sc_var: rule combine_features: log: "{folder}/{sample}/log/combine_features/{clone}.log", - container: - None + # container: + # None input: TSS_matrix="workflow/data/scNOVA/utils/Strand_seq_matrix_TSS_for_SVM.txt", 
table_GC_imput="workflow/data/scNOVA/utils/Features_reshape_GC_orientation_impute.txt", @@ -726,8 +726,8 @@ rule combine_features: rule infer_expressed_genes_split: log: "{folder}/{sample}/log/infer_expressed_genes_split/{clone}_{chrom}_{i}.log", - container: - None + # container: + # None input: features="{folder}/{sample}/scNOVA_result/Features_reshape_all_orientation_norm_var_GC_CpG_RT_T_comb3_{clone}.txt", TSS_annot="{folder}/{sample}/scNOVA_result/Features_reshape_all_TSS_matrix_woM_all_RT_{clone}.txt", @@ -744,8 +744,8 @@ rule infer_expressed_genes_split: rule gather_infer_expressed_genes_split: log: "{folder}/{sample}/log/gather_infer_expressed_genes_split/{clone}_{i}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result_CNN/{chrom}/DNN_train{i}_output_ypred_{clone}.csv", @@ -766,8 +766,8 @@ rule gather_infer_expressed_genes_split: rule aggr_models_touch: log: "{folder}/{sample}/log/aggr_models_touch/{clone}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result_CNN/DNN_train{i}_output_ypred_{clone}.csv", @@ -783,8 +783,8 @@ rule aggr_models_touch: rule annot_expressed_genes: log: "{folder}/{sample}/log/annot_expressed_genes/{clone}.log", - container: - None + # container: + # None input: TSS_annot="{folder}/{sample}/scNOVA_result/Features_reshape_all_TSS_matrix_woM_all_RT_{clone}.txt", train80="{folder}/{sample}/scNOVA_result_CNN/DNN_train80_output_ypred_{clone}.csv", @@ -814,8 +814,8 @@ rule annot_expressed_genes: rule infer_differential_gene_expression: log: "{folder}/{sample}/log/infer_differential_gene_expression/{sample}.log", - container: - None + # container: + # None input: Genebody_NO="{folder}/{sample}/scNOVA_result/{sample}_sort.txt", clonality="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", @@ -847,8 +847,8 @@ rule infer_differential_gene_expression: rule infer_differential_gene_expression_alt: log: "{folder}/{sample}/log/infer_differential_gene_expression_alt/{sample}.log", - container: - None + # container: + # None input: Genebody_NO="{folder}/{sample}/scNOVA_result/{sample}_sort.txt", clonality="{folder}/{sample}/scNOVA_input_user/input_subclonality.txt", @@ -874,15 +874,15 @@ rule infer_differential_gene_expression_alt: time="10:00:00", shell: """ - Rscript {params.infer_diff_gene_expression_alt} {input.Genebody_NO} {input.clonality} {input.TSS_matrix} {input.GB_matrix} {input.CNN_result1} {input.CNN_result2} {input.input_matrix} {output.result_table} {output.result_plot} {input.final_result} + Rscript {params.infer_diff_gene_expression_alt} {input.Genebody_NO} {input.clonality} {input.TSS_matrix} {input.GB_matrix} {input.CNN_result1} {input.CNN_result2} {input.input_matrix} {output.result_table} {output.result_plot} {input.final_result} """ rule count_reads_CREs: log: "{folder}/{sample}/log/count_reads_CREs/{cell}.log", - container: - None + # container: + # None input: bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", bai="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.bai", @@ -905,8 +905,8 @@ rule count_reads_CREs: rule count_reads_CREs_aggr: log: "{folder}/{sample}/log/count_reads_CREs_aggr/{sample}.log", - container: - None + # container: + # None input: lambda wc: expand( "{folder}/{sample}/scNOVA_result/count_reads_CREs/{cell}_CREs_2kb.tab", @@ -928,8 +928,8 @@ rule count_reads_CREs_aggr: rule count_sort_by_coordinate_CREs: log: 
"{folder}/{sample}/log/count_sort_by_coordinate_CREs/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb.tab", output: @@ -945,8 +945,8 @@ rule count_sort_by_coordinate_CREs: rule count_sort_annotate_chrid_CREs: log: "{folder}/{sample}/log/count_sort_annotate_chrid_CREs/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort.txt", output: @@ -963,15 +963,15 @@ rule count_sort_annotate_chrid_CREs: mem_mb=get_mem_mb, shell: """ - Rscript {params.count_sort_annotate_chrid_CREs} {input} {output} + Rscript {params.count_sort_annotate_chrid_CREs} {input} {output} """ rule count_sort_annotate_chrid_CREs_sort: log: "{folder}/{sample}/log/count_sort_annotate_chrid_CREs_sort/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result/{sample}_CREs_2kb_sort_num.txt", output: @@ -987,11 +987,11 @@ rule count_sort_annotate_chrid_CREs_sort: rule split_bam_WC: log: "{folder}/{sample}/log/split_bam_WC/{cell}.log", - container: - None + # container: + # None input: - "{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", - "{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", + bam="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam", + assertion="{folder}/{sample}/scNOVA_input_user/assert_list_of_cells.txt", output: bam_header="{folder}/{sample}/scNOVA_bam_modified/{cell}.header_WC.sam", bam_C1="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", @@ -1004,19 +1004,19 @@ rule split_bam_WC: mem_mb=get_mem_mb, shell: """ - samtools view -H {input} > {output.bam_header} - samtools view -f 99 {input} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_C1} - samtools view -f 147 {input} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_C2} - samtools view -f 83 {input} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_W1} - samtools view -f 163 {input} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_W2} + samtools view -H {input.bam} > {output.bam_header} + samtools view -f 99 {input.bam} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_C1} + samtools view -f 147 {input.bam} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_C2} + samtools view -f 83 {input.bam} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_W1} + samtools view -f 163 {input.bam} | cat {output.bam_header} - | samtools view -Sb - > {output.bam_W2} """ rule split_bam_WC_merge: log: "{folder}/{sample}/log/split_bam_WC_merge/{cell}.log", - container: - None + # container: + # None input: bam_C1="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C1.bam", bam_C2="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C2.bam", @@ -1039,8 +1039,8 @@ rule split_bam_WC_merge: rule split_bam_WC_index: log: "{folder}/{sample}/log/split_bam_WC_index/{cell}.log", - container: - None + # container: + # None input: bam_C="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.C.bam", bam_W="{folder}/{sample}/scNOVA_bam_modified/{cell}.sc_pre_mono_sort_for_mark_uniq.bam.W.bam", @@ -1061,8 +1061,6 @@ rule split_bam_WC_index: rule perl_split_sc: log: "{folder}/{sample}/log/perl_split_sc/{sample}.log", - container: - None input: strandphaser_output="{folder}/{sample}/strandphaser/strandphaser_phased_haps_merged.txt", 
bam_C_ind=lambda wc: expand( @@ -1085,6 +1083,8 @@ rule perl_split_sc: "{folder}/{sample}/log/perl_split_sc.log", conda: "../envs/scNOVA/scNOVA_bioinfo_tools.yaml" + # container: + # None resources: mem_mb=get_mem_mb, shell: @@ -1096,8 +1096,8 @@ rule perl_split_sc: rule count_reads_CREs_haplo: log: "{folder}/{sample}/log/count_reads_CREs_haplo/{sample}.log", - container: - None + # container: + # None input: bam1="{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleA/result.H1.bam", bam2="{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleB/result.H2.bam", @@ -1118,8 +1118,8 @@ rule count_reads_CREs_haplo: rule count_reads_CREs_haplo_sort_by_coordinate: log: "{folder}/{sample}/log/count_reads_CREs_haplo_sort_by_coordinate/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result_haplo/Deeptool_DHS_2kb_H1H2.tab", output: @@ -1135,8 +1135,8 @@ rule count_reads_CREs_haplo_sort_by_coordinate: rule count_reads_genebody_haplo: log: "{folder}/{sample}/log/count_reads_genebody_haplo/{sample}.log", - container: - None + # container: + # None input: bam1="{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleA/result.H1.bam", bam2="{folder}/{sample}/scNOVA_nucleosomes_bam/nucleosome_sampleB/result.H2.bam", @@ -1159,8 +1159,8 @@ rule count_reads_genebody_haplo: rule count_reads_genebody_haplo_sort_by_coordinate_genebody: log: "{folder}/{sample}/log/count_reads_genebody_haplo_sort_by_coordinate_genebody/{sample}.log", - container: - None + # container: + # None input: "{folder}/{sample}/scNOVA_result_haplo/Deeptool_Genebody_H1H2.tab", output: diff --git a/workflow/scripts/scNOVA_scripts/filter_input_subclonality.py b/workflow/scripts/scNOVA_scripts/filter_input_subclonality.py index 6693de13..2bd08ae3 100644 --- a/workflow/scripts/scNOVA_scripts/filter_input_subclonality.py +++ b/workflow/scripts/scNOVA_scripts/filter_input_subclonality.py @@ -1,6 +1,6 @@ import pandas as pd -df = pd.read_csv(snakemake.input[0], sep="\t") +df = pd.read_csv(snakemake.input.subclonality, sep="\t") df.loc[df["Subclonality"] == snakemake.wildcards.clone].to_csv( snakemake.output[0], sep="\t", index=False -) \ No newline at end of file +) diff --git a/workflow/scripts/scNOVA_scripts/filter_sv_calls.py b/workflow/scripts/scNOVA_scripts/filter_sv_calls.py index 730058ee..19ba300f 100644 --- a/workflow/scripts/scNOVA_scripts/filter_sv_calls.py +++ b/workflow/scripts/scNOVA_scripts/filter_sv_calls.py @@ -1,4 +1,4 @@ import pandas as pd -df = pd.read_csv(snakemake.input[0], sep="\t") +df = pd.read_csv(snakemake.input.sv, sep="\t") df.loc[df["chrom"] != "chrY"].to_csv(snakemake.output[0], sep="\t", index=False) From 1d8c6889665cc1fe5bd796d896c0f62e41acf8c3 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 5 Dec 2023 10:44:02 +0000 Subject: [PATCH 8/8] Dockerfile --- .../Dockerfile-2.2.4.dockerfile | 300 ++++++++++++++++++ .../add_T2T_part_to_Dockerfile.sh | 1 - 2 files changed, 300 insertions(+), 1 deletion(-) create mode 100644 github-actions-runner/Dockerfile-2.2.4.dockerfile diff --git a/github-actions-runner/Dockerfile-2.2.4.dockerfile b/github-actions-runner/Dockerfile-2.2.4.dockerfile new file mode 100644 index 00000000..f03d13ea --- /dev/null +++ b/github-actions-runner/Dockerfile-2.2.4.dockerfile @@ -0,0 +1,300 @@ +FROM condaforge/mambaforge:latest +LABEL io.github.snakemake.containerized="true" +LABEL io.github.snakemake.conda_env_hash="8c338e2bbe95ae23ac438e1ac650a859ed4dbb9a77747c17f62707ea2f67a667" + +# Step 1: Retrieve conda environments + +# Conda 
environment: +# source: https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_base.yaml +# prefix: /conda-envs/87c04f5d115eff742eca84455513deba +# name: ashleys_base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - samtools +# - tabix +# - bwa +# - sambamba +# - mosaicatcher +# # - alfred +# - ashleys-qc +# - pandas +# # PUBLISHDIR +# - rsync +# # MULTIQC +# - multiqc +# # Fix sklearn update +# - scikit-learn=1.2.2 +RUN mkdir -p /conda-envs/87c04f5d115eff742eca84455513deba +ADD https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_base.yaml /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml + +# Conda environment: +# source: https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_rtools.yaml +# prefix: /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +# name: rtools +# channels: +# - conda-forge +# - bioconda +# - r +# - anaconda +# dependencies: +# # - bioconductor-biocparallel +# # - bioconductor-bsgenome +# # - bioconductor-bsgenome.hsapiens.ucsc.hg19 +# # - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# # - bioconductor-fastseg +# # - bioconductor-genomicalignments +# - bioconductor-genomicranges +# # - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - r-assertthat +# - r-base +# # - r-biocmanager +# - r-cowplot +# - r-data.table +# # - r-devtools +# # - r-doparallel +# # - r-foreach +# - r-ggplot2 +# # - r-gtools +# - r-reshape2 +# # - r-zoo +# # - r-dplyr +# # - r-mc2d +# # - r-pheatmap +# # - bioconductor-complexheatmap +# # - r-gplots +# - r-scales +# - r-rcolorbrewer +# # - r-stringr +# - r-cairo +# - fonts-anaconda +# # NEW +# - bioconductor-edger +# - r-r.utils +# # PLATE PLOT +# - r-dplyr +# - r-platetools +# - r-viridis +# # GC_correction +# - r-tidyr +# - r-ggpubr +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 +ADD https://github.com/friendsofstrandseq/ashleys-qc-pipeline/raw/2.2.3/workflow/envs/ashleys_rtools.yaml /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml +# prefix: /conda-envs/5681728a49bd83ceed09ba194330c858 +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - bwa ==0.7.17 +RUN mkdir -p /conda-envs/5681728a49bd83ceed09ba194330c858 +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/bwa/index/environment.yaml /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml + +# Conda environment: +# source: https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml +# prefix: /conda-envs/08d4368302a4bdf7eda6b536495efe7d +# channels: +# - bioconda +# - conda-forge +# - defaults +# dependencies: +# - fastqc ==0.11.9 +RUN mkdir -p /conda-envs/08d4368302a4bdf7eda6b536495efe7d +ADD https://github.com/snakemake/snakemake-wrappers/raw/v1.7.0/bio/fastqc/environment.yaml /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_base.yaml +# prefix: /conda-envs/c80307395eddf442c2fb6870f40d822b +# name: mc-base +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - pandas +# - intervaltree +# - scipy +# - pysam +# - tqdm +# - perl +# - pypdf2 +# - parmap +# # NEW +# - pyyaml +# - seaborn +# - matplotlib +# # SOLVE se-pe detection +# - samtools +# # ArbiGent Hufsah deps +# - pytables +# - xopen +RUN mkdir -p 
/conda-envs/c80307395eddf442c2fb6870f40d822b +COPY workflow/envs/mc_base.yaml /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml + +# Conda environment: +# source: workflow/envs/mc_bioinfo_tools.yaml +# prefix: /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +# name: mc-bioinfo-tools +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bcftools +# - freebayes +# - mosaicatcher +# - samtools +# - tabix +# - whatshap +RUN mkdir -p /conda-envs/f251d84cdc9f25d0e14b48e780261d66 +COPY workflow/envs/mc_bioinfo_tools.yaml /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml + +# Conda environment: +# source: workflow/envs/rtools.yaml +# prefix: /conda-envs/598c87b6c764d05e0c66953cc67f2931 +# name: rtools +# channels: +# - bioconda +# - conda-forge +# - r +# - anaconda +# dependencies: +# # # NEW +# - strandphaser +# # ############### +# - bioconductor-biocparallel +# - bioconductor-bsgenome +# - bioconductor-bsgenome.hsapiens.ucsc.hg38 +# - bioconductor-complexheatmap +# # - bioconductor-fastseg +# - bioconductor-genomicalignments +# - bioconductor-genomicranges +# - bioconductor-rsamtools +# # - bioconductor-s4vectors +# - fonts-anaconda +# - r-assertthat +# - r-base +# - r-biocmanager +# - r-cairo +# - r-cowplot +# - r-data.table +# - r-devtools +# - r-doparallel +# - r-dplyr +# - r-foreach +# - r-ggplot2 +# - r-gplots +# - r-gtools +# - r-mc2d +# - r-rcolorbrewer +# - r-reshape2 +# - r-scales +# - r-stringr +# # SV_CALLS_DEV +# # - r-zoo +# - r-r.utils +# - r-ggnewscale +# # HEATMAP +# - r-tidyr +# # ARBIGENT +# - r-reshape +# - r-optparse +# - r-tidyr +# - r-ggbeeswarm +# - r-pheatmap +# # GC_correction +# - r-ggpubr +# - bioconductor-edger +# # SOLVE R lib issue +# - r-stringi=1.7.12 +RUN mkdir -p /conda-envs/598c87b6c764d05e0c66953cc67f2931 +COPY workflow/envs/rtools.yaml /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_DL.yaml +# prefix: /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +# name: scNOVA_DL +# channels: +# - conda-forge +# - anaconda +# dependencies: +# - tensorflow=1.15.0 +# - scikit-learn=0.21.3 +# - python=3.7.4 +# - matplotlib=3.1.1 +# - pandas=0.25.3 +# - h5py=2.10.0 +# - numpy +# # scNOVA archive +# - unzip +# # Fix +RUN mkdir -p /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 +COPY workflow/envs/scNOVA/scNOVA_DL.yaml /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_R.yaml +# prefix: /conda-envs/193f60d48796dd17eb847ea689b863a9 +# name: scNOVA +# channels: +# - bioconda +# - conda-forge +# - r +# dependencies: +# - bioconductor-deseq2=1.30.0 +# - r-matrixstats=0.58.0 +# - r-pheatmap=1.0.12 +# - r-gplots=3.1.1 +# - r-umap=0.2.7.0 +# - r-rtsne=0.15 +# - r-factoextra=1.0.7 +# - r-pracma=2.3.3 +# - bioconductor-chromvar=1.12.0 +# - r-nabor=0.5.0 +# - bioconductor-motifmatchr=1.12.0 +# - bioconductor-bsgenome.hsapiens.ucsc.hg38=1.4.3 +# - bioconductor-jaspar2016=1.18.0 +# - r-codetools=0.2_18 +# - r-fitdistrplus +# - r-doparallel +# - r-foreach +RUN mkdir -p /conda-envs/193f60d48796dd17eb847ea689b863a9 +COPY workflow/envs/scNOVA/scNOVA_R.yaml /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml + +# Conda environment: +# source: workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml +# prefix: /conda-envs/ca9641251a8cb0057003875ad776c49f +# name: scNOVA_bioinfo_tools +# channels: +# - conda-forge +# - bioconda +# - anaconda +# dependencies: +# - samtools +# - biobambam +# - bedtools +RUN mkdir -p 
/conda-envs/ca9641251a8cb0057003875ad776c49f +COPY workflow/envs/scNOVA/scNOVA_bioinfo_tools.yaml /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml + +# Step 2: Generate conda environments + +RUN mamba env create --prefix /conda-envs/87c04f5d115eff742eca84455513deba --file /conda-envs/87c04f5d115eff742eca84455513deba/environment.yaml && \ + mamba env create --prefix /conda-envs/9b847fc31baae8e01dfb7ce438a56b71 --file /conda-envs/9b847fc31baae8e01dfb7ce438a56b71/environment.yaml && \ + mamba env create --prefix /conda-envs/5681728a49bd83ceed09ba194330c858 --file /conda-envs/5681728a49bd83ceed09ba194330c858/environment.yaml && \ + mamba env create --prefix /conda-envs/08d4368302a4bdf7eda6b536495efe7d --file /conda-envs/08d4368302a4bdf7eda6b536495efe7d/environment.yaml && \ + mamba env create --prefix /conda-envs/c80307395eddf442c2fb6870f40d822b --file /conda-envs/c80307395eddf442c2fb6870f40d822b/environment.yaml && \ + mamba env create --prefix /conda-envs/f251d84cdc9f25d0e14b48e780261d66 --file /conda-envs/f251d84cdc9f25d0e14b48e780261d66/environment.yaml && \ + mamba env create --prefix /conda-envs/598c87b6c764d05e0c66953cc67f2931 --file /conda-envs/598c87b6c764d05e0c66953cc67f2931/environment.yaml && \ + mamba env create --prefix /conda-envs/1ede379ce8d378df7dca25b2bf4111f3 --file /conda-envs/1ede379ce8d378df7dca25b2bf4111f3/environment.yaml && \ + mamba env create --prefix /conda-envs/193f60d48796dd17eb847ea689b863a9 --file /conda-envs/193f60d48796dd17eb847ea689b863a9/environment.yaml && \ + mamba env create --prefix /conda-envs/ca9641251a8cb0057003875ad776c49f --file /conda-envs/ca9641251a8cb0057003875ad776c49f/environment.yaml && \ + mamba clean --all -y + +# CUSTOM PART +RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/ +COPY /workflow/scripts/utils/install_R_package.R /conda-envs/ +RUN chmod -R 0777 /conda-envs/598c87b6c764d05e0c66953cc67f2931/lib/R/library && /conda-envs/598c87b6c764d05e0c66953cc67f2931/bin/Rscript /conda-envs/install_R_package.R /workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz diff --git a/github-actions-runner/add_T2T_part_to_Dockerfile.sh b/github-actions-runner/add_T2T_part_to_Dockerfile.sh index 7c631edd..3d7fcc85 100644 --- a/github-actions-runner/add_T2T_part_to_Dockerfile.sh +++ b/github-actions-runner/add_T2T_part_to_Dockerfile.sh @@ -25,7 +25,6 @@ fi # Append custom steps to the Dockerfile { - echo '\n' echo "# CUSTOM PART" echo "RUN wget https://zenodo.org/record/7697400/files/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz -P /workflow/data/ref_genomes/" echo "COPY /workflow/scripts/utils/install_R_package.R /conda-envs/"
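
The split_bam_WC rule patched above separates each cell's BAM into C (Crick) and W (Watson) read sets purely by SAM flag value (99, 147, 83, 163) before the per-strand merge and indexing steps (split_bam_WC_merge, split_bam_WC_index). A minimal pysam sketch of the same selection logic, with placeholder file names; the rule itself shells out to samtools and writes the header to a separate SAM first, which pysam's template argument makes unnecessary here:

    import pysam

    # Flag values used by the rule (`samtools view -f <flag>` keeps reads with at
    # least these bits set):
    #   99 = paired, proper pair, mate reverse, first in pair   -> C1
    #  147 = paired, proper pair, read reverse, second in pair  -> C2
    #   83 = paired, proper pair, read reverse, first in pair   -> W1
    #  163 = paired, proper pair, mate reverse, second in pair  -> W2
    FLAG_TO_SPLIT = {99: "C1", 147: "C2", 83: "W1", 163: "W2"}

    with pysam.AlignmentFile("cell.sc_pre_mono_sort_for_mark_uniq.bam", "rb") as bam:
        # template=bam copies the input header into each output BAM
        outputs = {
            name: pysam.AlignmentFile(f"cell.{name}.bam", "wb", template=bam)
            for name in FLAG_TO_SPLIT.values()
        }
        for read in bam:
            for flag, name in FLAG_TO_SPLIT.items():
                # same semantics as `samtools view -f`: all bits of <flag> must be set
                if read.flag & flag == flag:
                    outputs[name].write(read)
        for out in outputs.values():
            out.close()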
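
The last two script hunks (filter_input_subclonality.py, filter_sv_calls.py) switch from positional to named access on the snakemake object (snakemake.input[0] becoming snakemake.input.subclonality and snakemake.input.sv), which presumes the corresponding rules declare their inputs with matching keywords. A minimal sketch of that pairing, with a hypothetical rule name and paths; only the keyword-to-attribute correspondence is the point:

    rule filter_sv_calls:
        input:
            sv="{folder}/{sample}/sv_calls/sv_calls_stringent.tsv",  # hypothetical path
        output:
            "{folder}/{sample}/sv_calls/sv_calls_stringent_noYchr.tsv",
        script:
            "../scripts/scNOVA_scripts/filter_sv_calls.py"

    # Inside filter_sv_calls.py, the injected `snakemake` object resolves the keyword:
    # snakemake.input.sv is the path bound to `sv=` above, so reordering or adding
    # rule inputs no longer silently changes what the script reads.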