diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fc370376..6522ff26 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -48,6 +48,8 @@ jobs:
             exec_profile: singularity
           - test_profile: test_lfq
             exec_profile: conda
+          - test_profile: test_dda_id
+            exec_profile: conda
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v4
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac206374..135dc698 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - [#386](https://github.com/bigbio/quantms/pull/386) Make validation of ontology terms optional
+- [#398](https://github.com/bigbio/quantms/pull/398) Moved Python scripts to the quantms-utils package.
+- [#389](https://github.com/bigbio/quantms/pull/389) Introduced DIA-NN 1.9.1 to the pipeline, currently only available in Singularity.
+- [#397](https://github.com/bigbio/quantms/pull/397) Added more options to SDRF validation.
 
 ### Fixed
 
@@ -15,12 +18,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Changed`
 
+- [#391](https://github.com/bigbio/quantms/pull/391) Move mzML statistics from CSV to Parquet files.
+- [#386](https://github.com/bigbio/quantms/pull/386) Make validation of ontology terms in the input SDRF file optional.
+- [#374](https://github.com/bigbio/quantms/pull/374) Create the common MSGF+ database in one step before MSGF+ runs on each MS run file.
+
 ### `Fixed`
 
 - [#396](https://github.com/bigbio/quantms/pull/396) Added verification of tar archive unpacking to prevent silent failures.
 
 ### `Dependencies`
 
+- quantms-utils==0.0.7
+- diann==1.9.1
+
 ### `Parameters`
 
 - `validate_ontologies`: enable or disable validating ontologies in the input SDRF file.
diff --git a/bin/add_sage_feature.py b/bin/add_sage_feature.py
deleted file mode 100755
index 8d53fa06..00000000
--- a/bin/add_sage_feature.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python
-# Add extra features to Sage idXML.
Adding extra feature in Sage isn't known input for PSMFeatureExtractor - -import sys - -import pandas as pd -import pyopenms as oms - - -def add_feature(idx_file, output_file, feat_file): - extra_feat = [] - feat = pd.read_csv(feat_file, sep="\t") - for _, row in feat.iterrows(): - if row["feature_generator"] == "psm_file": - continue - else: - extra_feat.append(row["feature_name"]) - print("Adding extra feature: {}".format(extra_feat)) - protein_ids = [] - peptide_ids = [] - oms.IdXMLFile().load(idx_file, protein_ids, peptide_ids) - SearchParameters = protein_ids[0].getSearchParameters() - features = SearchParameters.getMetaValue("extra_features") - extra_features = features + "," + ",".join(extra_feat) - SearchParameters.setMetaValue("extra_features", extra_features) - protein_ids[0].setSearchParameters(SearchParameters) - oms.IdXMLFile().store(output_file, protein_ids, peptide_ids) - print("Done") - - -def main(): - idx_file = sys.argv[1] - output_file = sys.argv[2] - feat_file = sys.argv[3] - add_feature(idx_file, output_file, feat_file) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index e7157b25..00000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env python - -# nf-core: Update the script to check the sdrf -# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - -import errno -import os -import sys - -import click -import pandas as pd -from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame -from sdrf_pipelines.sdrf.sdrf_schema import DEFAULT_TEMPLATE, MASS_SPECTROMETRY - -CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) - - -@click.group(context_settings=CONTEXT_SETTINGS) -def cli(): - """ - This is the main tool that gives access to all commands to convert SDRF files into pipelines-specific configuration - files. - """ - pass - - -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def print_error(error, context="Line", context_str=""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - -def check_sdrf( - input_sdrf: str, - skip_ms_validation: bool = False, - skip_factor_validation: bool = False, - skip_experimental_design_validation: bool = False, - use_ols_cache_only: bool = False, - skip_sdrf_validation: bool = False, -): - """ - Check the SDRF file for errors. If any errors are found, print them and exit with a non-zero status code. - @param input_sdrf: Path to the SDRF file to check - @param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. 
posttranslational modifications) - @param skip_factor_validation: Disable the validation of factor values in SDRF - @param skip_experimental_design_validation: Disable the validation of experimental design - @param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service - @param skip_sdrf_validation: Disable the validation of SDRF - """ - if skip_sdrf_validation: - print("No SDRF validation was performed.") - sys.exit(0) - - df = SdrfDataFrame.parse(input_sdrf) - errors = df.validate(DEFAULT_TEMPLATE, use_ols_cache_only) - - if not skip_ms_validation: - errors = errors + df.validate(MASS_SPECTROMETRY, use_ols_cache_only) - - if not skip_factor_validation: - errors = errors + df.validate_factor_values() - - if not skip_experimental_design_validation: - errors = errors + df.validate_experimental_design() - - for error in errors: - print(error) - - sys.exit(bool(errors)) - - -def check_expdesign(expdesign): - """ - Check the expdesign file for errors. If any errors are found, print them and exit with a non-zero status code. - @param expdesign: Path to the expdesign file to check - """ - data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str) - data = data.dropna() - schema_file = ["Fraction_Group", "Fraction", "Spectra_Filepath", "Label", "Sample"] - schema_sample = ["Sample", "MSstats_Condition", "MSstats_BioReplicate"] - - # check table format: two table - with open(expdesign, "r") as f: - lines = f.readlines() - try: - empty_row = lines.index("\n") - except ValueError: - print( - "the one-table format parser is broken in OpenMS2.5, please use one-table or sdrf" - ) - sys.exit(1) - if lines.index("\n") >= len(lines): - print( - "the one-table format parser is broken in OpenMS2.5, please use one-table or sdrf" - ) - sys.exit(1) - - s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:] - s_header = lines[empty_row + 1].replace("\n", "").split("\t") - s_DataFrame = pd.DataFrame(s_table, columns=s_header) - - # check missed mandatory column - missed_columns = set(schema_file) - set(data.columns) - if len(missed_columns) != 0: - print("{0} column missed".format(" ".join(missed_columns))) - sys.exit(1) - - missed_columns = set(schema_sample) - set(s_DataFrame.columns) - if len(missed_columns) != 0: - print("{0} column missed".format(" ".join(missed_columns))) - sys.exit(1) - - if len(set(data.Label)) != 1 and "MSstats_Mixture" not in s_DataFrame.columns: - print("MSstats_Mixture column missed in ISO experiments") - sys.exit(1) - - # check a logical problem: may be improved - check_expdesign_logic(data, s_DataFrame) - - -def check_expdesign_logic(fTable, sTable): - if int(max(fTable.Fraction_Group)) > len(set(fTable.Fraction_Group)): - print("Fraction_Group discontinuous!") - sys.exit(1) - fTable_D = fTable.drop_duplicates(["Fraction_Group", "Fraction", "Label", "Sample"]) - if fTable_D.shape[0] < fTable.shape[0]: - print( - "Existing duplicate entries in Fraction_Group, Fraction, Label and Sample" - ) - sys.exit(1) - if len(set(sTable.Sample)) < sTable.shape[0]: - print("Existing duplicate Sample in sample table!") - sys.exit(1) - - -@click.command( - "validate", short_help="Reformat nf-core/quantms sdrf file and check its contents." 
-) -@click.option("--exp_design", help="SDRF/Expdesign file to be validated") -@click.option("--is_sdrf", help="SDRF file or Expdesign file", is_flag=True) -@click.option( - "--skip_sdrf_validation", help="Disable the validation of SDRF", is_flag=True -) -@click.option( - "--skip_ms_validation", - help="Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)", - is_flag=True, -) -@click.option( - "--skip_factor_validation", - help="Disable the validation of factor values in SDRF", - is_flag=True, -) -@click.option( - "--skip_experimental_design_validation", - help="Disable the validation of experimental design", - is_flag=True, -) -@click.option( - "--use_ols_cache_only", - help="Use ols cache for validation of the terms and not OLS internet service", - is_flag=True, -) -def validate( - exp_design: str, - is_sdrf: bool = False, - skip_sdrf_validation: bool = False, - skip_ms_validation: bool = False, - skip_factor_validation: bool = False, - skip_experimental_design_validation: bool = False, - use_ols_cache_only: bool = False, -): - """ - Reformat nf-core/quantms sdrf file and check its contents. - @param exp_design: SDRF/Expdesign file to be validated - @param is_sdrf: SDRF file or Expdesign file - @param skip_sdrf_validation: Disable the validation of SDRF - @param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications) - @param skip_factor_validation: Disable the validation of factor values in SDRF - @param skip_experimental_design_validation: Disable the validation of experimental design - @param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service - - """ - # TODO validate expdesign file - if is_sdrf: - check_sdrf( - input_sdrf=exp_design, - skip_sdrf_validation=skip_sdrf_validation, - skip_ms_validation=skip_ms_validation, - skip_factor_validation=skip_factor_validation, - skip_experimental_design_validation=skip_experimental_design_validation, - use_ols_cache_only=use_ols_cache_only, - ) - else: - check_expdesign(exp_design) - - -cli.add_command(validate) - - -def main(): - try: - cli() - except SystemExit as e: - if e.code != 0: - raise - - -if __name__ == "__main__": - main() diff --git a/bin/diann_convert.py b/bin/diann_convert.py deleted file mode 100755 index becf0b4a..00000000 --- a/bin/diann_convert.py +++ /dev/null @@ -1,1633 +0,0 @@ -#!/usr/bin/env python -""" -This script converts the output from DIA-NN into three standard formats: MSstats, Triqler and mzTab. -License: Apache 2.0 -Authors: Hong Wong, Yasset Perez-Riverol -Revisions: - 2023-Aug-05: J. 
Sebastian Paez -""" -import logging -import os -import re -import warnings -from pathlib import Path -from typing import Any, Dict, List, Set, Tuple, Union - -import click -import numpy as np -import pandas as pd -from pyopenms import AASequence, FASTAFile, ModificationsDB -from pyopenms.Constants import PROTON_MASS_U - -pd.set_option("display.max_rows", 500) -pd.set_option("display.max_columns", 500) -pd.set_option("display.width", 1000) - -CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) -REVISION = "0.1.1" - -logging.basicConfig( - format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG -) -logger = logging.getLogger(__name__) - - -@click.group(context_settings=CONTEXT_SETTINGS) -def cli(): - pass - - -@click.command("convert") -@click.option("--folder", "-f") -@click.option("--exp_design", "-d") -@click.option("--diann_version", "-v") -@click.option("--dia_params", "-p") -@click.option("--charge", "-c") -@click.option("--missed_cleavages", "-m") -@click.option("--qvalue_threshold", "-q", type=float) -@click.pass_context -def convert( - ctx, - folder, - exp_design, - dia_params, - diann_version, - charge, - missed_cleavages, - qvalue_threshold, -): - """ - Convert DIA-NN output to MSstats, Triqler or mzTab. - The output formats are used for quality control and downstream analysis. - - :param folder: DiannConvert specifies the folder where the required file resides. The folder contains - the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence - FASTA file, version file of DiaNN and ms_info TSVs - :type folder: str - :param dia_params: A list contains DIA parameters - :type dia_params: list - :param diann_version: Path to a version file of DIA-NN - :type diann_version: str - :param charge: The charge assigned by DIA-NN(max_precursor_charge) - :type charge: int - :param missed_cleavages: Allowed missed cleavages assigned by DIA-NN - :type missed_cleavages: int - :param qvalue_threshold: Threshold for filtering q value - :type qvalue_threshold: float - """ - logger.debug(f"Revision {REVISION}") - logger.debug("Reading input files...") - diann_directory = DiannDirectory(folder, diann_version_file=diann_version) - report = diann_directory.main_report_df(qvalue_threshold=qvalue_threshold) - s_DataFrame, f_table = get_exp_design_dfs(exp_design) - - # Convert to MSstats - msstats_columns_keep = [ - "Protein.Names", - "Modified.Sequence", - "Precursor.Charge", - "Precursor.Quantity", - "File.Name", - "Run", - ] - - logger.debug("Converting to MSstats format...") - out_msstats = report[msstats_columns_keep] - out_msstats.columns = [ - "ProteinName", - "PeptideSequence", - "PrecursorCharge", - "Intensity", - "Reference", - "Run", - ] - out_msstats = out_msstats[out_msstats["Intensity"] != 0] - - # Q: What is this line doing? 
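-    # A (presumably): it parses each modified sequence with pyopenms'
-    # AASequence and writes it back, normalizing the modification
-    # notation to the OpenMS spelling before the MSstats export.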
- out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply( - lambda x: AASequence.fromString(x["PeptideSequence"]).toString(), axis=1 - ) - out_msstats["FragmentIon"] = "NA" - out_msstats["ProductCharge"] = "0" - out_msstats["IsotopeLabelType"] = "L" - unique_reference_map = { - k: os.path.basename(k) for k in out_msstats["Reference"].unique() - } - out_msstats["Reference"] = out_msstats["Reference"].map(unique_reference_map) - del unique_reference_map - - logger.debug("\n\nReference Column >>>") - logger.debug(out_msstats["Reference"]) - - logger.debug(f"\n\nout_msstats ({out_msstats.shape}) >>>") - logger.debug(out_msstats.head(5)) - - logger.debug(f"\n\nf_table ({f_table.shape})>>>") - logger.debug(f_table.head(5)) - - logger.debug(f"\n\ns_DataFrame ({s_DataFrame.shape})>>>") - logger.debug(s_DataFrame.head(5)) - - logger.debug("Adding Fraction, BioReplicate, Condition columns") - # Changing implementation from apply to merge went from several minutes to - # ~50ms - out_msstats = out_msstats.merge( - ( - s_DataFrame[["Sample", "MSstats_Condition", "MSstats_BioReplicate"]] - .merge(f_table[["Fraction", "Sample", "run"]], on="Sample") - .rename( - columns={ - "run": "Run", - "MSstats_BioReplicate": "BioReplicate", - "MSstats_Condition": "Condition", - } - ) - .drop(columns=["Sample"]) - ), - on="Run", - validate="many_to_one", - ) - exp_out_prefix = Path(exp_design).stem - out_msstats.to_csv(exp_out_prefix + "_msstats_in.csv", sep=",", index=False) - logger.info(f"MSstats input file is saved as {exp_out_prefix}_msstats_in.csv") - - # Convert to Triqler - triqler_cols = [ - "ProteinName", - "PeptideSequence", - "PrecursorCharge", - "Intensity", - "Run", - "Condition", - ] - out_triqler = out_msstats[triqler_cols] - del out_msstats - out_triqler.columns = [ - "proteins", - "peptide", - "charge", - "intensity", - "run", - "condition", - ] - out_triqler = out_triqler[out_triqler["intensity"] != 0] - - out_triqler.loc[:, "searchScore"] = report["Q.Value"] - out_triqler.loc[:, "searchScore"] = 1 - out_triqler["searchScore"] - out_triqler.to_csv(exp_out_prefix + "_triqler_in.tsv", sep="\t", index=False) - logger.info(f"Triqler input file is saved as {exp_out_prefix}_triqler_in.tsv") - del out_triqler - - mztab_out = f"{Path(exp_design).stem}_out.mzTab" - # Convert to mzTab - diann_directory.convert_to_mztab( - report=report, - f_table=f_table, - charge=charge, - missed_cleavages=missed_cleavages, - dia_params=dia_params, - out=mztab_out, - ) - - -def _true_stem(x): - """ - Return the true stem of a file name, i.e. the - file name without the extension. - - :param x: The file name - :type x: str - :return: The true stem of the file name - :rtype: str - - Examples: - >>> _true_stem("foo.mzML") - 'foo' - >>> _true_stem("foo.d.tar") - 'foo' - - These examples can be tested with pytest: - $ pytest -v --doctest-modules - """ - split = os.path.basename(x).split(".") - stem = split[0] - - # Should I check here that the extensions are - # allowed? I can see how this would break if the - # file name contains a period. 
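-    # Note: everything after the first "." is dropped, so "foo.d.tar"
-    # gives "foo" as intended, but a name like "my.sample.mzML" would
-    # also be truncated to "my".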
- return stem - - -def get_exp_design_dfs(exp_design_file): - logger.info(f"Reading experimental design file: {exp_design_file}") - with open(exp_design_file, "r") as f: - data = f.readlines() - empty_row = data.index("\n") - f_table = [i.replace("\n", "").split("\t") for i in data[1:empty_row]] - f_header = data[0].replace("\n", "").split("\t") - f_table = pd.DataFrame(f_table, columns=f_header) - f_table.loc[:, "run"] = f_table.apply( - lambda x: _true_stem(x["Spectra_Filepath"]), axis=1 - ) - - s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] - s_header = data[empty_row + 1].replace("\n", "").split("\t") - s_DataFrame = pd.DataFrame(s_table, columns=s_header) - - return s_DataFrame, f_table - - -def compute_mass_modified_peptide(peptide_seq: str) -> float: - """ - Function that takes a peptide sequence including modifications and compute the mass using the AASequence class from - pyopenms. The notation of a peptidoform for pyopenms is the following: - - if not modifications is present: - AVQVHQDTLRTMYFAXR -> AVQVHQDTLRTMYFAX[178.995499]R - if modification is present in Methionine: - AVQVHQDTLRTM(Oxidation)YFAXR -> AVQVHQDTLRTM(Oxidation)YFAX[178.995499]R - - @param peptide_seq: str, peptide sequence - @return: float, mass of the peptide - """ - peptide_parts: List[str] = [] - not_mod = True - aa_mass = { - "X": "X[178.98493453312]", # 196.995499 - 17.003288 - 1.00727646688 - "U": "X[132.94306553312]", # 150.95363 - 17.003288 - 1.00727646688 - "O": "X[237.14773053312]", # 255.158295 - 17.003288 - 1.00727646688 - } - for aa in peptide_seq: - # Check if the letter is in aminoacid - if aa == "(": - not_mod = False - elif aa == ")": - not_mod = True - # Check aminoacid letter - if aa in aa_mass and not_mod: - aa = aa_mass[aa] - elif ( - aa - not in [ - "G", - "A", - "V", - "L", - "I", - "F", - "M", - "P", - "W", - "S", - "C", - "T", - "Y", - "N", - "Q", - "D", - "E", - "K", - "R", - "H", - ] - and not_mod - and aa != ")" - ): - logger.info(f"Unknown amino acid with mass not known:{aa}") - peptide_parts.append(aa) - new_peptide_seq = "".join(peptide_parts) - mass = AASequence.fromString(new_peptide_seq).getMonoWeight() - logger.debug(new_peptide_seq + ":" + str(mass)) - return mass - - -class DiannDirectory: - def __init__(self, base_path, diann_version_file): - self.base_path = Path(base_path) - if not self.base_path.exists() and not self.base_path.is_dir(): - raise NotADirectoryError(f"Path {self.base_path} does not exist") - self.diann_version_file = Path(diann_version_file) - if not self.diann_version_file.is_file(): - raise FileNotFoundError(f"Path {self.diann_version_file} does not exist") - - def find_first_file_with_suffix(self, suffix: str) -> os.PathLike: - """Finds a file with a given suffix in the directory. 
- - :param suffix: The suffix to search for - :type suffix: str - - :raises FileNotFoundError: If no file with the given suffix is found - """ - try: - return next(self.base_path.glob(f"**/*{suffix}")) - except StopIteration: - raise FileNotFoundError(f"Could not find file with suffix {suffix}") - - @property - def report(self) -> os.PathLike: - return self.find_first_file_with_suffix("report.tsv") - - @property - def pg_matrix(self) -> os.PathLike: - return self.find_first_file_with_suffix("pg_matrix.tsv") - - @property - def pr_matrix(self) -> os.PathLike: - return self.find_first_file_with_suffix("pr_matrix.tsv") - - @property - def fasta(self) -> os.PathLike: - try: - return self.find_first_file_with_suffix(".fasta") - except FileNotFoundError: - return self.find_first_file_with_suffix(".fa") - - @property - def ms_info(self) -> os.PathLike: - return self.find_first_file_with_suffix("ms_info.tsv") - - @property - def diann_version(self) -> str: - logger.debug("Validating DIANN version") - diann_version_id = None - with open(self.diann_version_file) as f: - for line in f: - if "DIA-NN" in line: - logger.debug(f"Found DIA-NN version: {line}") - diann_version_id = line.rstrip("\n").split(": ")[1] - - if diann_version_id is None: - raise ValueError( - f"Could not find DIA-NN version in file {self.diann_version_file}" - ) - - return diann_version_id - - def validate_diann_version(self) -> None: - supported_diann_versions = ["1.8.1", "1.9.beta.1"] - if self.diann_version not in supported_diann_versions: - raise ValueError(f"Unsupported DIANN version {self.diann_version}") - - def convert_to_mztab( - self, - report, - f_table, - charge: int, - missed_cleavages: int, - dia_params: List[Any], - out: os.PathLike, - ) -> None: - logger.info("Converting to mzTab") - self.validate_diann_version() - - # This could be a branching point if we want to support other versions - # of DIA-NN, maybe something like this: - # if diann_version_id == "1.8.1": - # self.convert_to_mztab_1_8_1(report, f_table, charge, missed_cleavages, dia_params) - # else: - # raise ValueError(f"Unsupported DIANN version {diann_version_id}, supported versions are 1.8.1 ...") - - logger.info(f"Reading fasta file: {self.fasta}") - entries: list = [] - f = FASTAFile() - f.load(str(self.fasta), entries) - fasta_entries = [(e.identifier, e.sequence, len(e.sequence)) for e in entries] - fasta_df = pd.DataFrame(fasta_entries, columns=["id", "seq", "len"]) - - logger.info("Mapping run information to report") - index_ref = f_table.copy() - index_ref.rename( - columns={ - "Fraction_Group": "ms_run", - "Sample": "study_variable", - "run": "Run", - }, - inplace=True, - ) - index_ref["ms_run"] = index_ref["ms_run"].astype("int") - index_ref["study_variable"] = index_ref["study_variable"].astype("int") - report = report.merge( - index_ref[["ms_run", "Run", "study_variable"]], - on="Run", - validate="many_to_one", - ) - - MTD, database = mztab_MTD( - index_ref, dia_params, str(self.fasta), charge, missed_cleavages - ) - pg = pd.read_csv( - self.pg_matrix, - sep="\t", - header=0, - ) - PRH = mztab_PRH(report, pg, index_ref, database, fasta_df) - del pg - pr = pd.read_csv( - self.pr_matrix, - sep="\t", - header=0, - ) - precursor_list = list(report["Precursor.Id"].unique()) - PEH = mztab_PEH(report, pr, precursor_list, index_ref, database) - del pr - PSH = mztab_PSH(report, str(self.base_path), database) - del report - MTD.loc["", :] = "" - PRH.loc[len(PRH) + 1, :] = "" - PEH.loc[len(PEH) + 1, :] = "" - with open(out, "w", newline="") as f: - 
-            MTD.to_csv(f, mode="w", sep="\t", index=False, header=False)
-            PRH.to_csv(f, mode="w", sep="\t", index=False, header=True)
-            PEH.to_csv(f, mode="w", sep="\t", index=False, header=True)
-            PSH.to_csv(f, mode="w", sep="\t", index=False, header=True)
-
-        logger.info(f"mzTab file generated successfully at {out}")
-
-    def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
-        remain_cols = [
-            "File.Name",
-            "Run",
-            "Protein.Group",
-            "Protein.Names",
-            "Protein.Ids",
-            "First.Protein.Description",
-            "PG.MaxLFQ",
-            "RT.Start",
-            "Global.Q.Value",
-            "Lib.Q.Value",
-            "PEP",
-            "Precursor.Normalised",
-            "Precursor.Id",
-            "Q.Value",
-            "Modified.Sequence",
-            "Stripped.Sequence",
-            "Precursor.Charge",
-            "Precursor.Quantity",
-            "Global.PG.Q.Value",
-        ]
-        report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols)
-
-        # filter based on qvalue parameter for downstream analysis
-        logger.debug(
-            f"Filtering report based on qvalue threshold: {qvalue_threshold}, {len(report)} rows"
-        )
-        report = report[report["Q.Value"] < qvalue_threshold]
-        logger.debug(f"Report filtered, {len(report)} rows remaining")
-
-        logger.debug("Calculating Precursor.Mz")
-        # Making the map is 10x faster with respect to the previous
-        # implementation, and includes the mass of the modification.
-        uniq_masses = {
-            k: compute_mass_modified_peptide(k)
-            for k in report["Modified.Sequence"].unique()
-        }
-        mass_vector = report["Modified.Sequence"].map(uniq_masses)
-        report["Calculate.Precursor.Mz"] = (
-            mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
-        ) / report["Precursor.Charge"]
-
-        logger.debug("Indexing Precursors")
-        # Making the map is 1500x faster
-        precursor_index_map = {
-            k: i for i, k in enumerate(report["Precursor.Id"].unique())
-        }
-        report["precursor.Index"] = report["Precursor.Id"].map(precursor_index_map)
-
-        logger.debug(f"Shape of main report {report.shape}")
-        logger.debug(str(report.head()))
-
-        return report
-
-
-def MTD_mod_info(fix_mod, var_mod):
-    """
-    Convert fixed and variable modifications to the format required by the MTD sub-table.
-
-    :param fix_mod: Fixed modifications from DIA parameter list
-    :type fix_mod: str
-    :param var_mod: Variable modifications from DIA parameter list
-    :type var_mod: str
-    :return: A tuple contains fixed and variable modifications, and flags indicating whether they are null
-    :rtype: tuple
-    """
-    var_ptm = []
-    fix_ptm = []
-    mods_db = ModificationsDB()
-
-    if fix_mod != "null":
-        fix_flag = 1
-        for mod in fix_mod.split(","):
-            mod_obj = mods_db.getModification(mod)
-            mod_name = mod_obj.getId()
-            mod_accession = mod_obj.getUniModAccession()
-            site = mod_obj.getOrigin()
-            fix_ptm.append(
-                ("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)
-            )
-    else:
-        fix_flag = 0
-        fix_ptm.append("[MS, MS:1002453, No fixed modifications searched, ]")
-
-    if var_mod != "null":
-        var_flag = 1
-        for mod in var_mod.split(","):
-            mod_obj = mods_db.getModification(mod)
-            mod_name = mod_obj.getId()
-            mod_accession = mod_obj.getUniModAccession()
-            site = mod_obj.getOrigin()
-            var_ptm.append(
-                ("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)
-            )
-    else:
-        var_flag = 0
-        var_ptm.append("[MS, MS:1002454, No variable modifications searched, ]")
-
-    return fix_ptm, var_ptm, fix_flag, var_flag
-
-
-def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages):
-    """
-    Construct MTD sub-table.
- - :param index_ref: On the basis of f_table, two columns "MS_run" and "study_variable" are added for matching - :type index_ref: pandas.core.frame.DataFrame - :param dia_params: A list contains DIA parameters - :type dia_params: list - :param fasta: Fasta file path - :type fasta: str - :param charge: Charges set by Dia-NN - :type charge: int - :param missed_cleavages: Missed cleavages set by Dia-NN - :type missed_cleavages: int - :return: MTD sub-table - :rtype: pandas.core.frame.DataFrame - """ - logger.info("Constructing MTD sub-table...") - dia_params_list = dia_params.split(";") - dia_params_list = ["null" if i == "" else i for i in dia_params_list] - FragmentMassTolerance = dia_params_list[0] - FragmentMassToleranceUnit = dia_params_list[1] - PrecursorMassTolerance = dia_params_list[2] - PrecursorMassToleranceUnit = dia_params_list[3] - Enzyme = dia_params_list[4] - FixedModifications = dia_params_list[5] - VariableModifications = dia_params_list[6] - out_mztab_MTD = pd.DataFrame() - out_mztab_MTD.loc[1, "mzTab-version"] = "1.0.0" - out_mztab_MTD.loc[1, "mzTab-mode"] = "Summary" - out_mztab_MTD.loc[1, "mzTab-type"] = "Quantification" - out_mztab_MTD.loc[1, "title"] = "ConsensusMap export from OpenMS" - out_mztab_MTD.loc[1, "description"] = "OpenMS export from consensusXML" - out_mztab_MTD.loc[1, "protein_search_engine_score[1]"] = ( - "[, , DIA-NN Global.PG.Q.Value, ]" - ) - out_mztab_MTD.loc[1, "peptide_search_engine_score[1]"] = ( - "[, , DIA-NN Q.Value (minimum of the respective precursor q-values), ]" - ) - out_mztab_MTD.loc[1, "psm_search_engine_score[1]"] = ( - "[MS, MS:MS:1001869, protein-level q-value, ]" - ) - out_mztab_MTD.loc[1, "software[1]"] = "[MS, MS:1003253, DIA-NN, Release (v1.8.1)]" - out_mztab_MTD.loc[1, "software[1]-setting[1]"] = fasta - out_mztab_MTD.loc[1, "software[1]-setting[2]"] = "db_version:null" - out_mztab_MTD.loc[1, "software[1]-setting[3]"] = ( - "fragment_mass_tolerance:" + FragmentMassTolerance - ) - out_mztab_MTD.loc[1, "software[1]-setting[4]"] = ( - "fragment_mass_tolerance_unit:" + FragmentMassToleranceUnit - ) - out_mztab_MTD.loc[1, "software[1]-setting[5]"] = ( - "precursor_mass_tolerance:" + PrecursorMassTolerance - ) - out_mztab_MTD.loc[1, "software[1]-setting[6]"] = ( - "precursor_mass_tolerance_unit:" + PrecursorMassToleranceUnit - ) - out_mztab_MTD.loc[1, "software[1]-setting[7]"] = "enzyme:" + Enzyme - out_mztab_MTD.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full" - out_mztab_MTD.loc[1, "software[1]-setting[9]"] = "charges:" + str(charge) - out_mztab_MTD.loc[1, "software[1]-setting[10]"] = "missed_cleavages:" + str( - missed_cleavages - ) - out_mztab_MTD.loc[1, "software[1]-setting[11]"] = ( - "fixed_modifications:" + FixedModifications - ) - out_mztab_MTD.loc[1, "software[1]-setting[12]"] = ( - "variable_modifications:" + VariableModifications - ) - - (fixed_mods, variable_mods, fix_flag, var_flag) = MTD_mod_info( - FixedModifications, VariableModifications - ) - if fix_flag == 1: - for i in range(1, len(fixed_mods) + 1): - out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]"] = fixed_mods[i - 1][0] - out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]-site"] = fixed_mods[i - 1][ - 1 - ] - out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]-position"] = "Anywhere" - else: - out_mztab_MTD.loc[1, "fixed_mod[1]"] = fixed_mods[0] - - if var_flag == 1: - for i in range(1, len(variable_mods) + 1): - out_mztab_MTD.loc[1, "variable_mod[" + str(i) + "]"] = variable_mods[i - 1][ - 0 - ] - out_mztab_MTD.loc[1, "variable_mod[" + str(i) + 
"]-site"] = variable_mods[ - i - 1 - ][1] - out_mztab_MTD.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere" - else: - out_mztab_MTD.loc[1, "variable_mod[1]"] = variable_mods[0] - - out_mztab_MTD.loc[1, "quantification_method"] = ( - "[MS, MS:1001834, LC-MS label-free quantitation analysis, ]" - ) - out_mztab_MTD.loc[1, "protein-quantification_unit"] = "[, , Abundance, ]" - out_mztab_MTD.loc[1, "peptide-quantification_unit"] = "[, , Abundance, ]" - - for i in range(1, max(index_ref["ms_run"]) + 1): - out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-format"] = ( - "[MS, MS:1000584, mzML file, ]" - ) - out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-location"] = ( - "file://" - + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0] - ) - out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-id_format"] = ( - "[MS, MS:1000777, spectrum identifier nativeID format, ]" - ) - out_mztab_MTD.loc[1, "assay[" + str(i) + "]-quantification_reagent"] = ( - "[MS, MS:1002038, unlabeled sample, ]" - ) - out_mztab_MTD.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = ( - "ms_run[" + str(i) + "]" - ) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # This is used here in order to ignore performance warnings from pandas. - for i in range(1, max(index_ref["study_variable"]) + 1): - study_variable = [] - for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values): - study_variable.append("assay[" + str(j) + "]") - out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ( - ",".join(study_variable) - ) - out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-description"] = ( - "no description given" - ) - - # The former loop makes a very sharded frame, this - # makes the frame more compact in memory. - out_mztab_MTD = out_mztab_MTD.copy() - out_mztab_MTD.loc[2, :] = "MTD" - - # Transpose out_mztab_MTD - col = list(out_mztab_MTD.columns) - row = list(out_mztab_MTD.index) - out_mztab_MTD_T = pd.DataFrame(out_mztab_MTD.values.T, index=col, columns=row) - out_mztab_MTD_T.columns = ["inf", "index"] - out_mztab_MTD_T.insert(0, "title", out_mztab_MTD_T.index) - index = out_mztab_MTD_T.loc[:, "index"] - out_mztab_MTD_T.drop(labels=["index"], axis=1, inplace=True) - out_mztab_MTD_T.insert(0, "index", index) - database = os.path.basename(fasta.split(".")[-2]) - - return out_mztab_MTD_T, database - - -def mztab_PRH(report, pg, index_ref, database, fasta_df): - """ - Construct PRH sub-table. 
- - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param pg: Dataframe for Dia-NN protein groups matrix - :type pg: pandas.core.frame.DataFrame - :param index_ref: On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching - :type index_ref: pandas.core.frame.DataFrame - :param database: Path to fasta file - :type database: str - :param fasta_df: A dataframe contains protein IDs, sequences and lengths - :type fasta_df: pandas.core.frame.DataFrame - :return: PRH sub-table - :rtype: pandas.core.frame.DataFrame - """ - logger.info("Constructing PRH sub-table...") - logger.debug( - f"Input report shape: {report.shape}," - f" input pg shape: {pg.shape}," - f" input index_ref shape: {index_ref.shape}," - f" input fasta_df shape: {fasta_df.shape}" - ) - file = list(pg.columns[5:]) - col = {} - for i in file: - col[i] = ( - "protein_abundance_assay[" - + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0]) - + "]" - ) - - pg.rename(columns=col, inplace=True) - - logger.debug("Classifying results type ...") - pg["opt_global_result_type"] = "single_protein" - pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = ( - "indistinguishable_protein_group" - ) - - out_mztab_PRH = pg - del pg - out_mztab_PRH = out_mztab_PRH.drop(["Protein.Names"], axis=1) - out_mztab_PRH.rename( - columns={"First.Protein.Description": "description"}, inplace=True - ) - out_mztab_PRH.loc[:, "database"] = database - - null_col = [ - "taxid", - "species", - "database_version", - "search_engine", - "opt_global_Posterior_Probability_score", - "opt_global_nr_found_peptides", - "opt_global_cv_PRIDE:0000303_decoy_hit", - ] - for i in null_col: - out_mztab_PRH.loc[:, i] = "null" - - logger.debug("Extracting accession values (keeping first)...") - out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply( - lambda x: x["Protein.Group"].split(";")[0], axis=1 - ) - - protein_details_df = out_mztab_PRH[ - out_mztab_PRH["opt_global_result_type"] == "indistinguishable_protein_group" - ] - prh_series = ( - protein_details_df["Protein.Group"] - .str.split(";", expand=True) - .stack() - .reset_index(level=1, drop=True) - ) - prh_series.name = "accession" - protein_details_df = ( - protein_details_df.drop("accession", axis=1) - .join(prh_series) - .reset_index() - .drop(columns="index") - ) - if len(protein_details_df) > 0: - logger.info(f"Found {len(protein_details_df)} indistinguishable protein groups") - # The Following line fails if there are no indistinguishable protein groups - protein_details_df.loc[:, "col"] = "protein_details" - # protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")] - out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index( - drop=True - ) - else: - logger.info("No indistinguishable protein groups found") - - logger.debug("Calculating protein coverage (bottleneck)...") - # This is a bottleneck - # reimplementation runs in 67s vs 137s (old) in my data - out_mztab_PRH.loc[:, "protein_coverage"] = calculate_protein_coverages( - report=report, out_mztab_PRH=out_mztab_PRH, fasta_df=fasta_df - ) - - logger.debug("Getting ambiguity members...") - # IN THEORY this should be the same as - # out_mztab_PRH["ambiguity_members"] = out_mztab_PRH["Protein.Ids"] - # out_mztab_PRH.loc[out_mztab_PRH["opt_global_result_type"] == "single_protein", "ambiguity_members"] = "null" - # or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], 
"ambiguity_members"] = "null" - out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply( - lambda x: ( - x["Protein.Group"] - if x["opt_global_result_type"] == "indistinguishable_protein_group" - else "null" - ), - axis=1, - ) - - logger.debug("Matching PRH to best search engine score...") - score_looker = ModScoreLooker(report) - out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = ( - out_mztab_PRH.apply( - lambda x: score_looker.get_score(x["Protein.Group"]), - axis=1, - result_type="expand", - ) - ) - - logger.debug("Matching PRH to modifications...") - out_mztab_PRH.loc[:, "modifications"] = out_mztab_PRH.apply( - lambda x: find_modification(x["modifiedSequence"]), axis=1, result_type="expand" - ) - - logger.debug("Matching PRH to protein quantification...") - ## quantity at protein level: PG.MaxLFQ - # This used to be a bottleneck in performance - # This implementation drops the run time from 57s to 25ms - protein_agg_report = ( - report[["PG.MaxLFQ", "Protein.Group", "study_variable"]] - .groupby(["study_variable", "Protein.Group"]) - .agg({"PG.MaxLFQ": ["mean", "std", "sem"]}) - .reset_index() - .pivot(columns=["study_variable"], index="Protein.Group") - .reset_index() - ) - protein_agg_report.columns = [ - "::".join([str(s) for s in col]).strip() - for col in protein_agg_report.columns.values - ] - subname_mapper = { - "Protein.Group::::": "Protein.Group", - "PG.MaxLFQ::mean": "protein_abundance_study_variable", - "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable", - "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable", - } - name_mapper = name_mapper_builder(subname_mapper) - protein_agg_report.rename(columns=name_mapper, inplace=True) - # out_mztab_PRH has columns accession and Protein.Ids; 'Q9NZJ9', 'A0A024RBG1;Q9NZJ9;Q9NZJ9-2'] - # the report table has 'Protein.Group' and 'Protein.Ids': 'Q9NZJ9', 'A0A024RBG1;Q9NZJ9;Q9NZJ9-2' - # Oddly enough the last implementation mapped the the accession (Q9NZJ9) in the mztab - # to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values. - out_mztab_PRH = out_mztab_PRH.merge( - protein_agg_report, - on="Protein.Group", - how="left", - validate="many_to_one", - copy=True, - ) - del name_mapper - del subname_mapper - del protein_agg_report - # end of (former) bottleneck - - out_mztab_PRH.loc[:, "PRH"] = "PRT" - index = out_mztab_PRH.loc[:, "PRH"] - out_mztab_PRH.drop( - ["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True - ) - out_mztab_PRH.insert(0, "PRH", index) - out_mztab_PRH.fillna("null", inplace=True) - out_mztab_PRH.loc[:, "database"] = database - new_cols = [col for col in out_mztab_PRH.columns if not col.startswith("opt_")] + [ - col for col in out_mztab_PRH.columns if col.startswith("opt_") - ] - out_mztab_PRH = out_mztab_PRH[new_cols] - return out_mztab_PRH - - -def mztab_PEH( - report: pd.DataFrame, - pr: pd.DataFrame, - precursor_list: List[str], - index_ref: pd.DataFrame, - database: os.PathLike, -) -> pd.DataFrame: - """ - Construct PEH sub-table. 
- - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param pr: Dataframe for Dia-NN precursors matrix - :type pr: pandas.core.frame.DataFrame - :param precursor_list: A list contains all precursor IDs - :type precursor_list: list - :param index_ref: On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching - :type index_ref: pandas.core.frame.DataFrame - :param database: Path to fasta file - :type database: str - :return: PEH sub-table - :rtype: pandas.core.frame.DataFrame - """ - logger.info("Constructing PEH sub-table...") - logger.debug( - f"report.shape: {report.shape}, " - f" pr.shape: {pr.shape}," - f" len(precursor_list): {len(precursor_list)}," - f" index_ref.shape: {index_ref.shape}" - ) - out_mztab_PEH = pd.DataFrame() - out_mztab_PEH = pr.iloc[:, 0:10] - out_mztab_PEH.drop( - ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"], - axis=1, - inplace=True, - ) - out_mztab_PEH.rename( - columns={ - "Stripped.Sequence": "sequence", - "Protein.Group": "accession", - "Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence", - "Precursor.Charge": "charge", - }, - inplace=True, - ) - - logger.debug("Finding modifications...") - out_mztab_PEH.loc[:, "modifications"] = out_mztab_PEH.apply( - lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), - axis=1, - result_type="expand", - ) - - logger.debug("Extracting sequence...") - out_mztab_PEH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = ( - out_mztab_PEH.apply( - lambda x: AASequence.fromString( - x["opt_global_cv_MS:1000889_peptidoform_sequence"] - ).toString(), - axis=1, - ) - ) - - logger.debug("Checking accession uniqueness...") - out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply( - lambda x: "0" if ";" in str(x["accession"]) else "1", - axis=1, - result_type="expand", - ) - - null_col = [ - "database_version", - "search_engine", - "retention_time_window", - "mass_to_charge", - "opt_global_feature_id", - ] - for i in null_col: - out_mztab_PEH.loc[:, i] = "null" - out_mztab_PEH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" - - logger.debug("Matching precursor IDs...") - # Pre-calculating the indices and using a lookup table drops run time from - # ~6.5s to 11ms - precursor_indices = {k: i for i, k in enumerate(precursor_list)} - pr_ids = out_mztab_PEH["Precursor.Id"].map(precursor_indices) - out_mztab_PEH["pr_id"] = pr_ids - del precursor_indices - - logger.debug("Getting scores per run") - # This implementation is 422-700x faster than the apply-based one - tmp = ( - report.groupby(["precursor.Index", "ms_run"]) - .agg({"Q.Value": ["min"]}) - .reset_index() - .pivot(columns=["ms_run"], index="precursor.Index") - .reset_index() - ) - tmp.columns = pd.Index( - ["::".join([str(s) for s in col]).strip() for col in tmp.columns.values] - ) - subname_mapper = { - "precursor.Index::::": "precursor.Index", - "Q.Value::min": "search_engine_score[1]_ms_run", - } - name_mapper = name_mapper_builder(subname_mapper) - tmp.rename(columns=name_mapper, inplace=True) - out_mztab_PEH = out_mztab_PEH.merge( - tmp.rename(columns={"precursor.Index": "pr_id"}), - on="pr_id", - validate="one_to_one", - ) - del tmp - del subname_mapper - del name_mapper - - logger.debug("Getting peptide abundances per study variable") - pep_study_report = per_peptide_study_report(report) - out_mztab_PEH = out_mztab_PEH.merge( - pep_study_report, on="pr_id", how="left", validate="one_to_one", copy=True - ) - del 
pep_study_report - - logger.debug("Getting peptide properties...") - # Re-implementing this section from apply -> assign to groupby->agg - # speeds up the process from 11s to 25ms in my data (~440x faster) - # Notably, this changes slightly... - # "opt_global_q-value" was the FIRST "Global.Q.Value", now its the min - # "opt_global_SpecEValue_score" was the FIRST "Lib.Q.Value" now its the min - # I believe picking the first is inconsistent because no sorting is checked - # and the first is arbitrary. - - aggtable = ( - report.groupby(["precursor.Index"]) - .agg( - { - "Q.Value": "min", - "RT.Start": "mean", - "Global.Q.Value": "min", - "Lib.Q.Value": "min", - "Calculate.Precursor.Mz": "mean", - } - ) - .reset_index() - .rename( - columns={ - "precursor.Index": "pr_id", - "Q.Value": "best_search_engine_score[1]", - "RT.Start": "retention_time", - "Global.Q.Value": "opt_global_q-value", - "Lib.Q.Value": "opt_global_SpecEValue_score", - "Calculate.Precursor.Mz": "mass_to_charge", - } - ) - ) - del out_mztab_PEH["mass_to_charge"] - out_mztab_PEH = out_mztab_PEH.merge(aggtable, on="pr_id", validate="one_to_one") - - logger.debug("Re-ordering columns...") - out_mztab_PEH.loc[:, "PEH"] = "PEP" - out_mztab_PEH.loc[:, "database"] = str(database) - index = out_mztab_PEH.loc[:, "PEH"] - out_mztab_PEH.drop(["PEH", "Precursor.Id", "Genes", "pr_id"], axis=1, inplace=True) - out_mztab_PEH.insert(0, "PEH", index) - out_mztab_PEH.fillna("null", inplace=True) - new_cols = [col for col in out_mztab_PEH.columns if not col.startswith("opt_")] + [ - col for col in out_mztab_PEH.columns if col.startswith("opt_") - ] - out_mztab_PEH = out_mztab_PEH[new_cols] - - return out_mztab_PEH - - -def mztab_PSH(report, folder, database): - """ - Construct PSH sub-table. - - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param folder: DiannConvert specifies the folder where the required file resides. The folder contains - the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence - FASTA file, version file of DiaNN and ms_info TSVs - :type folder: str - :param database: Path to fasta file - :type database: str - :return: PSH sub-table - :rtype: pandas.core.frame.DataFrame - """ - logger.info("Constructing PSH sub-table") - - def __find_info(directory, n): - # This line matches n="220101_myfile", folder="." to - # "myfolder/220101_myfile_ms_info.tsv" - files = list(Path(directory).rglob(f"{n}_ms_info.parquet")) - # Check that it matches one and only one file - if not files: - raise ValueError(f"Could not find {n} info file in {directory}") - if len(files) > 1: - raise ValueError(f"Found multiple {n} info files in {directory}: {files}") - - return files[0] - - out_mztab_PSH = pd.DataFrame() - for n, group in report.groupby(["Run"]): - if isinstance(n, tuple) and len(n) == 1: - # This is here only to support versions of pandas where the groupby - # key is a tuple. 
- # related: https://github.com/pandas-dev/pandas/pull/51817 - n = n[0] - - file = __find_info(folder, n) - target = pd.read_parquet(file) - group.sort_values(by="RT.Start", inplace=True) - target = target[["Retention_Time", "SpectrumID", "Exp_Mass_To_Charge"]] - target.columns = [ - "RT.Start", - "opt_global_spectrum_reference", - "exp_mass_to_charge", - ] - # Standardize spectrum identifier format for bruker data - if type(target.loc[0, "opt_global_spectrum_reference"]) != str: - target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[ - :, "opt_global_spectrum_reference" - ].astype(str) - - # TODO seconds returned from precursor.getRT() - target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1) - out_mztab_PSH = pd.concat( - [ - out_mztab_PSH, - pd.merge_asof(group, target, on="RT.Start", direction="nearest"), - ] - ) - del report - - ## Score at PSM level: Q.Value - out_mztab_PSH = out_mztab_PSH[ - [ - "Stripped.Sequence", - "Protein.Group", - "Q.Value", - "RT.Start", - "Precursor.Charge", - "Calculate.Precursor.Mz", - "exp_mass_to_charge", - "Modified.Sequence", - "PEP", - "Global.Q.Value", - "Global.Q.Value", - "opt_global_spectrum_reference", - "ms_run", - ] - ] - out_mztab_PSH.columns = [ - "sequence", - "accession", - "search_engine_score[1]", - "retention_time", - "charge", - "calc_mass_to_charge", - "exp_mass_to_charge", - "opt_global_cv_MS:1000889_peptidoform_sequence", - "opt_global_SpecEValue_score", - "opt_global_q-value", - "opt_global_q-value_score", - "opt_global_spectrum_reference", - "ms_run", - ] - - out_mztab_PSH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" - out_mztab_PSH.loc[:, "PSM_ID"] = out_mztab_PSH.index - out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply( - lambda x: "0" if ";" in str(x["accession"]) else "1", - axis=1, - result_type="expand", - ) - out_mztab_PSH.loc[:, "database"] = database - - null_col = [ - "database_version", - "search_engine", - "pre", - "post", - "start", - "end", - "opt_global_feature_id", - "opt_global_map_index", - ] - for i in null_col: - out_mztab_PSH.loc[:, i] = "null" - - logger.info("Finding Modifications ...") - out_mztab_PSH.loc[:, "modifications"] = out_mztab_PSH.apply( - lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), - axis=1, - result_type="expand", - ) - - out_mztab_PSH.loc[:, "spectra_ref"] = out_mztab_PSH.apply( - lambda x: "ms_run[{}]:".format(x["ms_run"]) - + x["opt_global_spectrum_reference"], - axis=1, - result_type="expand", - ) - - out_mztab_PSH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = ( - out_mztab_PSH.apply( - lambda x: AASequence.fromString( - x["opt_global_cv_MS:1000889_peptidoform_sequence"] - ).toString(), - axis=1, - result_type="expand", - ) - ) - - out_mztab_PSH.loc[:, "PSH"] = "PSM" - index = out_mztab_PSH.loc[:, "PSH"] - out_mztab_PSH.drop(["PSH", "ms_run"], axis=1, inplace=True) - out_mztab_PSH.insert(0, "PSH", index) - out_mztab_PSH.fillna("null", inplace=True) - new_cols = [col for col in out_mztab_PSH.columns if not col.startswith("opt_")] + [ - col for col in out_mztab_PSH.columns if col.startswith("opt_") - ] - out_mztab_PSH = out_mztab_PSH[new_cols] - # out_mztab_PSH.to_csv("./out_psms.mztab", sep=",", index=False) - - return out_mztab_PSH - - -def add_info(target, index_ref): - """ - On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching. 
-
-    :param target: The value of "Run" column in f_table
-    :type target: str
-    :param index_ref: A dataframe on the basis of f_table
-    :type index_ref: pandas.core.frame.DataFrame
-    :return: A tuple contains ms_run and study_variable
-    :rtype: tuple
-    """
-    match = index_ref[index_ref["Run"] == target]
-    ms_run = match["ms_run"].values[0]
-    study_variable = match["study_variable"].values[0]
-
-    return ms_run, study_variable
-
-
-def classify_result_type(target):
-    """Classify proteins
-
-    :param target: The target dataframe contains "Protein.Group" and "Protein.Ids"
-    :type target: pandas.core.frame.DataFrame
-    :return: A string implying the protein type
-    :rtype: str
-    """
-    if ";" in target["Protein.Group"]:
-        return "indistinguishable_protein_group"
-    return "single_protein"
-
-
-def match_in_report(report, target, max_, flag, level):
-    """
-    This function is used to match the columns "ms_run" and "study_variable" from the report and
-    get the corresponding information for the mztab ms_run and study_values metadata values.
-
-    :param report: Dataframe for Dia-NN main report
-    :type report: pandas.core.frame.DataFrame
-    :param target: The value of "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein")
-    :type target: str
-    :param max_: max_assay or max_study_variable
-    :type max_: int
-    :param flag: Match the "study_variable" column(flag=1) or the "ms_run" column(flag=0) in the filter result
-    :type flag: int
-    :param level: "pep" or "protein"
-    :type level: str
-    :return: A tuple contains multiple messages
-    :rtype: tuple
-    """  # noqa
-    if flag == 1 and level == "pep":
-        result = report[report["precursor.Index"] == target]
-        PEH_params = []
-        for i in range(1, max_ + 1):
-            match = result[result["study_variable"] == i]
-            PEH_params.extend(
-                [
-                    match["Precursor.Normalised"].mean(),
-                    "null",
-                    "null",
-                    "null",
-                    match["RT.Start"].mean(),
-                ]
-            )
-
-        return tuple(PEH_params)
-
-    if flag == 0 and level == "pep":
-        result = report[report["precursor.Index"] == target]
-        q_value = []
-        for i in range(1, max_ + 1):
-            match = result[result["ms_run"] == i]
-            q_value.append(
-                match["Q.Value"].values[0]
-                if match["Q.Value"].values.size > 0
-                else np.nan
-            )
-
-        return tuple(q_value)
-
-    if flag == 1 and level == "protein":
-        result = report[report["Protein.Group"] == target]
-        PRH_params = []
-        for i in range(1, max_ + 1):
-            match = result[result["study_variable"] == i]
-            PRH_params.extend([match["PG.MaxLFQ"].mean(), "null", "null"])
-
-        return tuple(PRH_params)
-
-
-class ModScoreLooker:
-    """
-    Class used to cache the lookup table of accessions to best scores and their
-    respective mod sequences.
-
-    Pre-computing the lookup table leverages a lot of speedup and vectorized
-    operations from pandas, and is much faster than doing the lookup on the fly
-    in a loop.
- - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - """ - - def __init__(self, report: pd.DataFrame) -> None: - self.lookup_dict = self.make_lookup_dict(report) - - def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]: - grouped_df = ( - report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]] - .sort_values("Global.PG.Q.Value", ascending=True) - .groupby(["Protein.Group"]) - .head(1) - ) - # Modified.Sequence Protein.Group Global.PG.Q.Value - # 78265 LFNEQNFFQR Q8IV63;Q8IV63-2;Q8IV63-3 0.000252 - # 103585 NPTIVNFPITNVDLR Q53GS9;Q53GS9-2 0.000252 - # 103586 NPTWKPLIR Q7Z4Q2;Q7Z4Q2-2 0.000252 - # 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252 - - out = { - row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) - for _, row in grouped_df.iterrows() - } - return out - - def get_score(self, protein_id: str) -> Tuple[Union[str, float], float]: - """Returns a tuple contains modified sequences and the score at protein level. - - Gets the best score and corresponding peptide for a given protein_id - - Note that protein id can be something like 'Q8IV63;Q8IV63-2;Q8IV63-3' - - Note2: This implementation also fixes a bug where the function would - return the first peptide in the report, not the best one. (but with the - score of the best one for that accession) - - :param protein_id: The value of "accession" column in report - :type target: str - :return: A tuple that contains (best modified sequence, best score) - if the accession is not found, (np.nan, np.nan) is returned. - :rtype: tuple - """ - # Q: in what cases can the accession not exist in the table? - # or an accession not have peptides? - val = self.lookup_dict.get(protein_id, (np.nan, np.nan)) - return val - - -# Pre-compiling the regex makes the next function 2x faster -# in my benchmarking - JSPP -MODIFICATION_PATTERN = re.compile(r"\((.*?)\)") - - -def find_modification(peptide): - """ - Identify the modification site based on the peptide containing modifications. - - :param peptide: Sequences of peptides - :type peptide: str - :return: Modification sites - :rtype: str - - Examples: - >>> find_modification("PEPM(UNIMOD:35)IDE") - '4-UNIMOD:35' - >>> find_modification("SM(UNIMOD:35)EWEIRDS(UNIMOD:21)EPTIDEK") - '2-UNIMOD:35,9-UNIMOD:21' - """ - peptide = str(peptide) - original_mods = MODIFICATION_PATTERN.findall(peptide) - peptide = MODIFICATION_PATTERN.sub(".", peptide) - position = [i for i, x in enumerate(peptide) if x == "."] - for j in range(1, len(position)): - position[j] -= j - - for k in range(0, len(original_mods)): - original_mods[k] = str(position[k]) + "-" + original_mods[k].upper() - - original_mods = ( - ",".join(str(i) for i in original_mods) if len(original_mods) > 0 else "null" - ) - - return original_mods - - -def name_mapper_builder(subname_mapper): - """Returns a function that renames the columns of the grouped table to match the ones - in the final table. - - Examples: - >>> mapping_dict = { - ... "precursor.Index::::": "pr_id", - ... "Precursor.Normalised::mean": "peptide_abundance_study_variable" - ... } - >>> name_mapper = name_mapper_builder(mapping_dict) - >>> name_mapper("precursor.Index::::") - "pr_id" - >>> name_mapper("Precursor.Normalised::mean::1") - "peptide_abundance_study_variable[1]" - """ - num_regex = re.compile(r"(.*)::(\d+)$") - - def name_mapper(x): - """Renames the columns of the grouped table to match the ones - in the final table. 
-
-        Examples:
-            >>> name_mapper("precursor.Index::::")
-            "pr_id"
-            >>> name_mapper("Precursor.Normalised::mean::1")
-            "peptide_abundance_study_variable[1]"
-        """
-        orig_x = x
-        for k, v in subname_mapper.items():
-            if k in x:
-                x = x.replace(k, v)
-        out = num_regex.sub(r"\1[\2]", x)
-        if out == orig_x:
-            # This should never happen but I am adding it here
-            # to prevent myself from shooting myself in the foot in the future.
-            raise ValueError(f"Column name {x} not found in subname_mapper")
-        return out
-
-    return name_mapper
-
-
-def per_peptide_study_report(report: pd.DataFrame) -> pd.DataFrame:
-    """Summarizes the report at peptide/study level and flattens the columns.
-
-    This function was implemented to replace an 'apply -> filter' approach.
-    In my benchmarking it went from 35.23 seconds for 4 samples, 4 conditions to
-    0.007 seconds.
-
-    This implementation differs in several aspects in the output values:
-    1. in the fact that it actually gets values for the m/z
-    2. always returns a float, whilst the apply version returns an 'object' dtype.
-    3. In the original implementation, missing values had the string 'null'; here they have the value np.nan.
-    4. The order of the final output is different; the original orders columns by
-       study variables > calculated value, this one is calculated value > study variables.
-
-    Calculates the mean, standard deviation and std error of the precursor
-    abundances, as well as the mean retention time and m/z.
-
-    The names in the end are called "peptide" but technically they are at the
-    precursor level (peptide+charge combinations).
-
-    The columns will look like this in the end:
-    [
-        'pr_id',
-        'peptide_abundance_study_variable[1]',
-        ...
-        'peptide_abundance_stdev_study_variable[1]',
-        ...
-        'peptide_abundance_std_error_study_variable[1]',
-        ...
-        'opt_global_retention_time_study_variable[1]',
-        ...
-        'opt_global_mass_to_charge_study_variable[1]',
-        ...
-    ]
-    """
-    pep_study_grouped = (
-        report.groupby(["study_variable", "precursor.Index"])
-        .agg(
-            {
-                "Precursor.Normalised": ["mean", "std", "sem"],
-                "RT.Start": ["mean"],
-                "Calculate.Precursor.Mz": ["mean"],
-            }
-        )
-        .reset_index()
-        .pivot(columns=["study_variable"], index="precursor.Index")
-        .reset_index()
-    )
-    pep_study_grouped.columns = pd.Index(
-        [
-            "::".join([str(s) for s in col]).strip()
-            for col in pep_study_grouped.columns.values
-        ]
-    )
-    # Columns here would be like:
-    # [
-    #     "precursor.Index::::",
-    #     "Precursor.Normalised::mean::1",
-    #     "Precursor.Normalised::mean::2",
-    #     "Precursor.Normalised::std::1",
-    #     "Precursor.Normalised::std::2",
-    #     "Precursor.Normalised::sem::1",
-    #     "Precursor.Normalised::sem::2",
-    #     "RT.Start::mean::1",
-    #     "RT.Start::mean::2",
-    # ]
-    # So the right names need to be given and the table can be joined with the other one
-    subname_mapper = {
-        "precursor.Index::::": "pr_id",
-        "Precursor.Normalised::mean": "peptide_abundance_study_variable",
-        "Precursor.Normalised::std": "peptide_abundance_stdev_study_variable",
-        "Precursor.Normalised::sem": "peptide_abundance_std_error_study_variable",
-        "Calculate.Precursor.Mz::mean": "opt_global_mass_to_charge_study_variable",
-        "RT.Start::mean": "opt_global_retention_time_study_variable",
-    }
-    name_mapper = name_mapper_builder(subname_mapper)
-
-    pep_study_grouped.rename(
-        columns=name_mapper,
-        inplace=True,
-    )
-
-    return pep_study_grouped
-
-
-def calculate_coverage(ref_sequence: str, sequences: Set[str]):
-    """
-    Calculates the coverage of the reference sequence by the given sequences.
-
-    Examples:
-        >>> calculate_coverage("WATEROVERTHEDUCKSBACK", {"WATER", "DUCK"})
-        0.42857142857142855
-        >>> calculate_coverage("DUCKDUCKDUCK", {"DUCK"})
-        1.0
-        >>> calculate_coverage("WATEROVERTHEDUCK", {"DUCK"})
-        0.25
-        >>> calculate_coverage("WATER", {"WAT", "TER"})
-        1.0
-        >>> calculate_coverage("WATERGLASS", {"WAT", "TER"})
-        0.5
-    """
-    starts = []
-    lengths = []
-    for sequence in sequences:
-        local_start = 0
-        while True:
-            local_start = ref_sequence.find(sequence, local_start)
-            if local_start == -1:
-                break
-            starts.append(local_start)
-            lengths.append(len(sequence))
-            local_start += 1
-
-    # merge overlapping intervals
-    merged_starts: list = []
-    merged_lengths: list = []
-    for start, length in sorted(zip(starts, lengths)):
-        if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
-            merged_lengths[-1] = (
-                max(merged_starts[-1] + merged_lengths[-1], start + length)
-                - merged_starts[-1]
-            )
-        else:
-            merged_starts.append(start)
-            merged_lengths.append(length)
-
-    # calculate coverage
-    coverage = sum(merged_lengths) / len(ref_sequence)
-    return coverage
-
-
-def calculate_protein_coverages(
-    report: pd.DataFrame, out_mztab_PRH: pd.DataFrame, fasta_df: pd.DataFrame
-) -> List[str]:
-    """Calculates protein coverages for the PRH table.
-
-    The protein coverage is calculated as the fraction of the protein sequence
-    in the fasta df, covered by the peptides in the report table, for every
-    protein in the PRH table (defined by accession, not protein.ids).
-    """
-    nested_df = (
-        report[["Protein.Group", "Stripped.Sequence"]]
-        .groupby("Protein.Group")
-        .agg({"Stripped.Sequence": set})
-        .reset_index()
-    )
-    # Protein.Group Stripped.Sequence
-    # 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR}
-    # 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER}
-    # 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
-    ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
-    acc_to_ids = dict(zip(out_mztab_PRH["accession"], out_mztab_PRH["Protein.Group"]))
-    fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
-    acc_to_fasta_ids: dict = {}
-
-    # Since fasta ids are something like sp|P51451|BLK_HUMAN but
-    # accessions are something like Q9Y6V7-2, we need to find a
-    # partial string match between the two (the best one)
-    for acc in acc_to_ids:
-        # I am pretty sure this is the slowest part of the code
-        matches = fasta_df[fasta_df["id"].str.contains(acc)]["id"]
-        if len(matches) == 0:
-            logger.warning(
-                f"Could not find fasta id for accession {acc} in the fasta file."
-            )
-            acc_to_fasta_ids[acc] = None
-        elif len(matches) == 1:
-            acc_to_fasta_ids[acc] = matches.iloc[0]
-        else:
-            # If multiple, find the best match, e.g. pick Q9Y6V7 over Q9Y6V7-2.
-            # This can be achieved by finding the shortest string, since
-            # it entails fewer un-matched characters.
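The interval merge inside `calculate_coverage` above is the step that stops repeated or overlapping matches from being counted twice. A self-contained sketch of just that step, with invented interval values:

```python
from typing import List, Tuple


def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping (start, length) intervals, as in calculate_coverage."""
    starts: List[int] = []
    lengths: List[int] = []
    for start, length in sorted(intervals):
        if starts and starts[-1] + lengths[-1] >= start:
            # Extend the previous interval instead of double-counting overlap.
            lengths[-1] = max(starts[-1] + lengths[-1], start + length) - starts[-1]
        else:
            starts.append(start)
            lengths.append(length)
    return list(zip(starts, lengths))


# "WAT" (start 0, length 3) and "TER" (start 2, length 3) in "WATERGLASS"
# overlap by one residue; merged they cover positions 0-4, so coverage is
# 5 / 10 = 0.5, matching the doctest above.
assert merge_intervals([(0, 3), (2, 3)]) == [(0, 5)]
```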
- acc_to_fasta_ids[acc] = min(matches, key=len) - - out: List[str] = [""] * len(out_mztab_PRH["accession"]) - - for i, acc in enumerate(out_mztab_PRH["accession"]): - f_id = acc_to_fasta_ids[acc] - if f_id is None: - out_cov = "null" - else: - cov = calculate_coverage( - fasta_id_to_seqs[f_id], ids_to_seqs[acc_to_ids[acc]] - ) - out_cov = format(cov, ".03f") - - out[i] = out_cov - - return out - - -cli.add_command(convert) - -if __name__ == "__main__": - cli() diff --git a/bin/extract_sample.py b/bin/extract_sample.py deleted file mode 100755 index 84270d5c..00000000 --- a/bin/extract_sample.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -import argparse -import errno -import os -import sys -from pathlib import Path - -import pandas as pd - - -def parse_args(args=None): - Description = "Extract sample information from an experiment design file" - Epilog = "Example usage: python extract_sample.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("EXP", help="Expdesign file to be extracted") - return parser.parse_args(args) - - -def extract_sample(expdesign): - data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str) - fTable = data.dropna() - - # two table format - with open(expdesign, "r") as f: - lines = f.readlines() - empty_row = lines.index("\n") - s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:] - s_header = lines[empty_row + 1].replace("\n", "").split("\t") - s_DataFrame = pd.DataFrame(s_table, columns=s_header) - - sample_dt = pd.DataFrame() - if "MSstats_Mixture" not in s_DataFrame.columns: - fTable = fTable[["Spectra_Filepath", "Sample"]] - fTable.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False) - else: - fTable.drop_duplicates(subset=["Spectra_Filepath"], inplace=True) - for _, row in fTable.iterrows(): - mixture_id = s_DataFrame[s_DataFrame["Sample"] == row["Sample"]][ - "MSstats_Mixture" - ] - sample_dt = sample_dt.append( - {"Spectra_Filepath": row["Spectra_Filepath"], "Sample": mixture_id}, - ignore_index=True, - ) - sample_dt.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False) - - -def main(args=None): - args = parse_args(args) - extract_sample(args.EXP) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py deleted file mode 100755 index aa9196c0..00000000 --- a/bin/ms2rescore_cli.py +++ /dev/null @@ -1,239 +0,0 @@ -#!/usr/bin/env python -# Written by Jonas Scheid under the MIT license - - -import importlib.resources -import json -import logging -import sys -from typing import List - -import click -import pandas as pd -import pyopenms as oms -from ms2rescore import package_data, rescore -from psm_utils import PSMList -from psm_utils.io.idxml import IdXMLReader, IdXMLWriter - -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") - - -def parse_cli_arguments_to_config(**kwargs): - """Update default MS²Rescore config with CLI arguments""" - config = json.load( - importlib.resources.open_text(package_data, "config_default.json") - ) - - for key, value in kwargs.items(): - # Skip these arguments since they need to set in a nested dict of feature_generators - if key in ["ms2pip_model", "ms2_tolerance", "rng", "calibration_set_size"]: - continue - - elif key == "feature_generators": - feature_generators = value.split(",") - # Reset feature generator dict since there might be default generators we don't want - config["ms2rescore"]["feature_generators"] = {} - if 
"basic" in feature_generators: - config["ms2rescore"]["feature_generators"]["basic"] = {} - if "ms2pip" in feature_generators: - config["ms2rescore"]["feature_generators"]["ms2pip"] = { - "model_dir": kwargs["ms2pip_model_dir"], - "model": kwargs["ms2pip_model"], - "ms2_tolerance": kwargs["ms2_tolerance"], - } - if "deeplc" in feature_generators: - config["ms2rescore"]["feature_generators"]["deeplc"] = { - "deeplc_retrain": False, - "calibration_set_size": kwargs["calibration_set_size"], - } - if "maxquant" in feature_generators: - config["ms2rescore"]["feature_generators"]["maxquant"] = {} - if "ionmob" in feature_generators: - config["ms2rescore"]["feature_generators"]["ionmob"] = {} - - elif key == "rescoring_engine": - # Reset rescoring engine dict we want to allow only computing features - config["ms2rescore"]["rescoring_engine"] = {} - if value == "mokapot": - config["ms2rescore"]["rescoring_engine"]["mokapot"] = { - "write_weights": True, - "write_txt": False, - "write_flashlfq": False, - "rng": kwargs["rng"], - "test_fdr": kwargs["test_fdr"], - "max_workers": kwargs["processes"], - } - if value == "percolator": - logging.info( - "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step." - ) - continue - - else: - config["ms2rescore"][key] = value - - return config - - -def rescore_idxml(input_file, output_file, config) -> None: - """Rescore PSMs in an idXML file and keep other information unchanged.""" - # Read PSMs - reader = IdXMLReader(input_file) - psm_list = reader.read_file() - - # Rescore - rescore(config, psm_list) - - # Filter out PeptideHits within PeptideIdentification(s) that could not be processed by all feature generators - peptide_ids_filtered = filter_out_artifact_psms(psm_list, reader.peptide_ids) - - # Write - writer = IdXMLWriter(output_file, reader.protein_ids, peptide_ids_filtered) - writer.write_file(psm_list) - - -def filter_out_artifact_psms( - psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification] -) -> List[oms.PeptideIdentification]: - """Filter out PeptideHits that could not be processed by all feature generators""" - num_mandatory_features = max([len(psm.rescoring_features) for psm in psm_list]) - new_psm_list = PSMList( - psm_list=[ - psm - for psm in psm_list - if len(psm.rescoring_features) == num_mandatory_features - ] - ) - - # get differing peptidoforms of both psm lists - psm_list_peptides = set( - [next(iter(psm.provenance_data.items()))[1] for psm in psm_list] - ) - new_psm_list_peptides = set( - [next(iter(psm.provenance_data.items()))[1] for psm in new_psm_list] - ) - not_supported_peptides = psm_list_peptides - new_psm_list_peptides - - # no need to filter if all peptides are supported - if len(not_supported_peptides) == 0: - return peptide_ids - # Create new peptide ids and filter out not supported peptides - new_peptide_ids = [] - for peptide_id in peptide_ids: - new_hits = [] - for hit in peptide_id.getHits(): - if hit.getSequence().toString() in not_supported_peptides: - continue - new_hits.append(hit) - if len(new_hits) == 0: - continue - peptide_id.setHits(new_hits) - new_peptide_ids.append(peptide_id) - logging.info( - f"Removed {len(psm_list_peptides) - len(new_psm_list_peptides)} PSMs. 
Peptides not supported: {not_supported_peptides}" - ) - return new_peptide_ids - - -@click.command() -@click.option( - "-p", - "--psm_file", - help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", - required=True, -) -@click.option( - "-s", - "--spectrum_path", - help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)", - required=True, -) -@click.option( - "-o", - "--output_path", - help="Path and stem for output file names (default: derive from identification file)", -) -@click.option( - "-l", "--log_level", help="Logging level (default: `info`)", default="info" -) -@click.option( - "-n", - "--processes", - help="Number of parallel processes available to MS²Rescore", - type=int, - default=16, -) -@click.option("-f", "--fasta_file", help="Path to FASTA file") -@click.option( - "-t", - "--test_fdr", - help="The false-discovery rate threshold at which to evaluate the learned models. (default: 0.05)", - default=0.05, -) -@click.option( - "-fg", - "--feature_generators", - help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See ms2rescore doc for further information", - default="", -) -@click.option( - "-pipm", - "--ms2pip_model", - help="MS²PIP model (default: `Immuno-HCD`)", - type=str, - default="Immuno-HCD", -) -@click.option( - "-md", - "--ms2pip_model_dir", - help="The path of MS²PIP model (default: `./`)", - type=str, - default="./", -) -@click.option( - "-ms2tol", - "--ms2_tolerance", - help="Fragment mass tolerance [Da](default: `0.02`)", - type=float, - default=0.02, -) -@click.option( - "-cs", - "--calibration_set_size", - help="Percentage of number of calibration set for DeepLC (default: `0.15`)", - default=0.15, -) -@click.option( - "-re", - "--rescoring_engine", - help="Either mokapot or percolator (default: `mokapot`)", - default="mokapot", -) -@click.option( - "-rng", - "--rng", - help="Seed for mokapot's random number generator (default: `4711`)", - type=int, - default=4711, -) -@click.option( - "-d", - "--id_decoy_pattern", - help="Regex decoy pattern (default: `DECOY_`)", - default="^DECOY_", -) -@click.option( - "-lsb", - "--lower_score_is_better", - help="Interpretation of primary search engine score (default: True)", - default=True, -) -def main(**kwargs): - config = parse_cli_arguments_to_config(**kwargs) - logging.info("MS²Rescore config:") - logging.info(config) - rescore_idxml(kwargs["psm_file"], kwargs["output_path"], config) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/mzml_statistics.py b/bin/mzml_statistics.py deleted file mode 100755 index f738fb98..00000000 --- a/bin/mzml_statistics.py +++ /dev/null @@ -1,233 +0,0 @@ -#!/usr/bin/env python -""" -This script parses the mass spectrum file and generates a set of statistics about the file. -License: Apache 2.0 -Authors: Hong Wong, Yasset Perez-Riverol -""" -import re -import sqlite3 -import sys -from pathlib import Path - -import pandas as pd -import pyarrow -from pyopenms import MSExperiment, MzMLFile - - -def ms_dataframe(ms_path: str, id_only: bool = False) -> None: - """ - Parse the mass spectrum file and generate a set of statistics about the file. 
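For reference, the nested dictionary that `parse_cli_arguments_to_config` assembles for `--feature_generators ms2pip,deeplc` looks roughly as follows. This is a sketch using the CLI defaults shown above; the real config starts from MS²Rescore's packaged `config_default.json` rather than an empty dict:

```python
import json

feature_generators = "ms2pip,deeplc".split(",")

config = {"ms2rescore": {"feature_generators": {}}}
if "ms2pip" in feature_generators:
    config["ms2rescore"]["feature_generators"]["ms2pip"] = {
        "model_dir": "./",  # --ms2pip_model_dir default
        "model": "Immuno-HCD",  # --ms2pip_model default
        "ms2_tolerance": 0.02,  # --ms2_tolerance default
    }
if "deeplc" in feature_generators:
    config["ms2rescore"]["feature_generators"]["deeplc"] = {
        "deeplc_retrain": False,
        "calibration_set_size": 0.15,  # --calibration_set_size default
    }

print(json.dumps(config, indent=2))
```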
- @param ms_path: The file name of the mass spectrum file - @param id_only: If True, it will save a csv with the spectrum id, mz and intensity - - """ - file_columns = [ - "SpectrumID", - "MSLevel", - "Charge", - "MS_peaks", - "Base_Peak_Intensity", - "Summed_Peak_Intensities", - "Retention_Time", - "Exp_Mass_To_Charge", - "AcquisitionDateTime", - ] - - def parse_mzml(file_name: str, file_columns: list, id_only: bool = False): - """ - Parse mzML file and return a pandas DataFrame with the information. If id_only is True, it will also save a csv. - @param file_name: The file name of the mzML file - @param file_columns: The columns of the DataFrame - @param id_only: If True, it will save a csv with the spectrum id, mz and intensity - @return: A pandas DataFrame with the information of the mzML file - """ - - info = [] - psm_part_info = [] - exp = MSExperiment() - acquisition_datetime = exp.getDateTime().get() - MzMLFile().load(file_name, exp) - for spectrum in exp: - id_ = spectrum.getNativeID() - MSLevel = spectrum.getMSLevel() - rt = spectrum.getRT() if spectrum.getRT() else None - - peaks_tuple = spectrum.get_peaks() - peak_per_ms = len(peaks_tuple[0]) - - if not spectrum.metaValueExists("base peak intensity"): - bpc = max(peaks_tuple[1]) if len(peaks_tuple[1]) > 0 else None - else: - bpc = spectrum.getMetaValue("base peak intensity") - - if not spectrum.metaValueExists("total ion current"): - tic = sum(peaks_tuple[1]) if len(peaks_tuple[1]) > 0 else None - else: - tic = spectrum.getMetaValue("total ion current") - - if MSLevel == 1: - info_list = [ - id_, - MSLevel, - None, - peak_per_ms, - bpc, - tic, - rt, - None, - acquisition_datetime, - ] - elif MSLevel == 2: - charge_state = spectrum.getPrecursors()[0].getCharge() - emz = ( - spectrum.getPrecursors()[0].getMZ() - if spectrum.getPrecursors()[0].getMZ() - else None - ) - info_list = [ - id_, - MSLevel, - charge_state, - peak_per_ms, - bpc, - tic, - rt, - emz, - acquisition_datetime, - ] - mz_array = peaks_tuple[0] - intensity_array = peaks_tuple[1] - else: - info_list = [ - id_, - MSLevel, - None, - None, - None, - None, - rt, - None, - acquisition_datetime, - ] - - if id_only and MSLevel == 2: - psm_part_info.append( - [ - re.findall(r"[scan|spectrum]=(\d+)", id_)[0], - MSLevel, - mz_array, - intensity_array, - ] - ) - info.append(info_list) - - if id_only and len(psm_part_info) > 0: - pd.DataFrame( - psm_part_info, columns=["scan", "ms_level", "mz", "intensity"] - ).to_parquet(f"{Path(ms_path).stem}_spectrum_df.parquet", index=False) - - return pd.DataFrame(info, columns=file_columns) - - def parse_bruker_d(file_name: str, file_columns: list): - sql_filepath = f"{file_name}/analysis.tdf" - if not Path(sql_filepath).exists(): - msg = f"File '{sql_filepath}' not found" - raise FileNotFoundError(msg) - conn = sqlite3.connect(sql_filepath) - c = conn.cursor() - - datetime_cmd = ( - "SELECT Value FROM GlobalMetadata WHERE key='AcquisitionDateTime'" - ) - AcquisitionDateTime = c.execute(datetime_cmd).fetchall()[0][0] - - df = pd.read_sql_query( - "SELECT Id, MsMsType, NumPeaks, MaxIntensity, SummedIntensities, Time FROM frames", - conn, - ) - df["AcquisitionDateTime"] = AcquisitionDateTime - - # {8:'DDA-PASEF', 9:'DIA-PASEF'} - if 8 in df["MsMsType"].values: - mslevel_map = {0: 1, 8: 2} - elif 9 in df["MsMsType"].values: - mslevel_map = {0: 1, 9: 2} - else: - msg = f"Unrecognized ms type '{df['MsMsType'].values}'" - raise ValueError(msg) - df["MsMsType"] = df["MsMsType"].map(mslevel_map) - - try: - # This line raises an sqlite error if 
the table does not exist
-            _ = conn.execute("SELECT * from Precursors LIMIT 1").fetchall()
-            precursor_df = pd.read_sql_query("SELECT * from Precursors", conn)
-        except sqlite3.OperationalError as e:
-            if "no such table: Precursors" in str(e):
-                print(
-                    f"No precursors recorded in {file_name}; this is normal for DIA data."
-                )
-                precursor_df = pd.DataFrame()
-            else:
-                raise
-
-        if len(df) == len(precursor_df):
-            df = pd.concat([df, precursor_df[["Charge", "MonoisotopicMz"]]], axis=1)
-            df["Charge"] = df["Charge"].fillna(0)
-        else:
-            df[["Charge", "Exp_Mass_To_Charge"]] = None, None
-
-        df = df[
-            [
-                "Id",
-                "MsMsType",
-                "Charge",
-                "NumPeaks",
-                "MaxIntensity",
-                "SummedIntensities",
-                "Time",
-                "Exp_Mass_To_Charge",
-                "AcquisitionDateTime",
-            ]
-        ]
-        df.columns = pd.Index(file_columns)
-
-        return df
-
-    if not (Path(ms_path).exists()):
-        print(f"Not found '{ms_path}', trying to find alias")
-        ms_path_path = Path(ms_path)
-        path_stem = str(ms_path_path.stem)
-        candidates = (
-            list(ms_path_path.parent.glob("*.d"))
-            + list(ms_path_path.parent.glob("*.mzml"))
-            + list(ms_path_path.parent.glob("*.mzML"))
-        )
-
-        candidates = [c for c in candidates if path_stem in str(c)]
-
-        if len(candidates) == 1:
-            ms_path = str(candidates[0].resolve())
-        else:
-            raise FileNotFoundError()
-
-    if Path(ms_path).suffix == ".d" and Path(ms_path).is_dir():
-        ms_df = parse_bruker_d(ms_path, file_columns)
-    elif Path(ms_path).suffix in [".mzML", ".mzml"]:
-        ms_df = parse_mzml(ms_path, file_columns, id_only)
-    else:
-        msg = f"Unrecognized file type, or the mass spec file '{ms_path}' does not exist"
-        raise RuntimeError(msg)
-
-    ms_df.to_parquet(
-        f"{Path(ms_path).stem}_ms_info.parquet", engine="pyarrow", index=False
-    )
-
-
-def main():
-    ms_path = sys.argv[1]
-    id_only = sys.argv[2]
-    ms_dataframe(ms_path, id_only)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/bin/prepare_diann_parameters.py b/bin/prepare_diann_parameters.py
deleted file mode 100755
index 1c7eddf4..00000000
--- a/bin/prepare_diann_parameters.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-
-"""
-This script converts SDRF parameters to DIA-NN parameters
-License: Apache 2.0
-Authors: Dai Chengxin, Yasset Perez-Riverol
-"""
-
-import re
-from typing import List, Tuple
-
-import click
-from sdrf_pipelines.openms.unimod import UnimodDatabase
-
-CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
-
-
-@click.group(context_settings=CONTEXT_SETTINGS)
-def cli():
-    pass
-
-
-@click.command("generate")
-@click.option("--enzyme", "-e", help="")
-@click.option("--fix_mod", "-f", help="")
-@click.option("--var_mod", "-v", help="")
-@click.pass_context
-def generate_cfg(ctx, enzyme, fix_mod, var_mod):
-    cut = enzyme_cut(enzyme)
-    unimod_database = UnimodDatabase()
-    fix_ptm, var_ptm = convert_mod(unimod_database, fix_mod, var_mod)
-
-    var_ptm_str = " --var-mod "
-    fix_ptm_str = " --fixed-mod "
-    diann_fix_ptm = ""
-    diann_var_ptm = ""
-    for mod in fix_ptm:
-        diann_fix_ptm += fix_ptm_str + mod
-    for mod in var_ptm:
-        diann_var_ptm += var_ptm_str + mod
-
-    with open("diann_config.cfg", "w") as file:
-        file.write("--cut " + cut + diann_fix_ptm + diann_var_ptm)
-
-
-def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List]:
-    pattern = re.compile(r"\((.*?)\)")
-    var_ptm = []
-    fix_ptm = []
-
-    if fix_mod != "":
-        for mod in fix_mod.split(","):
-            tag = 0
-            for modification in unimod_database.modifications:
-                if modification.get_name() == mod.split(" ")[0]:
-                    diann_mod = (
-                        modification.get_name()
-                        + ","
-                        + 
str(modification._delta_mono_mass) - ) - tag = 1 - break - if tag == 0: - print( - "Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: " - + mod - ) - continue - site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] - if site == "Protein N-term": - site = "*n" - elif site == "N-term": - site = "n" - - if ( - "TMT" in diann_mod - or "Label" in diann_mod - or "iTRAQ" in diann_mod - or "mTRAQ" in diann_mod - ): - fix_ptm.append(diann_mod + "," + site + "," + "label") - else: - fix_ptm.append(diann_mod + "," + site) - - if var_mod != "": - for mod in var_mod.split(","): - tag = 0 - for modification in unimod_database.modifications: - if modification.get_name() == mod.split(" ")[0]: - diann_mod = ( - modification.get_name() - + "," - + str(modification._delta_mono_mass) - ) - tag = 1 - break - if tag == 0: - print( - "Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: " - + mod - ) - continue - site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] - if site == "Protein N-term": - site = "*n" - elif site == "N-term": - site = "n" - - if ( - "TMT" in diann_mod - or "Label" in diann_mod - or "iTRAQ" in diann_mod - or "mTRAQ" in diann_mod - ): - var_ptm.append(diann_mod + "," + site + "," + "label") - else: - var_ptm.append(diann_mod + "," + site) - - return fix_ptm, var_ptm - - -_ENZYME_SPECIFICITY = { - "Trypsin": "K*,R*,!*P", - "Trypsin/P": "K*,R*", - "Arg-C": "R*,!*P", - "Asp-N": "*B,*D", - "Chymotrypsin": "F*,W*,Y*,L*,!*P", - "Lys-C": "K*,!*P", -} - - -def enzyme_cut(enzyme: str) -> str: - return _ENZYME_SPECIFICITY.get(enzyme) or "--cut" - - -cli.add_command(generate_cfg) - -if __name__ == "__main__": - cli() diff --git a/bin/psm_conversion.py b/bin/psm_conversion.py deleted file mode 100755 index 79ac1517..00000000 --- a/bin/psm_conversion.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env python - -import os -import re -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -import pyopenms as oms - -_parquet_field = [ - "sequence", - "protein_accessions", - "protein_start_positions", - "protein_end_positions", - "modifications", - "retention_time", - "charge", - "exp_mass_to_charge", - "reference_file_name", - "scan_number", - "peptidoform", - "posterior_error_probability", - "global_qvalue", - "is_decoy", - "consensus_support", - "mz_array", - "intensity_array", - "num_peaks", - "search_engines", - "id_scores", - "hit_rank", -] - - -def mods_position(peptide): - if peptide.startswith("."): - peptide = peptide[1:] - pattern = re.compile(r"\((.*?)\)") - original_mods = pattern.findall(peptide) - peptide = re.sub(r"\(.*?\)", ".", peptide) - position = [i.start() for i in re.finditer(r"\.", peptide)] - for j in range(1, len(position)): - position[j] -= j - - for k in range(0, len(original_mods)): - original_mods[k] = str(position[k]) + "-" + original_mods[k] - - original_mods = ( - [str(i) for i in original_mods] if len(original_mods) > 0 else np.nan - ) - - return original_mods - - -def convert_psm(idxml, spectra_file, export_decoy_psm): - prot_ids = [] - pep_ids = [] - parquet_data = [] - consensus_support = np.nan - mz_array = [] - intensity_array = [] - num_peaks = np.nan - id_scores = [] - search_engines = [] - - oms.IdXMLFile().load(idxml, prot_ids, pep_ids) - if "ConsensusID" in prot_ids[0].getSearchEngine(): - if prot_ids[0].getSearchParameters().metaValueExists("SE:MS-GF+"): - search_engines = ["MS-GF+"] - if prot_ids[0].getSearchParameters().metaValueExists("SE:Comet"): - 
search_engines.append("Comet") - if prot_ids[0].getSearchParameters().metaValueExists("SE:Sage"): - search_engines.append("Sage") - else: - search_engines = [prot_ids[0].getSearchEngine()] - - reference_file_name = os.path.splitext( - prot_ids[0].getMetaValue("spectra_data")[0].decode("UTF-8") - )[0] - spectra_df = pd.read_parquet(spectra_file) if spectra_file else None - - for peptide_id in pep_ids: - retention_time = peptide_id.getRT() - exp_mass_to_charge = peptide_id.getMZ() - scan_number = int( - re.findall( - r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference") - )[0][1] - ) - - if isinstance(spectra_df, pd.DataFrame): - spectra = spectra_df[spectra_df["scan"] == scan_number] - mz_array = spectra["mz"].values - intensity_array = spectra["intensity"].values - num_peaks = len(mz_array) - - for hit in peptide_id.getHits(): - # if remove decoy when mapped to target+decoy? - is_decoy = 0 if hit.getMetaValue("target_decoy") == "target" else 1 - if export_decoy_psm == "false" and is_decoy: - continue - global_qvalue = np.nan - if len(search_engines) > 1: - if "q-value" in peptide_id.getScoreType(): - global_qvalue = hit.getScore() - consensus_support = hit.getMetaValue("consensus_support") - elif search_engines == "Comet": - id_scores = ["Comet:Expectation value: " + str(hit.getScore())] - elif search_engines == "MS-GF+": - id_scores = ["MS-GF:SpecEValue: " + str(hit.getScore())] - elif search_engines == "Sage": - id_scores = ["Sage:hyperscore: " + str(hit.getScore())] - - if hit.metaValueExists("MS:1001491"): - global_qvalue = hit.getMetaValue("MS:1001491") - - charge = hit.getCharge() - peptidoform = hit.getSequence().toString() - modifications = mods_position(peptidoform) - sequence = hit.getSequence().toUnmodifiedString() - protein_accessions = [ - ev.getProteinAccession() for ev in hit.getPeptideEvidences() - ] - posterior_error_probability = hit.getMetaValue( - "Posterior Error Probability_score" - ) - protein_start_positions = [ - ev.getStart() for ev in hit.getPeptideEvidences() - ] - protein_end_positions = [ev.getEnd() for ev in hit.getPeptideEvidences()] - hit_rank = hit.getRank() - - parquet_data.append( - [ - sequence, - protein_accessions, - protein_start_positions, - protein_end_positions, - modifications, - retention_time, - charge, - exp_mass_to_charge, - reference_file_name, - scan_number, - peptidoform, - posterior_error_probability, - global_qvalue, - is_decoy, - consensus_support, - mz_array, - intensity_array, - num_peaks, - search_engines, - id_scores, - hit_rank, - ] - ) - - pd.DataFrame(parquet_data, columns=_parquet_field).to_csv( - f"{Path(idxml).stem}_psm.csv", mode="w", index=False, header=True - ) - - -def main(): - idxml_path = sys.argv[1] - spectra_file = sys.argv[2] - export_decoy_psm = sys.argv[3] - convert_psm(idxml_path, spectra_file, export_decoy_psm) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 8ef5b3d1..e2adcaee 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -2,11 +2,10 @@ process DIANNCONVERT { tag "$meta.experiment_id" label 'process_medium' - conda "conda-forge::pandas_schema conda-forge::lzstring bioconda::pmultiqc=0.0.25" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pmultiqc:0.0.25--pyhdfd78af_0' : - 'biocontainers/pmultiqc:0.0.25--pyhdfd78af_0' }" - + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: path(report) @@ -34,7 +33,7 @@ process DIANNCONVERT { meta.precursormasstoleranceunit,meta.enzyme,meta.fixedmodifications,meta.variablemodifications].join(';') """ - diann_convert.py convert \\ + quantmsutilsc diann2mztab \\ --folder ./ \\ --exp_design ${exp_design} \\ --diann_version ./version/versions.yml \\ diff --git a/modules/local/extract_psm/main.nf b/modules/local/extract_psm/main.nf index 32f4ceb8..83768eca 100644 --- a/modules/local/extract_psm/main.nf +++ b/modules/local/extract_psm/main.nf @@ -2,10 +2,10 @@ process PSMCONVERSION { tag "$meta.mzml_id" label 'process_medium' - conda "bioconda::pmultiqc=0.0.25" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pmultiqc:0.0.25--pyhdfd78af_0' : - 'biocontainers/pmultiqc:0.0.25--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: @@ -19,12 +19,13 @@ process PSMCONVERSION { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.mzml_id}" + def string_export_decoy_psm = params.export_decoy_psm == true ? "--export_decoy_psm" : "" """ - psm_conversion.py "${idxml_file}" \\ - ${spectrum_df} \\ - $params.export_decoy_psm \\ + quantmsutilsc psmconvert --idxml "${idxml_file}" \\ + --spectra_file ${spectrum_df} \\ + ${string_export_decoy_psm} \\ 2>&1 | tee extract_idxml.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/generate_diann_cfg/main.nf b/modules/local/generate_diann_cfg/main.nf index 9cddd307..bbf44d65 100644 --- a/modules/local/generate_diann_cfg/main.nf +++ b/modules/local/generate_diann_cfg/main.nf @@ -2,11 +2,10 @@ process GENERATE_DIANN_CFG { tag "$meta.experiment_id" label 'process_low' - conda 'conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.29' + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.29--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.29--pyhdfd78af_0' }" - + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: val(meta) @@ -20,7 +19,7 @@ process GENERATE_DIANN_CFG { def args = task.ext.args ?: '' """ - prepare_diann_parameters.py generate \\ + quantmsutilsc dianncfg \\ --enzyme "${meta.enzyme}" \\ --fix_mod "${meta.fixedmodifications}" \\ --var_mod "${meta.variablemodifications}" \\ diff --git a/modules/local/ms2rescore/main.nf b/modules/local/ms2rescore/main.nf index 473a2e63..4e7c1508 100644 --- a/modules/local/ms2rescore/main.nf +++ b/modules/local/ms2rescore/main.nf @@ -2,10 +2,10 @@ process MS2RESCORE { tag "$meta.mzml_id" label 'process_high' - conda "bioconda::ms2rescore=3.0.3 bioconda::psm-utils=0.8.2 conda-forge::pydantic=1.10.14 pygam=0.9.1 bioconda::deeplc=2.2.27 bioconda::ms2pip=4.0.0.dev8 bioconda::deeplcretrainer=0.2.11 conda-forge::scikit-learn=1.4.2 conda-forge::scipy=1.13.0" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.3--pyhdfd78af_0': - 'biocontainers/ms2rescore:3.0.3--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" // userEmulation settings when docker is specified containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : '' @@ -43,7 +43,7 @@ process MS2RESCORE { } """ - ms2rescore_cli.py \\ + quantmsutilsc ms2rescore \\ --psm_file $idxml \\ --spectrum_path . \\ --ms2_tolerance $ms2_tolerence \\ diff --git a/modules/local/mzmlstatistics/main.nf b/modules/local/mzmlstatistics/main.nf index 31bb55dd..8cd15d16 100644 --- a/modules/local/mzmlstatistics/main.nf +++ b/modules/local/mzmlstatistics/main.nf @@ -3,10 +3,10 @@ process MZMLSTATISTICS { label 'process_medium' label 'process_single' - conda "bioconda::quantms-utils=0.0.2" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.2--pyhdfd78af_0' : - 'biocontainers/quantms-utils:0.0.2--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: tuple val(meta), path(ms_file) @@ -20,10 +20,11 @@ process MZMLSTATISTICS { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.mzml_id}" + def string_id_only = params.id_only == true ? 
"--id_only" : "" """ - mzml_statistics.py "${ms_file}" \\ - $params.id_only \\ + quantmsutilsc mzmlstats --ms_path "${ms_file}" \\ + ${string_id_only} \\ 2>&1 | tee mzml_statistics.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/preprocess_expdesign.nf b/modules/local/preprocess_expdesign.nf index 61842c08..5fb26703 100644 --- a/modules/local/preprocess_expdesign.nf +++ b/modules/local/preprocess_expdesign.nf @@ -6,10 +6,10 @@ process PREPROCESS_EXPDESIGN { tag "$design.Name" label 'process_low' - conda "bioconda::sdrf-pipelines=0.0.29" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.29--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.29--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: path design diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 11614b61..c53c4049 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -3,10 +3,10 @@ process SAMPLESHEET_CHECK { tag "$input_file" label 'process_single' - conda "bioconda::sdrf-pipelines=0.0.29" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.29--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.29--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: path input_file @@ -32,7 +32,7 @@ process SAMPLESHEET_CHECK { def string_is_sdrf = is_sdrf == true ? "--is_sdrf" : "" """ - check_samplesheet.py validate --exp_design "${input_file}" ${string_is_sdrf} \\ + quantmsutilsc checksamplesheet --exp_design "${input_file}" ${string_is_sdrf} \\ ${string_skip_sdrf_validation} \\ ${string_skip_ms_validation} \\ ${string_skip_factor_validation} \\ diff --git a/modules/local/sdrfparsing/main.nf b/modules/local/sdrfparsing/main.nf index c90a6f9a..fa2acc54 100644 --- a/modules/local/sdrfparsing/main.nf +++ b/modules/local/sdrfparsing/main.nf @@ -2,10 +2,10 @@ process SDRFPARSING { tag "$sdrf.Name" label 'process_low' - conda "conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.29" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.29--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.29--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: path sdrf