Skip to content

Commit

Permalink
little change
Browse files Browse the repository at this point in the history
  • Loading branch information
zprobot committed Oct 28, 2024
1 parent 80c83e7 commit db0c9ea
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 121 deletions.
Binary file removed docs/include/mq_psm_example.paruqet
Binary file not shown.
28 changes: 22 additions & 6 deletions quantmsio/commands/feature_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
required=True,
)
@click.option(
"--chunksize",
"--file_num",
help="Read batch size",
default=1000000,
default=50,
)
@click.option(
"--protein_file",
Expand All @@ -45,25 +45,33 @@
help="Prefix of the Json file needed to generate the file name",
required=False,
)
@click.option(
"--duckdb_max_memory", help="The maximum amount of memory allocated by the DuckDB engine (e.g 4GB)", required=False
)
@click.option("--duckdb_threads", help="The number of threads for the DuckDB engine (e.g 4)", required=False)
def convert_feature_file(
sdrf_file: str,
msstats_file: str,
mztab_file: str,
chunksize: int,
file_num: int,
protein_file: str,
output_folder: str,
partitions: str,
output_prefix_file: str,
duckdb_max_memory: str,
duckdb_threads: int
):
"""
Convert an MSstats/mzTab file to a parquet file. The parquet file will contain the features and the metadata.
:param sdrf_file: the SDRF file needed to extract some of the metadata
:param msstats_file: the MSstats input file, this will be considered the main format to convert
:param mztab_file: the mzTab file, which is used to extract the protein information
:param chunksize: Read batch size
:param file_num: Read batch size (presumably the number of files read per batch -- TODO confirm)
:param output_folder: Folder where the Json file will be generated
:param partitions: The field used for splitting files, multiple fields are separated by ,
:param output_prefix_file: Prefix of the Json file needed to generate the file name
:param duckdb_max_memory: The maximum amount of memory allocated by the DuckDB engine (e.g. 4GB)
:param duckdb_threads: The number of threads for the DuckDB engine (e.g. 4)
"""

if sdrf_file is None or msstats_file is None or mztab_file is None or output_folder is None:
Expand All @@ -74,13 +82,21 @@ def convert_feature_file(
filename = create_uuid_filename(output_prefix_file, ".feature.parquet")
output_path = output_folder + "/" + filename
if not partitions:
feature_manager.write_feature_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file)
feature_manager.write_feature_to_file(
output_path=output_path,
file_num=file_num,
protein_file=protein_file,
duckdb_max_memory=duckdb_max_memory,
duckdb_threads=duckdb_threads
)
else:
partitions = partitions.split(",")
feature_manager.write_features_to_file(
output_folder=output_folder,
filename=filename,
partitions=partitions,
chunksize=chunksize,
file_num=file_num,
protein_file=protein_file,
duckdb_max_memory=duckdb_max_memory,
duckdb_threads=duckdb_threads
)
7 changes: 2 additions & 5 deletions quantmsio/core/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,15 @@
import pyarrow.parquet as pq
from quantmsio.operate.tools import get_ahocorasick
from quantmsio.utils.file_utils import extract_protein_list
from quantmsio.core.mztab import MzTab, generate_modification_list
from quantmsio.core.mztab import MzTab
from quantmsio.core.psm import Psm
from quantmsio.core.sdrf import SDRFHandler
from quantmsio.core.msstats_in import MsstatsIN
from quantmsio.utils.pride_utils import (
clean_peptidoform_sequence,
get_petidoform_msstats_notation,
generate_scan_number,
get_peptidoform_proforma_version_in_mztab,
)
from quantmsio.utils.constants import ITRAQ_CHANNEL, TMT_CHANNELS
from quantmsio.core.common import MSSTATS_MAP, MSSTATS_USECOLS, SDRF_USECOLS, SDRF_MAP, FEATURE_SCHEMA
from quantmsio.core.common import FEATURE_SCHEMA


class Feature(MzTab):
Expand Down
104 changes: 0 additions & 104 deletions quantmsio/core/pg_matrix.py

This file was deleted.

7 changes: 1 addition & 6 deletions quantmsio/core/psm.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,4 @@ def convert_to_parquet_format(res):
if "rt" in res.columns:
res["rt"] = res["rt"].astype(float)
else:
res.loc[:, "rt"] = None

#df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map)
#res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float)
#res["unique"] = res["unique"].astype("Int32")
#res["global_qvalue"] = res["global_qvalue"].astype(float)
res.loc[:, "rt"] = None

0 comments on commit db0c9ea

Please sign in to comment.