little change

bigbio · Oct 28, 2024 · db0c9ea · db0c9ea
1 parent 80c83e7
commit db0c9ea
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 121 deletions.
diff --git a/docs/include/mq_psm_example.paruqet b/docs/include/mq_psm_example.paruqet
diff --git a/quantmsio/commands/feature_command.py b/quantmsio/commands/feature_command.py
@@ -21,9 +21,9 @@
     required=True,
 )
 @click.option(
-    "--chunksize",
+    "--file_num",
     help="Read batch size",
-    default=1000000,
+    default=50,
 )
 @click.option(
     "--protein_file",
@@ -45,25 +45,33 @@
     help="Prefix of the Json file needed to generate the file name",
     required=False,
 )
+@click.option(
+    "--duckdb_max_memory", help="The maximum amount of memory allocated by the DuckDB engine (e.g 4GB)", required=False
+)
+@click.option("--duckdb_threads", help="The number of threads for the DuckDB engine (e.g 4)", required=False)
 def convert_feature_file(
     sdrf_file: str,
     msstats_file: str,
     mztab_file: str,
-    chunksize: int,
+    file_num: int,
     protein_file: str,
     output_folder: str,
     partitions: str,
     output_prefix_file: str,
+    duckdb_max_memory: str,
+    duckdb_threads: int
 ):
     """
     Convert a msstats/mztab file to a parquet file. The parquet file will contain the features and the metadata.
     :param sdrf_file: the SDRF file needed to extract some of the metadata
     :param msstats_file: the MSstats input file, this will be considered the main format to convert
     :param mztab_file: the mzTab file; this will be used to extract the protein information
-    :param chunksize: Read batch size
+    :param file_num: Read batch size (default 50); presumably a number of files per batch rather than rows -- TODO confirm
     :param output_folder: Folder where the Json file will be generated
     :param partitions: The field used for splitting files, multiple fields are separated by ,
     :param output_prefix_file: Prefix of the Json file needed to generate the file name
+    :param duckdb_max_memory: The maximum amount of memory allocated by the DuckDB engine (e.g. 4GB)
+    :param duckdb_threads: The number of threads for the DuckDB engine (e.g. 4)
     """
 
     if sdrf_file is None or msstats_file is None or mztab_file is None or output_folder is None:
@@ -74,13 +82,21 @@ def convert_feature_file(
     filename = create_uuid_filename(output_prefix_file, ".feature.parquet")
     output_path = output_folder + "/" + filename
     if not partitions:
-        feature_manager.write_feature_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file)
+        feature_manager.write_feature_to_file(
+            output_path=output_path,
+            file_num=file_num,
+            protein_file=protein_file,
+            duckdb_max_memory=duckdb_max_memory,
+            duckdb_threads=duckdb_threads
+            )
     else:
         partitions = partitions.split(",")
         feature_manager.write_features_to_file(
             output_folder=output_folder,
             filename=filename,
             partitions=partitions,
-            chunksize=chunksize,
+            file_num=file_num,
             protein_file=protein_file,
+            duckdb_max_memory=duckdb_max_memory,
+            duckdb_threads=duckdb_threads
         )
diff --git a/quantmsio/core/feature.py b/quantmsio/core/feature.py
@@ -4,18 +4,15 @@
 import pyarrow.parquet as pq
 from quantmsio.operate.tools import get_ahocorasick
 from quantmsio.utils.file_utils import extract_protein_list
-from quantmsio.core.mztab import MzTab, generate_modification_list
+from quantmsio.core.mztab import MzTab
 from quantmsio.core.psm import Psm
 from quantmsio.core.sdrf import SDRFHandler
 from quantmsio.core.msstats_in import MsstatsIN
 from quantmsio.utils.pride_utils import (
-    clean_peptidoform_sequence,
     get_petidoform_msstats_notation,
     generate_scan_number,
-    get_peptidoform_proforma_version_in_mztab,
 )
-from quantmsio.utils.constants import ITRAQ_CHANNEL, TMT_CHANNELS
-from quantmsio.core.common import MSSTATS_MAP, MSSTATS_USECOLS, SDRF_USECOLS, SDRF_MAP, FEATURE_SCHEMA
+from quantmsio.core.common import FEATURE_SCHEMA
 
 
 class Feature(MzTab):

diff --git a/quantmsio/core/pg_matrix.py b/quantmsio/core/pg_matrix.py
diff --git a/quantmsio/core/psm.py b/quantmsio/core/psm.py
@@ -109,9 +109,4 @@ def convert_to_parquet_format(res):
         if "rt" in res.columns:
             res["rt"] = res["rt"].astype(float)
         else:
-            res.loc[:, "rt"] = None
-
-#df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map)
-#res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float)
-#res["unique"] = res["unique"].astype("Int32")
-#res["global_qvalue"] = res["global_qvalue"].astype(float)
+            res.loc[:, "rt"] = None