diff --git a/docs/README.adoc b/docs/README.adoc index 6a73cc64..fa2f4b09 100644 --- a/docs/README.adoc +++ b/docs/README.adoc @@ -984,7 +984,7 @@ The following table presents the fields needed to describe each feature in quant | m/z | exp_mass_to_charge -| `rt_apex` +| `rt` | Precursor retention time (in seconds) | float32, null | RT diff --git a/docs/include/example.psm.parquet b/docs/include/example.psm.parquet new file mode 100644 index 00000000..cb19d8ce Binary files /dev/null and b/docs/include/example.psm.parquet differ diff --git a/docs/include/feature.parquet b/docs/include/feature.parquet deleted file mode 100644 index 7bac45a5..00000000 Binary files a/docs/include/feature.parquet and /dev/null differ diff --git a/docs/include/maxquant_feature.parquet b/docs/include/maxquant_feature.parquet deleted file mode 100644 index e64548b8..00000000 Binary files a/docs/include/maxquant_feature.parquet and /dev/null differ diff --git a/docs/include/maxquant_psm.parquet b/docs/include/maxquant_psm.parquet deleted file mode 100644 index 01385969..00000000 Binary files a/docs/include/maxquant_psm.parquet and /dev/null differ diff --git a/docs/include/psm.parquet b/docs/include/psm.parquet deleted file mode 100644 index e4ed9638..00000000 Binary files a/docs/include/psm.parquet and /dev/null differ diff --git a/docs/tools.adoc b/docs/tools.rst similarity index 53% rename from docs/tools.adoc rename to docs/tools.rst index b0fca6fd..b2ffccc4 100644 --- a/docs/tools.adoc +++ b/docs/tools.rst @@ -8,100 +8,6 @@ image::map.png[width=80%] You can generate separate files or complete project files depending on your needs.A completed project contains the following files: -[[tools-and-libraries]] -== Querying parquet - -The query module provides the ability to quickly search through Parquet files. - -Basic query operations allow you to query all ``samples``, ``peptides``, ``proteins``, ``genes``, and ``mzML`` files. The query results will be deduplicated and then returned in a list format. - -[source,python] ----- - from quantmsio.core.query import Parquet - P = Parquet('PXD007683.feature.parquet') - P.get_unique_samples() - """ - ['PXD007683-Sample-3', - 'PXD007683-Sample-9', - 'PXD007683-Sample-4', - 'PXD007683-Sample-6', - 'PXD007683-Sample-11', - 'PXD007683-Sample-7', - 'PXD007683-Sample-1', - 'PXD007683-Sample-10', - 'PXD007683-Sample-8', - 'PXD007683-Sample-2', - 'PXD007683-Sample-5'] - """ - P.get_unique_peptides() - P.get_unique_proteins() - P.get_unique_genes() - P.get_unique_references() ----- - -Specific queries allow you to individually search for certain values based on specific conditions. -The results are returned in the form of a ``DataFrame``. - -[source,python] ----- - P.query_peptide('QPAYVSK') - """ - sequence protein_accessions protein_start_positions ... - QPAYVSK [sp|P36016|LHS1_YEAST] [739] - QPAYVSK [sp|P36016|LHS1_YEAST] [739] - QPAYVSK [sp|P36016|LHS1_YEAST] [739] - ... - """ - P.query_peptide('QPAYVSK',columns=['protein_start_positions','protein_end_positions']) ----- - -[source,python] ----- - P.query_protein('P36016',columns=None) - P.query_proteins(['P36016','O95861'],columns=None) - """ - sequence protein_accessions protein_start_positions ... - QPAYVSK [sp|P36016|LHS1_YEAST] [739] - QPAYVSK [sp|P36016|LHS1_YEAST] [739] - QPAYVSK [sp|P36016|LHS1_YEAST] [739] - ... - QPCPSQYSAIK [sp|O95861|BPNT1_HUMAN] [98] - ... 
- """ - - P.get_samples_from_database(['PXD007683-Sample-3','PXD007683-Sample-9'],columns=None) - """ - sequence protein_accessions sample_accession - AAAAAAAAAAAAAAAGAGAGAK [sp|P55011|S12A2_HUMAN] PXD007683-Sample-3 - AAAAAAAAAAAAAAAGAGAGAK [sp|P55011|S12A2_HUMAN] PXD007683-Sample-3 - AAAAAAAAAK [sp|Q99453|PHX2B_HUMAN] PXD007683-Sample-3 - """ - P.get_report_from_database(['a05063','a05059'],columns=None) # mzml - """ - sequence protein_accessions reference_file_name - AAAAAAALQAK [sp|P36578|RL4_HUMAN] a05063 - AAAAAAALQAK [sp|P36578|RL4_HUMAN] a05063 - AAAAAAALQAK [sp|P36578|RL4_HUMAN] a05063 - """ ----- - -You can use the following method to produce values in batches. - -[source,python] ----- - for samples,df in P.iter_samples(file_num=10,columns=None): - # A batch contains ten samples. - print(samples,df) - - for df in P.iter_chunk(batch_size=500000,columns=None): - # A batch contains 500,000 rows. - print(df) - - for refs,df in P.iter_file(file_num=20,columns=None): # mzml - # A batch contains 20 mzML files. - print(refs,df) ----- - Project converter tool ------------------------- If your project comes from the PRIDE database, @@ -131,8 +37,8 @@ It's like this: * Optional parameter .. code:: shell - - --quantms_version Quantms version + --software_name software name used to generate the data + --software_version software version used to generate the data --delete_existing Delete existing files in the output folder(default False) DE converter tool @@ -201,8 +107,8 @@ It store peptide intensity to perform down-stream analysis and integration. * If you want to know more, please read :doc:`feature`. -In some projects, mzTab files can be very large, so we provide both `diskcache` and `no-diskcache` versions of the tool. -You can choose the desired version according to your server configuration. +Mztab +>>>>>>> Example: @@ -218,38 +124,34 @@ Example: .. code:: shell + --file_num Read batch size(default 50) --protein_file Protein file that meets specific requirements(protein.txt) + --partitions The field used for splitting files, multiple fields are separated by `,` --output_prefix_file The prefix of the result file(like {prefix}-{uu.id}-{extension}) + --duckdb_max_memory The maximum amount of memory allocated by the DuckDB engine (e.g 16GB) + --duckdb_threads The number of threads for the DuckDB engine (e.g 4) - -Psm converter tool ---------------------- - -The PSM table aims to cover detail on PSM level for AI/ML training and other use-cases. -It store details on PSM level including spectrum mz/intensity for specific use-cases such as AI/ML training. - -* If you want to know more, please read :doc:`psm`. +Maxquant +>>>>>>>>> Example: - + .. code:: shell - quantmsioc convert-psm - --mztab_file PXD014414.sdrf_openms_design_openms.mzTab + quantmsioc convert-maxquant-feature + --evidence_file evidence.txt + --sdrf_file PXD014414.sdrf.tsv --output_folder result * Optional parameter .. code:: shell + --chunksize Read batch size + --output_prefix_file Prefix of the parquet file needed to generate the file name - --protein_file Protein file that meets specific requirements(protein.txt) - --output_prefix_file The prefix of the result file(like {prefix}-{uu.id}-{extension}) - -DiaNN convert --------------------------- -For DiaNN, the command supports generating `feature.parquet` and `psm.parquet` directly from diann_report files. 
-* If you want to see `design_file`, please click `sdrf-pipelines `__ +DiaNN +>>>>>>> Example: @@ -257,80 +159,65 @@ Example: quantmsioc convert-diann --report_path diann_report.tsv - --design_file PXD037682.sdrf_openms_design.tsv --qvalue_threshold 0.05 --mzml_info_folder mzml --sdrf_path PXD037682.sdrf.tsv --output_folder result - --output_prefix_file PXD037682 - + * Optional parameter .. code:: shell + --protein_file Protein file that meets specific requirements + --output_prefix_file Prefix of the parquet file needed to generate the file name + --duckdb_max_memory The maximum amount of memory allocated by the DuckDB engine (e.g 16GB) + --duckdb_threads The number of threads for the DuckDB engine (e.g 4) + --file_num Read batch size(default 50) - --duckdb_max_memory The maximum amount of memory allocated by the DuckDB engine (e.g 16GB) - --duckdb_threads The number of threads for the DuckDB engine (e.g 4) - --file_num The number of files being processed at the same time (default 100) - -Maxquant convert -------------------------- -Convert Maxquant evidence.txt to parquet file -* file +Psm converter tool +--------------------- -Example: +The PSM table aims to cover details at the PSM level for AI/ML training and other use cases. +It stores details at the PSM level, including spectrum mz/intensity, for specific use cases such as AI/ML training. -.. code:: shell +* If you want to know more, please read :doc:`psm`. - quantmsioc convert-maxquant - --sdrf_file example.sdrf.tsv - --evidence_file evidence.tsv - --output_folder result - --output_prefix_file example +Mztab +>>>>>>> -* zip file +Example: + .. code:: shell - quantmsioc convert-zip-maxquant - --sdrf_file example.sdrf.tsv - --evidence_file requirement.txt + quantmsioc convert-psm + --mztab_file PXD014414.sdrf_openms_design_openms.mzTab --output_folder result - --output_prefix_file example -requirement.txt like this: +* Optional parameter .. code:: shell - #requirement.txt - 2013_04_03_16_54_Q-Exactive-Orbitrap_1.zip - 2013_04_03_17_47_Q-Exactive-Orbitrap_1.zip + --protein_file Protein file that meets specific requirements(protein.txt) + --chunksize Read batch size + --output_prefix_file The prefix of the result file(like {prefix}-{uu.id}-{extension}) -Inject some messages for DiaNN ------------------------------- -For DiaNN, some field information is not available and needs to be filled with other commands. +Maxquant +>>>>>>>>> -* bset-psm-scan-number Example: - + .. code:: shell - quantmsioc inject-bset-psm-scan-number - --diann_psm_path PXD010154-f75fbb29-4419-455f-a011-e4f776bcf73b.psm.parquet - --diann_feature_path PXD010154_map_protein_accession-88d63fca-3ae6-4eab-9262-6e7a68184432.feature.parquet - --output_path PXD010154.feature.parquet + quantmsioc convert-maxquant-psm + --msms_file msms.txt + --output_folder result -* start-and-end-pisition -Example: +* Optional parameter .. code:: shell - quantmsioc inject-start-and-end-from-fasta - --parquet_path PXD010154_map_protein_accession-88d63fca-3ae6-4eab-9262-6e7a68184432.feature.parquet - --fasta_path Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta - --label feature - --output_path PXD010154.feature.parquet - - + --chunksize Read batch size + --output_prefix_file The prefix of the result file(like {prefix}-{uu.id}-{extension}) Compare psm.parquet ------------------- @@ -352,11 +239,10 @@ Example: Generate spectra message ------------------------- -generate_spectra_message support psm and feature.
It can be used directly for spectral clustering. +generate_spectra_message supports psm. It can be used directly for spectral clustering. + -* `--label` contains two options: `psm` and `feature`. -* `--partion` contains two options: `charge` and `reference_file_name`. -Since the result file is too large, you can specify `–-partition` to split the result file. +Since the result file can be very large, you can specify `--partitions` to split the result file. Example: @@ -365,18 +251,18 @@ Example: quantmsioc map-spectrum-message-to-parquet --parquet_path PXD014414-f4fb88f6-0a45-451d-a8a6-b6d58fb83670.psm.parquet --mzml_directory mzmls - --output_path psm/PXD014414.parquet - --label psm - --file_num(default 10) - --partition charge + --output_folder result +* Optional parameter + +.. code:: shell + + --file_num The number of rows of parquet read using pandas streaming + --partitions The field used for splitting files, multiple fields are separated by `,` Generate gene message ------------------------- -generate_gene_message support psm and feature. - -* `--label` contains two options: `psm` and `feature`. -* `--map_parameter` contains two options: `map_protein_name` or `map_protein_accession`. +generate_gene_message supports feature. Example: @@ -385,14 +271,14 @@ Example: quantmsioc map-gene-msg-to-parquet --parquet_path PXD000672-0beee055-ae78-4d97-b6ac-1f191e91bdd4.featrue.parquet --fasta_path Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta - --output_path PXD000672-gene.parquet - --label feature - --map_parameter map_protein_name + --output_folder result * Optional parameter .. code:: shell + --file_num The number of rows of parquet read using pandas streaming + --partitions The field used for splitting files, multiple fields are separated by `,` --species species type(default human) * `species` +-------------+-------------------------+ @@ -420,69 +306,13 @@ +-------------+-------------------------+ -Map proteins accessions ------------------------ - -get_unanimous_name support parquet and tsv. For parquet, map_parameter -have two option (`map_protein_name` or `map_protein_accession`), and the -label controls whether it is PSM or Feature. - -* parquet -* `--label` contains two options: `psm` and `feature` - -Example: - -.. code:: shell - - quantmsioc labels convert-accession - --parquet_path PXD014414-f4fb88f6-0a45-451d-a8a6-b6d58fb83670.psm.parquet - --fasta Reference fasta database - --output_path psm/PXD014414.psm.parquet - --map_parameter map_protein_name - --label psm - -* tsv - -Example: - -.. code:: shell - - quantmsioc labels get-unanimous-for-tsv - --path PXD014414-c2a52d63-ea64-4a64-b241-f819a3157b77.differential.tsv - --fasta Reference fasta database - --output_path psm/PXD014414.de.tsv - --map_parameter map_protein_name - -Compare two parquet files -------------------------- -This tool is used to compare the feature.parquet file generated by two versions (`diskcache` or `no-diskcache`). - -Example: - -.. code:: shell - - quantmsioc compare-parquet - --parquet_path_one res_lfq2_discache.parquet - --parquet_path_two res_lfq2_no_cache.parquet - --report_path report.txt - -Generate report about files ------------------------------ -This tool is used to generate report about all project. - -Example: - -.. code:: shell - - quantmsioc generate-project-report - --project_folder PXD014414 Register file -------------------------- This tool is used to register the file to `project.json`. If your project comes from the PRIDE database, You can use this command to add file information for `project.json`.
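The registration can also be driven from Python instead of the CLI. The sketch below mirrors the `ProjectHandler` calls in `quantmsio/commands/attach_file_command.py` further down in this diff; the `ProjectHandler` import path, file names, and partition fields are illustrative assumptions, not taken from the source.

.. code:: python

    from quantmsio.core.project import ProjectHandler  # assumed import path

    partitions = "charge,reference_file_name"  # optional, comma-separated (illustrative)
    handler = ProjectHandler(project_json_file="PXD014414/project.json")
    handler.register_file(
        "PXD014414-example.feature.parquet",  # illustrative file name
        "feature_file",
        is_folder=False,
        partition_fields=partitions.split(",") if partitions else None,
        replace_existing=True,
    )
    handler.save_updated_project_info(output_file_name="PXD014414/project.json")

The CLI parameters accepted by the command are described below.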
-* The parameter `--category` has three options: `feature_file`, `psm_file`, `differential_file`, `absolute_file`.You can add the above file types. +* The parameter `--category` has five options: `sdrf_file`, `feature_file`, `psm_file`, `differential_file`, `absolute_file`. You can add the above file types. * The parameter `--replace_existing` is enable then we remove the old file and add this one. If not then we can have a list of files for a category. Example: @@ -493,41 +323,14 @@ Example: --project_file PXD014414/project.json --attach_file PXD014414-943a8f02-0527-4528-b1a3-b96de99ebe75.featrue.parquet --category feature_file - --replace_existing - -Convert file to json -------------------------- -This tool is used to convert file to json. - -* parquet -* `--data_type` contains two options: `psm` and `feature` -Example: - -.. code:: shell - - quantmsioc convert-parquet-json - --data_type feature - --parquet_path PXD014414-943a8f02-0527-4528-b1a3-b96de99ebe75.featrue.parquet - --json_path PXD014414.featrue.json -* tsv -Example: - -.. code:: shell - - quantmsioc json convert-tsv-to-json - --file PXD010154-51b34353-227f-4d38-a181-6d42824de9f7.absolute.tsv - --json_path PXD010154.ae.json - -* sdrf -Example: -.. code:: shell - - quantmsioc json convert-sdrf-to-json - --file MSV000079033-Blood-Plasma-iTRAQ.sdrf.tsv - --json_path MSV000079033.sdrf.json +* Optional parameter +.. code:: shell + --is_folder A boolean value that indicates if the file is a folder or not + --replace_existing Whether to delete old files + --partitions The fields that are used to partition the data in the file. This is used to optimize the data retrieval and filtering of the data. This field is optional. Statistics ----------- @@ -593,4 +396,6 @@ This tool is used for visualization. quantmsioc plot plot-box-intensity-distribution --feature_path PXD010154-51b34353-227f-4d38-a181-6d42824de9f7.featrue.parquet --num_samples 10 - --save_path PXD014414_psm_peptides.svg \ No newline at end of file + --save_path PXD014414_psm_peptides.svg + + diff --git a/quantmsio/commands/attach_file_command.py b/quantmsio/commands/attach_file_command.py index 298ccab1..e23114de 100644 --- a/quantmsio/commands/attach_file_command.py +++ b/quantmsio/commands/attach_file_command.py @@ -20,28 +20,28 @@ ) @click.option("--is_folder", help="A boolean value that indicates if the file is a folder or not", is_flag=True) @click.option( - "--partition_fields", - multiple=True, - type=str, - help="The fields that are used to partition the data in the file. This is used to optimize the data retrieval and filtering of the data. This field is optional", + "--partitions", + help="The field used for splitting files, multiple fields are separated by ,", required=False, ) @click.option("--replace_existing", help="Whether to delete old files", is_flag=True) -def attach_file_to_json(project_file, attach_file, category, is_folder, partition_fields, replace_existing): +def attach_file_to_json(project_file, attach_file, category, is_folder, partitions, replace_existing): """ Register the file with project.json :param path_name: The name of the file or folder :param file_category: quantms file category(e.g."protein_file","peptide_file","psm_file","differential_file",etc.) :param is_folder: A boolean value that indicates if the file is a folder or not. - :partition_fields: The fields that are used to partition the data in the file. This is used to optimize the data retrieval and filtering of the data. This field is optional.
+ :param partitions: The fields that are used to partition the data in the file. This is used to optimize the data retrieval and filtering of the data. This field is optional. :param replace_existing: Whether to delete old files """ + if partitions: + partitions = partitions.split(",") register = ProjectHandler(project_json_file=project_file) register.register_file( attach_file, category, is_folder=is_folder, - partition_fields=list(partition_fields), + partition_fields=partitions, replace_existing=replace_existing, ) register.save_updated_project_info(output_file_name=project_file) diff --git a/quantmsio/commands/generate_gene_message_command.py b/quantmsio/commands/generate_gene_message_command.py new file mode 100644 index 00000000..de77e945 --- /dev/null +++ b/quantmsio/commands/generate_gene_message_command.py @@ -0,0 +1,47 @@ +import click +from quantmsio.operate.tools import generate_feature_of_gene + + +@click.command( + "map-gene-message-to-parquet", + short_help="According to the fasta file, map the gene message to parquet", +) +@click.option("--parquet_path", help="Psm or feature parquet path") +@click.option("--fasta", help="fasta file") +@click.option( + "--output_folder", + help="Folder where the Json file will be generated", + required=True, +) +@click.option( + "--file_num", + help="The number of rows of parquet read using pandas streaming", + default=10, +) +@click.option( + "--partitions", + help="The field used for splitting files, multiple fields are separated by ,", + required=False, +) +@click.option("--species", help="species", default="human", required=False) +def map_gene_message_to_parquet( + parquet_path: str, + fasta: str, + output_folder: str, + file_num: int, + partitions: str = None, + species: str = "human", +): + """ + According to the fasta file, map the gene message to parquet.
+ :param parquet_path: psm_parquet_path or feature_parquet_path + :param fasta: fasta path + :param output_folder: Folder where the Json file will be generated + :param file_num: reference num + :param partitions: The field used for splitting files, multiple fields are separated by , + :param species: species + :return: None + """ + if partitions: + partitions = partitions.split(",") + generate_feature_of_gene(parquet_path, fasta, output_folder, file_num, partitions, species) \ No newline at end of file diff --git a/quantmsio/commands/generate_spectra_message_command.py b/quantmsio/commands/generate_spectra_message_command.py index fbe771b9..ea50a7be 100644 --- a/quantmsio/commands/generate_spectra_message_command.py +++ b/quantmsio/commands/generate_spectra_message_command.py @@ -1,5 +1,5 @@ import click -from quantmsio.operate.tools import generate_features_of_spectrum +from quantmsio.operate.tools import generate_psms_of_spectrum @click.command( @@ -8,11 +8,10 @@ ) @click.option("--parquet_path", help="Psm or feature parquet path") @click.option("--mzml_directory", help="mzml file folder") -@click.option("--output_path", help="output file path(.parquet)") @click.option( - "--label", - type=click.Choice(["feature", "psm"], case_sensitive=False), - help="parquet type", + "--output_folder", + help="Folder where the Json file will be generated", + required=True, ) @click.option( "--file_num", @@ -20,29 +19,26 @@ default=10, ) @click.option( - "--partition", - type=click.Choice(["charge", "reference_file_name"], case_sensitive=False), - help="partition", + "--partitions", + help="The field used for splitting files, multiple fields are separated by ,", required=False, ) def map_spectrum_message_to_parquet( parquet_path: str, mzml_directory: str, - output_path: str, - label: str, + output_folder: str, file_num: int, - partition: str = None, + partitions: str = None, ): """ according mzML file to map the spectrum message to parquet.
:param parquet_path: psm_parquet_path or feature_parquet_path :param mzml_directory: mzml file folder - :param output_path: output file path(extension[.parquet]) - :param label: feature or psm + :param output_folder: Folder where the Json file will be generated :param file_num: reference num - :param partition: charge or reference_file_name + :param partitions: The field used for splitting files, multiple fields are separated by , retrun: None """ - if not output_path.endswith("parquet"): - raise click.UsageError("Please provide file extension(.parquet)") - generate_features_of_spectrum(parquet_path, mzml_directory, output_path, label, file_num, partition) + if partitions: + partitions = partitions.split(",") + generate_psms_of_spectrum(parquet_path, mzml_directory, output_folder, file_num, partitions) diff --git a/quantmsio/commands/get_unanimous_command.py b/quantmsio/commands/get_unanimous_command.py deleted file mode 100644 index c2e21d80..00000000 --- a/quantmsio/commands/get_unanimous_command.py +++ /dev/null @@ -1,68 +0,0 @@ -import click - -from quantmsio.operate.tools import map_protein_for_parquet -from quantmsio.operate.tools import map_protein_for_tsv - -CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) - - -# Command Group -@click.group(name="labels", context_settings=CONTEXT_SETTINGS) -def labels(): - """Tool related commands""" - pass - - -@labels.command( - "convert-accession", - short_help="Map reported proteins to uniprot names/accessions in fasta file", -) -@click.option("--parquet_path", help="psm or feature parquet path") -@click.option("--fasta", help="Reference fasta database") -@click.option("--output_path", help="output file path") -@click.option( - "--map_parameter", - type=click.Choice(["map_protein_name", "map_protein_accession"], case_sensitive=False), - help="map type", -) -@click.option( - "--label", - type=click.Choice(["feature", "psm"], case_sensitive=False), - help="parquet type", -) -def get_unanimous_for_parquet(parquet_path, fasta, output_path, map_parameter, label): - """ - according fasta database to map the proteins accessions to uniprot names. - :param parquet_path: psm_parquet_path or feature_parquet_path - :param fasta: Reference fasta database - :param output_path: output file path - :param map_parameter: map_protein_name or map_protein_accession - :param label: feature or psm - return: None - """ - map_protein_for_parquet(parquet_path, fasta, output_path, map_parameter, label) - - -# tsv -@labels.command( - "get-unanimous-for-tsv", - short_help="According fasta database to map the proteins accessions to uniprot names.", -) -@click.option("--path", help="ae or de path") -@click.option("--fasta", help="Reference fasta database") -@click.option("--output_path", help="output file path") -@click.option( - "--map_parameter", - type=click.Choice(["map_protein_name", "map_protein_accession"], case_sensitive=False), - help="map type", -) -def get_unanimous_for_tsv(path, fasta, output_path, map_parameter): - """ - according fasta database, to map the proteins accessions to uniprot names. 
- :param path: de_path or ae_path - :param fasta: Reference fasta database - :param output_path: output file path - :param map_parameter: map_protein_name or map_protein_accession - retrun: None - """ - map_protein_for_tsv(path, fasta, output_path, map_parameter) diff --git a/quantmsio/commands/maxquant_command.py b/quantmsio/commands/maxquant_command.py index 4fbfbe15..60d35ace 100644 --- a/quantmsio/commands/maxquant_command.py +++ b/quantmsio/commands/maxquant_command.py @@ -104,7 +104,5 @@ def convert_maxquant_feature( output_prefix_file = "" MQ = MaxQuant() - output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet") - MQ.convert_feature_to_parquet( - evidence_path=evidence_file, sdrf_path=sdrf_file, output_path=output_path, chunksize=chunksize - ) + output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".feature.parquet") + MQ.convert_feature_to_parquet(evidence_path=evidence_file, sdrf_path=sdrf_file, output_path=output_path, chunksize=chunksize) diff --git a/quantmsio/commands/pg_matrix_command.py b/quantmsio/commands/pg_matrix_command.py deleted file mode 100644 index abe1551e..00000000 --- a/quantmsio/commands/pg_matrix_command.py +++ /dev/null @@ -1,44 +0,0 @@ -import click -from quantmsio.core.project import create_uuid_filename -from quantmsio.core.pg_matrix import PgMatrix - - -@click.command( - "convert-pg-matrix", - short_help="Convert psm from mzTab to parquet file in quantms io", -) -@click.option( - "--feature_file", - help="feature file", - required=True, -) -@click.option( - "--output_folder", - help="Folder where the parquet file will be generated", - required=True, -) -@click.option( - "--output_prefix_file", - help="Prefix of the parquet file needed to generate the file name", - required=False, -) -def convert_pg_matrix( - feature_file: str, - output_folder: str, - output_prefix_file: str, -): - """ - :param feature_file: feature file - :param output_folder: Folder where the Json file will be generated - :param output_prefix_file: Prefix of the Json file needed to generate the file name - """ - - if feature_file is None or output_folder is None: - raise click.UsageError("Please provide all the required parameters") - - if not output_prefix_file: - output_prefix_file = "" - - pg_manager = PgMatrix(feature_path=feature_file) - output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".pg.parquet") - pg_manager.write_to_file(output_path=output_path) diff --git a/quantmsio/core/common.py b/quantmsio/core/common.py index 797d8df2..88547760 100644 --- a/quantmsio/core/common.py +++ b/quantmsio/core/common.py @@ -76,6 +76,7 @@ "Modified sequence": "peptidoform", "Raw file": "reference_file_name", "Score": "additional_scores", + "PIF": "parent_ion_score" } MAXQUANT_FEATURE_MAP = { diff --git a/quantmsio/core/maxquant.py b/quantmsio/core/maxquant.py index c88afd5a..8ae0f9de 100644 --- a/quantmsio/core/maxquant.py +++ b/quantmsio/core/maxquant.py @@ -247,7 +247,7 @@ def main_operate(self, df: pd.DataFrame): df["additional_scores"] = df["additional_scores"].apply( lambda x: [{"score_name": "maxquant_score", "score_value": np.float32(x)}] ) - df.loc[:, "cv_params"] = None + df.loc[:, "cv_params"] = df["parent_ion_score"].apply(lambda socre: [{"cv_name": "parent_ion_score", "cv_value": str(socre)}]) df.loc[:, "predicted_rt"] = None df.loc[:, "ion_mobility"] = None return df diff --git a/quantmsio/operate/plots.py b/quantmsio/operate/plots.py index 4d2564ea..b7c64f2e 100644 --- 
a/quantmsio/operate/plots.py +++ b/quantmsio/operate/plots.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd import seaborn as sns - +from quantmsio.operate.tools import transform_ibaq def plot_distribution_of_ibaq(ibaq_path: str, save_path: str = None, selected_column: str = None) -> None: """ @@ -107,7 +107,8 @@ def plot_intensity_distribution_of_samples(feature_path: str, save_path: str = N :param save_path: save path[xxx.svg] :param num_samples: number of samples to plot """ - df = pd.read_parquet(feature_path, columns=["sample_accession", "intensity"]) + df = pd.read_parquet(feature_path, columns=["intensities"]) + df = transform_ibaq(df) sample_accessions = df["sample_accession"].unique().tolist() random.shuffle(sample_accessions) if len(sample_accessions) > num_samples: @@ -133,7 +134,8 @@ def plot_peptide_distribution_of_protein(feature_path: str, save_path: str = Non :param save_path: save path[xxx.svg] :param num_samples: number of samples to plot """ - df = pd.read_parquet(feature_path, columns=["sample_accession", "sequence"]) + df = pd.read_parquet(feature_path, columns=["intensities", "sequence"]) + df = transform_ibaq(df) df = df.groupby(["sample_accession"]).agg({"sequence": "count"}).reset_index() df.columns = ["sample", "peptides"] df = df.sample(frac=1).reset_index(drop=True) @@ -160,7 +162,8 @@ def plot_intensity_box_of_samples(feature_path: str, save_path: str = None, num_ :param save_path: save path[xxx.svg] :param num_samples: number of samples to plot """ - df = pd.read_parquet(feature_path, columns=["sample_accession", "intensity"]) + df = pd.read_parquet(feature_path, columns=["intensities"]) + df = transform_ibaq(df) sample_accessions = df["sample_accession"].unique().tolist() random.shuffle(sample_accessions) diff --git a/quantmsio/operate/query.py b/quantmsio/operate/query.py index 22c6660f..4f4fc339 100644 --- a/quantmsio/operate/query.py +++ b/quantmsio/operate/query.py @@ -1,21 +1,21 @@ import os import re -# from collections import defaultdict +from collections import defaultdict import ahocorasick import duckdb -# import mygene +import mygene import pandas as pd import pyarrow.parquet as pq from Bio import SeqIO from quantmsio.core.openms import OpenMSHandler -# from quantmsio.utils.pride_utils import generate_gene_name_map -# from quantmsio.utils.pride_utils import get_gene_accessions -# from quantmsio.utils.pride_utils import get_unanimous_name +from quantmsio.utils.pride_utils import generate_gene_name_map +from quantmsio.utils.pride_utils import get_gene_accessions +from quantmsio.utils.pride_utils import get_unanimous_name def check_string(re_exp, strings): @@ -160,28 +160,33 @@ def inject_position_msg(self, df: pd.DataFrame, protein_dict: dict): ) return df - # def inject_gene_msg( - # self, - # df: pd.DataFrame, - # fasta: str, - # map_parameter: str = "map_protein_accession", - # species: str = "human", - # ): - # """ - # :params df: parquet file - # :params fasta: refence fasta file - # :params map_parameter: map_protein_name or map_protein_accession - # :params species: default human - # :return df - # """ - # map_gene_names = generate_gene_name_map(fasta, map_parameter) - # df["gene_names"] = df["protein_accessions"].apply(lambda x: get_unanimous_name(x, map_gene_names)) - # gene_list = self.get_gene_list(map_gene_names) - # gene_accessions = self.get_gene_accessions(gene_list, species) - # df["gene_accessions"] = df["gene_names"].apply(lambda x: get_gene_accessions(x, gene_accessions)) - - # return df + def inject_gene_msg( + self, + 
df: pd.DataFrame, + map_gene_names: dict, + species: str = "human", + ): + """ + :param df: parquet dataframe (feature or psm) + :param map_gene_names: protein accession to gene name map (see get_protein_to_gene_map) + :param species: default human + :return: df + """ + if "pg_accessions" in df.columns: + df["gg_names"] = df["pg_accessions"].apply(lambda x: get_unanimous_name(x, map_gene_names)) + else: + df["gg_names"] = df["mp_accessions"].apply(lambda x: get_unanimous_name(x, map_gene_names)) + gene_list = list(set([gene for gene_names in df["gg_names"] if gene_names is not None for gene in gene_names])) + gene_accessions = self.get_gene_accessions(gene_list, species) + df["gg_accessions"] = df["gg_names"].apply(lambda x: get_gene_accessions(x, gene_accessions)) + + return df + def get_protein_to_gene_map(self, fasta: str, map_parameter: str = "map_protein_accession"): + map_gene_names = generate_gene_name_map(fasta, map_parameter) + return map_gene_names + def get_protein_dict(self, fasta_path): """ return: protein_map {protein_accession:seq} @@ -232,9 +237,12 @@ def get_unique_proteins(self): """ return: A list of deduplicated proteins. """ - unique_prts = self.parquet_db.sql("SELECT DISTINCT pg_accessions FROM parquet_db").df() - - return unique_prts["pg_accessions"].tolist() + unique_prts = self.parquet_db.sql("SELECT mp_accessions FROM parquet_db").df() + proteins = set() + for protein_accessions in unique_prts["mp_accessions"]: + if protein_accessions is not None: + proteins.update(set(protein_accessions)) + return list(proteins) def get_unique_genes(self): """ @@ -301,25 +309,15 @@ def query_protein(self, protein: str, columns: list = None): else: raise KeyError("Illegal protein!") - # def get_gene_list(self, map_gene_names: dict): - # """ - # :params map_gene_names: protenin => gene - # return: unique gene list - # """ - # unique_prts = self.get_unique_proteins() - # gene_names = [get_unanimous_name(proteins, map_gene_names) for proteins in unique_prts] - # gene_list = list(set([item for sublist in gene_names for item in sublist])) - - # return gene_list - - # def get_gene_accessions(self, gene_list: list, species: str = "human"): - # """ - # :params gene_list - # """ - # mg = mygene.MyGeneInfo() - # gene_accessions = mg.querymany(gene_list, scopes="symbol", species=species, fields="accession") - # gene_accessions_maps = defaultdict(list) - # for obj in gene_accessions: - # if "accession" in obj: - # gene_accessions_maps[obj["query"]].append(obj["accession"]) - # return gene_accessions_maps + def get_gene_accessions(self, gene_list: list, species: str = "human"): + """ + :param gene_list: list of gene names + """ + mg = mygene.MyGeneInfo() + gene_accessions = mg.querymany(gene_list, scopes="symbol", species=species, fields="accession") + gene_accessions_maps = defaultdict(list) + for obj in gene_accessions: + if "accession" in obj and "genomic" in obj["accession"]: + gene_accessions_maps[obj["query"]] = ",".join(obj["accession"]["genomic"]) + return gene_accessions_maps + \ No newline at end of file diff --git a/quantmsio/operate/tools.py b/quantmsio/operate/tools.py index 612b9133..3580c0ba 100644 --- a/quantmsio/operate/tools.py +++ b/quantmsio/operate/tools.py @@ -14,7 +14,21 @@ from quantmsio.utils.file_utils import load_de_or_ae, read_large_parquet -def generate_features_of_spectrum( +def init_save_info(parquet_path: str): + pqwriters = {} + pqwriter_no_part = None + filename = os.path.basename(parquet_path) + return pqwriters, pqwriter_no_part, filename + +def close_file(partitions: list, pqwriters: dict,
pqwriter_no_part: str): + if not partitions or len(partitions) == 0: + if pqwriter_no_part: + pqwriter_no_part.close() + else: + for pqwriter in pqwriters.values(): + pqwriter.close() + +def generate_psms_of_spectrum( parquet_path: str, mzml_directory: str, output_folder: str, @@ -25,9 +39,7 @@ def generate_features_of_spectrum( parquet_path: parquet file path mzml_directory: mzml file directory path """ - pqwriters = {} - pqwriter_no_part = None - filename = os.path.basename(parquet_path) + pqwriters, pqwriter_no_part, filename = init_save_info(parquet_path) p = Query(parquet_path) for _, table in p.iter_file(file_num=file_num): refs = table["reference_file_name"].unique() @@ -42,77 +54,50 @@ def generate_features_of_spectrum( axis=1, result_type="expand", ) - if not partitions or len(partitions) > 0: - for key, df in table.groupby(partitions): - parquet_table = pa.Table.from_pandas(df, schema=PSM_SCHEMA) - folder = [output_folder] + [str(col) for col in key] - folder = os.path.join(*folder) - if not os.path.exists(folder): - os.makedirs(folder, exist_ok=True) - save_path = os.path.join(*[folder, filename]) - if not os.path.exists(save_path): - pqwriter = pq.ParquetWriter(save_path, parquet_table.schema) - pqwriters[key] = pqwriter - pqwriters[key].write_table(parquet_table) - else: - parquet_table = pa.Table.from_pandas(table, schema=PSM_SCHEMA) - if not os.path.exists(output_folder): - os.makedirs(output_folder, exist_ok=True) - save_path = os.path.join(*[output_folder, filename]) - if not pqwriter_no_part: - pqwriter_no_part = pq.ParquetWriter(save_path, parquet_table.schema) - pqwriter_no_part.write_table(parquet_table) - if not partitions or len(partitions) == 0: - if pqwriter_no_part: - pqwriter_no_part.close() + pqwriters, pqwriter_no_part = save_parquet_file(partitions, table, output_folder, filename, pqwriters, pqwriter_no_part, PSM_SCHEMA) + close_file(partitions, pqwriters, pqwriter_no_part) + + +def save_parquet_file(partitions, table, output_folder, filename, pqwriters = {}, pqwriter_no_part=None, schema=FEATURE_SCHEMA): + + if partitions and len(partitions) > 0: + for key, df in table.groupby(partitions): + parquet_table = pa.Table.from_pandas(df, schema=schema) + folder = [output_folder] + [str(col) for col in key] + folder = os.path.join(*folder) + if not os.path.exists(folder): + os.makedirs(folder, exist_ok=True) + save_path = os.path.join(*[folder, filename]) + if not os.path.exists(save_path): + pqwriter = pq.ParquetWriter(save_path, parquet_table.schema) + pqwriters[key] = pqwriter + pqwriters[key].write_table(parquet_table) + return pqwriters, pqwriter_no_part else: - for pqwriter in pqwriters.values(): - pqwriter.close() - - -# gei unqnimous name -def map_protein_for_parquet(parquet_path, fasta, output_path, map_parameter, label): - """ - according fasta database to map the proteins accessions to uniprot names. 
- :param parquet_path: psm_parquet_path or feature_parquet_path - :param fasta: Reference fasta database - :param output_path: output file path - :param map_parameter: map_protein_name or map_protein_accession - :param label: feature or psm - retrun: None - """ - - map_protein_names = defaultdict(set) - if map_parameter == "map_protein_name": - for seq_record in SeqIO.parse(fasta, "fasta"): - accession, name = seq_record.id.split("|")[1:] - map_protein_names[seq_record.id].add(name) - map_protein_names[accession].add(name) - map_protein_names[name].add(name) - else: - for seq_record in SeqIO.parse(fasta, "fasta"): - accession, name = seq_record.id.split("|")[1:] - map_protein_names[seq_record.id].add(accession) - map_protein_names[accession].add(accession) - map_protein_names[name].add(accession) - change_and_save_parquet(parquet_path, map_protein_names, output_path, label) - - -def change_and_save_parquet(parquet_path, map_dict, output_path, label): - pqwriter = None - for df in read_large_parquet(parquet_path): - df["protein_accessions"] = df["protein_accessions"].apply(lambda x: get_unanimous_name(x, map_dict)) - if label == "feature": - schema = FEATURE_SCHEMA - elif label == "psm": - schema = PSM_SCHEMA - parquet_table = pa.Table.from_pandas(df, schema=schema) - if not pqwriter: - pqwriter = pq.ParquetWriter(output_path, parquet_table.schema) - pqwriter.write_table(parquet_table) - if pqwriter: - pqwriter.close() - + parquet_table = pa.Table.from_pandas(table, schema=schema) + if not os.path.exists(output_folder): + os.makedirs(output_folder, exist_ok=True) + save_path = os.path.join(*[output_folder, filename]) + if not pqwriter_no_part: + pqwriter_no_part = pq.ParquetWriter(save_path, parquet_table.schema) + pqwriter_no_part.write_table(parquet_table) + return pqwriters, pqwriter_no_part + +def generate_feature_of_gene( + parquet_path: str, + fasta: str, + output_folder: str, + file_num: int, + partitions: list = None, + species: str = "human" +): + pqwriters, pqwriter_no_part, filename = init_save_info(parquet_path) + p = Query(parquet_path) + map_gene_names = p.get_protein_to_gene_map(fasta) + for _, table in p.iter_file(file_num=file_num): + table = p.inject_gene_msg(table, map_gene_names, species) + pqwriters, pqwriter_no_part = save_parquet_file(partitions, table, output_folder, filename, pqwriters, pqwriter_no_part) + close_file(partitions, pqwriters, pqwriter_no_part) def map_protein_for_tsv(path: str, fasta: str, output_path: str, map_parameter: str): """ diff --git a/quantmsio/quantmsioc.py b/quantmsio/quantmsioc.py index 2bac340a..635dbe6f 100644 --- a/quantmsio/quantmsioc.py +++ b/quantmsio/quantmsioc.py @@ -13,11 +13,10 @@ from quantmsio.commands.ae_command import convert_ibaq_absolute from quantmsio.commands.de_command import convert_msstats_differential from quantmsio.commands.attach_file_command import attach_file_to_json -from quantmsio.commands.get_unanimous_command import get_unanimous_for_parquet, get_unanimous_for_tsv from quantmsio.commands.generate_spectra_message_command import map_spectrum_message_to_parquet +from quantmsio.commands.generate_gene_message_command import map_gene_message_to_parquet from quantmsio.commands.plot_command import plot from quantmsio.commands.statistic_command import statistics -from quantmsio.commands.pg_matrix_command import convert_pg_matrix from quantmsio.commands.maxquant_command import convert_maxquant_psm, convert_maxquant_feature from quantmsio.commands.ibaq_command import convert_ibaq_file @@ -42,19 +41,14 @@ def cli(): 
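# Editor's sketch (not part of this diff): a quick smoke test that the commands
# registered below are wired into the click group, assuming `cli` is importable
# from quantmsio.quantmsioc as defined in this file.
from click.testing import CliRunner

from quantmsio.quantmsioc import cli

runner = CliRunner()
result = runner.invoke(cli, ["map-gene-message-to-parquet", "--help"])
assert result.exit_code == 0  # the new gene command is registered
print(result.output)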
cli.add_command(convert_ibaq_absolute) cli.add_command(convert_msstats_differential) cli.add_command(attach_file_to_json) -cli.add_command(get_unanimous_for_parquet) -cli.add_command(get_unanimous_for_tsv) cli.add_command(map_spectrum_message_to_parquet) +cli.add_command(map_gene_message_to_parquet) cli.add_command(plot) cli.add_command(statistics) -cli.add_command(convert_pg_matrix) cli.add_command(convert_maxquant_psm) cli.add_command(convert_maxquant_feature) cli.add_command(convert_ibaq_file) -# cli.add_command(convert_maxquant) -# cli.add_command(convert_zip_maxquant) - def quantms_io_main(): """ diff --git a/quantmsio/utils/pride_utils.py b/quantmsio/utils/pride_utils.py index cfb9a812..b3c96ffd 100644 --- a/quantmsio/utils/pride_utils.py +++ b/quantmsio/utils/pride_utils.py @@ -27,7 +27,10 @@ def get_unanimous_name(protein_accessions, map_dict): for accession in protein_accessions: if accession in map_dict: unqnimous_names.append(list(map_dict[accession])[0]) - return unqnimous_names + if len(unqnimous_names) > 0: + return unqnimous_names + else: + return None def generate_gene_name_map(fasta, map_parameter): @@ -54,15 +57,19 @@ def generate_gene_name_map(fasta, map_parameter): def get_gene_accessions(gene_list, map_dict): - if len(gene_list) == 0: - return [] + if gene_list is None or len(gene_list) == 0: + return None else: accessions = [] for gene in gene_list: - accession = map_dict[gene] - if len(accession) > 0: - accessions.append(str(accession[0])) - return accessions + if gene in map_dict: + accession = map_dict[gene] + if accession: + accessions.append(accession) + if len(accessions) > 0: + return accessions + else: + return None def generate_scan_number(spectra_ref: str):
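# Editor's sketch (not part of this diff): the updated helpers in
# quantmsio/utils/pride_utils.py now return None instead of empty lists when
# nothing can be mapped. The toy dictionaries below are made up for illustration.
from quantmsio.utils.pride_utils import get_gene_accessions, get_unanimous_name

name_map = {"sp|P36016|LHS1_YEAST": {"LHS1_YEAST"}}  # protein accession -> set of names
print(get_unanimous_name(["sp|P36016|LHS1_YEAST"], name_map))  # ['LHS1_YEAST']
print(get_unanimous_name(["UNKNOWN"], name_map))  # None (previously an empty list)

gene_map = {"LHS1": "NM_000000.0"}  # gene symbol -> genomic accessions (toy value)
print(get_gene_accessions(["LHS1"], gene_map))  # ['NM_000000.0']
print(get_gene_accessions(None, gene_map))  # None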