diff --git a/quantmsio/commands/absolute_expression_command.py b/quantmsio/commands/absolute_expression_command.py index 2316a93..38c2006 100644 --- a/quantmsio/commands/absolute_expression_command.py +++ b/quantmsio/commands/absolute_expression_command.py @@ -22,15 +22,9 @@ help="quantms.io project file", required=False, ) -@click.option( - "--output_folder", help="Folder to generate the df expression file.", required=True -) -@click.option( - "--output_prefix_file", help="Prefix of the df expression file", required=False -) -@click.option( - "--delete_existing", help="Delete existing files in the output folder", is_flag=True -) +@click.option("--output_folder", help="Folder to generate the df expression file.", required=True) +@click.option("--output_prefix_file", help="Prefix of the df expression file", required=False) +@click.option("--delete_existing", help="Delete existing files in the output folder", is_flag=True) def convert_ibaq_absolute( ibaq_file: str, sdrf_file: str, diff --git a/quantmsio/commands/attach_file_command.py b/quantmsio/commands/attach_file_command.py index 616ca14..b933b3d 100644 --- a/quantmsio/commands/attach_file_command.py +++ b/quantmsio/commands/attach_file_command.py @@ -8,9 +8,7 @@ short_help="Register the file to project.json.", ) @click.option("--project_file", help="the project.json file", required=True) -@click.option( - "--attach_file", help="The path of the file that will be registered", required=True -) +@click.option("--attach_file", help="The path of the file that will be registered", required=True) @click.option( "--category", type=click.Choice( diff --git a/quantmsio/commands/diann_convert_command.py b/quantmsio/commands/diann_convert_command.py index b68f9f7..272ee5f 100644 --- a/quantmsio/commands/diann_convert_command.py +++ b/quantmsio/commands/diann_convert_command.py @@ -20,9 +20,7 @@ help="the design file path", required=True, ) -@click.option( - "--qvalue_threshold", help="qvalue_threshold", required=True, 
default=0.05 -) +@click.option("--qvalue_threshold", help="qvalue_threshold", required=True, default=0.05) @click.option( "--mzml_info_folder", help="the foldef of mzml_info tsv file", @@ -47,9 +45,7 @@ "--duckdb_max_memory", help="The maximum amount of memory allocated by the DuckDB engine (e.g 4GB)", ) -@click.option( - "--duckdb_threads", help="The number of threads for the DuckDB engine (e.g 4)" -) +@click.option("--duckdb_threads", help="The number of threads for the DuckDB engine (e.g 4)") @click.option( "--file_num", help="The number of files being processed at the same time", @@ -85,14 +81,8 @@ def diann_convert_to_parquet( if not output_prefix_file: output_prefix_file = "" - feature_output_path = ( - output_folder - + "/" - + create_uuid_filename(output_prefix_file, ".feature.parquet") - ) - psm_output_path = ( - output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet") - ) + feature_output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".feature.parquet") + psm_output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet") dia_nn = DiaNNConvert() diff --git a/quantmsio/commands/differential_expression_command.py b/quantmsio/commands/differential_expression_command.py index 87b3fa5..82dad81 100644 --- a/quantmsio/commands/differential_expression_command.py +++ b/quantmsio/commands/differential_expression_command.py @@ -24,15 +24,9 @@ required=False, default="0.05", ) -@click.option( - "--output_folder", help="Folder to generate the df expression file.", required=True -) -@click.option( - "--output_prefix_file", help="Prefix of the df expression file", required=False -) -@click.option( - "--delete_existing", help="Delete existing files in the output folder", is_flag=True -) +@click.option("--output_folder", help="Folder to generate the df expression file.", required=True) +@click.option("--output_prefix_file", help="Prefix of the df expression file", required=False) 
+@click.option("--delete_existing", help="Delete existing files in the output folder", is_flag=True) def convert_msstats_differential( msstats_file: str, sdrf_file: str, diff --git a/quantmsio/commands/feature_command.py b/quantmsio/commands/feature_command.py index 9919346..3848712 100644 --- a/quantmsio/commands/feature_command.py +++ b/quantmsio/commands/feature_command.py @@ -62,12 +62,7 @@ def convert_feature_file( :return: none """ - if ( - sdrf_file is None - or msstats_file is None - or mztab_file is None - or output_folder is None - ): + if sdrf_file is None or msstats_file is None or mztab_file is None or output_folder is None: raise click.UsageError("Please provide all the required parameters") if use_cache is None: use_cache = False @@ -75,11 +70,7 @@ def convert_feature_file( feature_manager = FeatureHandler() if not output_prefix_file: output_prefix_file = "" - feature_manager.parquet_path = ( - output_folder - + "/" - + create_uuid_filename(output_prefix_file, ".feature.parquet") - ) + feature_manager.parquet_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".feature.parquet") if consensusxml_file is not None: feature_manager.convert_mztab_msstats_to_feature( mztab_file=mztab_file, diff --git a/quantmsio/commands/generate_gene_msg_command.py b/quantmsio/commands/generate_gene_msg_command.py index 75faa7a..cc5cf52 100644 --- a/quantmsio/commands/generate_gene_msg_command.py +++ b/quantmsio/commands/generate_gene_msg_command.py @@ -14,9 +14,7 @@ ) @click.option( "--map_parameter", - type=click.Choice( - ["map_protein_name", "map_protein_accession"], case_sensitive=False - ), + type=click.Choice(["map_protein_name", "map_protein_accession"], case_sensitive=False), help="map type", ) @click.option("--species", help="species", default="human") @@ -40,6 +38,4 @@ def map_gene_msg_to_parquet( if not output_path.endswith("parquet"): raise click.UsageError("Please provide file extension(.parquet)") - map_gene_msgs_to_parquet( - 
parquet_path, fasta_path, map_parameter, output_path, label, species - ) + map_gene_msgs_to_parquet(parquet_path, fasta_path, map_parameter, output_path, label, species) diff --git a/quantmsio/commands/generate_project_report_command.py b/quantmsio/commands/generate_project_report_command.py index 4fddb48..23f8490 100644 --- a/quantmsio/commands/generate_project_report_command.py +++ b/quantmsio/commands/generate_project_report_command.py @@ -7,9 +7,7 @@ "generate-project-report", short_help="generate report of project " "format", ) -@click.option( - "--project_folder", help="Folder to generate the df expression file.", required=True -) +@click.option("--project_folder", help="Folder to generate the df expression file.", required=True) def generate_report_about_project(project_folder): """ project_folder: The folder path for the full project. diff --git a/quantmsio/commands/generate_spectra_message_command.py b/quantmsio/commands/generate_spectra_message_command.py index a7e9492..a46d5bd 100644 --- a/quantmsio/commands/generate_spectra_message_command.py +++ b/quantmsio/commands/generate_spectra_message_command.py @@ -46,6 +46,4 @@ def map_spectrum_message_to_parquet( """ if not output_path.endswith("parquet"): raise click.UsageError("Please provide file extension(.parquet)") - generate_features_of_spectrum( - parquet_path, mzml_directory, output_path, label, file_num, partition - ) + generate_features_of_spectrum(parquet_path, mzml_directory, output_path, label, file_num, partition) diff --git a/quantmsio/commands/generate_start_and_end_command.py b/quantmsio/commands/generate_start_and_end_command.py index 8d68745..f81ef0d 100644 --- a/quantmsio/commands/generate_start_and_end_command.py +++ b/quantmsio/commands/generate_start_and_end_command.py @@ -15,9 +15,7 @@ help="parquet type", ) @click.option("--output_path", help="save path", required=True) -def inject_start_and_end_from_fasta( - parquet_path: str, fasta_path: str, label: str, output_path: str -): +def 
inject_start_and_end_from_fasta(parquet_path: str, fasta_path: str, label: str, output_path: str): """ Register the file with project.json :param parquet_path: psm or feature parquet file path diff --git a/quantmsio/commands/get_unanimous_command.py b/quantmsio/commands/get_unanimous_command.py index 1004e3a..b56275c 100644 --- a/quantmsio/commands/get_unanimous_command.py +++ b/quantmsio/commands/get_unanimous_command.py @@ -22,9 +22,7 @@ def labels(): @click.option("--output_path", help="output file path") @click.option( "--map_parameter", - type=click.Choice( - ["map_protein_name", "map_protein_accession"], case_sensitive=False - ), + type=click.Choice(["map_protein_name", "map_protein_accession"], case_sensitive=False), help="map type", ) @click.option( @@ -55,9 +53,7 @@ def get_unanimous_for_parquet(parquet_path, fasta, output_path, map_parameter, l @click.option("--output_path", help="output file path") @click.option( "--map_parameter", - type=click.Choice( - ["map_protein_name", "map_protein_accession"], case_sensitive=False - ), + type=click.Choice(["map_protein_name", "map_protein_accession"], case_sensitive=False), help="map type", ) def get_unanimous_for_tsv(path, fasta, output_path, map_parameter): diff --git a/quantmsio/commands/load_best_scan_number_command.py b/quantmsio/commands/load_best_scan_number_command.py index fcfc88b..682a703 100644 --- a/quantmsio/commands/load_best_scan_number_command.py +++ b/quantmsio/commands/load_best_scan_number_command.py @@ -8,13 +8,9 @@ short_help="inject bset_psm_scan_number to feature", ) @click.option("--diann_psm_path", help="diann psm parquet file path", required=True) -@click.option( - "--diann_feature_path", help="diann feature parquet file path", required=True -) +@click.option("--diann_feature_path", help="diann feature parquet file path", required=True) @click.option("--output_path", help="save path", required=True) -def inject_bset_psm_scan_number( - diann_psm_path: str, diann_feature_path: str, 
output_path: str -): +def inject_bset_psm_scan_number(diann_psm_path: str, diann_feature_path: str, output_path: str): """ Register the file with project.json :param diann_psm_path: diann psm parquet file path diff --git a/quantmsio/commands/plot.py b/quantmsio/commands/plot.py index 53a5009..afd1df5 100644 --- a/quantmsio/commands/plot.py +++ b/quantmsio/commands/plot.py @@ -32,21 +32,15 @@ def plot_peptides(ctx, psm_parquet_path: str, sdrf_path: str, save_path: str): :param save_path: img save path [xxx.png] :return: none """ - plot_peptides_of_lfq_condition( - psm_parquet_path=psm_parquet_path, sdrf_path=sdrf_path, save_path=save_path - ) + plot_peptides_of_lfq_condition(psm_parquet_path=psm_parquet_path, sdrf_path=sdrf_path, save_path=save_path) -@plot.command( - "plot-ibaq-distribution", short_help="plot ibaq distribution of expression" -) +@plot.command("plot-ibaq-distribution", short_help="plot ibaq distribution of expression") @click.option("--ibaq_path", help="ibaq file path", required=True) @click.option("--save_path", help="img save path [xxx.svg]", required=True) @click.option("--select_column", help="Selected column in Ibaq File", required=False) @click.pass_context -def plot_ibaq_distribution( - ctx, ibaq_path: str, save_path: str, select_column: str -) -> None: +def plot_ibaq_distribution(ctx, ibaq_path: str, save_path: str, select_column: str) -> None: """ plot ibaq distribution of expression :param ibaq_path: ibaq file path @@ -65,9 +59,7 @@ def plot_ibaq_distribution( @click.option("--save_path", help="img save path [xxx.svg]", required=True) @click.option("--num_samples", help="The number of samples plotted", default=10) @click.pass_context -def plot_kde_intensity_distribution( - feature_path: str, save_path: str, num_samples: int -): +def plot_kde_intensity_distribution(feature_path: str, save_path: str, num_samples: int): """ plot ibaq distribution of expression :param feature_path: feature file path @@ -107,9 +99,7 @@ def 
plot_bar_peptide_distribution(feature_path: str, save_path: str, num_samples @click.option("--save_path", help="img save path [xxx.svg]", required=True) @click.option("--num_samples", help="The number of samples plotted", default=10) @click.pass_context -def plot_box_intensity_distribution( - feature_path: str, save_path: str, num_samples: int -): +def plot_box_intensity_distribution(feature_path: str, save_path: str, num_samples: int): """ plot ibaq distribution of expression :param feature_path: feature file path diff --git a/quantmsio/commands/psm_command.py b/quantmsio/commands/psm_command.py index a6ca3d0..79659e4 100644 --- a/quantmsio/commands/psm_command.py +++ b/quantmsio/commands/psm_command.py @@ -31,9 +31,7 @@ help="Prefix of the parquet file needed to generate the file name", required=False, ) -@click.option( - "--verbose", help="Output debug information.", default=False, is_flag=True -) +@click.option("--verbose", help="Output debug information.", default=False, is_flag=True) def convert_psm_file( mztab_file: str, output_folder: str, @@ -57,9 +55,7 @@ def convert_psm_file( output_prefix_file = "" psm_manager = PSMHandler() - psm_manager.parquet_path = ( - output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet") - ) + psm_manager.parquet_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet") psm_manager.convert_mztab_to_psm( mztab_path=mztab_file, parquet_path=psm_manager.parquet_path, @@ -69,9 +65,7 @@ def convert_psm_file( @click.command("compare-set-psms", short_help="plot venn for a set of Psms parquet") -@click.option( - "-p", "--parquets", type=str, help="List of psm parquet path", multiple=True -) +@click.option("-p", "--parquets", type=str, help="List of psm parquet path", multiple=True) @click.option("-t", "--tags", type=str, help="List of parquet label", multiple=True) def compare_set_of_psms(parquets, tags): """ @@ -80,9 +74,7 @@ def compare_set_of_psms(parquets, tags): :param tags: a set 
of psm label """ if len(parquets) != len(tags): - raise click.UsageError( - "Please provide same length of parquet_list and label_list" - ) + raise click.UsageError("Please provide same length of parquet_list and label_list") plot_peptidoform_charge_venn(parquets, tags) plot_sequence_venn(parquets, tags) diff --git a/quantmsio/commands/statistics.py b/quantmsio/commands/statistics.py index 200d512..38b1399 100644 --- a/quantmsio/commands/statistics.py +++ b/quantmsio/commands/statistics.py @@ -23,8 +23,7 @@ def statistics(): @click.option("--parquet_path", help="psm parquet path in lfq", required=True) @click.option( "--save_path", - help="file with the statistics (e.g. statistics.csv), if not provided," - " will print to stdout", + help="file with the statistics (e.g. statistics.csv), if not provided," " will print to stdout", ) @click.pass_context def feature_file_statistics(ctx, absolute_path: str, parquet_path: str, save_path: str): @@ -41,15 +40,11 @@ def write_stats(file, stats: ParquetStatistics): file.write("Number of proteins: {}\n".format(stats.get_number_of_proteins())) file.write("Number of peptides: {}\n".format(stats.get_number_of_peptides())) file.write("Number of samples: {}\n".format(stats.get_number_of_samples())) - file.write( - "Number of peptidoforms: {}\n".format(stats.get_number_of_peptidoforms()) - ) + file.write("Number of peptidoforms: {}\n".format(stats.get_number_of_peptidoforms())) file.write("Number of msruns: {}\n".format(stats.get_number_msruns())) def write_absolute_stats(file, stats: IbaqStatistics): - file.write( - "Ibaq Number of proteins: {}\n".format(stats.get_number_of_proteins()) - ) + file.write("Ibaq Number of proteins: {}\n".format(stats.get_number_of_proteins())) file.write("Ibaq Number of samples: {}\n".format(stats.get_number_of_samples())) if save_path: @@ -70,8 +65,7 @@ def write_absolute_stats(file, stats: IbaqStatistics): @click.option("--parquet_path", help="psm parquet path in lfq", required=True) @click.option( 
"--save_path", - help="file with the statistics (e.g. statistics.csv), if not provided," - " will print to stdout", + help="file with the statistics (e.g. statistics.csv), if not provided," " will print to stdout", ) @click.pass_context def parquet_psm_statistics(ctx, parquet_path: str, save_path: str): @@ -85,9 +79,7 @@ def parquet_psm_statistics(ctx, parquet_path: str, save_path: str): def write_stats(file, stats: ParquetStatistics): file.write("Number of proteins: {}\n".format(stats.get_number_of_proteins())) file.write("Number of peptides: {}\n".format(stats.get_number_of_peptides())) - file.write( - "Number of peptidoforms: {}\n".format(stats.get_number_of_peptidoforms()) - ) + file.write("Number of peptidoforms: {}\n".format(stats.get_number_of_peptidoforms())) file.write("Number of psms: {}\n".format(stats.get_number_of_psms())) file.write("Number of msruns: {}\n".format(stats.get_number_msruns())) diff --git a/quantmsio/core/ae.py b/quantmsio/core/ae.py index 2727919..e3116a1 100644 --- a/quantmsio/core/ae.py +++ b/quantmsio/core/ae.py @@ -85,25 +85,13 @@ def convert_ibaq_to_quantms( output_lines = "" if self.project_manager: output_lines += ( - "#project_accession: " - + self.project_manager.project.project_info["project_accession"] - + "\n" + "#project_accession: " + self.project_manager.project.project_info["project_accession"] + "\n" ) + output_lines += "#project_title: " + self.project_manager.project.project_info["project_title"] + "\n" output_lines += ( - "#project_title: " - + self.project_manager.project.project_info["project_title"] - + "\n" - ) - output_lines += ( - "#project_description: " - + self.project_manager.project.project_info["project_description"] - + "\n" - ) - output_lines += ( - "#quantms_version: " - + self.project_manager.project.project_info["quantms_version"] - + "\n" + "#project_description: " + self.project_manager.project.project_info["project_description"] + "\n" ) + output_lines += "#quantms_version: " + 
self.project_manager.project.project_info["quantms_version"] + "\n" factor_value = self.get_factor_value() if factor_value is not None: output_lines += "#factor_value: " + factor_value + "\n" @@ -139,12 +127,8 @@ def convert_ibaq_to_quantms( f.write(output_lines) if self.project_manager: - self.project_manager.add_quantms_file( - file_category="absolute_file", file_name=output_filename - ) - logger.info( - f"Absolute expression file copied to {output_filename} and added to the project information" - ) + self.project_manager.add_quantms_file(file_category="absolute_file", file_name=output_filename) + logger.info(f"Absolute expression file copied to {output_filename} and added to the project information") def get_factor_value(self): """ diff --git a/quantmsio/core/core.py b/quantmsio/core/core.py index 13c095b..1f4c78a 100644 --- a/quantmsio/core/core.py +++ b/quantmsio/core/core.py @@ -18,9 +18,7 @@ def __init__(self, name_prefix: str): # Create a cache name using a hash and uuid if name_prefix is None: name_prefix = "generic" - self._cache_name = str( - "_cache_name_{}_{}".format(name_prefix, uuid.uuid4().hex) - ) + self._cache_name = str("_cache_name_{}_{}".format(name_prefix, uuid.uuid4().hex)) self.cache = diskcache.Cache(self._cache_name, statistics=True) self.cache.create_tag_index() diff --git a/quantmsio/core/de.py b/quantmsio/core/de.py index 44f3265..4ae04b7 100644 --- a/quantmsio/core/de.py +++ b/quantmsio/core/de.py @@ -64,9 +64,7 @@ def __init__(self): https://github.com/bigbio/quantms.io/blob/main/docs/DE.md """ # SDRF file information - self.fdr_threshold = ( - 0.05 # FDR threshold to consider a protein as differentially expressed - ) + self.fdr_threshold = 0.05 # FDR threshold to consider a protein as differentially expressed self.sdrf_manager = None self.sdrf_file_path = None @@ -87,9 +85,7 @@ def load_msstats_file(self, msstats_file_path: str): self.de_file_path = msstats_file_path if not os.path.isfile(msstats_file_path): - raise 
FileNotFoundError( - "MSstats differential file not found: " + msstats_file_path - ) + raise FileNotFoundError("MSstats differential file not found: " + msstats_file_path) self.msstats_df = pd.read_csv(msstats_file_path, sep="\t") # Rename columns to a lower case @@ -125,9 +121,7 @@ def convert_msstats_to_quantms( quantms_df = self.msstats_df[ [ - DifferentialExpressionHandler.PROTEIN_ACCESSION_COLUMN[ - "msstats_column" - ], + DifferentialExpressionHandler.PROTEIN_ACCESSION_COLUMN["msstats_column"], DifferentialExpressionHandler.LABEL_COLUMN["msstats_column"], DifferentialExpressionHandler.log2FC_COLUMN["msstats_column"], DifferentialExpressionHandler.SE_COLUMN["msstats_column"], @@ -142,25 +136,13 @@ def convert_msstats_to_quantms( output_lines = "" if self.project_manager: output_lines += ( - "#project_accession: " - + self.project_manager.project.project_info["project_accession"] - + "\n" + "#project_accession: " + self.project_manager.project.project_info["project_accession"] + "\n" ) + output_lines += "#project_title: " + self.project_manager.project.project_info["project_title"] + "\n" output_lines += ( - "#project_title: " - + self.project_manager.project.project_info["project_title"] - + "\n" - ) - output_lines += ( - "#project_description: " - + self.project_manager.project.project_info["project_description"] - + "\n" - ) - output_lines += ( - "#quantms_version: " - + self.project_manager.project.project_info["quantms_version"] - + "\n" + "#project_description: " + self.project_manager.project.project_info["project_description"] + "\n" ) + output_lines += "#quantms_version: " + self.project_manager.project.project_info["quantms_version"] + "\n" factor_value = self.get_factor_value() if factor_value is not None: output_lines += "#factor_value: " + factor_value + "\n" @@ -192,7 +174,9 @@ def convert_msstats_to_quantms( DifferentialExpressionHandler.DIFFERENTIAL_EXPRESSION_EXTENSION, ) - output_filename = 
f"{base_name}-{str(uuid.uuid4())}{DifferentialExpressionHandler.DIFFERENTIAL_EXPRESSION_EXTENSION}" + output_filename = ( + f"{base_name}-{str(uuid.uuid4())}{DifferentialExpressionHandler.DIFFERENTIAL_EXPRESSION_EXTENSION}" + ) if output_folder is None: output_filename_path = output_filename else: @@ -202,12 +186,8 @@ def convert_msstats_to_quantms( with open(output_filename_path, "w", encoding="utf8") as f: f.write(output_lines) if self.project_manager: - self.project_manager.add_quantms_file( - file_category="differential_file", file_name=output_filename - ) - logger.info( - f"Differential expression file copied to {output_filename} and added to the project information" - ) + self.project_manager.add_quantms_file(file_category="differential_file", file_name=output_filename) + logger.info(f"Differential expression file copied to {output_filename} and added to the project information") def update_project_file(self, project_file: str = None): """ diff --git a/quantmsio/core/diann_convert.py b/quantmsio/core/diann_convert.py index 77427bb..e331969 100644 --- a/quantmsio/core/diann_convert.py +++ b/quantmsio/core/diann_convert.py @@ -34,9 +34,7 @@ def get_exp_design_dfs(exp_design_file): f_table = [i.replace("\n", "").split("\t") for i in data[1:empty_row]] f_header = data[0].replace("\n", "").split("\t") f_table = pd.DataFrame(f_table, columns=f_header) - f_table.loc[:, "run"] = f_table.apply( - lambda x: _true_stem(x["Spectra_Filepath"]), axis=1 - ) + f_table.loc[:, "run"] = f_table.apply(lambda x: _true_stem(x["Spectra_Filepath"]), axis=1) f_table.rename(columns={"Fraction_Group": "ms_run", "run": "Run"}, inplace=True) s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] s_header = data[empty_row + 1].replace("\n", "").split("\t") @@ -94,9 +92,7 @@ def find_modification(peptide): for k in range(0, len(original_mods)): original_mods[k] = str(position[k]) + "-" + original_mods[k].upper() - original_mods = ( - ",".join(str(i) for i in 
original_mods) if len(original_mods) > 0 else "null" - ) + original_mods = ",".join(str(i) for i in original_mods) if len(original_mods) > 0 else "null" return original_mods @@ -123,9 +119,7 @@ def mtd_mod_info(fix_mod, var_mod): mod_name = mod_obj.getId() mod_accession = mod_obj.getUniModAccession() site = mod_obj.getOrigin() - fix_ptm.append( - ("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site) - ) + fix_ptm.append(("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)) else: fix_flag = 0 fix_ptm.append("[MS, MS:1002453, No fixed modifications searched, ]") @@ -137,9 +131,7 @@ def mtd_mod_info(fix_mod, var_mod): mod_name = mod_obj.getId() mod_accession = mod_obj.getUniModAccession() site = mod_obj.getOrigin() - var_ptm.append( - ("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site) - ) + var_ptm.append(("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)) else: var_flag = 0 var_ptm.append("[MS, MS:1002454, No variable modifications searched, ]") @@ -148,31 +140,21 @@ def mtd_mod_info(fix_mod, var_mod): def get_modifications(fix_modifications: str, variable_modifications: str): - (fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info( - fix_modifications, variable_modifications - ) + (fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info(fix_modifications, variable_modifications) modifications_list = [] if fix_flag == 1: for i in range(1, len(fixed_mods) + 1): - modifications_list.append( - f"MTD\tfixed_mod[{str(i)}]\t{fixed_mods[i - 1][0]}" - ) - modifications_list.append( - f"MTD\tfixed_mod[{str(i)}]-site\t{fixed_mods[i - 1][1]}" - ) + modifications_list.append(f"MTD\tfixed_mod[{str(i)}]\t{fixed_mods[i - 1][0]}") + modifications_list.append(f"MTD\tfixed_mod[{str(i)}]-site\t{fixed_mods[i - 1][1]}") modifications_list.append(f"MTD\tfixed_mod[{str(i)}]-position\tAnywhere") else: modifications_list.append(f"MTD\tfixed_mod[1]\t{fixed_mods[0]}") if var_flag == 1: for i in range(1, 
len(variable_mods) + 1): - modifications_list.append( - f"MTD\tvariable_mod[{str(i)}]\t{variable_mods[i - 1][0]}" - ) - modifications_list.append( - f"MTD\tvariable_mod[{str(i)}]-site\t{variable_mods[i - 1][1]}" - ) + modifications_list.append(f"MTD\tvariable_mod[{str(i)}]\t{variable_mods[i - 1][0]}") + modifications_list.append(f"MTD\tvariable_mod[{str(i)}]-site\t{variable_mods[i - 1][1]}") modifications_list.append(f"MTD\tvariable_mod[{str(i)}]-position\tAnywhere") else: modifications_list.append(f"MTD\tvariable_mod[1]\t{variable_mods[0]}") @@ -229,18 +211,12 @@ def create_duckdb_from_diann_report(self, report_path, max_memory, worker_thread if worker_threads is not None: database.execute("SET worker_threads='{}'".format(worker_threads)) - msg = database.execute( - "SELECT * FROM duckdb_settings() where name in ('worker_threads', 'max_memory')" - ).df() + msg = database.execute("SELECT * FROM duckdb_settings() where name in ('worker_threads', 'max_memory')").df() logging.info("Duckdb uses {} threads.".format(str(msg["value"][0]))) logging.info("duckdb uses {} of memory.".format(str(msg["value"][1]))) - database.execute( - "CREATE TABLE diann_report AS SELECT * FROM '{}'".format(report_path) - ) - database.execute( - """CREATE INDEX idx_precursor_q ON diann_report ("Precursor.Id", "Q.Value")""" - ) + database.execute("CREATE TABLE diann_report AS SELECT * FROM '{}'".format(report_path)) + database.execute("""CREATE INDEX idx_precursor_q ON diann_report ("Precursor.Id", "Q.Value")""") database.execute("""CREATE INDEX idx_run ON diann_report ("Run")""") et = time.time() - s @@ -322,21 +298,13 @@ def get_masses_and_modifications_map(self): def get_mods(self, sdrf_path): sdrf = pd.read_csv(sdrf_path, sep="\t", nrows=1) - mod_cols = [ - col - for col in sdrf.columns - if col.startswith("comment[modification parameter]") - ] + mod_cols = [col for col in sdrf.columns if col.startswith("comment[modification parameter]")] fix_m = [] variable_m = [] for col in mod_cols: 
mod_msg = sdrf[col].values[0].split(";") mod_dict = {k.split("=")[0]: k.split("=")[1] for k in mod_msg} - mod = ( - f"{mod_dict['NT']} ({mod_dict['TA']})" - if "TA" in mod_dict - else f"{mod_dict['NT']} ({mod_dict['PP']})" - ) + mod = f"{mod_dict['NT']} ({mod_dict['TA']})" if "TA" in mod_dict else f"{mod_dict['NT']} ({mod_dict['PP']})" if mod_dict["MT"] == "Variable" or mod_dict["MT"] == "variable": variable_m.append(mod) else: @@ -387,9 +355,7 @@ def intergrate_msg(n): for mzml in os.listdir(mzml_info_folder) if mzml.endswith("_mzml_info.tsv") ] - info_list = [ - info_list[i : i + file_num] for i in range(0, len(info_list), file_num) - ] + info_list = [info_list[i : i + file_num] for i in range(0, len(info_list), file_num)] for refs in info_list: report = self.get_report_from_database(refs) @@ -405,24 +371,18 @@ def intergrate_msg(n): report = pd.DataFrame(report, columns=usecols) # spectral count - report["spectral_count"] = report.groupby( - ["Modified.Sequence", "Precursor.Charge", "Run"] - ).transform("size") + report["spectral_count"] = report.groupby(["Modified.Sequence", "Precursor.Charge", "Run"]).transform( + "size" + ) # cal value and mod mass_vector = report["Modified.Sequence"].map(masses_map) report["Calculate.Precursor.Mz"] = ( mass_vector + (PROTON_MASS_U * report["Precursor.Charge"].values) ) / report["Precursor.Charge"].values - report["modifications"] = report["Modified.Sequence"].swifter.apply( - lambda x: find_modification(x) - ) - report["Modified.Sequence"] = report["Modified.Sequence"].map( - modifications_map - ) + report["modifications"] = report["Modified.Sequence"].swifter.apply(lambda x: find_modification(x)) + report["Modified.Sequence"] = report["Modified.Sequence"].map(modifications_map) # pep - report["best_psm_reference_file_name"] = report["Precursor.Id"].map( - best_ref_map - ) + report["best_psm_reference_file_name"] = report["Precursor.Id"].map(best_ref_map) report["best_psm_scan_number"] = None # add extra msg report = 
self.add_additional_msg(report) @@ -444,17 +404,13 @@ def generate_psm_and_feature_file( psm_pqwriter = None feature_pqwriter = None - self._duckdb = self.create_duckdb_from_diann_report( - report_path, duckdb_max_memory, duckdb_threads - ) + self._duckdb = self.create_duckdb_from_diann_report(report_path, duckdb_max_memory, duckdb_threads) s_data_frame, f_table = get_exp_design_dfs(design_file) sdrf_handle = SDRFHandler(sdrf_path) fix_mods, variable_mods = sdrf_handle.get_mods() self._modifications = get_modifications(fix_mods, variable_mods) - for report in self.main_report_df( - report_path, qvalue_threshold, mzml_info_folder, file_num - ): + for report in self.main_report_df(report_path, qvalue_threshold, mzml_info_folder, file_num): s = time.time() psm_pqwriter = self.generate_psm_file(report, psm_pqwriter, psm_output_path) feature_pqwriter = self.generate_feature_file( @@ -481,14 +437,10 @@ def add_additional_msg(self, report: pd.DataFrame) -> pd.DataFrame: :return: The report dataframe with the transformations """ report.rename(columns=self._columns_map, inplace=True) - report["reference_file_name"] = report["reference_file_name"].swifter.apply( - lambda x: x.split(".")[0] - ) + report["reference_file_name"] = report["reference_file_name"].swifter.apply(lambda x: x.split(".")[0]) report.loc[:, "is_decoy"] = "0" - report.loc[:, "unique"] = report["protein_accessions"].swifter.apply( - lambda x: "0" if ";" in str(x) else "1" - ) + report.loc[:, "unique"] = report["protein_accessions"].swifter.apply(lambda x: "0" if ";" in str(x) else "1") null_col = ["protein_start_positions", "protein_end_positions"] report.loc[:, null_col] = None @@ -500,9 +452,7 @@ def add_additional_msg(self, report: pd.DataFrame) -> pd.DataFrame: axis=1, ) - report["id_scores"] = report[ - ["Q.Value", "posterior_error_probability", "global_qvalue"] - ].swifter.apply( + report["id_scores"] = report[["Q.Value", "posterior_error_probability", "global_qvalue"]].swifter.apply( lambda x: 
f"""q-value: {x['Q.Value']},global q-value: {x['global_qvalue']}, posterior error probability:{x['posterior_error_probability']}""", axis=1, @@ -542,9 +492,7 @@ def generate_feature_file( ) samples = sdrf["source name"].unique() mixed_map = dict(zip(samples, range(1, len(samples) + 1))) - sdrf["comment[data file]"] = sdrf["comment[data file]"].apply( - lambda x: x.split(".")[0] - ) + sdrf["comment[data file]"] = sdrf["comment[data file]"].apply(lambda x: x.split(".")[0]) sdrf = sdrf.set_index(["comment[data file]"]) sdrf_map = sdrf.to_dict()["source name"] tec_map = sdrf.to_dict()["comment[technical replicate]"] @@ -578,12 +526,8 @@ def generate_feature_file( ) # peptide_score_name = self._score_names["peptide_score"] report.loc[:, "sample_accession"] = report["reference_file_name"].map(sdrf_map) - report.loc[:, "comment[technical replicate]"] = report[ - "reference_file_name" - ].map(tec_map) - report.loc[:, "run"] = report[ - ["sample_accession", "comment[technical replicate]", "fraction"] - ].swifter.apply( + report.loc[:, "comment[technical replicate]"] = report["reference_file_name"].map(tec_map) + report.loc[:, "run"] = report[["sample_accession", "comment[technical replicate]", "fraction"]].swifter.apply( lambda row: str(mixed_map[row["sample_accession"]]) + "_" + str(row["comment[technical replicate]"]) @@ -600,9 +544,7 @@ def generate_feature_file( if not feature_pqwriter: # create a parquet write object giving it an output file - feature_pqwriter = pq.ParquetWriter( - feature_output_path, parquet_table.schema - ) + feature_pqwriter = pq.ParquetWriter(feature_output_path, parquet_table.schema) feature_pqwriter.write_table(parquet_table) return feature_pqwriter @@ -634,38 +576,20 @@ def convert_psm_format_to_parquet(self, res): res["sequence"] = res["sequence"].astype(str) res["protein_accessions"] = res["protein_accessions"].str.split(";") res["protein_start_positions"] = ( - res["protein_start_positions"] - .swifter.apply(self.__split_start_or_end) - 
.to_list() - ) - res["protein_end_positions"] = ( - res["protein_end_positions"] - .swifter.apply(self.__split_start_or_end) - .to_list() + res["protein_start_positions"].swifter.apply(self.__split_start_or_end).to_list() ) + res["protein_end_positions"] = res["protein_end_positions"].swifter.apply(self.__split_start_or_end).to_list() res["protein_global_qvalue"] = res["protein_global_qvalue"].astype(float) - res["unique"] = ( - res["unique"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") - ) - res["modifications"] = res["modifications"].swifter.apply( - lambda x: feature._generate_modification_list(x) - ) + res["unique"] = res["unique"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") + res["modifications"] = res["modifications"].swifter.apply(lambda x: feature._generate_modification_list(x)) res["retention_time"] = res["retention_time"].astype(float) - res["charge"] = ( - res["charge"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") - ) + res["charge"] = res["charge"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") res["exp_mass_to_charge"] = res["exp_mass_to_charge"].astype(float) res["calc_mass_to_charge"] = res["calc_mass_to_charge"].astype(float) - res["posterior_error_probability"] = res["posterior_error_probability"].astype( - float - ) + res["posterior_error_probability"] = res["posterior_error_probability"].astype(float) res["global_qvalue"] = res["global_qvalue"].astype(float) - res["is_decoy"] = ( - res["is_decoy"] - .map(lambda x: pd.NA if pd.isna(x) else int(x)) - .astype("Int32") - ) + res["is_decoy"] = res["is_decoy"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") res.loc[:, "num_peaks"] = None res.loc[:, "mz_array"] = None diff --git a/quantmsio/core/feature.py b/quantmsio/core/feature.py index 4fceb54..85b9826 100644 --- a/quantmsio/core/feature.py +++ b/quantmsio/core/feature.py @@ -51,32 +51,22 @@ def get_msstats_in_batches(msstats_file: str, batch_size: int) -> int: 
return total_len // batch_size -def get_additional_properties_from_sdrf( - feature_dict: dict, experiment_type: str, sdrf_samples: dict -) -> dict: - if ( - "FragmentIon" not in feature_dict - ): # FragmentIon is not in the MSstats if the experiment is label free +def get_additional_properties_from_sdrf(feature_dict: dict, experiment_type: str, sdrf_samples: dict) -> dict: + if "FragmentIon" not in feature_dict: # FragmentIon is not in the MSstats if the experiment is label free feature_dict["FragmentIon"] = None if "IsotopeLabelType" not in feature_dict: feature_dict["IsotopeLabelType"] = "L" if "TMT" in experiment_type: # TMT experiment - feature_dict["Channel"] = TMT_CHANNELS[experiment_type][ - int(feature_dict["Channel"]) - 1 - ] + feature_dict["Channel"] = TMT_CHANNELS[experiment_type][int(feature_dict["Channel"]) - 1] elif "ITRAQ" in experiment_type: # ITRAQ experiment - feature_dict["Channel"] = ITRAQ_CHANNEL[experiment_type][ - int(feature_dict["Channel"]) - 1 - ] + feature_dict["Channel"] = ITRAQ_CHANNEL[experiment_type][int(feature_dict["Channel"]) - 1] else: # Label free experiment feature_dict["Channel"] = "LABEL FREE SAMPLE" # Get the sample accession from the SDRF file. - sample_id = sdrf_samples[ - feature_dict["Reference"] + ":_:" + feature_dict["Channel"] - ] + sample_id = sdrf_samples[feature_dict["Reference"] + ":_:" + feature_dict["Channel"]] feature_dict["SampleName"] = sample_id return feature_dict @@ -101,49 +91,31 @@ def _fetch_msstats_feature( # Reference is the Msstats form .e.g. 
HeLa_1to1_01 feature_dict["Reference"] = feature_dict["Reference"].replace('"', "").split(".")[0] - feature_dict = get_additional_properties_from_sdrf( - feature_dict, experiment_type, sdrf_samples - ) + feature_dict = get_additional_properties_from_sdrf(feature_dict, experiment_type, sdrf_samples) - protein_accessions_list = standardize_protein_list_accession( - feature_dict["ProteinName"] - ) - protein_accessions_string = standardize_protein_string_accession( - feature_dict["ProteinName"] - ) + protein_accessions_list = standardize_protein_list_accession(feature_dict["ProteinName"]) + protein_accessions_string = standardize_protein_string_accession(feature_dict["ProteinName"]) - peptidoform = feature_dict[ - "PeptideSequence" - ] # Peptidoform is the Msstats form .e.g. EM(Oxidation)QDLGGGER - peptide_sequence = clean_peptidoform_sequence( - peptidoform - ) # Get sequence .e.g. EMQDLGGGER + peptidoform = feature_dict["PeptideSequence"] # Peptidoform is the Msstats form .e.g. EM(Oxidation)QDLGGGER + peptide_sequence = clean_peptidoform_sequence(peptidoform) # Get sequence .e.g. EMQDLGGGER charge = None if "PrecursorCharge" in feature_dict: charge = feature_dict["PrecursorCharge"] # Charge is the Msstats form .e.g. 2 elif "Charge" in feature_dict: charge = feature_dict["Charge"] # Charge is the Msstats form .e.g. 
2 - peptide_indexed = mztab_handler.get_peptide_index( - msstats_peptidoform=peptidoform, charge=charge - ) + peptide_indexed = mztab_handler.get_peptide_index(msstats_peptidoform=peptidoform, charge=charge) peptide_score_name = mztab_handler.get_search_engine_scores()["peptide_score"] - peptide_mztab_qvalue_accession = standardize_protein_list_accession( - peptide_indexed["protein_accession"] - ) + peptide_mztab_qvalue_accession = standardize_protein_list_accession(peptide_indexed["protein_accession"]) peptide_qvalue = peptide_indexed["peptide_qvalue"] # Peptide q-value index 1 - posterior_error_probability = peptide_indexed[ - "posterior_error_probability" - ] # Get posterior error probability + posterior_error_probability = peptide_indexed["posterior_error_probability"] # Get posterior error probability if posterior_error_probability is not None: posterior_error_probability = np.float64(posterior_error_probability) # Get if decoy or not - peptide_mztab_qvalue_decoy = peptide_indexed[ - "is_decoy" - ] # Peptide q-value decoy index 2 + peptide_mztab_qvalue_decoy = peptide_indexed["is_decoy"] # Peptide q-value decoy index 2 # Mods in quantms.io format modifications_string = peptide_indexed["modifications"] # Mods @@ -154,23 +126,13 @@ def _fetch_msstats_feature( modifications_string = "" for key, value in modifications.items(): modifications_string += "|".join(map(str, value["position"])) - modifications_string = ( - modifications_string + "-" + value["unimod_accession"] + "," - ) - modifications_string = ( - None if len(modifications_string) == 0 else modifications_string[:-1] - ) # Remove last comma - modification_list = ( - None if modifications_string is None else modifications_string.split(",") - ) + modifications_string = modifications_string + "-" + value["unimod_accession"] + "," + modifications_string = None if len(modifications_string) == 0 else modifications_string[:-1] # Remove last comma + modification_list = None if modifications_string is None else 
modifications_string.split(",") - start_positions = peptide_indexed["psm_protein_start"].split( - "," - ) # Start positions in the protein + start_positions = peptide_indexed["psm_protein_start"].split(",") # Start positions in the protein start_positions = [int(i) for i in start_positions] - end_positions = peptide_indexed["psm_protein_end"].split( - "," - ) # End positions in the protein + end_positions = peptide_indexed["psm_protein_end"].split(",") # End positions in the protein end_positions = [int(i) for i in end_positions] # The spectral count is cero for a given file if a peptide does not appear in the psm section because for example is @@ -202,9 +164,7 @@ def _fetch_msstats_feature( peptide_scan_number = None try: - protein_qvalue_object = mztab_handler.get_protein_qvalue_from_index( - protein_accession=protein_accessions_string - ) + protein_qvalue_object = mztab_handler.get_protein_qvalue_from_index(protein_accession=protein_accessions_string) protein_qvalue = protein_qvalue_object[0] # Protein q-value index 0 except ValueError: logger.error("Error in line: {}".format(feature_dict)) @@ -213,18 +173,10 @@ def _fetch_msstats_feature( # TODO: Importantly, the protein accessions in the peptide section of the mzTab files peptide_mztab_qvalue_accession # are not the same as the protein accessions in the protein section of the mzTab file protein_accessions_list, that is # why we are using the protein_accessions_list to compare with the protein accessions in the MSstats file. 
- if not compare_protein_lists( - protein_accessions_list, peptide_mztab_qvalue_accession - ): + if not compare_protein_lists(protein_accessions_list, peptide_mztab_qvalue_accession): logger.error("Error in line: {}".format(feature_dict)) - logger.error( - "Protein accessions: {}-{}".format( - protein_accessions_list, peptide_mztab_qvalue_accession - ) - ) - raise Exception( - "The protein accessions in the MSstats file do not match with the mztab file" - ) + logger.error("Protein accessions: {}-{}".format(protein_accessions_list, peptide_mztab_qvalue_accession)) + raise Exception("The protein accessions in the MSstats file do not match with the mztab file") # Unique is different in PSM section, Peptide, We are using the msstats number of accessions. unique = 1 if len(protein_accessions_list) == 1 else 0 @@ -232,24 +184,14 @@ def _fetch_msstats_feature( # TODO: get retention time from mztab file. The retention time in the mzTab peptide level is not clear how is # related to the retention time in the MSstats file. If the consensusxml file is provided, we can get the retention # time from the consensusxml file. 
- rt = ( - None - if "RetentionTime" not in feature_dict - else np.float64(feature_dict["RetentionTime"]) - ) + rt = None if "RetentionTime" not in feature_dict else np.float64(feature_dict["RetentionTime"]) if intensity_map is not None: key = None if "LABEL FREE" in feature_dict["Channel"]: key = peptidoform + ":_:" + str(charge) + ":_:" + feature_dict["Reference"] elif "TMT" in feature_dict["Channel"] or "ITRAQ" in feature_dict["Channel"]: key = ( - peptidoform - + ":_:" - + str(charge) - + ":_:" - + feature_dict["Reference"] - + ":_:" - + feature_dict["Channel"] + peptidoform + ":_:" + str(charge) + ":_:" + feature_dict["Reference"] + ":_:" + feature_dict["Channel"] ) if key is not None and key in intensity_map: consensus_intensity = intensity_map[key]["intensity"] @@ -268,9 +210,7 @@ def _fetch_msstats_feature( "retention_time": rt, "charge": int(charge), "calc_mass_to_charge": np.float64(calculated_mass), - "peptidoform": peptide_indexed[ - "peptidoform" - ], # Peptidoform in proforma notation + "peptidoform": peptide_indexed["peptidoform"], # Peptidoform in proforma notation "posterior_error_probability": posterior_error_probability, "global_qvalue": np.float64(peptide_qvalue), "is_decoy": int(peptide_mztab_qvalue_decoy), @@ -330,25 +270,19 @@ class FeatureHandler(ParquetHandler): pa.field( "protein_global_qvalue", pa.float64(), - metadata={ - "description": "global q-value of the associated protein or protein group" - }, + metadata={"description": "global q-value of the associated protein or protein group"}, ), pa.field( "unique", pa.int32(), - metadata={ - "description": "if the peptide is unique to a particular protein" - }, + metadata={"description": "if the peptide is unique to a particular protein"}, ), pa.field( "modifications", pa.list_(pa.string()), metadata={"description": "peptide modifications"}, ), - pa.field( - "retention_time", pa.float64(), metadata={"description": "retention time"} - ), + pa.field("retention_time", pa.float64(), 
metadata={"description": "retention time"}), pa.field( "charge", pa.int32(), @@ -374,21 +308,15 @@ class FeatureHandler(ParquetHandler): pa.float64(), metadata={"description": "posterior error probability"}, ), - pa.field( - "global_qvalue", pa.float64(), metadata={"description": "global q-value"} - ), + pa.field("global_qvalue", pa.float64(), metadata={"description": "global q-value"}), pa.field( "is_decoy", pa.int32(), - metadata={ - "description": "flag indicating if the feature is a decoy (1 is decoy, 0 is not decoy)" - }, + metadata={"description": "flag indicating if the feature is a decoy (1 is decoy, 0 is not decoy)"}, ), # pa.field("best_id_score", pa.string(), # metadata={"description": "best identification score as key value pair"}), - pa.field( - "intensity", pa.float64(), metadata={"description": "intensity value"} - ), + pa.field("intensity", pa.float64(), metadata={"description": "intensity value"}), pa.field( "spectral_count", pa.int32(), @@ -402,13 +330,9 @@ class FeatureHandler(ParquetHandler): pa.field( "condition", pa.string(), - metadata={ - "description": "experimental condition, value of the experimental factor" - }, - ), - pa.field( - "fraction", pa.string(), metadata={"description": "fraction information"} + metadata={"description": "experimental condition, value of the experimental factor"}, ), + pa.field("fraction", pa.string(), metadata={"description": "fraction information"}), pa.field( "biological_replicate", pa.string(), @@ -424,9 +348,7 @@ class FeatureHandler(ParquetHandler): pa.string(), metadata={"description": "type of isotope label"}, ), - pa.field( - "run", pa.string(), metadata={"description": "experimental run information"} - ), + pa.field("run", pa.string(), metadata={"description": "experimental run information"}), pa.field( "channel", pa.string(), @@ -447,9 +369,7 @@ class FeatureHandler(ParquetHandler): pa.field( "best_psm_reference_file_name", pa.string(), - metadata={ - "description": "file name of the reference file 
for the best PSM" - }, + metadata={"description": "file name of the reference file for the best PSM"}, ), pa.field( "best_psm_scan_number", @@ -549,9 +469,7 @@ def convert_mztab_msstats_to_feature( # Skip the header msstats_columns = line.rstrip().split(",") else: - raise Exception( - "The MSstats file does not have the expected header" - ) + raise Exception("The MSstats file does not have the expected header") line = msstats_file_handler.readline() pqwriter = None while line.rstrip() != "": @@ -574,9 +492,7 @@ def convert_mztab_msstats_to_feature( feature_list = [] batch_count += 1 if not pqwriter: - pqwriter = pq.ParquetWriter( - self.parquet_path, feature_table.schema - ) + pqwriter = pq.ParquetWriter(self.parquet_path, feature_table.schema) pqwriter.write_table(feature_table) line = msstats_file_handler.readline() # batches = 1 diff --git a/quantmsio/core/feature_in_memory.py b/quantmsio/core/feature_in_memory.py index f38e7bf..0575b65 100644 --- a/quantmsio/core/feature_in_memory.py +++ b/quantmsio/core/feature_in_memory.py @@ -223,13 +223,9 @@ def __get_spectra_count(self, mztab_path, psm_chunksize): from collections import Counter counter = Counter() - psms = self.skip_and_load_csv( - mztab_path, "PSH", sep="\t", chunksize=psm_chunksize - ) + psms = self.skip_and_load_csv(mztab_path, "PSH", sep="\t", chunksize=psm_chunksize) for psm in psms: - psm["spectra_ref"] = psm["spectra_ref"].swifter.apply( - lambda x: self._ms_runs[x.split(":")[0]] - ) + psm["spectra_ref"] = psm["spectra_ref"].swifter.apply(lambda x: self._ms_runs[x.split(":")[0]]) if "opt_global_cv_MS:1000889_peptidoform_sequence" not in psm.columns: psm.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = psm[ ["modifications", "sequence"] @@ -333,13 +329,9 @@ def _extract_from_pep(self, mztab_path): live_cols.append("sequence") live_cols.append("modifications") else: - raise Exception( - "The peptide table don't have opt_global_cv_MS:1000889_peptidoform_sequence columns" - ) + raise 
Exception("The peptide table don't have opt_global_cv_MS:1000889_peptidoform_sequence columns") if "charge" in not_cols or "best_search_engine_score[1]" in not_cols: - raise Exception( - "The peptide table don't have best_search_engine_score[1] or charge columns" - ) + raise Exception("The peptide table don't have best_search_engine_score[1] or charge columns") pep = self.skip_and_load_csv(mztab_path, "PEH", sep="\t", usecols=live_cols) @@ -349,9 +341,7 @@ def _extract_from_pep(self, mztab_path): pep.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = pep[ ["modifications", "sequence"] ].swifter.apply( - lambda row: get_petidoform_msstats_notation( - row["sequence"], row["modifications"], modifications - ), + lambda row: get_petidoform_msstats_notation(row["sequence"], row["modifications"], modifications), axis=1, ) @@ -360,20 +350,14 @@ def _extract_from_pep(self, mztab_path): pep.loc[:, "scan_number"] = None pep.loc[:, "spectra_ref"] = None else: - pep.loc[:, "scan_number"] = pep["spectra_ref"].swifter.apply( - lambda x: generate_scan_number(x) - ) - pep["spectra_ref"] = pep["spectra_ref"].swifter.apply( - lambda x: self._ms_runs[x.split(":")[0]] - ) + pep.loc[:, "scan_number"] = pep["spectra_ref"].swifter.apply(lambda x: generate_scan_number(x)) + pep["spectra_ref"] = pep["spectra_ref"].swifter.apply(lambda x: self._ms_runs[x.split(":")[0]]) pep_msg = pep.iloc[ - pep.groupby( - ["opt_global_cv_MS:1000889_peptidoform_sequence", "charge"] - ).apply(lambda row: row["best_search_engine_score[1]"].idxmin()) + pep.groupby(["opt_global_cv_MS:1000889_peptidoform_sequence", "charge"]).apply( + lambda row: row["best_search_engine_score[1]"].idxmin() + ) ] - pep_msg = pep_msg.set_index( - ["opt_global_cv_MS:1000889_peptidoform_sequence", "charge"] - ) + pep_msg = pep_msg.set_index(["opt_global_cv_MS:1000889_peptidoform_sequence", "charge"]) pep_msg.loc[:, "pep_msg"] = pep_msg[ ["best_search_engine_score[1]", "spectra_ref", "scan_number"] @@ -412,19 +396,13 @@ def 
_extract_from_psm_to_pep_msg(self, mztab_path, map_dict): ), axis=1, ) - for key, df in psm.groupby( - ["opt_global_cv_MS:1000889_peptidoform_sequence", "charge"] - ): + for key, df in psm.groupby(["opt_global_cv_MS:1000889_peptidoform_sequence", "charge"]): if key not in map_dict.keys(): map_dict[key] = [None, None, None] psm_unique_keys.append(key) df = df.reset_index(drop=True) - df.loc[:, "scan_number"] = df["spectra_ref"].apply( - lambda x: generate_scan_number(x) - ) - df["spectra_ref"] = df["spectra_ref"].apply( - lambda x: self._ms_runs[x.split(":")[0]] - ) + df.loc[:, "scan_number"] = df["spectra_ref"].apply(lambda x: generate_scan_number(x)) + df["spectra_ref"] = df["spectra_ref"].apply(lambda x: self._ms_runs[x.split(":")[0]]) if pd.isna(map_dict[key][1]): if "opt_global_q-value_score" in df.columns: temp_df = df.iloc[df["opt_global_q-value_score"].idxmin()] @@ -466,32 +444,22 @@ def _extract_from_psm_to_pep_msg(self, mztab_path, map_dict): ): if len(map_dict[key]) != 7: if "opt_global_Posterior_Error_Probability_score" in df.columns: - probability_score = df[ - "opt_global_Posterior_Error_Probability_score" - ].min() + probability_score = df["opt_global_Posterior_Error_Probability_score"].min() else: - probability_score = df[ - "opt_global_Posterior_Error_Probability" - ].min() + probability_score = df["opt_global_Posterior_Error_Probability"].min() if float(probability_score) < map_dict[key][7]: map_dict[key][7] = probability_score else: if "opt_global_Posterior_Error_Probability_score" in df.columns: - map_dict[key].append( - df["opt_global_Posterior_Error_Probability_score"].min() - ) + map_dict[key].append(df["opt_global_Posterior_Error_Probability_score"].min()) else: - map_dict[key].append( - df["opt_global_Posterior_Error_Probability"].min() - ) + map_dict[key].append(df["opt_global_Posterior_Error_Probability"].min()) else: if len(map_dict[key]) == 7: map_dict[key].append(None) if len(map_dict[key]) == 8: if 
"opt_global_cv_MS:1002217_decoy_peptide" in df.columns: - map_dict[key].append( - df["opt_global_cv_MS:1002217_decoy_peptide"].values[0] - ) + map_dict[key].append(df["opt_global_cv_MS:1002217_decoy_peptide"].values[0]) else: map_dict[key].append(None) if len(map_dict[key]) != 11: @@ -499,10 +467,7 @@ def _extract_from_psm_to_pep_msg(self, mztab_path, map_dict): map_dict[key].append(df["calc_mass_to_charge"].values[0]) map_dict[key].append(None) else: - cals = df[ - (df["spectra_ref"] == map_dict[key][1]) - & (df["scan_number"] == map_dict[key][2]) - ] + cals = df[(df["spectra_ref"] == map_dict[key][1]) & (df["scan_number"] == map_dict[key][2])] if len(cals) == 0: map_dict[key].append(None) @@ -512,10 +477,7 @@ def _extract_from_psm_to_pep_msg(self, mztab_path, map_dict): map_dict[key].append(cals["exp_mass_to_charge"].values[0]) elif map_dict[key][-1] is None: if map_dict[key][1] in df["spectra_ref"].values: - cals = df[ - (df["spectra_ref"] == map_dict[key][1]) - & (df["scan_number"] == map_dict[key][2]) - ] + cals = df[(df["spectra_ref"] == map_dict[key][1]) & (df["scan_number"] == map_dict[key][2])] if len(cals) != 0: map_dict[key][-2] = cals["calc_mass_to_charge"].values[0] map_dict[key][-1] = cals["exp_mass_to_charge"].values[0] @@ -541,18 +503,14 @@ def _extract_rt_from_consensus_xml(self, intensity_map, msstats_in): msstats_in["retention_time"] = msstats_in[ ["peptidoform", "Charge", "Reference", "Channel", "Intensity"] ].swifter.apply( - lambda row: self.__map_rt_or_exp_mass( - row[:-1], intensity_map, row[-1:].values[0], label="rt" - ), + lambda row: self.__map_rt_or_exp_mass(row[:-1], intensity_map, row[-1:].values[0], label="rt"), axis=1, ) else: msstats_in["retention_time"] = msstats_in[ ["peptidoform", "PrecursorCharge", "Reference", "Intensity"] ].swifter.apply( - lambda row: self.__map_rt_or_exp_mass( - row[:-1], intensity_map, row[-1:].values[0], label="rt" - ), + lambda row: self.__map_rt_or_exp_mass(row[:-1], intensity_map, row[-1:].values[0], 
label="rt"), axis=1, ) if self.experiment_type != "LFQ": @@ -596,9 +554,7 @@ def _extract_rt_from_consensus_xml(self, intensity_map, msstats_in): ) return msstats_in - def __map_rt_or_exp_mass( - self, row, intensity_map, intensity, label=None, exp_mass=None - ): + def __map_rt_or_exp_mass(self, row, intensity_map, intensity, label=None, exp_mass=None): row = list(map(str, row.tolist())) key = ":_:".join(row) if key in intensity_map: @@ -636,18 +592,12 @@ def _map_msstats_in(self, msstats_in, map_dict, spectra_count_dict): msstats_in.loc[:, "spectral_count"] = msstats_in[ ["PeptideSequence", "PrecursorCharge", "Reference"] ].swifter.apply( - lambda row: spectra_count_dict[ - (row["PeptideSequence"], row["PrecursorCharge"], row["Reference"]) - ], + lambda row: spectra_count_dict[(row["PeptideSequence"], row["PrecursorCharge"], row["Reference"])], axis=1, ) else: - msstats_in.loc[:, "spectral_count"] = msstats_in[ - ["PeptideSequence", "Charge", "Reference"] - ].swifter.apply( - lambda row: spectra_count_dict[ - (row["PeptideSequence"], row["Charge"], row["Reference"]) - ], + msstats_in.loc[:, "spectral_count"] = msstats_in[["PeptideSequence", "Charge", "Reference"]].swifter.apply( + lambda row: spectra_count_dict[(row["PeptideSequence"], row["Charge"], row["Reference"])], axis=1, ) map_features = [ @@ -665,18 +615,12 @@ def _map_msstats_in(self, msstats_in, map_dict, spectra_count_dict): ] for i, feature in enumerate(map_features): if self.experiment_type == "LFQ": - msstats_in.loc[:, feature] = msstats_in[ - ["PeptideSequence", "PrecursorCharge"] - ].swifter.apply( - lambda row: map_dict[ - (row["PeptideSequence"], row["PrecursorCharge"]) - ][i], + msstats_in.loc[:, feature] = msstats_in[["PeptideSequence", "PrecursorCharge"]].swifter.apply( + lambda row: map_dict[(row["PeptideSequence"], row["PrecursorCharge"])][i], axis=1, ) else: - msstats_in.loc[:, feature] = msstats_in[ - ["PeptideSequence", "Charge"] - ].swifter.apply( + msstats_in.loc[:, feature] = 
msstats_in[["PeptideSequence", "Charge"]].swifter.apply( lambda row: map_dict[(row["PeptideSequence"], row["Charge"])][i], axis=1, ) @@ -708,22 +652,16 @@ def merge_mztab_and_sdrf_to_msstats_in( msstats_ins = pd.read_csv(msstats_path, chunksize=msstats_chunksize) spectra_count_dict = self.__get_spectra_count(mztab_path, 500000) for msstats_in in msstats_ins: - msstats_in["Reference"] = msstats_in["Reference"].swifter.apply( - lambda x: x.split(".")[0] + msstats_in["Reference"] = msstats_in["Reference"].swifter.apply(lambda x: x.split(".")[0]) + msstats_in.loc[:, "protein_global_qvalue"] = msstats_in["ProteinName"].swifter.apply( + lambda x: self._handle_protein_map(protein_map, x) ) - msstats_in.loc[:, "protein_global_qvalue"] = msstats_in[ - "ProteinName" - ].swifter.apply(lambda x: self._handle_protein_map(protein_map, x)) msstats_in.loc[:, "sequence"] = msstats_in["PeptideSequence"].swifter.apply( lambda x: clean_peptidoform_sequence(x) ) if self.experiment_type != "LFQ": - no_tmt_usecols = [ - col - for col in self._tmt_msstats_usecols - if col not in msstats_in.columns - ] + no_tmt_usecols = [col for col in self._tmt_msstats_usecols if col not in msstats_in.columns] for col in no_tmt_usecols: if "IsotopeLabelType" == col: msstats_in.loc[:, col] = "L" @@ -741,21 +679,15 @@ def merge_mztab_and_sdrf_to_msstats_in( self._tmt_msstats_usecols.append("RetentionTime") msstats_in = msstats_in[self._tmt_msstats_usecols] self._tmt_msstats_usecols.remove("RetentionTime") - msstats_in = self._map_msstats_in( - msstats_in, map_dict, spectra_count_dict - ) - msstats_in.loc[:, "peptidoform"] = msstats_in[ - ["sequence", "modifications"] - ].swifter.apply( + msstats_in = self._map_msstats_in(msstats_in, map_dict, spectra_count_dict) + msstats_in.loc[:, "peptidoform"] = msstats_in[["sequence", "modifications"]].swifter.apply( lambda row: get_peptidoform_proforma_version_in_mztab( row["sequence"], row["modifications"], self._modifications ), axis=1, ) 
msstats_in.drop(["PeptideSequence"], inplace=True, axis=1) - msstats_in[ - ["best_psm_reference_file_name", "best_psm_scan_number"] - ] = msstats_in[ + msstats_in[["best_psm_reference_file_name", "best_psm_scan_number"]] = msstats_in[ [ "best_psm_reference_file_name", "best_psm_scan_number", @@ -771,39 +703,27 @@ def merge_mztab_and_sdrf_to_msstats_in( result_type="expand", ) if intensity_map is not None and len(intensity_map) != 0: - msstats_in = self._extract_rt_from_consensus_xml( - intensity_map, msstats_in - ) + msstats_in = self._extract_rt_from_consensus_xml(intensity_map, msstats_in) table = self._merge_sdrf_to_msstats_in(sdrf_path, msstats_in) parquet_table = self.convert_to_parquet(table) yield parquet_table else: - no_lfq_usecols = [ - col - for col in self._lfq_msstats_usecols - if col not in msstats_in.columns - ] + no_lfq_usecols = [col for col in self._lfq_msstats_usecols if col not in msstats_in.columns] for col in no_lfq_usecols: if col == "Channel": msstats_in.loc[:, col] = "LABEL FREE SAMPLE" else: msstats_in.loc[:, col] = None msstats_in = msstats_in[self._lfq_msstats_usecols] - msstats_in = self._map_msstats_in( - msstats_in, map_dict, spectra_count_dict - ) - msstats_in.loc[:, "peptidoform"] = msstats_in[ - ["sequence", "modifications"] - ].swifter.apply( + msstats_in = self._map_msstats_in(msstats_in, map_dict, spectra_count_dict) + msstats_in.loc[:, "peptidoform"] = msstats_in[["sequence", "modifications"]].swifter.apply( lambda row: get_peptidoform_proforma_version_in_mztab( row["sequence"], row["modifications"], self._modifications ), axis=1, ) msstats_in.drop(["PeptideSequence"], inplace=True, axis=1) - msstats_in[ - ["best_psm_reference_file_name", "best_psm_scan_number"] - ] = msstats_in[ + msstats_in[["best_psm_reference_file_name", "best_psm_scan_number"]] = msstats_in[ [ "best_psm_reference_file_name", "best_psm_scan_number", @@ -819,9 +739,7 @@ def merge_mztab_and_sdrf_to_msstats_in( result_type="expand", ) if intensity_map is not 
None and len(intensity_map) != 0: - msstats_in = self._extract_rt_from_consensus_xml( - intensity_map, msstats_in - ) + msstats_in = self._extract_rt_from_consensus_xml(intensity_map, msstats_in) table = self._merge_sdrf_to_msstats_in(sdrf_path, msstats_in) parquet_table = self.convert_to_parquet(table) yield parquet_table @@ -871,9 +789,7 @@ def _merge_sdrf_to_msstats_in(self, sdrf_path, msstats_in): "comment[technical replicate]", ] ] - sdrf["comment[data file]"] = sdrf["comment[data file]"].swifter.apply( - lambda x: x.split(".")[0] - ) + sdrf["comment[data file]"] = sdrf["comment[data file]"].swifter.apply(lambda x: x.split(".")[0]) if self.experiment_type != "LFQ": res = pd.merge( msstats_in, @@ -944,9 +860,7 @@ def extract_ms_runs(fle): ms_runs = {} while line.split("\t")[0] == "MTD": if line.split("\t")[1].split("-")[-1] == "location": - ms_runs[line.split("\t")[1].split("-")[0]] = ( - line.split("\t")[2].split("//")[-1].split(".")[0] - ) + ms_runs[line.split("\t")[1].split("-")[0]] = line.split("\t")[2].split("//")[-1].split(".")[0] line = f.readline() f.close() return ms_runs @@ -970,15 +884,11 @@ def _generate_modification_list(self, modification_str: str): if pd.isna(modification_str): return pd.NA - modifications = get_quantmsio_modifications( - modification_str, self._modifications - ) + modifications = get_quantmsio_modifications(modification_str, self._modifications) modifications_string = "" for key, value in modifications.items(): modifications_string += "|".join(map(str, value["position"])) - modifications_string = ( - modifications_string + "-" + value["unimod_accession"] + "," - ) + modifications_string = modifications_string + "-" + value["unimod_accession"] + "," modifications_string = modifications_string[:-1] # Remove last comma modification_list = modifications_string.split(",") @@ -993,36 +903,18 @@ def convert_to_parquet(self, res): res["sequence"] = res["sequence"].astype(str) res["protein_accessions"] = 
res["protein_accessions"].str.split(";") res["protein_start_positions"] = ( - res["protein_start_positions"] - .swifter.apply(self.__split_start_or_end) - .to_list() - ) - res["protein_end_positions"] = ( - res["protein_end_positions"] - .swifter.apply(self.__split_start_or_end) - .to_list() + res["protein_start_positions"].swifter.apply(self.__split_start_or_end).to_list() ) + res["protein_end_positions"] = res["protein_end_positions"].swifter.apply(self.__split_start_or_end).to_list() res["protein_global_qvalue"] = res["protein_global_qvalue"].astype(float) - res["unique"] = ( - res["unique"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") - ) - res["modifications"] = res["modifications"].swifter.apply( - lambda x: self._generate_modification_list(x) - ) - res["charge"] = ( - res["charge"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") - ) + res["unique"] = res["unique"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") + res["modifications"] = res["modifications"].swifter.apply(lambda x: self._generate_modification_list(x)) + res["charge"] = res["charge"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") res["exp_mass_to_charge"] = res["exp_mass_to_charge"].astype(float) res["calc_mass_to_charge"] = res["calc_mass_to_charge"].astype(float) - res["posterior_error_probability"] = res["posterior_error_probability"].astype( - float - ) + res["posterior_error_probability"] = res["posterior_error_probability"].astype(float) res["global_qvalue"] = res["global_qvalue"].astype(float) - res["is_decoy"] = ( - res["is_decoy"] - .map(lambda x: pd.NA if pd.isna(x) else int(x)) - .astype("Int32") - ) + res["is_decoy"] = res["is_decoy"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") res["intensity"] = res["intensity"].astype(float) res["spectral_count"] = res["spectral_count"].astype(int) res["fraction"] = res["fraction"].astype(int).astype(str) diff --git a/quantmsio/core/json.py b/quantmsio/core/json.py 
index 73d1256..020ead8 100644 --- a/quantmsio/core/json.py +++ b/quantmsio/core/json.py @@ -44,19 +44,13 @@ def feature_json(feature_row) -> dict: feature_dic = feature_row.to_dict() feature_dic["protein_accessions"] = ( - list(feature_dic["protein_accessions"]) - if feature_dic["protein_accessions"].any() - else [] + list(feature_dic["protein_accessions"]) if feature_dic["protein_accessions"].any() else [] ) feature_dic["protein_start_positions"] = ( - list(feature_dic["protein_start_positions"]) - if feature_dic["protein_start_positions"].any() - else [] + list(feature_dic["protein_start_positions"]) if feature_dic["protein_start_positions"].any() else [] ) feature_dic["protein_end_positions"] = ( - list(feature_dic["protein_end_positions"]) - if feature_dic["protein_end_positions"].any() - else [] + list(feature_dic["protein_end_positions"]) if feature_dic["protein_end_positions"].any() else [] ) feature_dic["modifications"] = ( list(feature_dic["modifications"]) @@ -72,10 +66,7 @@ def feature_json(feature_row) -> dict: ) feature_dic["gene_accessions"] = ( list(feature_dic["gene_accessions"]) - if ( - feature_dic["gene_accessions"] is not None - and feature_dic["gene_accessions"].any() - ) + if (feature_dic["gene_accessions"] is not None and feature_dic["gene_accessions"].any()) else [] ) @@ -117,9 +108,7 @@ def psm_to_json(self, parquet_psm_path: str, json_psm_path: str): chunks = read_large_parquet(parquet_psm_path) for psm_df in chunks: - psm_df.to_json( - json_psm_path, orient="records", lines=True, compression="gzip" - ) + psm_df.to_json(json_psm_path, orient="records", lines=True, compression="gzip") return json_psm_path def convert_tsv_to_json(self, file_path: str, json_path: str): @@ -152,9 +141,7 @@ def sdrf_to_json(self, sdrf_path: str, json_path: str): sdrf_json["cell_lines"] = sdrf_handler.get_cell_lines() sdrf_json["method"] = sdrf_handler.get_acquisition_properties() sdrf_json["experiment_type"] = sdrf_handler.get_experiment_type_from_sdrf() - 
sdrf_json["feature_msg"] = sdrf_handler.extract_feature_properties().to_json( - orient="values" - ) + sdrf_json["feature_msg"] = sdrf_handler.extract_feature_properties().to_json(orient="values") sdrf_json["sample_map"] = sdrf_handler.get_sample_map() fix_m, var_m = sdrf_handler.get_mods() diff --git a/quantmsio/core/mztab.py b/quantmsio/core/mztab.py index d3a00c4..afce4dc 100644 --- a/quantmsio/core/mztab.py +++ b/quantmsio/core/mztab.py @@ -151,9 +151,7 @@ def add_peptide_to_index(self, peptide_key: str, peptide_value: dict): if self._use_cache: if self._peptide_index.contains(peptide_key): pep = self._peptide_index.get_item(peptide_key) - if float(peptide_value["peptide_qvalue"]) < float( - pep["peptide_qvalue"] - ): + if float(peptide_value["peptide_qvalue"]) < float(pep["peptide_qvalue"]): self._peptide_index.add_item(peptide_key, peptide_value) else: self._peptide_index.add_item(peptide_key, peptide_value) @@ -190,28 +188,16 @@ def add_psm_to_count( if self._use_cache: if self._peptide_index.contains(psm_key): psm = self._peptide_index.get_item(psm_key) - peptide_protein_accession = ( - psm["protein_accession"] if "protein_accession" in psm else None - ) - peptide_protein_start = ( - psm["psm_protein_start"] if "psm_protein_start" in psm else None - ) - peptide_protein_end = ( - psm["psm_protein_end"] if "psm_protein_end" in psm else None - ) + peptide_protein_accession = psm["protein_accession"] if "protein_accession" in psm else None + peptide_protein_start = psm["psm_protein_start"] if "psm_protein_start" in psm else None + peptide_protein_end = psm["psm_protein_end"] if "psm_protein_end" in psm else None if posterior_error_probability is not None: if psm["posterior_error_probability"] is None or ( - np.float64(posterior_error_probability) - < psm["posterior_error_probability"] + np.float64(posterior_error_probability) < psm["posterior_error_probability"] ): - psm["posterior_error_probability"] = np.float64( - posterior_error_probability - ) + 
psm["posterior_error_probability"] = np.float64(posterior_error_probability) - if ( - peptide_protein_start is not None - and peptide_protein_end is not None - ) and [ + if (peptide_protein_start is not None and peptide_protein_end is not None) and [ peptide_protein_accession, peptide_protein_start, peptide_protein_end, @@ -220,9 +206,7 @@ def add_psm_to_count( protein_start, protein_end, ]: - raise Exception( - "The protein information is different for the same psm key" - ) + raise Exception("The protein information is different for the same psm key") else: psm["psm_protein_start"] = protein_start psm["psm_protein_end"] = protein_end @@ -265,10 +249,7 @@ def add_psm_to_count( and posterior_error_probability < psm["posterior_error_probability"] ): psm["posterior_error_probability"] = posterior_error_probability - if ( - peptide_protein_start is not None - and peptide_protein_end is not None - ) and [ + if (peptide_protein_start is not None and peptide_protein_end is not None) and [ peptide_protein_accession, peptide_protein_start, peptide_protein_end, @@ -277,9 +258,7 @@ def add_psm_to_count( protein_start, protein_end, ]: - raise Exception( - "The protein information is different for the same psm key" - ) + raise Exception("The protein information is different for the same psm key") else: psm["psm_protein_start"] = protein_start psm["psm_protein_end"] = protein_end @@ -310,9 +289,7 @@ def add_psm_to_count( self._peptide_index[psm_key] = psm - def create_mztab_index( - self, mztab_file: str, qvalue_index: bool = True, psm_count_index: bool = True - ): + def create_mztab_index(self, mztab_file: str, qvalue_index: bool = True, psm_count_index: bool = True): """ create an index for a mztab file; the index contains a structure with the position of each psm, peptide and protein in the file. 
@@ -332,9 +309,7 @@ def create_mztab_index( if line.startswith("MTD") and "search_engine_score" in line: self._get_search_engine_scores(line) if line.startswith("MTD") and "_mod[": - self._modifications = fetch_modifications_from_mztab_line( - line, self._modifications - ) + self._modifications = fetch_modifications_from_mztab_line(line, self._modifications) elif line.startswith("PRH"): logger.info("-- End of the Metadata section of the mztab file -- ") protein_columns = line.split("\t") @@ -345,9 +320,7 @@ def create_mztab_index( protein = fetch_protein_from_mztab_line(es) self._protein_details[protein["accession"]] = [protein["score"]] elif line.startswith("PEH"): - logger.info( - "-- All proteins have been read, starting peptide section -- " - ) + logger.info("-- All proteins have been read, starting peptide section -- ") peptide_columns = line.split("\t") elif line.startswith("PEP"): peptide_info = line.split("\t") @@ -373,9 +346,7 @@ def create_mztab_index( ) self.add_peptide_to_index(peptide_key, peptide_value) elif line.startswith("PSH"): - logger.info( - "-- All peptides have been read, starting psm section -- " - ) + logger.info("-- All peptides have been read, starting psm section -- ") psm_columns = line.split("\t") elif line.startswith("PSM"): psm_info = line.split("\t") @@ -385,9 +356,7 @@ def create_mztab_index( ms_runs=self._ms_runs, modifications_definition=self._modifications, ) - psm_key = get_key_peptide_combination( - psm["peptidoform"], psm["charge"] - ) + psm_key = get_key_peptide_combination(psm["peptidoform"], psm["charge"]) self.add_psm_to_count( psm_key=psm_key, protein_accession=psm["accession"], @@ -418,14 +387,10 @@ def print_all_peptides(self): """ if self._use_cache: for key in self._peptide_index.get_all_keys(): - logger.debug( - "Key {} Value {}".format(key, self._peptide_index.get_item(key)) - ) + logger.debug("Key {} Value {}".format(key, self._peptide_index.get_item(key))) else: for peptide in self._peptide_index.keys(): - 
logger.debug( - "Key {} Value {}".format(peptide, self._peptide_index[peptide]) - ) + logger.debug("Key {} Value {}".format(peptide, self._peptide_index[peptide])) def print_mztab_stats(self): """ @@ -465,14 +430,8 @@ def get_protein_qvalue_from_index(self, protein_accession: str): :param protein_accession: protein accession :return: protein qvalue object """ - if ( - protein_accession is None - or self._protein_details is None - or protein_accession not in self._protein_details - ): - raise Exception( - "Protein accession to be search is None or the protein accession is not in the index" - ) + if protein_accession is None or self._protein_details is None or protein_accession not in self._protein_details: + raise Exception("Protein accession to be search is None or the protein accession is not in the index") return self._protein_details[protein_accession] def get_protein_qvalue_from_index_list(self, protein_accession_list: list): @@ -483,9 +442,7 @@ def get_protein_qvalue_from_index_list(self, protein_accession_list: list): """ if protein_accession_list is None or self._protein_details is None: - raise Exception( - "Protein accession to be search is None or the protein accession is not in the index" - ) + raise Exception("Protein accession to be search is None or the protein accession is not in the index") # # If the number of proteins where the peptide maps is to big, we will have an error in the permutations. 
# if len(protein_accession_list) > 4: @@ -511,13 +468,9 @@ def get_protein_qvalue_from_index_list(self, protein_accession_list: list): def _get_search_engine_scores(self, line: str): if line.lower().__contains__("protein_search_engine_score"): - self._search_engine_scores["protein_score"] = parse_score_name_in_mztab( - line - ) + self._search_engine_scores["protein_score"] = parse_score_name_in_mztab(line) elif line.lower().__contains__("peptide_search_engine_score"): - self._search_engine_scores["peptide_score"] = parse_score_name_in_mztab( - line - ) + self._search_engine_scores["peptide_score"] = parse_score_name_in_mztab(line) elif line.lower().__contains__("psm_search_engine_score"): self._search_engine_scores["psm_score"] = parse_score_name_in_mztab(line) @@ -534,9 +487,7 @@ def create_mztab_psm_iterator(self, mztab_file: str): :return: None """ self._protein_details = {} - self._psm_iterator = open( - mztab_file, "r", buffering=calculate_buffer_size(mztab_file) - ) + self._psm_iterator = open(mztab_file, "r", buffering=calculate_buffer_size(mztab_file)) line = self._psm_iterator.readline() while line != "": line = line.strip() @@ -545,9 +496,7 @@ def create_mztab_psm_iterator(self, mztab_file: str): if line.startswith("MTD") and "search_engine_score" in line: self._get_search_engine_scores(line) if line.startswith("MTD") and "_mod[": - self._modifications = fetch_modifications_from_mztab_line( - line, self._modifications - ) + self._modifications = fetch_modifications_from_mztab_line(line, self._modifications) elif line.startswith("PRH"): logger.info("-- End of the Metadata section of the mztab file -- ") protein_columns = line.split("\t") @@ -557,15 +506,9 @@ def create_mztab_psm_iterator(self, mztab_file: str): if PROTEIN_DETAILS not in line: es = dict(zip(protein_columns, protein_info)) protein = fetch_protein_from_mztab_line(es) - protein["accession"] = standardize_protein_string_accession( - protein["accession"], sorted=True - ) + protein["accession"] = 
standardize_protein_string_accession(protein["accession"], sorted=True) self._protein_details[protein["accession"]] = [protein["score"]] - logger.info( - "Added protein to the protein index -- {}".format( - protein["accession"] - ) - ) + logger.info("Added protein to the protein index -- {}".format(protein["accession"])) elif line.startswith("PSH"): logger.info("-- All peptides have been read, starting psm section -- ") self._psm_columns = line.split("\t") @@ -579,9 +522,7 @@ def read_next_psm(self): :return: psm dictionary """ if self._psm_iterator is None: - raise Exception( - "The mztab file has not been loaded or the iterator has not been created" - ) + raise Exception("The mztab file has not been loaded or the iterator has not been created") line = self._psm_iterator.readline() if line != "" and line.startswith("PSM"): diff --git a/quantmsio/core/openms.py b/quantmsio/core/openms.py index 783ca07..b1a2486 100644 --- a/quantmsio/core/openms.py +++ b/quantmsio/core/openms.py @@ -13,9 +13,7 @@ def __init__(self) -> None: self._consensus_xml_path = None self._spec_lookup = None - def get_spectrum_from_scan( - self, mzml_path: str, scan_number: int - ) -> Tuple[Any, Any]: + def get_spectrum_from_scan(self, mzml_path: str, scan_number: int) -> Tuple[Any, Any]: """ Get a spectrum from a mzML file using the scan number :param mzml_path: path to the mzML file @@ -30,18 +28,14 @@ def get_spectrum_from_scan( try: index = self._spec_lookup.findByScanNumber(scan_number) except IndexError: - message = ( - "scan_number" + str(scan_number) + "not found in file: " + mzml_path - ) + message = "scan_number" + str(scan_number) + "not found in file: " + mzml_path warnings.warn(message, category=None, stacklevel=1, source=None) return [], [] spectrum = self._mzml_exp.getSpectrum(index) spectrum_mz, spectrum_intensities = spectrum.get_peaks() return spectrum_mz, spectrum_intensities - def get_intensity_map( - self, consensusxml_path: str, experiment_type: str = None - ) -> dict: + 
def get_intensity_map(self, consensusxml_path: str, experiment_type: str = None) -> dict: """ Get the intensity map from a consensusxml file. The intensity map is a dictionary with the following structure: - key: peptide sequence + ":_:" + charge + ":_:" + reference file @@ -65,9 +59,7 @@ def get_intensity_map( return self._get_intensity_map_tmt_or_itraq(df, experiment_type) elif experiment_type is not None and "ITRAQ" in experiment_type.upper(): return self._get_intensity_map_tmt_or_itraq(df, experiment_type) - return self._get_intensity_map_lfq( - df - ) # If not experiment type is provided, we assume it is label free + return self._get_intensity_map_lfq(df) # If no experiment type is provided, we assume it is label free @staticmethod def _get_intensity_map_lfq(df): @@ -77,17 +69,13 @@ def _get_intensity_map_lfq(df): """ :return: intensity map """ peptide_columns = ["sequence", "charge", "RT", "mz", "quality"] - intensity_columns = [ - column for column in df.columns if column not in peptide_columns - ] + intensity_columns = [column for column in df.columns if column not in peptide_columns] intensity_map = {} for index, row in df.iterrows(): for column in intensity_columns: if np.float64(row[f"{column}"]) > 0.0: reference_file = column.split(".")[0] - key = ( - row.sequence + ":_:" + str(row.charge) + ":_:" + reference_file - ) + key = row.sequence + ":_:" + str(row.charge) + ":_:" + reference_file if key not in intensity_map: intensity_map[key] = { "rt": row.RT, @@ -111,30 +99,18 @@ _get_intensity_map_tmt_or_itraq(df, experiment_type): :return: intensity map """ peptide_columns = ["sequence", "charge", "RT", "mz", "quality", "file"] - intensity_columns = [ - column for column in df.columns if column not in peptide_columns - ] + intensity_columns = [column for column in df.columns if column not in peptide_columns] intensity_map = {} for index, row in df.iterrows(): for column in intensity_columns: if np.float64(row[f"{column}"]) > 0.0: reference_file = 
row.file.split(".")[0] if "TMT" in experiment_type.upper(): - channel = ( - "TMT" + column.split("_")[1] - ) # A TMT channel has in consesusXML the following format: + channel = "TMT" + column.split("_")[1] # A TMT channel has in consensusXML the following format: # tmt10plex_129N -> TMT129N else: channel = "ITRAQ" + column.split("_")[1] - key = ( - row.sequence - + ":_:" - + str(row.charge) - + ":_:" - + reference_file - + ":_:" - + channel - ) + key = row.sequence + ":_:" + str(row.charge) + ":_:" + reference_file + ":_:" + channel if key not in intensity_map: intensity_map[key] = { "rt": row.RT, diff --git a/quantmsio/core/parquet_handler.py b/quantmsio/core/parquet_handler.py index 6f9ebed..89c6632 100644 --- a/quantmsio/core/parquet_handler.py +++ b/quantmsio/core/parquet_handler.py @@ -20,9 +20,7 @@ def _create_schema(self) -> pa.Schema: """ pass - def write_single_file_parquet( - self, table: pa.Table, parquet_output: str = None, write_metadata: bool = False - ): + def write_single_file_parquet(self, table: pa.Table, parquet_output: str = None, write_metadata: bool = False): """ Write a single file parquet file. If parquet_output is not specified, the parquet_path will be used. Pyarrow is still not writing the metadata of each field in the schema ( diff --git a/quantmsio/core/plots.py b/quantmsio/core/plots.py index 4a02860..08fae22 100644 --- a/quantmsio/core/plots.py +++ b/quantmsio/core/plots.py @@ -7,9 +7,7 @@ import seaborn as sns -def plot_distribution_of_ibaq( - ibaq_path: str, save_path: str = None, selected_column: str = None -) -> None: +def plot_distribution_of_ibaq(ibaq_path: str, save_path: str = None, selected_column: str = None) -> None: """ This function plots the distribution of the protein IBAQ values. 
:param ibaq_path: ibaq file path @@ -25,9 +23,7 @@ def plot_distribution_of_ibaq( elif "IbaqLog" in columns: selected_column = "IbaqLog" if selected_column is not None: - fig = sns.histplot( - data=df[selected_column], stat="frequency", kde=True, color="#209D73" - ) + fig = sns.histplot(data=df[selected_column], stat="frequency", kde=True, color="#209D73") fig.set(xlabel=selected_column, ylabel="Frequency") fig.set_title("Distribution of IBAQ values using {}".format(selected_column)) sns.despine(ax=fig, top=True, right=True) @@ -38,9 +34,7 @@ def plot_distribution_of_ibaq( raise ValueError("No IBAQ column found in the ibaq file") -def plot_peptides_of_lfq_condition( - psm_parquet_path: str, sdrf_path: str, save_path: str = None -) -> None: +def plot_peptides_of_lfq_condition(psm_parquet_path: str, sdrf_path: str, save_path: str = None) -> None: """ this function plots the number of peptides for each condition in a LFQ (Label-Free Quantification) experiment. @@ -60,9 +54,7 @@ def plot_peptides_of_lfq_condition( use_cols = [col for col in sdrf.columns if col.startswith("factor value")] use_cols.append("comment[data file]") sdrf = sdrf[use_cols] - sdrf["comment[data file]"] = sdrf["comment[data file]"].apply( - lambda x: x.split(".")[0] - ) + sdrf["comment[data file]"] = sdrf["comment[data file]"].apply(lambda x: x.split(".")[0]) sdrf.rename(columns={"comment[data file]": "reference_file_name"}, inplace=True) df = df.merge(sdrf, on="reference_file_name", how="left") df.columns = ["reference", "condition"] @@ -90,18 +82,14 @@ def plot_peptides_of_lfq_condition( df = pd.DataFrame([list(f_count.values)], columns=f_count.index) num_subplots = math.ceil(len(f_count) / 20) columns_per_subplot = 20 - fig, axes = plt.subplots( - nrows=num_subplots, ncols=1, figsize=(12, 4 * num_subplots) - ) + fig, axes = plt.subplots(nrows=num_subplots, ncols=1, figsize=(12, 4 * num_subplots)) for i in range(num_subplots): start_col = i * columns_per_subplot end_col = (i + 1) * 
columns_per_subplot subset_data = df.iloc[:, start_col:end_col] sns.barplot(data=subset_data, ax=axes[i]) - axes[i].set_title( - "Condition vs Number of Peptides {}-{}".format(start_col + 1, end_col) - ) + axes[i].set_title("Condition vs Number of Peptides {}-{}".format(start_col + 1, end_col)) axes[i].set(xlabel=None) for tick in axes[i].get_xticklabels(): tick.set_rotation(30) @@ -112,9 +100,7 @@ def plot_peptides_of_lfq_condition( return fig -def plot_intensity_distribution_of_samples( - feature_path: str, save_path: str = None, num_samples: int = 10 -) -> None: +def plot_intensity_distribution_of_samples(feature_path: str, save_path: str = None, num_samples: int = 10) -> None: """ Plot the distribution of intensities for different samples. :param feature_path: path to the feature file @@ -132,9 +118,7 @@ def plot_intensity_distribution_of_samples( df["intensity"] = np.log(df["intensity"]) plt.figure(dpi=500, figsize=(12, 8)) - fig = sns.kdeplot( - data=df, x="intensity", hue="sample_accession", palette="Paired", linewidth=2 - ) + fig = sns.kdeplot(data=df, x="intensity", hue="sample_accession", palette="Paired", linewidth=2) sns.despine(ax=fig, top=True, right=True) fig.set(ylabel=None) if save_path: @@ -142,9 +126,7 @@ def plot_intensity_distribution_of_samples( return fig -def plot_peptide_distribution_of_protein( - feature_path: str, save_path: str = None, num_samples: int = 20 -) -> None: +def plot_peptide_distribution_of_protein(feature_path: str, save_path: str = None, num_samples: int = 20) -> None: """ Bar graphs of peptide counts for different samples. 
:param feature_path: path to the feature file @@ -160,9 +142,7 @@ def plot_peptide_distribution_of_protein( max_p = max(df["peptides"]) + 1 / 3 * max(df["peptides"]) plt.figure(dpi=500, figsize=(12, 8)) - fig = sns.barplot( - data=df, x="sample", y="peptides", color="#82C3A3", legend=False, width=0.7 - ) + fig = sns.barplot(data=df, x="sample", y="peptides", color="#82C3A3", legend=False, width=0.7) fig.set_ylim(0, max_p) fig.set(xlabel=None) sns.despine(ax=fig, top=True, right=True) @@ -173,9 +153,7 @@ def plot_peptide_distribution_of_protein( return fig -def plot_intensity_box_of_samples( - feature_path: str, save_path: str = None, num_samples: int = 10 -) -> None: +def plot_intensity_box_of_samples(feature_path: str, save_path: str = None, num_samples: int = 10) -> None: """ Boxplot of peptide intensity distribution for different samples. :param feature_path: path to the feature file diff --git a/quantmsio/core/project.py b/quantmsio/core/project.py index 68c1d57..4562e62 100644 --- a/quantmsio/core/project.py +++ b/quantmsio/core/project.py @@ -23,9 +23,7 @@ def check_directory(output_folder: str, project_accession: str = None): :param project_accession: Prefix of the Json file needed to generate the file name """ if project_accession is None: - project_json = [ - f for f in os.listdir(output_folder) if f.endswith("project.json") - ] + project_json = [f for f in os.listdir(output_folder) if f.endswith("project.json")] if len(project_json) == 1: json_path = output_folder + "/" + project_json[0] project = ProjectHandler(project_json_file=json_path) @@ -34,9 +32,7 @@ def check_directory(output_folder: str, project_accession: str = None): raise Exception(f"More than one project json file found in {output_folder}") else: if os.path.exists(output_folder): - project_json = [ - f for f in os.listdir(output_folder) if f.endswith("project.json") - ] + project_json = [f for f in os.listdir(output_folder) if f.endswith("project.json")] for json_file in project_json: 
json_path = output_folder + "/" + json_file project = ProjectHandler(project_json_file=json_path) @@ -113,21 +109,11 @@ def populate_from_pride_archive(self): pride_data = response.json() self.project.project_info["project_accession"] = self.project_accession self.project.project_info["project_title"] = pride_data["title"] - self.project.project_info["project_description"] = pride_data[ - "projectDescription" - ] - self.project.project_info["project_sample_description"] = pride_data[ - "sampleProcessingProtocol" - ] - self.project.project_info["project_data_description"] = pride_data[ - "dataProcessingProtocol" - ] - self.project.project_info["project_pubmed_id"] = get_pubmed_id_pride_json( - pride_data - ) - self.project.project_info["experiment_type"] = ( - get_set_of_experiment_keywords(pride_data) - ) + self.project.project_info["project_description"] = pride_data["projectDescription"] + self.project.project_info["project_sample_description"] = pride_data["sampleProcessingProtocol"] + self.project.project_info["project_data_description"] = pride_data["dataProcessingProtocol"] + self.project.project_info["project_pubmed_id"] = get_pubmed_id_pride_json(pride_data) + self.project.project_info["experiment_type"] = get_set_of_experiment_keywords(pride_data) else: logger.info("A non-pride project is being created.") @@ -149,13 +135,9 @@ def add_sdrf_project_properties(self, sdrf: SDRFHandler): self.project.project_info["cell_lines"] = sdrf.get_cell_lines() self.project.project_info["instruments"] = sdrf.get_instruments() self.project.project_info["enzymes"] = sdrf.get_enzymes() - self.project.project_info["acquisition_properties"] = ( - sdrf.get_acquisition_properties() - ) + self.project.project_info["acquisition_properties"] = sdrf.get_acquisition_properties() - def add_quantms_file( - self, file_name: str, file_category: str, replace_existing: bool = True - ): + def add_quantms_file(self, file_name: str, file_category: str, replace_existing: bool = True): """ Add 
a quantms file to the project information. The file name will be generated automatically. Read more about the quantms file naming convention in the docs folder of this repository @@ -166,22 +148,16 @@ def add_quantms_file( """ if "quantms_files" not in self.project.project_info: self.project.project_info["quantms_files"] = [] - self.project.project_info["quantms_files"].append( - {file_category: file_name} - ) + self.project.project_info["quantms_files"].append({file_category: file_name}) elif replace_existing: obj_index = None for index, obj in enumerate(self.project.project_info["quantms_files"]): if file_category in obj: obj_index = index if obj_index is not None: - self.project.project_info["quantms_files"][obj_index][ - file_category - ] = file_name + self.project.project_info["quantms_files"][obj_index][file_category] = file_name else: - self.project.project_info["quantms_files"].append( - {file_category: file_name} - ) + self.project.project_info["quantms_files"].append({file_category: file_name}) else: obj_index = None for index, obj in enumerate(self.project.project_info["quantms_files"]): @@ -189,27 +165,17 @@ def add_quantms_file( obj_index = index if obj_index is not None: if isinstance( - self.project.project_info["quantms_files"][obj_index][ - file_category - ], + self.project.project_info["quantms_files"][obj_index][file_category], list, ): - self.project.project_info["quantms_files"][obj_index][ - file_category - ].append(file_name) + self.project.project_info["quantms_files"][obj_index][file_category].append(file_name) else: - self.project.project_info["quantms_files"][obj_index][ - file_category - ] = self.project.project_info["quantms_files"][obj_index][ - file_category - ].split() - self.project.project_info["quantms_files"][obj_index][ - file_category - ].append(file_name) + self.project.project_info["quantms_files"][obj_index][file_category] = self.project.project_info[ + "quantms_files" + ][obj_index][file_category].split() + 
self.project.project_info["quantms_files"][obj_index][file_category].append(file_name) else: - self.project.project_info["quantms_files"].append( - {file_category: [file_name]} - ) + self.project.project_info["quantms_files"].append({file_category: [file_name]}) def register_file(self, output_path, extension): extension_map = { @@ -247,7 +213,9 @@ def save_project_info( if output_folder is None: output_filename = f"{output_prefix_file}-{str(uuid.uuid4())}{ProjectHandler.PROJECT_EXTENSION}" else: - output_filename = f"{output_folder}/{output_prefix_file}-{str(uuid.uuid4())}{ProjectHandler.PROJECT_EXTENSION}" + output_filename = ( + f"{output_folder}/{output_prefix_file}-{str(uuid.uuid4())}{ProjectHandler.PROJECT_EXTENSION}" + ) with open(output_filename, "w") as json_file: json.dump(self.project.project_info, json_file, indent=4) @@ -271,9 +239,7 @@ def populate_from_sdrf(self, sdrf_file: str): sdrf = SDRFHandler(sdrf_file) self.add_sdrf_project_properties(sdrf) - def add_sdrf_file( - self, sdrf_file_path: str, output_folder: str, delete_existing: bool = True - ) -> None: + def add_sdrf_file(self, sdrf_file_path: str, output_folder: str, delete_existing: bool = True) -> None: """ Copy the given file to the project folder and add the file name to the project information. 
:param sdrf_file_path: SDRF file path @@ -295,16 +261,12 @@ def add_sdrf_file( if output_folder is None: output_filename_path = output_filename else: - output_filename_path = ( - f"{output_folder}/{base_name}-{str(uuid.uuid4())}{extension}" - ) + output_filename_path = f"{output_folder}/{base_name}-{str(uuid.uuid4())}{extension}" shutil.copyfile(sdrf_file_path, output_filename_path) # self.project.project_info["sdrf_file"] = output_filename self.register_file(output_filename, ".sdrf.tsv") - logger.info( - f"SDRF file copied to {output_filename} and added to the project information" - ) + logger.info(f"SDRF file copied to {output_filename} and added to the project information") class ProjectDefinition: diff --git a/quantmsio/core/protein.py b/quantmsio/core/protein.py index 511a4f6..5d30adb 100644 --- a/quantmsio/core/protein.py +++ b/quantmsio/core/protein.py @@ -34,19 +34,13 @@ class ProteinHandler(ParquetHandler): pa.field( "abundance", pa.float64(), - metadata={ - "description": "protein abundance value selected by the workflow (e.g. MaxLFQ, iBAQ, etc)" - }, - ), - pa.field( - "global_qvalue", pa.float64(), metadata={"description": "global q-value"} + metadata={"description": "protein abundance value selected by the workflow (e.g. 
MaxLFQ, iBAQ, etc)"}, ), + pa.field("global_qvalue", pa.float64(), metadata={"description": "global q-value"}), pa.field( "is_decoy", pa.int32(), - metadata={ - "description": "flag indicating if the protein is a decoy (1 is decoy, 0 is not decoy)" - }, + metadata={"description": "flag indicating if the protein is a decoy (1 is decoy, 0 is not decoy)"}, ), pa.field( "best_id_score", @@ -66,23 +60,17 @@ class ProteinHandler(ParquetHandler): pa.field( "number_of_peptides", pa.int32(), - metadata={ - "description": "number of peptides associated with the protein in the given sample" - }, + metadata={"description": "number of peptides associated with the protein in the given sample"}, ), pa.field( "number_of_psms", pa.int32(), - metadata={ - "description": "number of peptide spectrum matches in the given sample" - }, + metadata={"description": "number of peptide spectrum matches in the given sample"}, ), pa.field( "number_of_unique_peptides", pa.int32(), - metadata={ - "description": "number of unique peptides associated with the protein" - }, + metadata={"description": "number of unique peptides associated with the protein"}, ), pa.field( "protein_descriptions", @@ -97,9 +85,7 @@ class ProteinHandler(ParquetHandler): pa.field( "ribaq", pa.float64(), - metadata={ - "description": "normalized intensity-based absolute quantification value" - }, + metadata={"description": "normalized intensity-based absolute quantification value"}, ), pa.field( "intensity", diff --git a/quantmsio/core/psm.py b/quantmsio/core/psm.py index fcf8643..2643153 100644 --- a/quantmsio/core/psm.py +++ b/quantmsio/core/psm.py @@ -62,25 +62,19 @@ class PSMHandler(ParquetHandler): pa.field( "protein_global_qvalue", pa.float64(), - metadata={ - "description": "global q-value of the associated protein or protein group" - }, + metadata={"description": "global q-value of the associated protein or protein group"}, ), pa.field( "unique", pa.int32(), - metadata={ - "description": "if the peptide is unique 
to a particular protein" - }, + metadata={"description": "if the peptide is unique to a particular protein"}, ), pa.field( "modifications", pa.list_(pa.string()), metadata={"description": "peptide modifications"}, ), - pa.field( - "retention_time", pa.float64(), metadata={"description": "retention time"} - ), + pa.field("retention_time", pa.float64(), metadata={"description": "retention time"}), pa.field( "charge", pa.int32(), @@ -106,15 +100,11 @@ class PSMHandler(ParquetHandler): pa.float64(), metadata={"description": "posterior error probability"}, ), - pa.field( - "global_qvalue", pa.float64(), metadata={"description": "global q-value"} - ), + pa.field("global_qvalue", pa.float64(), metadata={"description": "global q-value"}), pa.field( "is_decoy", pa.int32(), - metadata={ - "description": "flag indicating if the feature is a decoy (1 is decoy, 0 is not decoy)" - }, + metadata={"description": "flag indicating if the feature is a decoy (1 is decoy, 0 is not decoy)"}, ), pa.field( "id_scores", @@ -208,22 +198,12 @@ def convert_mztab_to_psm( for it in iter(mztab_handler.read_next_psm, None): if verbose: - logger.info( - "Sequence: {} -- Protein: {}".format( - it["sequence"], it["accession"] - ) - ) - psm_list.append( - self._transform_psm_from_mztab(psm=it, mztab_handler=mztab_handler) - ) - if ( - len(psm_list) == batch_size and batch_count < batches - ): # write in batches + logger.info("Sequence: {} -- Protein: {}".format(it["sequence"], it["accession"])) + psm_list.append(self._transform_psm_from_mztab(psm=it, mztab_handler=mztab_handler)) + if len(psm_list) == batch_size and batch_count < batches: # write in batches feature_table = self._create_psm_table(psm_list) if not pq_writer: - pq_writer = pq.ParquetWriter( - self.parquet_path, feature_table.schema - ) + pq_writer = pq.ParquetWriter(self.parquet_path, feature_table.schema) pq_writer.write_table(feature_table) psm_list = [] batch_count += 1 @@ -238,17 +218,11 @@ def convert_mztab_to_psm( if pq_writer: 
pq_writer.close() - logger.info( - "The parquet file was generated in: {}".format(self.parquet_path) - ) + logger.info("The parquet file was generated in: {}".format(self.parquet_path)) else: convert = PsmInMemory(self.schema) - convert.write_feature_to_file( - mztab_path, self.parquet_path, chunksize=batch_size - ) - logger.info( - "The parquet file was generated in: {}".format(self.parquet_path) - ) + convert.write_feature_to_file(mztab_path, self.parquet_path, chunksize=batch_size) + logger.info("The parquet file was generated in: {}".format(self.parquet_path)) @staticmethod def _transform_psm_from_mztab(psm, mztab_handler) -> dict: @@ -270,9 +244,7 @@ def _transform_psm_from_mztab(psm, mztab_handler) -> dict: protein_accession_list=protein_accession_nredundant ) protein_qvalue = ( - None - if (protein_qvalue is None or protein_qvalue[0] == "null") - else np.float64(protein_qvalue[0]) + None if (protein_qvalue is None or protein_qvalue[0] == "null") else np.float64(protein_qvalue[0]) ) retention_time = ( @@ -307,27 +279,18 @@ def _transform_psm_from_mztab(psm, mztab_handler) -> dict: else None ) - modification_list = ( - None if modifications_string is None else modifications_string.split(",") - ) + modification_list = None if modifications_string is None else modifications_string.split(",") posterior_error_probability = ( None - if ( - "posterior_error_probability" not in psm - or psm["posterior_error_probability"] is None - ) + if ("posterior_error_probability" not in psm or psm["posterior_error_probability"] is None) else np.float64(psm["posterior_error_probability"]) ) global_qvalue = ( - None - if ("global_qvalue" not in psm or psm["global_qvalue"] is None) - else np.float64(psm["global_qvalue"]) + None if ("global_qvalue" not in psm or psm["global_qvalue"] is None) else np.float64(psm["global_qvalue"]) ) - consensus_support = ( - None if psm["consensus_support"] else np.float32(psm["consensus_support"]) - ) + consensus_support = None if 
psm["consensus_support"] else np.float32(psm["consensus_support"]) psm_score = np.float64(psm["score"]) peptide_score_name = mztab_handler.get_search_engine_scores()["psm_score"] @@ -343,9 +306,7 @@ def _transform_psm_from_mztab(psm, mztab_handler) -> dict: "retention_time": retention_time, "charge": charge, "calc_mass_to_charge": calc_mass_to_charge, - "peptidoform": psm[ - "proforma_peptidoform" - ], # Peptidoform in proforma notation + "peptidoform": psm["proforma_peptidoform"], # Peptidoform in proforma notation "posterior_error_probability": posterior_error_probability, "global_qvalue": global_qvalue, "is_decoy": int(psm["is_decoy"]), diff --git a/quantmsio/core/psm_in_memory.py b/quantmsio/core/psm_in_memory.py index 12c0180..a8aefed 100644 --- a/quantmsio/core/psm_in_memory.py +++ b/quantmsio/core/psm_in_memory.py @@ -63,9 +63,7 @@ def generate_psm_parquet(self, mztab_path, chunksize=1000000): self._modifications = get_modifications(mztab_path) for psm in psms: if "opt_global_cv_MS:1000889_peptidoform_sequence" not in psm.columns: - psm.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = psm[ - ["modifications", "sequence"] - ].apply( + psm.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = psm[["modifications", "sequence"]].apply( lambda row: get_petidoform_msstats_notation( row["sequence"], row["modifications"], self._modifications ), @@ -76,12 +74,8 @@ def generate_psm_parquet(self, mztab_path, chunksize=1000000): for col in self._psm_usecols: if col not in psm.columns: psm.loc[:, col] = None - psm.loc[:, "scan_number"] = psm["spectra_ref"].apply( - lambda x: generate_scan_number(x) - ) - psm["spectra_ref"] = psm["spectra_ref"].apply( - lambda x: self._ms_runs[x.split(":")[0]] - ) + psm.loc[:, "scan_number"] = psm["spectra_ref"].apply(lambda x: generate_scan_number(x)) + psm["spectra_ref"] = psm["spectra_ref"].apply(lambda x: self._ms_runs[x.split(":")[0]]) self._psm_usecols.append("scan_number") psm = psm[self._psm_usecols] 
self._psm_usecols.pop() @@ -153,33 +147,17 @@ def convert_to_parquet(self, res): res["id_scores"] = res["id_scores"].apply(lambda x: x.split(",")) res["sequence"] = res["sequence"].astype(str) res["protein_accessions"] = res["protein_accessions"].str.split(";") - res["protein_start_positions"] = ( - res["protein_start_positions"].apply(self.__split_start_or_end).to_list() - ) - res["protein_end_positions"] = ( - res["protein_end_positions"].apply(self.__split_start_or_end).to_list() - ) + res["protein_start_positions"] = res["protein_start_positions"].apply(self.__split_start_or_end).to_list() + res["protein_end_positions"] = res["protein_end_positions"].apply(self.__split_start_or_end).to_list() res["protein_global_qvalue"] = res["protein_global_qvalue"].astype(float) - res["unique"] = ( - res["unique"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") - ) - res["modifications"] = res["modifications"].apply( - lambda x: self._feature._generate_modification_list(x) - ) - res["charge"] = ( - res["charge"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") - ) + res["unique"] = res["unique"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") + res["modifications"] = res["modifications"].apply(lambda x: self._feature._generate_modification_list(x)) + res["charge"] = res["charge"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") res["exp_mass_to_charge"] = res["exp_mass_to_charge"].astype(float) res["calc_mass_to_charge"] = res["calc_mass_to_charge"].astype(float) - res["posterior_error_probability"] = res["posterior_error_probability"].astype( - float - ) + res["posterior_error_probability"] = res["posterior_error_probability"].astype(float) res["global_qvalue"] = res["global_qvalue"].astype(float) - res["is_decoy"] = ( - res["is_decoy"] - .map(lambda x: pd.NA if pd.isna(x) else int(x)) - .astype("Int32") - ) + res["is_decoy"] = res["is_decoy"].map(lambda x: pd.NA if pd.isna(x) else int(x)).astype("Int32") 
res["consensus_support"] = res["consensus_support"].astype(float) res["scan_number"] = res["scan_number"].astype(str) diff --git a/quantmsio/core/query.py b/quantmsio/core/query.py index f495c91..d26a6dd 100644 --- a/quantmsio/core/query.py +++ b/quantmsio/core/query.py @@ -32,9 +32,7 @@ def map_spectrum_mz(mz_path: str, scan: str, mzml: dict, mzml_directory: str): mz_path = mzml_directory + mz_path + ".mzML" else: mz_path = mzml_directory + "/" + mz_path + ".mzML" - mz_array, array_intensity = mzml[reference].get_spectrum_from_scan( - mz_path, int(scan) - ) + mz_array, array_intensity = mzml[reference].get_spectrum_from_scan(mz_path, int(scan)) return mz_array, array_intensity, 0 @@ -60,13 +58,9 @@ class Parquet: def __init__(self, parquet_path: str): if os.path.exists(parquet_path): self._path = parquet_path - self.parquet_db = duckdb.connect( - config={"max_memory": "16GB", "worker_threads": 4} - ) + self.parquet_db = duckdb.connect(config={"max_memory": "16GB", "worker_threads": 4}) self.parquet_db = self.parquet_db.execute( - "CREATE VIEW parquet_db AS SELECT * FROM parquet_scan('{}')".format( - parquet_path - ) + "CREATE VIEW parquet_db AS SELECT * FROM parquet_scan('{}')".format(parquet_path) ) else: raise FileNotFoundError(f"the file {parquet_path} does not exist.") @@ -132,9 +126,7 @@ def iter_file(self, file_num: int = 10): :yield: _description_ """ references = self.get_unique_references() - ref_list = [ - references[i : i + file_num] for i in range(0, len(references), file_num) - ] + ref_list = [references[i : i + file_num] for i in range(0, len(references), file_num)] for refs in ref_list: batch_df = self.get_report_from_database(refs) yield batch_df @@ -162,9 +154,7 @@ def inject_spectrum_msg(self, df: pd.DataFrame, mzml_directory: str): ) return df elif "reference_file_name" in df.columns: - df[["mz_array", "intensity_array", "num_peaks"]] = df[ - ["reference_file_name", "scan_number"] - ].apply( + df[["mz_array", "intensity_array", "num_peaks"]] = 
df[["reference_file_name", "scan_number"]].apply( lambda x: map_spectrum_mz( x["reference_file_name"], x["scan_number"], @@ -182,9 +172,7 @@ def inject_position_msg(self, df: pd.DataFrame, protein_dict: dict): :params protein_dict: {protein_accession:seq} :retrun df """ - df[["protein_start_positions", "protein_end_positions"]] = df[ - ["sequence", "protein_accessions"] - ].apply( + df[["protein_start_positions", "protein_end_positions"]] = df[["sequence", "protein_accessions"]].apply( lambda row: fill_start_and_end(row, protein_dict), axis=1, result_type="expand", @@ -206,14 +194,10 @@ def inject_gene_msg( :return df """ map_gene_names = generate_gene_name_map(fasta, map_parameter) - df["gene_names"] = df["protein_accessions"].apply( - lambda x: get_unanimous_name(x, map_gene_names) - ) + df["gene_names"] = df["protein_accessions"].apply(lambda x: get_unanimous_name(x, map_gene_names)) gene_list = self.get_gene_list(map_gene_names) gene_accessions = self.get_gene_accessions(gene_list, species) - df["gene_accessions"] = df["gene_names"].apply( - lambda x: get_gene_accessions(x, gene_accessions) - ) + df["gene_accessions"] = df["gene_names"].apply(lambda x: get_gene_accessions(x, gene_accessions)) return df @@ -221,9 +205,7 @@ def get_protein_dict(self, fasta_path): """ return: protein_map {protein_accession:seq} """ - df = self.parquet_db.sql( - "SELECT DISTINCT protein_accessions FROM parquet_db" - ).df() + df = self.parquet_db.sql("SELECT DISTINCT protein_accessions FROM parquet_db").df() proteins = set() for protein_accessions in df["protein_accessions"].tolist(): proteins.update(set(protein_accessions)) @@ -252,9 +234,7 @@ def get_unique_references(self): """ return: A list of deduplicated reference """ - unique_reference = self.parquet_db.sql( - "SELECT DISTINCT reference_file_name FROM parquet_db" - ).df() + unique_reference = self.parquet_db.sql("SELECT DISTINCT reference_file_name FROM parquet_db").df() return 
unique_reference["reference_file_name"].tolist() @@ -262,9 +242,7 @@ def get_unique_peptides(self): """ return: A list of deduplicated peptides. """ - unique_peps = self.parquet_db.sql( - "SELECT DISTINCT sequence FROM parquet_db" - ).df() + unique_peps = self.parquet_db.sql("SELECT DISTINCT sequence FROM parquet_db").df() return unique_peps["sequence"].tolist() @@ -273,9 +251,7 @@ def get_unique_proteins(self): return: A list of deduplicated proteins. """ - unique_prts = self.parquet_db.sql( - "SELECT DISTINCT protein_accessions FROM parquet_db" - ).df() + unique_prts = self.parquet_db.sql("SELECT DISTINCT protein_accessions FROM parquet_db").df() return unique_prts["protein_accessions"].tolist() @@ -284,9 +260,7 @@ def get_unique_genes(self): return: A list of deduplicated genes. """ - unique_prts = self.parquet_db.sql( - "SELECT DISTINCT gene_names FROM parquet_db" - ).df() + unique_prts = self.parquet_db.sql("SELECT DISTINCT gene_names FROM parquet_db").df() return unique_prts["gene_names"].tolist() @@ -294,9 +268,7 @@ def get_unique_samples(self): """ return: A list of deduplicated sampless. 
""" - unique_peps = self.parquet_db.sql( - "SELECT DISTINCT sample_accession FROM parquet_db" - ).df() + unique_peps = self.parquet_db.sql("SELECT DISTINCT sample_accession FROM parquet_db").df() return unique_peps["sample_accession"].tolist() def query_peptide(self, peptide: str, columns: list = None): @@ -307,9 +279,7 @@ def query_peptide(self, peptide: str, columns: list = None): if check_string("^[A-Z]+$", peptide): cols = ", ".join(columns) if columns and isinstance(columns, list) else "*" - return self.parquet_db.sql( - f"SELECT {cols} FROM parquet_db WHERE sequence ='{peptide}'" - ).df() + return self.parquet_db.sql(f"SELECT {cols} FROM parquet_db WHERE sequence ='{peptide}'").df() else: return KeyError("Illegal peptide!") @@ -322,9 +292,7 @@ def query_peptides(self, peptides: list, columns: list = None): if not check_string("^[A-Z]+$", p): return KeyError("Illegal peptide!") cols = ", ".join(columns) if columns and isinstance(columns, list) else "*" - database = self.parquet_db.sql( - f"select {cols} from parquet_db where sequence IN {tuple(peptides)}" - ) + database = self.parquet_db.sql(f"select {cols} from parquet_db where sequence IN {tuple(peptides)}") return database.df() def query_proteins(self, proteins: list, columns: list = None): @@ -338,9 +306,7 @@ def query_proteins(self, proteins: list, columns: list = None): proteins_key = [f"protein_accessions LIKE '%{p}%'" for p in proteins] query_key = " OR ".join(proteins_key) cols = ", ".join(columns) if columns and isinstance(columns, list) else "*" - database = self.parquet_db.sql( - f"SELECT {cols} FROM parquet_db WHERE {query_key}" - ) + database = self.parquet_db.sql(f"SELECT {cols} FROM parquet_db WHERE {query_key}") return database.df() def query_protein(self, protein: str, columns: list = None): @@ -362,9 +328,7 @@ def get_gene_list(self, map_gene_names: dict): return: unique gene list """ unique_prts = self.get_unique_proteins() - gene_names = [ - get_unanimous_name(proteins, map_gene_names) for 
proteins in unique_prts - ] + gene_names = [get_unanimous_name(proteins, map_gene_names) for proteins in unique_prts] gene_list = list(set([item for sublist in gene_names for item in sublist])) return gene_list @@ -374,9 +338,7 @@ def get_gene_accessions(self, gene_list: list, species: str = "human"): :params gene_list """ mg = mygene.MyGeneInfo() - gene_accessions = mg.querymany( - gene_list, scopes="symbol", species=species, fields="accession" - ) + gene_accessions = mg.querymany(gene_list, scopes="symbol", species=species, fields="accession") gene_accessions_maps = defaultdict(list) for obj in gene_accessions: if "accession" in obj: diff --git a/quantmsio/core/sdrf.py b/quantmsio/core/sdrf.py index d6e477f..2a8d901 100644 --- a/quantmsio/core/sdrf.py +++ b/quantmsio/core/sdrf.py @@ -50,9 +50,7 @@ def get_complex_value_sdrf_column(sdrf_table: DataFrame, column: str) -> list: return [get_name_from_complex_sdrf_value(value) for value in values] -def get_acquisition_method( - sdrf_table: DataFrame, acquisition_method_column: str, column_labeling: str -) -> list: +def get_acquisition_method(sdrf_table: DataFrame, acquisition_method_column: str, column_labeling: str) -> list: """ Get the acquisition method from the SDRF table.Returns the acquisition method and the labeling method. Three different methods are supported: label free, TMT and iTRAQ. 
@@ -62,16 +60,11 @@ def get_acquisition_method( :param acquisition_method_column: acquisition method column name :param column_labeling: labeling column name """ - acquisition_values = get_complex_value_sdrf_column( - sdrf_table, acquisition_method_column - ) + acquisition_values = get_complex_value_sdrf_column(sdrf_table, acquisition_method_column) labeling_values = get_complex_value_sdrf_column(sdrf_table, column_labeling) if len(acquisition_values) == 0 and len(labeling_values) > 0: for labeling_value in labeling_values: - if ( - "label free" in labeling_value.lower() - or "label-free" in labeling_value.lower() - ): + if "label free" in labeling_value.lower() or "label-free" in labeling_value.lower(): acquisition_values.append("Label free") acquisition_values.append("Data-dependent acquisition") elif "tmt" in labeling_value.lower(): @@ -149,31 +142,21 @@ def get_acquisition_properties(self): """ acquisition_values = [] [ - acquisition_values.append( - {"proteomics data acquisition method": acquisition_value} - ) - for acquisition_value in get_acquisition_method( - self.sdrf_table, self.ACQUISITION_METHOD, self.LABELING - ) + acquisition_values.append({"proteomics data acquisition method": acquisition_value}) + for acquisition_value in get_acquisition_method(self.sdrf_table, self.ACQUISITION_METHOD, self.LABELING) ] [ acquisition_values.append({"dissociation method": dissociation_value}) - for dissociation_value in get_complex_value_sdrf_column( - self.sdrf_table, self.DISSOCIATION_METHOD - ) + for dissociation_value in get_complex_value_sdrf_column(self.sdrf_table, self.DISSOCIATION_METHOD) ] [ - acquisition_values.append( - {"precursor mass tolerance": precursor_mass_tolerance_value} - ) + acquisition_values.append({"precursor mass tolerance": precursor_mass_tolerance_value}) for precursor_mass_tolerance_value in get_complex_value_sdrf_column( self.sdrf_table, self.PRECURSOR_MASS_TOLERANCE ) ] [ - acquisition_values.append( - {"fragment mass tolerance": 
fragment_mass_tolerance_value} - ) + acquisition_values.append({"fragment mass tolerance": fragment_mass_tolerance_value}) for fragment_mass_tolerance_value in get_complex_value_sdrf_column( self.sdrf_table, self.FRAGMENT_MASS_TOLERANCE ) @@ -184,9 +167,7 @@ def get_factor_value(self) -> str: """ Get the factor value """ - selected_columns = [ - column for column in self.sdrf_table.columns if "factor value" in column - ] + selected_columns = [column for column in self.sdrf_table.columns if "factor value" in column] if len(selected_columns) != 1: return None values = re.findall(r"\[(.*?)\]", selected_columns[0]) @@ -203,13 +184,9 @@ def extract_feature_properties(self) -> DataFrame: experiment_type = self.get_experiment_type_from_sdrf() sdrf_pd = self.sdrf_table.copy() # type: DataFrame - sdrf_pd["comment[data file]"] = sdrf_pd["comment[data file]"].apply( - lambda x: x.split(".")[0] - ) + sdrf_pd["comment[data file]"] = sdrf_pd["comment[data file]"].apply(lambda x: x.split(".")[0]) - factor_columns = [ - column for column in sdrf_pd.columns if "factor value" in column - ] + factor_columns = [column for column in sdrf_pd.columns if "factor value" in column] if len(factor_columns) != 1: raise ValueError("The number of factor columns should be 1") @@ -263,15 +240,11 @@ def get_experiment_type_from_sdrf(self): The three possible values supported in SDRF are lfq, tmt and itraq. 
""" if self.LABELING not in self.sdrf_table.columns: - raise ValueError( - "The SDRF file provided does not contain the comment[label] column" - ) + raise ValueError("The SDRF file provided does not contain the comment[label] column") labeling_values = get_complex_value_sdrf_column(self.sdrf_table, self.LABELING) if len(labeling_values) == 0: - raise ValueError( - "The SDRF file provided does not contain any comment[label] value" - ) + raise ValueError("The SDRF file provided does not contain any comment[label] value") labeling_values = [i.upper() for i in labeling_values] @@ -287,22 +260,16 @@ def get_experiment_type_from_sdrf(self): elif len(labeling_values) == 6: return "TMT6" else: - raise ValueError( - "The SDRF file provided does not contain a supported TMT comment[label] value" - ) + raise ValueError("The SDRF file provided does not contain a supported TMT comment[label] value") elif len([i for i in labeling_values if "ITRAQ" in i]) > 0: if len(labeling_values) == 4: return "ITRAQ4" elif len(labeling_values) == 8: return "ITRAQ8" else: - raise ValueError( - "The SDRF file provided does not contain a supported iTRAQ comment[label] value" - ) + raise ValueError("The SDRF file provided does not contain a supported iTRAQ comment[label] value") else: - raise ValueError( - "The SDRF file provided does not contain any supported comment[label] value" - ) + raise ValueError("The SDRF file provided does not contain any supported comment[label] value") def get_sample_labels(self): """ @@ -321,27 +288,14 @@ def get_sample_map(self): """ sample_map = {} sdrf_pd = self.sdrf_table.copy() # type: DataFrame - sdrf_pd["comment[data file]"] = sdrf_pd["comment[data file]"].apply( - lambda x: x.split(".")[0] - ) + sdrf_pd["comment[data file]"] = sdrf_pd["comment[data file]"].apply(lambda x: x.split(".")[0]) for index, row in sdrf_pd.iterrows(): - channel = ( - "LABEL FREE SAMPLE" - if "LABEL FREE" in row["comment[label]"].upper() - else row["comment[label]"] - ) + channel = 
"LABEL FREE SAMPLE" if "LABEL FREE" in row["comment[label]"].upper() else row["comment[label]"] if row["comment[data file]"] + ":_:" + channel in sample_map: - if ( - sample_map[row["comment[data file]"] + ":_:" + channel] - != row["source name"] - ): + if sample_map[row["comment[data file]"] + ":_:" + channel] != row["source name"]: raise ValueError("The sample map is not unique") else: - print( - "channel {} for sample {} already in the sample map".format( - channel, row["source name"] - ) - ) + print("channel {} for sample {} already in the sample map".format(channel, row["source name"])) sample_map[row["comment[data file]"] + ":_:" + channel] = row["source name"] return sample_map @@ -350,21 +304,14 @@ def get_mods(self): mod_cols = [ col for col in sdrf.columns - if ( - col.startswith("comment[modification parameter]") - | col.startswith("comment[modification parameters]") - ) + if (col.startswith("comment[modification parameter]") | col.startswith("comment[modification parameters]")) ] fix_m = [] variable_m = [] for col in mod_cols: mod_msg = sdrf[col].values[0].split(";") mod_dict = {k.split("=")[0]: k.split("=")[1] for k in mod_msg} - mod = ( - f"{mod_dict['NT']} ({mod_dict['TA']})" - if "TA" in mod_dict - else f"{mod_dict['NT']} ({mod_dict['PP']})" - ) + mod = f"{mod_dict['NT']} ({mod_dict['TA']})" if "TA" in mod_dict else f"{mod_dict['NT']} ({mod_dict['PP']})" if mod_dict["MT"] == "Variable" or mod_dict["MT"] == "variable": variable_m.append(mod) else: diff --git a/quantmsio/core/statistics.py b/quantmsio/core/statistics.py index ea0161b..e416ba4 100644 --- a/quantmsio/core/statistics.py +++ b/quantmsio/core/statistics.py @@ -52,29 +52,21 @@ def __init__(self, parquet_path: str) -> None: if os.path.exists(parquet_path): self.parquet_db = duckdb.connect() self.parquet_db = self.parquet_db.execute( - "CREATE VIEW parquet_db AS SELECT * FROM parquet_scan('{}')".format( - parquet_path - ) + "CREATE VIEW parquet_db AS SELECT * FROM 
parquet_scan('{}')".format(parquet_path) ) else: raise FileNotFoundError(f"the file {parquet_path} does not exist.") def get_number_of_peptides(self) -> int: - count = self.parquet_db.sql( - "SELECT COUNT(DISTINCT sequence) FROM parquet_db" - ).fetchone()[0] + count = self.parquet_db.sql("SELECT COUNT(DISTINCT sequence) FROM parquet_db").fetchone()[0] return count def get_number_of_peptidoforms(self) -> int: - count = self.parquet_db.sql( - "SELECT COUNT(DISTINCT peptidoform) FROM parquet_db" - ).fetchone()[0] + count = self.parquet_db.sql("SELECT COUNT(DISTINCT peptidoform) FROM parquet_db").fetchone()[0] return count def get_number_of_samples(self) -> int: - count = self.parquet_db.sql( - "SELECT COUNT(DISTINCT sample_accession) FROM parquet_db" - ).fetchone()[0] + count = self.parquet_db.sql("SELECT COUNT(DISTINCT sample_accession) FROM parquet_db").fetchone()[0] return count def get_number_of_proteins(self) -> int: @@ -83,18 +75,14 @@ def get_number_of_proteins(self) -> int: This method is not accurate. It needs to be refined. :return: number of unique proteins """ - protein_ids = self.parquet_db.sql( - "SELECT DISTINCT protein_accessions FROM parquet_db" - ).fetchall() + protein_ids = self.parquet_db.sql("SELECT DISTINCT protein_accessions FROM parquet_db").fetchall() # This probalby needs to be refined. 
protein_ids = [item for sublist in protein_ids for item in sublist[0]] protein_ids = set(protein_ids) return len(protein_ids) def get_number_msruns(self) -> int: - count = self.parquet_db.sql( - "SELECT COUNT(DISTINCT reference_file_name) FROM parquet_db" - ).fetchone()[0] + count = self.parquet_db.sql("SELECT COUNT(DISTINCT reference_file_name) FROM parquet_db").fetchone()[0] return count def get_number_of_psms(self) -> int: diff --git a/quantmsio/core/tools.py b/quantmsio/core/tools.py index 391f6d6..759d84c 100644 --- a/quantmsio/core/tools.py +++ b/quantmsio/core/tools.py @@ -105,9 +105,7 @@ def generate_features_of_spectrum( def generate_gene_acession_map(gene_names, species="human"): mg = mygene.MyGeneInfo() - gene_accessions = mg.querymany( - gene_names, scopes="symbol", species=species, fields="accession" - ) + gene_accessions = mg.querymany(gene_names, scopes="symbol", species=species, fields="accession") gene_accessions_maps = defaultdict(list) for obj in gene_accessions: if "accession" in obj: @@ -126,16 +124,10 @@ def map_gene_msgs_to_parquet( map_gene_names = generate_gene_name_map(fasta_path, map_parameter) pqwriter = None for df in read_large_parquet(parquet_path): - df["gene_names"] = df["protein_accessions"].swifter.apply( - lambda x: get_unanimous_name(x, map_gene_names) - ) - gene_names = list( - set([item for sublist in df["gene_names"] for item in sublist]) - ) + df["gene_names"] = df["protein_accessions"].swifter.apply(lambda x: get_unanimous_name(x, map_gene_names)) + gene_names = list(set([item for sublist in df["gene_names"] for item in sublist])) gene_accessions_maps = generate_gene_acession_map(gene_names, species) - df["gene_accessions"] = df["gene_names"].swifter.apply( - lambda x: get_gene_accessions(x, gene_accessions_maps) - ) + df["gene_accessions"] = df["gene_names"].swifter.apply(lambda x: get_gene_accessions(x, gene_accessions_maps)) if label == "feature": hander = FeatureHandler() elif label == "psm": @@ -157,12 +149,7 @@ def 
plot_peptidoform_charge_venn(parquet_path_list, labels): psm_message = "Total number of PSM for " + label + ": " + str(len(df)) print(psm_message) unique_pep_forms = set((df["peptidoform"] + df["charge"].astype(str)).to_list()) - pep_form_message = ( - "Total number of Peptidoform for " - + label - + ": " - + str(len(unique_pep_forms)) - ) + pep_form_message = "Total number of Peptidoform for " + label + ": " + str(len(unique_pep_forms)) print(pep_form_message) data_map[label] = unique_pep_forms plt.figure(figsize=(16, 12), dpi=500) @@ -180,9 +167,7 @@ def plot_sequence_venn(parquet_path_list, labels): for parquet_path, label in zip(parquet_path_list, labels): sequence = pd.read_parquet(parquet_path, columns=["sequence"]) unique_seqs = set(sequence["sequence"].to_list()) - pep_message = ( - "Total number of peptide for " + label + ": " + str(len(unique_seqs)) - ) + pep_message = "Total number of peptide for " + label + ": " + str(len(unique_seqs)) print(pep_message) data_map[label] = unique_seqs plt.figure(figsize=(16, 12), dpi=500) @@ -247,9 +232,7 @@ def map_protein_for_tsv(path: str, fasta: str, output_path: str, map_parameter: map_protein_names[accession].add(accession) map_protein_names[name].add(accession) df, content = load_de_or_ae(path) - df["protein"] = df["protein"].apply( - lambda x: get_unanimous_name(x, map_protein_names) - ) + df["protein"] = df["protein"].apply(lambda x: get_unanimous_name(x, map_protein_names)) content += df.columns.str.cat(sep="\t") + "\n" for index, row in df.iterrows(): content += "\t".join(map(str, row)).strip() + "\n" @@ -273,9 +256,7 @@ def load_de_or_ae(path): def change_and_save_parquet(parquet_path, map_dict, output_path, label): pqwriter = None for df in read_large_parquet(parquet_path): - df["protein_accessions"] = df["protein_accessions"].apply( - lambda x: get_unanimous_name(x, map_dict) - ) + df["protein_accessions"] = df["protein_accessions"].apply(lambda x: get_unanimous_name(x, map_dict)) if label == "feature": 
hander = FeatureHandler() elif label == "psm": @@ -309,9 +290,7 @@ def register_file_to_json(project_file, attach_file, category, replace_existing) # get best_scan_number -def load_best_scan_number( - diann_psm_path: str, diann_feature_path: str, output_path: str -): +def load_best_scan_number(diann_psm_path: str, diann_feature_path: str, output_path: str): p = Parquet(diann_psm_path) psm_df = p.load_psm_scan() pqwriter = None @@ -352,9 +331,7 @@ def generate_start_and_end_from_fasta(parquet_path, fasta_path, label, output_pa hander = PSMHandler() pqwriter = None for df in read_large_parquet(parquet_path): - df[["protein_start_positions", "protein_end_positions"]] = df[ - ["sequence", "protein_accessions"] - ].swifter.apply( + df[["protein_start_positions", "protein_end_positions"]] = df[["sequence", "protein_accessions"]].swifter.apply( lambda row: fill_start_and_end(row, protein_dict), axis=1, result_type="expand", @@ -430,15 +407,9 @@ def generate_project_report(project_folder): msgs["featureSamples"] = feature_statistics.get_number_of_samples() msgs["featurePeptidoforms"] = feature_statistics.get_number_of_peptidoforms() msgs["featureMsruns"] = feature_statistics.get_number_msruns() - msgs["featureImg1"] = convert_to_base64( - plot_intensity_distribution_of_samples(feature_paths[0]) - ) - msgs["featureImg2"] = convert_to_base64( - plot_peptide_distribution_of_protein(feature_paths[0]) - ) - msgs["featureImg3"] = convert_to_base64( - plot_intensity_box_of_samples(feature_paths[0]) - ) + msgs["featureImg1"] = convert_to_base64(plot_intensity_distribution_of_samples(feature_paths[0])) + msgs["featureImg2"] = convert_to_base64(plot_peptide_distribution_of_protein(feature_paths[0])) + msgs["featureImg3"] = convert_to_base64(plot_intensity_box_of_samples(feature_paths[0])) psm_paths = [f for f in file_list if f.endswith(".psm.parquet")] if len(psm_paths) > 0: psm_statistics = ParquetStatistics(psm_paths[0]) @@ -449,9 +420,7 @@ def 
generate_project_report(project_folder): msgs["psmMsruns"] = psm_statistics.get_number_msruns() sdrf_paths = [f for f in file_list if f.endswith(".sdrf.tsv")] if len(sdrf_paths) > 0: - msgs["psmImg1"] = convert_to_base64( - plot_peptides_of_lfq_condition(psm_paths[0], sdrf_paths[0]) - ) + msgs["psmImg1"] = convert_to_base64(plot_peptides_of_lfq_condition(psm_paths[0], sdrf_paths[0])) ae_paths = [f for f in file_list if f.endswith(".absolute.tsv")] if len(ae_paths) > 0: absolute_stats = IbaqStatistics(ibaq_path=ae_paths[0]) diff --git a/quantmsio/quantmsio_cli.py b/quantmsio/quantmsio_cli.py index 650ad10..bd91b80 100644 --- a/quantmsio/quantmsio_cli.py +++ b/quantmsio/quantmsio_cli.py @@ -39,9 +39,7 @@ CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) -@click.version_option( - version=__version__, package_name="quantmsio", message="%(package)s %(version)s" -) +@click.version_option(version=__version__, package_name="quantmsio", message="%(package)s %(version)s") @click.group(context_settings=CONTEXT_SETTINGS) def cli(): """ diff --git a/quantmsio/tests/test_project.py b/quantmsio/tests/test_project.py index d8e3268..7916a51 100644 --- a/quantmsio/tests/test_project.py +++ b/quantmsio/tests/test_project.py @@ -37,9 +37,7 @@ def test_populate_from_pride_archive_successful(self, mock_get): project_manager.populate_from_pride_archive() # Assert that the project_info has been updated - self.assertEqual( - project_manager.project.project_info["project_title"], "Test Project" - ) + self.assertEqual(project_manager.project.project_info["project_title"], "Test Project") self.assertEqual( project_manager.project.project_info["project_description"], "Test description", diff --git a/quantmsio/tests/test_sdrf.py b/quantmsio/tests/test_sdrf.py index 314761b..6549300 100644 --- a/quantmsio/tests/test_sdrf.py +++ b/quantmsio/tests/test_sdrf.py @@ -31,9 +31,7 @@ def test__load_sdrf_info(self): print(sdrf_handler.get_experiment_type_from_sdrf()) def test_get_labels(self): 
- file = ( - __package__ + "/examples/DDA-plex/MSV000079033-Blood-Plasma-iTRAQ.sdrf.tsv" - ) + file = __package__ + "/examples/DDA-plex/MSV000079033-Blood-Plasma-iTRAQ.sdrf.tsv" sdrf_handler = SDRFHandler(file) self.assertEqual(len(sdrf_handler.get_sample_labels()), 4) diff --git a/quantmsio/utils/pride_utils.py b/quantmsio/utils/pride_utils.py index b8b9995..8760b22 100644 --- a/quantmsio/utils/pride_utils.py +++ b/quantmsio/utils/pride_utils.py @@ -102,9 +102,7 @@ def get_set_of_experiment_keywords(pride_json: dict) -> list: return list(experiment_types) -def get_key_peptide_combination( - msstats_peptidoform: str, charge: str, reference_file: str = None -): +def get_key_peptide_combination(msstats_peptidoform: str, charge: str, reference_file: str = None): """ Get the key for a peptide or psm in mztab. For peptides the key is: - sequence peptidoform in msstats notation (e.g. PEPTIDE(Oxidation)) @@ -164,9 +162,7 @@ def compare_protein_lists(protein_list_1: list, protein_list_2: list) -> bool: return protein_list_1 == protein_list_2 -def standardize_protein_string_accession( - protein_string: str, sorted: bool = False -) -> str: +def standardize_protein_string_accession(protein_string: str, sorted: bool = False) -> str: """ standardize the protein string accession, in some cases the protein string accession is decided by commas instead of semicolons. 
@@ -201,16 +197,12 @@ def parse_score_name_in_mztab(score_name_mztab_line: str) -> str: score_values = lines[2].replace("[", "").replace("]", "").split(",") score_name = score_values[2].strip() if ":" in score_name: - score_name = "'{}'".format( - score_name - ) # add quotes to the score name if it contains a colon like + score_name = "'{}'".format(score_name) # add quotes to the score name if it contains a colon like # "OpenMS:Target-decoy protein q-value" return score_name -def get_modifications_object_from_mztab_line( - modification_string: str, modifications_definition: dict -) -> dict: +def get_modifications_object_from_mztab_line(modification_string: str, modifications_definition: dict) -> dict: """ get the modifications from a mztab line. This method is used to transform peptide + modification strings to proteoform notations, for msstats notation and for proforma notation. @@ -227,28 +219,16 @@ def get_modifications_object_from_mztab_line( accession = modification.split("-")[1] unimod_accession = accession if accession not in modifications_definition: - raise Exception( - "The modification {} is not in the modifications definition".format( - accession - ) - ) - accession = modifications_definition[accession][ - 0 - ] # get the name of the modification + raise Exception("The modification {} is not in the modifications definition".format(accession)) + accession = modifications_definition[accession][0] # get the name of the modification position = [] position_probability_string = modification.split("-")[0] - if ( - "[" not in position_probability_string - and "|" not in position_probability_string - ): # only one position + if "[" not in position_probability_string and "|" not in position_probability_string: # only one position position = [position_probability_string] elif ( - "[" not in position_probability_string - and "|" in position_probability_string + "[" not in position_probability_string and "|" in position_probability_string ): # multiple positions not 
probability - position = position_probability_string.split( - "|" - ) # multiple positions not probability + position = position_probability_string.split("|") # multiple positions not probability else: positions_probabilities = position_probability_string.split("|") for position_probability in positions_probabilities: @@ -274,9 +254,7 @@ def get_modifications_object_from_mztab_line( return modifications -def get_quantmsio_modifications( - modifications_string: str, modification_definition: dict -) -> dict: +def get_quantmsio_modifications(modifications_string: str, modification_definition: dict) -> dict: """ Get the modifications in quantms.io format from a string of modifications in mztab format. :param modifications_string: Modifications string in mztab format @@ -305,9 +283,7 @@ def fetch_peptide_spectra_ref(peptide_spectra_ref: str): return ms_run, scan_number -def compare_msstats_peptidoform_with_proforma( - msstats_peptidoform: str, proforma: str -) -> bool: +def compare_msstats_peptidoform_with_proforma(msstats_peptidoform: str, proforma: str) -> bool: """ Compare a msstats peptidoform with a proforma representation. 
- [Acetyl]-EM[Oxidation]EVEES[Phospho]PEK vs .(Acetyl)EM(Oxidation)EVEES(Phospho)PEK @@ -319,9 +295,7 @@ def compare_msstats_peptidoform_with_proforma( if "-" in proforma: proforma_parts = proforma.split("-") if len(proforma_parts) > 3: - raise Exception( - "The proforma peptidoform is not valid has more than 3 termini modifications" - ) + raise Exception("The proforma peptidoform is not valid has more than 3 termini modifications") result_proforma = "" par_index = 0 for part in proforma_parts: @@ -349,11 +323,7 @@ def get_petidoform_msstats_notation( :param modifications_definition: dictionary modifications definition :return: peptidoform in msstats """ - if ( - modification_string == "null" - or modification_string is None - or modification_string == "" - ): + if modification_string == "null" or modification_string is None or modification_string == "": return peptide_sequence modifications = get_modifications_object_from_mztab_line( @@ -402,17 +372,13 @@ def fetch_peptide_from_mztab_line( peptide["accession"] = standardize_protein_string_accession(peptide["accession"]) peptide["score"] = peptide_dict["best_search_engine_score[1]"] - if ( - "opt_global_cv_MS:1002217_decoy_peptide" in peptide_dict - ): # check if the peptide is a decoy + if "opt_global_cv_MS:1002217_decoy_peptide" in peptide_dict: # check if the peptide is a decoy peptide["is_decoy"] = peptide_dict["opt_global_cv_MS:1002217_decoy_peptide"] else: peptide["is_decoy"] = None # if the information is not available if "opt_global_cv_MS:1000889_peptidoform_sequence" in peptide_dict: - peptide["peptidoform"] = peptide_dict[ - "opt_global_cv_MS:1000889_peptidoform_sequence" - ] + peptide["peptidoform"] = peptide_dict["opt_global_cv_MS:1000889_peptidoform_sequence"] else: peptide["peptidoform"] = get_petidoform_msstats_notation( peptide["sequence"], peptide["modifications"], modification_definition @@ -458,9 +424,7 @@ def fetch_protein_from_mztab_line(protein_dict: dict): :param protein_dict: dictionary with 
the protein information :return: protein dictionary """ - accession_string = standardize_protein_string_accession( - protein_dict["ambiguity_members"] - ) + accession_string = standardize_protein_string_accession(protein_dict["ambiguity_members"]) return { "accession": accession_string, "score": protein_dict["best_search_engine_score[1]"], @@ -479,15 +443,11 @@ def fetch_ms_runs_from_mztab_line(mztab_line: str, ms_runs: dict) -> dict: mztab_line = mztab_line.strip() line_parts = mztab_line.split("\t") if line_parts[0] == "MTD" and line_parts[1].split("-")[-1] == "location": - ms_runs[line_parts[1].split("-")[0]] = ( - line_parts[2].split("//")[-1].split(".")[0] - ) + ms_runs[line_parts[1].split("-")[0]] = line_parts[2].split("//")[-1].split(".")[0] return ms_runs -def fetch_psm_from_mztab_line( - es: dict, ms_runs: dict = None, modifications_definition: dict = None -) -> dict: +def fetch_psm_from_mztab_line(es: dict, ms_runs: dict = None, modifications_definition: dict = None) -> dict: """ Get the psm from a mztab line include the post. 
:param es: dictionary with the psm information @@ -524,17 +484,13 @@ def fetch_psm_from_mztab_line( psm["accession"] = standardize_protein_string_accession(psm["accession"]) psm["score"] = es["search_engine_score[1]"] - if ( - "opt_global_cv_MS:1002217_decoy_peptide" in es - ): # check if the peptide is a decoy + if "opt_global_cv_MS:1002217_decoy_peptide" in es: # check if the peptide is a decoy psm["is_decoy"] = es["opt_global_cv_MS:1002217_decoy_peptide"] else: psm["is_decoy"] = None # if the information is not available if "opt_global_Posterior_Error_Probability_score" in es: - psm["posterior_error_probability"] = es[ - "opt_global_Posterior_Error_Probability_score" - ] + psm["posterior_error_probability"] = es["opt_global_Posterior_Error_Probability_score"] else: psm["posterior_error_probability"] = None @@ -605,9 +561,7 @@ def fetch_modifications_from_mztab_line(line: str, _modifications: dict) -> dict for ( key, value, - ) in ( - _modifications.items() - ): # for name, age in dictionary.iteritems(): (for Python 2.x) + ) in _modifications.items(): # for name, age in dictionary.iteritems(): (for Python 2.x) if value[1] == index: accession = key if accession is None: diff --git a/setup.py b/setup.py index b5ba58e..e1fb53a 100644 --- a/setup.py +++ b/setup.py @@ -59,9 +59,7 @@ def get_version(rel_path): "ddt", "psutil", ], - entry_points={ - "console_scripts": ["quantmsio_cli = quantmsio.quantmsio_cli:quantms_io_main"] - }, + entry_points={"console_scripts": ["quantmsio_cli = quantmsio.quantmsio_cli:quantms_io_main"]}, platforms=["any"], classifiers=[ "Programming Language :: Python :: 3",