METABOLIC-C.2nd_run.pl

#!/usr/bin/env perl

###########################

# METABOLIC-C.pl

# METABOLIC  =>  METabolic And BiogeOchemistry anaLyses In miCrobes

# This software gives a metabolic and biogeochemical function trait profile to given genome datasets 
# [either metagenome-assembled genomes (MAGs), single-cell amplified genomes (SAGs) or pure culture sequenced genomes]. 
# It also integrates the genome coverage to make element cycling pathways.
# METABOLIC-C.pl is specifically for users who have metagenomic reads and want to include them in the community analysis. 

# Written by Zhichao Zhou, zczhou2017@gmail.com 
# July, 2019

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

###########################

use 5.010;
use strict;
use warnings;

##modules
use Data::Dumper;
use POSIX qw(strftime);
use Getopt::Long;
use Statistics::Descriptive;
use Parallel::ForkManager;
use File::Spec;
use File::Basename;

=head1 DESCRIPTION

        Takes a folder containing genome files to generate a profile of their metabolic and biogeochemical functions
        
=head1 USAGE       

        perl METABOLIC-C.pl -t 40 -m-cutoff 0.75  -in Genome_proteins -kofam-db full -r omic_reads_parameters.txt -o METABOLIC_out
		(When you also want to calculate genome coverages, you would have to add genome files with the same basename and the extension as ".fasta" in this folder)
        
        perl METABOLIC-C.pl -t 40 -m-cutoff 0.75  -in-gn Genome_files -kofam-db full -r omic_reads_parameters.txt -o METABOLIC_out
		
	perl METABOLIC-C.pl -test true 
		(use the 5 genomes to test the METABOLIC-C script)

	perl METABOLIC-C.pl -test true -tax order
		(to calculate MW-score contribution of microbial groups at the resolution of the "order" level)
		
	perl METABOLIC-C.pl -t 40 -m-cutoff 0.75  -in-gn Genome_files -kofam-db full -r omic_reads_parameters.txt -st nanopore -o METABOLIC_out	
		(to use long reads generated by Oxford Nanopore sequencing)		
        
=head1 OPTIONS

	-t         or -cpu             [integer] The cpu numbers to run the hmmsearch (default: 20)
        -m-cutoff  or -module-cutoff   [float]   The cutoff value to assign the presence of a specific KEGG module (KEGG module step present numbers / KEGG module step total number) (default: 0.75) 
        -in                            [string]  The folder of given genome faa files [should also give the genome fasta files and genome gene files if the (meta)genome/(meta)transcriptome datasets are included]
        -in-gn                         [string]  The folder of given genome fasta files (Prodigal will be used to annotate your genomes)
        -kofam-db                      [string]  To use the "small" size or "full" size of KOfam database in METABOLIC (default: 'full')
	-p         or -prodigal-method [string]  "meta" or "single" for prodigal to annotate the orf
        -r         or -omic-reads      [string]  The file which indicates the address of omic reads
	-rt        or -reads-type      [string]  To use "metaG" or "metaT" to indicate whether you use the metagenomic reads or metatranscriptomic reads (default: 'metaG')
	-st        or -sequencing-type [string]  To use "illumina" (for Illumina short reads), or "pacbio" (for PacBio CLR reads), or "pacbio_hifi" (for PacBio HiFi/CCS genomic reads (v2.19 or later)), or "pacbio_asm20" (for PacBio HiFi/CCS genomic reads (v2.18 or earlier)), or "nanopore" (for Oxford Nanopore reads) to indicate the sequencing type of metagenomes or metatranscriptomes (default: 'illumina'; Note that all "illumina", "pacbio", "pacbio_hifi", "pacbio_asm20", and "nanopore" should be provided as lowercase letters and the underscore "_" should not be typed as "-" or any other marks)
	-tax       or -taxonomy        [string]  To calculate MW-score contribution of microbial groups at the resolution of which taxonomical level (default: "phylum"; other options: "class", "order", "family", "genus", "species", and "bin" (MAG itself))
	-o         or -output          [string]  The METABOLIC output folder (default: current address)
	-2nd-run                       [string]  The option to make a 2nd run using the same set of genomes of a previous run. This option can use the previous genome annotation intermediate folders or metagenomic/metatranscriptomic mapping result, thus it can save lots of time. "true" or "false" to run the 2nd-run option (default: 'false'). [Note] If this option was set to be "true", option "-o" should be set to the previous output folder of a successful run, METABOLIC-C will use the intermediate files within. You will also need to set the option "-2nd-run-suffix", the suffix will be appended to the new folders and files that are generated by the 2nd run. The Prodigal Method (-p or -prodigal-method), KOfam DB (-kofam-db), Module Cutoff Value (-m-cutoff or -module-cutoff), and Input Genome directory (nucleotides) (-in-gn) should be the same as the previous run.											
	-2nd-run-suffix                [string]  The suffix that will be appended to the folders and files that are generated by the 2nd run (default: '2nd_run')
	-depth-file                    [string]  The option to use the depth file provided by a previous run (You only need to provide the depth file name to this option without adding the path in the front, e.g., "All_gene_collections_mapped.depth.txt"). This option should be used only when you use the option "-2nd-run". 
	-test                          [string]  The option to test the performance of METABOLIC-C by 5 genomes; "true" or "false" to run the test option. The test option will use 5 CPUs to run the command.	
	
=head1 INSTRUCTIONS

	GitHub: https://github.com/AnantharamanLab/METABOLIC

=head1 OUTPUT

        Prodigal-annotated protein and gene files will be given in the input folder.
        The METABOLIC result table will be generated.
	Biogeochemical pathway diagrams will be generated.
		
=head1 COPYRIGHT

	Zhichao Zhou, zczhou2017@gmail.com
	Patricia Tran, ptran5@wisc.edu
	Karthik Anantharaman, karthik@bact.wisc.edu
	Anantharaman Microbiome Laboratory
	Department of Bacteriology, University of Wisconsin, Madison
=cut

# Intake the address of METABOLIC directory:
my $METABOLIC_dir = dirname(File::Spec->rel2abs(__FILE__));

# The options 
# Option variables with default value
my $cpu_numbers = 20; # Parallel running cpu numbers
my $module_cutoff = 0.75; # The cutoff value to assign the existence of a module
my $input_protein_folder;  # Input microbial genome protein files
my $input_genome_folder; # Input microbial genome fasta files
my $omic_reads_parameters; # The address of omic reads
my $prodigal_method = "meta"; # The prodigal method to annotate orfs
my $kofam_db_size = "full"; # The full kofam size
my $omic_reads_type = "metaG"; # Metagenomic reads
my $sequencing_type = "illumina"; # The sequencing type of input omics reads
# The output folder defaults to the current directory. `pwd` returns the path
# with a trailing newline, which has to be removed: $output is interpolated
# into file paths and shell commands further down.
chomp(my $output = `pwd`);
my $second_run = "false"; # true or false for a 2nd run
my $second_run_suffix = "2nd_run"; # The suffix to append on the folders and files generated by the 2nd run
my $depth_file; # The depth file that is generated by a previous run
my $taxonomy = "phylum"; # The taxonomy level to calculate MW-score table 
my $version="METABOLIC-C.pl v4.0";
my $test = "false";

# Command line parsing. Each specification (long name | alias, value type) is
# paired with the option variable it overrides; -help and -version are served
# immediately and end the program.
my $show_help    = sub { system('perldoc', $0); exit; };
my $show_version = sub { print $version."\n"; exit; };

my @option_spec = (
	# Resources and thresholds
	'cpu|t=i'                  => \$cpu_numbers,
	'module-cutoff|m-cutoff=f' => \$module_cutoff,
	'kofam-db=s'               => \$kofam_db_size,
	# Input genomes and their annotation
	'in=s'                     => \$input_protein_folder,
	'in-gn=s'                  => \$input_genome_folder,
	'prodigal-method|p=s'      => \$prodigal_method,
	# Omic reads
	'omic-reads|r=s'           => \$omic_reads_parameters,
	'reads-type|rt=s'          => \$omic_reads_type,
	'sequencing-type|st=s'     => \$sequencing_type,
	# Community analysis and 2nd run settings
	'taxonomy|tax=s'           => \$taxonomy,
	'2nd-run=s'                => \$second_run,
	'2nd-run-suffix=s'         => \$second_run_suffix,
	'depth-file=s'             => \$depth_file,
	'output|o=s'               => \$output,
	# Informational and test switches
	'help|h'                   => $show_help,
	'v|version'                => $show_version,
	'test=s'                   => \$test,
);

GetOptions(@option_spec)
	or die("Getting options from the command line failed, please check your options");

## Pre-required files and documents
# Every bundled resource is addressed relative to the METABOLIC directory.
my $template_and_db_dir = $METABOLIC_dir . "/METABOLIC_template_and_database";
my $kofam_database_dir  = $METABOLIC_dir . "/kofam_database";

# METABOLIC hmm database files
my $METABOLIC_hmm_db_address = $METABOLIC_dir . "/METABOLIC_hmm_db";

# KofamKOALA hmm database files
# Link: ftp://ftp.genome.jp/pub/db/kofam/
my $kofam_db_address = $kofam_database_dir . "/profiles";
my $kofam_db_KO_list = $kofam_database_dir . "/ko_list";

# Input hmm information tables used as templates
my $hmm_table_temp   = $template_and_db_dir . "/hmm_table_template.txt";
my $hmm_table_temp_2 = $template_and_db_dir . "/hmm_table_template_2.txt";

# The KEGG module information and the KEGG module step db
my $ko_module_table   = $template_and_db_dir . "/ko00002.keg";
my $ko_module_step_db = $template_and_db_dir . "/kegg_module_step_db.txt";

# The pathway information to draw element cycling diagrams and metabolic handoff
my $R_pathways = $template_and_db_dir . "/R_pathways.txt";
my ($R_mh_01, $R_mh_02) = map { "$template_and_db_dir/Sequential_transformations_$_.txt" } qw(01 02);
my $R_mh_tsv = $template_and_db_dir . "/Sequential-transformations.tsv";
my ($R_order_of_input_01, $R_order_of_input_02) = map { "$template_and_db_dir/order_of_input_$_.txt" } qw(01 02);
my $CAZy_map_address = $template_and_db_dir . "/CAZy_map.txt";

# The MW-score reaction table template
my $MW_score_reaction_table = $template_and_db_dir . "/MW-score_reaction_table.txt";

# The motif files to validate specific protein hits
my $motif_file      = $template_and_db_dir . "/motif.txt";
my $motif_pair_file = $template_and_db_dir . "/motif.pair.txt";

# The test option: run on the bundled example genomes and reads address file,
# writing to "METABOLIC_out" with 5 CPUs.
if ($test eq "true"){
	($input_genome_folder, $output, $cpu_numbers, $omic_reads_parameters) = (
		"$METABOLIC_dir/METABOLIC_test_files/Guaymas_Basin_genome_files",
		"METABOLIC_out",
		"5",
		"$METABOLIC_dir/METABOLIC_test_files/Reads_address.txt",
	);
}

# To make sure the input taxonomy is right
# Map of the user-defined taxonomy to its taxonomy level code
my %Tax2code = (
	'phylum'  => '0',
	'class'   => '1',
	'order'   => '2',
	'family'  => '3',
	'genus'   => '4',
	'species' => '5',
	'bin'     => '6',
);

die "Your input taxonomy is wrong, please check your spelling. It should be one of these taxonomies: phylum, class, order, family, genus, species or bin (MAG itself)\n"
	unless exists $Tax2code{$taxonomy};

## Main Body
# -2nd-run defaults to the string "false", which is a true value in Perl, so
# the option has to be compared against "true" explicitly for this check to
# have any effect.
if ($second_run ne "true"){
	die "Please add option -2nd-run true\n";
}

# The output folder of the previous run must exist; its intermediate files are reused.
# (-d tests for a directory; -s reports a size, which is not meaningful for folders.)
if (!(defined $output and -d $output)){
	die "This script is for a 2nd run using the previous genome annotation results. Please provide the output folder of the previous run\n";
}

# The genome folder is optional on the command line, so it may be undefined here
if (!(defined $input_genome_folder and -d $input_genome_folder)){
	die "This script is for a 2nd run using the previous genome annotation results. Please provide the input genome folder of the previous run\n";
}

# To make sure the input sequencing type is one of the five following values
my %Valid_sequencing_type = map { $_ => 1 } qw(illumina pacbio pacbio_asm20 pacbio_hifi nanopore);
if (!exists $Valid_sequencing_type{$sequencing_type}){
	die "Your input sequencing type is wrong, please check your spelling. It should be one of these: illumina, pacbio, pacbio_asm20, pacbio_hifi, nanopore\n";
}

# The present time
my $datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
my $starttime = $datestring;
my $starttime_raw = time;

# Store the stdout and stderr into log: both streams are piped through
# "tee -ai", which appends to the log file and ignores interrupt signals.
# The list form of the pipe open starts tee without a shell, so an output
# folder containing spaces or shell metacharacters is passed on unchanged.
my $second_run_log = "$output/METABOLIC_log.$second_run_suffix.log";
open STDOUT, '|-', 'tee', '-ai', $second_run_log
	or die "Cannot redirect STDOUT to the log file $second_run_log: $!\n";
open STDERR, '|-', 'tee', '-ai', $second_run_log
	or die "Cannot redirect STDERR to the log file $second_run_log: $!\n";

# Store the hmm table template
my %Hmm_table_temp = (); # line no. => each line 
my @Hmm_table_head = (); # The head of the hmm table template
my %METABOLIC_hmm2threshold = (); # hmm file id => threshold and score_type
open my $hmm_table_fh, '<', $hmm_table_temp
	or die "Cannot open the hmm table template $hmm_table_temp: $!\n";
while (<$hmm_table_fh>){
	chomp;
	if (/^#/){
		# The header line (starting with "#") gives the column names
		@Hmm_table_head = split (/\t/);
		next;
	}
	my @tmp = split (/\t/);
	$Hmm_table_temp{$tmp[0]} = $_;
	# Column 6 holds the hmm file id and column 11 its threshold; ids that look like
	# KEGG KO numbers (K + 5 digits) are not recorded in this hash
	if ($tmp[5] and $tmp[5] !~ /K\d\d\d\d\d/){
		$METABOLIC_hmm2threshold{$tmp[5]} = $tmp[10];
	}
}
close $hmm_table_fh;

# Store the hmm table template 2
my %Hmm_table_temp_2 = (); # line no. => each line; 
open my $hmm_table_2_fh, '<', $hmm_table_temp_2
	or die "Cannot open the hmm table template $hmm_table_temp_2: $!\n";
while (<$hmm_table_2_fh>){
	chomp;
	next if /^#/; # Skip the header line
	my @tmp = split (/\t/);
	$Hmm_table_temp_2{$tmp[0]}= $_;
}
close $hmm_table_2_fh;

# The hash of hmm file and corresponding threshold and score_type
my %Total_hmm2threshold = (%METABOLIC_hmm2threshold, _get_kofam_db_KO_threshold($kofam_db_KO_list,$kofam_db_address)); 

# In a 2nd run the faa files are read from the genome folder, so Prodigal is not run again
if ($input_genome_folder){
	$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
	print "\[$datestring\] Skip Prodigal running for a 2nd run\n";
	$input_protein_folder = $input_genome_folder;
}

my %Genome_id = (); # genome id => 1
my %Seqid2Genomeid = (); # seq id => genome id 
my %Total_faa_seq = (); # Store the total faa file into a hash{$line_no}
open my $faa_list_fh, '-|', "ls $input_protein_folder/*.faa"
	or die "Cannot list the faa files in $input_protein_folder: $!\n";
while (my $file = <$faa_list_fh>){
	chomp $file;
	# Store faa file into a hash. A hash slice adds only the new entries;
	# rebuilding the whole hash for every file would copy all earlier entries again.
	my %Faa_seq = _get_faa_seq($file);
	@Total_faa_seq{keys %Faa_seq} = values %Faa_seq;
	
	# The genome id is the file name without the folder and the ".faa" extension;
	# \Q...\E keeps regex metacharacters in the folder name literal
	my ($gn_id) = $file =~ /^\Q$input_protein_folder\E\/(.+?)\.faa/; 
	$Genome_id{$gn_id} = 1; 
	open my $faa_fh, '<', $file or die "Cannot open the faa file $file: $!\n";
	while (<$faa_fh>){
		# The sequence id is the header text between ">" and the first whitespace
		if (/^>(.+?)\s/){
			$Seqid2Genomeid{$1} = $gn_id;
		}
	}
	close $faa_fh;
}
close $faa_list_fh;

# The intermediate folders of the previous run are required; -d tests that they
# exist as directories
if (!(-d "$output/intermediate_files")){
	die "$output/intermediate_files folder is not found\n";
}

if (!(-d "$output/intermediate_files/Hmmsearch_Outputs")){
	die "$output/intermediate_files/Hmmsearch_Outputs is not found\n";
}

$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Skip hmmsearch running for a 2nd run\n";

# Concatenate all faa files into total.faa. The result is first written under a name
# that does not end with ".faa" so that it is not matched by the "*.faa" pattern itself.
`cat $input_protein_folder/*.faa > $input_protein_folder/faa.total; mv $input_protein_folder/faa.total $input_protein_folder/total.faa`;

# Store motif validation files
my %Motif = _get_motif($motif_file); # protein id => motif sequences (DsrC => GPXKXXCXXXGXPXPXXCX)
my %Motif_pair = _get_motif_pair($motif_pair_file); # dsrC => tusE

# Summarize hmmsearch result and print table
my %Hmmscan_result = (); # genome_name => hmm => numbers
my %Hmmscan_hits = (); # genome_name => hmm => hits
my %Hmm_id = (); # hmm => 1 

# The sequences of total.faa, needed for the motif check. They are read on first
# use and then kept, because total.faa does not change while the results are
# summarized; reading the file again for every hit is not needed.
my %Total_faa_for_motif = (); # ">seq id" => sequence
my $total_faa_for_motif_loaded = 0;

open my $hmmsearch_list_fh, '-|', "find $output/intermediate_files/Hmmsearch_Outputs -type f -name '*.hmmsearch_result.txt'"
	or die "Cannot list the hmmsearch results in $output/intermediate_files/Hmmsearch_Outputs: $!\n";
while (my $file_name = <$hmmsearch_list_fh>){
	chomp $file_name;
	# The hmm id is the part of the file name up to and including ".hmm";
	# \Q...\E keeps regex metacharacters in the output folder name literal
	my ($hmm) = $file_name =~ /^\Q$output\E\/intermediate_files\/Hmmsearch_Outputs\/(.+?\.hmm)\./; 
	$Hmm_id{$hmm} = 1;
	my ($hmm_basename) = $hmm =~ /^(.+?)\.hmm/; 
	# The stored threshold has the form "threshold|score_type"
	my ($threshold,$score_type) = $Total_hmm2threshold{$hmm} =~ /^(.+?)\|(.+?)$/; 

	open my $hmmsearch_fh, '<', $file_name or die "Cannot open the hmmsearch result $file_name: $!\n";
	while (my $line = <$hmmsearch_fh>){
		chomp $line;
		next if $line =~ /^#/;
		$line =~ s/\s+/\t/g;
		my @tmp = split (/\t/,$line); 
		my $gn_id = $Seqid2Genomeid{$tmp[0]};

		# For the "domain" score type the score in column 9 has to reach the
		# threshold; hits of any other score type are not filtered here
		if (defined $score_type and $score_type eq "domain"){
			next unless $tmp[8] >= $threshold;
		}

		my $hit_is_valid = 1;
		if (exists $Motif{$hmm_basename}){
			# Motif check: "X" in the motif stands for any of the 20 amino acids
			my $motif = $Motif{$hmm_basename}; $motif =~ s/X/\[ARNDCQEGHILKMFPSTWYV\]/g;
			if (!$total_faa_for_motif_loaded){
				%Total_faa_for_motif = _store_seq("$input_protein_folder/total.faa"); # Get the total genome sequences
				$total_faa_for_motif_loaded = 1;
			}
			my $seq = $Total_faa_for_motif{">$tmp[0]"}; # The protein seq
			$hit_is_valid = ($seq =~ /$motif/) ? 1 : 0;
		}elsif (exists $Motif_pair{$hmm_basename}){
			# Motif pair check: the hit is searched against the check hmm of this protein
			# and the check hmm of its paired protein; it is kept when the score of its own
			# check hmm is not zero and not lower than the score of the paired one
			my $motif_hmm = "$METABOLIC_hmm_db_address/$hmm_basename.check.hmm";
			my $motif_anti_hmm = "$METABOLIC_hmm_db_address/$Motif_pair{$hmm_basename}.check.hmm";
			_get_1_from_input_faa("$input_protein_folder/total.faa",">$tmp[0]","$output/tmp.$hmm_basename.check.faa");
			`hmmsearch --cpu 1 --tblout $output/tmp.$hmm_basename.check.hmmsearch_result.txt $motif_hmm $output/tmp.$hmm_basename.check.faa`;
			`hmmsearch --cpu 1 --tblout $output/tmp.$Motif_pair{$hmm_basename}.check.hmmsearch_result.txt $motif_anti_hmm $output/tmp.$hmm_basename.check.faa`;
			my $motif_check_score = _get_check_score("$output/tmp.$hmm_basename.check.hmmsearch_result.txt"); 
			my $motif_anti_check_score = _get_check_score("$output/tmp.$Motif_pair{$hmm_basename}.check.hmmsearch_result.txt");
			$hit_is_valid = ($motif_check_score >= $motif_anti_check_score and $motif_check_score != 0) ? 1 : 0;
			`rm $output/tmp.$hmm_basename.check.faa $output/tmp.$hmm_basename.check.hmmsearch_result.txt $output/tmp.$Motif_pair{$hmm_basename}.check.hmmsearch_result.txt`;
		}
		# Hmms without a motif or a motif pair do not have a motif check step
		next unless $hit_is_valid;

		# Record the hit (comma separated list of seq ids) and count it
		if (! exists $Hmmscan_hits{$gn_id}{$hmm}){
			$Hmmscan_hits{$gn_id}{$hmm} = $tmp[0];
		}else{
			$Hmmscan_hits{$gn_id}{$hmm} .= "\,".$tmp[0];
		}
		$Hmmscan_result{$gn_id}{$hmm}++;
	}
	close $hmmsearch_fh;
}
close $hmmsearch_list_fh;

# The sequences are not needed after the summary
%Total_faa_for_motif = ();

`rm $input_protein_folder/total.faa`;

# The following steps are not repeated in a 2nd run; each of them is only
# reported in the log with its own time stamp:
#  - printing out each hmm faa collection
#  - the KEGG module calculating
#  - the KEGG identifier result calculating
#  - searching CAZymes by dbCAN2
#  - searching MEROPS peptidase
#  - generating METABOLIC table
foreach my $skipped_step (
	"Skip printing out each hmm faa collection",
	"Skip doing the KEGG module result calculating",
	"Skip doing the KEGG identifier (KO id) result calculating",
	"Skip searching CAZymes by dbCAN2",
	"Skip searching MEROPS peptidase",
	"Skip generating METABOLIC table",
){
	$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
	print "\[$datestring\] $skipped_step\n";
}

# Draw element cycling diagrams
$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Drawing element cycling diagrams...\n";

# Store R pathways
# Each line of the pathway file holds the step in column 1 and its hmms in column 2.
# The hmms are separated by ","; a ";" separates two groups of hmms.
my %R_pathways = (); #step => hmms
my %R_hmm_ids = (); # hmm id => 1
open my $R_pathways_fh, '<', $R_pathways
	or die "Cannot open the R pathway file $R_pathways: $!\n";
while (<$R_pathways_fh>){
	chomp;
	my @tmp = split (/\t/);
	$R_pathways{$tmp[0]} = $tmp[1];
	if ($tmp[1] !~ /\;/){
		# A single group: every listed hmm is recorded
		foreach my $hmm_id (split (/\,/,$tmp[1])){
			$R_hmm_ids{$hmm_id} = 1;
		}
	}else{
		# Several groups: entries containing "NO" are not recorded as hmm ids
		foreach my $hmm_group (split (/\;/,$tmp[1])){
			foreach my $hmm_id (split (/\,/,$hmm_group)){
				$R_hmm_ids{$hmm_id} = 1 if $hmm_id !~ /NO/;
			}
		}
	}
}
close $R_pathways_fh;

# The folders of the 2nd run figures and of their input files
`mkdir $output/METABOLIC_Figures_Input.$second_run_suffix`;
`mkdir $output/METABOLIC_Figures_Input.$second_run_suffix/Nutrient_Cycling_Diagram_Input`;
`mkdir $output/METABOLIC_Figures.$second_run_suffix`;

# Get each R pathway input files
my %Total_R_input = (); #pathway => gn => 1 or 0
foreach my $gn (sort keys %Hmmscan_result){
	# Returns 1 when the given hmm string contains a recorded hmm id that has
	# hit(s) in this genome, otherwise 0. The id is matched literally (\Q...\E),
	# so that "." and other regex metacharacters in hmm ids have no special meaning.
	my $has_hit = sub {
		my ($hmm_string) = @_;
		foreach my $hmm_id (sort keys %R_hmm_ids){
			return 1 if $hmm_string =~ /\Q$hmm_id\E/ and $Hmmscan_result{$gn}{$hmm_id};
		}
		return 0;
	};

	my %R_input = (); #for each input file
	foreach my $key (sort keys %R_pathways){
		my $hmms = $R_pathways{$key};
		my $present = 0;
		if ($hmms !~ /\;/){
			# One group of hmms: a hit to any of them is enough
			$present = $has_hit->($hmms);
		}else{
			my ($hmms_1,$hmms_2) = $hmms =~ /^(.+?)\;(.+?)$/;
			if ($hmms_2 !~ /NO/){
				# Two groups of hmms: both of them need a hit
				$present = ($has_hit->($hmms_1) and $has_hit->($hmms_2)) ? 1 : 0;
			}else{
				# The second group contains "NO": the first group needs a hit and
				# a hit to an hmm of the second group makes the pathway false (0)
				$present = ($has_hit->($hmms_1) and !$has_hit->($hmms_2)) ? 1 : 0;
			}
		}
		$R_input{$key} = $present; $Total_R_input{$key}{$gn} = $present;
	}
	
	my $R_input_file = "$output/METABOLIC_Figures_Input.$second_run_suffix/Nutrient_Cycling_Diagram_Input/$gn.R_input.txt";
	open my $R_input_fh, '>', $R_input_file or die "Cannot write $R_input_file: $!\n";
	foreach my $key (sort keys %R_input){
		print {$R_input_fh} "$key\t$R_input{$key}\n";
	}	
	close $R_input_fh or die "Cannot finish writing $R_input_file: $!\n";
}

my %Genome_cov_constant = ();
# The genome coverage: genome id => coverage value
if ($omic_reads_parameters){
	# The coverage is taken from the depth file of a previous run when one is given;
	# otherwise it is calculated by the helper for Illumina reads or for long reads
	my %Genome_cov = ();
	if ($depth_file){
		%Genome_cov = _get_Genome_coverge_by_depth_file($omic_reads_parameters,$depth_file);		
	}elsif ($sequencing_type eq 'illumina'){
		%Genome_cov = _get_Genome_coverge($omic_reads_parameters,$input_genome_folder);
	}else{
		%Genome_cov = _get_Genome_coverge_for_long_reads($omic_reads_parameters,$input_genome_folder);
	}
	%Genome_cov_constant = %Genome_cov;
	
	# For each pathway: the number of genomes that have it and the sum of their coverage values
	my %Total_R_input_2 = (); # pathway => genome numbers \t genome coverage percentage
	foreach my $pth (sort keys %Total_R_input){
		my $gn_no = 0; my $gn_cov_percentage = 0;
		foreach my $gn (sort keys %Hmmscan_result){
			next unless $Total_R_input{$pth}{$gn};
			$gn_no += $Total_R_input{$pth}{$gn};
			if ($Genome_cov{$gn}){
				$gn_cov_percentage += $Genome_cov{$gn};
			}
		}
		$Total_R_input_2{$pth} = "$gn_no\t$gn_cov_percentage";
	}
	
	my $total_R_input_file = "$output/METABOLIC_Figures_Input.$second_run_suffix/Nutrient_Cycling_Diagram_Input/Total.R_input.txt";
	open my $total_R_input_fh, '>', $total_R_input_file or die "Cannot write $total_R_input_file: $!\n";
	foreach my $key (sort keys %Total_R_input_2){
		print {$total_R_input_fh} "$key\t$Total_R_input_2{$key}\n";
	}
	close $total_R_input_fh or die "Cannot finish writing $total_R_input_file: $!\n";
}

# Draw the diagrams from the input folder into a temporary "Output" folder
# (the error output of Rscript is discarded), then move the result into the
# figures folder of the 2nd run and remove the temporary folder
`Rscript $METABOLIC_dir/draw_biogeochemical_cycles.R $output/METABOLIC_Figures_Input.$second_run_suffix/Nutrient_Cycling_Diagram_Input $output/Output TRUE 2> /dev/null`;
`mv $output/Output/draw_biogeochem_cycles $output/METABOLIC_Figures.$second_run_suffix/Nutrient_Cycling_Diagrams; rm -r $output/Output`;

$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Drawing element cycling diagrams finished\n";

my $endtime = "";
if ($omic_reads_parameters){

# Draw metabolic handoff diagrams
$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Drawing metabolic handoff diagrams...\n";

# Store metabolic handoff steps 1
# Only lines containing ":" or "+" are used. Column 1 is either "step:reaction"
# (a single step) or a name without ":" (stored as is); column 3 holds the hmms.
my %R_mh_01 = (); #step => KOs
my %R_mh_01_hmm_ids = ();
my %Letter2reaction = (); # D -> S2-=>S0
open my $R_mh_01_fh, '<', $R_mh_01
	or die "Cannot open the sequential transformation file $R_mh_01: $!\n";
while (<$R_mh_01_fh>){
	chomp;
	next unless /\:/ or /\+/;
	my @tmp = split (/\t/);

	if ($tmp[0] =~ /\:/){
		my ($step,$reaction) = $tmp[0] =~ /^(\D+?)\:(.+?)$/;
		$R_mh_01{$step} = $tmp[2]; $Letter2reaction{$step} = $reaction; 
		# The hmm ids of single steps are separated by ","
		foreach my $hmm_id (split (/\,/,$tmp[2])){
			$R_mh_01_hmm_ids{$hmm_id} = 1;
		}
	}else{
		$R_mh_01{$tmp[0]} = $tmp[2];
	}
}
close $R_mh_01_fh;

# For each step and genome: 1 when the genome is able to do the step, otherwise 0
my %R_mh_01_summary = (); # step => gn => 1 or 0
foreach my $gn (sort keys %Hmmscan_result){
	# Returns 1 when the given hmm string contains a recorded hmm id that has
	# hit(s) in this genome, otherwise 0. The id is matched literally (\Q...\E),
	# so that "." and other regex metacharacters in hmm ids have no special meaning.
	my $has_hit = sub {
		my ($hmm_string) = @_;
		foreach my $hmm_id (sort keys %R_mh_01_hmm_ids){
			return 1 if $hmm_string =~ /\Q$hmm_id\E/ and $Hmmscan_result{$gn}{$hmm_id};
		}
		return 0;
	};

	foreach my $key (sort keys %R_mh_01){
		if ($key !~ /\+/){
			# A single step: a hit to any of its hmms is enough
			$R_mh_01_summary{$key}{$gn} = $has_hit->($R_mh_01{$key});
		}else{
			# A combination of steps (key contains "+"): the hmms of the steps are
			# separated by ";" and every one of the steps needs a hit
			my @tmp_hmm = split (/\;/,$R_mh_01{$key});
			my $count = grep { $has_hit->($_) } @tmp_hmm;
			$R_mh_01_summary{$key}{$gn} = ($count == scalar @tmp_hmm) ? 1 : 0;
		}
	}
}

# The genome coverage: genome id => coverage value
if ($omic_reads_parameters){
	my %Genome_cov = %Genome_cov_constant;
	
	# For each step: the number of genomes that can do it and the sum of their coverage values
	my %Total_R_hm_input_1 = (); # step => genome numbers \t genome coverage percentage
	foreach my $step (sort keys %R_mh_01){
		my $gn_no = 0; my $gn_cov_percentage = 0;
		foreach my $gn (sort keys %Hmmscan_result){
			next unless $R_mh_01_summary{$step}{$gn};
			$gn_no += $R_mh_01_summary{$step}{$gn};
			if ($Genome_cov{$gn}){
				$gn_cov_percentage += $Genome_cov{$gn};
			}
		}
		$Total_R_hm_input_1{$step} = "$gn_no\t$gn_cov_percentage";
	}

	my $input_1_file = "$output/METABOLIC_Figures_Input.$second_run_suffix/Sequential_Transformation_input_1.txt";
	open my $input_1_fh, '>', $input_1_file or die "Cannot write $input_1_file: $!\n";
	foreach my $key (sort keys %Total_R_hm_input_1){
		print {$input_1_fh} "$key\t$Total_R_hm_input_1{$key}\n";
	}
	close $input_1_fh or die "Cannot finish writing $input_1_file: $!\n";
}

# Store the CAZy map
my %CAZy_map = (); #GH => enzymes
open my $CAZy_map_fh, '<', $CAZy_map_address
	or die "Cannot open the CAZy map $CAZy_map_address: $!\n";
while (<$CAZy_map_fh>){
	chomp;
	next if /^Family/; # Skip the header line
	my @tmp = split (/\t/);
	$CAZy_map{$tmp[0]} = $tmp[1];
}
close $CAZy_map_fh;

# Store dbCAN running result
my %dbCANout = (); # genome => hmmid => number
my %dbCANout2 = (); # genome => hmmid => hits
my %Hmm_dbCAN2_id = (); # hmm => 1 
open my $dbCAN_list_fh, '-|', "ls $output/intermediate_files/dbCAN2_Files/*.dbCAN2.out.dm.ps"
	or die "Cannot list the dbCAN2 results in $output/intermediate_files/dbCAN2_Files: $!\n";
while (my $file = <$dbCAN_list_fh>){
	# The line end has to be removed before the name is used to open the file
	chomp $file;
	# \Q...\E keeps regex metacharacters in the output folder name literal
	my ($gn_id) = $file =~ /^\Q$output\E\/intermediate_files\/dbCAN2_Files\/(.+?)\.dbCAN2\.out\.dm\.ps/;
	open my $dbCAN_fh, '<', $file or die "Cannot open the dbCAN2 result $file: $!\n";
	while (<$dbCAN_fh>){
		# Only the GH and PL families are stored
		next unless /^GH|^PL/;
		my @tmp = split(/\t/,$_);
		my ($hmmid) = $tmp[0] =~ /(\S+?)\.hmm/; 
		# The family id is rewritten to its letters followed by the first number
		# padded to three digits (for instance GH5 => GH005)
		my ($hmmid_p1,$hmmid_p2) = $hmmid =~ /^(\D+?)(\d+)/;
		my $num=(sprintf "%03d", $hmmid_p2);
		$hmmid = $hmmid_p1.$num;
		$Hmm_dbCAN2_id{$hmmid} = 1; 
		my ($name) = $tmp[2];
		$dbCANout{$gn_id}{$hmmid}++;
		if (!exists $dbCANout2{$gn_id}{$hmmid}){
			$dbCANout2{$gn_id}{$hmmid} = $name;
		}else{
			$dbCANout2{$gn_id}{$hmmid} .= "\;".$name;
		}
	}
	close $dbCAN_fh;
}
close $dbCAN_list_fh;

# Store metabolic handoff steps 2
# Only lines containing ":" or "+" are used. Column 1 is either "step:reaction"
# (a single step) or a name without ":" (stored as is); column 3 holds the enzymes.
my %R_mh_02 = (); #step => enzymes
my %R_mh_02_enzyme_ids = ();
open my $R_mh_02_fh, '<', $R_mh_02
	or die "Cannot open the sequential transformation file $R_mh_02: $!\n";
while (<$R_mh_02_fh>){
	chomp;
	next unless /\:/ or /\+/;
	my @tmp = split (/\t/);

	if ($tmp[0] =~ /\:/){
		my ($step,$reaction) = $tmp[0] =~ /^(\D+?)\:(.+?)$/;
		$R_mh_02{$step} = $tmp[2]; $Letter2reaction{$step} = $reaction;
		# The enzyme ids of single steps are separated by ";" and a whitespace
		foreach my $enzyme_id (split (/\;\s/,$tmp[2])){
			$R_mh_02_enzyme_ids{$enzyme_id} = 1;
		}
	}else{
		$R_mh_02{$tmp[0]} = $tmp[2];
	}
}
close $R_mh_02_fh;

# For each step and genome with dbCAN result: 1 when the genome is able to do
# the step, otherwise 0
my %R_mh_02_summary = (); # step => gn => 1 or 0
foreach my $step (sort keys %R_mh_02){
	foreach my $gn (sort keys %dbCANout){
		# Returns 1 when one of the enzymes of the given string (separated by ";")
		# is listed in the CAZy map for a family that has hit(s) in this genome
		my $has_enzyme = sub {
			my ($enzyme_string) = @_;
			my $found = 0;
			foreach my $enzyme_id (split (/\;/, $enzyme_string)){
				foreach my $GH_id (sort keys %CAZy_map){
					foreach my $mapped_enzyme (split (/\;/,$CAZy_map{$GH_id})){
						$found = 1 if $mapped_enzyme eq $enzyme_id and $dbCANout{$gn}{$GH_id};
					}
				}
			}
			return $found;
		};

		if ($step !~ /\+/){
			# A single step
			$R_mh_02_summary{$step}{$gn} = $has_enzyme->($R_mh_02{$step});
		}else{
			# A combination of steps (step contains "+"): the enzymes of the steps are
			# separated by "|" and every one of the steps needs an enzyme
			my @enzyme_groups = split (/\|/,$R_mh_02{$step});
			my $satisfied = grep { $has_enzyme->($_) } @enzyme_groups;
			$R_mh_02_summary{$step}{$gn} = ($satisfied == scalar @enzyme_groups) ? 1 : 0;
		}
	}
}

# The genome coverage: genome id => coverage value
if ($omic_reads_parameters){
	my %Genome_cov = %Genome_cov_constant;
	
	# For each step: the number of genomes that can do it and the sum of their coverage values
	my %Total_R_hm_input_2 = (); # step => genome numbers \t genome coverage percentage
	foreach my $step (sort keys %R_mh_02){
		my $gn_no = 0; my $gn_cov_percentage = 0;
		foreach my $gn (sort keys %Hmmscan_result){
			next unless $R_mh_02_summary{$step}{$gn};
			$gn_no += $R_mh_02_summary{$step}{$gn};
			if ($Genome_cov{$gn}){
				$gn_cov_percentage += $Genome_cov{$gn};
			}
		}
		$Total_R_hm_input_2{$step} = "$gn_no\t$gn_cov_percentage";
	}

	my $input_2_file = "$output/METABOLIC_Figures_Input.$second_run_suffix/Sequential_Transformation_input_2.txt";
	open my $input_2_fh, '>', $input_2_file or die "Cannot write $input_2_file: $!\n";
	foreach my $key (sort keys %Total_R_hm_input_2){
		print {$input_2_fh} "$key\t$Total_R_hm_input_2{$key}\n";
	}
	close $input_2_fh or die "Cannot finish writing $input_2_file: $!\n";
}

# Draw the sequential transformation diagrams in a temporary folder ("newdir").
# The R script is given the two input tables, the transformation tsv, the two
# order-of-input files and the temporary folder; its error output is discarded.
`mkdir $output/newdir`;
`Rscript $METABOLIC_dir/draw_sequential_reaction_diagram.R $output/METABOLIC_Figures_Input.$second_run_suffix/Sequential_Transformation_input_1.txt $output/METABOLIC_Figures_Input.$second_run_suffix/Sequential_Transformation_input_2.txt $R_mh_tsv $R_order_of_input_01 $R_order_of_input_02 $output/newdir 2> /dev/null`;
# Move the two bar plots into the figures folder of the 2nd run, then remove the temporary folder
`mv $output/newdir/Bar_plot/bar_plot_input_1.pdf $output/METABOLIC_Figures.$second_run_suffix/Sequential_transformation_01.pdf`;
`mv $output/newdir/Bar_plot/bar_plot_input_2.pdf $output/METABOLIC_Figures.$second_run_suffix/Sequential_transformation_02.pdf`;
`rm -r $output/newdir`;

$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Drawing metabolic handoff diagrams finished\n";

$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Drawing energy flow chart...\n";

# Store the bin category
# (The GTDB-Tk call below is commented out: the classification is not run here.)
#system ("gtdbtk classify_wf --cpus $cpu_numbers -x fasta --genome_dir $input_genome_folder --skip_ani_screen --out_dir $output/intermediate_files/gtdbtk_Genome_files 2> /dev/null");

my %Bin2Cat = (); # bin => category, for instance, Acidimicrobiia_bacterium_UWMA-0264 => [0] Actinobacteriota (phylum) [1] XX (class) [2] XX (order) [3] XX (family) [4] XX (genus) [5] XX (species) [6] XX (bin)

if (-e "$output/intermediate_files/gtdbtk_Genome_files/gtdbtk.bac120.summary.tsv"){
	open IN, "$output/intermediate_files/gtdbtk_Genome_files/gtdbtk.bac120.summary.tsv";
	while (<IN>){
		chomp;
		if (!/^user_genome/){
			my @tmp = split (/\t/);
			if ($tmp[1] =~ /p\_\_Proteobacteria/){
				my $cat = ""; my $cat1 = ""; my $cat2 = ""; my $cat3 = ""; my $cat4 = ""; my $cat5 = "";
				($cat) = $tmp[1] =~ /\;c\_\_(.*?)\;/;
				if(!$cat){
					$cat = "NK_phylum";
				}
				$Bin2Cat{$tmp[0]}[0] = $cat;  # Store bin to phylum info
				
				($cat1) = $tmp[1] =~ /\;c\_\_(.*?)\;/;
				if(!$cat1){
					$cat1 = "NK_class";
				}	
				$Bin2Cat{$tmp[0]}[1] = $cat1; # Store bin to class info	
				
				($cat2) = $tmp[1] =~ /\;o\_\_(.*?)\;/;
				if(!$cat2){
					$cat2 = "NK_order";
				}	
				$Bin2Cat{$tmp[0]}[2] = $cat2; # Store bin to order info			

				($cat3) = $tmp[1] =~ /\;f\_\_(.*?)\;/;
				if(!$cat3){
					$cat3 = "NK_family";
				}	
				$Bin2Cat{$tmp[0]}[3] = $cat3; # Store bin to family info		

				($cat4) = $tmp[1] =~ /\;g\_\_(.*?)\;/;
				if(!$cat4){
					$cat4 = "NK_genus";
				}	
				$Bin2Cat{$tmp[0]}[4] = $cat4; # Store bin to genus info				
				
				($cat5) = $tmp[1] =~ /\;s\_\_(.*?)$/;
				if(!$cat5){
					$cat5 = "NK_species";
				}	
				$Bin2Cat{$tmp[0]}[5] = $cat5; # Store bin to species info	
				$Bin2Cat{$tmp[0]}[6] = $tmp[0]; # Store bin to bin info		
			}else{
				my $cat = ""; my $cat1 = ""; my $cat2 = ""; my $cat3 = ""; my $cat4 = ""; my $cat5 = "";
				($cat) = $tmp[1] =~ /\;p\_\_(.*?)\;/;
				if(!$cat){
					$cat = "NK_phylum";
				}
				$Bin2Cat{$tmp[0]}[0] = $cat;  # Store bin to phylum info
				
				($cat1) = $tmp[1] =~ /\;c\_\_(.*?)\;/;
				if(!$cat1){
					$cat1 = "NK_class";
				}	
				$Bin2Cat{$tmp[0]}[1] = $cat1; # Store bin to class info	
				
				($cat2) = $tmp[1] =~ /\;o\_\_(.*?)\;/;
				if(!$cat2){
					$cat2 = "NK_order";
				}	
				$Bin2Cat{$tmp[0]}[2] = $cat2; # Store bin to order info		

				($cat3) = $tmp[1] =~ /\;f\_\_(.*?)\;/;
				if(!$cat3){
					$cat3 = "NK_family";
				}	
				$Bin2Cat{$tmp[0]}[3] = $cat3; # Store bin to family info	

				($cat4) = $tmp[1] =~ /\;g\_\_(.*?)\;/;
				if(!$cat4){
					$cat4 = "NK_genus";
				}	
				$Bin2Cat{$tmp[0]}[4] = $cat4; # Store bin to genus info		

				($cat5) = $tmp[1] =~ /\;s\_\_(.*?)$/;
				if(!$cat5){
					$cat5 = "NK_species";
				}	
				$Bin2Cat{$tmp[0]}[5] = $cat5; # Store bin to species info	
				$Bin2Cat{$tmp[0]}[6] = $tmp[0]; # Store bin to bin info		
			}
		}
	}
	close IN;
}

if (-e "$output/intermediate_files/gtdbtk_Genome_files/gtdbtk.ar53.summary.tsv"){
	open IN, "$output/intermediate_files/gtdbtk_Genome_files/gtdbtk.ar53.summary.tsv";
	while (<IN>){
		chomp;
		if (!/^user_genome/){
			my @tmp = split (/\t/);
			my $cat = ""; my $cat1 = ""; my $cat2 = ""; my $cat3 = ""; my $cat4 = ""; my $cat5 = "";
			($cat) = $tmp[1] =~ /\;p\_\_(.*?)\;/;
			if(!$cat){
				$cat = "NK_phylum";
			}			
			$Bin2Cat{$tmp[0]}[0] = $cat;  # Store bin to phylum info
			
			($cat1) = $tmp[1] =~ /\;c\_\_(.*?)\;/;
			if(!$cat1){
				$cat1 = "NK_class";
			}	
			$Bin2Cat{$tmp[0]}[1] = $cat1; # Store bin to class info	
			
			($cat2) = $tmp[1] =~ /\;o\_\_(.*?)\;/;
			if(!$cat2){
				$cat2 = "NK_order";
			}	
			$Bin2Cat{$tmp[0]}[2] = $cat2; # Store bin to order info			

			($cat3) = $tmp[1] =~ /\;f\_\_(.*?)\;/;
			if(!$cat3){
				$cat3 = "NK_family";
			}	
			$Bin2Cat{$tmp[0]}[3] = $cat3; # Store bin to family info	

			($cat4) = $tmp[1] =~ /\;g\_\_(.*?)\;/;
			if(!$cat4){
				$cat4 = "NK_genus";
			}	
			$Bin2Cat{$tmp[0]}[4] = $cat4; # Store bin to genus info		

			($cat5) = $tmp[1] =~ /\;s\_\_(.*?)$/;
			if(!$cat5){
				$cat5 = "NK_species";
			}	
			$Bin2Cat{$tmp[0]}[5] = $cat5; # Store bin to species info	
			$Bin2Cat{$tmp[0]}[6] = $tmp[0]; # Store bin to bin info		
		}
	}
	close IN;
}

my %Hash_gn_n_pth = (); 
my %Total_R_community_coverage = (); # genome\tpathway => category \t pathway \t genome coverage percentage
# For every (pathway, genome) combination where the genome has a coverage value and is
# flagged for the pathway in %Total_R_input, record the genome's slot-[0] taxonomic group,
# the pathway and the genome coverage.
if ($omic_reads_parameters){
	my %Genome_cov = %Genome_cov_constant;
	for my $pathway (sort keys %Total_R_input){
		for my $genome (sort keys %Hmmscan_result){
			next unless $Genome_cov{$genome} and $Total_R_input{$pathway}{$genome};
			my $group = $Bin2Cat{$genome}[0];
			my $pair_key = "$genome\t$pathway";
			$Hash_gn_n_pth{$pair_key} = 1;
			$Total_R_community_coverage{$pair_key} = "$group\t$pathway\t$Genome_cov{$genome}";
		}
	}
}

my %Total_R_community_coverage2 = (); # $genome\tpath pair => cat \t  coverage percentage average

# Group the pathways (and their coverage) by genome in a single pass.
# The genome is taken as the exact text before the first tab of the key; the previous
# unanchored, unescaped pattern match on the genome name also picked up keys of other
# genomes whose ID merely ended with this ID, and misread regex metacharacters in IDs.
my %R_pth_cov_by_gn = (); # genome => pathway => coverage
foreach my $gn_n_pth (keys %Total_R_community_coverage){
	my ($key_gn, $key_pth) = split (/\t/, $gn_n_pth, 2);
	my @fields = split (/\t/, $Total_R_community_coverage{$gn_n_pth}); # category, pathway, coverage
	$R_pth_cov_by_gn{$key_gn}{$key_pth} = $fields[2];
}

# For each genome, pair up every two of its pathways (in sorted order) and store the
# average of the two coverage values together with the genome's slot-[0] taxonomic group.
foreach my $gn (sort keys %Hmmscan_result){
	next unless $R_pth_cov_by_gn{$gn};
	my %cov_of_pth = %{ $R_pth_cov_by_gn{$gn} };
	my @Path_keys = sort keys %cov_of_pth;
	for(my $i=0; $i<=$#Path_keys; $i++){
		for(my $j = $i+1; $j<=$#Path_keys; $j++){
			my $pair = "$Path_keys[$i]\t$Path_keys[$j]";
			my $coverage = ($cov_of_pth{$Path_keys[$i]} + $cov_of_pth{$Path_keys[$j]}) / 2;
			$Total_R_community_coverage2{"$gn\t$pair"} = $Bin2Cat{$gn}[0]."\t".$coverage;
		}
	}
}

# Write the Sankey diagram input: one "category<TAB>pathway<TAB>coverage" line per genome/pathway key.
# Both files use checked, lexical filehandles because the R scripts that read them run with
# stderr discarded, which would otherwise hide a failed write.
my $sankey_input_file = "$output/METABOLIC_Figures_Input.$second_run_suffix/Metabolic_Sankey_diagram_input.txt";
open my $sankey_input_fh, '>', $sankey_input_file or die "Cannot write $sankey_input_file: $!\n";
foreach my $gn_n_pth (sort keys %Hash_gn_n_pth){
	print {$sankey_input_fh} "$Total_R_community_coverage{$gn_n_pth}\n";
}
close $sankey_input_fh or die "Cannot close $sankey_input_file: $!\n";

# Write the functional network input: genome, the two paired steps, taxonomic group and averaged coverage.
my $network_input_file = "$output/METABOLIC_Figures_Input.$second_run_suffix/Functional_network_input.txt";
open my $network_input_fh, '>', $network_input_file or die "Cannot write $network_input_file: $!\n";
print {$network_input_fh} "#Genome\tStep1\tStep2\tTaxonomic Group\tCoverage value\(average\)\n";
foreach my $gn_n_pair (sort keys %Total_R_community_coverage2){
	print {$network_input_fh} "$gn_n_pair\t$Total_R_community_coverage2{$gn_n_pair}\n";
}
close $network_input_fh or die "Cannot close $network_input_file: $!\n";

# Draw the Sankey diagram: the R script writes into the scratch folder $output/Output_energy_flow,
# the resulting PDF is moved into the figures folder and the scratch folder is deleted.
# NOTE(review): stderr of the R script is discarded and its exit status is not checked.
`Rscript $METABOLIC_dir/draw_metabolic_Sankey_diagram.R $output/METABOLIC_Figures_Input.$second_run_suffix/Metabolic_Sankey_diagram_input.txt $output/Output_energy_flow 2> /dev/null`;
`mv $output/Output_energy_flow/Energy_plot/network.plot.pdf   $output/METABOLIC_Figures.$second_run_suffix/Metabolic_Sankey_diagram.pdf; rm -r $output/Output_energy_flow`;

# Draw the functional network figures: same pattern, but the whole network_plot folder is moved.
`Rscript $METABOLIC_dir/draw_functional_network_diagram.R $output/METABOLIC_Figures_Input.$second_run_suffix/Functional_network_input.txt $output/OutputFolder_Energy 2> /dev/null`;
`mv $output/OutputFolder_Energy/network_plot $output/METABOLIC_Figures.$second_run_suffix/Functional_network_figures; rm -r $output/OutputFolder_Energy`;

# Progress messages, stamped with the current local time
$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Drawing energy flow chart finished\n";


$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Calculating MW-score ...\n";

### To calculate MW-score
# Store MW_score_reaction_table
# Column 1 of the table is the function name, column 3 its HMM specification:
#   - without ";" it is a comma-separated list of HMM ids;
#   - with ";" it is several comma-separated lists joined by ";", where entries
#     containing "NO" are not registered as HMM ids.
my %MW_functions = (); # func => hmms
my %MW_function_hmm_ids = ();
open my $MW_table_fh, '<', $MW_score_reaction_table or die "Cannot open MW-score reaction table $MW_score_reaction_table: $!\n"; # Read the MW_score_reaction table
while (my $line = <$MW_table_fh>){
	chomp $line;
	next if $line =~ /^#/;     # comment / header line
	next if $line =~ /^\s*$/;  # blank line: would otherwise register a bogus function
	my @tmp = split (/\t/, $line);
	$MW_functions{$tmp[0]} = $tmp[2];
	if ($tmp[2] !~ /\;/){
		foreach my $hmm_id (split (/\,/, $tmp[2])){
			$MW_function_hmm_ids{$hmm_id} = 1;
		}
	}else{
		foreach my $hmm_group (split (/\;/, $tmp[2])){
			foreach my $hmm_id (split (/\,/, $hmm_group)){
				$MW_function_hmm_ids{$hmm_id} = 1 if $hmm_id !~ /NO/;
			}
		}
	}
}
close $MW_table_fh;

# Get MW_score_reaction_table result
my %MW_score_hash = (); # pathway => gn => 1 or 0

# True (1) when at least one registered HMM id occurs in $hmm_list and the genome has a hit for it.
# The id is searched as a literal substring: previously it was interpolated as a regular
# expression, so the "." in ids like "K00001.hmm" matched any character, other metacharacters
# in an id could mis-match or abort the run, and an empty id re-used the last successful pattern.
my $MW_genome_hits_any = sub {
	my ($gn, $hmm_list) = @_;
	return 0 unless defined $hmm_list;
	foreach my $hmm_id (keys %MW_function_hmm_ids){
		next if $hmm_id eq "";
		return 1 if index ($hmm_list, $hmm_id) >= 0 and $Hmmscan_result{$gn}{$hmm_id};
	}
	return 0;
};

foreach my $gn (sort keys %Hmmscan_result){
	foreach my $key (sort keys %MW_functions){
		$MW_score_hash{$key}{$gn} = 0;
		my $hmms = $MW_functions{$key};
		next unless defined $hmms;
		if ($hmms !~ /\;/){
			# Single list: any hit is enough
			$MW_score_hash{$key}{$gn} = 1 if $MW_genome_hits_any->($gn, $hmms);
		}else{
			# Two parts separated by the first ";"
			my ($hmms_1,$hmms_2) = $hmms =~ /^(.+?)\;(.+?)$/;
			next unless defined $hmms_1 and defined $hmms_2;
			my $first_part_hit  = $MW_genome_hits_any->($gn, $hmms_1);
			my $second_part_hit = $MW_genome_hits_any->($gn, $hmms_2);
			if ($hmms_2 !~ /NO/){
				# Both parts must have a hit
				$MW_score_hash{$key}{$gn} = 1 if $first_part_hit and $second_part_hit;
			}else{
				# The second part is an exclusion list: the first part must hit and the second must not
				$MW_score_hash{$key}{$gn} = 1 if $first_part_hit and not $second_part_hit;
			}
		}
	}
}

my %MW_score_community_coverage = (); # genome\tpathway => category \t pathway \t genome coverage percentage
# For every (function, genome) combination where the genome has a coverage value and scored 1
# for the function, record the genome's taxonomic group at the rank selected by
# $Tax2code{$taxonomy}, the function and the genome coverage.
if ($omic_reads_parameters){
	my %Genome_cov = %Genome_cov_constant;
	for my $function (sort keys %MW_score_hash){
		for my $genome (sort keys %Hmmscan_result){
			next unless $Genome_cov{$genome} and $MW_score_hash{$function}{$genome};
			my $rank_index = $Tax2code{$taxonomy};
			my $group = $Bin2Cat{$genome}[$rank_index];
			$MW_score_community_coverage{"$genome\t$function"} = "$group\t$function\t$Genome_cov{$genome}";
		}
	}
}

my %MW_score_community_coverage2 = (); # $genome\tpath pair => cat \t  coverage percentage average

# Group the functions (and their coverage) by genome in a single pass.
# The genome is taken as the exact text before the first tab of the key; the previous
# unanchored, unescaped pattern match on the genome name also picked up keys of other
# genomes whose ID merely ended with this ID, and misread regex metacharacters in IDs.
my %MW_func_cov_by_gn = (); # genome => function => coverage
foreach my $gn_n_pth (keys %MW_score_community_coverage){
	my ($key_gn, $key_pth) = split (/\t/, $gn_n_pth, 2);
	my @fields = split (/\t/, $MW_score_community_coverage{$gn_n_pth}); # category, function, coverage
	$MW_func_cov_by_gn{$key_gn}{$key_pth} = $fields[2];
}

# For each genome, pair up every two of its functions (in sorted order) and store the
# average of the two coverage values together with the genome's taxonomic group at the
# rank selected by $Tax2code{$taxonomy}.
foreach my $gn (sort keys %Hmmscan_result){
	next unless $MW_func_cov_by_gn{$gn};
	my %cov_of_func = %{ $MW_func_cov_by_gn{$gn} };
	my @Path_keys = sort keys %cov_of_func;
	for(my $i=0; $i<=$#Path_keys; $i++){
		for(my $j = $i+1; $j<=$#Path_keys; $j++){
			my $pair = "$Path_keys[$i]\t$Path_keys[$j]";
			my $coverage = ($cov_of_func{$Path_keys[$i]} + $cov_of_func{$Path_keys[$j]}) / 2;
			my $tax_code = $Tax2code{$taxonomy};
			my $cat = $Bin2Cat{$gn}[$tax_code];
			$MW_score_community_coverage2{"$gn\t$pair"} = $cat."\t".$coverage;
		}
	}
}

# Write the pairwise table "MW-score_result_table_input.txt" into a fresh result folder.
`mkdir $output/MW-score_result.$second_run_suffix`;
my $MW_table_input_file = "$output/MW-score_result.$second_run_suffix/MW-score_result_table_input.txt";
open my $MW_table_input_out_fh, '>', $MW_table_input_file or die "Cannot write $MW_table_input_file: $!\n";
print {$MW_table_input_out_fh} "#Genome\tFunc1\tFunc2\tTaxonomic Group\tCoverage value\(average\)\n";
foreach my $gn_n_pair (sort keys %MW_score_community_coverage2){
	print {$MW_table_input_out_fh} "$gn_n_pair\t$MW_score_community_coverage2{$gn_n_pair}\n";
}
close $MW_table_input_out_fh or die "Cannot close $MW_table_input_file: $!\n";

#Read the "MW-score_result_table_input.txt" and make the "MW-score_result.txt", which is the final result of MW-score
my %Input = (); # whole line => [0]  Acidimicrobiia_bacterium_UWMA-0264	[1]  C-S-01:Organic carbon oxidation	[2] C-S-04:Acetate oxidation	[3] Actinobacteriota	[4] 0.038328883
open my $MW_table_input_in_fh, '<', $MW_table_input_file or die "Cannot read $MW_table_input_file: $!\n";
while (my $line = <$MW_table_input_in_fh>){
	chomp $line;
	next if $line =~ /\#/; # skips the header (and any line containing "#")
	my @tmp = split (/\t/, $line);
	$Input{$line} = [ @tmp[0 .. 4] ]; # genome, func1, func2, taxonomic group, averaged coverage
}
close $MW_table_input_in_fh;

my %Output1 = (); # func. => category => summed coverage
my %Output2 = (); # func. =>  summed coverage
my %Cat_2 =(); # This hash of Cat_2 is only used in MW-score calculating
# Every input record credits its averaged coverage to both functions of the pair,
# once per taxonomic group (%Output1) and once overall (%Output2).
for my $line_key (sort keys %Input){
	my (undef, $func_a, $func_b, $group, $pair_cov) = @{ $Input{$line_key} };
	$Output1{$func_a}{$group} += $pair_cov;
	$Output1{$func_b}{$group} += $pair_cov;
	$Cat_2{$group} = 1;
	$Output2{$func_a} += $pair_cov;
	$Output2{$func_b} += $pair_cov;
}

my %Output3 = (); # The contribution percentage for each function
my $sum_cov_for_output2 = 0;
$sum_cov_for_output2 += $Output2{$_} for sort keys %Output2;

# Share of each function in the grand total, as a percentage with one decimal
for my $func (sort keys %Output2){
	$Output3{$func} = sprintf "%.1f", ($Output2{$func} / $sum_cov_for_output2) * 100;
}

my %Output4 = (); # func. => category => percentage  
# The func. and each category contribution percentage table
for my $func (sort keys %Output1){
	for my $cat (sort keys %Cat_2){
		my $share = ($Output2{$func} and $Output1{$func}{$cat})
			? ($Output1{$func}{$cat} / $Output2{$func}) * 100
			: 0;
		$Output4{$func}{$cat} = sprintf "%.1f", $share;
	}
}

# Write the final MW-score table: one row per function with its overall MW-score followed by
# the contribution percentage of every taxonomic group (columns in sorted group order).
# The filehandle is lexical and checked so a failed write is reported instead of being ignored.
my $MW_result_file = "$output/MW-score_result.$second_run_suffix/MW-score_result.txt";
open my $MW_result_fh, '>', $MW_result_file or die "Cannot write $MW_result_file: $!\n";
my @MW_result_categories = sort keys %Cat_2;
my $row=join("\t", @MW_result_categories);
print {$MW_result_fh} "Function\tMW-score for each function\t$row\n";
foreach my $tmp1 (sort keys %Output4)
{
        # A group without a value for this function is reported as "0"
        my @tmp = map { exists $Output4{$tmp1}{$_} ? $Output4{$tmp1}{$_} : "0" } @MW_result_categories;
        print {$MW_result_fh} $tmp1."\t".$Output3{$tmp1}."\t".join("\t",@tmp)."\n";
}
close $MW_result_fh or die "Cannot close $MW_result_file: $!\n";

$datestring = strftime "%Y-%m-%d %H:%M:%S", localtime; 
print "\[$datestring\] Calculating MW-score is done\n";
$endtime = $datestring;
}

# Total wall-clock time since $starttime_raw, formatted as hh:mm:ss
my $duration = time - $starttime_raw;
$duration = parse_duration($duration);
print "METABOLIC-C was done, the total running time: $duration (hh:mm:ss)\n";

# Print information about this run:
# A log that cannot be written is reported with a warning instead of being ignored;
# the analysis itself has already finished, so this is deliberately not fatal.
my $run_log_file = "$output/METABOLIC_run.$second_run_suffix.log";
if (open my $run_log_fh, '>', $run_log_file){
	print {$run_log_fh} "$version
Run Start: $starttime
Run End: $endtime
Total running time: $duration (hh:mm:ss)
Input Reads: $omic_reads_parameters
Reads type: $omic_reads_type
Input Genome directory (nucleotides): $input_genome_folder
Number of Threads: $cpu_numbers
Prodigal Method: $prodigal_method
KOfam DB: $kofam_db_size
Module Cutoff Value: $module_cutoff
Taxonomic level to calculate MW-score table: $taxonomy
Output directory: $output\n";
	close $run_log_fh or warn "Cannot close run log $run_log_file: $!\n";
}else{
	warn "Cannot write run log $run_log_file: $!\n";
}


## Subroutines
# Format a number of seconds as "hh:mm:ss" (hours are not capped at 99).
# All divisions are integer divisions.
sub parse_duration {
    my $total_seconds = $_[0];
    use integer;
    my $hours   = $total_seconds / 3600;
    my $minutes = $total_seconds / 60 % 60;
    my $seconds = $total_seconds % 60;
    return sprintf("%02d:%02d:%02d", $hours, $minutes, $seconds);
}

# Input ko_list, return a result hash of threshold and score_type
#   $_[0] - path of the ko_list table (tab separated; only lines starting with "K" are used)
#   $_[1] - folder holding the list of HMM ids to keep: "prokaryote.hal" when $kofam_db_size
#           is "full", "All_Module_KO_ids.txt" when it is "small"
# Returns a hash "<KO>.hmm" => "<threshold>|<score_type>"; a "-" threshold becomes "50|full".
# Entries whose id is not in the keep-list are dropped.
# If either file cannot be opened a warning is issued and an empty hash is returned
# (the same result as before, but now with an explicit message).
# Lexical filehandles are used so a filehandle the caller has open under the global
# name IN is no longer clobbered.
sub _get_kofam_db_KO_threshold{
	my $list = $_[0]; 
	my $prok_list = "";
	if ($kofam_db_size eq "full"){
		$prok_list = "$_[1]/prokaryote.hal";
	}elsif ($kofam_db_size eq "small"){
		$prok_list = "$_[1]/All_Module_KO_ids.txt";
	}
	
	my %result = ();
	my $ko_list_fh;
	unless (open $ko_list_fh, '<', $list){
		warn "_get_kofam_db_KO_threshold: cannot open ko_list '$list': $!\n";
		return ();
	}
	while (my $line = <$ko_list_fh>){
		chomp $line;
		next unless $line =~ /^K/;
		my @tmp = split (/\t/, $line);
		my $hmm_id = "$tmp[0]\.hmm";
		$result{$hmm_id} = ($tmp[1] eq "\-") ? "50|full" : "$tmp[1]|$tmp[2]";
	}
	close $ko_list_fh;
	
	my %Prok_list = ();
	my $prok_list_fh;
	unless (open $prok_list_fh, '<', $prok_list){
		warn "_get_kofam_db_KO_threshold: cannot open HMM id list '$prok_list' (kofam db size '$kofam_db_size'): $!\n";
		return ();
	}
	while (my $line = <$prok_list_fh>){
		chomp $line;
		$Prok_list{$line} = 1;
	}	
	close $prok_list_fh;
	
	# Keep only the ids present in the keep-list
	foreach my $key (sort keys %result){
		delete $result{$key} unless $Prok_list{$key};
	}
	return (%result);
}

# Build an HMM => KO lookup (like: TIGR02694.hmm => K08355.hmm) from the hmm_table_temp hash.
# Input is a flat hash (line number => tab separated line); column 6 holds the HMM file name(s)
# and column 7 the matching KO id(s). Several names/ids in one cell are separated by "; " and are
# paired up by position. Lines are processed in sorted key order, so a later line overrides an
# earlier one for the same HMM. Only pairs whose final KO value starts with "K" plus five digits
# are returned.
sub _get_hmm_2_KO_hash{
	my %table = @_;
	my %ko_for_hmm = ();
	for my $line_no (sort keys %table){
		my @columns = split (/\t/, $table{$line_no});
		my ($hmm, $ko) = @columns[5, 6];
		next unless $hmm;
		if ($hmm =~ /\;/){
			my @hmm_names = split (/\; /, $hmm);
			my @ko_names = split (/\; /, $ko);
			for my $idx (0 .. $#hmm_names){
				$ko_for_hmm{$hmm_names[$idx]} = $ko_names[$idx]."\.hmm";
			}
		}else{
			$ko_for_hmm{$hmm} = $ko."\.hmm";
		}
	}
	
	my %kept = ();
	for my $hmm (sort keys %ko_for_hmm){
		next unless $ko_for_hmm{$hmm} =~ /^K\d\d\d\d\d/;
		$kept{$hmm} = $ko_for_hmm{$hmm};
	}
	return %kept;
}

# Input faa file, and output the seq hash
#   $_[0] - path of a protein FASTA file; the genome name is the file name between the
#           last "/" and ".faa"
# Returns a hash ">genome~~seqid" => sequence, where seqid is the header text up to the first
# whitespace and a trailing "*" is stripped from the sequence.
# If the file cannot be opened a warning is issued and an empty hash is returned.
# A lexical filehandle replaces the global bareword handle so nested or parallel callers
# cannot clobber each other's handle.
sub _get_faa_seq{
	my $file = $_[0]; 
	my ($file_name) = $file =~ /^.+\/(.+?)\.faa/;
	my %result = (); my $head = "";
	my $faa_fh;
	unless (open $faa_fh, '<', $file){
		warn "_get_faa_seq: cannot open '$file': $!\n";
		return %result;
	}
	while (my $line = <$faa_fh>){
		chomp $line;
		if ($line =~ />/){
			# Header: keep only the id in front of the first whitespace (or the whole line if there is none)
			my ($head_old) = $line =~ /\s/ ? $line =~ /^>(.+?)\s/ : $line =~ /^>(.+?)$/;
			$head = ">".$file_name."~~".$head_old;
			$result{$head} = "";
		}else{
			$result{$head} .= $line; 
			$result{$head} =~ s/\*$//g; # drop the stop-codon marker at the end
		}
	}
	close $faa_fh;
	return %result;
}

# Map paired reads onto all genes of all genomes and return the relative coverage of each genome.
#   $_[0] - path of the reads parameter file; every non-comment, non-blank line is split on ","
#           and its first two fields are used as the -1 / -2 read files of one pair
#   $_[1] - the input genome folder; every "*.gene" file in it is one genome
# Returns a hash: bin (genome) name => mean gene coverage of the bin divided by the sum of the
# means of all bins.
# Relies on file-level variables ($output, $test, $METABOLIC_dir, $omic_reads_type, $cpu_numbers,
# $second_run, $second_run_suffix) and on the external tools bowtie2, samtools, sambamba and coverm.
# NOTE(review): no open() or external command is checked for failure.
sub _get_Genome_coverge{
	my $reads = $_[0];
	my $folder = $_[1]; # The input_genome_folder
	# Cat all the genes
	# Every gene header is rewritten to ">genome~~geneid" (geneid = header text up to the first whitespace)
	my %Seq = (); my $head = "";
	open __IN, "ls $folder/*.gene | ";
	while (<__IN>){
		chomp;
		my $file = $_;
		# NOTE(review): $folder is interpolated into the pattern unescaped
		my ($seq_name) = $file =~ /$folder\/(.+?)\.gene/; 
		open __INN, $file;
		while (<__INN>){
			chomp;
			if (/>/){
				my $head_old = $_;
				my $head_old_2;
				if ($head_old =~ /\s/){
					($head_old_2) = $head_old =~ />(.+?)\s/;
				}else{
					($head_old_2) = $head_old =~ />(.+?)$/;
				}
				$head = ">".$seq_name."~~".$head_old_2;
				$Seq{$head} = "";
			}else{
				$Seq{$head} .= $_;
			}
		}
		close __INN;
	}
	close __IN;
	
	# Write the combined gene collection, one sequence per line
	open __OUT, ">$output/All_gene_collections.gene";
	foreach my $key (sort keys %Seq){
		print __OUT "$key\n$Seq{$key}\n";
	}
	close __OUT;
	
	# Index the gene collection and read the list of read pairs (pair => running number)
	system ("bowtie2-build --quiet $output/All_gene_collections.gene $output/All_gene_collections.gene.scaffold");
	my %Reads = (); my $i = 1;
	open __IN, "$reads";
	while (<__IN>){
		chomp;
		next if /^#/ or /^\s*$/; # Skip comments and blank lines
		
		my @tmp = split (/\,/,$_);
		my $tmp_link = "";
		if ($test ne "true"){
			$tmp_link = $tmp[0]."\t".$tmp[1];
		}else{
			# Test mode: the read files live in the bundled test folder
			$tmp_link = "$METABOLIC_dir/METABOLIC_test_files/METABOLIC_test_reads/".$tmp[0]."\t"."$METABOLIC_dir/METABOLIC_test_files/METABOLIC_test_reads/".$tmp[1];
		}
		$Reads{$tmp_link} = $i;
		$i++;
	}
	close __IN;
	
	# Write one shell line per read pair: map, convert to BAM, sort, index, flagstat, clean up
	my %Read_seq_numbers = (); # read pair => read seq number
	my $average_read_seq_number = 0; # The average read
	open OUT__,">$output/tmp_calculate_depth.sh";
	foreach my $key (sort keys %Reads){
		my $j = $Reads{$key};
		my @tmp = split (/\t/,$key); 
		if ($omic_reads_type eq "metaT"){
			# Read count of the pair = lines of the first file / 4 * 2
			# (assumes uncompressed 4-line FASTQ records and equally sized mates - TODO confirm)
			my $seq_number = `cat $tmp[0] | wc -l`; chomp $seq_number; $seq_number = $seq_number / 4 * 2; $Read_seq_numbers{$key} = $seq_number;
		}
		print OUT__ "bowtie2 -x $output/All_gene_collections.gene.scaffold -1 $tmp[0] -2 $tmp[1] -S $output/All_gene_collections_mapped.$j.sam -p $cpu_numbers --quiet;";
		print OUT__ "samtools view -bS $output/All_gene_collections_mapped.$j.sam > $output/All_gene_collections_mapped.$j.bam -@ $cpu_numbers 2> /dev/null;";
		print OUT__ "mkdir $output/sambamba_tmpfiles.$j; sambamba sort  $output/All_gene_collections_mapped.$j.bam --tmpdir $output/sambamba_tmpfiles.$j -o $output/All_gene_collections_mapped.$j.sorted.bam 2> /dev/null;";
		print OUT__ "samtools index $output/All_gene_collections_mapped.$j.sorted.bam 2> /dev/null;";
		print OUT__ "samtools flagstat $output/All_gene_collections_mapped.$j.sorted.bam > $output/All_gene_collections_mapped.$j.sorted.stat 2> /dev/null;";
		print OUT__ "rm $output/All_gene_collections_mapped.$j.sam $output/All_gene_collections_mapped.$j.bam;rm -r $output/sambamba_tmpfiles.$j\n";
	}
	close OUT__;
	
	# Sum, then (metaT only) average the read counts over all pairs
	foreach my $key (sort keys %Read_seq_numbers){
		$average_read_seq_number += $Read_seq_numbers{$key};
	}	
	
	if ($omic_reads_type eq "metaT"){
		# NOTE(review): divides by zero when the reads file lists no pairs
		my @Read_seq_numbers = keys %Read_seq_numbers;
		$average_read_seq_number = $average_read_seq_number / (scalar @Read_seq_numbers) ;
	}
	
	# Parallel run calculate coverage
	# (_run_parallel is defined elsewhere in this file; $i is the number of read pairs + 1)
	_run_parallel("$output/tmp_calculate_depth.sh", $i); `rm $output/tmp_calculate_depth.sh`;
	
	# The output name of depth and transcript coverage file
	my $depth_file_name = "All_gene_collections_mapped.depth.txt";
	my $transcript_coverge_file_name = "All_gene_collections_transcript_coverage.txt";
	if ($second_run){
		$depth_file_name = "All_gene_collections_mapped.depth.$second_run_suffix.txt";
		$transcript_coverge_file_name = "All_gene_collections_transcript_coverage.$second_run_suffix.txt";
	}
	
	# Depth table over all sorted BAM files
	system ("coverm contig --methods metabat --bam-files  $output/All_gene_collections_mapped.*.sorted.bam > $output/$depth_file_name 2> /dev/null");
	
	# Remove the index, the gene collection and the mapping files
	`rm $output/*.bt2;rm $output/All_gene_collections.gene; rm $output/*.bam; rm $output/*.sorted.stat;rm $output/*.bai`;
	
	my %h = (); # average => bin => all gene coverage values
	my @h_head = ();  
	my @h_head_num = (); 
	my %Bin = ();
	
	if ($omic_reads_type eq "metaG"){
		# metaG: collect the "totalAvgDepth" value of every gene, tab-joined per bin
		open __IN, "$output/$depth_file_name";
		while (<__IN>){
			chomp;
			if (/^contigName/){
					# Header line: remember the column index/indices named exactly "totalAvgDepth"
					my @tmp = split (/\t/);@h_head = @tmp;
					for(my $i=0; $i<=$#h_head; $i++){
							if ($h_head[$i] =~ /^totalAvgDepth$/){
								push @h_head_num, $i;
							}
					}
			}else{
					my @tmp = split (/\t/);
					# The bin name is the part of the gene id in front of "~~"
					my ($bin) = $tmp[0] =~ /^(.+?)\~\~/;
					$Bin{$bin} = 1;
					foreach my $i (@h_head_num){
                        if (!exists $h{$h_head[$i]}{$bin}){
                                $h{$h_head[$i]}{$bin} = $tmp[$i];
                        }else{
                                $h{$h_head[$i]}{$bin} .= "\t".$tmp[$i];
                        }
					}
			}
		}
		close __IN;
	}elsif ($omic_reads_type eq "metaT"){
		# metaT: same as above, but each depth value is first normalised by the average read
		# number (per million) and the gene length in kb, and also written to the transcript coverage file
		open __IN, "$output/$depth_file_name";
		open __OUT, ">$output/$transcript_coverge_file_name"; # contigName => transcript coverage in RPKM
		while (<__IN>){
			chomp;
			if (/^contigName/){
				my @tmp = split (/\t/);@h_head = @tmp;
				for(my $i=0; $i<=$#h_head; $i++){
					if ($h_head[$i] =~ /^totalAvgDepth$/){
						push @h_head_num, $i;
					}	
				}
				print __OUT "contigName\tTranscript coverage in RPKM\n";
			}else{
					my @tmp = split (/\t/);
					my ($bin) = $tmp[0] =~ /^(.+?)\~\~/;
					# Second column of the depth table is taken as the gene length in bp
					my $geneLength = $tmp[1] / 1000; # Gene length in kb
					$Bin{$bin} = 1;
					foreach my $i (@h_head_num){
                        my $transcript_coverage = $tmp[$i] * (1000000  / $average_read_seq_number) / $geneLength;
						print __OUT "$tmp[0]\t$transcript_coverage\n";
						if (!exists $h{$h_head[$i]}{$bin}){
                                $h{$h_head[$i]}{$bin} = $transcript_coverage;
                        }else{
                                $h{$h_head[$i]}{$bin} .= "\t".$transcript_coverage;
                        }
					}
			}
		}
		close __IN;
		close __OUT;
	}
	
	# Mean gene coverage per bin, and the total over all bins
	my %Bin2Cov = (); # bin => cov value
	my $total_cov = 0;
	foreach my $i (@h_head_num){
        foreach my $bin (sort keys %Bin){
                my @tmp = split (/\t/, $h{$h_head[$i]}{$bin});
                my $stat = Statistics::Descriptive::Full->new();
                $stat->add_data(\@tmp);
                my $mean = $stat->mean();
                $h{$h_head[$i]}{$bin} = $mean;
				$Bin2Cov{$bin} = $mean;
				$total_cov += $mean;
        }
	}
	
	# Relative coverage: each bin's mean as a fraction of the total
	# NOTE(review): divides by zero when every bin has a mean coverage of 0
	my %Bin2cov_percentage = ();
	foreach my $bin (sort keys %Bin2Cov){
		my $percentage = $Bin2Cov{$bin} / $total_cov;
		$Bin2cov_percentage{$bin} = $percentage;
	}
	return %Bin2cov_percentage;
}

# Long-read variant of the genome coverage calculation: reads are mapped with minimap2.
#   $_[0] - path of the reads parameter file; every non-comment, non-blank line is one read file
#   $_[1] - the input genome folder; every "*.gene" file in it is one genome
# Returns a hash: bin (genome) name => mean gene coverage of the bin divided by the sum of the
# means of all bins.
# Relies on file-level variables ($output, $test, $METABOLIC_dir, $omic_reads_type,
# $sequencing_type, $cpu_numbers) and on the external tools minimap2, samtools, sambamba and coverm.
# NOTE(review): the depth / transcript coverage file names used here are fixed and do not
# carry a second-run suffix.
# NOTE(review): no open() or external command is checked for failure.
sub _get_Genome_coverge_for_long_reads{
	my $reads = $_[0];
	my $folder = $_[1]; # The input_genome_folder
	# Cat all the genes
	# Every gene header is rewritten to ">genome~~geneid" (geneid = header text up to the first whitespace)
	my %Seq = (); my $head = "";
	open __IN, "ls $folder/*.gene | ";
	while (<__IN>){
		chomp;
		my $file = $_;
		# NOTE(review): $folder is interpolated into the pattern unescaped
		my ($seq_name) = $file =~ /$folder\/(.+?)\.gene/; 
		open __INN, $file;
		while (<__INN>){
			chomp;
			if (/>/){
				my $head_old = $_;
				my $head_old_2;
				if ($head_old =~ /\s/){
					($head_old_2) = $head_old =~ />(.+?)\s/;
				}else{
					($head_old_2) = $head_old =~ />(.+?)$/;
				}
				$head = ">".$seq_name."~~".$head_old_2;
				$Seq{$head} = "";
			}else{
				$Seq{$head} .= $_;
			}
		}
		close __INN;
	}
	close __IN;
	
	# Write the combined gene collection, one sequence per line
	open __OUT, ">$output/All_gene_collections.gene";
	foreach my $key (sort keys %Seq){
		print __OUT "$key\n$Seq{$key}\n";
	}
	close __OUT;
	
	# Read the list of read files (read file => running number)
	my %Reads = (); my $i = 1;
	open __IN, "$reads";
	while (<__IN>){
		chomp;
		next if /^#/ or /^\s*$/; # Skip comments and blank lines
		
		my $tmp_link = "";
		if ($test ne "true"){
			$tmp_link = $_;
		}else{
			# Test mode: the read files live in the bundled test folder
			$tmp_link = "$METABOLIC_dir/METABOLIC_test_files/METABOLIC_test_reads/".$_;
		}
		$Reads{$tmp_link} = $i;
		$i++;
	}
	close __IN;
	
	# Write one shell line per read file: map, convert to BAM, sort, index, flagstat, clean up
	my %Read_seq_numbers = (); # read pair => read seq number
	my $average_read_seq_number = 0; # The average read
	open OUT__,">$output/tmp_calculate_depth.sh";
	foreach my $key (sort keys %Reads){
		my $j = $Reads{$key};
		if ($omic_reads_type eq "metaT"){
			# Read count = lines of the file / 4 (assumes uncompressed 4-line FASTQ records - TODO confirm)
			my $seq_number = `cat $key | wc -l`; chomp $seq_number; $seq_number = $seq_number / 4; $Read_seq_numbers{$key} = $seq_number;
		}
		# minimap2 preset chosen from the sequencing type
		# NOTE(review): stays empty for any other $sequencing_type value
		my $ax_input = "";
		if ($sequencing_type eq 'pacbio'){
			$ax_input = 'map-pb';
		}elsif($sequencing_type eq 'nanopore'){
			$ax_input = 'map-ont';
		}elsif($sequencing_type eq 'pacbio_hifi'){
			$ax_input = 'map-hifi';
		}elsif($sequencing_type eq 'pacbio_asm20'){
			$ax_input = 'asm20';
		}
		print OUT__ "minimap2 -ax $ax_input $output/All_gene_collections.gene $key > $output/All_gene_collections_mapped.$j.sam;";
		print OUT__ "samtools view -bS $output/All_gene_collections_mapped.$j.sam > $output/All_gene_collections_mapped.$j.bam -@ $cpu_numbers 2> /dev/null;";
		print OUT__ "mkdir $output/sambamba_tmpfiles.$j; sambamba sort  $output/All_gene_collections_mapped.$j.bam --tmpdir $output/sambamba_tmpfiles.$j -o $output/All_gene_collections_mapped.$j.sorted.bam -q;";
		print OUT__ "samtools index $output/All_gene_collections_mapped.$j.sorted.bam 2> /dev/null;";
		print OUT__ "samtools flagstat $output/All_gene_collections_mapped.$j.sorted.bam > $output/All_gene_collections_mapped.$j.sorted.stat 2> /dev/null;";
		print OUT__ "rm $output/All_gene_collections_mapped.$j.sam $output/All_gene_collections_mapped.$j.bam;rm -r $output/sambamba_tmpfiles.$j\n";
	}
	close OUT__;
	
	# Sum, then (metaT only) average the read counts over all read files
	foreach my $key (sort keys %Read_seq_numbers){
		$average_read_seq_number += $Read_seq_numbers{$key};
	}	
	
	if ($omic_reads_type eq "metaT"){
		# NOTE(review): divides by zero when the reads file lists no read files
		my @Read_seq_numbers = keys %Read_seq_numbers;
		$average_read_seq_number = $average_read_seq_number / (scalar @Read_seq_numbers) ;
	}
	
	# Parallel run calculate coverage
	# (_run_parallel is defined elsewhere in this file; $i is the number of read files + 1)
	_run_parallel("$output/tmp_calculate_depth.sh", $i); `rm $output/tmp_calculate_depth.sh`;
	
	# Depth table over all sorted BAM files, then remove the gene collection and the mapping files
	system ("coverm contig --methods metabat --bam-files  $output/All_gene_collections_mapped.*.sorted.bam > $output/All_gene_collections_mapped.depth.txt 2> /dev/null");
	`rm $output/All_gene_collections.gene; rm $output/*.bam; rm $output/*.sorted.stat;rm $output/*.bai`;
	
	my %h = (); # average => bin => all gene coverage values
	my @h_head = ();  
	my @h_head_num = (); 
	my %Bin = ();
	
	if ($omic_reads_type eq "metaG"){
		# metaG: collect the "totalAvgDepth" value of every gene, tab-joined per bin
		open __IN, "$output/All_gene_collections_mapped.depth.txt";
		while (<__IN>){
			chomp;
			if (/^contigName/){
					# Header line: remember the column index/indices named exactly "totalAvgDepth"
					my @tmp = split (/\t/);@h_head = @tmp;
					for(my $i=0; $i<=$#h_head; $i++){
							if ($h_head[$i] =~ /^totalAvgDepth$/){
								push @h_head_num, $i;
							}
					}
			}else{
					my @tmp = split (/\t/);
					# The bin name is the part of the gene id in front of "~~"
					my ($bin) = $tmp[0] =~ /^(.+?)\~\~/;
					$Bin{$bin} = 1;
					foreach my $i (@h_head_num){
                        if (!exists $h{$h_head[$i]}{$bin}){
                                $h{$h_head[$i]}{$bin} = $tmp[$i];
                        }else{
                                $h{$h_head[$i]}{$bin} .= "\t".$tmp[$i];
                        }
					}
			}
		}
		close __IN;
	}elsif ($omic_reads_type eq "metaT"){
		# metaT: same as above, but each depth value is first normalised by the average read
		# number (per million) and the gene length in kb, and also written to the transcript coverage file
		open __IN, "$output/All_gene_collections_mapped.depth.txt";
		open __OUT, ">$output/All_gene_collections_transcript_coverage.txt"; # contigName => transcript coverage in RPKM
		while (<__IN>){
			chomp;
			if (/^contigName/){
				my @tmp = split (/\t/);@h_head = @tmp;
				for(my $i=0; $i<=$#h_head; $i++){
					if ($h_head[$i] =~ /^totalAvgDepth$/){
						push @h_head_num, $i;
					}	
				}
				print __OUT "contigName\tTranscript coverage in RPKM\n";
			}else{
					my @tmp = split (/\t/);
					my ($bin) = $tmp[0] =~ /^(.+?)\~\~/;
					# Second column of the depth table is taken as the gene length in bp
					my $geneLength = $tmp[1] / 1000; # gene length in kb
					$Bin{$bin} = 1;
					foreach my $i (@h_head_num){
                        my $transcript_coverage = $tmp[$i] * (1000000  / $average_read_seq_number) / $geneLength;
						print __OUT "$tmp[0]\t$transcript_coverage\n";
						if (!exists $h{$h_head[$i]}{$bin}){
                                $h{$h_head[$i]}{$bin} = $transcript_coverage;
                        }else{
                                $h{$h_head[$i]}{$bin} .= "\t".$transcript_coverage;
                        }
					}
			}
		}
		close __IN;
		close __OUT;
	}
	
	# Mean gene coverage per bin, and the total over all bins
	my %Bin2Cov = (); # bin => cov value
	my $total_cov = 0;
	foreach my $i (@h_head_num){
        foreach my $bin (sort keys %Bin){
                my @tmp = split (/\t/, $h{$h_head[$i]}{$bin});
                my $stat = Statistics::Descriptive::Full->new();
                $stat->add_data(\@tmp);
                my $mean = $stat->mean();
                $h{$h_head[$i]}{$bin} = $mean;
				$Bin2Cov{$bin} = $mean;
				$total_cov += $mean;
        }
	}
	
	# Relative coverage: each bin's mean as a fraction of the total
	# NOTE(review): divides by zero when every bin has a mean coverage of 0
	my %Bin2cov_percentage = ();
	foreach my $bin (sort keys %Bin2Cov){
		my $percentage = $Bin2Cov{$bin} / $total_cov;
		$Bin2cov_percentage{$bin} = $percentage;
	}
	return %Bin2cov_percentage;
}

# Compute the relative genome coverage from an already existing depth table (no read mapping).
#   $_[0] - path of the reads parameter file; every non-comment, non-blank line is split on ","
#           and its first two fields are taken as one read pair (only used to count reads for metaT)
#   $_[1] - name of the depth file inside $output
# Returns a hash: bin (genome) name => mean gene coverage of the bin divided by the sum of the
# means of all bins.
# Relies on file-level variables ($output, $test, $METABOLIC_dir, $omic_reads_type).
# NOTE(review): no open() is checked for failure.
sub _get_Genome_coverge_by_depth_file{
	my $reads = $_[0];
	my $depth_file_name = $_[1];
	my $transcript_coverge_file_name = "All_gene_collections_transcript_coverage.txt";
	
	# Read the list of read pairs (pair => running number)
	my %Reads = (); my $i = 1;
	open __IN, "$reads";
	while (<__IN>){
		chomp;
		next if /^#/ or /^\s*$/; # Skip comments and blank lines
		
		my @tmp = split (/\,/,$_);
		my $tmp_link = "";
		if ($test ne "true"){
			$tmp_link = $tmp[0]."\t".$tmp[1];
		}else{
			# Test mode: the read files live in the bundled test folder
			$tmp_link = "$METABOLIC_dir/METABOLIC_test_files/METABOLIC_test_reads/".$tmp[0]."\t"."$METABOLIC_dir/METABOLIC_test_files/METABOLIC_test_reads/".$tmp[1];
		}
		$Reads{$tmp_link} = $i;
		$i++;
	}
	close __IN;
	
	my %Read_seq_numbers = (); # read pair => read seq number
	my $average_read_seq_number = 0; # The average read
	foreach my $key (sort keys %Reads){
		my @tmp = split (/\t/,$key); 
		if ($omic_reads_type eq "metaT"){
			# Read count of the pair = lines of the first file / 4 * 2
			# (assumes uncompressed 4-line FASTQ records and equally sized mates - TODO confirm)
			my $seq_number = `cat $tmp[0] | wc -l`; chomp $seq_number; $seq_number = $seq_number / 4 * 2; $Read_seq_numbers{$key} = $seq_number;
		}
	}
	
	# Sum, then (metaT only) average the read counts over all pairs
	foreach my $key (sort keys %Read_seq_numbers){
		$average_read_seq_number += $Read_seq_numbers{$key};
	}	
	
	if ($omic_reads_type eq "metaT"){
		# NOTE(review): divides by zero when the reads file lists no pairs
		my @Read_seq_numbers = keys %Read_seq_numbers;
		$average_read_seq_number = $average_read_seq_number / (scalar @Read_seq_numbers) ;
	}	
	
	my %h = (); # average => bin => all gene coverage values
	my @h_head = ();  
	my @h_head_num = (); 
	my %Bin = ();
	
	if ($omic_reads_type eq "metaG"){
		# metaG: collect the "totalAvgDepth" value of every gene, tab-joined per bin
		open __IN, "$output/$depth_file_name";
		while (<__IN>){
			chomp;
			if (/^contigName/){
					# Header line: remember the column index/indices named exactly "totalAvgDepth"
					my @tmp = split (/\t/);@h_head = @tmp;
					for(my $i=0; $i<=$#h_head; $i++){
							if ($h_head[$i] =~ /^totalAvgDepth$/){
								push @h_head_num, $i;
							}
					}
			}else{
					my @tmp = split (/\t/);
					# The bin name is the part of the gene id in front of "~~"
					my ($bin) = $tmp[0] =~ /^(.+?)\~\~/;
					$Bin{$bin} = 1;
					foreach my $i (@h_head_num){
                        if (!exists $h{$h_head[$i]}{$bin}){
                                $h{$h_head[$i]}{$bin} = $tmp[$i];
                        }else{
                                $h{$h_head[$i]}{$bin} .= "\t".$tmp[$i];
                        }
					}
			}
		}
		close __IN;
	}elsif ($omic_reads_type eq "metaT"){
		# metaT: same as above, but each depth value is first normalised by the average read
		# number (per million) and the gene length in kb, and also written to the transcript coverage file
		open __IN, "$output/$depth_file_name";
		open __OUT, ">$output/$transcript_coverge_file_name"; # contigName => transcript coverage in RPKM
		while (<__IN>){
			chomp;
			if (/^contigName/){
				my @tmp = split (/\t/);@h_head = @tmp;
				for(my $i=0; $i<=$#h_head; $i++){
					if ($h_head[$i] =~ /^totalAvgDepth$/){
						push @h_head_num, $i;
					}	
				}
				print __OUT "contigName\tTranscript coverage in RPKM\n";
			}else{
					my @tmp = split (/\t/);
					my ($bin) = $tmp[0] =~ /^(.+?)\~\~/;
					# Second column of the depth table is taken as the gene length in bp
					my $geneLength = $tmp[1] / 1000; # Gene length in kb
					$Bin{$bin} = 1;
					foreach my $i (@h_head_num){
                        my $transcript_coverage = $tmp[$i] * (1000000  / $average_read_seq_number) / $geneLength;
						print __OUT "$tmp[0]\t$transcript_coverage\n";
						if (!exists $h{$h_head[$i]}{$bin}){
                                $h{$h_head[$i]}{$bin} = $transcript_coverage;
                        }else{
                                $h{$h_head[$i]}{$bin} .= "\t".$transcript_coverage;
                        }
					}
			}
		}
		close __IN;
		close __OUT;
	}
	
	# Mean gene coverage per bin, and the total over all bins
	my %Bin2Cov = (); # bin => cov value
	my $total_cov = 0;
	foreach my $i (@h_head_num){
        foreach my $bin (sort keys %Bin){
                my @tmp = split (/\t/, $h{$h_head[$i]}{$bin});
                my $stat = Statistics::Descriptive::Full->new();
                $stat->add_data(\@tmp);
                my $mean = $stat->mean();
                $h{$h_head[$i]}{$bin} = $mean;
				$Bin2Cov{$bin} = $mean;
				$total_cov += $mean;
        }
	}
	
	# Relative coverage: each bin's mean as a fraction of the total
	# NOTE(review): divides by zero when every bin has a mean coverage of 0
	my %Bin2cov_percentage = ();
	foreach my $bin (sort keys %Bin2Cov){
		my $percentage = $Bin2Cov{$bin} / $total_cov;
		$Bin2cov_percentage{$bin} = $percentage;
	}
	return %Bin2cov_percentage;		
}

# _store_seq($file)
#
# Read a FASTA file into a hash that maps each header to its sequence.
#
# Parameters:
#   $file - path of the FASTA file to read.
#
# Returns:
#   A hash (as a flat list) of header => sequence. Keys keep the leading ">"
#   and are cut at the first whitespace character; the sequence lines of a
#   record are concatenated without separators. If a header occurs more than
#   once, only the sequence of its last occurrence is kept. When the file
#   cannot be opened, a warning is issued and an empty list is returned.
sub _store_seq{
	my $file = $_[0];
	my %Seq = ();
	my $head;

	# Three-argument open with a lexical handle: the path is never parsed for
	# a mode character, and a nested caller cannot clobber a shared bareword handle.
	open(my $in, '<', $file) or do {
		warn "_store_seq: cannot open '$file': $!\n";
		return;
	};

	while (my $line = <$in>){
		chomp $line;
		if ($line =~ /^>/){
			# Header line: keep the text up to the first whitespace, or the
			# whole line when it contains none.
			($head) = $line =~ /^(>.+?)(?:\s|\z)/;
			$Seq{$head} = "" if defined $head;
		}elsif (defined $head){
			$Seq{$head} .= $line;
		}
		# Lines seen before a usable header belong to no record and are skipped.
	}
	close $in;
	return %Seq;
}

# _get_1_from_input_faa($input_file, $seq, $output_file)
#
# Extract a single record from a FASTA file and write it to a new file.
#
# Parameters:
#   $input_file  - path of the FASTA file to search.
#   $seq         - header of the wanted record, including the leading ">" and
#                  without anything after the first whitespace.
#   $output_file - path of the file to (over)write with the record.
#
# The output file always contains two lines: the header as given in $seq and
# the concatenated sequence. If the record is not found (or the input cannot
# be read) the sequence line is empty and a warning is issued.
#
# Returns:
#   1 when the output file was written, 0 when it could not be written.
sub _get_1_from_input_faa{
	my $input_file = $_[0];
	my $seq = $_[1];
	my $output_file = $_[2];

	my $sequence;        # stays undef until the requested header is seen
	my $collecting = 0;  # true while reading the lines of the requested record

	# Stream the input and keep only the requested record instead of loading
	# every sequence of the file into memory.
	if (open(my $in, '<', $input_file)){
		while (my $line = <$in>){
			chomp $line;
			if ($line =~ /^>/){
				my ($head) = $line =~ /^(>.+?)(?:\s|\z)/;
				$collecting = (defined $head && $head eq $seq) ? 1 : 0;
				# A repeated header restarts the record, so the last occurrence wins.
				$sequence = "" if $collecting;
			}elsif ($collecting){
				$sequence .= $line;
			}
		}
		close $in;
	}else{
		warn "_get_1_from_input_faa: cannot open '$input_file': $!\n";
	}

	if (!defined $sequence){
		warn "_get_1_from_input_faa: '$seq' not found in '$input_file'\n";
		$sequence = "";
	}

	open(my $out, '>', $output_file) or do {
		warn "_get_1_from_input_faa: cannot write '$output_file': $!\n";
		return 0;
	};
	print {$out} "$seq\n$sequence\n";
	# Buffered write errors only surface when the handle is closed.
	close($out) or do {
		warn "_get_1_from_input_faa: error closing '$output_file': $!\n";
		return 0;
	};
	return 1;
}

# _get_check_score($input_file)
#
# Return the highest value found in the sixth space-separated column of a
# result table.
#
# Parameters:
#   $input_file - path of the table; lines starting with "#" are comments.
#                 NOTE(review): presumably an HMMER tabular output whose
#                 sixth column is the bit score - confirm against the caller.
#
# Returns:
#   The largest sixth-column value that is >= 0, or 0 when the file has no
#   such value, is empty, or cannot be opened (a warning is issued then).
sub _get_check_score{
	my $input_file = $_[0];
	my $score = 0;

	open(my $in, '<', $input_file) or do {
		warn "_get_check_score: cannot open '$input_file': $!\n";
		return $score;
	};

	while (my $line = <$in>){
		chomp $line;
		next if $line =~ /^#/;
		# Columns are separated by runs of spaces.
		my @tmp = split(/ +/, $line);
		# Skip blank/short lines and non-numeric fields instead of comparing
		# undef or text as a number.
		next unless defined $tmp[5]
			&& $tmp[5] =~ /\A[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\z/;
		$score = $tmp[5] if $tmp[5] >= $score;
	}
	close $in;
	return $score;
}

# _get_motif($file)
#
# Read a colon-separated "protein id:motif" file into a hash.
#
# Parameters:
#   $file - path of the file; each line holds "<id>:<motif>"
#           (e.g. dsrC:GPXKXXCXXXGXPXPXXCX).
#
# Returns:
#   A hash (as a flat list) of protein id => motif sequence. Only the text
#   between the first and second colon is used as the value. Empty lines are
#   ignored. When the file cannot be opened, a warning is issued and an empty
#   list is returned.
sub _get_motif{
	my $file = $_[0];
	my %Hash; # protein id => motif sequences (dsrC => GPXKXXCXXXGXPXPXXCX)

	open(my $in, '<', $file) or do {
		warn "_get_motif: cannot open '$file': $!\n";
		return;
	};

	while (my $line = <$in>){
		chomp $line;
		# Drop the carriage return of CRLF files so it does not end up in the motif.
		$line =~ s/\r\z//;
		next if $line eq '';
		my @tmp = split(/:/, $line);
		# A line made only of colons yields no fields at all.
		next unless defined $tmp[0];
		$Hash{$tmp[0]} = $tmp[1];
	}
	close $in;
	return %Hash;
}

# _get_motif_pair($file)
#
# Read a colon-separated "name:partner" file into a hash.
#
# Parameters:
#   $file - path of the file; each line holds "<name>:<partner>"
#           (e.g. dsrC:tusE).
#
# Returns:
#   A hash (as a flat list) of name => partner. Only the text between the
#   first and second colon is used as the value. Empty lines are ignored.
#   When the file cannot be opened, a warning is issued and an empty list is
#   returned.
sub _get_motif_pair{
	my $file = $_[0];
	my %Hash; # dsrC => tusE

	open(my $in, '<', $file) or do {
		warn "_get_motif_pair: cannot open '$file': $!\n";
		return;
	};

	while (my $line = <$in>){
		chomp $line;
		# Drop the carriage return of CRLF files so it does not end up in the value.
		$line =~ s/\r\z//;
		next if $line eq '';
		my @tmp = split(/:/, $line);
		# A line made only of colons yields no fields at all.
		next unless defined $tmp[0];
		$Hash{$tmp[0]} = $tmp[1];
	}
	close $in;
	return %Hash;
}

# _run_parallel($file, $cpu_numbers_)
#
# Run every command listed in a file, several at a time.
#
# Parameters:
#   $file         - path of a text file holding one shell command per line.
#   $cpu_numbers_ - maximum number of child processes passed to
#                   Parallel::ForkManager.
#
# Each non-blank line is handed to the shell verbatim in its own forked
# child, and the sub returns once all children have finished. A command that
# exits with a non-zero status only produces a warning; the remaining
# commands still run. When the file cannot be opened, a warning is issued
# and nothing is run.
sub _run_parallel{
	my $file = $_[0];
	my $cpu_numbers_ = $_[1];
	my @Runs;

	open(my $in, '<', $file) or do {
		warn "_run_parallel: cannot open '$file': $!\n";
		return;
	};
	while (my $line = <$in>){
		chomp $line;
		# A blank line is not a command; do not fork a child for it.
		next unless $line =~ /\S/;
		push @Runs, $line;
	}
	close $in;

	my $pm = Parallel::ForkManager->new($cpu_numbers_);
	foreach my $run (@Runs){
		# start() returns the child pid in the parent, which moves on to the
		# next command, and 0 in the child, which falls through.
		$pm->start and next;
		# Backticks are kept on purpose: the command's stdout is captured and
		# discarded rather than mixed into this script's output.
		`$run`;
		warn "_run_parallel: command failed (status $?): $run\n" if $? != 0;
		$pm->finish;
	}
	$pm->wait_all_children;
}