Skip to content

Commit

Permalink
statistic
Browse files Browse the repository at this point in the history
  • Loading branch information
zprobot committed Nov 1, 2023
1 parent 570fdfa commit 66f6450
Showing 1 changed file with 148 additions and 0 deletions.
148 changes: 148 additions & 0 deletions python/quantmsio/quantms_io/core/statistic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import duckdb
import random

def get_skip_rows(path):
skip_rows = 0
with open(path) as f:
line = f.readline()
while line.startswith("#"):
skip_rows += 1
line = f.readline()
return skip_rows

def check_string(re_exp, strings):
res = re.search(re_exp, strings)
if res:
return True
else:
return False

def check_exist(label):
def check_db_exist(func):
def wrapper(self, *args, **kwargs):
if label=='feature_db':
if self.feature_db is None:
return LookupError('The database is not loaded.')
elif label=='ibaq_db':
if self.load_db is None:
return LookupError('The database is not loaded.')

res = func(self, *args, **kwargs)
return res
return wrapper
return check_db_exist

class Statistic():

__slot__ = ['feature_db','ibaq_db']

def __init__(self,feature_path:str=None,ibaq_path:str=None):

self.feature_db = self.load_db(feature_path)
self.ibaq_db = self.load_db(ibaq_path)

def load_db(self,path:str):
if path is not None:
if os.path.exists(path):
if path.endswith(".parquet"):
return duckdb.read_parquet(path)
else:
skip_rows = get_skip_rows(path)
return duckdb.read_csv(path,sep='\t',skiprows=skip_rows)
else:
return FileNotFoundError(f'the file {path} does not exist.')
else:
return None

@check_exist('feature_db')
def get_unique_peptides(self):

feature_db = self.feature_db
unique_peps = duckdb.sql(f"SELECT DISTINCT sequence FROM feature_db").df()

return unique_peps['sequence'].tolist()

@check_exist('ibaq_db')
def get_unique_proteins(self):

ibaq_db = self.ibaq_db
unique_prts = duckdb.sql(f"SELECT DISTINCT protein FROM ibaq_db").df()

return unique_prts['protein'].tolist()

@check_exist('feature_db')
def query_peptide(self,peptide:str):

feature_db = self.feature_db
if check_string('^[A-Z]+$',peptide):
return duckdb.sql(f"SELECT * FROM feature_db WHERE sequence ='{peptide}'").df()
else:
return KeyError('Illegal peptide!')

@check_exist('ibaq_db')
def query_protein(self,protein:str):

ibaq_db = self.ibaq_db
if check_string('^[A-Z]+',protein):
return duckdb.sql(f"SELECT * FROM ibaq_db WHERE protein ='{protein}'").df()
else:
return KeyError('Illegal protein!')

@check_exist('feature_db')
def plot_peptide_distribution_of_protein(self):
feature_db = self.feature_db
df = duckdb.sql(f"SELECT sample_accession, COUNT(sequence) FROM feature_db GROUP BY sample_accession").df()
df.columns = ['sample','peptides']
df = df.sample(frac=1).reset_index(drop=True)
if len(df) > 20:
return df.iloc[:20,:].plot.bar(figsize=(12,8),x='sample',y='peptides',title='number of peptides for different samples',width=0.7,color='#82C3A3',rot=65,ylim=(0,max(df.loc[:20,'peptides'])+1/2*max(df.loc[:20,'peptides'])))
else:
return df.plot.bar(figsize=(12,8),x='sample',y='peptides',title='number of peptides for different samples',color='#82C3A3',rot=65)

@check_exist('feature_db')
def plot_intensty_distribution_of_samples(self):
feature_db = self.feature_db
sample_accessions = duckdb.sql(f"SELECT DISTINCT sample_accession FROM feature_db").df()['sample_accession'].tolist()
random.shuffle(sample_accessions)
if len(sample_accessions) > 8:
sample_accessions = sample_accessions[:8]
df = pd.DataFrame()
for sample in sample_accessions:
df_sample = duckdb.sql(f"SELECT intensity FROM feature_db WHERE sample_accession='{sample}'").df()
df_sample = df_sample[df_sample['intensity']>0]
df_sample['intensity'] = np.log(df_sample['intensity'])
df_sample.columns = [sample]
df = pd.concat([df,df_sample],axis=1)

return df.plot.kde(figsize=(12,8),linewidth=2, legend=False)

@check_exist('feature_db')
def plot_intensty_box_of_samples(self):
feature_db = self.feature_db
sample_accessions = duckdb.sql(f"SELECT DISTINCT sample_accession FROM feature_db").df()['sample_accession'].tolist()
random.shuffle(sample_accessions)
if len(sample_accessions) > 10:
sample_accessions = sample_accessions[:10]
df = pd.DataFrame()
for sample in sample_accessions:
df_sample = duckdb.sql(f"SELECT sample_accession,intensity FROM db WHERE sample_accession='{sample}'").df()
df_sample = df_sample[df_sample['intensity']>0]
df_sample['intensity'] = np.log(df_sample['intensity'])
df = pd.concat([df,df_sample],axis=0)
plt.figure(figsize=(12,8),dpi=500)
chart = sns.boxplot(
x='sample_accession',
y='intensity',
data=df,
boxprops=dict(alpha=0.3),
palette="muted",
)
chart.set_xticklabels(labels =sample_accessions, rotation = 65)

return chart

0 comments on commit 66f6450

Please sign in to comment.