Also store gene quant
glormph committed Sep 27, 2024
1 parent 33c41fa commit 88b2827
Showing 6 changed files with 117 additions and 23 deletions.
12 changes: 12 additions & 0 deletions src/backend/analysis/models.py
@@ -138,6 +138,7 @@ class WfOutput(models.Model):
psmscorefield = models.ForeignKey(OutputFieldName, related_name='score', on_delete=models.CASCADE)
psmsetname = models.ForeignKey(OutputFieldName, related_name='psmset', on_delete=models.CASCADE)
psmpeptide = models.ForeignKey(OutputFieldName, related_name='psmpep', on_delete=models.CASCADE)
genetablegenefield = models.ForeignKey(OutputFieldName, related_name='genegene', on_delete=models.CASCADE)

def get_fasta_files(self, **jobkw):
'''Fasta files need inspection of job parameters as there is no "proper" DB
@@ -190,6 +191,17 @@ def get_peptide_outfile(self, analysis):
else:
return (1, False, f'Cannot find output peptide file ({self.pepfile}) for this analysis.')

def get_gene_outfile(self, analysis):
'''Gene file is not always output'''
genefile = analysis.analysisresultfile_set.filter(sfile__filename=self.genefile)
if genefile.count() > 1:
return (1, False, f'Multiple gene files ({self.genefile}) found for this analysis? Contact admin.')
elif genefile.count():
return (0, genefile.values('sfile__servershare__name', 'sfile__path', 'sfile__filename'), '')
else:
return (0, False, 'No gene file available in this analysis')



class PipelineVersionOutput(models.Model):
'''Mapping a pipeline (version) to an output field definition. Multiple
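The new get_gene_outfile follows the same (returncode, result, error) convention as get_psm_outfile and get_peptide_outfile, except that a missing gene file is not treated as an error: it returns code 0 with False. A minimal caller sketch of that convention (illustrative only; output and analysis are assumed to be a WfOutput instance and its Analysis):

rc, genefile_qs, err = output.get_gene_outfile(analysis)
if rc:
    # only duplicate file matches count as an error
    raise RuntimeError(err)
genefile = genefile_qs.get() if genefile_qs else False  # values() dict, or False when not produced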
33 changes: 22 additions & 11 deletions src/backend/mstulos/jobs.py
@@ -34,25 +34,26 @@ def process(self, **kwargs):
plexq = Q(dataset__quantdataset__quanttype__shortname__contains='plex')
plexq |= Q(dataset__quantdataset__quanttype__shortname='tmtpro')
isob_types = []
all_pepfile_arg, all_psmfile_arg, all_fa_files = {}, {}, {}
all_pepfile_arg, all_psmfile_arg, all_genefile_arg, all_fa_files = {}, {}, {}, {}
for plextype in analysis.datasetanalysis_set.filter(plexq).distinct(
'dataset__quantdataset__quanttype__shortname'):
ptname = plextype.dataset.quantdataset.quanttype.shortname
plextype_trf = {'tmtpro': 'tmt16plex'}.get(ptname, ptname)
isob_types.append(plextype_trf)

# Pass all WfOuput objects for the pipeline
# Pass all WfOutput objects mapped for the used pipeline
headers, fa_files, pepfile_arg, psmfile_arg = {}, {}, {}, {}
for pipe_out in analysis.nextflowsearch.nfwfversionparamset.pipelineversionoutput_set.all():
for pipe_out in analysis.nextflowsearch.nfwfversionparamset.pipelineversionoutput_set.select_related('output').all():
output = pipe_out.output
# Output files headers according to their DB entries
headers[output.pk] = {
'isobaric': isob_types,
'pep': {
'fdr': output.pepfdrfield.fieldname,
'posterior': output.peppepfield.fieldname,
'peptide': output.peppeptidefield.fieldname,
'ms1': output.pepms1field.fieldname,
'isobaric': isob_types},
},
'psm': {
'fdr': output.psmfdrfield.fieldname,
'posterior': output.psmpepfield.fieldname,
@@ -65,23 +66,33 @@ def process(self, **kwargs):
'ms1': output.psmms1field.fieldname,
'rt': output.rtfield.fieldname,
'protein': output.psmprotfield.fieldname,
}}
},
'gene': {
'genename': output.genetablegenefield.fieldname,
},
}

# Get fasta files
fa_rc, fa_files, faerr = output.get_fasta_files(**analysis.nextflowsearch.job.kwargs['inputs'])
psm_rc, psmfile, psmerr = output.get_psm_outfile(analysis)
pep_rc, pepfile, peperr = output.get_peptide_outfile(analysis)
gene_rc, genefile, geneerr = output.get_gene_outfile(analysis)

if fa_rc or psm_rc or pep_rc:
raise RuntimeError('\n'.join([faerr, psmerr, peperr]).strip())
else:
psmfile = psmfile.get()
pepfile = pepfile.get()
if fa_rc or psm_rc or pep_rc or gene_rc:
raise RuntimeError('\n'.join([faerr, psmerr, peperr, geneerr]).strip())

psmfile = psmfile.get()
pepfile = pepfile.get()
genefile = genefile.get() if genefile else False

all_fa_files[output.pk] = [(x['pk'], x['servershare__name'], os.path.join(x['path'], x['filename']))
for x in fa_files]
all_pepfile_arg[output.pk] = (pepfile['sfile__servershare__name'],
os.path.join(pepfile['sfile__path'], pepfile['sfile__filename']))
all_psmfile_arg[output.pk] = (psmfile['sfile__servershare__name'],
os.path.join(psmfile['sfile__path'], psmfile['sfile__filename']))
self.run_tasks.append(((kwargs['token'], kwargs['organism_id'], all_pepfile_arg, all_psmfile_arg, headers, all_fa_files), {}))
if genefile:
all_genefile_arg[output.pk] = (genefile['sfile__servershare__name'],
os.path.join(genefile['sfile__path'], genefile['sfile__filename']))

self.run_tasks.append(((kwargs['token'], kwargs['organism_id'], all_pepfile_arg, all_psmfile_arg, all_genefile_arg, headers, all_fa_files), {}))
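The per-output arguments passed to the task are keyed by WfOutput primary key, and all_genefile_arg is only populated when a gene table was actually produced, so the payload stays compatible with runs that have no gene output. A sketch of the resulting shapes (share name, path and column name are placeholders, not real values):

# WfOutput pk -> (servershare name, path relative to that share)
all_genefile_arg = {3: ('analysisshare', 'results/genes_table.txt')}
headers = {3: {
    'isobaric': ['tmt16plex'],
    'gene': {'genename': 'Gene'},  # placeholder column name looked up in the gene table
    # 'pep' and 'psm' keys as built above, omitted here
}}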
9 changes: 9 additions & 0 deletions src/backend/mstulos/models.py
@@ -148,6 +148,15 @@ class Meta:
constraints = [models.UniqueConstraint(fields=['channel', 'peptide'], name='uni_isochpep')]


class GeneIsoQuant(models.Model):
value = models.FloatField()
channel = models.ForeignKey(Condition, on_delete=models.CASCADE)
gene = models.ForeignKey(Gene, on_delete=models.CASCADE)

class Meta:
constraints = [models.UniqueConstraint(fields=['channel', 'gene'], name='uni_isochgene')]


class PeptideMS1(models.Model):
ms1 = models.FloatField()
idpep = models.OneToOneField(IdentifiedPeptide, on_delete=models.CASCADE)
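GeneIsoQuant mirrors the peptide-level isobaric quant table: one float per (channel/condition, gene) pair, with the unique constraint preventing duplicate rows. A minimal usage sketch (import path and primary keys are placeholders):

from mstulos import models as m

# one value per channel/gene pair; a second create() for the same pair violates uni_isochgene
m.GeneIsoQuant.objects.create(gene_id=42, channel_id=271, value=1.23)
per_channel = m.GeneIsoQuant.objects.filter(gene__name='TP53').values_list('channel_id', 'value')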
62 changes: 51 additions & 11 deletions src/backend/mstulos/tasks.py
@@ -24,7 +24,8 @@


@shared_task(bind=True, queue=settings.QUEUE_SEARCH_INBOX)
def summarize_result_peptable(self, token, organism_id, peptide_file, psm_file, outheaders, fafns):
def summarize_result_peptable(self, token, organism_id, peptide_file, psm_file, gene_file,
outheaders, fafns):
# FIXME maybe not do proteins when running 6FT? Need to make it selectable!
# FIXME exempt proteogenomics completely for now or make button (will be misused!)
# FIXME not all runs have genes
@@ -52,12 +53,16 @@ def summarize_result_peptable(self, token, organism_id, peptide_file, psm_file,
best_match_wf = max(wf_out_count, key=wf_out_count.get)
peptide_file = peptide_file[best_match_wf]
psm_file = psm_file[best_match_wf]
if best_match_wf in gene_file:
gene_file = gene_file[best_match_wf]
else:
gene_file = False
outheaders = outheaders[best_match_wf]
fafns = fafns[best_match_wf]

# Store proteins, genes found, first fasta
protgenes = []
storedproteins = {}
storedproteins, storedgenes = {}, {}
protein_url = urljoin(settings.KANTELEHOST, reverse('mstulos:upload_proteins'))
all_seq = {}
for fa_id, fa_server, fafn in fafns:
@@ -112,7 +117,9 @@ def summarize_result_peptable(self, token, organism_id, peptide_file, psm_file,
resp = update_db(protein_url, json={'protgenes': protgenes, 'token': token,
'organism_id': organism_id, 'fa_ids': [x[0] for x in fafns]})
resp.raise_for_status()
storedproteins.update(resp.json()['protein_ids'])
rj = resp.json()
storedproteins.update(rj['protein_ids'])
storedgenes.update(rj['gene_ids'])
protgenes = []
for newprot, bareseqs_tosave in pepprots_nopk.items():
storeprot = storedproteins[newprot]
@@ -127,7 +134,9 @@ def summarize_result_peptable(self, token, organism_id, peptide_file, psm_file,
resp = update_db(protein_url, json={'protgenes': protgenes, 'token': token,
'organism_id': organism_id, 'fa_ids': [x[0] for x in fafns]})
resp.raise_for_status()
storedproteins.update(resp.json()['protein_ids'])
rj = resp.json()
storedproteins.update(rj['protein_ids'])
storedgenes.update(rj['gene_ids'])
for newprot, bareseqs_tosave in pepprots_nopk.items():
storeprot = storedproteins[newprot]
for fa_id, acc_seqs in all_seq.items():
@@ -161,7 +170,7 @@ def summarize_result_peptable(self, token, organism_id, peptide_file, psm_file,
elif pepheader['ms1'] and pepheader['ms1'] in field:
setname = field.replace(f'_{pepheader["ms1"]}', '')
conditions['ms1'].append((ix, samplesets[setname]['set_id']))
elif pepheader['isobaric'] and any(plex in field for plex in pepheader['isobaric']):
elif outheaders['isobaric'] and any(plex in field for plex in outheaders['isobaric']):
plex_re = '(.*)_[a-z0-9]+plex_([0-9NC]+)'
# Need to remove e.g. plex_126 - Quanted PSMs, with $
if re.match(f'{plex_re}$', field):
@@ -233,11 +242,42 @@ def summarize_result_peptable(self, token, organism_id, peptide_file, psm_file,
if len(psms):
resp = update_db(psmurl, json={'psms': psms, 'token': token})
resp.raise_for_status()

# Finished, report done
update_db(urljoin(settings.KANTELEHOST, reverse('mstulos:upload_done')), json={'token': token, 'task_id': self.request.id})


# Gene table parse if any
if gene_file:
geneurl = urljoin(settings.KANTELEHOST, reverse('mstulos:upload_geneq'))
genefile_fpath = os.path.join(settings.SHAREMAP[gene_file[0]], gene_file[1])
with open(genefile_fpath) as fp:
header = next(fp).strip('\n').split('\t')
conditions = {'isobaric': []}
# find header fields and match with conditions by setname/sample/channel
genefield = header.index(outheaders['gene']['genename'])
for ix, field in enumerate(header):
# bit strict handling with f'_PSM count' to avoid Quanted PSM count fields..
# Text parsing is not always super clean. We may need to think about that in
# the pipeline itself, to make sure it is parseable, or encode somehow
if outheaders['isobaric'] and any(plex in field for plex in outheaders['isobaric']):
plex_re = '(.*)_[a-z0-9]+plex_([0-9NC]+)'
# Need to remove e.g. plex_126 - Quanted PSMs, with $
if re.match(f'{plex_re}$', field):
sample_set_ch = re.sub(plex_re, '\\1___\\2', field)
conditions['isobaric'].append((ix, samples[sample_set_ch]))
gene_values = []
for line in fp:
line = line.strip('\n').split('\t')
storegene = {'gene': storedgenes[line[genefield]]}
for datatype, col_conds in conditions.items():
storegene[datatype] = []
for col, cond_id in col_conds:
storegene[datatype].append((cond_id, line[col]))
gene_values.append(storegene)
if len(gene_values) == 1000:
resp = update_db(geneurl, json={'genes': gene_values, 'token': token})
resp.raise_for_status()
gene_values = []
if len(gene_values):
resp = update_db(geneurl, json={'genes': gene_values, 'token': token})
resp.raise_for_status()

x = ('jorrit/14394_STD_GMPSDL1-allsample_pool_denominator_20221102_14.52/peptides_table.txt', 'georgios/14392_STD_GMPSDL1-13_20221101_15.20/target_psmtable.txt', 'georgios/14392_STD_GMPSDL1-13_20221101_15.20/target_psmlookup.sql', {'pep': {'psmcount': 'PSM count', 'fdr': 'q-value', 'peptide': 'Peptide sequence', 'isobaric': []}, 'psm': {'fdr': 'PSM q-value', 'fn': 'SpectraFile', 'scan': 'ScanNum', 'setname': 'Biological set', 'peptide': 'Peptide'}}, {'set1': {'set_id': 269, 'fractions': {}, 'files': {'GMPSDL1-13_Labelcheck_4hrs_3of10_set01.mzML': 270}}}, {'groups': {}, 'samples': {'GMPSDL_1_set1___126': 271, 'GMPSDL_2_set1___127N': 273, 'GMPSDL_3_set1___127C': 275, 'GMPSDL_4_set1___128N': 277, 'GMPSDL_5_set1___128C': 279, 'GMPSDL_6_set1___129N': 281, 'GMPSDL_7_set1___129C': 283, 'GMPSDL_8_set1___130N': 285, 'GMPSDL_9_set1___130C': 287, 'GMPSDL_10_set1___131N': 289, 'GMPSDL_11_set1___131C': 291, 'GMPSDL_12_set1___132N': 293, 'GMPSDL_13_set1___132C': 295, 'POOL(EAPSDL1-30)_set1___133N': 297, 'empty_set1___133C': 299, 'empty_set1___134N': 299}, 'GMPSDL_1_set1___126': 272, 'GMPSDL_2_set1___127N': 274, 'GMPSDL_3_set1___127C': 276, 'GMPSDL_4_set1___128N': 278, 'GMPSDL_5_set1___128C': 280, 'GMPSDL_6_set1___129N': 282, 'GMPSDL_7_set1___129C': 284, 'GMPSDL_8_set1___130N': 286, 'GMPSDL_9_set1___130C': 288, 'GMPSDL_10_set1___131N': 290, 'GMPSDL_11_set1___131C': 292, 'GMPSDL_12_set1___132N': 294, 'GMPSDL_13_set1___132C': 296, 'POOL(EAPSDL1-30)_set1___133N': 298, 'empty_set1___133C': 300, 'empty_set1___134N': 301})

# Finished, report done
update_db(urljoin(settings.KANTELEHOST, reverse('mstulos:upload_done')), json={'token': token, 'task_id': self.request.id})
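The gene table parser reuses the anchored plex_re pattern from the peptide branch, so an isobaric column such as set1_tmt16plex_126 maps to the set1___126 key used in the samples dict, while derived columns like 'set1_tmt16plex_126 - Quanted PSMs' fail the anchored match and are ignored. A standalone sketch of that mapping (column names are made-up examples):

import re

plex_re = '(.*)_[a-z0-9]+plex_([0-9NC]+)'
for field in ['set1_tmt16plex_126', 'set1_tmt16plex_126 - Quanted PSMs']:
    if re.match(f'{plex_re}$', field):
        # only the plain quant column matches; prints 'set1___126'
        print(re.sub(plex_re, '\\1___\\2', field))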
1 change: 1 addition & 0 deletions src/backend/mstulos/urls.py
@@ -11,6 +11,7 @@
path('upload/proteins/', views.upload_proteins, name='upload_proteins'),
path('upload/peptides/', views.upload_peptides, name='upload_peptides'),
path('upload/psms/', views.upload_psms, name='upload_psms'),
path('upload/genes/', views.upload_genes, name='upload_geneq'),
path('upload/done/', views.upload_done, name='upload_done'),
path('plotdata/peptides/', views.fetch_plotdata_peptides),
path('plotdata/psms/', views.fetch_plotdata_psms),
23 changes: 22 additions & 1 deletion src/backend/mstulos/views.py
@@ -530,6 +530,8 @@ def upload_proteins(request):
store_gid = existing_genes[gene]
elif gene:
store_gid = m.Gene.objects.get_or_create(name=gene, organism_id=data['organism_id'])[0].pk
if gene:
stored_genes[gene] = store_gid
fa_prot = f'{fa_id}__{prot}'
if fa_prot not in existing_prots:
dbprot, _ = m.Protein.objects.get_or_create(name=prot)
@@ -538,7 +540,7 @@ def upload_psms(request):
if gene:
m.ProteinGene.objects.get_or_create(proteinfa=protfa, gene_id=store_gid)
stored_prots[prot] = existing_prots[fa_prot]
return JsonResponse({'error': False, 'protein_ids': stored_prots})
return JsonResponse({'error': False, 'protein_ids': stored_prots, 'gene_ids': stored_genes})


def get_mods_from_seq(seq, mods=False, pos=0):
Expand Down Expand Up @@ -636,6 +638,25 @@ def upload_psms(request):
return JsonResponse({'error': False})


@require_POST
def upload_genes(request):
data = json.loads(request.body.decode('utf-8'))
print(data)
try:
exp = m.Experiment.objects.get(token=data['token'], upload_complete=False)
except m.Experiment.DoesNotExist:
return JsonResponse({'error': 'Not allowed to access'}, status=403)
except KeyError:
return JsonResponse({'error': 'Bad request to mstulos uploads'}, status=400)
for gene in data['genes']:
for cond_id, quant in gene['isobaric']:
if quant != 'NA':
m.GeneIsoQuant.objects.create(gene_id=gene['gene'], value=quant, channel_id=cond_id)
return JsonResponse({'error': False})
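upload_genes expects the experiment token plus a list of gene entries, each holding the Gene primary key returned earlier by upload_proteins and (condition_id, value) pairs for the isobaric channels; 'NA' values are skipped rather than stored. A sketch of the JSON body the task sends via update_db (ids are placeholder primary keys):

payload = {
    'token': 'abc123',
    'genes': [
        # 'gene' is a Gene pk from the gene_ids map; 'NA' quant values are not stored
        {'gene': 42, 'isobaric': [[271, '1.23'], [273, 'NA']]},
    ],
}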




@require_POST
def upload_done(request):
data = json.loads(request.body.decode('utf-8'))
