Add dynamic fields to datasets/files in analysis interface WIP #82

Merged · 5 commits · Apr 2, 2024
51 changes: 23 additions & 28 deletions src/backend/analysis/jobs.py
@@ -137,7 +137,7 @@ def recurse_nrdsets_baseanalysis(aba):
         old_mzmls, old_dsets = recurse_nrdsets_baseanalysis(older_aba)
     # First get stripnames of old ds
     strips = {}
-    for oldads in aba.base_analysis.analysisdatasetsetname_set.select_related('dataset__prefractionationdataset__hiriefdataset'):
+    for oldads in aba.base_analysis.analysisdatasetsetvalue_set.select_related('dataset__prefractionationdataset__hiriefdataset'):
         if hasattr(oldads.dataset, 'prefractionationdataset'):
             pfd = oldads.dataset.prefractionationdataset
             if hasattr(pfd, 'hiriefdataset'):
@@ -152,26 +152,28 @@ def recurse_nrdsets_baseanalysis(aba):
     # This would in 3. give us all oldmzmls from 1. and 2., so setB would be double
     single_ana_oldmzml = {}
     single_ana_oldds = {}
+    regexes = {x.dataset_id: x.value for x in models.AnalysisDatasetSetValue.objects.filter(
+        analysis=aba.base_analysis, field='__regex')}
     for asf in models.AnalysisDSInputFile.objects.filter(
-            analysis=aba.base_analysis).select_related(
-            'sfile__rawfile__producer', 'analysisdset__setname'):
-        if asf.analysisdset.regex:
-            frnr = re.match(asf.analysisdset.regex, asf.sfile.filename) or False
+            analysisset__analysis=aba.base_analysis).select_related(
+            'sfile__rawfile__producer', 'analysisset__setname'):
+        if asf.dsanalysis.dataset_id in regexes:
+            frnr = re.match(regexes[asf.dsanalysis.dataset_id], asf.sfile.filename) or False
             frnr = frnr.group(1) if frnr else 'NA'
         else:
             frnr = 'NA'
         oldasf = {'fn': asf.sfile.filename,
                   'instrument': asf.sfile.rawfile.producer.name,
-                  'setname': asf.analysisdset.setname.setname,
-                  'plate': strips[asf.analysisdset.dataset_id],
+                  'setname': asf.analysisset.setname,
+                  'plate': strips[asf.analysisset.dataset_id],
                   'fraction': frnr,
                   }
         try:
-            single_ana_oldmzml[asf.analysisdset.setname.setname].append(oldasf)
-            single_ana_oldds[asf.analysisdset.setname.setname].add(asf.analysisdset.dataset_id)
+            single_ana_oldmzml[asf.analysisset.setname].append(oldasf)
+            single_ana_oldds[asf.analysisset.setname].add(asf.dsanalysis.dataset_id)
         except KeyError:
-            single_ana_oldmzml[asf.analysisdset.setname.setname] = [oldasf]
-            single_ana_oldds[asf.analysisdset.setname.setname] = {asf.analysisdset.dataset_id}
+            single_ana_oldmzml[asf.analysisset.setname] = [oldasf]
+            single_ana_oldds[asf.analysisset.setname] = {asf.dsanalysis.dataset_id}
     old_mzmls.update(single_ana_oldmzml)
     old_dsets.update(single_ana_oldds)
     return old_mzmls, old_dsets
@@ -236,8 +238,8 @@ def process(self, **kwargs):
 
         # Now remove obsolete deleted-from-dataset files from job (e.g. corrupt, empty, etc)
         obsolete = sfiles_passed.exclude(rawfile__datasetrawfile__dataset__datasetanalysis__in=dsa)
-        analysis.analysisdsinputfile_set.filter(sfile__in=obsolete).delete()
-        analysis.analysisfilesample_set.filter(sfile__in=obsolete).delete()
+        models.AnalysisDSInputFile.objects.filter(analysisset__analysis=analysis, sfile__in=obsolete).delete()
+        analysis.analysisfilevalue_set.filter(sfile__in=obsolete).delete()
        rm.FileJob.objects.filter(job_id=job.pk, storedfile__in=obsolete).delete()
         for del_sf in obsolete:
             # FIXME setnames/frac is specific
@@ -271,31 +273,24 @@ def process(self, **kwargs):
         for fn in sfiles_passed:
             infile = {'servershare': fn.servershare.name, 'path': fn.path, 'fn': fn.filename}
             if 'setname' in inputdef_fields:
-                infile['setname'] = kwargs['setnames'].get(str(fn.id), '')
+                infile['setname'] = kwargs['filesamples'].get(str(fn.id), '')
             if 'plate' in inputdef_fields:
                 infile['plate'] = kwargs['platenames'].get(str(fn.rawfile.datasetrawfile.dataset_id), '')
             if 'sampleID' in inputdef_fields:
                 # sampleID is for pgt / dbgenerator
-                infile['sampleID'] = fn.rawfile.datasetrawfile.quantsamplefile.projsample.sample
+                # No fallback, is required if in header
+                infile['sampleID'] = kwargs['filesamples'][str(fn.id)]
             if 'fraction' in inputdef_fields:
                 infile['fraction'] = kwargs['infiles'].get(str(fn.id), {}).get('fr')
             if 'instrument' in inputdef_fields:
+                # No fallback, instrument in header cannot be ''
                 infile['instrument'] = fn.rawfile.producer.msinstrument.instrumenttype.name
             if 'channel' in inputdef_fields:
-                # For non-pooled labelcheck
+                # For non-pooled labelcheck, cannot be ''
                 infile['channel'] = fn.rawfile.datasetrawfile.quantfilechannel.channel.channel.name
-            if 'file_type' in inputdef_fields:
-                infile['file_type'] = fn.filetype.filetype
-            if 'pep_prefix' in inputdef_fields:
-                # FIXME needs to be able to change to none, mutalt (VCF), fusion_squid, etc
-                # We can probably use setname frontend code for that
-                infile['pep_prefix'] = 'none'
-
-
-            # FIXME add the pgt DB/other fields here
-            # expr_str expr_thresh sample_gtf_file pep_prefix
+            # Dynamic fields
+            infile.update(kwargs['filefields'][fn.pk])
             infiles.append(infile)
-        # FIXME this in tasks and need to write header
         # FIXME bigrun not hardcode, probably need to remove when new infra
         shortname = models.UserWorkflow.WFTypeChoices(analysis.nextflowsearch.workflow.wftype).name
         bigrun = shortname == 'PISEP' or len(infiles) > 500
@@ -312,7 +307,7 @@ def process(self, **kwargs):
             run['infiles'] = infiles
         else:
             # SELECT prefrac with fraction regex to get fractionated datasets in old analysis
-            if ana_baserec.base_analysis.exclude(analysisdatasetsetname__regex='').count():
+            if ana_baserec.base_analysis.filter(analysisdatasetsetvalue__field='__regex').count():
                 # rerun/complement runs with fractionated base analysis need --oldmzmldef parameter
                 old_infiles, old_dsets = recurse_nrdsets_baseanalysis(ana_baserec)
                 run['old_infiles'] = ['{}\t{}'.format(x['fn'], '\t'.join([x[key] for key in run['components']['INPUTDEF']]))
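The net effect of the jobs.py changes: per-dataset fraction regexes now live in generic `AnalysisDatasetSetValue` rows keyed `field='__regex'` and are looked up per dataset, instead of being read off the old dedicated `regex` column. A minimal runnable sketch of that lookup pattern (the dictionary contents are made up for illustration):

```python
import re

# Hypothetical stand-in for the regexes dict built from
# AnalysisDatasetSetValue rows with field='__regex'
regexes = {101: r'.*fr(\d+).*\.mzML'}  # dataset_id -> regex value

def fraction_for(dataset_id, filename):
    # Same fallback behavior as the diff: a dataset without a regex row,
    # and a filename that does not match, both yield 'NA'
    if dataset_id in regexes:
        match = re.match(regexes[dataset_id], filename)
        return match.group(1) if match else 'NA'
    return 'NA'

assert fraction_for(101, 'set1_fr07_qc.mzML') == '07'
assert fraction_for(999, 'set1_fr07_qc.mzML') == 'NA'
```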
49 changes: 49 additions & 0 deletions src/backend/analysis/migrations/0048_auto_20240326_1509.py
@@ -0,0 +1,49 @@
# Generated by Django 3.2.13 on 2024-03-26 15:09

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

    dependencies = [
        ('rawstatus', '0028_alter_storedfile_checked'),
        ('datasets', '0019_deletes'),
        ('analysis', '0047_alter_param_ptype'),
    ]

    operations = [
        migrations.RenameModel(
            old_name='AnalysisDatasetSetname',
            new_name='AnalysisDatasetSetValue',
        ),
        migrations.RenameModel(
            old_name='AnalysisFileSample',
            new_name='AnalysisFileValue',
        ),
        migrations.RenameField(
            model_name='AnalysisFileValue',
            old_name='sample',
            new_name='value',
        ),
        migrations.RemoveConstraint(
            model_name='analysisdsinputfile',
            name='uni_anainfile',
        ),
        migrations.RemoveConstraint(
            model_name='analysisfilevalue',
            name='uni_anassamplefile',
        ),
        migrations.AddField(
            model_name='analysisdsinputfile',
            name='analysisset',
            field=models.ForeignKey(default=1, on_delete=django.db.models.deletion.CASCADE, to='analysis.analysissetname'),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name='analysisdsinputfile',
            name='dsanalysis',
            field=models.ForeignKey(default=1, on_delete=django.db.models.deletion.CASCADE, to='analysis.datasetanalysis'),
            preserve_default=False,
        ),
    ]
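The two `AddField` operations use a placeholder `default=1` together with `preserve_default=False`: the default exists only so the new NOT NULL FK columns can be added to already-populated tables, and migration 0049 immediately overwrites the placeholder with the real references. Roughly, as a sketch assumed from Django's documented behavior on PostgreSQL (not the verbatim output of `sqlmigrate analysis 0048`):

```python
# Assumed DDL equivalent of AddField(default=1, preserve_default=False);
# the FK id 1 is a throwaway value that lets the ALTER TABLE succeed on
# tables that already contain rows
assumed_sql = '''
ALTER TABLE "analysis_analysisdsinputfile"
    ADD COLUMN "analysisset_id" integer NOT NULL DEFAULT 1
    REFERENCES "analysis_analysissetname" ("id");
ALTER TABLE "analysis_analysisdsinputfile"
    ALTER COLUMN "analysisset_id" DROP DEFAULT;
'''
```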
50 changes: 50 additions & 0 deletions src/backend/analysis/migrations/0049_auto_20240327_1144.py
@@ -0,0 +1,50 @@
# Generated by Django 3.2.13 on 2024-03-27 11:44

from django.db import migrations, models
import django.db.models.deletion
from django.db.models import OuterRef, Subquery


def remove_dups(apps, s):
    DSA = apps.get_model('analysis', 'DatasetAnalysis')
    for dsa in DSA.objects.all()[::-1]:
        if DSA.objects.filter(dataset=dsa.dataset, analysis=dsa.analysis).count() > 1:
            dsa.delete()


def populate_analysisset(apps, s):
    ADSI = apps.get_model('analysis', 'AnalysisDSInputFile')
    ADSI.objects.update(analysisset=Subquery(ADSI.objects.filter(pk=OuterRef('pk')).values('analysisdset__setname')[:1]))


def moveback_analysisset(apps, s):
    pass


def populate_dsanalysis(apps, s):
    ADSI = apps.get_model('analysis', 'AnalysisDSInputFile')
    ADSV = apps.get_model('analysis', 'AnalysisDatasetSetValue')
    DSA = apps.get_model('analysis', 'DatasetAnalysis')
    ADSI.objects.update(dsanalysis=Subquery(DSA.objects.filter(
        dataset=Subquery(ADSV.objects.filter(pk=OuterRef(OuterRef('analysisdset'))).values('dataset')[:1]),
        analysis=OuterRef('analysis')).values('pk')[:1]))


def moveback_dsanalysis(apps, s):
    pass


def fake(apps, s):
    pass


class Migration(migrations.Migration):

    dependencies = [
        ('analysis', '0048_auto_20240326_1509'),
    ]

    operations = [
        migrations.RunPython(remove_dups, fake),
        migrations.RunPython(populate_analysisset, moveback_analysisset),
        migrations.RunPython(populate_dsanalysis, moveback_dsanalysis),
    ]
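`populate_analysisset` and `populate_dsanalysis` backfill the new FKs in a single UPDATE each, using Django's correlated-subquery idiom; the doubly nested `OuterRef(OuterRef('analysisdset'))` is needed because the dataset lookup sits two subquery levels below the row being updated. A self-contained sketch of the single-level idiom, with hypothetical models that are not part of this PR:

```python
from django.db import models
from django.db.models import OuterRef, Subquery

# Hypothetical models, only to illustrate the correlated-update idiom
class Parent(models.Model):
    name = models.TextField()

class Child(models.Model):
    parent = models.ForeignKey(Parent, on_delete=models.CASCADE)
    parent_name = models.TextField(default='')

# One UPDATE statement: each Child row pulls the name of its own Parent,
# resolved per row through OuterRef('parent_id')
Child.objects.update(parent_name=Subquery(
    Parent.objects.filter(pk=OuterRef('parent_id')).values('name')[:1]))
```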
29 changes: 29 additions & 0 deletions src/backend/analysis/migrations/0050_delete_old_fields_20240327_1350.py
@@ -0,0 +1,29 @@
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

    dependencies = [
        ('analysis', '0049_auto_20240327_1144'),
    ]

    operations = [
        migrations.AddConstraint(
            model_name='datasetanalysis',
            constraint=models.UniqueConstraint(fields=('analysis', 'dataset'), name='uni_dsa_anadsets'),
        ),
        migrations.RemoveField(
            model_name='analysisdsinputfile',
            name='analysis',
        ),
        migrations.RemoveField(
            model_name='analysisdsinputfile',
            name='analysisdset',
        ),
        migrations.AddConstraint(
            model_name='analysisdsinputfile',
            constraint=models.UniqueConstraint(fields=('analysisset', 'sfile'), name='uni_anaset_infile'),
        ),
    ]
62 changes: 62 additions & 0 deletions src/backend/analysis/migrations/0051_auto_20240327_2033.py
@@ -0,0 +1,62 @@
# Generated by Django 3.2.13 on 2024-03-27 20:33

from django.db import migrations, models


def sample_or_regex_to_field(apps, s):
    ADSV = apps.get_model('analysis', 'AnalysisDatasetSetValue')
    ADSV.objects.filter(regex='').delete()
    ADSV.objects.update(field='__regex', value=models.F('regex'))
    AFS = apps.get_model('analysis', 'AnalysisFileValue')
    AFS.objects.update(field='__sample')


def fake(a, s):
    pass


class Migration(migrations.Migration):

    dependencies = [
        ('analysis', '0050_delete_old_fields_20240327_1350'),
    ]

    operations = [
        migrations.RemoveConstraint(
            model_name='analysisdatasetsetvalue',
            name='uni_anadsets',
        ),
        migrations.AddField(
            model_name='analysisdatasetsetvalue',
            name='field',
            field=models.TextField(default=''),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name='analysisdatasetsetvalue',
            name='value',
            field=models.TextField(default=''),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name='analysisfilevalue',
            name='field',
            field=models.TextField(default=''),
            preserve_default=False,
        ),
        migrations.RunPython(sample_or_regex_to_field, fake),
        migrations.AddConstraint(
            model_name='analysisdatasetsetvalue',
            constraint=models.UniqueConstraint(fields=('analysis', 'dataset', 'field'), name='uni_anadsetsfields'),
        ),
        migrations.AddConstraint(
            model_name='analysisfilevalue',
            constraint=models.UniqueConstraint(fields=('analysis', 'sfile', 'field'), name='uni_anassamplefile'),
        ),
        migrations.RemoveField(
            model_name='analysisdatasetsetvalue',
            name='regex',
        ),
    ]
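This data migration is the heart of the PR: the hard-coded `regex` and `sample` columns become rows in a generic field/value scheme, where the `__` prefix presumably keeps the built-in names `__regex` and `__sample` apart from user-defined dynamic fields. The shape of the conversion, with plain dicts standing in for ORM rows (illustrative values only):

```python
# Old rows: one dedicated column per kind of metadata
old_rows = [
    {'analysis': 7, 'dataset': 12, 'regex': r'.*fr(\d+)'},
    {'analysis': 7, 'dataset': 13, 'regex': ''},  # never used a regex
]

# New rows: generic field/value pairs; empty regexes are dropped first,
# mirroring ADSV.objects.filter(regex='').delete()
new_rows = [
    {'analysis': r['analysis'], 'dataset': r['dataset'],
     'field': '__regex', 'value': r['regex']}
    for r in old_rows if r['regex'] != '']

assert new_rows == [{'analysis': 7, 'dataset': 12,
                     'field': '__regex', 'value': r'.*fr(\d+)'}]
```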
46 changes: 28 additions & 18 deletions src/backend/analysis/models.py
@@ -251,58 +251,68 @@ class AnalysisSampletable(models.Model):
 
 
 class AnalysisSetname(models.Model):
-    '''All set or sample names in an analysis that are per dataset,
-    which means prefractionated proteomics data'''
+    '''All set or sample names in an analysis that are per dataset'''
     analysis = models.ForeignKey(Analysis, on_delete=models.CASCADE)
     setname = models.TextField()
 
     class Meta:
         constraints = [models.UniqueConstraint(fields=['analysis', 'setname'], name='uni_anasets')]
 
 
-class AnalysisDatasetSetname(models.Model):
+class DatasetAnalysis(models.Model):
+    analysis = models.ForeignKey(Analysis, on_delete=models.CASCADE)
+    dataset = models.ForeignKey(dsmodels.Dataset, on_delete=models.CASCADE)
+    # cannot put setname here because of searches without dset/setname
+    # model used in reporting, and also for finding datasets for base analysis etc
+
+    class Meta:
+        constraints = [models.UniqueConstraint(fields=['analysis', 'dataset'], name='uni_dsa_anadsets')]
 
 
+class AnalysisDatasetSetValue(models.Model):
     '''Dataset mapping to setnames (multiple dataset can have the same setname)'''
     # Note that datasets can be deleted, or have their file contents changed
     # That means this is not to be trusted for future bookkeeping of what was in the analysis
     # For that, you should combine it with using the below AnalysisDSInputFile model
     analysis = models.ForeignKey(Analysis, on_delete=models.CASCADE)
     dataset = models.ForeignKey(dsmodels.Dataset, on_delete=models.CASCADE)
     setname = models.ForeignKey(AnalysisSetname, on_delete=models.CASCADE, null=True)
-    regex = models.TextField() # optional
+    field = models.TextField()
+    value = models.TextField()
 
     class Meta:
-        constraints = [models.UniqueConstraint(fields=['analysis', 'dataset'], name='uni_anadsets')]
+        constraints = [models.UniqueConstraint(fields=['analysis', 'dataset', 'field'], name='uni_anadsetsfields')]
 
+# FIXME how should we do with pgt DBGEN input? Are those sets, or are they something else?
+# they def have sample names, and can be multiple per sample (BAMs merged, VCFs indel/snv etc)
 
 class AnalysisDSInputFile(models.Model):
     '''Input files for set-based analysis (isobaric and prefraction-datasets)'''
-    analysis = models.ForeignKey(Analysis, on_delete=models.CASCADE)
+    dsanalysis = models.ForeignKey(DatasetAnalysis, on_delete=models.CASCADE)
     sfile = models.ForeignKey(filemodels.StoredFile, on_delete=models.CASCADE)
-    analysisdset = models.ForeignKey(AnalysisDatasetSetname, on_delete=models.CASCADE)
+    analysisset = models.ForeignKey(AnalysisSetname, on_delete=models.CASCADE)
 
     class Meta:
-        constraints = [models.UniqueConstraint(fields=['analysis', 'sfile'], name='uni_anainfile')]
+        constraints = [models.UniqueConstraint(fields=['analysisset', 'sfile'], name='uni_anaset_infile')]
 
 
-class AnalysisFileSample(models.Model):
+class AnalysisFileValue(models.Model):
     '''If one sample per file is used in labelfree analyses, the samples are stored
     here'''
+    # this assumes at least one entry of this model per file/analysis
+    # (for non-set data), so samplename is a field. This is the only mapping of
+    # file/analysis we have currently for non-set data. If there's ever need
+    # of mapping files WITHOUT field/value for an analysis, we can break out
+    # to an extra model, alternatively null the fields
 
     analysis = models.ForeignKey(Analysis, on_delete=models.CASCADE)
-    sample = models.TextField()
+    field = models.TextField()
+    value = models.TextField()
     sfile = models.ForeignKey(filemodels.StoredFile, on_delete=models.CASCADE)
 
+    # FIXME this should maybe FK to infile above here?
     class Meta:
-        constraints = [models.UniqueConstraint(fields=['analysis', 'sfile'], name='uni_anassamplefile')]
-
-
-class DatasetAnalysis(models.Model):
-    analysis = models.ForeignKey(Analysis, on_delete=models.CASCADE)
-    dataset = models.ForeignKey(dsmodels.Dataset, on_delete=models.CASCADE)
-    # cannot put setname here because of searches without dset/setname
-    # model used in reporting, and also for finding datasets for base analysis etc
+        constraints = [models.UniqueConstraint(fields=['analysis', 'sfile', 'field'], name='uni_anassamplefile')]
 
 
 class AnalysisIsoquant(models.Model):
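With field/value rows in place, the set of per-file columns is no longer fixed at schema time: whatever `AnalysisFileValue` rows exist for an analysis simply become keys in the per-file dict that jobs.py merges via `infile.update(kwargs['filefields'][fn.pk])`. A hedged sketch of how such a dict could be assembled (assumed usage; this helper is not part of the PR):

```python
from collections import defaultdict

def collect_filefields(analysis):
    # Each AnalysisFileValue row contributes one field -> value entry to
    # the dict of the StoredFile it belongs to
    filefields = defaultdict(dict)
    for afv in analysis.analysisfilevalue_set.all():
        filefields[afv.sfile_id][afv.field] = afv.value
    return filefields

# e.g. {34: {'__sample': 'patient_A', 'pep_prefix': 'none'}}, which
# infile.update() then merges into the pipeline input row for file 34
```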
2 changes: 1 addition & 1 deletion src/backend/analysis/tasks.py
@@ -161,7 +161,7 @@ def run_nextflow_workflow(self, run, params, stagefiles, profiles, nf_version):
             else:
                 fndir = os.path.join(settings.SHAREMAP[fn['servershare']], fn['path'])
             fnpath = os.path.join(fndir, fn['fn'])
-            fn_metadata = '\t'.join(fn[x] for x in run['components']['INPUTDEF'][1:] if fn[x])
+            fn_metadata = '\t'.join(fn[x] or '' for x in run['components']['INPUTDEF'][1:])
             fp.write(f'\n{fnpath}\t{fn_metadata}')
         params.extend(['--input', os.path.join(rundir, 'inputdef.txt')])
Expand Down
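The one-line tasks.py change is a column-alignment fix as much as a cleanup: the old generator filtered out falsy values, which silently shifted every later column left in the tab-separated input definition, while the new expression writes an empty string so columns stay positional (`or ''` also covers fields that are None, such as a missing fraction). Illustrated with made-up values:

```python
fn = {'fn': 'a.mzML', 'setname': '', 'plate': 'p1'}
inputdef = ['fn', 'setname', 'plate']

old = '\t'.join(fn[x] for x in inputdef[1:] if fn[x])  # drops the empty setname
new = '\t'.join(fn[x] or '' for x in inputdef[1:])     # keeps the empty column

assert old == 'p1'    # 'p1' lands in the setname column: misaligned
assert new == '\tp1'  # setname stays empty, plate stays in place
```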