Commit

Merge pull request #2987 from AlexsLemonade/dev
Deploy to prod
davidsmejia authored Dec 22, 2021
2 parents 34e426f + bcc4316 commit 2e600d7
Showing 24 changed files with 364 additions and 309 deletions.
2 changes: 2 additions & 0 deletions api/data_refinery_api/views/relation_serializers.py
@@ -90,6 +90,7 @@ class Meta:
"pretty_platform",
"technology",
"is_processed",
"is_unable_to_be_processed",
)
read_only_fields = fields

@@ -112,6 +113,7 @@ class Meta:
"manufacturer",
"protocol_info",
"is_processed",
"is_unable_to_be_processed",
"created_at",
"last_modified",
)
13 changes: 13 additions & 0 deletions api/data_refinery_api/views/sample.py
@@ -15,8 +15,11 @@
from data_refinery_api.exceptions import InvalidFilters
from data_refinery_api.utils import check_filters
from data_refinery_api.views.relation_serializers import (
ComputedFileRelationSerializer,
DownloaderJobRelationSerializer,
OrganismIndexRelationSerializer,
OrganismRelationSerializer,
ProcessorJobRelationSerializer,
ProcessorRelationSerializer,
)
from data_refinery_common.models import (
@@ -58,6 +61,10 @@ class DetailedSampleSerializer(serializers.ModelSerializer):
annotations = SampleAnnotationSerializer(many=True, source="sampleannotation_set")
organism = OrganismRelationSerializer(many=False)
results = DetailedSamplesComputationalResultSerializer(many=True)
last_processor_job = ProcessorJobRelationSerializer(many=False)
last_downloader_job = DownloaderJobRelationSerializer(many=False)
most_recent_smashable_file = ComputedFileRelationSerializer(many=False)
most_recent_quant_file = ComputedFileRelationSerializer(many=False)

class Meta:
model = Sample
@@ -90,10 +97,15 @@ class Meta:
"compound",
"time",
"is_processed",
"is_unable_to_be_processed",
"created_at",
"last_modified",
"original_files",
"computed_files",
"last_processor_job",
"last_downloader_job",
"most_recent_smashable_file",
"most_recent_quant_file",
"contributed_metadata",
"contributed_keywords",
"experiment_accession_codes",
@@ -156,6 +168,7 @@ class SampleListView(generics.ListAPIView):
"compound",
"time",
"is_processed",
"is_unable_to_be_processed",
"is_public",
)

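With is_unable_to_be_processed exposed on the detail serializer and added to the list view's filter fields, clients can query for unprocessable samples directly. A hedged sketch of such a query (the host and route are assumptions based on refine.bio's public v1 API, not part of this diff):

import requests

resp = requests.get(
    "https://api.refine.bio/v1/samples/",  # assumed endpoint
    params={"is_unable_to_be_processed": True, "limit": 5},
)
resp.raise_for_status()
for sample in resp.json()["results"]:  # standard DRF pagination envelope
    print(sample["accession_code"], sample["is_processed"])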
2 changes: 1 addition & 1 deletion api/data_refinery_api/views/stats.py
@@ -40,7 +40,7 @@

logger = get_and_configure_logger(__name__)

-JOB_CREATED_AT_CUTOFF = datetime(2019, 9, 19, tzinfo=timezone.utc)
+JOB_CREATED_AT_CUTOFF = datetime(2021, 7, 14, tzinfo=timezone.utc)

# We want to cache all stats pages for 10 minutes to reduce the load on our
# servers, because computing all of the stats is really expensive
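Moving JOB_CREATED_AT_CUTOFF forward from 2019-09-19 to 2021-07-14 narrows the window of jobs the stats endpoints aggregate over, keeping the already expensive stats computation bounded. A minimal sketch of how such a cutoff is typically applied (the helper below is an assumption for illustration, not code from this diff):

from datetime import datetime, timezone

JOB_CREATED_AT_CUTOFF = datetime(2021, 7, 14, tzinfo=timezone.utc)

def jobs_since_cutoff(job_queryset):
    # Restrict aggregation to jobs created after the cutoff.
    return job_queryset.filter(created_at__gt=JOB_CREATED_AT_CUTOFF)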
7 changes: 6 additions & 1 deletion api/run_tests.sh
@@ -19,6 +19,12 @@ if ! [ "$(docker ps --filter name=drdb -q)" ]; then
echo "./scripts/run_postgres.sh" >&2
exit 1
fi
# Ensure that elasticsearch is running
if ! [ "$(docker ps --filter name=dres -q)" ]; then
echo "You must start elasticsearchfirst with:" >&2
echo "./scripts/run_es.sh" >&2
exit 1
fi

project_root=$(pwd) # "cd .." called above
volume_directory="$project_root/test_volume"
@@ -33,7 +39,6 @@ fi
DB_HOST_IP=$(get_docker_db_ip_address)
ES_HOST_IP=$(get_docker_es_ip_address)


# Only run interactively if we are on a TTY
if [ -t 1 ]; then
INTERACTIVE="-i"
19 changes: 19 additions & 0 deletions common/data_refinery_common/job_management.py
@@ -19,6 +19,25 @@
logger = get_and_configure_logger(__name__)


UNFIXABLE_ERRORS = [
"The annotation package, pd.primeview, could not be loaded.",
"Missing gene index for primeview",
"Missing gene index for zebgene11st",
"This could be indicative of a mismatch between the reference and sample, or a very bad sample.",
]


def is_job_unprocessable(job):
if job.success is not False:
return False

for error in UNFIXABLE_ERRORS:
if job.failure_reason and error in job.failure_reason:
return True

return False


def create_downloader_job(
undownloaded_files: List[OriginalFile], *, processor_job_id=None, force=False
) -> bool:
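is_job_unprocessable only classifies a failed job; a hedged sketch of how a caller might act on it (get_samples and requeue_job are hypothetical names, not part of this diff):

def handle_failed_job(job):
    if is_job_unprocessable(job):
        # Known-unfixable failure: flag the samples rather than retrying.
        for sample in job.get_samples():  # hypothetical accessor
            sample.is_unable_to_be_processed = True
            sample.save()
    else:
        requeue_job(job)  # hypothetical retry path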
57 changes: 57 additions & 0 deletions common/data_refinery_common/migrations/0070_auto_20211208_2118.py
@@ -0,0 +1,57 @@
# Generated by Django 3.2.7 on 2021-12-08 21:18

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("data_refinery_common", "0069_rename_compendia_version_computedfile_compendium_version"),
]

operations = [
migrations.AddField(
model_name="sample",
name="is_unable_to_be_processed",
field=models.BooleanField(default=False),
),
migrations.AddField(
model_name="sample",
name="last_downloader_job",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to="data_refinery_common.downloaderjob",
),
),
migrations.AddField(
model_name="sample",
name="last_processor_job",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to="data_refinery_common.processorjob",
),
),
migrations.AddField(
model_name="sample",
name="most_recent_quant_file",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="+",
to="data_refinery_common.computedfile",
),
),
migrations.AddField(
model_name="sample",
name="most_recent_smashable_file",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="+",
to="data_refinery_common.computedfile",
),
),
]
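All four new columns are nullable foreign keys using on_delete=django.db.models.deletion.SET_NULL, so deleting a referenced job or file clears the sample's pointer instead of cascading to the sample. A minimal sketch of that behavior (assumes a configured Django environment and an existing sample with a linked downloader job):

job = sample.last_downloader_job
job.delete()
sample.refresh_from_db()
assert sample.last_downloader_job is None  # pointer cleared, sample intact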
23 changes: 23 additions & 0 deletions common/data_refinery_common/models/sample.py
@@ -71,6 +71,19 @@ def __str__(self):

# Crunch Properties
is_processed = models.BooleanField(default=False)
is_unable_to_be_processed = models.BooleanField(default=False)
last_processor_job = models.ForeignKey("ProcessorJob", null=True, on_delete=models.SET_NULL)
last_downloader_job = models.ForeignKey("DownloaderJob", null=True, on_delete=models.SET_NULL)
# Set related_name to "+" to prevent the backwards relation, since
# it should be a duplicate of the relation already established by
# the computed_files field.
most_recent_smashable_file = models.ForeignKey(
"ComputedFile", null=True, on_delete=models.SET_NULL, related_name="+"
)
most_recent_quant_file = models.ForeignKey(
"ComputedFile", null=True, on_delete=models.SET_NULL, related_name="+"
)

# Blacklisting
is_blacklisted = models.BooleanField(default=False)
@@ -131,6 +144,11 @@ def get_processor_jobs(self) -> Set:

return processor_jobs

def get_most_recent_processor_job(self):
processor_jobs = self.get_processor_jobs()
if processor_jobs:
return max(processor_jobs, key=lambda job: job.created_at)

# Returns a set of DownloaderJob objects but we cannot specify
# that in type hints because it hasn't been declared yet.
def get_downloader_jobs(self) -> Set:
@@ -141,6 +159,11 @@ def get_downloader_jobs(self) -> Set:

return downloader_jobs

def get_most_recent_downloader_job(self):
downloader_jobs = self.get_downloader_jobs()
if downloader_jobs:
return max(downloader_jobs, key=lambda job: job.created_at)

def get_result_files(self):
"""Get all of the ComputedFile objects associated with this Sample"""
return self.computed_files.all()
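A hedged sketch of a backfill that could populate the new denormalized columns from the getters added above (only the two job getters appear in this diff; filling the file pointers would need analogous lookups):

def backfill_sample(sample):
    sample.last_processor_job = sample.get_most_recent_processor_job()
    sample.last_downloader_job = sample.get_most_recent_downloader_job()
    sample.save()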
80 changes: 39 additions & 41 deletions common/data_refinery_common/rna_seq.py
@@ -28,47 +28,64 @@
 EARLY_TXIMPORT_MIN_PERCENT = 0.80
 
 
-def should_run_tximport(experiment: Experiment, results, is_tximport_job: bool):
-    """ Returns whether or not the experiment is eligible to have tximport
-    run on it.
-
-    results is a set of ComputationalResults for the samples that had salmon quant run on them.
-    """
-    num_quantified = len(results)
-    if num_quantified == 0:
-        return False
-
-    salmon_versions = set()
-    for result in results:
-        if result.organism_index.salmon_version:
-            salmon_versions.add(result.organism_index.salmon_version)
-
-    if len(salmon_versions) > 1:
-        # Tximport requires that all samples are processed with the same salmon version
-        # https://github.com/AlexsLemonade/refinebio/issues/1496
-        return False
+def get_latest_organism_index(organism):
+    # Salmon version gets saved as what salmon outputs, which includes this prefix.
+    current_salmon_version = "salmon " + get_env_variable("SALMON_VERSION", "0.13.1")
+    return (
+        OrganismIndex.objects.filter(salmon_version=current_salmon_version, organism=organism)
+        .order_by("-created_at")
+        .first()
+    )
+
+
+def get_tximport_inputs_if_eligible(experiment: Experiment, is_tximport_job: bool):
+    """Returns a list of the most recent quant.sf ComputedFiles for the experiment.
+
+    If the experiment is not eligible for tximport, then None will be returned instead.
+    """
+    organism_indices = {}
+    for organism in experiment.organisms.all():
+        organism_indices[organism.id] = get_latest_organism_index(organism)
+
+    good_quant_files = []
+    num_unprocessable_samples = 0
+    for sample in experiment.samples.all():
+        if sample.is_unable_to_be_processed:
+            num_unprocessable_samples += 1
+        elif (
+            sample.most_recent_quant_file
+            and sample.most_recent_quant_file.s3_bucket is not None
+            and sample.most_recent_quant_file.s3_key is not None
+        ):
+            sample_organism_index = sample.most_recent_quant_file.result.organism_index
+            if sample_organism_index == organism_indices[sample.organism.id]:
+                good_quant_files.append(sample.most_recent_quant_file)
+
+    num_quantified = len(good_quant_files)
+    if num_quantified == 0:
+        return None
 
     eligible_samples = experiment.samples.filter(source_database="SRA", technology="RNA-SEQ")
 
-    num_eligible_samples = eligible_samples.count()
-    if num_eligible_samples == 0:
-        return False
+    num_eligible_samples = eligible_samples.count() - num_unprocessable_samples
+    if num_eligible_samples <= 0:
+        return None
 
     percent_complete = num_quantified / num_eligible_samples
 
     if percent_complete == 1.0:
         # If an experiment is fully quantified then we should run
         # tximport regardless of its size.
-        return True
+        return good_quant_files
 
     if (
         is_tximport_job
         and num_eligible_samples >= EARLY_TXIMPORT_MIN_SIZE
         and percent_complete >= EARLY_TXIMPORT_MIN_PERCENT
     ):
-        return True
-    else:
-        return False
+        return good_quant_files
+
+    return None
 
 
 def get_quant_results_for_experiment(experiment: Experiment, filter_old_versions=True):
@@ -108,25 +125,6 @@ def get_sample_id_set(result):
     return latest_results
 
 
-def get_quant_files_for_results(results: List[ComputationalResult]):
-    """Returns a list of salmon quant results from `experiment`."""
-    quant_files = []
-
-    for result in results:
-        quant_sf_file = result.get_quant_sf_file()
-        if quant_sf_file:
-            quant_files.append(quant_sf_file)
-        else:
-            logger.exception(
-                "Salmon quant result found without quant.sf ComputedFile!",
-                quant_result=result.id,
-                sample=result.samples.first(),
-            )
-            raise Exception("Salmon quant result found without quant.sf ComputedFile!")
-
-    return quant_files
-
-
 ENA_DOWNLOAD_URL_TEMPLATE = (
     "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{short_accession}{sub_dir}"
     "/{long_accession}/{long_accession}{read_suffix}.fastq.gz"
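A worked example of the early-tximport threshold logic above, as a standalone sketch (EARLY_TXIMPORT_MIN_SIZE = 25 is an assumption; only the 0.80 threshold is visible in this hunk):

EARLY_TXIMPORT_MIN_SIZE = 25       # assumed value
EARLY_TXIMPORT_MIN_PERCENT = 0.80  # from the hunk above

def eligible_early(num_quantified, num_eligible_samples, is_tximport_job=True):
    percent_complete = num_quantified / num_eligible_samples
    if percent_complete == 1.0:
        return True  # fully quantified experiments always qualify
    return (
        is_tximport_job
        and num_eligible_samples >= EARLY_TXIMPORT_MIN_SIZE
        and percent_complete >= EARLY_TXIMPORT_MIN_PERCENT
    )

print(eligible_early(90, 100))  # True: 90% complete and above the size floor
print(eligible_early(9, 10))    # False: 90% complete but only 10 samples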