Commit

Merge pull request #2987 from AlexsLemonade/dev
Deploy to prod
davidsmejia authored Dec 22, 2021
2 parents 34e426f + bcc4316 commit 2e600d7
Showing 24 changed files with 364 additions and 309 deletions.
2 changes: 2 additions & 0 deletions api/data_refinery_api/views/relation_serializers.py
@@ -90,6 +90,7 @@ class Meta:
"pretty_platform",
"technology",
"is_processed",
"is_unable_to_be_processed",
)
read_only_fields = fields

@@ -112,6 +113,7 @@ class Meta:
"manufacturer",
"protocol_info",
"is_processed",
"is_unable_to_be_processed",
"created_at",
"last_modified",
)
13 changes: 13 additions & 0 deletions api/data_refinery_api/views/sample.py
@@ -15,8 +15,11 @@
from data_refinery_api.exceptions import InvalidFilters
from data_refinery_api.utils import check_filters
from data_refinery_api.views.relation_serializers import (
ComputedFileRelationSerializer,
DownloaderJobRelationSerializer,
OrganismIndexRelationSerializer,
OrganismRelationSerializer,
ProcessorJobRelationSerializer,
ProcessorRelationSerializer,
)
from data_refinery_common.models import (
@@ -58,6 +61,10 @@ class DetailedSampleSerializer(serializers.ModelSerializer):
annotations = SampleAnnotationSerializer(many=True, source="sampleannotation_set")
organism = OrganismRelationSerializer(many=False)
results = DetailedSamplesComputationalResultSerializer(many=True)
last_processor_job = ProcessorJobRelationSerializer(many=False)
last_downloader_job = DownloaderJobRelationSerializer(many=False)
most_recent_smashable_file = ComputedFileRelationSerializer(many=False)
most_recent_quant_file = ComputedFileRelationSerializer(many=False)

class Meta:
model = Sample
@@ -90,10 +97,15 @@ class Meta:
"compound",
"time",
"is_processed",
"is_unable_to_be_processed",
"created_at",
"last_modified",
"original_files",
"computed_files",
"last_processor_job",
"last_downloader_job",
"most_recent_smashable_file",
"most_recent_quant_file",
"contributed_metadata",
"contributed_keywords",
"experiment_accession_codes",
@@ -156,6 +168,7 @@ class SampleListView(generics.ListAPIView):
"compound",
"time",
"is_processed",
"is_unable_to_be_processed",
"is_public",
)

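With is_unable_to_be_processed exposed on the detail serializer and added to the list view's filter fields, clients can query for unprocessable samples directly. A hedged sketch of such a query (the host and route are assumptions based on refine.bio's public v1 API, not part of this diff):

import requests

resp = requests.get(
    "https://api.refine.bio/v1/samples/",  # assumed endpoint
    params={"is_unable_to_be_processed": True, "limit": 5},
)
resp.raise_for_status()
for sample in resp.json()["results"]:  # standard DRF pagination envelope
    print(sample["accession_code"], sample["is_processed"])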
2 changes: 1 addition & 1 deletion api/data_refinery_api/views/stats.py
@@ -40,7 +40,7 @@

logger = get_and_configure_logger(__name__)

-JOB_CREATED_AT_CUTOFF = datetime(2019, 9, 19, tzinfo=timezone.utc)
+JOB_CREATED_AT_CUTOFF = datetime(2021, 7, 14, tzinfo=timezone.utc)

# We want to cache all stats pages for 10 minutes to reduce the load on our
# servers, because computing all of the stats is really expensive
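Moving JOB_CREATED_AT_CUTOFF forward from 2019-09-19 to 2021-07-14 narrows the window of jobs the stats endpoints aggregate over, keeping the already expensive stats computation bounded. A minimal sketch of how such a cutoff is typically applied (the helper below is an assumption for illustration, not code from this diff):

from datetime import datetime, timezone

JOB_CREATED_AT_CUTOFF = datetime(2021, 7, 14, tzinfo=timezone.utc)

def jobs_since_cutoff(job_queryset):
    # Restrict aggregation to jobs created after the cutoff.
    return job_queryset.filter(created_at__gt=JOB_CREATED_AT_CUTOFF)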
7 changes: 6 additions & 1 deletion api/run_tests.sh
@@ -19,6 +19,12 @@ if ! [ "$(docker ps --filter name=drdb -q)" ]; then
echo "./scripts/run_postgres.sh" >&2
exit 1
fi
# Ensure that elasticsearch is running
if ! [ "$(docker ps --filter name=dres -q)" ]; then
echo "You must start elasticsearchfirst with:" >&2
echo "./scripts/run_es.sh" >&2
exit 1
fi

project_root=$(pwd) # "cd .." called above
volume_directory="$project_root/test_volume"
@@ -33,7 +39,6 @@ fi
DB_HOST_IP=$(get_docker_db_ip_address)
ES_HOST_IP=$(get_docker_es_ip_address)


# Only run interactively if we are on a TTY
if [ -t 1 ]; then
INTERACTIVE="-i"
19 changes: 19 additions & 0 deletions common/data_refinery_common/job_management.py
@@ -19,6 +19,25 @@
logger = get_and_configure_logger(__name__)


UNFIXABLE_ERRORS = [
"The annotation package, pd.primeview, could not be loaded.",
"Missing gene index for primeview",
"Missing gene index for zebgene11st",
"This could be indicative of a mismatch between the reference and sample, or a very bad sample.",
]


def is_job_unprocessable(job):
if job.success is not False:
return False

for error in UNFIXABLE_ERRORS:
if job.failure_reason and error in job.failure_reason:
return True

return False


def create_downloader_job(
undownloaded_files: List[OriginalFile], *, processor_job_id=None, force=False
) -> bool:
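is_job_unprocessable only classifies a failed job; a hedged sketch of how a caller might act on it (get_samples and requeue_job are hypothetical names, not part of this diff):

def handle_failed_job(job):
    if is_job_unprocessable(job):
        # Known-unfixable failure: flag the samples rather than retrying.
        for sample in job.get_samples():  # hypothetical accessor
            sample.is_unable_to_be_processed = True
            sample.save()
    else:
        requeue_job(job)  # hypothetical retry path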
57 changes: 57 additions & 0 deletions common/data_refinery_common/migrations/0070_auto_20211208_2118.py
@@ -0,0 +1,57 @@
# Generated by Django 3.2.7 on 2021-12-08 21:18

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("data_refinery_common", "0069_rename_compendia_version_computedfile_compendium_version"),
]

operations = [
migrations.AddField(
model_name="sample",
name="is_unable_to_be_processed",
field=models.BooleanField(default=False),
),
migrations.AddField(
model_name="sample",
name="last_downloader_job",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to="data_refinery_common.downloaderjob",
),
),
migrations.AddField(
model_name="sample",
name="last_processor_job",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to="data_refinery_common.processorjob",
),
),
migrations.AddField(
model_name="sample",
name="most_recent_quant_file",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="+",
to="data_refinery_common.computedfile",
),
),
migrations.AddField(
model_name="sample",
name="most_recent_smashable_file",
field=models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="+",
to="data_refinery_common.computedfile",
),
),
]
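All four new columns are nullable foreign keys using on_delete=django.db.models.deletion.SET_NULL, so deleting a referenced job or file clears the sample's pointer instead of cascading to the sample. A minimal sketch of that behavior (assumes a configured Django environment and an existing sample with a linked downloader job):

job = sample.last_downloader_job
job.delete()
sample.refresh_from_db()
assert sample.last_downloader_job is None  # pointer cleared, sample intact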
23 changes: 23 additions & 0 deletions common/data_refinery_common/models/sample.py
@@ -71,6 +71,19 @@ def __str__(self):

# Crunch Properties
is_processed = models.BooleanField(default=False)
is_unable_to_be_processed = models.BooleanField(default=False)
last_processor_job = models.ForeignKey("ProcessorJob", null=True, on_delete=models.SET_NULL)
last_downloader_job = models.ForeignKey("DownloaderJob", null=True, on_delete=models.SET_NULL)
# Set related_name to "+" to prevent the backwards relation, since
# it should be a duplicate of the relation already established by
# the computed_files field.
most_recent_smashable_file = models.ForeignKey(
"ComputedFile", null=True, on_delete=models.SET_NULL, related_name="+"
)
most_recent_quant_file = models.ForeignKey(
"ComputedFile", null=True, on_delete=models.SET_NULL, related_name="+"
)

# Blacklisting
is_blacklisted = models.BooleanField(default=False)
@@ -131,6 +144,11 @@ def get_processor_jobs(self) -> Set:

return processor_jobs

def get_most_recent_processor_job(self):
processor_jobs = self.get_processor_jobs()
if processor_jobs:
return max(processor_jobs, key=lambda job: job.created_at)

# Returns a set of DownloaderJob objects but we cannot specify
# that in type hints because it hasn't been declared yet.
def get_downloader_jobs(self) -> Set:
@@ -141,6 +159,11 @@ def get_downloader_jobs(self) -> Set:

return downloader_jobs

def get_most_recent_downloader_job(self):
downloader_jobs = self.get_downloader_jobs()
if downloader_jobs:
return max(downloader_jobs, key=lambda job: job.created_at)

def get_result_files(self):
"""Get all of the ComputedFile objects associated with this Sample"""
return self.computed_files.all()
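A hedged sketch of a backfill that could populate the new denormalized columns from the getters added above (only the two job getters appear in this diff; filling the file pointers would need analogous lookups):

def backfill_sample(sample):
    sample.last_processor_job = sample.get_most_recent_processor_job()
    sample.last_downloader_job = sample.get_most_recent_downloader_job()
    sample.save()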
80 changes: 39 additions & 41 deletions common/data_refinery_common/rna_seq.py
@@ -28,47 +28,64 @@
 EARLY_TXIMPORT_MIN_PERCENT = 0.80
 
 
-def should_run_tximport(experiment: Experiment, results, is_tximport_job: bool):
-    """ Returns whether or not the experiment is eligible to have tximport
-    run on it.
-
-    results is a set of ComputationalResults for the samples that had salmon quant run on them.
-    """
-    num_quantified = len(results)
-    if num_quantified == 0:
-        return False
-
-    salmon_versions = set()
-    for result in results:
-        if result.organism_index.salmon_version:
-            salmon_versions.add(result.organism_index.salmon_version)
-
-    if len(salmon_versions) > 1:
-        # Tximport requires that all samples are processed with the same salmon version
-        # https://github.com/AlexsLemonade/refinebio/issues/1496
-        return False
+def get_latest_organism_index(organism):
+    # Salmon version gets saved as what salmon outputs, which includes this prefix.
+    current_salmon_version = "salmon " + get_env_variable("SALMON_VERSION", "0.13.1")
+    return (
+        OrganismIndex.objects.filter(salmon_version=current_salmon_version, organism=organism)
+        .order_by("-created_at")
+        .first()
+    )
+
+
+def get_tximport_inputs_if_eligible(experiment: Experiment, is_tximport_job: bool):
+    """Returns a list of the most recent quant.sf ComputedFiles for the experiment.
+
+    If the experiment is not eligible for tximport, then None will be returned instead.
+    """
+    organism_indices = {}
+    for organism in experiment.organisms.all():
+        organism_indices[organism.id] = get_latest_organism_index(organism)
+
+    good_quant_files = []
+    num_unprocessable_samples = 0
+    for sample in experiment.samples.all():
+        if sample.is_unable_to_be_processed:
+            num_unprocessable_samples += 1
+        elif (
+            sample.most_recent_quant_file
+            and sample.most_recent_quant_file.s3_bucket is not None
+            and sample.most_recent_quant_file.s3_key is not None
+        ):
+            sample_organism_index = sample.most_recent_quant_file.result.organism_index
+            if sample_organism_index == organism_indices[sample.organism.id]:
+                good_quant_files.append(sample.most_recent_quant_file)
+
+    num_quantified = len(good_quant_files)
+    if num_quantified == 0:
+        return None
 
     eligible_samples = experiment.samples.filter(source_database="SRA", technology="RNA-SEQ")
 
-    num_eligible_samples = eligible_samples.count()
-    if num_eligible_samples == 0:
-        return False
+    num_eligible_samples = eligible_samples.count() - num_unprocessable_samples
+    if num_eligible_samples <= 0:
+        return None
 
     percent_complete = num_quantified / num_eligible_samples
 
     if percent_complete == 1.0:
         # If an experiment is fully quantified then we should run
         # tximport regardless of its size.
-        return True
+        return good_quant_files
 
     if (
         is_tximport_job
         and num_eligible_samples >= EARLY_TXIMPORT_MIN_SIZE
         and percent_complete >= EARLY_TXIMPORT_MIN_PERCENT
     ):
-        return True
-    else:
-        return False
+        return good_quant_files
+
+    return None
 
 
 def get_quant_results_for_experiment(experiment: Experiment, filter_old_versions=True):
@@ -108,25 +125,6 @@ def get_sample_id_set(result):
     return latest_results
 
 
-def get_quant_files_for_results(results: List[ComputationalResult]):
-    """Returns a list of salmon quant results from `experiment`."""
-    quant_files = []
-
-    for result in results:
-        quant_sf_file = result.get_quant_sf_file()
-        if quant_sf_file:
-            quant_files.append(quant_sf_file)
-        else:
-            logger.exception(
-                "Salmon quant result found without quant.sf ComputedFile!",
-                quant_result=result.id,
-                sample=result.samples.first(),
-            )
-            raise Exception("Salmon quant result found without quant.sf ComputedFile!")
-
-    return quant_files
-
-
 ENA_DOWNLOAD_URL_TEMPLATE = (
     "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{short_accession}{sub_dir}"
     "/{long_accession}/{long_accession}{read_suffix}.fastq.gz"
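A worked example of the early-tximport threshold logic above, as a standalone sketch (EARLY_TXIMPORT_MIN_SIZE = 25 is an assumption; only the 0.80 threshold is visible in this hunk):

EARLY_TXIMPORT_MIN_SIZE = 25       # assumed value
EARLY_TXIMPORT_MIN_PERCENT = 0.80  # from the hunk above

def eligible_early(num_quantified, num_eligible_samples, is_tximport_job=True):
    percent_complete = num_quantified / num_eligible_samples
    if percent_complete == 1.0:
        return True  # fully quantified experiments always qualify
    return (
        is_tximport_job
        and num_eligible_samples >= EARLY_TXIMPORT_MIN_SIZE
        and percent_complete >= EARLY_TXIMPORT_MIN_PERCENT
    )

print(eligible_early(90, 100))  # True: 90% complete and above the size floor
print(eligible_early(9, 10))    # False: 90% complete but only 10 samples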