HumanCellAtlas · calvinnhieu · Nov 8, 2019 · Nov 8, 2019 · calvinnhieu · Nov 8, 2019
diff --git a/docker-images/matrix-converter/VERSION b/docker-images/matrix-converter/VERSION
@@ -1 +1 @@
-36
+37
diff --git a/matrix/common/constants.py b/matrix/common/constants.py
@@ -459,7 +459,7 @@ class MetadataSchemaName(Enum):
     MatrixFormat.MTX.value: """
 <h2>HCA Matrix Service MTX Output</h2>
 <p>The mtx-formatted output from the matrix service is a zip archive that contains
-three files:</p>
+four files:</p>
 <table class="table table-striped table-bordered">
 <thead>
 <tr>
@@ -477,11 +477,18 @@ class MetadataSchemaName(Enum):
 <td>Cell metadata</td>
 </tr>
 <tr>
-<td>&lt;directory_name&gt;/genes.tsv.gz</td>
+<td>&lt;directory_name&gt;/features.tsv.gz</td>
 <td>Gene (or transcript) metadata</td>
 </tr>
+<tr>
+<td>&lt;directory_name&gt;/barcodes.tsv.gz</td>
+<td>Cell barcodes</td>
+</tr>
 </tbody>
 </table>
+<p>For 10x experiments, this format adheres to the Cell Ranger
+<a href="https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/matrices">
+feature-barcode matrix</a> specification.</p>
 
 <h3><code>matrix.mtx.gz</code></h3>
 <p>This file contains expression values in the
@@ -494,8 +501,8 @@ class MetadataSchemaName(Enum):
 <p>The expression values are meant to be a "raw" count, so for SmartSeq2 experiments, this
 is the <code>expected_count</code> field from
 <a href="http://deweylab.biostat.wisc.edu/rsem/rsem-calculate-expression.html#output">RSEM
-output</a>. For 10x experiments analyzed with Cell Ranger, this is read from the
-<code>matrix.mtx</code> file that Cell Ranger produces as its filtered feature-barcode matrix.</p>
+output</a>. For 10x experiments analyzed with Optimus, this is read from the
+<a href="https://zarr.readthedocs.io/en/stable">zarr</a> array produced by the pipeline.</p>
 
 <h3><code>cells.tsv.gz</code></h3>
 <p>Each row of the cell metadata table represents a cell, and each column is a different metadata
@@ -504,10 +511,14 @@ class MetadataSchemaName(Enum):
 fields, <code>genes_detected</code> for example, are calculated during secondary analysis.
 Full descriptions of those fields are forthcoming.</p>
 
-<h3><code>genes.tsv.gz</code></h3>
+<h3><code>features.tsv.gz</code></h3>
 <p>The gene metadata contains basic information about the genes in the count matrix.
 Each row is a gene, and each row corresponds to the same row in the expression mtx file.
 Note that <code>featurename</code> is not unique.</p>
+
+<h3><code>barcodes.tsv.gz</code></h3>
+<p>A list of cell barcodes corresponding to the columns found in <code>matrix.mtx.gz</code>.
+Note that barcodes may not be unique.</p>
 """
 }
 

diff --git a/matrix/docker/matrix_converter.py b/matrix/docker/matrix_converter.py
@@ -176,7 +176,7 @@ def _grouper(iterable, n):
                     yield cells_df
 
     def _to_mtx(self):
-        """Write a zip file with an mtx and two metadata tsvs from Redshift query
+        """Write a zip file with an mtx and three metadata tsvs from Redshift query
         manifests.
 
         Returns:
@@ -219,11 +219,10 @@ def _to_mtx(self):
                 cell_count += pivoted.shape[1]
                 cellkeys.extend(pivoted.columns.to_list())
 
-        self._write_out_gene_dataframe(results_dir, "genes.tsv.gz", compression=True)
         self._write_out_cell_dataframe(results_dir, "cells.tsv.gz", cell_df, cellkeys, compression=True)
         self._write_out_barcode_dataframe(results_dir, "barcodes.tsv.gz", cell_df, cellkeys)
 
-        file_names = ["features.tsv.gz", "genes.tsv.gz", "matrix.mtx.gz", "cells.tsv.gz", "barcodes.tsv.gz"]
+        file_names = ["features.tsv.gz", "matrix.mtx.gz", "cells.tsv.gz", "barcodes.tsv.gz"]
         zip_path = self._zip_up_matrix_output(results_dir, file_names)
         return zip_path
 

diff --git a/terraform/modules/matrix-service/infra/converter_batch.tf b/terraform/modules/matrix-service/infra/converter_batch.tf
@@ -222,7 +222,7 @@ resource "aws_batch_job_definition" "converter_job_def" {
     container_properties = <<CONTAINER_PROPERTIES
 {
   "command": [],
-  "image": "humancellatlas/matrix-converter:36",
+  "image": "humancellatlas/matrix-converter:37",
   "memory": 8192,
   "vcpus": 2,
   "jobRoleArn": "${aws_iam_role.converter_job_role.arn}",

diff --git a/tests/functional/test_conversions.py b/tests/functional/test_conversions.py
@@ -209,11 +209,10 @@ def test_mtx(self, mock_upload_method):
             # Check the components of the zip file
             members = mtx_output.namelist()
             self.assertIn("test.mtx/matrix.mtx.gz", members)
-            self.assertIn("test.mtx/genes.tsv.gz", members)
             self.assertIn("test.mtx/cells.tsv.gz", members)
             self.assertIn("test.mtx/features.tsv.gz", members)
             self.assertIn("test.mtx/barcodes.tsv.gz", members)
-            self.assertEqual(len(members), 5)
+            self.assertEqual(len(members), 4)
 
             # Read in the cell and gene tables. We need both for mtx files
             # since the mtx itself is just numbers and indices.
@@ -223,8 +222,19 @@ def test_mtx(self, mock_upload_method):
                 mtx_cells[row["cellkey"]] = row
 
             mtx_genes = collections.OrderedDict()
-            for row in csv.DictReader(io.StringIO(gzip.GzipFile(fileobj=io.BytesIO(
-                    mtx_output.read("test.mtx/genes.tsv.gz"))).read().decode()), delimiter='\t'):
+            for row in csv.DictReader(
+                    io.StringIO(gzip.GzipFile(fileobj=io.BytesIO(
+                    mtx_output.read("test.mtx/features.tsv.gz"))).read().decode()),
+                    delimiter='\t',
+                    fieldnames=["featurekey",
+                                "featurename",
+                                "featuretype",
+                                "featuretype_10x",
+                                "chromosome",
+                                "featurestart",
+                                "featureend",
+                                "isgene",
+                                "genus_species"]):
                 mtx_genes[row["featurekey"]] = row
 
             # Read the expression values. This is supposed to be aligned with

diff --git a/tests/unit/docker/test_matrix_converter.py b/tests/unit/docker/test_matrix_converter.py
@@ -385,9 +385,6 @@ def test__to_mtx(self, mock_parse_manifest, mock_load_cell_results, mock_write_g
         test_data["genes_df"].to_csv(os.path.join(results_dir, "features.tsv.gz"),
                                      index_label="featurekey",
                                      sep="\t", compression="gzip")
-        test_data["genes_df"].to_csv(os.path.join(results_dir, "genes.tsv.gz"),
-                                     index_label="featurekey",
-                                     sep="\t", compression="gzip")
         self.matrix_converter.local_output_filename = "unit_test__to_mtx.zip"
         zip_path = self.matrix_converter._to_mtx()