From a38d1d9bbdef5a208ece4543a2872f557b031041 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Thu, 15 Feb 2024 10:14:07 -0500 Subject: [PATCH] group alt alleles using gene symbols refs https://github.com/related-sciences/ensembl-genes/issues/9 enables grouping genes that are not grouped in alt_allele table. --- ensembl_genes/ensembl_genes.py | 97 ++++++++----------- .../notebooks/ensembl_genes_eda.ipynb | 15 +-- ensembl_genes/queries/gene_alt_alleles.sql | 47 ++++++--- 3 files changed, 73 insertions(+), 86 deletions(-) diff --git a/ensembl_genes/ensembl_genes.py b/ensembl_genes/ensembl_genes.py index 4bbfa1a..cd56dbe 100644 --- a/ensembl_genes/ensembl_genes.py +++ b/ensembl_genes/ensembl_genes.py @@ -44,15 +44,17 @@ def __init__(self, species: str | Species = "human", release: str = "latest"): def connection_url(self) -> str: """ Ensembl public MySQL Servers information at - https://uswest.ensembl.org/info/data/mysql.html + . + NOTE: Common Table Expression (CTEs) are not supported in MySQL < 8.0 or MariaDB < 10.2. + Window functions are also not available. """ import sqlalchemy url = sqlalchemy.engine.url.URL.create( drivername="mysql+mysqlconnector", username="anonymous", - host="ensembldb.ensembl.org", # From Ensembl 48 onwards only - # host="useastdb.ensembl.org", # Current and previous Ensembl version only + host="ensembldb.ensembl.org", # From Ensembl 48 onwards only, MySQL 5.6.33 + # host="useastdb.ensembl.org", # Current and previous Ensembl version only, MariaDB 10.0.30 port=3306, database=self.database, ) @@ -96,7 +98,9 @@ def gene_df(self) -> pd.DataFrame: gene_df = gene_df.join(desc_df) # add ensembl_representative_gene_id column gene_repr_df = gene_df.merge( - self.alt_allele_df[["ensembl_gene_id", "ensembl_representative_gene_id"]], + self.representative_gene_df[ + ["ensembl_gene_id", "ensembl_representative_gene_id"] + ], how="left", ) gene_repr_df.ensembl_representative_gene_id = ( @@ -130,36 +134,26 @@ def _check_gene_df(self, gene_df: pd.DataFrame) -> None: assert bad_chromosome_df.empty @cached_property - def alt_allele_df(self) -> pd.DataFrame: - alt_allele_df = self.run_query("gene_alt_alleles") - expected_cols = [ - *alt_allele_df.columns, - "ensembl_representative_gene_id", - "is_representative_gene", - "representative_gene_method", - ] - if not alt_allele_df.empty: - alt_allele_df = alt_allele_df.groupby("alt_allele_group_id").apply( - self._alt_allele_add_representative - ) - else: - # Force expected output columns, since groupby-apply does not add columns - # from _alt_allele_add_representative when alt_allele_df is empty. - alt_allele_df = alt_allele_df.reindex(columns=expected_cols) - # ensembl_gene_id can be duplicated due to multiple alt_allele_attrib values - alt_allele_df = alt_allele_df.drop_duplicates("ensembl_gene_id", keep="first") - self._check_alt_allele_df(alt_allele_df) - return alt_allele_df + def representative_gene_df(self) -> pd.DataFrame: + df = ( + self.run_query("gene_alt_alleles") + .groupby("rs_allele_group", group_keys=False) + .apply(self._alt_allele_add_representative) + # ensembl_gene_id can be duplicated due to multiple alt_allele_attrib values + .drop_duplicates("ensembl_gene_id", keep="first") + ) + self._check_representative_gene_df(df) + return df @staticmethod - def _check_alt_allele_df(alt_allele_df: pd.DataFrame) -> None: + def _check_representative_gene_df(alt_allele_df: pd.DataFrame) -> None: dup_df = alt_allele_df[alt_allele_df.ensembl_gene_id.duplicated(keep=False)] assert dup_df.empty @staticmethod - def _alt_allele_get_representative(df: pd.DataFrame) -> "tuple[str, str]": + def _alt_allele_get_representative(df: pd.DataFrame) -> str: """ - For a subset of the alt_allele_df corresponding to a single `alt_allele_group_id`, + For a subset of the alt_allele_df corresponding to a single `rs_allele_group`, return a (ensembl_representative_gene_id, representative_gene_method) tuple, where `ensembl_representative_gene_id` is the selected representative gene from this group and `representative_gene_method` is the method by which it was selected. @@ -169,43 +163,28 @@ def _alt_allele_get_representative(df: pd.DataFrame) -> "tuple[str, str]": 2. primary_assembly: a single gene is on the primary assembly. 3. first_added: the gene first added to the Ensembl database. """ - representatives = list( - df.loc[df.alt_allele_is_representative.astype("bool"), "ensembl_gene_id"] - ) - if len(representatives) == 1: - return representatives[0], "alt_allele_is_representative" - if len(representatives) > 1: - raise ValueError( - "expected at most 1 IS_REPRESENTATIVE gene per alt_allele_group" - ) - representatives = list( - df.loc[df.primary_assembly.astype("bool"), "ensembl_gene_id"] - ) - if len(representatives) == 1: - return representatives[0], "primary_assembly" - if len(representatives) > 1: - raise ValueError( - "expected at most 1 primary assembly gene per alt_allele_group" - ) - return ( - df.sort_values( - ["ensembl_created_date", "ensembl_gene_id"] - ).ensembl_gene_id.iloc[0], - "first_added", - ) - - @staticmethod - def _alt_allele_add_representative(df: pd.DataFrame) -> pd.DataFrame: + representative_gene = df.sort_values( + [ + "alt_allele_is_representative", + "primary_assembly", + "ensembl_created_date", + "ensembl_gene_id", + ], + ascending=[False, False, True, True], + ).ensembl_gene_id.iloc[0] + assert isinstance(representative_gene, str) + return representative_gene + + @classmethod + def _alt_allele_add_representative(cls, df: pd.DataFrame) -> pd.DataFrame: """ - Apply to alt_allele_df grouped by `alt_allele_group_id` to add columns: + Apply to alt_allele_df grouped by `rs_allele_group` to add columns: `ensembl_representative_gene_id`, `is_representative_gene`, `representative_gene_method`. """ - representative, method = Ensembl_Gene_Queries._alt_allele_get_representative(df) - df["ensembl_representative_gene_id"] = representative + df["ensembl_representative_gene_id"] = cls._alt_allele_get_representative(df) df["is_representative_gene"] = ( df.ensembl_gene_id == df.ensembl_representative_gene_id ) - df["representative_gene_method"] = method # the ordering of alt_allele_df should always place the representative gene first. # at some point we might be able to move all logic to SQL assert df["is_representative_gene"].iloc[0] @@ -481,7 +460,7 @@ class Ensembl_Gene_Catalog_Writer(Ensembl_Gene_Queries): ), DatasetExport( name="alt_alleles", - query_fxn="alt_allele_df", + query_fxn="representative_gene_df", description=( "This is an intermediate table that groups genes if they are alternate alleles of each other. " "A representative gene is selected from each group." diff --git a/ensembl_genes/notebooks/ensembl_genes_eda.ipynb b/ensembl_genes/notebooks/ensembl_genes_eda.ipynb index 333ec5d..3813ff8 100644 --- a/ensembl_genes/notebooks/ensembl_genes_eda.ipynb +++ b/ensembl_genes/notebooks/ensembl_genes_eda.ipynb @@ -194,7 +194,7 @@ "metadata": {}, "outputs": [], "source": [ - "ensg.alt_allele_df.head()" + "ensg.representative_gene_df.head()" ] }, { @@ -221,7 +221,7 @@ "metadata": {}, "outputs": [], "source": [ - "ensg.alt_allele_df.alt_allele_attrib.value_counts()" + "ensg.representative_gene_df.alt_allele_attrib.value_counts()" ] }, { @@ -230,16 +230,7 @@ "metadata": {}, "outputs": [], "source": [ - "ensg.alt_allele_df.query(\"is_representative_gene\").representative_gene_method.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ensg.gene_df.query(\"ensembl_gene_id != ensembl_representative_gene_id\").head(2)" + "ensg.representative_gene_df.query(\"ensembl_gene_id != ensembl_representative_gene_id\").head(2)" ] }, { diff --git a/ensembl_genes/queries/gene_alt_alleles.sql b/ensembl_genes/queries/gene_alt_alleles.sql index 7e70ce9..dbe42d7 100644 --- a/ensembl_genes/queries/gene_alt_alleles.sql +++ b/ensembl_genes/queries/gene_alt_alleles.sql @@ -1,24 +1,41 @@ --- Get alt allele groups: genes with multiple alleles +-- Get alt allele groups: genes with multiple alleles. +-- Includes all genes, even if they are in a group of only themselves. +-- Use gene symbols to group genes, because the alt_allele table alone is incomplete. +-- https://github.com/related-sciences/ensembl-genes/issues/9. SELECT + COALESCE( + xref.display_label, + CAST(alt_allele_group_id AS CHAR), + gene.stable_id + ) AS rs_allele_group, gene.stable_id AS ensembl_gene_id, - alt_allele.alt_allele_group_id, - alt_allele_attrib.attrib = "IS_REPRESENTATIVE" AS alt_allele_is_representative, + xref.display_label AS gene_symbol, + gene.created_date AS ensembl_created_date, + seq_region.name AS seq_region, -- we used to use assembly_exception to determine primary assembly, but this table is now empty -- https://github.com/related-sciences/ensembl-genes/issues/22#issuecomment-1664197773 -- instead just look for a short seq_region name (e.g. '19' instead of 'HSCHR19LRC_PGF1_CTG3_1'), -- even though this is a flawed method that would miss scaffolds that are primary assemblies. LENGTH(seq_region.name) <= 3 AS primary_assembly, - seq_region.name AS seq_region, + alt_allele.alt_allele_group_id AS alt_allele_group_id, + -- WARNING: alt_allele_attrib can introduce multiple rows per gene! alt_allele_attrib.attrib AS alt_allele_attrib, - gene.created_date AS ensembl_created_date -FROM alt_allele -INNER JOIN gene - ON gene.gene_id = alt_allele.gene_id -INNER JOIN alt_allele_attrib - ON alt_allele.alt_allele_id = alt_allele_attrib.alt_allele_id -INNER JOIN seq_region - ON gene.seq_region_id = seq_region.seq_region_id --- all genes were current when query was written, ensure this is always the case + IFNULL(alt_allele_attrib.attrib = "IS_REPRESENTATIVE", FALSE) AS alt_allele_is_representative +FROM gene +LEFT JOIN seq_region USING (seq_region_id) +LEFT JOIN xref ON xref.xref_id = gene.display_xref_id +LEFT JOIN alt_allele USING (gene_id) +LEFT JOIN alt_allele_attrib USING (alt_allele_id) WHERE gene.is_current -ORDER BY alt_allele_group_id, alt_allele_is_representative DESC, primary_assembly DESC, ensembl_created_date, ensembl_gene_id --- LIMIT 5 +ORDER BY rs_allele_group, alt_allele_is_representative DESC, primary_assembly DESC, ensembl_created_date, ensembl_gene_id + +-- it would be nice to identify the representative gene from a group in this query, +-- but window functions like the following are not supported: +-- ROW_NUMBER() OVER ( +-- PARTITION BY rs_allele_group +-- ORDER BY +-- alt_allele_is_representative DESC NULLS LAST, +-- primary_assembly DESC NULLS LAST, +-- ensembl_created_date ASC NULLS LAST, +-- ensembl_gene_id ASC NULLS LAST +-- ) AS representative_gene_rank