Commit 7f1c553

Merge branch 'ui-v2' of github.com:IGS/gEAR into ui-v2
jorvis committed Aug 20, 2024
2 parents 030108b + 5e73205 commit 7f1c553
Showing 44 changed files with 1,067 additions and 526 deletions.
2 changes: 1 addition & 1 deletion bin/profile_single_heatmap_run.py
@@ -147,7 +147,7 @@ def get_analysis(analysis, dataset_id, session_id):
ana.type = analysis['type']
except:
user = geardb.get_user_from_session_id(session_id)
-ana.discover_type(current_user_id=user.id)
+ana.discover_type()
else:
ds = geardb.Dataset(id=dataset_id, has_h5ad=1)
h5_path = ds.get_file_path()
2 changes: 1 addition & 1 deletion bin/profile_single_projectr_tsne_run.py
@@ -183,7 +183,7 @@ def get_analysis(analysis, dataset_id, session_id):
ana.type = analysis['type']
except:
user = geardb.get_user_from_session_id(session_id)
-ana.discover_type(current_user_id=user.id)
+ana.discover_type()
else:
ds = geardb.Dataset(id=dataset_id, has_h5ad=1)
h5_path = ds.get_file_path()
47 changes: 47 additions & 0 deletions bin/remove_duplicate_layout_displays.py
@@ -0,0 +1,47 @@
#!/opt/bin/python

# This fixes an issue where some layouts have duplicated display members, which happened if the
# user saved layouts in the layout arranger while the duplication bug was active (https://github.com/IGS/gEAR/issues/768)

import sys

from pathlib import Path
lib_path = Path(__file__).resolve().parents[1].joinpath('lib')

sys.path.append(str(lib_path))

import geardb

conn = geardb.Connection()
cursor = conn.get_cursor()

# print row count
qry = "SELECT COUNT(*) FROM layout_displays"
cursor.execute(qry)
row_count = cursor.fetchone()[0]
print("Row count before deletion: {}".format(row_count))

# https://www.tutorialspoint.com/mysql/mysql-delete-duplicate-records.htm
qry = """
DELETE ld1 FROM layout_displays ld1
INNER JOIN layout_displays ld2
WHERE ld1.layout_id = ld2.layout_id
AND ld1.display_id = ld2.display_id
AND ld1.start_col = ld2.start_col
AND ld1.grid_width = ld2.grid_width
AND ld1.start_row = ld2.start_row
AND ld1.grid_height = ld2.grid_height
AND ld1.id > ld2.id
"""
cursor.execute(qry)

conn.commit()

# print row count
qry = "SELECT COUNT(*) FROM layout_displays"
cursor.execute(qry)
row_count = cursor.fetchone()[0]
print("Row count after deletion: {}".format(row_count))

cursor.close()
conn.close()
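For reference, the duplicate groups can be previewed before running the destructive DELETE above. A minimal dry-run sketch (not part of the commit) that reuses the script's own geardb connection helpers; the GROUP BY column list mirrors the join conditions in the DELETE, and the cursor is assumed to return plain tuples:

import sys
from pathlib import Path

lib_path = Path(__file__).resolve().parents[1].joinpath('lib')
sys.path.append(str(lib_path))

import geardb

conn = geardb.Connection()
cursor = conn.get_cursor()

# One row per duplicated (layout, display, geometry) group; extra_copies is
# the number of rows the de-duplication DELETE would remove for that group.
qry = """
SELECT layout_id, display_id, start_col, grid_width, start_row, grid_height,
       COUNT(*) - 1 AS extra_copies
  FROM layout_displays
 GROUP BY layout_id, display_id, start_col, grid_width, start_row, grid_height
HAVING COUNT(*) > 1
"""
cursor.execute(qry)
dupes = cursor.fetchall()
print("Duplicate groups: {}".format(len(dupes)))
print("Rows the DELETE would remove: {}".format(sum(row[6] for row in dupes)))

cursor.close()
conn.close()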
2 changes: 0 additions & 2 deletions docker/requirements.txt
@@ -9,7 +9,6 @@ Flask==3.0.0
Flask-RESTful==0.3.9
gunicorn
h5py==3.10.0
-itsdangerous==2.1.2 # See -> https://stackoverflow.com/a/71206978
jupyterlab==4.0.5
jupyter==1.0.0
kaleido==0.2.1
@@ -36,6 +35,5 @@ scanpy==1.10.1
scikit-learn==1.0.2
scipy==1.11.04
seaborn==0.13.2
-SQLAlchemy==1.4.32
tables==3.9.2 # Read hdf5 files into pandas
xlrd==1.2.0
2 changes: 0 additions & 2 deletions docs/setup.python.md
@@ -39,7 +39,6 @@ fixed paths have worked fine for decades.
Flask-RESTful==0.3.9 \
gunicorn \
h5py==3.10.0 \
-itsdangerous==2.1.2 \
jupyterlab==4.0.5 \
jupyter==1.0.0 \
kaleido==0.2.1 \
@@ -66,7 +65,6 @@ fixed paths have worked fine for decades.
scikit-learn==1.0.2 \
scipy==1.11.04 \
seaborn==0.13.2 \
-SQLAlchemy==1.4.32 \
tables==3.9.2 \
xlrd==1.2.0
$ sudo mkdir /opt/bin
12 changes: 9 additions & 3 deletions lib/gear/orthology.py
@@ -98,7 +98,8 @@ def filter_organism_by_id(organism_id: str):
Returns:
dict: The organism dictionary corresponding to the given organism ID.
"""
-return next((item for item in organisms if item["id"] == organism_id), None)
+
+return next((item for item in organisms if item.id == organism_id), None)

def get_organism_name_by_id(organism_id: str):
"""Get the organism name corresponding to the given organism ID.
@@ -109,7 +110,11 @@ def get_organism_name_by_id(organism_id: str):
Returns:
str: The organism name corresponding to the given organism ID.
"""
-return filter_organism_by_id(organism_id)["name"]
+organism = filter_organism_by_id(organism_id)
+if organism is not None:
+    return organism.label
+else:
+    return ""

def create_orthology_df(orthomap_file: Path):
"""
@@ -149,7 +154,8 @@ def map_dataframe_genes(orig_df: pd.DataFrame, orthomap_file: Path):

def get_best_match(id1):
# Get the best match for the id2 gene symbol
-sorted_by_best_match = orthomap_df[orthomap_df["id1"] == id1].sort_values("algorithms_match_count", ascending=False)
+best_match_for_id = orthomap_df[orthomap_df["id1"] == id1]
+sorted_by_best_match = best_match_for_id.sort_values(by="algorithms_match_count", ascending=False)
# If no match, return the original id1
if sorted_by_best_match.empty:
return id1
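The refactor above splits one chained pandas expression into a filter step and a sort step. A toy illustration of the same best-match lookup (hypothetical gene IDs and scores; it assumes the orthomap columns id1, id2, and algorithms_match_count, and that the elided tail of get_best_match returns the top candidate's id2):

import pandas as pd

# Hypothetical orthology map: one source gene (id1) can map to several
# candidate orthologs (id2), each scored by how many algorithms agree.
orthomap_df = pd.DataFrame({
    "id1": ["ENSG01", "ENSG01", "ENSG02"],
    "id2": ["ENSMUSG11", "ENSMUSG12", "ENSMUSG21"],
    "algorithms_match_count": [3, 7, 5],
})

def get_best_match(id1):
    # Filter to candidates for this gene, then rank best-supported first.
    best_match_for_id = orthomap_df[orthomap_df["id1"] == id1]
    sorted_by_best_match = best_match_for_id.sort_values(by="algorithms_match_count", ascending=False)
    if sorted_by_best_match.empty:
        return id1  # no ortholog found; keep the original ID
    return sorted_by_best_match.iloc[0]["id2"]

print(get_best_match("ENSG01"))  # ENSMUSG12 (score 7 beats 3)
print(get_best_match("ENSG99"))  # ENSG99 (no match, falls through)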
17 changes: 12 additions & 5 deletions lib/gear/plotting.py
@@ -468,18 +468,25 @@ def generate_plot(df, x=None, y=None, z=None, facet_row=None, facet_col=None,
# TODO: put in function

# Map indexes for subplot ordering. Indexes start at 1 since plotting rows/cols start at 1
-facet_row_groups = category_orders[facet_row] if facet_row and facet_row in category_orders else []
+facet_row_groups = []
+facet_col_groups = []
+
+if facet_row:
+    facet_row_groups = category_orders[facet_row] if facet_row in category_orders else df[facet_row].unique().tolist()
+
+if facet_col:
+    facet_col_groups = category_orders[facet_col] if facet_col in category_orders else df[facet_col].unique().tolist()

facet_row_indexes = {group: idx for idx, group in enumerate(facet_row_groups, start=1)}
num_rows = len(facet_row_groups) if facet_row else 1
-facet_col_groups = category_orders[facet_col] if facet_col and facet_col in category_orders else []
facet_col_indexes = {group: idx for idx, group in enumerate(facet_col_groups, start=1)}
num_cols = len(facet_col_groups) if facet_col else 1

# Make faceted plot
fig = make_subplots(rows=num_rows
, cols=num_cols
-, row_titles=facet_row_groups if facet_row else None
-, column_titles=facet_col_groups if facet_col else None
+, row_titles=list(facet_row_groups)
+, column_titles=list(facet_col_groups)
, x_title=x_title if x_title else None
, y_title=y_title if y_title else None
)
@@ -524,7 +531,7 @@ def generate_plot(df, x=None, y=None, z=None, facet_row=None, facet_col=None,
# Each individual trace is a separate scalegroup to ensure plots are scaled correctly for violin plots
new_plotting_args['scalegroup'] = name
if isinstance(name, tuple):
-new_plotting_args['scalegroup'] = "_".join(name)
+new_plotting_args['scalegroup'] = "_".join(map(str, name))

# If color dataseries is present, add some special configurations
if color_name:
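A note on the facet fix above: the rewritten logic falls back to df[facet].unique() when no explicit category order exists, so an active facet always yields at least one group and row_titles/column_titles always receive real lists. A standalone sketch of the same 1-based subplot index mapping (toy data and hypothetical column names; plotly and pandas assumed):

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df = pd.DataFrame({
    "condition": ["ctrl", "ctrl", "treated", "treated"],
    "sex": ["M", "F", "M", "F"],
    "expr": [1.0, 2.0, 3.0, 4.0],
})
category_orders = {"condition": ["ctrl", "treated"]}  # no order given for "sex"
facet_row, facet_col = "condition", "sex"

# Explicit order wins; otherwise fall back to order of appearance in the data.
facet_row_groups = category_orders.get(facet_row, df[facet_row].unique().tolist())
facet_col_groups = category_orders.get(facet_col, df[facet_col].unique().tolist())

# Plotly subplot positions are 1-based, hence start=1.
facet_row_indexes = {g: i for i, g in enumerate(facet_row_groups, start=1)}
facet_col_indexes = {g: i for i, g in enumerate(facet_col_groups, start=1)}

fig = make_subplots(rows=len(facet_row_groups), cols=len(facet_col_groups),
                    row_titles=list(facet_row_groups), column_titles=list(facet_col_groups))

# Route each (row group, col group) slice of the data to its subplot cell.
for (row_val, col_val), sub in df.groupby([facet_row, facet_col]):
    fig.add_trace(go.Scatter(x=sub.index, y=sub["expr"], name=f"{row_val}/{col_val}"),
                  row=facet_row_indexes[row_val], col=facet_col_indexes[col_val])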
25 changes: 16 additions & 9 deletions lib/geardb.py
@@ -104,7 +104,12 @@ def get_analysis(analysis, dataset_id, session_id):
if 'type' in analysis:
ana.type = analysis['type']
else:
-ana.discover_type(current_user_id=user_id)
+ana.discover_type()
+
+# Check that the h5ad file exists
+if not os.path.exists(ana.dataset_path()):
+    raise FileNotFoundError("No h5 file found for the passed in analysis")

else:
ds = Dataset(id=dataset_id, has_h5ad=1)
h5_path = ds.get_file_path()
@@ -682,7 +687,7 @@ def discover_vetting(self, current_user_id=None):
return 'community'


-def discover_type(self, current_user_id=None):
+def discover_type(self):
"""
Given an analysis ID it's technically possible to scan the directory hierarchies and
find the type.
@@ -909,9 +914,9 @@ def __init__(self, id=None, label=None, genus=None, species=None, strain=None, t
def __repr__(self):
return json.dumps(self.__dict__)

+@dataclass
class OrganismCollection:
-    def __init__(self, organisms=None):
-        self.organisms = [] if organisms is None else organisms
+    organisms: List[Organism] = field(default_factory=list)

def __repr__(self):
return json.dumps(self.__dict__)
@@ -944,6 +949,7 @@ def get_all(self):
self.organisms.append(org)

cursor.close()
+conn.close()

return self.organisms

@@ -1006,6 +1012,7 @@ def add_member(self, member):

cursor.close()
conn.commit()
+conn.close()

def dataset_ids(self):
"""
@@ -1108,7 +1115,7 @@ def load(self):
self.get_members()

cursor.close()
-conn.commit()
+conn.close()

def remove(self):
"""
@@ -1131,6 +1138,7 @@ def remove(self):

cursor.close()
conn.commit()
+conn.close()

def remove_all_members(self):
"""
@@ -1147,6 +1155,7 @@ def remove_all_members(self):

cursor.close()
conn.commit()
+conn.close()

self.members = []

@@ -1172,6 +1181,7 @@ def remove_member_by_display_id(self, display_id):

cursor.close()
conn.commit()
+conn.close()

def remove_members_by_dataset_id(self, dataset_id):
"""Deletes all members where the display ID belongs to a given dataset ID from the database."""
@@ -1217,8 +1227,6 @@ def save(self):
self.id = cursor.lastrowid
else:
# ID already populated
-conn = Connection()
-cursor = conn.get_cursor()

# Update layout properties
sql = """
@@ -1235,12 +1243,11 @@ def save(self):
self.is_domain, self.share_id, self.id
))

-conn.commit()

# TODO: delete existing members, add current ones

cursor.close()
conn.commit()
+conn.close()

def save_change(self, attribute=None, value=None):
"""
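A pattern note: most of the geardb.py hunks above append the same cursor.close() / conn.commit() / conn.close() epilogue. If that epilogue keeps spreading, a context manager could enforce it in one place. A hypothetical refactor sketch, not part of this commit, assuming only the geardb.Connection methods already used above (get_cursor, commit, close):

from contextlib import contextmanager

import geardb

@contextmanager
def gear_cursor(commit=False):
    """Yield a cursor, then guarantee cleanup; commit on success if requested."""
    conn = geardb.Connection()
    cursor = conn.get_cursor()
    try:
        yield cursor
        if commit:
            conn.commit()
    finally:
        cursor.close()
        conn.close()

# Hypothetical usage mirroring the epilogue added throughout geardb.py:
# with gear_cursor(commit=True) as cursor:
#     cursor.execute("DELETE FROM layout_displays WHERE layout_id = %s", (self.id,))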
3 changes: 1 addition & 2 deletions services/projectr/install_bioc.sh
@@ -5,8 +5,7 @@ Rver="${Rmaj}.3.1"

current_dir=$(pwd)

-# Install and build R (Using 'apt-get install' on Ubuntu Trusty installs version 3.0.2 of R)
-curl http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar -C /opt -zx
+curl -s -L http://lib.stat.cmu.edu/R/CRAN/src/base/${Rmaj}/${Rver}.tar.gz | tar xzv -C /opt
cd /opt/${Rver}
/opt/${Rver}/configure --with-readline=no --enable-R-shlib --enable-BLAS-shlib --with-x=no || exit 1
make || exit 1
23 changes: 20 additions & 3 deletions services/projectr/main.py
@@ -1,5 +1,6 @@
import os, sys
import pandas as pd
+from io import StringIO
from flask import Flask, abort, jsonify, request

cloud_logging = False
@@ -36,9 +37,21 @@ def write_entry(logger_name, severity, message):

def do_binary_projection(target_df, loading_df):
"""Perform projection based on the number of genes that were expressed in the cell or observation."""
-# Only applies with unweighted gene carts.
-tp_target_series = target_df.astype(bool).sum(axis=0).transpose()
-return pd.DataFrame(data=tp_target_series, columns=loading_df.columns, index=tp_target_series.index)
+# Only applies with unweighted gene carts, or weighted carts with binary values.
+
+# For each loading pattern, count the number of genes that are expressed in the target
+# and return the count as the pattern weight.
+binary_target_df = pd.DataFrame()
+for pattern in loading_df.columns:
+    # Select the genes whose loading value is 1 in the loading_df
+    good_loading_genes_mask = loading_df[pattern].astype(bool)
+    good_loading_genes = loading_df.index[good_loading_genes_mask]
+
+    # Count how many of those genes are 1 (expressed) in the target_df.
+    good_genes = target_df.loc[good_loading_genes].astype(bool).sum(axis=0).transpose()
+    binary_target_df[pattern] = good_genes
+return binary_target_df


def do_pca_projection(target_df, loading_df):
"""Perform projection of PCA loadings."""
@@ -66,6 +79,10 @@ def index():
write_entry("projectr", "INFO", "Genecart ID: {}".format(genecart_id))


+# pd.read_json gives a FutureWarning and suggests wrapping the JSON in StringIO.
+target = StringIO(target)
+loadings = StringIO(loadings)

target_df = pd.read_json(target, orient="split")
loading_df = pd.read_json(loadings, orient="split")

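To make the rewritten do_binary_projection concrete, here is a toy run (made-up genes, cells, and patterns; it assumes target_df is genes x observations and loading_df is genes x patterns, the orientation the loop above relies on). Each pattern's score for a cell is the count of that pattern's member genes with nonzero expression in that cell:

import pandas as pd

# Genes x cells expression matrix (toy values).
target_df = pd.DataFrame({"cell1": [5, 0, 2], "cell2": [0, 0, 7]},
                         index=["geneA", "geneB", "geneC"])
# Genes x patterns loading matrix with binary membership.
loading_df = pd.DataFrame({"pattern1": [1, 1, 0], "pattern2": [0, 1, 1]},
                          index=["geneA", "geneB", "geneC"])

binary_target_df = pd.DataFrame()
for pattern in loading_df.columns:
    # Genes belonging to this pattern (nonzero loading)...
    member_genes = loading_df.index[loading_df[pattern].astype(bool)]
    # ...counted per cell wherever they are expressed in the target.
    binary_target_df[pattern] = target_df.loc[member_genes].astype(bool).sum(axis=0)

print(binary_target_df)
#        pattern1  pattern2
# cell1         1         1
# cell2         0         1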
2 changes: 1 addition & 1 deletion services/projectr/requirements.txt
@@ -1,5 +1,5 @@
Flask==3.0.0
gunicorn==20.1.0
rpy2==3.5.1 # 3.5.2 and up gives errors with rpy2py and py2rpy
-pandas==1.4.1
+pandas==2.2.1
google-cloud-logging
19 changes: 0 additions & 19 deletions www/api/requirements.txt

This file was deleted.

9 changes: 7 additions & 2 deletions www/api/resources/common.py
@@ -17,7 +17,6 @@

def create_projection_adata(dataset_adata, dataset_id, projection_id):
# Create AnnData object out of readable CSV file
-# ? Does it make sense to put this in the geardb/Analysis class?
projection_id = secure_filename(projection_id)
dataset_id = secure_filename(dataset_id)

@@ -34,7 +33,7 @@ def create_projection_adata(dataset_adata, dataset_id, projection_id):
obs = dataset_adata.obs
# Create the anndata object and write to h5ad
# Associate with a filename to ensure AnnData is read in "backed" mode
-projection_adata = anndata.AnnData(X=X, obs=obs, var=var, obsm=dataset_adata.obsm, filename=projection_adata_path, filemode='r')
+projection_adata = anndata.AnnData(X=X, obs=obs, var=var, obsm=dataset_adata.obsm, filemode='r')
except Exception as e:
print(str(e), file=sys.stderr)
raise PlotError("Could not create projection AnnData object from CSV.")
@@ -45,6 +44,12 @@ def create_projection_adata(dataset_adata, dataset_id, projection_id):
# For some reason the gene_symbol is not taken in by the constructor
projection_adata.var["gene_symbol"] = projection_adata.var_names

+# Associate with a filename to ensure AnnData is read in "backed" mode
+# This creates the h5ad file if it does not exist
+# TODO: If too many processes read from this file, it can throw a BlockingIOError. Eventually we should
+# handle this by creating a copy of the file for each process, like a tempfile.
+projection_adata.filename = projection_adata_path

return projection_adata

def order_by_time_point(obs_df):
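On the common.py change above: in anndata, assigning the .filename property writes the object to that h5ad path and switches it into backed mode, which is why the assignment can simply be deferred until after the var mutations. A minimal sketch of that behavior (toy data and a hypothetical path; anndata, numpy, and pandas assumed):

import anndata
import numpy as np
import pandas as pd

# Toy in-memory AnnData: 2 observations x 3 genes.
adata = anndata.AnnData(
    X=np.array([[1.0, 0.0, 2.0], [0.0, 3.0, 1.0]]),
    obs=pd.DataFrame(index=["cell1", "cell2"]),
    var=pd.DataFrame(index=["geneA", "geneB", "geneC"]),
)
adata.var["gene_symbol"] = adata.var_names  # mutate while still in memory

# Assigning .filename writes the object to the .h5ad file and flips it
# into backed mode, so later reads go through the file rather than RAM.
adata.filename = "/tmp/projection_example.h5ad"  # hypothetical path
print(adata.isbacked)  # True

As the TODO in the diff notes, many processes reading one backed file can raise a BlockingIOError, hence the suggestion of a per-process tempfile copy.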
