From 9f10320acb6b39a32f351faa90cb4111418065ef Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 12 Dec 2024 12:28:01 -0600 Subject: [PATCH 01/15] add LCA to dictionary --- .github/components/dictionary.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/components/dictionary.txt b/.github/components/dictionary.txt index 76ffc1812..cc834fc48 100644 --- a/.github/components/dictionary.txt +++ b/.github/components/dictionary.txt @@ -121,6 +121,7 @@ Jitter JSON Jupyter karyotyping +LCA leiden LGBTQ licensor From 770c1c87fa6aa1b0844111f3494bc13072d9716d Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 12 Dec 2024 12:28:30 -0600 Subject: [PATCH 02/15] consensus labels from references notebook --- .../01-reference-exploration.Rmd | 590 ++++++++++++++++++ 1 file changed, 590 insertions(+) create mode 100644 analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd new file mode 100644 index 000000000..7fee6beb7 --- /dev/null +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd @@ -0,0 +1,590 @@ +--- +title: "Summary of cell type ontologies in reference files" +author: Ally Hawkins +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_depth: 3 + code_folding: "hide" + df_print: "kable" +--- + +This notebook aims to identify a set of consensus labels between cell types in the PanglaoDB and Blueprint Encode references. + +- First I look at the entire cell type ontology and summarize the total ancestors and descendants. +- Then we find all possible combinations of cell type labels between PanglaoDB and Blueprint Encode and find the latest common ancestor (LCA) for all combinations. +- The total descendants for each LCA is used to define a cutoff for consensus terms we may want to use. 
+- I also explored the pairs for some terms in depth to look at borderline LCA terms. +- Finally, I calculated the similarity index for all pairs. + +## Setup + +```{r packages} +suppressPackageStartupMessages({ + # load required packages + library(ggplot2) +}) + +# Set default ggplot theme +theme_set( + theme_bw() +) +``` + + +```{r base paths} +# The base path for the OpenScPCA repository, found by its (hidden) .git directory +repository_base <- rprojroot::find_root(rprojroot::is_git_root) + +# The path to this module +ref_dir <- file.path(repository_base, "analyses", "cell-type-consensus", "references") + +# path to ref file for panglao +panglao_file <- file.path(ref_dir, "panglao-cell-type-ontologies.tsv") +``` + + +```{r} +# grab obo file +cl_ont <- ontologyIndex::get_ontology("http://purl.obolibrary.org/obo/cl-basic.obo") + +# read in panglao file +panglao_df <- readr::read_tsv(panglao_file) |> + # rename columns to have panglao in them for easy joining later + dplyr::select( + panglao_ontology = "ontology_id", + panglao_annotation = "human_readable_value" + ) + +# grab singler ref from celldex +blueprint_ref <- celldex::BlueprintEncodeData() +# get ontologies and human readable name into data frame +blueprint_df <- data.frame( + blueprint_ontology = blueprint_ref$label.ont, + blueprint_annotation_main = blueprint_ref$label.main, + blueprint_annotation_fine = blueprint_ref$label.fine +) |> + unique() +``` + +## Full cell ontology + +Below I will calculate the total number of ancestors and the total number of descendants for each term in the full cell type ontology and then show the distributions for those statistics. +This will give us an idea of the range of values we expect to see when looking at the PanglaoDB and Blueprint Encode references. 
+ +```{r} +# turn cl_ont into data frame with one row per term +cl_df <- data.frame( + cl_ontology = cl_ont$id, + cl_annotation = cl_ont$name +) |> + dplyr::rowwise() |> + dplyr::mutate( + # list all ancestors and descendants calculate total + ancestors = list(ontologyIndex::get_ancestors(cl_ont, cl_ontology)), + total_ancestors = length(ancestors), + descendants = list(ontologyIndex::get_descendants(cl_ont, cl_ontology)), + total_descendants = length(descendants) + ) +``` + +The vertical lines in the below plot indicate the value for cell types of varying granularity. + +```{r} +celltypes_of_interest <- c("eukaryotic cell", "lymphocyte", "leukocyte", "hematopoietic cell", "T cell", "endothelial cell", "smooth muscle cell", "memory T cell") +line_df <- cl_df |> + dplyr::filter(cl_annotation %in% celltypes_of_interest) |> + dplyr::select(cl_annotation, total_descendants, total_ancestors) |> + unique() + +# group any labels that have the same number of ancestors +ancestor_labels_df <- line_df |> + dplyr::group_by(total_ancestors) |> + dplyr::summarise(cl_annotation = paste(cl_annotation, collapse = ",")) +``` + + +```{r} +# make density plots showing distribution of ancestors and descendants +ggplot(cl_df, aes(x = total_ancestors)) + + geom_density(fill = "#00274C", alpha = 0.5) + + geom_vline(data = ancestor_labels_df, + mapping = aes(xintercept = total_ancestors), + lty = 2) + + geom_text( + data = ancestor_labels_df, + mapping = aes(x = total_ancestors, y = 0.04, label = cl_annotation), + angle = 90, + vjust = -0.5 + ) + + labs( + x = "Number of ancestors", + y = "Density" + ) +``` + +Generally it looks like as the cell types get more specific we see a greater number of ancestors. +However, the range of values is small and we see some cell types have the same value and probably not the same level of granularity. + +Below we will look at total number of descendants. 
+ +```{r} +ggplot(cl_df, aes(x = total_descendants)) + + geom_density(fill = "#FFCB05", alpha = 0.5) + + geom_vline(data = line_df, + mapping = aes(xintercept = total_descendants), + lty = 2) + + geom_text( + data = line_df, + mapping = aes(x = total_descendants, y = 0.6, label = cl_annotation), + angle = 90, + vjust = -0.5 + ) + + labs( + x = "Number of descendants", + y = "Density" + ) +``` + +It looks like most cell types have very few descendants, so let's zoom into the area below 500 to get a better look. + +```{r} +ggplot(cl_df, aes(x = total_descendants)) + + geom_density(fill = "#FFCB05", alpha = 0.5) + + geom_vline(data = line_df, + mapping = aes(xintercept = total_descendants), + lty = 2) + + geom_text( + data = line_df, + mapping = aes(x = total_descendants, y = 0.6, label = cl_annotation), + angle = 90, + vjust = -0.5 + ) + + labs( + x = "Number of descendants", + y = "Density" + ) + + xlim(c(0,500)) +``` + +Here we see a much larger range of values and that cell types become more general as the number of descendants goes up. +However, this distribution alone is probably not helpful in determining a cutoff. +In the next section we will look at this distribution specifically for cell types present in our references, PanglaoDB and Blueprint encode. + + +## Latest common ancestor (LCA) between PanglaoDB and Blueprint encode + +This section will look at identifying the latest common ancestor (LCA) between all possible combinations of terms from PanglaoDB (used for assigning cell types with `CellAssign`) and the `BlueprintEncodeData` reference from `celldex` (used for assigning cell types with `SingleR`). +The LCA refers to the latest term in the cell ontology hierarchy that is common between two terms. +I will use the [`ontoProc::findCommonAncestors()` function](https://rdrr.io/bioc/ontoProc/man/findCommonAncestors.html) to get the LCA for each combination. + +Note that it is possible to have more than one LCA for a set of terms. 
+To start, I will keep all LCA terms found. + +For each LCA, I will again look at the total number of ancestors and descendants and see if I can identify an appropriate cutoff. +Ultimately, I would like to see if we can use that cutoff to decide if we should keep the LCA term as the consensus label or use "Unknown". + +```{r} +# first set up the graph from the parent relationships in the cell ontology (cl_ont) +parent_terms <- cl_ont$parents +cl_graph <- igraph::make_graph(rbind(unlist(parent_terms), rep(names(parent_terms), lengths(parent_terms)))) +``` + + +```{r} +# get a data frame with all combinations of panglao and blueprint terms +# one row for each combination +all_ref_df <- expand.grid(panglao_df$panglao_ontology, + blueprint_df$blueprint_ontology) |> + dplyr::rename( + panglao_ontology = "Var1", + blueprint_ontology = "Var2" + ) |> + # add in the human readable values for each ontology term + dplyr::left_join(blueprint_df, by = "blueprint_ontology") |> + dplyr::left_join(panglao_df, by = "panglao_ontology") |> + tidyr::drop_na() |> + dplyr::rowwise() |> + dplyr::mutate( + # common ancestor(s) of the pair, looked up in the ontology graph built above (cl_graph) + lca = list(rownames(ontoProc::findCommonAncestors(blueprint_ontology, panglao_ontology, g = cl_graph))) + ) + +lca_df <- all_ref_df |> + dplyr::mutate( + total_lca = length(lca), # max is three terms + lca = paste0(lca, collapse = ",") # make it easier to split the df + ) |> + # split each lca term into its own column + tidyr::separate(lca, into = c("lca_1", "lca_2", "lca_3"), sep = ",") |> + tidyr::pivot_longer( + cols = dplyr::starts_with("lca"), + names_to = "lca_number", + values_to = "lca" + ) |> + tidyr::drop_na() |> + dplyr::select(-lca_number) |> + # account for any cases where the ontology IDs are exact matches + # r complains about doing this earlier since the lca column holds lists until now + dplyr::mutate(lca = dplyr::if_else(blueprint_ontology == panglao_ontology, blueprint_ontology, lca)) |> + # join in information for each of the lca terms including name, number of ancestors and descendants + 
dplyr::left_join(cl_df, by = c("lca" = "cl_ontology")) +``` + + +### Distribution of ancestors and descendants + +```{r} +ggplot(lca_df, aes(x = total_ancestors)) + + geom_density() + + geom_vline(data = ancestor_labels_df, + mapping = aes(xintercept = total_ancestors), + lty = 2) + + geom_text( + data = ancestor_labels_df, + mapping = aes(x = total_ancestors, y = 0.6, label = cl_annotation), + angle = 90, + vjust = -0.5 + ) + + labs( + x = "Total number of ancestors", + y = "Density" + ) +``` + +```{r} +ggplot(lca_df, aes(x = total_descendants)) + + geom_density() + + geom_vline(data = line_df, + mapping = aes(xintercept = total_descendants), + lty = 2) + + geom_text( + data = line_df, + mapping = aes(x = total_descendants, y = 0.002, label = cl_annotation), + angle = 90, + vjust = -0.5 + ) + + labs( + x = "Total number of descendants", + y = "Density" + ) +``` + +Let's zoom into the area below 1000, since we already know we would want to exclude anything above that based on this plot. + +```{r} +ggplot(lca_df, aes(x = total_descendants)) + + geom_density() + + geom_vline(data = line_df, + mapping = aes(xintercept = total_descendants), + lty = 2) + + geom_text( + data = line_df, + mapping = aes(x = total_descendants, y = 0.002, label = cl_annotation), + angle = 90, + vjust = -0.5 + ) + + xlim(c(0, 1000)) + + labs( + x = "Total number of descendants", + y = "Density" + ) +``` + +We can use the vertical lines for cells of interest to help us define a potential cutoff based on the granularity we would like to see in our consensus label. +We want to be able to label things like T cell, but we don't want to label anything as lymphocyte as that's probably not helpful. +I don't see any obvious cutoffs that may be present in the total number of ancestors, but the number of descendants is likely to be informative. +I think it might be a good idea to start by drawing a line at the local maxima between the T cell and lymphocyte lines on the number of descendants graph. 
+ +### Defining a cutoff for number of descendants + +First we will find the value for the first peak shown in the distribution. +This is likely to be a good cutoff for deciding which LCA labels to keep. + +```{r} +peak_idx <- splus2R::peaks(lca_df$total_descendants) +cutoff <- lca_df$total_descendants[peak_idx] |> + min() # find the smallest peak and use that as the cutoff for number of descendants +``` + + +Below is the list of all consensus cell type labels that we will be keeping if we were to just use this cutoff. + +```{r} +celltypes_to_keep <- lca_df |> + dplyr::filter(total_descendants <= cutoff) |> + dplyr::pull(cl_annotation) |> + unique() + +celltypes_to_keep +``` + + +We can also look at all the cell types we are keeping and the total number of descendants to see if there are any that we may not want to include because the term is too broad. + +```{r} +# pull out the cell types and total descendants for cell types to keep +plot_celltype_df <- lca_df |> + dplyr::filter(cl_annotation %in% celltypes_to_keep) |> + dplyr::select(cl_annotation, total_descendants) |> + unique() + +# bar chart showing total number of descendants for each cell type +ggplot(plot_celltype_df, aes(x = reorder(cl_annotation, total_descendants), y = total_descendants)) + + geom_bar(stat = "identity") + + theme( + axis.text.x = element_text(angle = 90) + ) + + labs( + x = "cell type", + y = "Total descendants" + ) +``` + + +There are a few terms that I think might be more broad than we want like `blood cell`, `bone cell`, `supporting cell`, and `lining cell`. +I'm on the fence about keeping `myeloid leukocyte` and `progenitor cell`. +I think if we wanted to remove those terms we could move our cutoff to be the same number of descendants as `T cell`, since we do want to keep that. + +One could also argue to remove `stromal cell` or `extracellular matrix secreting cell`. 
+ +Below are tables that look specifically at the combinations of cell type annotations that resulted in some of the terms that I might consider removing. + +```{r} +print_df <- lca_df |> + dplyr::select(blueprint_annotation_main, blueprint_annotation_fine, panglao_annotation, total_lca, cl_annotation) + +# blood cell +print_df |> + dplyr::filter(cl_annotation == "blood cell") +``` + +I think I'm in favor of not having a "blood cell" label, since I'm not sure that it's helpful. +Also, if two different methods label something a platelet and a neutrophil, then perhaps that label is inaccurate and it's really a tumor cell. + +```{r} +# bone cell +print_df |> + dplyr::filter(cl_annotation == "bone cell") +``` + +I think I would also remove bone cell, since hematopoietic stem cells and osteoclasts seem pretty different to me. + +```{r} +# myeloid leukocyte cell +print_df |> + dplyr::filter(cl_annotation == "myeloid leukocyte") +``` + +I'm torn on this one, because I do think it's helpful to know if something is of the myeloid lineage, but if we aren't keeping lymphocyte then I would argue we shouldn't keep myeloid leukocyte. + +```{r} +# progenitor cell +print_df |> + dplyr::filter(cl_annotation == "progenitor cell") |> + head(n=15) # there's a lot of these so let's only print out some +``` + +Same with `progenitor cell`, I do think it could be helpful to know that something may be a progenitor cell, but when you have a cell with the label for HSC and the label for cells like monocytes or osteoblasts, then maybe we are talking about a tumor cell instead. + +Along those same lines, I think the below terms, `lining cell` and `supporting cell`, are too broad even though they have few descendants. 
+ +```{r} +# lining cell +print_df |> + dplyr::filter(cl_annotation == "lining cell") +``` + + +```{r} +# supporting cell +print_df |> + dplyr::filter(cl_annotation == "supporting cell") +``` + + + +We can also look at what cell type labels we are excluding when using this cut off to see if there are any terms we might actually want to keep instead. + +```{r} +lca_df |> + dplyr::filter(total_descendants > cutoff) |> + dplyr::pull(cl_annotation) |> + unique() +``` + +The only term in this list that I would be concerned about losing is "neuron". +Let's look at those combinations. + +```{r} +# blood cell +print_df |> + dplyr::filter(cl_annotation == "neuron") +``` + +It looks like there are a lot of types of neurons in the PanglaoDB reference and only "neuron" as a term in Blueprint. +Even though neuron has ~ 500 descendants, I think we should keep these labels. + +### Removing anything with more than 1 LCA + +One thing I noticed when looking at the labels that have less than the cutoff is that most of them are from scenarios where we have multiple LCAs. +Maybe in the case where we have multiple LCAs we are already too broad and we should just eliminate those matches from the beginning. +Here I'm looking at the total number of descendants for all terms that show up because a term has multiple LCAs. + +```{r} +lca_df |> + dplyr::filter(total_lca > 1) |> + dplyr::select(cl_annotation, total_descendants) |> + unique() +``` + +It looks like most of these terms are pretty broad and are either much higher than the cutoff or right around the cutoff with a few exceptions. +Things like "bone cell" and "supporting cell" have few descendants, but I would still argue these are very broad terms and not useful. + +I'm going to filter out any matches that show two LCA terms first and then use the cutoff to define labels we would keep. +I'll also look to see what cell types we lose when we add this extra filtering step to be sure they are ones that we want to lose. 
+ +```{r} +# remove any combinations with more than one lca +filtered_lca_df <- lca_df |> + dplyr::filter(total_lca < 2) + +# get a list of cell types to keep based on cutoff +updated_celltypes <- filtered_lca_df |> + dplyr::filter(total_descendants <= cutoff) |> + dplyr::pull(cl_annotation) |> + unique() + +# which cell types are now missing from the list to keep +setdiff(celltypes_to_keep, updated_celltypes) +``` + +It looks like I am losing a few terms I already said were not specific and then a few other terms, like "hematopoietic precursor cell" and "perivascular cell". I'll look at both of those to confirm we would not want them. + +```{r} +print_df |> + dplyr::filter(cl_annotation == "hematopoietic precursor cell") +``` + +It looks like here we should be keeping these matches because both references have these labels as hematopoietic stem and progenitor cells. +I think in the context of pediatric cancer having this label would be helpful, so maybe we shouldn't remove all terms that have 2 LCAs. + +Let's look at what the other LCA is for an example set. + +```{r} +lca_df |> + dplyr::filter(panglao_ontology == "CL:0000037" & blueprint_ontology == "CL:0000050") |> + dplyr::select(blueprint_annotation_main, blueprint_annotation_fine, panglao_annotation, cl_annotation) +``` + +It looks like these terms have both `hematopoietic precursor cell` and `progenitor cell` as LCAs. +Personally, I would keep the term for `hematopoietic precursor cell` because I think it's more informative and specific to the type of progenitor cell. + + +```{r} +print_df |> + dplyr::filter(cl_annotation == "perivascular cell") +``` + +I would remove `perivascular cell`, since the cell type labels from PanglaoDB and Blueprint are pretty different from each other. 
+ +## Similarity index + +An alternative approach would be to calculate the [similarity index](https://cran.r-project.org/web/packages/ontologySimilarity/vignettes/ontologySimilarity-introduction.html) between each set of terms and define a cutoff for which set of terms are similar. +This is a value on a 0-1 scale where 0 indicates no similarity and 1 indicates the terms are equal. + +Although this could provide a metric that we could use to define similar cell types, we would still have to identify the label to use which would most likely be the LCA. +Even if the similarity index is close to 1, if the LCA term is not informative then I don't know that we would want to use that. + +However, we could use this to finalize the actual pairs of terms that we trust. +For example, if the LCA for a pair is `T cell` we can look at the similarity index to confirm that specific pair of terms has high similarity. + +Below I'll calculate the similarity index for each set of terms and plot the distribution. +Then we will look at the values for pairs that have an LCA that pass the total descendants threshold we set to see if those pairs have a higher similarity index. + +```{r} +information_content <- ontologySimilarity::descendants_IC(cl_ont) # computed once and reused for every pair below + +# get similarity index for each set of terms (one panglao/blueprint pair per row) +si_df <- lca_df |> + dplyr::rowwise() |> + dplyr::mutate( + similarity_index = ontologySimilarity::get_sim_grid(ontology = cl_ont, information_content = information_content, + term_sets = list(panglao_ontology, blueprint_ontology)) |> + ontologySimilarity::get_sim() + ) +``` + +```{r} +si_df <- si_df |> + dplyr::mutate( + lca_threshold = dplyr::if_else(total_descendants <= cutoff, "PASS", "FAIL") + ) + +ggplot(si_df, aes(x = similarity_index, fill = lca_threshold)) + + geom_density(bw = 0.05, alpha = 0.5) + + labs( + x = "Similarity index", + y = "Density" + ) +``` + +This looks as I expected with most of the pairs that pass the total descendants cutoff having a higher similarity index than those that do not pass. 
+There is still some overlap though so perhaps even if a set of terms shares an LCA that passes the threshold, the actual terms being compared may be further apart than we would like. + +Now let's look at the similarity index for various LCA terms. +Here each LCA term is its own plot and the vertical lines are the similarity index for each pair of terms that results in that LCA. + +```{r} +celltypes_to_plot <- c("myeloid leukocyte", "T cell", "cell", "supporting cell", "B cell") + +celltypes_to_plot |> + purrr::map(\(celltype){ + line_df <- si_df |> + dplyr::filter(cl_annotation == celltype) |> + dplyr::select(cl_annotation, similarity_index) |> + unique() + + ggplot(si_df, aes(x = similarity_index)) + + geom_density() + + geom_vline(data = line_df, + mapping = aes(xintercept = similarity_index), + lty = 2) + + labs( + x = "Similarity index", + y = "Density", + title = celltype + ) + + }) +``` + + +It looks like terms that are more granular like T and B cell have higher similarity index values than terms that are less granular which is what we would expect. +However, within terms like myeloid leukocyte and even T cell we do see a range of values. +We could dig deeper into which pairs are resulting in which similarity index values if we wanted to, but I think that might be a future direction if we feel like the similarity index is something that could be useful. + + +## Conclusions + +Based on these findings, I think it might be best to create a reference that has all possible pairs of labels between PanglaoDB and Blueprint Encode and the resulting consensus label for those pairs. +To do this we could come up with a whitelist of LCA terms that we would be comfortable including and all other cell types would be unknowns. +I would use the following criteria to come up with my whitelist: + +- Pairs should not have more than 1 LCA, with the exception of the matches that have the label hematopoietic precursor cell. 
+- The LCA should have equal to or less than 170 total descendants. +- We should include the term for `neuron` even though it has 500 descendants. +- Terms that are too broad (like `supporting cell`, `blood cell`, `bone cell`, `lining cell`) should be removed. + +Alternatively, rather than eliminate terms that are too broad we could look at the similarity index for individual matches and decide on a case by case basis if those should be allowed. +Although I still think having a term that is too broad, even if it's a good match, is not super informative. + +## Session info + + +```{r} +sessionInfo() +``` + + From 6aa406bec5ca8fd57c8d8d0bb4e85c6a8cf68648 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 12 Dec 2024 12:55:26 -0600 Subject: [PATCH 03/15] add some headers for tables --- .../01-reference-exploration.Rmd | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd index 7fee6beb7..93a867ccf 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd @@ -351,9 +351,11 @@ One could also argue to remove `stromal cell` or `extracellular matrix secreting Below are tables that look specifically at the combinations of cell type annotations that resulted in some of the terms that I might consider removing. 
+#### Blood cell + ```{r} print_df <- lca_df |> - dplyr::select(blueprint_annotation_main, blueprint_annotation_fine, panglao_annotation, total_lca, cl_annotation) + dplyr::select(blueprint_ontology, blueprint_annotation_main, blueprint_annotation_fine, panglao_ontology, panglao_annotation, total_lca, lca, cl_annotation) # blood cell print_df |> @@ -363,6 +365,8 @@ print_df |> I think I'm in favor of not having a "blood cell" label, since I'm not sure that it's helpful. Also, if two different methods label something a platelet and a neutrophil, then perhaps that label is inaccurate and it's really a tumor cell. +#### Bone cell + ```{r} # bone cell print_df |> @@ -371,6 +375,8 @@ print_df |> I think I would also remove bone cell, since hematopoietic stem cells and osteoclasts seem pretty different to me. +#### Myeloid leukocyte + ```{r} # myeloid leukocyte cell print_df |> @@ -379,6 +385,8 @@ print_df |> I'm torn on this one, because I do think it's helpful to know if something is of the myeloid lineage, but if we aren't keeping lymphocyte then I would argue we shouldn't keep myeloid leukocyte. +#### Progenitor cell + ```{r} # progenitor cell print_df |> @@ -390,12 +398,15 @@ Same with `progenitor cell`, I do think it could be helpful to know that somethi Along those same lines, I think the below terms, `lining cell` and `supporting cell`, are too broad even though they have few descendants. +#### Lining cell + ```{r} # lining cell print_df |> dplyr::filter(cl_annotation == "lining cell") ``` +#### Supporting cell ```{r} # supporting cell @@ -404,6 +415,7 @@ print_df |> ``` +### Discarded cell types We can also look at what cell type labels we are excluding when using this cut off to see if there are any terms we might actually want to keep instead. @@ -417,6 +429,8 @@ lca_df |> The only term in this list that I would be concerned about losing is "neuron". Let's look at those combinations. 
+#### Neuron + ```{r} # blood cell print_df |> @@ -436,7 +450,8 @@ Here I'm looking at the total number of descendants for all terms that show up b lca_df |> dplyr::filter(total_lca > 1) |> dplyr::select(cl_annotation, total_descendants) |> - unique() + unique() |> + dplyr::arrange(total_descendants) ``` It looks like most of these terms are pretty broad and are either much higher than the cutoff or right around the cutoff with a few exceptions. @@ -462,6 +477,8 @@ setdiff(celltypes_to_keep, updated_celltypes) It looks like I am losing a few terms I already said were not specific and then a few other terms, like "hematopoietic precursor cell" and "perivascular cell". I'll look at both of those to confirm we would not want them. +#### Hematopoietic precursor cell + ```{r} print_df |> dplyr::filter(cl_annotation == "hematopoietic precursor cell") @@ -481,6 +498,7 @@ lca_df |> It looks like these terms have both `hematopoietic precursor cell` and `progenitor cell` as LCAs. Personally, I would keep the term for `hematopoietic precursor cell` because I think it's more informative and specific to the type of progenitor cell. 
+#### Perivascular cell ```{r} print_df |> From 6e9a6114dad74ae5686f63161666f7bec85bf86e Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 12 Dec 2024 12:55:37 -0600 Subject: [PATCH 04/15] rendered notebook --- .../01-reference-exploration.html | 3015 +++++++++++++++++ 1 file changed, 3015 insertions(+) create mode 100644 analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html new file mode 100644 index 000000000..474052d82 --- /dev/null +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html @@ -0,0 +1,3015 @@ + + + + + + + + + + + + + + + +Summary of cell type ontologies in reference files + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + +

This notebook aims to identify a set of consensus labels between cell +types in the PanglaoDB and Blueprint Encode references.

+
    +
  • First I look at the entire cell type ontology and summarize the +total ancestors and descendants.
  • +
  • Then we find all possible combinations of cell type labels between +PanglaoDB and Blueprint Encode and find the latest common ancestor (LCA) +for all combinations.
  • +
  • The total descendants for each LCA is used to define a cutoff for +consensus terms we may want to use.
  • +
  • I also explored the pairs for some terms in depth to look at +borderline LCA terms.
  • +
  • Finally, I calculated the similarity index for all pairs.
  • +
+
+

Setup

+
suppressPackageStartupMessages({
+  # load required packages
+  library(ggplot2)
+})
+
+# Set default ggplot theme
+theme_set(
+  theme_bw()
+)
+
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+repository_base <- rprojroot::find_root(rprojroot::is_git_root)
+
+# The path to this module
+ref_dir <- file.path(repository_base, "analyses", "cell-type-consensus", "references")
+
+# path to ref file for panglao 
+panglao_file <- file.path(ref_dir, "panglao-cell-type-ontologies.tsv")
+
# grab obo file 
+cl_ont <- ontologyIndex::get_ontology("http://purl.obolibrary.org/obo/cl-basic.obo")
+
+# read in panglao file 
+panglao_df <- readr::read_tsv(panglao_file) |>
+  # rename columns to have panglao in them for easy joining later 
+  dplyr::select(
+    panglao_ontology = "ontology_id",
+    panglao_annotation = "human_readable_value"
+  )
+
## Rows: 178 Columns: 3
+## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
+## Delimiter: "\t"
+## chr (3): ontology_id, human_readable_value, panglao_cell_type
+## 
+## ℹ Use `spec()` to retrieve the full column specification for this data.
+## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
+
# grab singler ref from celldex
+blueprint_ref <- celldex::BlueprintEncodeData() 
+# get ontologies and human readable name into data frame
+blueprint_df <- data.frame(
+  blueprint_ontology = blueprint_ref$label.ont,
+  blueprint_annotation_main = blueprint_ref$label.main,
+  blueprint_annotation_fine = blueprint_ref$label.fine
+) |> 
+  unique()
+
+
+

Full cell ontology

+

Below I will calculate the total number of ancestors and the total +number of descendants for each term in the full cell type ontology and +then show the distributions for those statistics. This will give us an +idea of the range of values we expect to see when looking at the +PanglaoDB and Blueprint Encode references.

+
# turn cl_ont into data frame with one row per term 
+cl_df <- data.frame(
+  cl_ontology = cl_ont$id,
+  cl_annotation = cl_ont$name
+) |> 
+  dplyr::rowwise() |> 
+  dplyr::mutate(
+    # list all ancestors and descendants calculate total
+    ancestors = list(ontologyIndex::get_ancestors(cl_ont, cl_ontology)),
+    total_ancestors = length(ancestors),
+    descendants = list(ontologyIndex::get_descendants(cl_ont, cl_ontology)),
+    total_descendants = length(descendants)
+  )
+

The vertical lines in the below plot indicate the value for cell +types of varying granularity.

+
# reference terms drawn as vertical lines on the plots below
+celltypes_of_interest <- c(
+  "eukaryotic cell",
+  "lymphocyte",
+  "leukocyte",
+  "hematopoietic cell",
+  "T cell",
+  "endothelial cell",
+  "smooth muscle cell",
+  "memory T cell"
+)
+
+# ancestor and descendant counts for each reference term
+line_df <- unique(
+  dplyr::select(
+    dplyr::filter(cl_df, cl_annotation %in% celltypes_of_interest),
+    cl_annotation, total_descendants, total_ancestors
+  )
+)
+
+# terms that share an ancestor count are merged into one comma-separated label
+ancestor_labels_df <- dplyr::summarise(
+  dplyr::group_by(line_df, total_ancestors),
+  cl_annotation = paste(cl_annotation, collapse = ",")
+)
+
# density of ancestor counts over every term in the ontology; dashed lines mark the reference terms
+ggplot(data = cl_df, mapping = aes(x = total_ancestors)) +
+  geom_density(alpha = 0.5, fill = "#00274C") +
+  geom_vline(
+    data = ancestor_labels_df,
+    mapping = aes(xintercept = total_ancestors),
+    linetype = 2
+  ) +
+  geom_text(
+    data = ancestor_labels_df,
+    mapping = aes(x = total_ancestors, y = 0.04, label = cl_annotation),
+    vjust = -0.5,
+    angle = 90
+  ) +
+  xlab("Number of ancestors") +
+  ylab("Density")
+

+

Generally it looks like as the cell types get more specific we see a +greater number of ancestors. However, the range of values is small and +we see some cell types have the same value and probably not the same +level of granularity.

+

Below we will look at total number of descendants.

+
# density of descendant counts over every term in the ontology; dashed lines mark the reference terms
+ggplot(data = cl_df, mapping = aes(x = total_descendants)) +
+  geom_density(alpha = 0.5, fill = "#FFCB05") +
+  geom_vline(
+    data = line_df,
+    mapping = aes(xintercept = total_descendants),
+    linetype = 2
+  ) +
+  geom_text(
+    data = line_df,
+    mapping = aes(x = total_descendants, y = 0.6, label = cl_annotation),
+    vjust = -0.5,
+    angle = 90
+  ) +
+  xlab("Number of descendants") +
+  ylab("Density")
+

+

It looks like most cell types have very few descendants, so let’s +zoom into the area below 500 to get a better look.

+
# same descendant density plot, with the x axis limited to 0-500
+ggplot(data = cl_df, mapping = aes(x = total_descendants)) +
+  geom_density(alpha = 0.5, fill = "#FFCB05") +
+  geom_vline(
+    data = line_df,
+    mapping = aes(xintercept = total_descendants),
+    linetype = 2
+  ) +
+  geom_text(
+    data = line_df,
+    mapping = aes(x = total_descendants, y = 0.6, label = cl_annotation),
+    vjust = -0.5,
+    angle = 90
+  ) +
+  xlab("Number of descendants") +
+  ylab("Density") +
+  xlim(0, 500)
+
## Warning: Removed 14 rows containing non-finite outside the scale range (`stat_density()`).
+
## Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_vline()`).
+
## Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_text()`).
+

+

Here we see a much larger range of values and that cell types become +more general as the number of descendants goes up. However, this +distribution alone is probably not helpful in determining a cutoff. In the +next section we will look at this distribution specifically for cell +types present in our references, PanglaoDB and Blueprint Encode.

+
+
+

Latest common ancestor (LCA) between PanglaoDB and Blueprint +encode

+

This section will look at identifying the latest common ancestor +(LCA) between all possible combinations of terms from PanglaoDB (used +for assigning cell types with CellAssign) and the +BlueprintEncodeData reference from celldex +(used for assigning cell types with SingleR). The LCA +refers to the latest term in the cell ontology hierarchy that is common +between two terms. I will use the ontoProc::findCommonAncestors() +function to get the LCA for each combination.

+

Note that it is possible to have more than one LCA for a set of +terms. To start, I will keep all LCA terms found.

+

For each LCA, I will again look at the total number of ancestors and +descendants and see if I can identify an appropriate cutoff. Ultimately, +I would like to see if we can use that cutoff to decide if we should +keep the LCA term as the consensus label or use “Unknown”.

+
# first set up the graph from cl ont
+# use the ontology object loaded in this notebook (`cl_ont`); `cl` is not defined here
+parent_terms <- cl_ont$parents
+# rbind() puts parent IDs in row 1 and the matching child ID in row 2, so each column is one parent/child pair
+# NOTE(review): assumes igraph reads the matrix column by column as parent -> child edges - confirm
+cl_graph <- igraph::make_graph(rbind(unlist(parent_terms), rep(names(parent_terms), lengths(parent_terms))))
+
# get a data frame with all combinations of panglao and blueprint terms
+# one row for each combination 
+# NOTE(review): the ID vectors given to expand.grid() are not de-duplicated and the joins below
+# match multiple rows (see the many-to-many warning), so some pairs appear more than once
+all_ref_df <- expand.grid(panglao_df$panglao_ontology,
+                          blueprint_df$blueprint_ontology) |> 
+  dplyr::rename(
+    panglao_ontology = "Var1",
+    blueprint_ontology = "Var2"
+  ) |> 
+  # add in the human readable values for each ontology term
+  dplyr::left_join(blueprint_df, by = "blueprint_ontology") |> 
+  dplyr::left_join(panglao_df, by = "panglao_ontology") |> 
+  tidyr::drop_na() |> 
+  # rowwise so that findCommonAncestors() is called once per pair of terms
+  dplyr::rowwise() |> 
+  dplyr::mutate(
+    # latest common ancestor(s) (LCA) of the pair, stored as a list because a pair can have several
+    # the ontology graph is passed explicitly as `cl_graph`; `g` is not defined in this notebook
+    lca = list(rownames(ontoProc::findCommonAncestors(blueprint_ontology, panglao_ontology, g = cl_graph)))
+  )
+
## Warning in dplyr::left_join(dplyr::left_join(dplyr::rename(expand.grid(panglao_df$panglao_ontology, : Detected an unexpected many-to-many relationship between `x` and `y`.
+## ℹ Row 49 of `x` matches multiple rows in `y`.
+## ℹ Row 99 of `y` matches multiple rows in `x`.
+## ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
+
## Warning: There were 23859 warnings in `dplyr::mutate()`.
+## The first warning was:
+## ℹ In argument: `lca = list(...)`.
+## ℹ In row 1.
+## Caused by warning in `dim()`:
+## ! The dim() method for DataFrameList objects is deprecated. Please use dims() on these objects instead.
+## ℹ Run `dplyr::last_dplyr_warnings()` to see the 23858 remaining warnings.
+
lca_df <- all_ref_df |> # all_ref_df is still rowwise, so the mutate() below is evaluated once per reference pair
+  dplyr::mutate(
+    total_lca = length(lca), # number of LCA terms found for this pair; max is three terms 
+    lca = paste0(lca, collapse = ",") # collapse the list entry into one comma-separated string to make it easier to split the df 
+  ) |>
+  # split each lca term into its own column; pairs with fewer than three LCAs get NA in the unused columns
+  tidyr::separate(lca, into = c("lca_1", "lca_2", "lca_3"), sep = ",") |> 
+  tidyr::pivot_longer(
+    cols = dplyr::starts_with("lca"),
+    names_to = "lca_number",
+    values_to = "lca"
+  ) |> 
+  tidyr::drop_na() |> # drops rows containing NA, which removes the unused lca_2/lca_3 slots
+  dplyr::select(-lca_number) |> 
+  # account for any cases where the ontology IDs are exact matches by using the shared ID itself as the LCA
+  # r complains about doing this earlier since the lca column holds lists until now
+  dplyr::mutate(lca = dplyr::if_else(blueprint_ontology == panglao_ontology, blueprint_ontology, lca)) |> 
+  # join in information for each of the lca terms including name, number of ancestors and descendants 
+  dplyr::left_join(cl_df, by = c("lca" = "cl_ontology")) 
+
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 7967 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+## 18, 19, 20, ...].
+
+

Distribution of ancestors and descendants

+
# density of ancestor counts for the LCA terms; dashed lines mark the reference terms
+ggplot(data = lca_df, mapping = aes(x = total_ancestors)) +
+  geom_density() +
+  geom_vline(
+    data = ancestor_labels_df,
+    mapping = aes(xintercept = total_ancestors),
+    linetype = 2
+  ) +
+  geom_text(
+    data = ancestor_labels_df,
+    mapping = aes(x = total_ancestors, y = 0.6, label = cl_annotation),
+    vjust = -0.5,
+    angle = 90
+  ) +
+  xlab("Total number of ancestors") +
+  ylab("Density")
+

+
# density of descendant counts for the LCA terms; dashed lines mark the reference terms
+ggplot(data = lca_df, mapping = aes(x = total_descendants)) +
+  geom_density() +
+  geom_vline(
+    data = line_df,
+    mapping = aes(xintercept = total_descendants),
+    linetype = 2
+  ) +
+  geom_text(
+    data = line_df,
+    mapping = aes(x = total_descendants, y = 0.002, label = cl_annotation),
+    vjust = -0.5,
+    angle = 90
+  ) +
+  xlab("Total number of descendants") +
+  ylab("Density")
+

+

Let’s zoom into the area below 1000, since we already know we would +want to exclude anything above that based on this plot.

+
# same LCA descendant density plot, with the x axis limited to 0-1000
+ggplot(data = lca_df, mapping = aes(x = total_descendants)) +
+  geom_density() +
+  geom_vline(
+    data = line_df,
+    mapping = aes(xintercept = total_descendants),
+    linetype = 2
+  ) +
+  geom_text(
+    data = line_df,
+    mapping = aes(x = total_descendants, y = 0.002, label = cl_annotation),
+    vjust = -0.5,
+    angle = 90
+  ) +
+  xlim(0, 1000) +
+  xlab("Total number of descendants") +
+  ylab("Density")
+
## Warning: Removed 6856 rows containing non-finite outside the scale range (`stat_density()`).
+
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_vline()`).
+
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_text()`).
+

+

We can use the vertical lines for cells of interest to help us define +a potential cutoff based on the granularity we would like to see in our +consensus label. We want to be able to label things like T cell, but we +don’t want to label anything as lymphocyte as that’s probably not +helpful. I don’t see any obvious cutoffs that may be present in the +total number of ancestors, but the number of descendants is likely to be +informative. I think it might be a good idea to start by drawing a line +at the local maxima between the T cell and lymphocyte lines on the +number of descendants graph.

+
+
+

Defining a cutoff for number of descendants

+

First we will find the value for the first peak shown in the +distribution. This is likely to be a good cutoff for deciding which LCA +labels to keep.

+
peak_idx <- splus2R::peaks(lca_df$total_descendants) # NOTE(review): peaks() is given the raw per-row values, not a density estimate - confirm this picks the intended peak
+cutoff <- lca_df$total_descendants[peak_idx] |> 
+  min() # find the smallest peak and use that as the cutoff for number of descendants 
+

Below is the list of all consensus cell type labels that we will be +keeping if we were to just use this cutoff.

+
# distinct LCA labels whose descendant count is at or below the cutoff
+celltypes_to_keep <- unique(
+  dplyr::pull(
+    dplyr::filter(lca_df, total_descendants <= cutoff),
+    cl_annotation
+  )
+)
+
+celltypes_to_keep
+
##  [1] "myeloid leukocyte"                   "granulocyte"                         "neutrophil"                         
+##  [4] "blood cell"                          "mononuclear phagocyte"               "progenitor cell"                    
+##  [7] "monocyte"                            "hematopoietic precursor cell"        "T cell"                             
+## [10] "CD4-positive, alpha-beta T cell"     "mature alpha-beta T cell"            "mature T cell"                      
+## [13] "regulatory T cell"                   "memory T cell"                       "natural killer cell"                
+## [16] "innate lymphoid cell"                "B cell"                              "lymphocyte of B lineage"            
+## [19] "mature B cell"                       "naive B cell"                        "memory B cell"                      
+## [22] "somatic stem cell"                   "stem cell"                           "hematopoietic stem cell"            
+## [25] "bone cell"                           "macrophage"                          "erythroid lineage cell"             
+## [28] "megakaryocyte"                       "endothelial cell"                    "lining cell"                        
+## [31] "dendritic cell"                      "eosinophil"                          "plasma cell"                        
+## [34] "chondrocyte"                         "stromal cell"                        "extracellular matrix secreting cell"
+## [37] "fibroblast"                          "smooth muscle cell"                  "muscle cell"                        
+## [40] "melanocyte"                          "cell of skeletal muscle"             "ecto-epithelial cell"               
+## [43] "keratinocyte"                        "squamous epithelial cell"            "epidermal cell"                     
+## [46] "blood vessel endothelial cell"       "microvascular endothelial cell"      "adipocyte"                          
+## [49] "pericyte"                            "perivascular cell"                   "supporting cell"                    
+## [52] "astrocyte"                           "glial cell"                          "macroglial cell"                    
+## [55] "neuron associated cell"              "mesangial cell"
+

We can also look at all the cell types we are keeping and the total +number of descendants to see if there are any that we may not want +to include because the term is too broad.

+
# one row per kept cell type with its descendant count
+plot_celltype_df <- unique(
+  dplyr::select(
+    dplyr::filter(lca_df, cl_annotation %in% celltypes_to_keep),
+    cl_annotation, total_descendants
+  )
+)
+
+# bars ordered by descendant count, one per kept cell type
+ggplot(
+  data = plot_celltype_df,
+  mapping = aes(x = reorder(cl_annotation, total_descendants), y = total_descendants)
+) +
+  geom_col() +
+  theme(axis.text.x = element_text(angle = 90)) +
+  xlab("cell type") +
+  ylab("Total descendants")
+

+

There are a few terms that I think might be more broad than we want +like blood cell, bone cell, +supporting cell, and lining cell. I’m on the +fence about keeping myeloid leukocyte and +progenitor cell. I think if we wanted to remove those terms +we could move our cutoff to be the same number of descendants as +T cell, since we do want to keep that.

+

One could also argue to remove stromal cell or +extracellular matrix secreting cell.

+

Below are tables that look specifically at the combinations of cell +type annotations that resulted in some of the terms that I might +consider removing.

+
+

Blood cell

+
# columns shown in the per-term tables below
+print_df <- dplyr::select(
+  lca_df,
+  blueprint_ontology,
+  blueprint_annotation_main,
+  blueprint_annotation_fine,
+  panglao_ontology,
+  panglao_annotation,
+  total_lca,
+  lca,
+  cl_annotation
+)
+
+# reference pairs whose LCA is "blood cell"
+dplyr::filter(print_df, cl_annotation == "blood cell")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000775NeutrophilsNeutrophilsCL:0000233platelet2CL:0000081blood cell
CL:0000232ErythrocytesErythrocytesCL:0000767basophil2CL:0000081blood cell
CL:0000232ErythrocytesErythrocytesCL:0000771eosinophil2CL:0000081blood cell
CL:0000232ErythrocytesErythrocytesCL:0000775neutrophil2CL:0000081blood cell
CL:0000232ErythrocytesErythrocytesCL:0000233platelet2CL:0000081blood cell
CL:0000771EosinophilsEosinophilsCL:0000233platelet2CL:0000081blood cell
+
+

I think I’m in favor of not having a “blood cell” label, since I’m +not sure that it’s helpful. Also, if two different methods label +something a platelet and a neutrophil, then perhaps that label is +inaccurate and it’s really a tumor cell.

+
+
+

Bone cell

+
# reference pairs whose LCA is "bone cell"
+dplyr::filter(print_df, cl_annotation == "bone cell")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000557HSCGMPCL:0000092osteoclast2CL:0001035bone cell
CL:0000557HSCGMPCL:0000137osteocyte1CL:0001035bone cell
+
+

I think I would also remove bone cell, since hematopoietic stem cells +and osteoclasts seem pretty different to me.

+
+
+

Myeloid leukocyte

+
# reference pairs whose LCA is "myeloid leukocyte"
+dplyr::filter(print_df, cl_annotation == "myeloid leukocyte")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000775NeutrophilsNeutrophilsCL:0000583alveolar macrophage1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000235macrophage1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000097mast cell1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000576monocyte1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000576monocyte1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000092osteoclast1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000091Kupffer cell1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000453Langerhans cell1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000129microglial cell1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000889myeloid suppressor cell1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000576monocyte1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000576monocyte1CL:0000766myeloid leukocyte
CL:0000775NeutrophilsNeutrophilsCL:0000874splenic red pulp macrophage1CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000583alveolar macrophage2CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000767basophil1CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000771eosinophil1CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000235macrophage2CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000097mast cell1CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000775neutrophil1CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000092osteoclast2CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000091Kupffer cell2CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000453Langerhans cell2CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000129microglial cell2CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000889myeloid suppressor cell1CL:0000766myeloid leukocyte
CL:0000576MonocytesMonocytesCL:0000874splenic red pulp macrophage2CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000767basophil1CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000771eosinophil1CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000097mast cell1CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000775neutrophil1CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000092osteoclast2CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000453Langerhans cell3CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000889myeloid suppressor cell1CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000235MacrophagesMacrophagesCL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000767basophil1CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000771eosinophil1CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000097mast cell1CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000775neutrophil1CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000092osteoclast2CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000453Langerhans cell3CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000889myeloid suppressor cell1CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000863MacrophagesMacrophages M1CL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000767basophil1CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000771eosinophil1CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000097mast cell1CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000775neutrophil1CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000092osteoclast2CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000453Langerhans cell3CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000889myeloid suppressor cell1CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000890MacrophagesMacrophages M2CL:0000576monocyte2CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000583alveolar macrophage1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000235macrophage1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000097mast cell1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000576monocyte1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000576monocyte1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000092osteoclast1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000091Kupffer cell1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000453Langerhans cell1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000129microglial cell1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000889myeloid suppressor cell1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000576monocyte1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000576monocyte1CL:0000766myeloid leukocyte
CL:0000771EosinophilsEosinophilsCL:0000874splenic red pulp macrophage1CL:0000766myeloid leukocyte
+
+

I’m torn on this one, because I do think it’s helpful to know if +something is of the myeloid lineage, but if we aren’t keeping lymphocyte +then I would argue we shouldn’t keep myeloid leukocyte.

+
+
+

Progenitor cell

+
# reference pairs whose LCA is "progenitor cell"; only the first 15 are printed because there are many
+head(
+  dplyr::filter(print_df, cl_annotation == "progenitor cell"),
+  n = 15
+)
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000576MonocytesMonocytesCL:0000765erythroblast2CL:0011026progenitor cell
CL:0000576MonocytesMonocytesCL:0000037hematopoietic stem cell2CL:0011026progenitor cell
CL:0000576MonocytesMonocytesCL:0000062osteoblast1CL:0011026progenitor cell
CL:0000576MonocytesMonocytesCL:0000158club cell1CL:0011026progenitor cell
CL:0000576MonocytesMonocytesCL:0000038erythroid progenitor cell2CL:0011026progenitor cell
CL:0000576MonocytesMonocytesCL:4042021neuronal-restricted precursor1CL:0011026progenitor cell
CL:0000576MonocytesMonocytesCL:0002453oligodendrocyte precursor cell1CL:0011026progenitor cell
CL:0000576MonocytesMonocytesCL:0002351progenitor cell of endocrine pancreas1CL:0011026progenitor cell
CL:0000050HSCMEPCL:0000765erythroblast2CL:0011026progenitor cell
CL:0000050HSCMEPCL:0000037hematopoietic stem cell2CL:0011026progenitor cell
CL:0000050HSCMEPCL:0000576monocyte2CL:0011026progenitor cell
CL:0000050HSCMEPCL:0000576monocyte2CL:0011026progenitor cell
CL:0000050HSCMEPCL:0000062osteoblast1CL:0011026progenitor cell
CL:0000050HSCMEPCL:0000158club cell1CL:0011026progenitor cell
CL:0000050HSCMEPCL:0000038erythroid progenitor cell3CL:0011026progenitor cell
+
+

Same with progenitor cell, I do think it could be +helpful to know that something may be a progenitor cell, but when you +have a cell with the label for HSC and the label for cells like +monocytes or osteoblasts, then maybe we are talking about a tumor cell +instead.

+

Along those same lines, I think the below terms, +lining cell and supporting cell, are too broad +even though they have few descendants.

+
+
+

Lining cell

+
# reference pairs whose LCA is "lining cell"
+dplyr::filter(print_df, cl_annotation == "lining cell")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000115Endothelial cellsEndothelial cellsCL:0000077mesothelial cell2CL:0000213lining cell
CL:0000115Endothelial cellsEndothelial cellsCL:0002481peritubular myoid cell2CL:0000213lining cell
CL:0000115Endothelial cellsEndothelial cellsCL:0000216Sertoli cell2CL:0000213lining cell
CL:2000008Endothelial cellsmv Endothelial cellsCL:0000077mesothelial cell2CL:0000213lining cell
CL:2000008Endothelial cellsmv Endothelial cellsCL:0002481peritubular myoid cell2CL:0000213lining cell
CL:2000008Endothelial cellsmv Endothelial cellsCL:0000216Sertoli cell2CL:0000213lining cell
+
+
+
+

Supporting cell

+
# reference pairs whose LCA is "supporting cell"
+dplyr::filter(print_df, cl_annotation == "supporting cell")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000669PericytesPericytesCL:0000216Sertoli cell2CL:0000630supporting cell
CL:0000650Mesangial cellsMesangial cellsCL:0000216Sertoli cell2CL:0000630supporting cell
+
+
+
+
+

Discarded cell types

+

We can also look at what cell type labels we are excluding when using +this cut off to see if there are any terms we might actually want to +keep instead.

+
# distinct LCA labels whose descendant count is above the cutoff
+unique(
+  dplyr::pull(
+    dplyr::filter(lca_df, total_descendants > cutoff),
+    cl_annotation
+  )
+)
+
##  [1] "leukocyte"                            "eukaryotic cell"                      "myeloid cell"                        
+##  [4] "cell"                                 "hematopoietic cell"                   "mononuclear cell"                    
+##  [7] "stuff accumulating cell"              "precursor cell"                       "phagocyte (sensu Vertebrata)"        
+## [10] "defensive cell"                       "lymphocyte"                           "professional antigen presenting cell"
+## [13] "secretory cell"                       "connective tissue cell"               "electrically responsive cell"        
+## [16] "contractile cell"                     "epithelial cell"                      "neuron"                              
+## [19] "neural cell"
+

The only term in this list that I would be concerned about losing is +“neuron”. Let’s look at those combinations.

+
+

Neuron

+
# neuron
+print_df |> 
+  dplyr::filter(cl_annotation == "neuron")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000540NeuronsNeuronsCL:0000109adrenergic neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000108cholinergic neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000166chromaffin cell1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000700dopaminergic neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0007011enteric neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:1001509glycinergic neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000099interneuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000100motor neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000165neuroendocrine cell1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000540neuron0CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0008025noradrenergic neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000210photoreceptor cell1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000740retinal ganglion cell1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000850serotonergic neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:4023169trigeminal neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000695Cajal-Retzius cell1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000617GABAergic neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000679glutamatergic neuron1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000121Purkinje cell1CL:0000540neuron
CL:0000540NeuronsNeuronsCL:0000598pyramidal neuron1CL:0000540neuron
+
+

It looks like there are a lot of types of neurons in the PanglaoDB +reference and only “neuron” as a term in Blueprint. Even though neuron +has ~ 500 descendants, I think we should keep these labels.

+
+
+
+

Removing anything with more than 1 LCA

+

One thing I noticed when looking at the labels that have less than +the cutoff is that most of them are from scenarios where we have +multiple LCAs. Maybe in the case where we have multiple LCAs we are +already too broad and we should just eliminate those matches from the +beginning. Here I’m looking at the total number of descendants for all +terms that show up because a term has multiple LCAs.

+
# rows coming from reference pairs that have more than one LCA
+multi_lca_df <- dplyr::filter(lca_df, total_lca > 1)
+# show each label once with its descendant count, smallest first
+multi_lca_df |>
+  dplyr::select(cl_annotation, total_descendants) |>
+  unique() |>
+  dplyr::arrange(total_descendants)
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
cl_annotationtotal_descendants
bone cell39
blood cell42
perivascular cell42
stromal cell54
supporting cell62
hematopoietic precursor cell106
lining cell121
myeloid leukocyte166
progenitor cell166
mononuclear phagocyte170
phagocyte (sensu Vertebrata)176
contractile cell178
defensive cell200
professional antigen presenting cell213
connective tissue cell224
myeloid cell248
stuff accumulating cell267
precursor cell272
secretory cell458
mononuclear cell504
leukocyte541
electrically responsive cell674
hematopoietic cell685
eukaryotic cell2646
+
+

It looks like most of these terms are pretty broad and are either +much higher than the cutoff or right around the cutoff with a few +exceptions. Things like “bone cell” and “supporting cell” have few +descendants, but I would still argue these are very broad terms and not +useful.

+

I’m going to first filter out any matches that have more than one LCA term and +then use the cutoff to define labels we would keep. I’ll also look to +see what cell types we lose when we add this extra filtering step to be +sure they are ones that we want to lose.

+
# keep only the rows from reference pairs with fewer than two LCAs
+filtered_lca_df <- dplyr::filter(lca_df, total_lca < 2)
+
+# labels that pass the descendant cutoff after that filter
+updated_celltypes <- unique(
+  dplyr::pull(
+    dplyr::filter(filtered_lca_df, total_descendants <= cutoff),
+    cl_annotation
+  )
+)
+
+# labels kept before the extra filter but not after it
+setdiff(celltypes_to_keep, updated_celltypes)
+
## [1] "blood cell"                   "hematopoietic precursor cell" "lining cell"                 
+## [4] "perivascular cell"            "supporting cell"
+

It looks like I am losing a few terms I already said were not +specific and then a few other terms, like “hematopoietic precursor cell” +and “perivascular cell”. I’ll look at both of those to confirm we would +not want them.

+
+

Hematopoietic precursor cell

+
print_df |> # show every pair whose cl_annotation is "hematopoietic precursor cell"
+  dplyr::filter(cl_annotation == "hematopoietic precursor cell")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000050HSCMEPCL:0000037hematopoietic stem cell2CL:0008001hematopoietic precursor cell
CL:0000050HSCMEPCL:0000038erythroid progenitor cell3CL:0008001hematopoietic precursor cell
CL:0000037HSCHSCCL:0000038erythroid progenitor cell2CL:0008001hematopoietic precursor cell
CL:0000837HSCMPPCL:0000037hematopoietic stem cell2CL:0008001hematopoietic precursor cell
CL:0000837HSCMPPCL:0000038erythroid progenitor cell2CL:0008001hematopoietic precursor cell
CL:0000051HSCCLPCL:0000037hematopoietic stem cell2CL:0008001hematopoietic precursor cell
CL:0000051HSCCLPCL:0000038erythroid progenitor cell2CL:0008001hematopoietic precursor cell
CL:0000557HSCGMPCL:0000037hematopoietic stem cell2CL:0008001hematopoietic precursor cell
CL:0000557HSCGMPCL:0000038erythroid progenitor cell3CL:0008001hematopoietic precursor cell
CL:0000049HSCCMPCL:0000037hematopoietic stem cell2CL:0008001hematopoietic precursor cell
CL:0000049HSCCMPCL:0000038erythroid progenitor cell2CL:0008001hematopoietic precursor cell
+
+

It looks like here we should be keeping these matches because both +references have these labels as hematopoietic stem and progenitor cells. +I think in the context of pediatric cancer having this label would be +helpful, so maybe we shouldn’t remove all terms that have more than one LCA.

+

Let’s look at what the other LCA is for an example set.

+
lca_df |> # list every LCA annotation recorded for one example pair of ontology IDs
+  dplyr::filter(panglao_ontology == "CL:0000037" & blueprint_ontology == "CL:0000050") |> # both IDs must match (elementwise &)
+  dplyr::select(blueprint_annotation_main, blueprint_annotation_fine, panglao_annotation, cl_annotation)
+
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
blueprint_annotation_mainblueprint_annotation_finepanglao_annotationcl_annotation
HSCMEPhematopoietic stem cellhematopoietic precursor cell
HSCMEPhematopoietic stem cellprogenitor cell
+
+

It looks like these terms have both +hematopoietic precursor cell and +progenitor cell as LCAs. Personally, I would keep the term +for hematopoietic precursor cell because I think it’s more +informative and specific to the type of progenitor cell.

+
+
+

Perivascular cell

+
print_df |> # show every pair whose cl_annotation is "perivascular cell"
+  dplyr::filter(cl_annotation == "perivascular cell")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000669PericytesPericytesCL:0000359vascular associated smooth muscle cell3CL:4033054perivascular cell
CL:0000669PericytesPericytesCL:0000359vascular associated smooth muscle cell3CL:4033054perivascular cell
CL:0000669PericytesPericytesCL:0000359vascular associated smooth muscle cell3CL:4033054perivascular cell
CL:0000669PericytesPericytesCL:0000359vascular associated smooth muscle cell3CL:4033054perivascular cell
CL:0000650Mesangial cellsMesangial cellsCL:0000359vascular associated smooth muscle cell3CL:4033054perivascular cell
CL:0000650Mesangial cellsMesangial cellsCL:0000359vascular associated smooth muscle cell3CL:4033054perivascular cell
CL:0000650Mesangial cellsMesangial cellsCL:0000359vascular associated smooth muscle cell3CL:4033054perivascular cell
CL:0000650Mesangial cellsMesangial cellsCL:0000359vascular associated smooth muscle cell3CL:4033054perivascular cell
+
+

I would remove perivascular cell, since the cell type +labels from PanglaoDB and Blueprint are pretty different from each +other.

+
+
+
+
+

Similarity index

+

An alternative approach would be to calculate the similarity +index between each pair of terms and define a cutoff above which a pair of +terms is considered similar. This is a value on a 0-1 scale where 0 indicates no +similarity and 1 indicates the terms are equal.

+

Although this could provide a metric that we could use to define +similar cell types, we would still have to identify the label to use +which would most likely be the LCA. Even if the similarity index is +close to 1, if the LCA term is not informative then I don’t know that we +would want to use that.

+

However, we could use this to finalize the actual pairs of terms that +we trust. For example, if the LCA for a pair is T cell we +can look at the similarity index to confirm that specific pair of terms +has high similarity.

+

Below I’ll calculate the similarity index for each set of terms and +plot the distribution. Then we will look at the values for pairs that +have an LCA that passes the total descendants threshold we set to see if +those pairs have a higher similarity index.

+
information_content <- ontologySimilarity::descendants_IC(cl_ont) # NOTE(review): not passed to get_sim_grid() below; confirm whether it should be supplied
+
+# get similarity index for each PanglaoDB / Blueprint term pair in lca_df
+si_df <- lca_df |> 
+  dplyr::rowwise() |> # evaluate the mutate once per row; NOTE(review): no ungroup() follows, so si_df stays rowwise
+  dplyr::mutate(
+    similarity_index = ontologySimilarity::get_sim_grid(ontology = cl_ont,
+                                                        term_sets = list(panglao_ontology, blueprint_ontology)) |> # the two term sets are this row's panglao and blueprint IDs
+      ontologySimilarity::get_sim() # get_sim() result is stored as similarity_index
+  )
+
si_df <- si_df |> # label each pair by whether its LCA passes the total-descendants cutoff
+  dplyr::mutate(
+    lca_threshold = dplyr::if_else(total_descendants <= cutoff, "PASS", "FAIL") # inclusive, matching the `total_descendants <= cutoff` filter used to build the keep list
+  )
+# density of the similarity index, one filled curve per PASS/FAIL group
+ggplot(si_df, aes(x = similarity_index, fill = lca_threshold)) +
+  geom_density(bw = 0.05, alpha = 0.5) + # semi-transparent so overlapping groups stay visible
+  labs(
+    x = "Similarity index",
+    y = "Density"
+  )
+

+

This looks as I expected with most of the pairs that pass the total +descendants cutoff having a higher similarity index than those that do +not pass. There is still some overlap though so perhaps even if a set of +terms shares an LCA that passes the threshold, the actual terms being +compared may be further apart than we would like.

+

Now let’s look at the similarity index for various LCA terms. Here +each LCA term is its own plot and the vertical lines are the similarity +index for each pair of terms that results in that LCA.

+
celltypes_to_plot <- c("myeloid leukocyte", "T cell", "cell", "supporting cell", "B cell") # cl_annotation values that each get their own plot
+
+celltypes_to_plot |> 
+  purrr::map(\(celltype){ # returns a list with one ggplot per entry of celltypes_to_plot
+    line_df <- si_df |> 
+      dplyr::filter(cl_annotation == celltype) |> # pairs whose cl_annotation is this cell type
+      dplyr::select(cl_annotation, similarity_index) |> 
+      unique() # one row (one vertical line) per distinct similarity value
+    
+    ggplot(si_df, aes(x = similarity_index)) + # background density uses all of si_df, not only this cell type
+      geom_density() +
+      geom_vline(data = line_df,
+                 mapping = aes(xintercept = similarity_index), 
+                 lty = 2) + # dashed line at each similarity value in line_df
+      labs(
+        x = "Similarity index",
+        y = "Density",
+        title = celltype
+      )
+    
+  })
+
## [[1]]
+

+
## 
+## [[2]]
+

+
## 
+## [[3]]
+

+
## 
+## [[4]]
+

+
## 
+## [[5]]
+

+

It looks like terms that are more granular like T and B cell have +higher similarity index values than terms that are less granular which +is what we would expect. However, within terms like myeloid leukocyte +and even T cell we do see a range of values. We could dig deeper into +which pairs are resulting in which similarity index values if we wanted +to, but I think that might be a future direction if we feel like the +similarity index is something that could be useful.

+
+
+

Conclusions

+

Based on these findings, I think it might be best to create a +reference that has all possible pairs of labels between PanglaoDB and +Blueprint Encode and the resulting consensus label for those pairs. To +do this we could come up with a whitelist of LCA terms that we would be +comfortable including and all other cell types would be unknowns. I +would use the following criteria to come up with my whitelist:

+
    +
  • Pairs should not have more than 1 LCA, with the exception of the +matches that have the label hematopoietic precursor cell.
  • +
  • The LCA should have 170 or fewer total +descendants.
  • +
  • We should include the term for neuron even though it +has 500 descendants.
  • +
  • Terms that are too broad (like supporting cell, +blood cell, bone cell, +lining cell) should be removed.
  • +
+

Alternatively, rather than eliminate terms that are too broad we +could look at the similarity index for individual matches and decide on +a case by case basis if those should be allowed. Although I still think +having a term that is too broad, even if it’s a good match, is not super +informative.

+
+
+

Session info

+
sessionInfo() # print the R version, platform, and attached/loaded package versions for this session
+
## R version 4.4.2 (2024-10-31)
+## Platform: aarch64-apple-darwin20
+## Running under: macOS Sonoma 14.4
+## 
+## Matrix products: default
+## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## time zone: America/Chicago
+## tzcode source: internal
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices datasets  utils     methods   base     
+## 
+## other attached packages:
+## [1] ggplot2_3.5.1
+## 
+## loaded via a namespace (and not attached):
+##   [1] RColorBrewer_1.1-3          jsonlite_1.8.9              magrittr_2.0.3              gypsum_1.2.0               
+##   [5] farver_2.1.2                rmarkdown_2.29              zlibbioc_1.52.0             vctrs_0.6.5                
+##   [9] memoise_2.0.1               DelayedMatrixStats_1.28.0   htmltools_0.5.8.1           S4Arrays_1.6.0             
+##  [13] polynom_1.4-1               AnnotationHub_3.14.0        curl_6.0.1                  Rhdf5lib_1.28.0            
+##  [17] SparseArray_1.6.0           rhdf5_2.50.0                sass_0.4.9                  alabaster.base_1.6.1       
+##  [21] bslib_0.8.0                 htmlwidgets_1.6.4           httr2_1.0.7                 cachem_1.1.0               
+##  [25] igraph_2.1.1                mime_0.12                   lifecycle_1.0.4             pkgconfig_2.0.3            
+##  [29] Matrix_1.7-1                R6_2.5.1                    fastmap_1.2.0               GenomeInfoDbData_1.2.13    
+##  [33] MatrixGenerics_1.18.0       shiny_1.9.1                 digest_0.6.37               colorspace_2.1-1           
+##  [37] AnnotationDbi_1.68.0        S4Vectors_0.44.0            rprojroot_2.0.4             ExperimentHub_2.14.0       
+##  [41] GenomicRanges_1.58.0        RSQLite_2.3.9               filelock_1.0.3              labeling_0.4.3             
+##  [45] fansi_1.0.6                 httr_1.4.7                  polyclip_1.10-7             abind_1.4-8                
+##  [49] compiler_4.4.2              bit64_4.5.2                 withr_3.0.2                 DBI_1.2.3                  
+##  [53] ontologySimilarity_2.7      HDF5Array_1.34.0            ggforce_0.4.2               alabaster.ranges_1.6.0     
+##  [57] alabaster.schemas_1.6.0     MASS_7.3-61                 quantreg_5.99.1             rappdirs_0.3.3             
+##  [61] DelayedArray_0.32.0         ggpp_0.5.8-1                tools_4.4.2                 httpuv_1.6.15              
+##  [65] glue_1.8.0                  rhdf5filters_1.18.0         promises_1.3.2              grid_4.4.2                 
+##  [69] generics_0.1.3              gtable_0.3.6                tzdb_0.4.0                  tidyr_1.3.1                
+##  [73] hms_1.1.3                   utf8_1.2.4                  XVector_0.46.0              BiocGenerics_0.52.0        
+##  [77] BiocVersion_3.20.0          pillar_1.9.0                stringr_1.5.1               vroom_1.6.5                
+##  [81] later_1.4.1                 splines_4.4.2               dplyr_1.1.4                 tweenr_2.0.3               
+##  [85] BiocFileCache_2.14.0        lattice_0.22-6              survival_3.7-0              renv_1.0.11                
+##  [89] bit_4.5.0.1                 SparseM_1.84-2              tidyselect_1.2.1            Biostrings_2.74.0          
+##  [93] knitr_1.49                  ggpmisc_0.6.1               IRanges_2.40.0              ontologyPlot_1.7           
+##  [97] SummarizedExperiment_1.36.0 stats4_4.4.2                xfun_0.49                   Biobase_2.66.0             
+## [101] matrixStats_1.4.1           DT_0.33                     stringi_1.8.4               UCSC.utils_1.2.0           
+## [105] paintmap_1.0                yaml_2.3.10                 evaluate_1.0.1              tibble_3.2.1               
+## [109] Rgraphviz_2.50.0            alabaster.matrix_1.6.1      BiocManager_1.30.25         graph_1.84.0               
+## [113] cli_3.6.3                   ontologyIndex_2.12          xtable_1.8-4                reticulate_1.40.0          
+## [117] jquerylib_0.1.4             munsell_0.5.1               Rcpp_1.0.13-1               GenomeInfoDb_1.42.1        
+## [121] dbplyr_2.5.0                ontoProc_2.0.0              png_0.1-8                   parallel_4.4.2             
+## [125] MatrixModels_0.5-3          readr_2.1.5                 blob_1.2.4                  splus2R_1.3-5              
+## [129] sparseMatrixStats_1.18.0    alabaster.se_1.6.0          scales_1.3.0                purrr_1.0.2                
+## [133] crayon_1.5.3                rlang_1.1.4                 KEGGREST_1.46.0             celldex_1.16.0
+
+ + + + +
+ + + + + + + + + + + + + + + + From c011170fc82441f26bc9dde40cbfdd939405b76e Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 12 Dec 2024 13:00:27 -0600 Subject: [PATCH 05/15] update lockfile --- analyses/cell-type-consensus/renv.lock | 1815 ++++++++++++++++++++++-- 1 file changed, 1693 insertions(+), 122 deletions(-) diff --git a/analyses/cell-type-consensus/renv.lock b/analyses/cell-type-consensus/renv.lock index 4542a7000..e189ec681 100644 --- a/analyses/cell-type-consensus/renv.lock +++ b/analyses/cell-type-consensus/renv.lock @@ -32,6 +32,50 @@ "Version": "3.19" }, "Packages": { + "AnnotationDbi": { + "Package": "AnnotationDbi", + "Version": "1.68.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "Biobase", + "BiocGenerics", + "DBI", + "IRanges", + "KEGGREST", + "R", + "RSQLite", + "S4Vectors", + "methods", + "stats", + "stats4" + ], + "Hash": "62ed471119c2fe7898c1feaa05d397dc" + }, + "AnnotationHub": { + "Package": "AnnotationHub", + "Version": "3.14.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "AnnotationDbi", + "BiocFileCache", + "BiocGenerics", + "BiocManager", + "BiocVersion", + "RSQLite", + "S4Vectors", + "curl", + "dplyr", + "grDevices", + "httr", + "methods", + "rappdirs", + "utils", + "yaml" + ], + "Hash": "07a8b7f7a8a23998324a40eab02a44e7" + }, "Biobase": { "Package": "Biobase", "Version": "2.66.0", @@ -45,6 +89,26 @@ ], "Hash": "f6e716bdfed8acfd2d4137be7d4fa8f9" }, + "BiocFileCache": { + "Package": "BiocFileCache", + "Version": "2.14.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "DBI", + "R", + "RSQLite", + "curl", + "dbplyr", + "dplyr", + "filelock", + "httr", + "methods", + "stats", + "utils" + ], + "Hash": "41f12a5ef4f6ea211228b1f84a72bba3" + }, "BiocGenerics": { "Package": "BiocGenerics", "Version": "0.52.0", @@ -79,227 +143,1465 @@ ], "Hash": "3c70eb3b78929c0ee452350cea8432a5" }, - "R6": { - "Package": "R6", 
- "Version": "2.5.1", + "Biostrings": { + "Package": "Biostrings", + "Version": "2.74.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "GenomeInfoDb", + "IRanges", + "R", + "S4Vectors", + "XVector", + "crayon", + "grDevices", + "methods", + "stats", + "utils" + ], + "Hash": "0f10c15e1017bde87734c980b27dea1f" + }, + "DBI": { + "Package": "DBI", + "Version": "1.2.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "R" + "R", + "methods" ], - "Hash": "470851b6d5d0ac559e9d01bb352b4021" + "Hash": "065ae649b05f1ff66bb0c793107508f5" }, - "askpass": { - "Package": "askpass", - "Version": "1.2.1", + "DT": { + "Package": "DT", + "Version": "0.33", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "sys" + "crosstalk", + "htmltools", + "htmlwidgets", + "httpuv", + "jquerylib", + "jsonlite", + "magrittr", + "promises" ], - "Hash": "c39f4155b3ceb1a9a2799d700fbd4b6a" + "Hash": "64ff3427f559ce3f2597a4fe13255cb6" }, - "bit": { - "Package": "bit", - "Version": "4.5.0", + "DelayedArray": { + "Package": "DelayedArray", + "Version": "0.32.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "IRanges", + "Matrix", + "MatrixGenerics", + "R", + "S4Arrays", + "S4Vectors", + "SparseArray", + "methods", + "stats", + "stats4" + ], + "Hash": "c4f42dda8d17648382f46b5d0e8a962a" + }, + "DelayedMatrixStats": { + "Package": "DelayedMatrixStats", + "Version": "1.28.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "DelayedArray", + "IRanges", + "Matrix", + "MatrixGenerics", + "S4Vectors", + "SparseArray", + "methods", + "sparseMatrixStats" + ], + "Hash": "276f7fc6bd85f0bf25f8358609894e9c" + }, + "ExperimentHub": { + "Package": "ExperimentHub", + "Version": "2.14.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "AnnotationHub", + "BiocFileCache", + "BiocGenerics", + 
"BiocManager", + "S4Vectors", + "methods", + "rappdirs", + "utils" + ], + "Hash": "30a37fca92cf663bb4b2f4e2343fae5f" + }, + "GenomeInfoDb": { + "Package": "GenomeInfoDb", + "Version": "1.42.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "GenomeInfoDbData", + "IRanges", + "R", + "S4Vectors", + "UCSC.utils", + "methods", + "stats", + "stats4", + "utils" + ], + "Hash": "65f7ac310373771d6f956fc0e813a215" + }, + "GenomeInfoDbData": { + "Package": "GenomeInfoDbData", + "Version": "1.2.13", + "Source": "Bioconductor", + "Requirements": [ + "R" + ], + "Hash": "51962084ec5754c349f8aff4d6d709bf" + }, + "GenomicRanges": { + "Package": "GenomicRanges", + "Version": "1.58.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "GenomeInfoDb", + "IRanges", + "R", + "S4Vectors", + "XVector", + "methods", + "stats", + "stats4", + "utils" + ], + "Hash": "41a8ef4550a7da29749cb739b8e701be" + }, + "HDF5Array": { + "Package": "HDF5Array", + "Version": "1.34.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "DelayedArray", + "IRanges", + "Matrix", + "R", + "Rhdf5lib", + "S4Arrays", + "S4Vectors", + "SparseArray", + "methods", + "rhdf5", + "rhdf5filters", + "stats", + "tools", + "utils" + ], + "Hash": "e4ac21c3b0704e812d750cc69dad41ef" + }, + "IRanges": { + "Package": "IRanges", + "Version": "2.40.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "R", + "S4Vectors", + "methods", + "stats", + "stats4", + "utils" + ], + "Hash": "ecc5317f9624d9992f36e3a900a8ec3b" + }, + "KEGGREST": { + "Package": "KEGGREST", + "Version": "1.46.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "Biostrings", + "R", + "httr", + "methods", + "png" + ], + "Hash": "55259706a0783463e937c71b998407b7" + }, + "MASS": { + "Package": "MASS", + 
"Version": "7.3-61", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "R" + "R", + "grDevices", + "graphics", + "methods", + "stats", + "utils" ], - "Hash": "5dc7b2677d65d0e874fc4aaf0e879987" + "Hash": "0cafd6f0500e5deba33be22c46bf6055" }, - "bit64": { - "Package": "bit64", - "Version": "4.5.2", + "Matrix": { + "Package": "Matrix", + "Version": "1.7-1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", - "bit", + "grDevices", + "graphics", + "grid", + "lattice", "methods", "stats", "utils" ], - "Hash": "e84984bf5f12a18628d9a02322128dfd" + "Hash": "5122bb14d8736372411f955e1b16bc8a" }, - "cli": { - "Package": "cli", - "Version": "3.6.3", + "MatrixGenerics": { + "Package": "MatrixGenerics", + "Version": "1.18.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "matrixStats", + "methods" + ], + "Hash": "34b71fa5563032c43a7741ba04a7b145" + }, + "R.methodsS3": { + "Package": "R.methodsS3", + "Version": "1.8.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", "utils" ], - "Hash": "b21916dd77a27642b447374a5d30ecf3" + "Hash": "278c286fd6e9e75d0c2e8f731ea445c8" }, - "clipr": { - "Package": "clipr", - "Version": "0.8.0", + "R.oo": { + "Package": "R.oo", + "Version": "1.27.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R.methodsS3", + "methods", + "utils" + ], + "Hash": "6ac79ff194202248cf946fe3a5d6d498" + }, + "R.utils": { + "Package": "R.utils", + "Version": "2.12.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ + "R", + "R.methodsS3", + "R.oo", + "methods", + "tools", "utils" ], - "Hash": "3f038e5ac7f41d4ac41ce658c85e3042" + "Hash": "3dc2829b790254bfba21e60965787651" }, - "cpp11": { - "Package": "cpp11", - "Version": "0.5.0", + "R6": { + "Package": "R6", + "Version": "2.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "470851b6d5d0ac559e9d01bb352b4021" + }, + "RColorBrewer": { 
+ "Package": "RColorBrewer", + "Version": "1.1-3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "45f0398006e83a5b10b72a90663d8d8c" + }, + "RSQLite": { + "Package": "RSQLite", + "Version": "2.3.9", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "DBI", + "R", + "bit64", + "blob", + "cpp11", + "memoise", + "methods", + "pkgconfig", + "plogr", + "rlang" + ], + "Hash": "52294139fc7a21bca806b49ae2f315ca" + }, + "Rcpp": { + "Package": "Rcpp", + "Version": "1.0.13-1", "Source": "Repository", "Repository": "CRAN", + "Requirements": [ + "methods", + "utils" + ], + "Hash": "6b868847b365672d6c1677b1608da9ed" + }, + "RcppTOML": { + "Package": "RcppTOML", + "Version": "0.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp" + ], + "Hash": "c232938949fcd8126034419cc529333a" + }, + "Rgraphviz": { + "Package": "Rgraphviz", + "Version": "2.50.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "R", + "grDevices", + "graph", + "graphics", + "grid", + "methods", + "stats4", + "utils" + ], + "Hash": "a79fe80031459fe9fe60e427780b877c" + }, + "Rhdf5lib": { + "Package": "Rhdf5lib", + "Version": "1.28.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", "Requirements": [ "R" ], - "Hash": "91570bba75d0c9d3f1040c835cee8fba" + "Hash": "f5a052c32d0479a1c12460a4f45a2bfe" }, - "crayon": { - "Package": "crayon", - "Version": "1.5.3", + "S4Arrays": { + "Package": "S4Arrays", + "Version": "1.6.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "IRanges", + "Matrix", + "R", + "S4Vectors", + "abind", + "crayon", + "methods", + "stats" + ], + "Hash": "53b78397b6a584e74ded9d2e369b0eec" + }, + "S4Vectors": { + "Package": "S4Vectors", + "Version": "0.44.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "R", + "methods", + 
"stats", + "stats4", + "utils" + ], + "Hash": "20149fcead4dd6f9da3605f98b5220fc" + }, + "SparseArray": { + "Package": "SparseArray", + "Version": "1.6.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "IRanges", + "Matrix", + "MatrixGenerics", + "R", + "S4Arrays", + "S4Vectors", + "XVector", + "matrixStats", + "methods", + "stats", + "utils" + ], + "Hash": "095594ebd5fb811468d20b94af1fc248" + }, + "SummarizedExperiment": { + "Package": "SummarizedExperiment", + "Version": "1.36.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "Biobase", + "BiocGenerics", + "DelayedArray", + "GenomeInfoDb", + "GenomicRanges", + "IRanges", + "Matrix", + "MatrixGenerics", + "R", + "S4Arrays", + "S4Vectors", + "methods", + "stats", + "tools", + "utils" + ], + "Hash": "5acddb171281d0859c6610d374eacbee" + }, + "UCSC.utils": { + "Package": "UCSC.utils", + "Version": "1.2.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "S4Vectors", + "httr", + "jsonlite", + "methods", + "stats" + ], + "Hash": "499f71d1787a61fe69c8805798650777" + }, + "V8": { + "Package": "V8", + "Version": "6.0.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "Rcpp", + "curl", + "jsonlite", + "utils" + ], + "Hash": "6603bfcbc7883a5fed41fb13042a3899" + }, + "XVector": { + "Package": "XVector", + "Version": "0.46.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "IRanges", + "R", + "S4Vectors", + "methods", + "tools", + "utils", + "zlibbioc" + ], + "Hash": "fc9af0d482076d1eace4405c44cfecfb" + }, + "abind": { + "Package": "abind", + "Version": "1.4-8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods", + "utils" + ], + "Hash": "2288423bb0f20a457800d7fc47f6aa54" + }, + "alabaster.base": { + "Package": "alabaster.base", + "Version": "1.6.1", + "Source": 
"Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "Rcpp", + "Rhdf5lib", + "S4Vectors", + "alabaster.schemas", + "jsonlite", + "jsonvalidate", + "methods", + "rhdf5", + "utils" + ], + "Hash": "d6d3782a497af655f883f11bf4e44f55" + }, + "alabaster.matrix": { + "Package": "alabaster.matrix", + "Version": "1.6.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "DelayedArray", + "HDF5Array", + "Matrix", + "Rcpp", + "S4Arrays", + "S4Vectors", + "SparseArray", + "alabaster.base", + "methods", + "rhdf5" + ], + "Hash": "987f69b11522a3adeeda99647bb487f0" + }, + "alabaster.ranges": { + "Package": "alabaster.ranges", + "Version": "1.6.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "GenomeInfoDb", + "GenomicRanges", + "IRanges", + "S4Vectors", + "alabaster.base", + "methods", + "rhdf5" + ], + "Hash": "8ba5c308670264dc8d39e44e5c4cd257" + }, + "alabaster.schemas": { + "Package": "alabaster.schemas", + "Version": "1.6.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Hash": "6ac19a759e604dccafff793cab4c7059" + }, + "alabaster.se": { + "Package": "alabaster.se", + "Version": "1.6.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "GenomicRanges", + "IRanges", + "S4Vectors", + "SummarizedExperiment", + "alabaster.base", + "alabaster.matrix", + "alabaster.ranges", + "jsonlite", + "methods" + ], + "Hash": "fc60805f12fdb322deee50173f0a7280" + }, + "askpass": { + "Package": "askpass", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "sys" + ], + "Hash": "c39f4155b3ceb1a9a2799d700fbd4b6a" + }, + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "543776ae6848fde2f48ff3816d0628bc" + }, + "basilisk": { + "Package": 
"basilisk", + "Version": "1.18.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "basilisk.utils", + "dir.expiry", + "methods", + "parallel", + "reticulate", + "utils" + ], + "Hash": "dd15bf5b8704dff7e586a05994d598dd" + }, + "basilisk.utils": { + "Package": "basilisk.utils", + "Version": "1.18.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "dir.expiry", + "methods", + "tools", + "utils" + ], + "Hash": "73361f44874bfcecf54f7ba9238612d1" + }, + "bit": { + "Package": "bit", + "Version": "4.5.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "f89f074e0e49bf1dbe3eba0a15a91476" + }, + "bit64": { + "Package": "bit64", + "Version": "4.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "e84984bf5f12a18628d9a02322128dfd" + }, + "blob": { + "Package": "blob", + "Version": "1.2.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods", + "rlang", + "vctrs" + ], + "Hash": "40415719b5a479b87949f3aa0aee737c" + }, + "bslib": { + "Package": "bslib", + "Version": "0.8.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "base64enc", + "cachem", + "fastmap", + "grDevices", + "htmltools", + "jquerylib", + "jsonlite", + "lifecycle", + "memoise", + "mime", + "rlang", + "sass" + ], + "Hash": "b299c6741ca9746fb227debcb0f9fb6c" + }, + "cachem": { + "Package": "cachem", + "Version": "1.1.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "fastmap", + "rlang" + ], + "Hash": "cd9a672193789068eb5a2aad65a0dedf" + }, + "celldex": { + "Package": "celldex", + "Version": "1.16.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "AnnotationDbi", + "AnnotationHub", + "DBI", + "DelayedArray", + "DelayedMatrixStats", + "ExperimentHub", + "Matrix", + "RSQLite", + 
"S4Vectors", + "SummarizedExperiment", + "alabaster.base", + "alabaster.matrix", + "alabaster.se", + "gypsum", + "jsonlite", + "methods", + "utils" + ], + "Hash": "ad94ecdbb71beaf0023e17cdf5ad507b" + }, + "cli": { + "Package": "cli", + "Version": "3.6.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "b21916dd77a27642b447374a5d30ecf3" + }, + "clipr": { + "Package": "clipr", + "Version": "0.8.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "utils" + ], + "Hash": "3f038e5ac7f41d4ac41ce658c85e3042" + }, + "colorspace": { + "Package": "colorspace", + "Version": "2.1-1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "graphics", + "methods", + "stats" + ], + "Hash": "d954cb1c57e8d8b756165d7ba18aa55a" + }, + "commonmark": { + "Package": "commonmark", + "Version": "1.9.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "14eb0596f987c71535d07c3aff814742" + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "9df43854f1c84685d095ed6270b52387" + }, + "crayon": { + "Package": "crayon", + "Version": "1.5.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "grDevices", + "methods", + "utils" + ], + "Hash": "859d96e65ef198fd43e82b9628d593ef" + }, + "crosstalk": { + "Package": "crosstalk", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "htmltools", + "jsonlite", + "lazyeval" + ], + "Hash": "ab12c7b080a57475248a30f4db6298c0" + }, + "curl": { + "Package": "curl", + "Version": "6.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "e8ba62486230951fcd2b881c5be23f96" + }, + "dbplyr": { + "Package": "dbplyr", + "Version": "2.5.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "DBI", + "R", + "R6", + 
"blob", + "cli", + "dplyr", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "purrr", + "rlang", + "tibble", + "tidyr", + "tidyselect", + "utils", + "vctrs", + "withr" + ], + "Hash": "39b2e002522bfd258039ee4e889e0fd1" + }, + "digest": { + "Package": "digest", + "Version": "0.6.37", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "33698c4b3127fc9f506654607fb73676" + }, + "dir.expiry": { + "Package": "dir.expiry", + "Version": "1.14.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "filelock", + "utils" + ], + "Hash": "633bd175d10797773fa73669d2bda69c" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "rlang", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" + }, + "evaluate": { + "Package": "evaluate", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "3fd29944b231036ad67c3edb32e02201" + }, + "fansi": { + "Package": "fansi", + "Version": "1.0.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "utils" + ], + "Hash": "962174cf2aeb5b9eea581522286a911f" + }, + "farver": { + "Package": "farver", + "Version": "2.1.2", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "680887028577f3fa2a81e410ed0d6e42" + }, + "fastmap": { + "Package": "fastmap", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "aa5e1cd11c2d15497494c5292d7ffcc8" + }, + "filelock": { + "Package": "filelock", + "Version": "1.0.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "192053c276525c8495ccfd523aa8f2d1" + }, + "fontawesome": { + "Package": "fontawesome", + "Version": 
"0.5.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "htmltools", + "rlang" + ], + "Hash": "bd1297f9b5b1fc1372d19e2c4cd82215" + }, + "fs": { + "Package": "fs", + "Version": "1.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "7f48af39fa27711ea5fbd183b399920d" + }, + "generics": { + "Package": "generics", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + }, + "ggplot2": { + "Package": "ggplot2", + "Version": "3.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "MASS", + "R", + "cli", + "glue", + "grDevices", + "grid", + "gtable", + "isoband", + "lifecycle", + "mgcv", + "rlang", + "scales", + "stats", + "tibble", + "vctrs", + "withr" + ], + "Hash": "44c6a2f8202d5b7e878ea274b1092426" + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "5899f1eaa825580172bb56c08266f37c" + }, + "graph": { + "Package": "graph", + "Version": "1.84.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "BiocGenerics", + "R", + "methods", + "stats", + "stats4", + "utils" + ], + "Hash": "2ec1bffb3481ff1934c4dabedaa2d5d8" + }, + "gtable": { + "Package": "gtable", + "Version": "0.3.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "grid", + "lifecycle", + "rlang", + "stats" + ], + "Hash": "de949855009e2d4d0e52a844e30617ae" + }, + "gypsum": { + "Package": "gypsum", + "Version": "1.2.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "filelock", + "httr2", + "jsonlite", + "parallel", + "rappdirs", + "utils" + ], + "Hash": "52a3e577fce593a8411a5bd301cae574" + }, + "here": { + "Package": "here", + "Version": "1.0.1", + "Source": 
"Repository", + "Repository": "RSPM", + "Requirements": [ + "rprojroot" + ], + "Hash": "24b224366f9c2e7534d2344d10d59211" + }, + "highr": { + "Package": "highr", + "Version": "0.11", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "xfun" + ], + "Hash": "d65ba49117ca223614f71b60d85b8ab7" + }, + "hms": { + "Package": "hms", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "lifecycle", + "methods", + "pkgconfig", + "rlang", + "vctrs" + ], + "Hash": "b59377caa7ed00fa41808342002138f9" + }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.8.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "base64enc", + "digest", + "fastmap", + "grDevices", + "rlang", + "utils" + ], + "Hash": "81d371a9cc60640e74e4ab6ac46dcedc" + }, + "htmlwidgets": { + "Package": "htmlwidgets", + "Version": "1.6.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "grDevices", + "htmltools", + "jsonlite", + "knitr", + "rmarkdown", + "yaml" + ], + "Hash": "04291cc45198225444a397606810ac37" + }, + "httpuv": { + "Package": "httpuv", + "Version": "1.6.15", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "Rcpp", + "later", + "promises", + "utils" + ], + "Hash": "d55aa087c47a63ead0f6fc10f8fa1ee0" + }, + "httr": { + "Package": "httr", + "Version": "1.4.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "curl", + "jsonlite", + "mime", + "openssl" + ], + "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" + }, + "httr2": { + "Package": "httr2", + "Version": "1.0.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "curl", + "glue", + "lifecycle", + "magrittr", + "openssl", + "rappdirs", + "rlang", + "vctrs", + "withr" + ], + "Hash": "5a76da345ed4f3e6430517e08441edaf" + }, + "igraph": { + "Package": "igraph", + "Version": "2.1.1", + "Source": "Repository", 
+ "Repository": "CRAN", + "Requirements": [ + "Matrix", + "R", + "cli", + "cpp11", + "grDevices", + "graphics", + "lifecycle", + "magrittr", + "methods", + "pkgconfig", + "rlang", + "stats", + "utils", + "vctrs" + ], + "Hash": "c03878b48737a0e2da3b772d7b2e22da" + }, + "isoband": { + "Package": "isoband", + "Version": "0.2.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "grid", + "utils" + ], + "Hash": "0080607b4a1a7b28979aecef976d8bc2" + }, + "jquerylib": { + "Package": "jquerylib", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "htmltools" + ], + "Hash": "5aab57a3bd297eee1c1d862735972182" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.9", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods" + ], + "Hash": "4e993b65c2c3ffbffce7bb3e2c6f832b" + }, + "jsonvalidate": { + "Package": "jsonvalidate", + "Version": "1.3.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "V8" + ], + "Hash": "cdc2843ef7f44f157198bb99aea7552d" + }, + "knitr": { + "Package": "knitr", + "Version": "1.49", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "evaluate", + "highr", + "methods", + "tools", + "xfun", + "yaml" + ], + "Hash": "9fcb189926d93c636dea94fbe4f44480" + }, + "labeling": { + "Package": "labeling", + "Version": "0.4.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "graphics", + "stats" + ], + "Hash": "b64ec208ac5bc1852b285f665d6368b3" + }, + "later": { + "Package": "later", + "Version": "1.4.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "Rcpp", + "rlang" + ], + "Hash": "501744395cac0bab0fbcfab9375ae92c" + }, + "lattice": { + "Package": "lattice", + "Version": "0.22-6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "stats", + "utils" + ], + "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" 
+ }, + "lazyeval": { + "Package": "lazyeval", + "Version": "0.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d908914ae53b04d4c0c0fd72ecc35370" + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "rlang" + ], + "Hash": "b8552d117e1b808b09a832f589b79035" + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "grDevices", - "methods", - "utils" + "R" ], - "Hash": "859d96e65ef198fd43e82b9628d593ef" + "Hash": "7ce2733a9826b3aeb1775d56fd305472" }, - "curl": { - "Package": "curl", - "Version": "6.0.1", + "matrixStats": { + "Package": "matrixStats", + "Version": "1.4.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R" ], - "Hash": "e8ba62486230951fcd2b881c5be23f96" + "Hash": "8885ffb1f46e820dede6b2ca9442abca" }, - "dplyr": { - "Package": "dplyr", - "Version": "1.1.4", + "memoise": { + "Package": "memoise", + "Version": "2.0.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "R", - "R6", - "cli", - "generics", - "glue", - "lifecycle", - "magrittr", - "methods", - "pillar", - "rlang", - "tibble", - "tidyselect", - "utils", - "vctrs" + "cachem", + "rlang" ], - "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" + "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" }, - "fansi": { - "Package": "fansi", - "Version": "1.0.6", + "mgcv": { + "Package": "mgcv", + "Version": "1.9-1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ + "Matrix", "R", - "grDevices", + "graphics", + "methods", + "nlme", + "splines", + "stats", "utils" ], - "Hash": "962174cf2aeb5b9eea581522286a911f" + "Hash": "110ee9d83b496279960e162ac97764ce" }, - "generics": { - "Package": "generics", - "Version": "0.1.3", + "mime": { + "Package": "mime", + "Version": "0.12", "Source": "Repository", "Repository": 
"RSPM", "Requirements": [ - "R", - "methods" + "tools" ], - "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + "Hash": "18e9c28c1d3ca1560ce30658b22ce104" }, - "glue": { - "Package": "glue", - "Version": "1.8.0", + "munsell": { + "Package": "munsell", + "Version": "0.5.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "R", + "colorspace", "methods" ], - "Hash": "5899f1eaa825580172bb56c08266f37c" + "Hash": "4fd8900853b746af55b81fda99da7695" }, - "hms": { - "Package": "hms", - "Version": "1.1.3", + "nlme": { + "Package": "nlme", + "Version": "3.1-166", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ - "lifecycle", - "methods", - "pkgconfig", - "rlang", - "vctrs" + "R", + "graphics", + "lattice", + "stats", + "utils" ], - "Hash": "b59377caa7ed00fa41808342002138f9" + "Hash": "ccbb8846be320b627e6aa2b4616a2ded" }, - "httr2": { - "Package": "httr2", - "Version": "1.0.6", - "Source": "Repository", - "Repository": "RSPM", + "ontoProc": { + "Package": "ontoProc", + "Version": "2.0.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", "Requirements": [ + "AnnotationHub", + "Biobase", + "BiocFileCache", + "DT", "R", - "R6", - "cli", - "curl", - "glue", - "lifecycle", + "R.utils", + "Rgraphviz", + "S4Vectors", + "SummarizedExperiment", + "basilisk", + "dplyr", + "graph", + "httr", + "igraph", "magrittr", - "openssl", - "rappdirs", - "rlang", - "vctrs", - "withr" + "methods", + "ontologyIndex", + "ontologyPlot", + "reticulate", + "shiny", + "stats", + "utils" ], - "Hash": "3ef5d07ec78803475a94367d71b40c41" + "Hash": "3e2d74a1103accb852c3ebbac04b14e1" }, - "jsonlite": { - "Package": "jsonlite", - "Version": "1.8.9", + "ontologyIndex": { + "Package": "ontologyIndex", + "Version": "2.12", "Source": "Repository", "Repository": "RSPM", "Requirements": [ - "methods" + "R" ], - "Hash": "4e993b65c2c3ffbffce7bb3e2c6f832b" + "Hash": "8a17ea30dbeb3a9a40a22c74bb084982" }, - "lifecycle": { - "Package": "lifecycle", - 
"Version": "1.0.4", + "ontologyPlot": { + "Package": "ontologyPlot", + "Version": "1.7", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "R", - "cli", - "glue", - "rlang" + "Rgraphviz", + "methods", + "ontologyIndex", + "paintmap" ], - "Hash": "b8552d117e1b808b09a832f589b79035" + "Hash": "e0a20082a0450c7054bc5478c0f95250" }, - "magrittr": { - "Package": "magrittr", - "Version": "2.0.3", + "ontologySimilarity": { + "Package": "ontologySimilarity", + "Version": "2.7", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ - "R" + "R", + "Rcpp", + "ontologyIndex" ], - "Hash": "7ce2733a9826b3aeb1775d56fd305472" + "Hash": "42035a6435f1ba3df2c4c89236b50415" }, "openssl": { "Package": "openssl", @@ -311,6 +1613,13 @@ ], "Hash": "d413e0fef796c9401a4419485f709ca1" }, + "paintmap": { + "Package": "paintmap", + "Version": "1.0", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "864410e690c3e4d660c025037638058d" + }, "pillar": { "Package": "pillar", "Version": "1.9.0", @@ -338,6 +1647,23 @@ ], "Hash": "01f28d4278f15c76cddbea05899c5d6f" }, + "plogr": { + "Package": "plogr", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "09eb987710984fc2905c7129c7d85e65" + }, + "png": { + "Package": "png", + "Version": "0.1-8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "bd54ba8a0a5faded999a7aab6e46b374" + }, "prettyunits": { "Package": "prettyunits", "Version": "1.2.0", @@ -362,6 +1688,22 @@ ], "Hash": "f4625e061cb2865f111b47ff163a5ca6" }, + "promises": { + "Package": "promises", + "Version": "1.3.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "Rcpp", + "fastmap", + "later", + "magrittr", + "rlang", + "stats" + ], + "Hash": "c84fd4f75ea1f5434735e08b7f50fbca" + }, "purrr": { "Package": "purrr", "Version": "1.0.2", @@ -420,6 +1762,51 @@ ], "Hash": "47623f66b4e80b3b0587bc5d7b309888" }, + "reticulate": { + "Package": 
"reticulate", + "Version": "1.40.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "Matrix", + "R", + "Rcpp", + "RcppTOML", + "graphics", + "here", + "jsonlite", + "methods", + "png", + "rappdirs", + "rlang", + "utils", + "withr" + ], + "Hash": "04740f615607c4e6099356ff6d6694ee" + }, + "rhdf5": { + "Package": "rhdf5", + "Version": "2.50.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "R", + "Rhdf5lib", + "methods", + "rhdf5filters" + ], + "Hash": "c442180fd94c3d38929094e35a6ef92f" + }, + "rhdf5filters": { + "Package": "rhdf5filters", + "Version": "1.18.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "Rhdf5lib" + ], + "Hash": "9b5f34d79bd57b83f43672296062267f" + }, "rlang": { "Package": "rlang", "Version": "1.1.4", @@ -431,6 +1818,29 @@ ], "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1" }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.29", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bslib", + "evaluate", + "fontawesome", + "htmltools", + "jquerylib", + "jsonlite", + "knitr", + "methods", + "tinytex", + "tools", + "utils", + "xfun", + "yaml" + ], + "Hash": "df99277f63d01c34e95e3d2f06a79736" + }, "rols": { "Package": "rols", "Version": "3.2.0", @@ -456,6 +1866,108 @@ ], "Hash": "4c8415e0ec1e29f3f4f6fc108bef0144" }, + "sass": { + "Package": "sass", + "Version": "0.4.9", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "fs", + "htmltools", + "rappdirs", + "rlang" + ], + "Hash": "d53dbfddf695303ea4ad66f86e99b95d" + }, + "scales": { + "Package": "scales", + "Version": "1.3.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "RColorBrewer", + "cli", + "farver", + "glue", + "labeling", + "lifecycle", + "munsell", + "rlang", + "viridisLite" + ], + "Hash": "c19df082ba346b0ffa6f833e92de34d1" + }, + "shiny": { + "Package": "shiny", + "Version": "1.9.1", + 
"Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "bslib", + "cachem", + "commonmark", + "crayon", + "fastmap", + "fontawesome", + "glue", + "grDevices", + "htmltools", + "httpuv", + "jsonlite", + "later", + "lifecycle", + "methods", + "mime", + "promises", + "rlang", + "sourcetools", + "tools", + "utils", + "withr", + "xtable" + ], + "Hash": "6a293995a66e12c48d13aa1f957d09c7" + }, + "sourcetools": { + "Package": "sourcetools", + "Version": "0.1.7-1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5f5a7629f956619d519205ec475fe647" + }, + "sparseMatrixStats": { + "Package": "sparseMatrixStats", + "Version": "1.18.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Requirements": [ + "Matrix", + "MatrixGenerics", + "Rcpp", + "matrixStats", + "methods" + ], + "Hash": "41f8a7d7b60e939d13e1a5f1f07fa42c" + }, + "splus2R": { + "Package": "splus2R", + "Version": "1.3-5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "51f4ca9989e0de9782037e426298bee6" + }, "stringi": { "Package": "stringi", "Version": "1.8.4", @@ -551,6 +2063,16 @@ ], "Hash": "829f27b9c4919c16b593794a6344d6c0" }, + "tinytex": { + "Package": "tinytex", + "Version": "0.54", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "xfun" + ], + "Hash": "3ec7e3ddcacc2d34a9046941222bf94d" + }, "tzdb": { "Package": "tzdb", "Version": "0.4.0", @@ -586,6 +2108,16 @@ ], "Hash": "c03fa420630029418f7e6da3667aac4a" }, + "viridisLite": { + "Package": "viridisLite", + "Version": "0.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "c826c7c4241b6fc89ff55aaea3fa7491" + }, "vroom": { "Package": "vroom", "Version": "1.6.5", @@ -623,6 +2155,45 @@ "graphics" ], "Hash": "cc2d62c76458d425210d1eb1478b30b4" + }, + "xfun": { + "Package": "xfun", + "Version": "0.49", + "Source": "Repository", + "Repository": "CRAN", 
+ "Requirements": [ + "R", + "grDevices", + "stats", + "tools" + ], + "Hash": "8687398773806cfff9401a2feca96298" + }, + "xtable": { + "Package": "xtable", + "Version": "1.8-4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "stats", + "utils" + ], + "Hash": "b8acdf8af494d9ec19ccb2481a9b11c2" + }, + "yaml": { + "Package": "yaml", + "Version": "2.3.10", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "51dab85c6c98e50a18d7551e9d49f76c" + }, + "zlibbioc": { + "Package": "zlibbioc", + "Version": "1.52.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.20", + "Hash": "89bfb698d6eb2fdf03c923ffd32e0c3f" } } } From 27dc5914f47852420f7e4d3bae98ab1009f26e65 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> Date: Mon, 16 Dec 2024 08:48:01 -0500 Subject: [PATCH 06/15] Start running cell-type-consensus module CI workflows --- .../workflows/docker_cell-type-consensus.yml | 32 +++++++++---------- .github/workflows/run_cell-type-consensus.yml | 18 +++++------ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/workflows/docker_cell-type-consensus.yml b/.github/workflows/docker_cell-type-consensus.yml index 8a7435aa5..05eab341a 100644 --- a/.github/workflows/docker_cell-type-consensus.yml +++ b/.github/workflows/docker_cell-type-consensus.yml @@ -13,22 +13,22 @@ concurrency: cancel-in-progress: true on: - # pull_request: - # branches: - # - main - # paths: - # - "analyses/cell-type-consensus/Dockerfile" - # - "analyses/cell-type-consensus/.dockerignore" - # - "analyses/cell-type-consensus/renv.lock" - # - "analyses/cell-type-consensus/conda-lock.yml" - # push: - # branches: - # - main - # paths: - # - "analyses/cell-type-consensus/Dockerfile" - # - "analyses/cell-type-consensus/.dockerignore" - # - "analyses/cell-type-consensus/renv.lock" - # - "analyses/cell-type-consensus/conda-lock.yml" + pull_request: + branches: + - main + paths: + - 
"analyses/cell-type-consensus/Dockerfile" + - "analyses/cell-type-consensus/.dockerignore" + - "analyses/cell-type-consensus/renv.lock" + - "analyses/cell-type-consensus/conda-lock.yml" + push: + branches: + - main + paths: + - "analyses/cell-type-consensus/Dockerfile" + - "analyses/cell-type-consensus/.dockerignore" + - "analyses/cell-type-consensus/renv.lock" + - "analyses/cell-type-consensus/conda-lock.yml" workflow_dispatch: inputs: push-ecr: diff --git a/.github/workflows/run_cell-type-consensus.yml b/.github/workflows/run_cell-type-consensus.yml index 4e5d062e4..4338c834d 100644 --- a/.github/workflows/run_cell-type-consensus.yml +++ b/.github/workflows/run_cell-type-consensus.yml @@ -19,15 +19,15 @@ concurrency: on: workflow_dispatch: - # workflow_call: - # pull_request: - # branches: - # - main - # paths: - # - analyses/cell-type-consensus/** - # - "!analyses/cell-type-consensus/Dockerfile" - # - "!analyses/cell-type-consensus/.dockerignore" - # - .github/workflows/run_cell-type-consensus.yml + workflow_call: + pull_request: + branches: + - main + paths: + - analyses/cell-type-consensus/** + - "!analyses/cell-type-consensus/Dockerfile" + - "!analyses/cell-type-consensus/.dockerignore" + - .github/workflows/run_cell-type-consensus.yml jobs: run-module: From 6357a4417acbadfacafeb0fd8c1d3d0987614e85 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> Date: Mon, 16 Dec 2024 08:48:15 -0500 Subject: [PATCH 07/15] Add cell-type-consensus to workflows pertaining to all modules --- .github/workflows/docker_all-modules.yml | 1 + .github/workflows/run_all-modules.yml | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/.github/workflows/docker_all-modules.yml b/.github/workflows/docker_all-modules.yml index a2285a0e0..b383b5624 100644 --- a/.github/workflows/docker_all-modules.yml +++ b/.github/workflows/docker_all-modules.yml @@ -42,6 +42,7 @@ jobs: - cell-type-wilms-tumor-14 - cell-type-nonETP-ALL-03 - 
cell-type-ETP-ALL-03 + - cell-type-consensus uses: ./.github/workflows/build-push-docker-module.yml if: github.repository_owner == 'AlexsLemonade' with: diff --git a/.github/workflows/run_all-modules.yml b/.github/workflows/run_all-modules.yml index 88b0bbb00..1a007b096 100644 --- a/.github/workflows/run_all-modules.yml +++ b/.github/workflows/run_all-modules.yml @@ -40,6 +40,9 @@ jobs: cell-type-nonETP-ALL-03: uses: ./.github/workflows/run_cell-type-nonETP-ALL-03.yml + cell-type-consensus: + uses: ./.github/workflows/run_cell-type-consensus.yml + ## Add additional modules above this comment, and to the needs list below check-jobs: if: ${{ always() }} @@ -52,6 +55,7 @@ jobs: - cell-type-wilms-14 - cell-type-ETP-ALL-03 - cell-type-nonETP-ALL-03 + - cell-type-consensus runs-on: ubuntu-latest steps: - name: Checkout template file From 7f65e641ce373ddd652d17ef418473492672b819 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Tue, 17 Dec 2024 10:53:43 -0600 Subject: [PATCH 08/15] test updating lock file --- analyses/cell-type-consensus/renv.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/analyses/cell-type-consensus/renv.lock b/analyses/cell-type-consensus/renv.lock index e189ec681..0e6a57bb8 100644 --- a/analyses/cell-type-consensus/renv.lock +++ b/analyses/cell-type-consensus/renv.lock @@ -1,26 +1,26 @@ { "R": { - "Version": "4.4.0", + "Version": "4.4.2", "Repositories": [ { "Name": "BioCsoft", - "URL": "https://bioconductor.org/packages/3.19/bioc" + "URL": "https://bioconductor.org/packages/3.20/bioc" }, { "Name": "BioCann", - "URL": "https://bioconductor.org/packages/3.19/data/annotation" + "URL": "https://bioconductor.org/packages/3.20/data/annotation" }, { "Name": "BioCexp", - "URL": "https://bioconductor.org/packages/3.19/data/experiment" + "URL": "https://bioconductor.org/packages/3.20/data/experiment" }, { "Name": "BioCworkflows", - "URL": "https://bioconductor.org/packages/3.19/workflows" + "URL": 
"https://bioconductor.org/packages/3.20/workflows" }, { "Name": "BioCbooks", - "URL": "https://bioconductor.org/packages/3.19/books" + "URL": "https://bioconductor.org/packages/3.20/books" }, { "Name": "CRAN", @@ -29,7 +29,7 @@ ] }, "Bioconductor": { - "Version": "3.19" + "Version": "3.20" }, "Packages": { "AnnotationDbi": { From 66cfcf5cac859d53ce745ed481e807a83497eda5 Mon Sep 17 00:00:00 2001 From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com> Date: Tue, 17 Dec 2024 10:55:16 -0600 Subject: [PATCH 09/15] Apply suggestions from code review Co-authored-by: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> --- .../exploratory-notebooks/01-reference-exploration.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd index 93a867ccf..7c9eff4d2 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd @@ -176,7 +176,7 @@ The next section we will look at this distribution specifically for cell types p ## Latest common ancestor (LCA) between PanglaoDB and Blueprint encode This section will look at identifying the latest common ancestor (LCA) between all possible combinations of terms from PanglaoDB (used for assigning cell types with `CellAssign`) and the `BlueprintEncodeData` reference from `celldex` (used for assigning cell types with `SingleR`). -The LCA refers to the latest term in the cell ontology heirarchy that is common between two terms. +The LCA refers to the latest term in the cell ontology hierarchy that is common between two terms. I will use the [`ontoProc::findCommonAncestors()` function](https://rdrr.io/bioc/ontoProc/man/findCommonAncestors.html) to get the LCA for each combination. 
Note that it is possible to have more than one LCA for a set of terms. @@ -208,7 +208,7 @@ all_ref_df <- expand.grid(panglao_df$panglao_ontology, dplyr::rowwise() |> dplyr::mutate( # least common shared ancestor - lca = list(rownames(ontoProc::findCommonAncestors(blueprint_ontology, panglao_ontology, g = g))) + lca = list(rownames(ontoProc::findCommonAncestors(blueprint_ontology, panglao_ontology, g = cl_graph))) ) lca_df <- all_ref_df |> @@ -596,7 +596,7 @@ I would use the following criteria to come up with my whitelist: - Terms that are too broad (like `supporting cell`, `blood cell`, `bone cell`, `lining cell`) should be removed. Alternatively, rather than eliminate terms that are too broad we could look at the similarity index for individual matches and decide on a case by case basis if those should be allowed. -Although I still think having a term that is too braod, even if it's a good match, is not super informative. +Although I still think having a term that is too broad, even if it's a good match, is not super informative. 
## Session info From 0e89309664a8ff5da10f59ae2b2d61cd430cc7bb Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Tue, 17 Dec 2024 13:22:53 -0600 Subject: [PATCH 10/15] keep myeloid and epithelial --- .../01-reference-exploration.Rmd | 26 +- .../01-reference-exploration.html | 1191 +++++++++++++++-- 2 files changed, 1125 insertions(+), 92 deletions(-) diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd index 7c9eff4d2..472661dd1 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd @@ -84,7 +84,7 @@ cl_df <- data.frame( # list all ancestors and descendants calculate total ancestors = list(ontologyIndex::get_ancestors(cl_ont, cl_ontology)), total_ancestors = length(ancestors), - descendants = list(ontologyIndex::get_descendants(cl_ont, cl_ontology)), + descendants = list(ontologyIndex::get_descendants(cl_ont, cl_ontology, exclude_roots = TRUE)), total_descendants = length(descendants) ) ``` @@ -187,7 +187,7 @@ Ultimately, I would like to see if we can use that cutoff to decide if we should ```{r} # first set up the graph from cl ont -parent_terms <- cl$parents +parent_terms <- cl_ont$parents cl_graph <- igraph::make_graph(rbind(unlist(parent_terms), rep(names(parent_terms), lengths(parent_terms)))) ``` @@ -384,6 +384,7 @@ print_df |> ``` I'm torn on this one, because I do think it's helpful to know if something is of the myeloid lineage, but if we aren't keeping lymphocyte then I would argue we shouldn't keep myeloid leukocyte. +Noting that after discussion we have decided to keep this one since T and B cells are much easier to differentiate based on gene expression alone than cells that are party of the myeloid lineage. 
#### Progenitor cell @@ -395,6 +396,7 @@ print_df |> ``` Same with `progenitor cell`, I do think it could be helpful to know that something may be a progenitor cell, but when you have a cell with the label for HSC and the label for cells like monocytes or osteoblasts, then maybe we are talking about a tumor cell instead. +After discussion, we are going to remove progenitor cells. Along those same lines, I think the below terms, `lining cell` and `supporting cell`, are too broad even though they have few descendants. @@ -426,13 +428,13 @@ lca_df |> unique() ``` -The only term in this list that I would be concerned about losing is "neuron". +The only terms in this list that I would be concerned about losing are "neuron" and epithelial cells. Let's look at those combinations. #### Neuron ```{r} -# blood cell +# neuron print_df |> dplyr::filter(cl_annotation == "neuron") ``` @@ -440,6 +442,17 @@ print_df |> It looks like there are a lot of types of neurons in the PanglaoDB reference and only "neuron" as a term in Blueprint. Even though neuron has ~ 500 descendants, I think we should keep these labels. +#### Epithelial cell + +```{r} +# epithelial cell +print_df |> + dplyr::filter(cl_annotation == "epithelial cell") +``` + +The PanglaoDB cell types seem to be more specific than the ones present in Blueprint Encode, similar to the observation with neurons. +We should keep epithelial cell. + ### Removing anything with more than 1 LCA One thing I noticed when looking at the labels that have less than the cutoff is that most of them are from scenarios where we have multiple LCAs. @@ -592,8 +605,9 @@ I would use the following criteria to come up with my whitelist: - Pairs should not have more than 1 LCA, with the exception of the matches that have the label hematopoietic precursor cell. - The LCA should have equal to or less than 170 total descendants. -- We whould include the term for `neuron` even though it has 500 descendants. 
-- Terms that are too broad (like `supporting cell`, `blood cell`, `bone cell`, `lining cell`) should be removed. +- We should include the term for `neuron` and `epithelial cell` even though they do not pass the threshold for number of descendants. +- Terms that are too broad should be removed. +This includes: `lining cell`, `blood cell`, `progenitor cell`, `bone cell`, and `supporting cell`. Alternatively, rather than eliminate terms that are too broad we could look at the similarity index for individual matches and decide on a case by case basis if those should be allowed. Although I still think having a term that is too broad, even if it's a good match, is not super informative. diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html index 474052d82..cd0998b3e 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html @@ -11,7 +11,7 @@ - + Summary of cell type ontologies in reference files @@ -441,7 +441,7 @@

Summary of cell type ontologies in reference files

Ally Hawkins

-

2024-12-12

+

2024-12-17

@@ -512,7 +512,7 @@

Setup

panglao_annotation = "human_readable_value" )
## Rows: 178 Columns: 3
-## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
+## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 ## Delimiter: "\t"
 ## chr (3): ontology_id, human_readable_value, panglao_cell_type
 ## 
@@ -545,7 +545,7 @@ 

Full cell ontology

# list all ancestors and descendants calculate total ancestors = list(ontologyIndex::get_ancestors(cl_ont, cl_ontology)), total_ancestors = length(ancestors), - descendants = list(ontologyIndex::get_descendants(cl_ont, cl_ontology)), + descendants = list(ontologyIndex::get_descendants(cl_ont, cl_ontology, exclude_roots = TRUE)), total_descendants = length(descendants) )

The vertical lines in the below plot indicate the value for cell @@ -597,7 +597,7 @@

Full cell ontology

x = "Number of descendants", y = "Density" ) -

+

It looks like most cell types have very few descendants, so let’s zoom into the area below 500 to get a better look.

ggplot(cl_df, aes(x = total_descendants)) +
@@ -619,7 +619,7 @@ 

Full cell ontology

## Warning: Removed 14 rows containing non-finite outside the scale range (`stat_density()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_vline()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_text()`).
-

+

Here we see a much larger range of values and that cell types become more general as the number of descendants goes up. However, this distribution alone is probably not helpful in determining a cutoff. The @@ -634,7 +634,7 @@

Latest common ancestor (LCA) between PanglaoDB and Blueprint for assigning cell types with CellAssign) and the BlueprintEncodeData reference from celldex (used for assigning cell types with SingleR). The LCA -refers to the latest term in the cell ontology heirarchy that is common +refers to the latest term in the cell ontology hierarchy that is common between two terms. I will use the ontoProc::findCommonAncestors() function to get the LCA for each combination.

Note that it is possible to have more than one LCA for a set of @@ -644,7 +644,7 @@

Latest common ancestor (LCA) between PanglaoDB and Blueprint I would like to see if we can use that cutoff to decide if we should keep the LCA term as the consensus label or use “Unknown”.

# first set up the graph from cl ont
-parent_terms <- cl$parents
+parent_terms <- cl_ont$parents
 cl_graph <- igraph::make_graph(rbind(unlist(parent_terms), rep(names(parent_terms), lengths(parent_terms))))
# get a data frame with all combinations of panglao and blueprint terms
 # one row for each combination 
@@ -661,7 +661,7 @@ 

Latest common ancestor (LCA) between PanglaoDB and Blueprint dplyr::rowwise() |> dplyr::mutate( # least common shared ancestor - lca = list(rownames(ontoProc::findCommonAncestors(blueprint_ontology, panglao_ontology, g = g))) + lca = list(rownames(ontoProc::findCommonAncestors(blueprint_ontology, panglao_ontology, g = cl_graph))) )

## Warning in dplyr::left_join(dplyr::left_join(dplyr::rename(expand.grid(panglao_df$panglao_ontology, : Detected an unexpected many-to-many relationship between `x` and `y`.
 ## ℹ Row 49 of `x` matches multiple rows in `y`.
@@ -693,8 +693,8 @@ 

Latest common ancestor (LCA) between PanglaoDB and Blueprint dplyr::mutate(lca = dplyr::if_else(blueprint_ontology == panglao_ontology, blueprint_ontology, lca)) |> # join in information for each of the lca terms including name, number of ancestors and descendants dplyr::left_join(cl_df, by = c("lca" = "cl_ontology"))

-
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 7967 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
-## 18, 19, 20, ...].
+
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 7967 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+## 20, ...].

Distribution of ancestors and descendants

ggplot(lca_df, aes(x = total_ancestors)) +
@@ -728,7 +728,7 @@ 

Distribution of ancestors and descendants

x = "Total number of descendants", y = "Density" )
-

+

Let’s zoom into the area below 1000, since we already know we would want to exclude anything above that based on this plot.

ggplot(lca_df, aes(x = total_descendants)) +
@@ -750,7 +750,7 @@ 

Distribution of ancestors and descendants

## Warning: Removed 6856 rows containing non-finite outside the scale range (`stat_density()`).
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_vline()`).
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_text()`).
-

+

We can use the vertical lines for cells of interest to help us define a potential cutoff based on the granularity we would like to see in our consensus label. We want to be able to label things like T cell, but we @@ -815,7 +815,7 @@

Defining a cutoff for number of descendants

x = "cell type", y = "Total descendants" )
-

+

There are a few terms that I think might be more broad than we want like blood cell, bone cell, supporting cell, and lining cell. I’m on the @@ -1730,7 +1730,10 @@

Myeloid leukocyte

I’m torn on this one, because I do think it’s helpful to know if something is of the myeloid lineage, but if we aren’t keeping lymphocyte -then I would argue we shouldn’t keep myeloid leukocyte.

+then I would argue we shouldn’t keep myeloid leukocyte. Noting that +after discussion we have decided to keep this one since T and B cells +are much easier to differentiate based on gene expression alone than +cells that are part of the myeloid lineage.

Progenitor cell

@@ -1920,7 +1923,7 @@

Progenitor cell

helpful to know that something may be a progenitor cell, but when you have a cell with the label for HSC and the label for cells like monocytes or osteoblasts, then maybe we are talking about a tumor cell -instead.

+instead. After discussion, we are going to remove progenitor cells.

Along those same lines, I think the below terms, lining cell and supporting cell, are too broad even though they have few descendants.

@@ -2090,11 +2093,11 @@

Discarded cell types

## [13] "secretory cell" "connective tissue cell" "electrically responsive cell" ## [16] "contractile cell" "epithelial cell" "neuron" ## [19] "neural cell"

-

The only term in this list that I would be concerned about losing is -“neuron”. Let’s look at those combinations.

+

The only terms in this list that I would be concerned about losing +are “neuron” and epithelial cells. Let’s look at those combinations.

Neuron

-
# blood cell
+
# neuron
 print_df |> 
   dplyr::filter(cl_annotation == "neuron")
@@ -2329,6 +2332,1023 @@

Neuron

reference and only “neuron” as a term in Blueprint. Even though neuron has ~ 500 descendants, I think we should keep these labels.

+
+

Epithelial cell

+
# epithelial cell
+print_df |> 
+  dplyr::filter(cl_annotation == "epithelial cell")
+
+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ +
blueprint_ontologyblueprint_annotation_mainblueprint_annotation_finepanglao_ontologypanglao_annotationtotal_lcalcacl_annotation
CL:0000066Epithelial cellsEpithelial cellsCL:0000622acinar cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:1000488cholangiocyte1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000166chromaffin cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000584enterocyte1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000164enteroendocrine cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000065ependymal cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000066epithelial cell0CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000160goblet cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000501granulosa cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000182hepatocyte1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0005006ionocyte1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000312keratinocyte1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000077mesothelial cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000185myoepithelial cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000165neuroendocrine cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002167olfactory epithelial cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000510paneth cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000162parietal cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002481peritubular myoid cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000652pinealocyte1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000653podocyte1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000209taste receptor cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000731urothelial cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002368respiratory epithelial cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002370respiratory goblet cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000171pancreatic A cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000169type B pancreatic cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000706choroid plexus epithelial cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000158club cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002250intestinal crypt stem cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000173pancreatic D cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002305epithelial cell of distal tubule1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002079pancreatic ductal cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000504enterochromaffin-like cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0005019pancreatic epsilon cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002258thyroid follicular cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002179foveolar cell of stomach1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000696PP cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000155peptic cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002292type I cell of carotid body1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0005010renal intercalated cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:1000909kidney loop of Henle epithelial cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002326luminal epithelial cell of mammary gland1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002327mammary gland epithelial cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000242Merkel cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000682M cell of gut1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002199oxyphil cell of parathyroid gland1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000446chief cell of parathyroid gland1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0005009renal principal cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002306epithelial cell of proximal tubule1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002062pulmonary alveolar type 1 cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002063pulmonary alveolar type 2 cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:1001596salivary gland glandular cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002140acinar cell of sebaceous gland1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0000216Sertoli cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002562hair germinal matrix cell1CL:0000066epithelial cell
CL:0000066Epithelial cellsEpithelial cellsCL:0002204brush cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000622acinar cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:1000488cholangiocyte1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000584enterocyte1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000164enteroendocrine cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000066epithelial cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000160goblet cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000501granulosa cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000182hepatocyte1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0005006ionocyte1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000185myoepithelial cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000510paneth cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000162parietal cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000653podocyte1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000209taste receptor cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000731urothelial cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002368respiratory epithelial cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002370respiratory goblet cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000171pancreatic A cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000169type B pancreatic cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000158club cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002250intestinal crypt stem cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000173pancreatic D cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002305epithelial cell of distal tubule1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002079pancreatic ductal cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000504enterochromaffin-like cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0005019pancreatic epsilon cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002258thyroid follicular cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002179foveolar cell of stomach1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000696PP cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000155peptic cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0005010renal intercalated cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:1000909kidney loop of Henle epithelial cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002326luminal epithelial cell of mammary gland1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002327mammary gland epithelial cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000682M cell of gut1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002199oxyphil cell of parathyroid gland1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0000446chief cell of parathyroid gland1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0005009renal principal cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002306epithelial cell of proximal tubule1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:1001596salivary gland glandular cell1CL:0000066epithelial cell
CL:0000312KeratinocytesKeratinocytesCL:0002204brush cell1CL:0000066epithelial cell
+
+

The PanglaoDB cell types seem to be more specific than the ones +present in Blueprint Encode, similar to the observation with neurons. We +should keep epithelial cell.

+

Removing anything with more than 1 LCA

@@ -2354,99 +3374,99 @@

Removing anything with more than 1 LCA

bone cell -39 +38 blood cell -42 +41 perivascular cell -42 +41 stromal cell -54 +53 supporting cell -62 +61 hematopoietic precursor cell -106 +105 lining cell -121 +120 myeloid leukocyte -166 +165 progenitor cell -166 +165 mononuclear phagocyte -170 +169 phagocyte (sensu Vertebrata) -176 +175 contractile cell -178 +177 defensive cell -200 +199 professional antigen presenting cell -213 +212 connective tissue cell -224 +223 myeloid cell -248 +247 stuff accumulating cell -267 +266 precursor cell -272 +271 secretory cell -458 +457 mononuclear cell -504 +503 leukocyte -541 +540 electrically responsive cell -674 +673 hematopoietic cell -685 +684 eukaryotic cell -2646 +2645 @@ -2472,8 +3492,8 @@

Removing anything with more than 1 LCA

# which cell types are now missing from the list to keep setdiff(celltypes_to_keep, updated_celltypes)
-
## [1] "blood cell"                   "hematopoietic precursor cell" "lining cell"                 
-## [4] "perivascular cell"            "supporting cell"
+
## [1] "blood cell"                   "hematopoietic precursor cell" "lining cell"                  "perivascular cell"           
+## [5] "supporting cell"

It looks like I am losing a few terms I already said were not specific and then a few other terms, like “hematopoietic precursor cell” and “perivascular cell”. I’ll look at both of those to confirm we would @@ -2889,16 +3909,18 @@

Conclusions

matches that have the label hematopoietic precursor cell.
  • The LCA should have equal to or less than 170 total descendants.
  • -
  • We whould include the term for neuron even though it -has 500 descendants.
  • -
  • Terms that are too broad (like supporting cell, -blood cell, bone cell, -lining cell) should be removed.
  • +
  • We whould include the term for neuron and +epithelial cell even though they do not pass the threshold +for number of descendants.
  • +
  • Terms that are too broad should be removed. This includes: +lining cell, blood cell, +progenitor cell, bone cell, and +supporting cell
  • Alternatively, rather than eliminate terms that are too broad we could look at the similarity index for individual matches and decide on a case by case basis if those should be allowed. Although I still think -having a term that is too braod, even if it’s a good match, is not super +having a term that is too broad, even if it’s a good match, is not super informative.

    @@ -2906,7 +3928,7 @@

    Session info

    sessionInfo()
    ## R version 4.4.2 (2024-10-31)
     ## Platform: aarch64-apple-darwin20
    -## Running under: macOS Sonoma 14.4
    +## Running under: macOS Sequoia 15.2
     ## 
     ## Matrix products: default
     ## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
    @@ -2925,40 +3947,37 @@ 

    Session info

    ## [1] ggplot2_3.5.1 ## ## loaded via a namespace (and not attached): -## [1] RColorBrewer_1.1-3 jsonlite_1.8.9 magrittr_2.0.3 gypsum_1.2.0 -## [5] farver_2.1.2 rmarkdown_2.29 zlibbioc_1.52.0 vctrs_0.6.5 -## [9] memoise_2.0.1 DelayedMatrixStats_1.28.0 htmltools_0.5.8.1 S4Arrays_1.6.0 -## [13] polynom_1.4-1 AnnotationHub_3.14.0 curl_6.0.1 Rhdf5lib_1.28.0 -## [17] SparseArray_1.6.0 rhdf5_2.50.0 sass_0.4.9 alabaster.base_1.6.1 -## [21] bslib_0.8.0 htmlwidgets_1.6.4 httr2_1.0.7 cachem_1.1.0 -## [25] igraph_2.1.1 mime_0.12 lifecycle_1.0.4 pkgconfig_2.0.3 -## [29] Matrix_1.7-1 R6_2.5.1 fastmap_1.2.0 GenomeInfoDbData_1.2.13 -## [33] MatrixGenerics_1.18.0 shiny_1.9.1 digest_0.6.37 colorspace_2.1-1 -## [37] AnnotationDbi_1.68.0 S4Vectors_0.44.0 rprojroot_2.0.4 ExperimentHub_2.14.0 -## [41] GenomicRanges_1.58.0 RSQLite_2.3.9 filelock_1.0.3 labeling_0.4.3 -## [45] fansi_1.0.6 httr_1.4.7 polyclip_1.10-7 abind_1.4-8 -## [49] compiler_4.4.2 bit64_4.5.2 withr_3.0.2 DBI_1.2.3 -## [53] ontologySimilarity_2.7 HDF5Array_1.34.0 ggforce_0.4.2 alabaster.ranges_1.6.0 -## [57] alabaster.schemas_1.6.0 MASS_7.3-61 quantreg_5.99.1 rappdirs_0.3.3 -## [61] DelayedArray_0.32.0 ggpp_0.5.8-1 tools_4.4.2 httpuv_1.6.15 -## [65] glue_1.8.0 rhdf5filters_1.18.0 promises_1.3.2 grid_4.4.2 -## [69] generics_0.1.3 gtable_0.3.6 tzdb_0.4.0 tidyr_1.3.1 -## [73] hms_1.1.3 utf8_1.2.4 XVector_0.46.0 BiocGenerics_0.52.0 -## [77] BiocVersion_3.20.0 pillar_1.9.0 stringr_1.5.1 vroom_1.6.5 -## [81] later_1.4.1 splines_4.4.2 dplyr_1.1.4 tweenr_2.0.3 -## [85] BiocFileCache_2.14.0 lattice_0.22-6 survival_3.7-0 renv_1.0.11 -## [89] bit_4.5.0.1 SparseM_1.84-2 tidyselect_1.2.1 Biostrings_2.74.0 -## [93] knitr_1.49 ggpmisc_0.6.1 IRanges_2.40.0 ontologyPlot_1.7 -## [97] SummarizedExperiment_1.36.0 stats4_4.4.2 xfun_0.49 Biobase_2.66.0 -## [101] matrixStats_1.4.1 DT_0.33 stringi_1.8.4 UCSC.utils_1.2.0 -## [105] paintmap_1.0 yaml_2.3.10 evaluate_1.0.1 tibble_3.2.1 -## [109] Rgraphviz_2.50.0 alabaster.matrix_1.6.1 
BiocManager_1.30.25 graph_1.84.0 -## [113] cli_3.6.3 ontologyIndex_2.12 xtable_1.8-4 reticulate_1.40.0 -## [117] jquerylib_0.1.4 munsell_0.5.1 Rcpp_1.0.13-1 GenomeInfoDb_1.42.1 -## [121] dbplyr_2.5.0 ontoProc_2.0.0 png_0.1-8 parallel_4.4.2 -## [125] MatrixModels_0.5-3 readr_2.1.5 blob_1.2.4 splus2R_1.3-5 -## [129] sparseMatrixStats_1.18.0 alabaster.se_1.6.0 scales_1.3.0 purrr_1.0.2 -## [133] crayon_1.5.3 rlang_1.1.4 KEGGREST_1.46.0 celldex_1.16.0
    +## [1] celldex_1.16.0 DBI_1.2.3 httr2_1.0.7 rlang_1.1.4 +## [5] magrittr_2.0.3 matrixStats_1.4.1 gypsum_1.2.0 compiler_4.4.2 +## [9] RSQLite_2.3.9 DelayedMatrixStats_1.28.0 png_0.1-8 vctrs_0.6.5 +## [13] pkgconfig_2.0.3 crayon_1.5.3 fastmap_1.2.0 dbplyr_2.5.0 +## [17] XVector_0.46.0 labeling_0.4.3 utf8_1.2.4 promises_1.3.2 +## [21] rmarkdown_2.29 tzdb_0.4.0 graph_1.84.0 UCSC.utils_1.2.0 +## [25] purrr_1.0.2 bit_4.5.0.1 xfun_0.49 zlibbioc_1.52.0 +## [29] cachem_1.1.0 splus2R_1.3-5 GenomeInfoDb_1.42.1 jsonlite_1.8.9 +## [33] blob_1.2.4 later_1.4.1 rhdf5filters_1.18.0 DelayedArray_0.32.0 +## [37] Rhdf5lib_1.28.0 parallel_4.4.2 R6_2.5.1 bslib_0.8.0 +## [41] reticulate_1.40.0 jquerylib_0.1.4 GenomicRanges_1.58.0 Rcpp_1.0.13-1 +## [45] SummarizedExperiment_1.36.0 knitr_1.49 readr_2.1.5 IRanges_2.40.0 +## [49] httpuv_1.6.15 Matrix_1.7-1 igraph_2.1.1 tidyselect_1.2.1 +## [53] abind_1.4-8 yaml_2.3.10 curl_6.0.1 ontologySimilarity_2.7 +## [57] lattice_0.22-6 tibble_3.2.1 shiny_1.9.1 Biobase_2.66.0 +## [61] withr_3.0.2 KEGGREST_1.46.0 evaluate_1.0.1 ontologyIndex_2.12 +## [65] BiocFileCache_2.14.0 alabaster.schemas_1.6.0 ExperimentHub_2.14.0 Biostrings_2.74.0 +## [69] pillar_1.9.0 BiocManager_1.30.25 filelock_1.0.3 MatrixGenerics_1.18.0 +## [73] DT_0.33 renv_1.0.11 stats4_4.4.2 generics_0.1.3 +## [77] vroom_1.6.5 rprojroot_2.0.4 BiocVersion_3.20.0 S4Vectors_0.44.0 +## [81] hms_1.1.3 sparseMatrixStats_1.18.0 munsell_0.5.1 scales_1.3.0 +## [85] alabaster.base_1.6.1 xtable_1.8-4 glue_1.8.0 alabaster.ranges_1.6.0 +## [89] alabaster.matrix_1.6.1 tools_4.4.2 ontologyPlot_1.7 AnnotationHub_3.14.0 +## [93] ontoProc_2.0.0 rhdf5_2.50.0 grid_4.4.2 tidyr_1.3.1 +## [97] AnnotationDbi_1.68.0 colorspace_2.1-1 GenomeInfoDbData_1.2.13 HDF5Array_1.34.0 +## [101] cli_3.6.3 rappdirs_0.3.3 fansi_1.0.6 S4Arrays_1.6.0 +## [105] dplyr_1.1.4 Rgraphviz_2.50.0 gtable_0.3.6 alabaster.se_1.6.0 +## [109] sass_0.4.9 digest_0.6.37 BiocGenerics_0.52.0 paintmap_1.0 +## [113] SparseArray_1.6.0 
htmlwidgets_1.6.4 farver_2.1.2 memoise_2.0.1 +## [117] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7 mime_0.12 +## [121] bit64_4.5.2
    From 80264bda086877fcf2d1f36cd368581e46e414b2 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Tue, 17 Dec 2024 14:11:16 -0600 Subject: [PATCH 11/15] run GHA on docker image --- .github/workflows/run_cell-type-consensus.yml | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/.github/workflows/run_cell-type-consensus.yml b/.github/workflows/run_cell-type-consensus.yml index 4338c834d..569779757 100644 --- a/.github/workflows/run_cell-type-consensus.yml +++ b/.github/workflows/run_cell-type-consensus.yml @@ -33,25 +33,12 @@ jobs: run-module: if: github.repository_owner == 'AlexsLemonade' runs-on: ubuntu-latest + container: public.ecr.aws/openscpca/cell-type-consensus:latest steps: - name: Checkout repo uses: actions/checkout@v4 - - name: Set up R - uses: r-lib/actions/setup-r@v2 - with: - r-version: 4.4.0 - use-public-rspm: true - - - name: Set up pandoc - uses: r-lib/actions/setup-pandoc@v2 - - - name: Set up renv - uses: r-lib/actions/setup-renv@v2 - with: - working-directory: ${{ env.MODULE_PATH }} - # Update this step as needed to download the desired data - name: Download test data run: ./download-data.py --test-data --format SCE From 98eb9df170e278e37d782db3f078a515e908cbb0 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Tue, 17 Dec 2024 15:12:41 -0600 Subject: [PATCH 12/15] install awscli --- .github/workflows/run_cell-type-consensus.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/run_cell-type-consensus.yml b/.github/workflows/run_cell-type-consensus.yml index 569779757..12c5f9c17 100644 --- a/.github/workflows/run_cell-type-consensus.yml +++ b/.github/workflows/run_cell-type-consensus.yml @@ -36,6 +36,12 @@ jobs: container: public.ecr.aws/openscpca/cell-type-consensus:latest steps: + - name: Install aws-cli + run: | + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip -q awscliv2.zip + ./aws/install + - name: Checkout repo uses: actions/checkout@v4 From 
f1d165c6331186f9cd5372595d55db42514c3e39 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Tue, 17 Dec 2024 15:13:54 -0600 Subject: [PATCH 13/15] formatting --- .github/workflows/run_cell-type-consensus.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run_cell-type-consensus.yml b/.github/workflows/run_cell-type-consensus.yml index 12c5f9c17..e3d110e04 100644 --- a/.github/workflows/run_cell-type-consensus.yml +++ b/.github/workflows/run_cell-type-consensus.yml @@ -37,10 +37,10 @@ jobs: steps: - name: Install aws-cli - run: | - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip -q awscliv2.zip - ./aws/install + run: | + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip -q awscliv2.zip + ./aws/install - name: Checkout repo uses: actions/checkout@v4 From e3ad414b3db223860ce919d79dcaef34b3ad835c Mon Sep 17 00:00:00 2001 From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:03:48 -0600 Subject: [PATCH 14/15] spell part Co-authored-by: Jaclyn Taroni <19534205+jaclyn-taroni@users.noreply.github.com> --- .../exploratory-notebooks/01-reference-exploration.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd index 472661dd1..6701c5077 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd @@ -384,7 +384,7 @@ print_df |> ``` I'm torn on this one, because I do think it's helpful to know if something is of the myeloid lineage, but if we aren't keeping lymphocyte then I would argue we shouldn't keep myeloid leukocyte. 
-Noting that after discussion we have decided to keep this one since T and B cells are much easier to differentiate based on gene expression alone than cells that are party of the myeloid lineage. +Noting that after discussion we have decided to keep this one since T and B cells are much easier to differentiate based on gene expression alone than cells that are part of the myeloid lineage. #### Progenitor cell From 3676e8d73e89f0f63b18c3d42232cd780344d452 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Wed, 18 Dec 2024 09:08:50 -0600 Subject: [PATCH 15/15] add note about keratinocytes --- .../01-reference-exploration.Rmd | 5 ++-- .../01-reference-exploration.html | 27 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd index 6701c5077..9a06b85cb 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.Rmd @@ -451,7 +451,7 @@ print_df |> ``` The PanglaoDB cell types seem to be more specific than the ones present in Blueprint Encode, similar to the observation with neurons. -We should keep epithelial cell. +We should keep epithelial cell in the cases where the Blueprint Encode annotation is `Epithelial cells` but not when it is `Keratinocytes`. ### Removing anything with more than 1 LCA @@ -605,7 +605,8 @@ I would use the following criteria to come up with my whitelist: - Pairs should not have more than 1 LCA, with the exception of the matches that have the label hematopoietic precursor cell. - The LCA should have equal to or less than 170 total descendants. -- We whould include the term for `neuron` and `epithelial cell` even though they do not pass the threshold for number of descendants. 
+- We should include the term for `neuron` and `epithelial cell` even though they do not pass the threshold for number of descendants. +However, `epithelial cell` should only be included if the Blueprint Encode name is `Epithelial cells` and _not_ `Keratinocytes`. - Terms that are too broad should be removed. This includes: `lining cell`, `blood cell`, `progenitor cell`, `bone cell`, and `supporting cell` diff --git a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html index cd0998b3e..e2cae57ad 100644 --- a/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html +++ b/analyses/cell-type-consensus/exploratory-notebooks/01-reference-exploration.html @@ -11,7 +11,7 @@ - + Summary of cell type ontologies in reference files @@ -441,7 +441,7 @@

    Summary of cell type ontologies in reference files

    Ally Hawkins

    -

    2024-12-17

    +

    2024-12-18

    @@ -512,7 +512,7 @@

    Setup

    panglao_annotation = "human_readable_value" )
    ## Rows: 178 Columns: 3
    -## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    +## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
     ## Delimiter: "\t"
     ## chr (3): ontology_id, human_readable_value, panglao_cell_type
     ## 
    @@ -693,8 +693,8 @@ 

    Latest common ancestor (LCA) between PanglaoDB and Blueprint dplyr::mutate(lca = dplyr::if_else(blueprint_ontology == panglao_ontology, blueprint_ontology, lca)) |> # join in information for each of the lca terms including name, number of ancestors and descendants dplyr::left_join(cl_df, by = c("lca" = "cl_ontology"))

    -
    ## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 7967 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    -## 20, ...].
    +
    ## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 7967 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
    +## 18, 19, 20, ...].

    Distribution of ancestors and descendants

    ggplot(lca_df, aes(x = total_ancestors)) +
    @@ -1733,7 +1733,7 @@ 

    Myeloid leukocyte

    then I would argue we shouldn’t keep myeloid leukocyte. Noting that after discussion we have decided to keep this one since T and B cells are much easier to differentiate based on gene expression alone than -cells that are party of the myeloid lineage.

    +cells that are part of the myeloid lineage.

    Progenitor cell

    @@ -3347,7 +3347,9 @@

    Epithelial cell

    The PanglaoDB cell types seem to be more specific than the ones present in Blueprint Encode, similar to the observation with neurons. We -should keep epithelial cell.

    +should keep epithelial cell in the cases where the Blueprint Encode +annotation is Epithelial cells but not when it is +Keratinocytes.

    @@ -3492,8 +3494,8 @@

    Removing anything with more than 1 LCA

    # which cell types are now missing from the list to keep setdiff(celltypes_to_keep, updated_celltypes)
    -
    ## [1] "blood cell"                   "hematopoietic precursor cell" "lining cell"                  "perivascular cell"           
    -## [5] "supporting cell"
    +
    ## [1] "blood cell"                   "hematopoietic precursor cell" "lining cell"                 
    +## [4] "perivascular cell"            "supporting cell"

    It looks like I am losing a few terms I already said were not specific and then a few other terms, like “hematopoietic precursor cell” and “perivascular cell”. I’ll look at both of those to confirm we would @@ -3909,9 +3911,12 @@

    Conclusions

    matches that have the label hematopoietic precursor cell.
  • The LCA should have equal to or less than 170 total descendants.
  • -
  • We whould include the term for neuron and +
  • We should include the term for neuron and epithelial cell even though they do not pass the threshold -for number of descendants.
  • +for number of descendants. However, epithelial cell should +only be included if the Blueprint Encode name is +Epithelial cells and not +Keratinocytes.
  • Terms that are too broad should be removed. This includes: lining cell, blood cell, progenitor cell, bone cell, and