# utils.R

# Install a package from GitHub when it cannot already be loaded.
#
# Arguments:
#   pkg_name      Package name as understood by requireNamespace().
#   github_repo   Repository spec handed to install_github().
#   use_remotes   TRUE installs through remotes, FALSE through devtools.
#   dependencies  Forwarded unchanged to install_github().
#   upgrade       Forwarded unchanged to install_github().
#
# Returns TRUE only when an installation was attempted and raised no error.
# FALSE is returned both when the package was already present and when the
# installation failed.
install_missing_github_fx <- function(pkg_name, github_repo, use_remotes = FALSE, dependencies = TRUE, upgrade = "ask") {

  # Guard clause: nothing to install when the namespace is already available
  if (requireNamespace(pkg_name, quietly = TRUE)) {
    message(paste(pkg_name, "is already installed"))
    return(FALSE)
  }

  message(paste("Installing", pkg_name, "from", github_repo))

  # The installer package itself is fetched from CRAN when missing
  installer_pkg <- if (use_remotes) "remotes" else "devtools"
  if (!requireNamespace(installer_pkg, quietly = TRUE)) {
    install.packages(installer_pkg)
  }

  # The value of tryCatch() is the function result: TRUE on success,
  # FALSE when the handler caught an installation error
  tryCatch({
    if (use_remotes) {
      remotes::install_github(github_repo, dependencies = dependencies, upgrade = upgrade, quiet = TRUE)
    } else {
      devtools::install_github(github_repo, dependencies = dependencies, upgrade = upgrade, quiet = TRUE)
    }
    message(paste(pkg_name, "installed successfully"))
    TRUE
  }, error = function(e) {
    message(paste("Failed to install", pkg_name, ":", e$message))
    FALSE
  })
}

# Function for smoothing according to the neighbours in the UMAP space
# From the Nanostring ScratchSpace CosMx analysis vignette: 
# https://nanostring-biostats.github.io/CosMx-Analysis-Scratch-Space/posts/marker-gene-smoothing/#introduction
#
# Builds a sparse cell x cell weight matrix from the nearest neighbours of each
# cell in a 2D embedding, to be used as a right-hand multiplier that averages
# a feature x cell matrix over each cell's neighbourhood.
#
# Arguments:
#   sem          Object exposing `@reductions[[umapreduc]]@cell.embeddings` and
#                `colnames()` (presumably a Seurat object; confirm with caller).
#   umapreduc    Name of the reduction inside `sem@reductions` to use.
#   n_neighbors  Number of neighbours requested per cell; one extra neighbour is
#                requested so the cell itself is included.
#
# Returns the sparse matrix `smoother`, with columns reordered to colnames(sem).
#
# Note: data.table(), setnames() and `.()` are called unqualified, so the
# data.table package has to be attached before calling this function.
umap_nn_fx <- function(sem, umapreduc, n_neighbors=100){
  
  ## extract umap coordinates 
  ## (keep.rownames = TRUE puts the row names in the first column, so the first
  ##  three columns are renamed to cell ID plus the first two embedding axes)
  umapd <-  
    data.table(sem@reductions[[umapreduc]]@cell.embeddings
               ,keep.rownames = TRUE)
  setnames(umapd, c(names(umapd)[1:3]), c("cell_ID", "UMAP_1", "UMAP_2"))
  
  ## identify nearest n_neighbors (+1 includes the cell as a neighbor to itself)
  nn_umap <- RANN::nn2(umapd[,.(UMAP_1, UMAP_2)],k = n_neighbors + 1)$nn.idx
  ## reshape the index matrix to long format: one row per (cell, neighbour) pair.
  ## V1 (first neighbour column) is kept as an id variable and used as the
  ## cell's own row index -- assumes the nearest hit of each cell is itself.
  nn_umap <- data.table::melt(cbind(umapd[,.(cell_ID)], data.table(nn_umap))
                              , id.vars=c("cell_ID", "V1"))
  colnames(nn_umap) <- c("cell_ID1", "cell_ID1_idx", "neighbor", "cell_ID2_idx")
  ## attach the neighbour's cell ID by joining on its row index
  nn_umap <- merge(nn_umap
                   , nn_umap[,.(cell_ID2=cell_ID1, cell_ID2_idx=cell_ID1_idx)][
                     ,unique(.SD)],by="cell_ID2_idx")
  
  ## Cell x cell neighbor indicator matrix
  ## (diagonal entries for every cell, plus one entry per neighbour pair with
  ##  the neighbour as row and the cell as column)
  wumap <- Matrix::sparseMatrix(i = c(unique(nn_umap$cell_ID1_idx), nn_umap$cell_ID2_idx)
                                ,j=c(unique(nn_umap$cell_ID1_idx), nn_umap$cell_ID1_idx)
                                ,x=1)
  
  ## label both dimensions with the cell IDs, ordered by row index
  rownames(wumap) <- colnames(wumap) <- nn_umap[order(nn_umap$cell_ID1_idx),unique(cell_ID1)]
  
  ## Column standardize, so that columns (cells) sum to 1, and each neighbor given equal weight.
  ## NOTE(review): `mumap %*% wumap` left-multiplies by a diagonal matrix, which
  ## scales the ROWS of wumap by 1 / colSums. That matches column scaling only
  ## when all column sums are equal (presumably the case when every cell gets
  ## the same number of neighbours) -- confirm.
  mumap <- Matrix::sparseMatrix(i=1:ncol(wumap)
                                ,j=1:ncol(wumap)
                                ,x=1/Matrix::colSums(wumap)
  )
  dimnames(mumap) <- dimnames(wumap)
  smoother <- mumap %*% wumap
  ## reorder columns so they line up with the cells of `sem`
  smoother <- smoother[,colnames(sem)]
  return(smoother)
}

# Collect the first `top_n` marker genes of every cluster into one character
# vector, e.g. as the feature list for a heatmap or dot plot.
# Starts from a data frame such as the one generated by Seurat::FindAllMarkers().
#
# Arguments:
#   data         Data frame with one row per (gene, cluster) marker.
#   gene_col     Name of the column holding gene names.
#   cluster_col  Name of the column holding cluster labels.
#   top_n        Non-negative number of genes to keep per cluster, taken in the
#                row order of `data`.
#
# Returns an unnamed character vector, clusters in sorted/level order. Missing
# genes (NA and the literal string "NA") are dropped. Genes shared by several
# clusters are NOT de-duplicated here.
extract_and_clean_genes_fx <- function(data, gene_col = "gene", cluster_col = "cluster", top_n = 10) {
  stopifnot(
    is.numeric(top_n), length(top_n) == 1,
    is.finite(top_n), top_n >= 0
  )
  top_n <- as.integer(top_n)

  # Split the genes by cluster directly instead of pasting them into a
  # comma-separated string and splitting again: that round trip broke gene
  # names containing commas and returned one gene per cluster for top_n = 0
  genes <- as.character(data[[gene_col]])
  genes_by_cluster <- split(genes, data[[cluster_col]])

  # head() never pads with NA when a cluster has fewer than top_n genes
  top_genes <- lapply(genes_by_cluster, utils::head, n = top_n)
  genes_unlisted <- unlist(top_genes, use.names = FALSE)
  if (is.null(genes_unlisted)) {
    return(character(0))
  }

  # Remove "NA" strings and actual NA values
  genes_unlisted[!is.na(genes_unlisted) & genes_unlisted != "NA"]
}

# Count the number of cells in each cluster of a SpatialExperiment-like object.
# To prepare for running dittoHeatmap at single cell level
#
# Arguments:
#   spe          Object whose colData() holds the cluster assignment.
#   cluster_col  Name of the colData column with the cluster labels.
#
# Returns a named numeric vector: one entry per distinct cluster label, in
# order of first appearance, named by the label (as character).
subset_and_count_cells_fx <- function(spe, cluster_col) {
  # Work on the label vector only. The counts equal ncol() of the per-cluster
  # subsets, but the (potentially large) object is never copied.
  cluster_labels <- as.character(colData(spe)[[cluster_col]])
  unique_clusters <- unique(cluster_labels)

  cell_counts <- vapply(
    unique_clusters,
    function(cluster) {
      # NA labels cannot be matched with `==`, so they are counted explicitly
      if (is.na(cluster)) {
        as.numeric(sum(is.na(cluster_labels)))
      } else {
        as.numeric(sum(cluster_labels == cluster, na.rm = TRUE))
      }
    },
    numeric(1),
    USE.NAMES = FALSE
  )
  names(cell_counts) <- unique_clusters

  cell_counts
}

# Reorder cell counts to a requested cluster order and return running totals.
# To prepare for running dittoHeatmap at single cell level
#
# Arguments:
#   cell_counts           Named numeric vector of counts per cluster.
#   matching_values_sort  Cluster names in the desired output order; names that
#                         are absent from `cell_counts` yield NA entries.
#
# Returns a vector of cumulative counts in the order of `matching_values_sort`,
# keeping the names of the reordered counts.
sort_and_cumulate_fx <- function(cell_counts, matching_values_sort) {
  # Reorder the counts to follow matching_values_sort (not a numeric sort)
  sorted_cell_counts <- cell_counts[match(matching_values_sort, names(cell_counts))]

  # cumsum() keeps names and handles length 0 and 1, where a `2:length(x)`
  # loop would index out of bounds
  cumsum(sorted_cell_counts)
}

# Draw a radar chart with a fixed house style.
## Adapted from Datanovia: https://www.datanovia.com/en/blog/beautiful-radar-chart-in-r-using-fmsb-and-ggplot-packages/ 
##
## Arguments:
##   data         Data passed as first argument to radarchart().
##   color        Line colour of the polygon; the fill is the same colour at
##                50% opacity.
##   vlabels      Variable labels (defaults to the column names of `data`).
##   vlcex        Size factor of the variable labels.
##   caxislabels  Axis labels, forwarded to radarchart().
##   title        Plot title, forwarded to radarchart().
##   ...          Further arguments forwarded to radarchart().
create_beautiful_radarchart_fx <- function(data, color = "#00AFBB", 
                                           vlabels = colnames(data), vlcex = 0.7,
                                           caxislabels = NULL, title = NULL, ...){
  # Semi-transparent version of the line colour for the polygon fill
  polygon_fill <- scales::alpha(color, 0.5)

  radarchart(
    data,
    axistype = 1,
    # Polygon
    pcol = color,
    pfcol = polygon_fill,
    plwd = 2,
    plty = 1,
    # Grid
    cglcol = "grey",
    cglty = 1,
    cglwd = 0.8,
    # Axis
    axislabcol = "grey",
    # Labels and title
    vlcex = vlcex,
    vlabels = vlabels,
    caxislabels = caxislabels,
    title = title,
    ...
  )
}

# Create (if needed) and return the directory used to store SVG plots.
#
# The directory lives in the current working directory and is named
# "<document>_svg_files_<ts>", where <document> is the R Markdown file being
# knitted (without extension) or "interactive_session" outside of knitr.
#
# Arguments:
#   ts  Suffix appended to the directory name.
#
# Returns the directory path as a character string.
output_dir_svg_plots_fx <- function(ts) {
  # knitr::current_input() is NULL when no document is being knitted
  knitted_doc <- knitr::current_input()
  md_filename <- if (is.null(knitted_doc)) {
    "interactive_session"
  } else {
    tools::file_path_sans_ext(basename(knitted_doc))
  }

  output_dir <- file.path(getwd(), paste0(md_filename, "_svg_files_", ts))

  # Create the directory silently; an existing directory is left untouched
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
  }

  return(output_dir)
}

# Shared ggplot2 components for boxplots: a dodged point layer, the minimal
# theme, and bold text settings. Returns a list that can be added to a plot
# with `+`.
get_common_boxplot_components_fx <- function() {
  # Helper for the repeated bold text elements
  bold_text <- function(size) {
    element_text(size = size, face = "bold")
  }

  point_layer <- geom_point(position = position_dodge2(width = 0.5, padding = 1.5))

  text_theme <- theme(
    axis.text.x = element_text(size = 14, face = "bold", angle = 45, hjust = 1),
    axis.text.y = bold_text(14),
    legend.text = bold_text(14),
    axis.title = bold_text(14),
    legend.title = bold_text(14),
    plot.title = bold_text(16),
    strip.text = bold_text(14)
  )

  list(point_layer, theme_minimal(), text_theme)
}

# Apply the shared UMAP styling to a ggplot object `p`: larger legend symbols
# without a legend title, 16 pt legend/axis text, and a fixed 1:1 aspect ratio.
# Returns the modified plot.
get_common_UMAP_components_fx <- function(p) {
  # Legend: bigger symbols, no title
  colour_legend <- guide_legend(override.aes = list(size = 5), title = NULL)

  # All legend and axis text share the same size
  text_16 <- element_text(size = 16)
  umap_theme <- theme(
    legend.text = text_16,
    legend.key.size = unit(1.5, 'lines'),
    axis.text.x = text_16,
    axis.text.y = text_16,
    axis.title.x = text_16,
    axis.title.y = text_16
  )

  p +
    guides(color = colour_legend) +
    umap_theme +
    coord_fixed(ratio = 1)
}

# Turn arbitrary text into a conservative file name.
# Spaces become underscores; afterwards the listed special characters and any
# remaining character outside letters, digits, underscore and hyphen are
# removed. Vectorised over `filename`.
sanitize_filename_fx <- function(filename) {
  # Ordered (pattern, replacement) rules, applied one after the other
  substitution_rules <- list(
    c(" ", "_"),                    # spaces -> underscores
    c("[/\\:*?\"<>|+]", ""),        # characters that are problematic in paths
    c("[^A-Za-z0-9_\\-]", "")       # anything else that is not alphanumeric, _ or -
  )

  for (rule in substitution_rules) {
    filename <- gsub(rule[1], rule[2], filename)
  }

  filename
}

# Heatmap of a contingency table.
# The table is melted to long format (columns Var1, Var2, value); Var2 is drawn
# on the x axis and Var1 on the y axis, filled from white to blue.
#
# Arguments:
#   tbl   Two-way table (or matrix) to display.
#   xlab  Label of the x axis.
#   ylab  Label of the y axis.
#
# Returns the ggplot object.
plot_conti_ht_fx <- function(tbl, xlab, ylab) {
  long_tbl <- reshape2::melt(tbl)

  tile_mapping <- aes(x = Var2, y = Var1, fill = value)
  rotated_x_labels <- theme(axis.text.x = element_text(angle = 45, hjust = 1))

  heatmap_plot <- ggplot(long_tbl, tile_mapping) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(x = xlab, y = ylab, fill = "prop") +
    theme_minimal() +
    rotated_x_labels

  return(heatmap_plot)
}

# Replace repeated colours in a colour vector so that every entry is unique.
#
# The first occurrence of each colour is kept; later occurrences are replaced
# by colours interpolated from the RColorBrewer "Paired" palette that are not
# already present in the input. Names of `color_vector` are preserved.
#
# Arguments:
#   color_vector  Character vector of colours, named or unnamed.
#
# Returns `color_vector` with duplicates replaced (unchanged when it has none).
replace_duplicate_colors_fx <- function(color_vector) {
  if (anyDuplicated(color_vector) == 0) {
    return(color_vector)
  }

  # Generate a larger set of candidate colours and drop the ones already in use
  candidate_colors <- colorRampPalette(brewer.pal(12, "Paired"))(length(color_vector) * 2)
  remaining_colors <- setdiff(candidate_colors, unique(color_vector))

  # Iterate by position rather than by name: unnamed vectors used to be
  # returned untouched, and duplicated names always hit the first element
  used_colors <- character(0)
  for (idx in seq_along(color_vector)) {
    if (color_vector[idx] %in% used_colors) {
      color_vector[idx] <- remaining_colors[1]
      remaining_colors <- remaining_colors[-1]
    }
    used_colors <- c(used_colors, color_vector[idx])
  }

  color_vector
}

# Generate side by side spatial plots for non-spatial and BANKSY clusters for each ROI
#
# Arguments:
#   spe                Object passed to plotColData(); its colData must provide
#                      the "x" and "y" columns and the columns named in `cnames`.
#   cnames             Character vector: cnames[1] is the non-spatial cluster
#                      column, cnames[2] the BANKSY cluster column.
#   color_vector_nsp   Colour values for the non-spatial clusters.
#   color_vector_bank  Colour values for the BANKSY clusters.
#   cell_type_anno     Label used in the plot titles. When NULL (default) the
#                      variable `cell_type_anno` is looked up in the enclosing
#                      environment, which is what the function did implicitly
#                      before this argument existed.
#
# Returns a list with the two ggplot objects `plot_nsp` and `plot_bank`.
generate_spatial_plots_fx <- function(spe, cnames, color_vector_nsp, color_vector_bank,
                                      cell_type_anno = NULL) {
  # Backward-compatible fallback to the variable defined outside the function
  if (is.null(cell_type_anno)) {
    cell_type_anno <- get("cell_type_anno", envir = parent.env(environment()))
  }

  # Both panels differ only in the colour column, the palette and the title
  make_panel <- function(colour_col, palette, cluster_label) {
    plotColData(spe, x = "x", y = "y", point_size = 0.6, colour_by = colour_col) +
      scale_color_manual(values = palette) +
      labs(title = paste0(cluster_label, " clusters \n", cell_type_anno, " cells"))
  }

  plot_nsp <- make_panel(cnames[1], color_vector_nsp, "Non-spatial")
  plot_bank <- make_panel(cnames[2], color_vector_bank, "BANKSY")

  return(list(plot_nsp = plot_nsp, plot_bank = plot_bank))
}

# asinh-transform values and rescale them to [0, 1] between their 5th and 95th
# percentiles.
#
# Arguments:
#   x         Numeric vector (or matrix) of values.
#   cofactor  Divisor applied before the asinh transformation.
#
# Returns the transformed values, linearly rescaled so that the 5th percentile
# maps to 0 and the 95th to 1, with everything outside clipped to [0, 1].
# NA values are ignored for the percentiles and stay NA in the result.
rescale_quantiles_asinh_fx <- function(x, cofactor = 1) {
  transformed <- asinh(x / cofactor)

  # Both percentile bounds of the transformed data in a single call
  bounds <- quantile(transformed, probs = c(0.05, 0.95), na.rm = TRUE)
  lower <- bounds[1]
  upper <- bounds[2]

  rescaled <- (transformed - lower) / (upper - lower)

  # Clip to the unit interval
  rescaled <- replace(rescaled, rescaled < 0, 0)
  rescaled <- replace(rescaled, rescaled > 1, 1)

  rescaled
}

# Show the distribution of a colData feature as a violin plot next to its
# spatial map (violin : spatial widths = 1 : 2).
#
# Arguments:
#   sfe        Object passed to plotColData() and plotSpatialFeature().
#   feature    Name of the feature to plot.
#   sample_id  Sample selection forwarded to plotSpatialFeature().
#   size       Point size forwarded to plotSpatialFeature().
plot_violin_spatial_fx <- function(sfe, feature,sample_id="all",size=0.5) {
  # Point layer replaced by an empty list so that only the violin is drawn
  no_points <- function(...) list()
  violin_panel <- plotColData(sfe, feature, point_fun = no_points)

  spatial_panel <- plotSpatialFeature(
    sfe, feature,
    colGeometryName = "centroids",
    scattermore = TRUE,
    sample_id = sample_id,
    size = size
  )

  combined <- violin_panel + spatial_panel
  combined + plot_layout(widths = c(1, 2))
}

# Flag the cells of one object that pass the donor-specific QC thresholds.
# Required by apply_thresholds_to_list_fx.
#
# Arguments:
#   sfe              Object with the columns `Mean.DAPI` and `area`
#                    accessible through `$`.
#   donor            Name used to look up this object's thresholds.
#   dapi_thresholds  Named vector of minimum Mean.DAPI values per donor.
#   area_thresholds  Named vector of maximum area values per donor.
#
# Returns `sfe` with the logical column `pass_both_DAPI_area` added: TRUE where
# Mean.DAPI is above its threshold and area is below its threshold.
apply_thresholds_fx <- function(sfe, donor, dapi_thresholds, area_thresholds) {
  # Donor-specific cut-offs, stripped of their names
  dapi_cutoff <- unname(dapi_thresholds[donor])
  area_cutoff <- unname(area_thresholds[donor])

  sfe$pass_both_DAPI_area <- (sfe$Mean.DAPI > dapi_cutoff) & (sfe$area < area_cutoff)

  sfe
}

# Apply the donor-specific QC thresholds to every object of a named list.
# The list names are used as donor identifiers for the threshold lookup done
# by apply_thresholds_fx. Returns the list with each element updated.
apply_thresholds_to_list_fx <- function(sfe_list, dapi_thresholds, area_thresholds) {
  donor_ids <- names(sfe_list)

  for (idx in seq_along(sfe_list)) {
    sfe_list[[idx]] <- apply_thresholds_fx(
      sfe_list[[idx]],
      donor_ids[idx],
      dapi_thresholds,
      area_thresholds
    )
  }

  sfe_list
}

# Remove control / non-coding features from an object based on its row names.
#
# Arguments:
#   sfe       Object with row names (e.g. a SpatialFeatureExperiment).
#   patterns  Regular expressions; a row is removed when its name matches any
#             of them. The defaults target names starting with "Neg" or
#             "SystemControl" (generated by CosMx).
#
# Returns the object without the matching rows, or the original object when
# nothing matched. A message reports the number of rows before and after.
exclude_noncoding_genes_fx <- function(sfe, patterns = c("^Neg", "^SystemControl")) {
  gene_names <- rownames(sfe)
  if (is.null(gene_names)) {
    stop("'sfe' does not have row names. Please assign row names before using this function.")
  }

  n_before <- nrow(sfe)

  # One alternation pattern covering all exclusion patterns
  is_excluded <- str_detect(gene_names, paste(patterns, collapse = "|"))
  n_excluded <- sum(is_excluded)

  # Nothing to remove: report and hand back the input unchanged
  if (n_excluded == 0) {
    message(sprintf(
      "No genes matching the specified patterns were found in 'sfe'.\n- Genes before: %d\n- Genes after: %d",
      n_before,
      n_before
    ))
    return(sfe)
  }

  sfe_kept <- sfe[!is_excluded, ]

  message(sprintf(
    "Excluding non-coding genes:\n- Genes before exclusion: %d\n- Genes excluded: %d\n- Genes after exclusion: %d",
    n_before,
    n_excluded,
    nrow(sfe_kept)
  ))

  return(sfe_kept)
}

# Check (and optionally fix) colData columns that are likely to cause trouble
# when the DataFrame is converted to a base R data frame.
#
# A column is reported when it is a list, is an S4 object, or contains S4
# elements.
#
# Arguments:
#   s4_object     S4 object with a 'colData' slot (e.g. SingleCellExperiment).
#   auto_convert  If TRUE, list columns whose elements all have length 1 are
#                 flattened to vectors; other reported columns are dropped.
#   verbose       If TRUE, progress and summary messages are emitted.
#
# Returns a list with:
#   problematic_cols  Named list: reported column -> its class vector.
#   coldata_fixed     The (possibly modified) colData.
check_coldata_compatibility_fx <- function(
    s4_object,
    auto_convert = FALSE,
    verbose = TRUE
) {
  # Emit a message only in verbose mode
  inform <- function(...) {
    if (verbose) {
      message(...)
    }
  }

  if (!("colData" %in% slotNames(s4_object))) {
    stop("The provided object does not have a 'colData' slot.")
  }

  col_data <- colData(s4_object)
  if (is.null(col_data)) {
    stop("The 'colData' slot is NULL.")
  }

  problematic_cols <- list()

  # One flag per original column: TRUE when the column has to be dropped
  cols_to_drop <- setNames(logical(length = ncol(col_data)), colnames(col_data))

  for (col_name in colnames(col_data)) {
    column_data <- col_data[[col_name]]

    needs_attention <- is.list(column_data) ||
      isS4(column_data) ||
      any(sapply(column_data, isS4))
    if (!needs_attention) {
      next
    }

    problematic_cols[[col_name]] <- class(column_data)
    if (!auto_convert) {
      next
    }

    # Only list columns made of single-element entries can be flattened
    can_flatten <- is.list(column_data) && all(sapply(column_data, length) == 1)
    if (can_flatten) {
      col_data[[col_name]] <- unlist(column_data)
      inform(sprintf("Auto-converted column '%s' from list to vector.", col_name))
    } else {
      cols_to_drop[col_name] <- TRUE
      inform(sprintf("Column '%s' could not be auto-converted and will be dropped.", col_name))
    }
  }

  # Remove the columns that could not be converted
  if (auto_convert && any(cols_to_drop)) {
    col_data <- col_data[, !cols_to_drop, drop = FALSE]
    inform("Dropped columns: ", paste(names(cols_to_drop)[cols_to_drop], collapse = ", "))
  }

  # Summary of the findings
  if (length(problematic_cols) == 0) {
    inform("✅ No compatibility issues detected in 'colData'.")
  } else if (verbose) {
    message("⚠️ The following columns in 'colData' may cause compatibility issues:")
    for (col in names(problematic_cols)) {
      message(sprintf("- '%s' (class: %s)", col, paste(problematic_cols[[col]], collapse = ", ")))
    }
    if (!auto_convert) {
      message("Consider converting these columns to atomic vectors or handling them before conversion.")
    }
  }

  return(list(
    problematic_cols = problematic_cols,
    coldata_fixed = col_data
  ))
}

# Inspect the distribution of total counts per cell and report the percentile
# that corresponds to a given floor value.
#
# Parameters:
# - counts_matrix: Matrix of counts (genes x cells). Dense matrices, sparse
#   Matrix-package objects and anything coercible with as.matrix() are accepted.
# - floor_value: Numeric. The floor value to use for normalization (default: 20).
# - plot: Logical. If TRUE, generates plots and includes them in the output (default: TRUE).
# - binwidth: Numeric. Width of histogram bins. If NULL, the plotted range is
#   divided into 30 bins.
# - floor_label: Character. Label for the floor value in plots. If NULL, defaults to "Floor Value: [floor_value]".
# - log_scale: Logical. If TRUE, plots the total counts on a log scale (default: FALSE).
# - limit_upper_percentile: Logical. If TRUE, limits the plots to the 95th percentile of total counts (default: FALSE).
#
# Returns:
# - A list containing:
#   - floor_value: The floor value used.
#   - floor_percentile: The percentile corresponding to the floor value (as a percentage).
#   - plots: A list with the three ggplot objects (histogram, boxplot, cdf_plot);
#     all three are NULL when plot = FALSE.
#
# Side effects: installs ggplot2 when it is missing and attaches it.
inspect_floor_threshold_fx <- function(counts_matrix,
                                       floor_value = 20,
                                       plot = TRUE,
                                       binwidth = NULL,
                                       floor_label = NULL,
                                       log_scale = FALSE,
                                       limit_upper_percentile = FALSE) {
  # Load required package
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    install.packages("ggplot2")
  }
  library(ggplot2)

  # 1. Calculate total counts per cell.
  #    Sparse Matrix-package objects are summed directly: converting a large
  #    genes x cells matrix to a dense matrix first can exhaust memory.
  if (inherits(counts_matrix, "Matrix")) {
    total_counts <- Matrix::colSums(counts_matrix)
  } else {
    if (!is.matrix(counts_matrix)) {
      counts_matrix <- as.matrix(counts_matrix)
    }
    total_counts <- colSums(counts_matrix)
  }

  # 2. Create a data frame for plotting
  counts_df <- data.frame(TotalCounts = total_counts)

  # 3. Determine the percentile corresponding to the floor value
  min_total <- min(total_counts)
  max_total <- max(total_counts)

  if (floor_value < min_total) {
    warning("The floor_value is less than the minimum total count. Setting floor_percentile to 0.")
    floor_percentile <- 0
  } else if (floor_value > max_total) {
    warning("The floor_value is greater than the maximum total count. Setting floor_percentile to 100.")
    floor_percentile <- 100
  } else {
    # Share of cells with total counts <= floor_value, as a percentage
    floor_percentile <- ecdf(total_counts)(floor_value) * 100
  }

  # 4. Prepare floor_label if NULL
  if (is.null(floor_label)) {
    floor_label <- paste0("Floor Value: ", floor_value)
  }

  # Initialize plot variables
  p1 <- p2 <- p3 <- NULL

  # 5. Generate plots with threshold lines if requested
  if (plot) {
    # Optionally limit the data to the 95th percentile
    if (limit_upper_percentile) {
      upper_limit <- quantile(total_counts, 0.95)
      counts_df_plot <- counts_df[counts_df$TotalCounts <= upper_limit, , drop = FALSE]
    } else {
      counts_df_plot <- counts_df
    }

    # Calculate default binwidth if not provided
    if (is.null(binwidth)) {
      binwidth <- (max(counts_df_plot$TotalCounts) - min(counts_df_plot$TotalCounts)) / 30
      # All totals identical gives a zero range; fall back to a unit bin so
      # the histogram can still be drawn
      if (!is.finite(binwidth) || binwidth <= 0) {
        binwidth <- 1
      }
    }

    # a. Histogram
    p1 <- ggplot(counts_df_plot, aes(x = TotalCounts)) +
      geom_histogram(binwidth = binwidth,
                     fill = "steelblue",
                     color = "black",
                     alpha = 0.7) +
      geom_vline(xintercept = floor_value,
                 color = "red",
                 linetype = "dashed",
                 size = 1) +
      labs(title = "Histogram of Total Counts per Cell",
           x = "Total Counts",
           y = "Number of Cells") +
      annotate("text",
               x = floor_value,
               y = Inf,
               label = floor_label,
               vjust = -0.5,
               hjust = 1.1,
               color = "red",
               angle = 90,
               size = 3.5) +
      theme_minimal()

    # Apply log scale if requested
    if (log_scale) {
      p1 <- p1 + scale_x_log10() + labs(x = "Total Counts (log scale)")
    }

    # b. Boxplot
    p2 <- ggplot(counts_df_plot, aes(y = TotalCounts)) +
      geom_boxplot(fill = "tomato",
                   color = "black",
                   alpha = 0.7) +
      geom_hline(yintercept = floor_value,
                 color = "red",
                 linetype = "dashed",
                 size = 1) +
      labs(title = "Boxplot of Total Counts per Cell",
           y = "Total Counts") +
      annotate("text",
               x = 1.05,
               y = floor_value,
               label = floor_label,
               color = "red",
               vjust = -0.5,
               hjust = 0,
               size = 3.5) +
      theme_minimal()

    # Apply log scale if requested
    if (log_scale) {
      p2 <- p2 + scale_y_log10() + labs(y = "Total Counts (log scale)")
    }

    # c. CDF Plot (always built from the full data; the upper limit is applied
    #    as a zoom further down)
    p3 <- ggplot(counts_df, aes(x = TotalCounts)) +
      stat_ecdf(geom = "step",
                color = "purple") +
      geom_vline(xintercept = floor_value,
                 color = "red",
                 linetype = "dashed",
                 size = 1) +
      labs(title = "Cumulative Distribution of Total Counts per Cell",
           x = "Total Counts",
           y = "Proportion of Cells") +
      annotate("text",
               x = floor_value,
               y = 1.05,
               label = floor_label,
               color = "red",
               vjust = 0,
               hjust = 1.1,
               size = 3.5) +
      theme_minimal()

    # Apply log scale and limit x-axis if requested
    if (log_scale) {
      p3 <- p3 + scale_x_log10() + labs(x = "Total Counts (log scale)")
    }
    if (limit_upper_percentile) {
      p3 <- p3 + coord_cartesian(xlim = c(min_total, upper_limit))
    }
  }

  # 6. Return results as a list, including plots
  return(list(
    floor_value = floor_value,
    floor_percentile = floor_percentile,
    plots = list(
      histogram = p1,
      boxplot = p2,
      cdf_plot = p3
    )
  ))
}

# Build a cell-type annotation prompt from marker genes.
#
# Arguments:
#   marker_genes  Data frame with the columns 'gene' and 'cluster'.
#   organism      Organism name inserted into the prompt.
#   organ         Organ name inserted into the prompt.
#
# Returns a list with:
#   message                Prompt text: an instruction line, one line of
#                          comma-separated marker genes per cluster, and two
#                          closing instruction lines.
#   cell_types_annotation  Character vector of the per-cluster gene strings,
#                          named by cluster.
#
# Side effect: attaches dplyr.
create_cell_types_annotation_fx <- function(marker_genes, organism, organ) {
  library(dplyr)

  # Both columns are needed below
  missing_columns <- setdiff(c("gene", "cluster"), colnames(marker_genes))
  if (length(missing_columns) > 0) {
    stop("The data frame 'marker_genes' must contain 'gene' and 'cluster' columns.")
  }

  # Treat the cluster as a factor so that grouping follows its levels
  marker_genes$cluster <- as.factor(marker_genes$cluster)

  # One comma-separated gene string per cluster, ordered by numeric cluster id
  by_cluster <- group_by(marker_genes, cluster)
  genes_per_cluster <- summarise(by_cluster, genes = paste(gene, collapse = ","))
  cell_types_annotation <- arrange(genes_per_cluster, as.numeric(as.character(cluster)))

  # Gene strings named by cluster
  cell_types_annotation_vector <- cell_types_annotation$genes
  names(cell_types_annotation_vector) <- cell_types_annotation$cluster

  # Prompt = header + one gene list per line + footer
  prompt_header <- paste(
    "Identify cell types of", organism, organ, "cells using the following markers.",
    "Identify one cell type for each row. Only provide the cell type name."
  )
  prompt_footer <- c(
    "Some can be a mixture of multiple cell types.",
    "Return this as a named vector called cell_types_annotation in R with the names = indices."
  )
  final_message <- paste(
    c(prompt_header, cell_types_annotation$genes, prompt_footer),
    collapse = "\n"
  )

  return(list(
    message = final_message,
    cell_types_annotation = cell_types_annotation_vector
  ))
}


#' Find Markers Using Custom BANKSY Function
#'
#' This function identifies marker genes using the Seurat `FindAllMarkers` function
#' on Seurat objects derived from a SingleCellExperiment (SCE) object,
#' including BANKSY and smoothed matrices. As a by-product, the BANKSY matrix and
#' the smoothed matrices are stored as new assays of the returned object.
#'
#' @param spe A `SingleCellExperiment` object containing the data. Its
#'        `metadata()` must hold an entry named
#'        `BANKSY_params_level_<level_anno>_repeat_<repeat_anno>` with the
#'        elements `rdnames_spe` and `cnames_spe`.
#' @param matrix_used A character string specifying which data matrix to use.
#'        Possible values are:
#'        - `"normcounts"`: Use the normalized counts.
#'        - `"banksy"`: Use the BANKSY matrix.
#'        - `"banksy_smooth"`: Use the smoothed BANKSY matrix.
#'        - `"normcounts_smooth"`: Use the smoothed normalized counts.
#'        Default is `"normcounts"`. The value is validated before any
#'        computation starts.
#' @param ident_banksy Logical value indicating whether to use BANKSY clusters (`TRUE`) or
#'        non-spatial clusters (`FALSE`) for cell identities. Default is `TRUE`.
#' @param level_anno Character or numeric value specifying the level annotation.
#'        When `NULL` (default), a variable named `level_anno` is looked up
#'        starting from the calling environment.
#' @param repeat_anno Character or numeric value specifying the repeat annotation.
#'        When `NULL` (default), a variable named `repeat_anno` is looked up
#'        starting from the calling environment.
#'
#' @return A named list containing:
#' \describe{
#'   \item{sce}{The updated `SingleCellExperiment` object with the assays
#'     `BANKSY_matrix`, `BANKSY_matrix_smooth` and `normcounts_smooth` added.}
#'   \item{markers}{A data frame containing all marker genes identified by `FindAllMarkers`.}
#' }
#'
#' @details
#' The function performs the following steps:
#' 1. Validates `matrix_used` and resolves `level_anno` / `repeat_anno`.
#' 2. Constructs the `name_level_repeat` identifier and extracts the metadata
#'    (`rdnames_spe`, `cnames_spe`) from `spe`.
#' 3. Determines the cell identity column and UMAP pattern based on `ident_banksy`.
#' 4. Creates Seurat objects using different data matrices.
#' 5. Generates a smoother based on UMAP embeddings (`umap_nn_fx`, 500 neighbours).
#' 6. Applies smoothing to create smoothed assays.
#' 7. Selects the appropriate Seurat object based on `matrix_used`.
#' 8. Identifies marker genes using `FindAllMarkers` (`slot = "data"`,
#'    positive markers only, `min.pct = 0.1`).
#' 9. Returns the updated SCE object and marker genes.
#'
#' @examples
#' # Define level and repeat annotations
#' level_anno <- "1"
#' repeat_anno <- "1"
#'
#' # Run the function with specified parameters
#' results <- find_markers_banksy_custom_fx(
#'   spe = spe,
#'   matrix_used = "banksy",
#'   ident_banksy = TRUE,
#'   level_anno = level_anno,
#'   repeat_anno = repeat_anno
#' )
#'
#' # Access the updated SCE object
#' updated_spe <- results$sce
#'
#' # View the marker genes
#' head(results$markers)
#'
#' @export
find_markers_banksy_custom_fx <- function(spe,
                                          matrix_used = "normcounts",
                                          ident_banksy = TRUE,
                                          level_anno = NULL,
                                          repeat_anno = NULL) {
  # Step 0a: Fail fast on an unsupported matrix choice, before any expensive work
  valid_matrices <- c("normcounts", "normcounts_smooth", "banksy", "banksy_smooth")
  if (!(is.character(matrix_used) && length(matrix_used) == 1 &&
        matrix_used %in% valid_matrices)) {
    stop(
      "Invalid matrix_used. Must be one of 'normcounts', 'banksy', 'banksy_smooth', 'normcounts_smooth'."
    )
  }

  # Step 0b: Resolve the annotations. The former defaults `level_anno = level_anno`
  # and `repeat_anno = repeat_anno` were self-referencing promises and raised
  # "promise already under evaluation" whenever the argument was omitted.
  caller_env <- parent.frame()
  if (is.null(level_anno)) {
    level_anno <- get("level_anno", envir = caller_env)
  }
  if (is.null(repeat_anno)) {
    repeat_anno <- get("repeat_anno", envir = caller_env)
  }

  # Step 1: Construct name_level_repeat
  name_level_repeat <- paste0("BANKSY_params_level_", level_anno, "_repeat_", repeat_anno)

  # Step 2: Extract metadata
  banksy_meta <- metadata(spe)[[name_level_repeat]]
  if (is.null(banksy_meta)) {
    stop("No metadata entry named '", name_level_repeat, "' found in 'spe'.")
  }
  rdnames_spe <- banksy_meta$rdnames_spe
  cnames_spe <- banksy_meta$cnames_spe

  # Step 3: Determine identity column and UMAP pattern
  # NOTE(review): the same UMAP pattern (lambda > 0 embeddings) is used for the
  # BANKSY and the non-spatial case -- confirm this is intended.
  umap_pattern <- "^UMAP_M\\d+_lam0\\.\\d+$"
  if (ident_banksy) {
    ident_column <- cnames_spe[2]
    message(
      "BANKSY clusters were used. The cluster variable is: ",
      ident_column,
      ". The reduced dim variable is: ",
      umap_pattern
    )
  } else {
    ident_column <- cnames_spe[1]
    message(
      "Non-spatial clusters were used. The cluster variable is: ",
      ident_column,
      ". The reduced dim variable is: ",
      umap_pattern
    )
  }

  # Step 4: Create initial Seurat object with normalized counts
  seurat <- CreateSeuratObject(counts = assays(spe)[["normcounts"]], meta.data = data.frame(colData(spe)[, cnames_spe]))
  Idents(seurat) <- seurat[[ident_column]][, 1]
  seurat@assays$RNA$data <- log1p(seurat@assays$RNA$counts) # Fill the data slot with the log counts

  # Step 5: Create BANKSY Seurat object.
  # The BANKSY matrix is computed once and reused for the assay in step 8.
  # NOTE(review): only scale.data differs from `seurat`, while FindAllMarkers
  # below reads slot = "data"; "banksy" therefore appears to test the same
  # values as "normcounts" -- confirm.
  banksy_matrix <- getBanksyMatrix(spe, M = 1, lambda = 0.2)
  seurat_bank <- seurat
  seurat_bank@assays$RNA$scale.data <- banksy_matrix

  # Step 6: Find UMAP embedding based on the pattern
  umap_name <- grep(umap_pattern, rdnames_spe, value = TRUE)
  if (length(umap_name) == 0) {
    stop("No UMAP dimension found matching pattern: ", umap_pattern)
  }
  if (length(umap_name) > 1) {
    stop(
      "More than one UMAP dimension matches pattern ", umap_pattern, ": ",
      paste(umap_name, collapse = ", ")
    )
  }
  seurat@reductions[[umap_name]] <- CreateDimReducObject(embeddings = reducedDims(spe)[[umap_name]],
                                                         key = "UMAP_",
                                                         assay = "RNA")

  # Step 7: Create smoother using UMAP embeddings
  smoother <- umap_nn_fx(seurat, umap_name, n_neighbors = 500)

  # Step 8: Add smoothed assays to the SCE object
  # (only the first nrow(spe) rows of the BANKSY matrix are stored as an assay)
  assays(spe, withDimnames = FALSE)[["BANKSY_matrix"]] <- banksy_matrix[seq_len(nrow(spe)), , drop = FALSE]
  assays(spe, withDimnames = FALSE)[["BANKSY_matrix_smooth"]] <- assays(spe)[["BANKSY_matrix"]] %*% smoother
  assays(spe, withDimnames = FALSE)[["normcounts_smooth"]] <- assays(spe)[["normcounts"]] %*% smoother

  # Step 9: Create smoothed Seurat objects
  seurat_bank_smooth <- CreateSeuratObject(counts = assays(spe)[["BANKSY_matrix_smooth"]], meta.data = data.frame(colData(spe)[, cnames_spe]))
  seurat_bank_smooth@assays$RNA$scale.data <- seurat_bank_smooth@assays$RNA$counts
  seurat_bank_smooth@assays$RNA$data <- log1p(seurat_bank_smooth@assays$RNA$counts)
  Idents(seurat_bank_smooth) <- seurat_bank_smooth[[ident_column]][, 1]

  seurat_smooth <- CreateSeuratObject(counts = assays(spe)[["normcounts_smooth"]], meta.data = data.frame(colData(spe)[, cnames_spe]))
  Idents(seurat_smooth) <- seurat_smooth[[ident_column]][, 1]
  seurat_smooth@assays$RNA$data <- log1p(seurat_smooth@assays$RNA$counts)

  # Step 10: Select the appropriate Seurat object based on matrix_used
  # (matrix_used was validated at the top, so one branch always matches)
  seurat_obj <- switch(
    matrix_used,
    "normcounts" = seurat,
    "normcounts_smooth" = seurat_smooth,
    "banksy" = seurat_bank,
    "banksy_smooth" = seurat_bank_smooth
  )

  # Step 11: Identify markers using FindAllMarkers
  markers_seurat <- FindAllMarkers(
    object = seurat_obj,
    slot = "data",
    only.pos = TRUE,
    min.pct = 0.1
  )

  # Step 12: Clean up Seurat objects to free up resources
  rm(seurat,
     seurat_bank,
     seurat_bank_smooth,
     seurat_smooth,
     seurat_obj)
  gc()  # Suggest garbage collection to free memory

  # Step 13: Return the updated SCE object and marker genes as a list
  return(list(sce = spe, markers = markers_seurat))
}

#' Configure the future backend, progress handler and globals size limit
#'
#' Registers a progressr handler, selects a `future` plan for the current
#' operating system (multisession on Windows, multicore elsewhere) and sets
#' `options(future.globals.maxSize)` to a fraction of the detected system
#' memory (physical + swap; physical only on Windows).
#'
#' @param memory_fraction Single positive number; fraction of the detected
#'   memory allowed for exported globals. Default is 0.8.
#' @param handler Name of the progressr handler to register, or `NULL` to
#'   leave the handlers untouched. Default is "txtprogressbar".
#' @param workers Single positive number of workers, capped at
#'   `future::availableCores()`. `NULL` uses all available cores minus one
#'   (but at least one).
#'
#' @return `NULL`, invisibly. Called for its side effects: attaches `future`
#'   and `progressr`, sets the plan and handlers, and (when the system memory
#'   can be determined) sets `future.globals.maxSize`.
plan_future_fx <- function(memory_fraction = 0.8,
                           handler = "txtprogressbar",
                           workers = NULL) {
  # Validate arguments first so bad input fails before any global state
  # (handlers, plan, options) is modified.
  if (!is.numeric(memory_fraction) || length(memory_fraction) != 1 ||
      is.na(memory_fraction) || memory_fraction <= 0) {
    stop("'memory_fraction' must be a single positive number.")
  }
  if (!is.null(workers) &&
      (!is.numeric(workers) || length(workers) != 1 ||
       is.na(workers) || workers < 1)) {
    stop("'workers' must be a single positive integer.")
  }
  
  # Load required libraries
  if (!requireNamespace("future", quietly = TRUE)) {
    stop("Package 'future' is required but not installed.")
  }
  if (!requireNamespace("progressr", quietly = TRUE)) {
    stop("Package 'progressr' is required but not installed.")
  }
  
  # Kept from the original implementation: callers may rely on both packages
  # being attached after this function has run.
  library(future)
  library(progressr)
  
  # ---- Local helpers -------------------------------------------------------
  
  # Run a shell command and return its output lines; a command that cannot be
  # launched yields character(0) instead of an error.
  run_cmd <- function(cmd) {
    tryCatch(system(cmd, intern = TRUE), error = function(e) character(0))
  }
  
  # Keep only the digits of the first element of `txt`; NA if there are none.
  digits_of <- function(txt) {
    if (length(txt) == 0 || is.na(txt[1])) {
      return(NA_real_)
    }
    digits <- gsub("[^0-9]", "", txt[1])
    if (nzchar(digits)) as.numeric(digits) else NA_real_
  }
  
  # Second whitespace-separated field of a `free -b` line (the "total" column).
  free_total <- function(line) {
    if (length(line) == 0) {
      return(NA_real_)
    }
    fields <- strsplit(trimws(line[1]), "\\s+")[[1]]
    if (length(fields) < 2 || !grepl("^[0-9]+$", fields[2])) {
      return(NA_real_)
    }
    as.numeric(fields[2])
  }
  
  # ---- Progress handler ----------------------------------------------------
  
  if (!is.null(handler)) {
    progressr::handlers(handler)  # Register the specified handler
    message(sprintf("Using '%s' as the progress handler.", handler))
  }
  
  # ---- Environment report --------------------------------------------------
  
  os_type <- .Platform$OS.type  # "windows" or "unix"
  message(sprintf("Operating system type: %s", os_type))
  
  in_rstudio <- FALSE
  if (requireNamespace("rstudioapi", quietly = TRUE)) {
    in_rstudio <- rstudioapi::isAvailable()
  }
  message(sprintf("Running in RStudio: %s", in_rstudio))
  
  # ---- Number of workers ---------------------------------------------------
  
  if (is.null(workers)) {
    available_workers <- future::availableCores() - 1  # Use all cores minus one
    available_workers <- max(1, available_workers)    # Ensure at least one worker
    message(sprintf("Number of workers not specified. Using available workers minus one: %d", available_workers))
  } else {
    available_workers <- min(as.integer(workers), future::availableCores())
    message(sprintf("Using specified number of workers: %d", available_workers))
  }
  
  # ---- Parallel plan -------------------------------------------------------
  
  if (os_type == "windows") {
    future::plan("multisession", workers = available_workers)
    message("Using multisession for parallel processing on Windows.")
  } else {
    # NOTE(review): the original deliberately uses multicore even inside
    # RStudio. Forked processing may be disabled there by 'future' itself -
    # confirm that the resulting fallback is acceptable.
    future::plan("multicore", workers = available_workers)
    message("Using multicore for parallel processing on Unix-like OS (e.g., Ubuntu).")
  }
  
  # ---- Total system memory -------------------------------------------------
  
  total_physical_bytes <- NA_real_
  total_swap_bytes <- 0
  
  sysname <- Sys.info()["sysname"]
  
  if (sysname == "Windows") {
    # First output line containing digits is the value (line 1 is the header).
    wmic_out <- run_cmd("wmic computersystem get TotalPhysicalMemory")
    total_physical_bytes <- digits_of(grep("[0-9]", wmic_out, value = TRUE))
    if (!is.na(total_physical_bytes)) {
      message("Retrieved total physical memory for Windows.")
    }
    # Swap is intentionally not included on Windows (total_swap_bytes stays 0).
    
  } else if (sysname == "Darwin") {
    total_physical_bytes <- digits_of(run_cmd("sysctl -n hw.memsize"))
    if (!is.na(total_physical_bytes)) {
      message("Retrieved total physical memory for macOS.")
    }
    
    # Rough swap estimate from vm_stat, same formula as before:
    # (compressor pages + free pages) * page size.
    # The previous code removed only the dots from these lines, so
    # as.numeric() always returned NA; extract the digits instead.
    vm_stat <- run_cmd("vm_stat")
    page_size_line <- grep("page size of", vm_stat, value = TRUE)
    page_size <- digits_of(
      sub(".*page size of ([0-9]+) bytes.*", "\\1", page_size_line)
    )
    swap_pages_used <- digits_of(
      grep("Pages occupied by compressor", vm_stat, value = TRUE)
    )
    swap_pages_free <- digits_of(grep("Pages free", vm_stat, value = TRUE))
    
    total_swap_bytes <- (swap_pages_used + swap_pages_free) * page_size
    if (!is.na(total_swap_bytes)) {
      message("Retrieved total swap memory for macOS (estimated).")
    }
    
  } else {
    # Assume Linux (e.g., Ubuntu)
    total_physical_bytes <- free_total(run_cmd("free -b | grep Mem"))
    if (!is.na(total_physical_bytes)) {
      message("Retrieved total physical memory for Linux.")
    }
    
    total_swap_bytes <- free_total(run_cmd("free -b | grep Swap"))
    if (!is.na(total_swap_bytes)) {
      message("Retrieved total swap memory for Linux.")
    }
  }
  
  # Swap is optional: fall back to physical memory only when it is unknown.
  if (is.na(total_swap_bytes)) {
    message("Could not determine swap memory; using physical memory only.")
    total_swap_bytes <- 0
  }
  
  # Without a physical memory figure there is nothing sensible to set; leave
  # the option untouched rather than writing NA into it.
  if (is.na(total_physical_bytes) || total_physical_bytes <= 0) {
    warning(
      "Could not determine total physical memory; ",
      "'future.globals.maxSize' was left unchanged."
    )
    return(invisible(NULL))
  }
  
  # ---- Globals size limit --------------------------------------------------
  
  total_memory_bytes <- total_physical_bytes + total_swap_bytes
  max_size_bytes <- total_memory_bytes * memory_fraction
  
  options(future.globals.maxSize = max_size_bytes)
  
  # Convert to MiB for easier readability
  max_size_mib <- max_size_bytes / (1024 ^ 2)
  message(sprintf("Setting future.globals.maxSize to %.2f MiB", max_size_mib))
  
  invisible(NULL)
}


#' Print, for each higher-level annotation, the lower-level annotations found
#'
#' For every unique value of `higher_level` (including `NA`), prints the
#' unique values of `lower_level` observed in the same cells. Output is
#' written with `cat()`/`print()`.
#'
#' @param higher_level Name of the coarser annotation column.
#' @param lower_level Name of the finer annotation column.
#' @param spe Object holding the annotations: a SummarizedExperiment-derived
#'   object (columns are looked up in `colData(spe)`) or a data-frame-like
#'   object. When `NULL` (default), the object named `spe` visible from the
#'   environment in which this function was defined is used, as before.
#'
#' @return `NULL`, invisibly.
check_annotations_between_levels_fx <- function(higher_level, lower_level, spe = NULL) {
  if (is.null(spe)) {
    # Same lexical lookup the function performed when `spe` was a free variable.
    spe <- get("spe", envir = parent.env(environment()))
  }
  
  # On SummarizedExperiment-like objects colnames() returns the cell IDs, not
  # the annotation columns, so the annotations have to be read from colData().
  available_cols <- if (inherits(spe, "SummarizedExperiment")) {
    colnames(colData(spe))
  } else {
    colnames(spe)
  }
  
  # Check if higher_level and lower_level exist in 'spe'
  if (!(higher_level %in% available_cols)) {
    cat("### Higher level '", higher_level, "' does not exist in 'spe'.\n\n", sep = "")
    return(invisible(NULL))
  }
  if (!(lower_level %in% available_cols)) {
    cat("### Lower level '", lower_level, "' does not exist in 'spe'.\n\n", sep = "")
    return(invisible(NULL))
  }
  
  higher_values <- spe[[higher_level]]
  lower_values <- spe[[lower_level]]
  
  for (anno in unique(higher_values)) {
    # `x == NA` is NA, which made `if (any(idx))` fail whenever the column
    # contained NA; select the NA group with is.na() instead.
    idx <- if (is.na(anno)) {
      is.na(higher_values)
    } else {
      !is.na(higher_values) & higher_values == anno
    }
    
    if (any(idx)) {
      associated_lower <- unique(lower_values[idx])
      cat("### Cells with ", higher_level, " annotation '", anno, "' have the following ", lower_level, " annotations:\n", sep = "")
      print(associated_lower)
      cat("\n")
    } else {
      cat("### No cells with ", higher_level, " annotation '", anno, "' found.\n\n", sep = "")
    }
  }
  
  invisible(NULL)
}

#' Apply a list of annotation replacements and report the affected levels
#'
#' For each replacement, cells whose `level_to_modify_col` equals `old_value`
#' are relabelled to `new_value`. Optionally, the levels that come before the
#' replacement's target level in `level_names` are overwritten for those cells
#' with the values given in `new_values_lv`. Annotations before and after the
#' change are printed for every level.
#'
#' @param spe Object whose annotation columns are read and written with
#'   `spe[[column]]` (e.g. a SpatialExperiment).
#' @param replacements List of replacements; each element is a list with
#'   `old_value`, `new_value`, `level` (an entry of `level_names`) and,
#'   optionally, `new_values_lv` (a list named `lv1`, `lv2`, ... giving the new
#'   value for the level at that position in `level_names`).
#' @param level_names Character vector of annotation column names, ordered
#'   from the first to the last level.
#' @param level_to_modify_col Name of the column in which `old_value` is
#'   replaced by `new_value`.
#'
#' @return `spe` with the updated annotation columns.
perform_replacements_fx <- function(spe, replacements, level_names, level_to_modify_col) {
  # Overwrite `values[hits]` with `new_label`. Factors get the new level added
  # first; `ifelse()` (used previously) returned the integer codes of a factor
  # and turned every NA test into NA.
  set_where <- function(values, hits, new_label) {
    if (length(hits) == 0) {
      return(values)
    }
    if (is.factor(values) && !(new_label %in% levels(values))) {
      levels(values) <- c(levels(values), new_label)
    }
    values[hits] <- new_label
    values
  }
  
  for (replacement in replacements) {
    # Extract replacement details
    old_value <- replacement$old_value
    new_value <- replacement$new_value
    target_level <- replacement$level
    new_values_lv <- replacement$new_values_lv
    
    cat("\n----------------------------------------\n")
    cat(sprintf("Processing replacement: '%s' -> '%s' at '%s'\n", old_value, new_value, target_level))
    
    # which() drops NA comparisons, so cells without an annotation are never
    # selected and never produce an NA subscript.
    old_hits <- which(spe[[level_to_modify_col]] == old_value)
    
    # Display current annotations corresponding to old_value. Reading the
    # column and indexing it avoids subsetting the whole object per level.
    for (i in seq_along(level_names)) {
      current_level <- level_names[i]
      current_annotations <- unique(spe[[current_level]][old_hits])
      
      cat(sprintf("\nLevel %d annotation corresponding to value to change (%s):\n", 
                  i, current_level))
      print(current_annotations)
    }
    
    # Replace old_value with new_value in level_to_modify_col
    spe[[level_to_modify_col]] <- set_where(spe[[level_to_modify_col]], old_hits, new_value)
    
    # Verify if replacement was successful
    if (!(new_value %in% spe[[level_to_modify_col]])) {
      warning(
        sprintf(
          "Replacement failed: '%s' was not found in '%s' after replacement.",
          new_value,
          level_to_modify_col
        )
      )
      next  # Skip to the next replacement
    }
    
    # Determine target level index (a missing `level` entry is treated like an
    # unknown level instead of crashing the condition below)
    target_index <- if (is.null(target_level)) NA_integer_ else match(target_level, level_names)[1]
    if (is.na(target_index)) {
      warning(sprintf("Target level '%s' not found in level_names. Skipping this replacement.",
                      paste(target_level, collapse = ", ")))
      next
    }
    
    new_hits <- which(spe[[level_to_modify_col]] == new_value)
    
    # Update the levels that come before the target level, if new values were
    # provided for them
    for (i in seq_len(target_index - 1)) {
      current_level <- level_names[i]
      lv_name <- paste0("lv", i)
      
      if (!is.null(new_values_lv) && lv_name %in% names(new_values_lv)) {
        new_level_value <- new_values_lv[[lv_name]]
        
        # Debugging: Print the condition being applied
        cat(sprintf("\nApplying condition: %s == '%s'\n", level_to_modify_col, new_value))
        
        # Only the selected cells are touched; all others keep their value
        spe[[current_level]] <- set_where(spe[[current_level]], new_hits, new_level_value)
        
        cat(sprintf("Updated %s to '%s' where %s == '%s'\n", 
                    current_level, new_level_value, 
                    level_to_modify_col, new_value))
      } else {
        cat(sprintf("No new value provided for %s. Skipping update for this level.\n", current_level))
      }
    }
    
    # Display updated annotations corresponding to the new value for each level
    for (i in seq_along(level_names)) {
      current_level <- level_names[i]
      updated_annotations <- unique(spe[[current_level]][new_hits])
      
      cat(sprintf("\nLevel %d updated annotation corresponding to value to change (%s):\n", 
                  i, current_level))
      if (length(updated_annotations) > 0) {
        print(updated_annotations)
      } else {
        cat("None\n")
      }
    }
    
    cat("----------------------------------------\n")
  }
  
  return(spe)
}


#' List the objects of an environment sorted by memory footprint
#'
#' @param env Environment to inspect. Default is the global environment.
#'
#' @return A data frame with one row per object (largest first): `Object`
#'   holds the object name and `Size` its human-readable size. An empty
#'   environment gives a data frame with zero rows.
list_objects_by_size_fx <- function(env = .GlobalEnv) {
  obj_names <- ls(envir = env)
  
  # vapply() keeps the result numeric even when the environment is empty
  # (sapply() returned list() there, which sort() rejects).
  obj_bytes <- vapply(
    obj_names,
    function(nm) as.numeric(object.size(get(nm, envir = env))),
    numeric(1)
  )
  obj_bytes <- obj_bytes[order(obj_bytes, decreasing = TRUE)]
  
  # Collecting the sizes in a plain vector drops the "object_size" class, so
  # `units = "auto"` was silently ignored before. Restore the class for each
  # value so that format() dispatches to the object_size method.
  obj_labels <- vapply(
    obj_bytes,
    function(bytes) format(structure(bytes, class = "object_size"), units = "auto"),
    character(1)
  )
  
  data.frame(
    Object = names(obj_bytes),
    Size = obj_labels
  )
}

#' @description This function counts the number of neighboring cells for each specified target cell type based on the pairs of neighboring cells.
#' @param spe A SpatialExperiment object containing the spatial data.
#' @param target A character vector specifying the target cell types for which neighboring cells are counted.
#' @param colPairs A character string naming the set of neighboring-cell pairs saved in \code{spe} (read with \code{colPair(spe, colPairs)}). Default is 'neighborhood'.
#' @param celltype A character specifying the column in \code{colData(spe)} containing the cell cluster information. Default is 'cell_clusters2'.
#' @param new_col_name A character specifying the name of the new column to be added to the \code{colData(spe)}, which will contain the count of neighboring cells. Default is 'NeighCount'.
#' 
#' @return A SpatialExperiment object with an additional column added to its 'colData', containing the count of neighboring cells for each specified target cell type.
#' 
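#' @details The function computes the count of neighboring cells for each specified target cell type based on the pairs of neighboring cells provided in the input data. It then stores the counts as a table in \code{colData(spe)[[new_col_name]]}, with one column per target (read back as \code{<target>.count}), and adds \code{spe$hasNeigh_df}, which labels every cell as \code{with_<target>_neigh} or \code{no_<target>_neigh} for each target.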

# Counts, for every cell, how many of its neighbours carry each label in
# `target`, stores the counts in colData(spe)[[new_col_name]] and adds
# `spe$hasNeigh_df` with a with/no-neighbour label per target.
CountNeighCells_fx <- function(spe,
                               target,
                               colPairs = 'neighborhood',
                               celltype = 'cell_clusters2',
                               new_col_name = 'NeighCount') {
  # Neighbour pairs as a tibble; `from` / `to` are used below as column
  # indices into `spe`.
  pairs <- colPair(spe, colPairs) %>% as_tibble()
  # Cell name of the source cell and `celltype` label of the neighbour cell
  pairs$from_id <- colnames(spe)[pairs$from]
  pairs$to_label <- colData(spe)[pairs$to, celltype]
  # NOTE(review): these two objects are never read afterwards; the function
  # passed to sapply() below creates its own local `lab` and `res`.
  lab <- matrix()
  res <- data.frame(row.names = colnames(spe))
  # One result per target: number of neighbours labelled `target_i` per cell
  NeighCounts <- as_tibble(sapply(target, function(target_i) {
    # Tabulate the source cells of all pairs whose neighbour is `target_i`
    # (the same table is computed twice: once for values, once for names)
    cur_lab <- as.numeric((table(pairs[pairs$to_label == target_i, ]$from_id)))
    names(cur_lab) <- names(table(pairs[pairs$to_label == target_i, ]$from_id))
    # Start from zero for every cell, then fill in the tabulated cells
    lab <- rep(0, ncol(spe))
    names(lab) <- colnames(spe)
    lab[names(cur_lab)] <- cur_lab
    res <- data.frame(count = lab, row.names = names(lab))
    res
  }))
  # NOTE(review): `NeighCounts_DF` is built here, but the tibble `NeighCounts`
  # is what gets stored on the line after - confirm which one was intended.
  NeighCounts_DF <- DataFrame(NeighCounts, row.names = colnames(spe))
  colnames(NeighCounts_DF) <- colnames(NeighCounts)
  colData(spe)[[new_col_name]] <- NeighCounts
  
  spe_df <- as.data.frame(colData(spe))
  
  # Annotate cells based on neighbor counts
  # The counts are read from a column named "<target>.count" - presumably the
  # name produced when sapply() simplifies the one-column data frames above;
  # verify before changing how `NeighCounts` is built.
  for (target_i in target) {
    annotation_cols <- paste0("has_", target_i, "_neigh")
    spe_df <- spe_df %>%
      mutate(!!sym(annotation_cols) := ifelse(
        spe[[new_col_name]][, paste0(target_i, ".count")] > 0,
        paste0("with_", target_i, "_neigh"),
        paste0("no_", target_i, "_neigh")
      ))
  }
  
  # Keep only the "has_*" annotation columns
  spe_df_hasNeigh <- as.data.frame(spe_df[, which(grepl("^has", colnames(spe_df)))])
  # Overwrite the second "dimnames" entry of each column with the column's own
  # name - presumably each column holds a one-column matrix coming from the
  # comparison above; TODO confirm.
  counter <- 1
  for (i in unique(colnames(spe_df_hasNeigh))) {
    attr(spe_df_hasNeigh[, i], "dimnames")[[2]] <- colnames(spe_df_hasNeigh)[counter]
    counter <- counter + 1
  }
  # Store the annotation table on the object and drop the temporaries
  spe$hasNeigh_df <- spe_df_hasNeigh
  rm(spe_df_hasNeigh, spe_df)
  return(spe)
}


#' @description This function calculates, for each sample, the percentage of cells of the specified cell types of interest that have at least one neighboring cell of each counted type, based on the neighboring cell counts generated by the \code{CountNeighCells_fx} function.
#' @param spe A SpatialExperiment/SingleCellExperiment object containing the spatial data.
#' @param cell_of_interest A character vector specifying the cell types of interest for which interaction frequency is calculated.
#' @param celltype A character specifying the column containing the cell cluster information. Default is 'cell_clusters2'.
#' @param sample_id A character string specifying the column name in the colData of 'spe' that contains sample IDs or images.
#' @param grouping.var A character specifying the column containing the grouping variable for samples, e.g., clinical indications or experimental conditions.
#' @param neighbor_count A character specifying the name of the column containing the neighboring cell counts generated by the \code{CountNeighCells_fx} function. Default is 'NeighCount'.
#' @param add_zeroes Logical. If TRUE, combinations of sample, cell type and neighbor that are absent from the summary are added with a percentage of 0. Default is FALSE.
#' 
#' @return A data frame with the columns \code{sample_id}, \code{celltype}, \code{Neighbor}, \code{group} and \code{PercentHasNeigh}, summarizing the frequency of cells with at least one neighboring cell of each type across samples.
#' 
#' @details The function calculates the frequency of cells with at least one neighboring cell of each type, based on the neighboring cell counts provided by the \code{CountNeighCells_fx} function. It filters cells based on the provided cell types of interest and calculates the percentage of cells in each sample/image that have at least one neighboring cell of the specified types.

# Percentage of cells of interest, per sample and cell type, that have at
# least one neighbour of each counted type.
#
# Args:
#   spe: SpatialExperiment/SingleCellExperiment holding the annotations and
#     the neighbour counts in its colData.
#   cell_of_interest: Cell types (values of `celltype`) to keep.
#   celltype, sample_id, grouping.var: Names of the colData columns with the
#     cell type, the sample/image ID and the sample group.
#   neighbor_count: Name of the colData column holding the neighbour-count
#     table (one column per neighbour type, named "<type>.count").
#   add_zeroes: If TRUE, missing sample/celltype/Neighbor combinations are
#     added with PercentHasNeigh = 0.
#
# Returns:
#   A data frame with columns sample_id, celltype, Neighbor, group and
#   PercentHasNeigh. Stops if a required column is missing or if a sample is
#   associated with more than one group.
InteractFreq_fx <- function(
    spe, 
    cell_of_interest, 
    celltype = "cell_clusters2", 
    sample_id = "patient_id", 
    grouping.var = "Group", 
    neighbor_count = 'NeighCount',
    add_zeroes = FALSE  # New argument to control adding zeroes
) {
  # --- 1. Validate Inputs ---
  
  # Checked in this order; the first missing column raises the error.
  required_columns <- c(
    "grouping variable" = grouping.var,
    "sample_id variable" = sample_id,
    "celltype variable" = celltype,
    "neighbor_count variable" = neighbor_count
  )
  available_columns <- colnames(colData(spe))
  for (role in names(required_columns)) {
    column_name <- required_columns[[role]]
    if (!column_name %in% available_columns) {
      stop(paste("The", role, column_name, "does not exist in 'spe'. Please check the column name."))
    }
  }
  
  # --- 2. Extract and Prepare Data ---
  
  # Extract the neighbor count table and convert to a data frame
  freq <- spe[[neighbor_count]] %>% as.data.frame()
  
  # Remember the count columns before metadata is added, so that exactly
  # these columns are reshaped below.
  count_cols <- colnames(freq)
  
  # Add metadata columns
  freq$cell_id <- colnames(spe)
  freq[['sample_id']] <- spe[[sample_id]]
  freq[['celltype']] <- spe[[celltype]]
  freq[['group']] <- spe[[grouping.var]]
  
  # Reshape the data to long format. all_of() marks `count_cols` as an
  # external character vector, as tidyselect requires.
  freq_long <- pivot_longer(
    freq, 
    names_to = 'Neighbor', 
    values_to = 'NeighCount', 
    cols = dplyr::all_of(count_cols)
  )
  
  # Remove only a literal, trailing ".count". The previous pattern '.count'
  # was an unanchored regex whose dot matched any character, so a name such
  # as "low_count_cells" lost its "_count" part.
  freq_long$Neighbor <- sub("\\.count$", "", freq_long$Neighbor)
  
  # --- 3. Create Unique Mapping of sample_id to group ---
  
  # Quoted names select the columns unambiguously (an argument of this
  # function is also called `sample_id`).
  sample_group <- freq_long %>%
    select("sample_id", "group") %>%
    distinct()
  
  # Check for duplicate sample_id with different groups
  duplicate_mappings <- sample_group %>%
    group_by(sample_id) %>%
    filter(n() > 1) %>%
    ungroup()
  
  if (nrow(duplicate_mappings) > 0) {
    stop("Some sample_ids are associated with multiple groups. Please ensure each sample_id maps to only one group.")
  }
  
  # --- 4. Summarize Data Without Including `group` in group_by ---
  
  freq_summary <- freq_long %>%
    mutate(HasNeighbor = NeighCount > 0) %>%
    filter(celltype %in% cell_of_interest) %>%
    group_by(sample_id, celltype, Neighbor) %>%  # Excluded 'group' from group_by
    summarise(
      PercentHasNeigh = mean(HasNeighbor) * 100,
      .groups = 'drop'
    )
  
  # --- 5. Handle Missing Combinations ---
  
  if (add_zeroes) {
    freq_summary <- freq_summary %>%
      complete(
        sample_id, 
        celltype, 
        Neighbor,
        fill = list(PercentHasNeigh = 0)
      )
  }
  
  # --- 6. Rejoin `group` Information Based on `sample_id` ---
  
  freq_summary <- freq_summary %>%
    left_join(sample_group, by = "sample_id")
  
  # --- 7. Reorder and Return Columns ---
  
  freq_summary <- freq_summary %>%
    select("sample_id", "celltype", "Neighbor", "group", "PercentHasNeigh")
  
  return(freq_summary)
}

# Run testInteractions() on the whole object or on a subset of its cells.
#
# Args:
#   spe: Object passed to testInteractions(); its colData is used for
#     subsetting.
#   subset: If TRUE, keep only the cells where `subset_variable` equals
#     `subset_value`. Default is TRUE.
#   subset_variable: Name of the colData column to subset on. Required when
#     `subset` is TRUE.
#   subset_value: Single value of `subset_variable` to keep. Required when
#     `subset` is TRUE.
#   group_by, label, colPairName: Passed on to testInteractions().
#
# Returns:
#   The testInteractions() result. When `subset_variable` is given, a column
#   of that name is added holding `subset_value` (subset = TRUE) or "all"
#   (subset = FALSE).
process_interaction_fx <- function(spe,
                                   subset = TRUE,
                                   subset_variable = NULL,
                                   subset_value = NULL,
                                   group_by,
                                   label,
                                   colPairName) {
  if (subset) {
    # Fail with a clear message instead of "argument is of length zero" when
    # the subsetting arguments are left at their NULL defaults.
    if (is.null(subset_variable) || is.null(subset_value)) {
      stop("When 'subset' is TRUE, both 'subset_variable' and 'subset_value' must be provided.")
    }
    if (length(subset_variable) != 1 || length(subset_value) != 1) {
      stop("'subset_variable' and 'subset_value' must each be a single value.")
    }
    if (!subset_variable %in% colnames(colData(spe))) {
      stop(subset_variable, " is not a valid column in the spe object")
    }
    if (!subset_value %in% unique(spe[[subset_variable]])) {
      stop(subset_value, " is not a valid value for ", subset_variable)
    }
    message("Subseting the data based on ",
            subset_variable,
            " == ",
            subset_value)
    
    # which() drops NA comparisons, so cells with a missing value in the
    # subset column are excluded instead of producing an NA subscript.
    keep <- which(spe[[subset_variable]] == subset_value)
    spe2 <- spe[, keep]
  } else {
    spe2 <- spe
  }
  
  # Perform the testInteractions function (fixed seed for reproducibility)
  interaction <- testInteractions(
    spe2,
    group_by = group_by,
    label = label,
    colPairName = colPairName,
    BPPARAM = SerialParam(RNGseed = 221029)
  )
  
  # Record which subset the result belongs to. Without a variable name there
  # is no column to write to.
  if (!is.null(subset_variable)) {
    interaction[subset_variable] <- if (subset) subset_value else "all"
  }
  
  return(interaction)
}


lm_model_clinical_data_fx <- function(spe = spe,
                                      var_analysis,
                                      disease_var = NULL,
                                      disease_var_value = NULL,
                                      donor_var = NULL,
                                      clinical_data,
                                      clinical_vars,
                                      subset_spe = FALSE,
                                      var_subset = NULL,
                                      value_var_subset = NULL) {
  # Function to calculate frequencies and return linear model results.
  #
  # Args:
  #   spe: The SingleCellExperiment object or data frame containing the data. Default is 'spe'.
  #   var_analysis: The variable in 'spe' to use for frequency calculations. This argument has no default and must be provided.
  #   disease_var: The variable in 'spe' to subset on. Default is NULL.
  #   disease_var_value: The value of 'disease_var' to subset on. Default is NULL.
  #   clinical_data: Data frame containing clinical data to be merged. Should include both binary and non-binary variables.
  #   clinical_vars: Vector of clinical variables to analyze.
  #   subset_spe: Logical. Indicates whether to subset 'spe' on specific variable values. Default is FALSE.
  #     - When TRUE, 'var_subset' and 'value_var_subset' must be provided.
  #     - When FALSE, 'var_subset' and 'value_var_subset' must be NULL.
  #   var_subset: (Optional) The variable in 'spe' to subset on.
  #     - Required if 'subset_spe' is TRUE.
  #   value_var_subset: (Optional) The value(s) of 'var_subset' to filter on.
  #     - Required if 'subset_spe' is TRUE.
  #
  # Returns:
  #   A matrix 'mt' or a list of matrices if multiple values are provided and 'subset_spe' is TRUE.
  
  # Ensure that disease_var and 'Donor' exist in colData(spe)
  required_spe_vars <- c(disease_var, donor_var)
  missing_spe_vars <- setdiff(required_spe_vars, colnames(colData(spe)))
  if (length(missing_spe_vars) > 0) {
    stop(
      "The following variable(s) are not found in 'spe': ",
      paste(missing_spe_vars, collapse = ", "),
      ". Please ensure they are present in colnames(colData(spe))."
    )
  }
  
  # Ensure that 'Donor' exists in clinical_data
  if (!donor_var %in% colnames(clinical_data)) {
    stop(
      "The variable 'Donor' is not found in 'clinical_data'. Please ensure it is present in colnames(clinical_data)."
    )
  }
  
  # Check for dependencies between arguments
  if (subset_spe) {
    if (is.null(var_subset) || is.null(value_var_subset)) {
      stop(
        "When 'subset_spe' is TRUE, both 'var_subset' and 'value_var_subset' must be provided."
      )
    }
    # Ensure that 'var_subset' exists in colData(spe)
    if (!var_subset %in% colnames(colData(spe))) {
      stop(
        "The variable '",
        var_subset,
        "' is not found in your 'spe'. Please provide a valid variable from colnames(colData(spe))."
      )
    }
    # Ensure that all 'value_var_subset' values exist in unique(spe[[var_subset]])
    missing_values <- setdiff(value_var_subset, unique(spe[[var_subset]]))
    if (length(missing_values) > 0) {
      stop(
        "The following value(s) in 'value_var_subset' are not found in the unique values of the '",
        var_subset,
        "' variable: ",
        paste(missing_values, collapse = ", "),
        ". Ensure that these values are present in unique(spe[['",
        var_subset,
        "']])."
      )
    }
    # Ensure that 'var_analysis' exists in colData(spe)
    if (!var_analysis %in% colnames(colData(spe))) {
      stop(
        "The variable '",
        var_analysis,
        "' is not found in your 'spe'. Please provide a valid 'var_analysis' from colnames(colData(spe))."
      )
    }
    # Return a message indicating the subsetting
    message(
      "spe subsetted by '",
      var_subset,
      "' to the value(s) '",
      paste(value_var_subset, collapse = ", "),
      "'."
    )
    
    # Initialize a list to store results for each value in value_var_subset
    result_list <- list()
    # Loop over each value in value_var_subset
    for (value in value_var_subset) {
      # Subset 'spe' to include only cells with the specified value in 'var_subset' and 'Disease' == "SSc"
      spe_sub <- spe[, spe[[var_subset]] == value &
                       spe[[disease_var]] == disease_var_value]
      
      # Ensure the 'var_analysis' variable exists in 'spe_sub'
      if (!var_analysis %in% colnames(colData(spe_sub))) {
        stop(
          "The variable '",
          var_analysis,
          "' is not found in your subsetted 'spe'. Please provide a valid 'var_analysis' from colnames(colData(spe_sub))."
        )
      }
      
      # Convert the 'var_analysis' variable to character type
      spe_sub[[var_analysis]] <- as.character(spe_sub[[var_analysis]])
      
      # Generate frequency data using dittoFreqPlot
      freq_data <- dittoSeq::dittoFreqPlot(
        spe_sub,
        var = var_analysis,
        group.by = disease_var,
        sample.by = donor_var,
        data.out = TRUE
      )$data
      
      # Check for unmatched donors
      unmatched_donors <- setdiff(unique(freq_data$Donor), unique(clinical_data$Donor))
      if(length(unmatched_donors) > 0){
        warning("There are donors in 'freq_data' not present in 'clinical_data': ", paste(unmatched_donors, collapse = ", "))
      }
      
      # Join frequency data with clinical data based on 'Donor' column
      freq_data <- left_join(freq_data, clinical_data, by = setNames(donor_var, donor_var))
      
      # Calculate percentage multiplied by 100
      freq_data$percent_100 <- freq_data$percent * 100
      
      # Ensure that 'clinical_vars' are present in 'freq_data'
      missing_clinical_vars <- setdiff(clinical_vars, colnames(freq_data))
      if (length(missing_clinical_vars) > 0) {
        stop(
          "The following clinical variables are not found in 'clinical_data': ",
          paste(missing_clinical_vars, collapse = ", "),
          "."
        )
      }
      
      # Initialize an empty data frame to store linear model results
      lm_res <- data.frame()
      
      # For each clinical variable
      for (clinical_var in clinical_vars) {
        # For each unique cluster label
        for (cluster_label in unique(spe_sub[[var_analysis]])) {
          # Subset data for the current cluster label
          temp_data <- freq_data[freq_data$label == cluster_label, c("label", "percent_100", clinical_var)]
          
          # Remove rows with NA in clinical_var or percent_100
          temp_data <- temp_data %>% drop_na(percent_100, !!sym(clinical_var))
          
          # Check if there are enough unique values in clinical_var
          unique_vals <- unique(temp_data[[clinical_var]])
          if (length(unique_vals) < 2) {
            message(
              "Skipping lm() for combination: ",
              clinical_var,
              " and cluster: ",
              cluster_label,
              " due to insufficient variation in '",
              clinical_var,
              "'."
            )
            next
          }
          
          # Check if there are enough observations for the test
          if (nrow(temp_data) < 3) { # Adjust threshold as needed
            message(
              "Skipping lm() for combination: ",
              clinical_var,
              " and cluster: ",
              cluster_label,
              " due to insufficient observations (",
              nrow(temp_data),
              ")."
            )
            next
          }
          
          # Check if there are enough observations for the test
          if (nrow(temp_data) > 1) {
            # Construct the formula for linear regression
            lm_formula <- as.formula(paste0("percent_100 ~ `", clinical_var, "`"))
            
            # Fit the linear model using tryCatch to handle errors
            lm_summary <- tryCatch({
              summary(lm(formula = lm_formula, data = temp_data))
            }, error = function(e) {
              message(
                "Error in lm() for combination: ",
                clinical_var,
                " and cluster: ",
                cluster_label
              )
              return(NULL)
            })
            
            # If the linear model was successfully fitted
            if (!is.null(lm_summary)) {
              # Extract p-value and estimate from the linear model summary
              res_temp <- data.frame(
                cluster = cluster_label,
                clinical_par = clinical_var,
                p_value = lm_summary$coefficients[2, 'Pr(>|t|)'],
                lm_estimate = lm_summary$coefficients[2, 'Estimate']
              )
              # Append the results to 'lm_res'
              lm_res <- rbind(lm_res, res_temp)
            } else {
              message(
                "No successful results for lm() for combination: ",
                clinical_var,
                " and cluster: ",
                cluster_label
              )
            }
          }
        }
      }
      
      # Filter results for p-values less than 0.2
      lm_res <- lm_res[lm_res$p_value < 0.2, ]
      
      if (nrow(lm_res) == 0) {
        warning(
          "No valid linear models were fitted. Returning an empty list or appropriate default values."
        )
        # Depending on desired behavior, you can return an empty list or a list with default matrices
        return(list(matrix = NULL, heatmap = NULL))
      }
      
      # Create a complete grid of clusters and clinical parameters
      all_clusters <- unique(spe_sub[[var_analysis]])
      all_combinations <- expand.grid(
        cluster = all_clusters,
        clinical_par = clinical_vars,
        stringsAsFactors = FALSE
      )
      
      # Merge with lm_res to ensure all combinations are present
      lm_res <- merge(all_combinations, lm_res, by = c("cluster", "clinical_par"), all.x = TRUE)
      
      # Assign non-significant p-values to missing combinations
      lm_res$p_value[is.na(lm_res$p_value)] <- 1  # or another appropriate default
      
      # Compute negative log10 of p-values
      lm_res$log_p_value <- -log10(lm_res$p_value)
      
      # Convert 'clinical_par' to a factor with levels ordered as in 'clinical_vars'
      lm_res$clinical_par <- factor(lm_res$clinical_par, levels = clinical_vars)
      
      # Reshape the data to create a matrix for plotting or further analysis
      mt <- lm_res %>%
        tidyr::pivot_wider(
          id_cols = c('cluster'),
          names_from = 'clinical_par',
          values_from = 'log_p_value'
        ) %>%
        column_to_rownames('cluster') %>%
        dplyr::select(-donor_var)%>%
        as.matrix()
      
      # Store the result in the list
      result_list[[value]][["matrix"]] <- mt
      
      heatmap <- Heatmap(
        mt,
        name = '-log(p-value)',
        cluster_columns = FALSE,
        cluster_rows = FALSE,
        rect_gp = gpar(col = "white", lwd = 2),
       col = colorRamp2(c(0, seq(0.7, 3, length = 15)), viridis::inferno(16))
      )
      
      # Draw the heatmap with title
      #draw(heatmap, column_title = paste0(var_analysis, " ", value))
      
      result_list[[value]][["heatmap"]] <- heatmap
    }
    
    return(result_list)
    
  } else {
    if (!is.null(var_subset) || !is.null(value_var_subset)) {
      stop("When 'subset_spe' is FALSE, 'var_subset' and 'value_var_subset' must be NULL.")
    }
    # Ensure that 'var_analysis' exists in colData(spe)
    if (!var_analysis %in% colnames(colData(spe))) {
      stop(
        "The variable '",
        var_analysis,
        "' is not found in your 'spe'. Please provide a valid 'var_analysis' from colnames(colData(spe))."
      )
    }
    # Proceed with the processing without subsetting spe
    # Subset 'spe' to include all cells with 'Disease' == "SSc"
    spe_sub <- spe[, spe[[disease_var]] == disease_var_value]
    
    # Convert the 'var_analysis' variable to character type
    spe_sub[[var_analysis]] <- as.character(spe_sub[[var_analysis]])
    
    # Generate frequency data using dittoFreqPlot
    freq_data <- dittoSeq::dittoFreqPlot(
      spe_sub,
      var = var_analysis,
      group.by = disease_var,
      sample.by = donor_var,
      data.out = TRUE
    )$data
    
    # Check for unmatched donors
    unmatched_donors <- setdiff(unique(freq_data$Donor), unique(clinical_data$Donor))
    if(length(unmatched_donors) > 0){
      warning("There are donors in 'freq_data' not present in 'clinical_data': ", paste(unmatched_donors, collapse = ", "))
    }
    
    # Join frequency data with clinical data based on 'Donor' column
    freq_data <- left_join(freq_data, clinical_data, by = setNames(donor_var, donor_var))
    
    # Calculate percentage multiplied by 100
    freq_data$percent_100 <- freq_data$percent * 100
    
    # Ensure that 'clinical_vars' are present in 'freq_data'
    missing_clinical_vars <- setdiff(clinical_vars, colnames(freq_data))
    if (length(missing_clinical_vars) > 0) {
      stop(
        "The following clinical variables are not found in 'clinical_data': ",
        paste(missing_clinical_vars, collapse = ", "),
        "."
      )
    }
    
    # Initialize an empty data frame to store linear model results
    lm_res <- data.frame()
    
    result_list <- list()
    
    # For each clinical variable
    for (clinical_var in clinical_vars) {
      # For each unique cluster label
      for (cluster_label in unique(spe_sub[[var_analysis]])) {
        # Subset data for the current cluster label
        temp_data <- freq_data[freq_data$label == cluster_label, c("label", "percent_100", clinical_var)]
        
        # Remove rows with NA in clinical_var or percent_100
        temp_data <- temp_data %>% drop_na(percent_100, !!sym(clinical_var))
        
        # Check if there are enough unique values in clinical_var
        unique_vals <- unique(temp_data[[clinical_var]])
        if (length(unique_vals) < 2) {
          message(
            "Skipping lm() for combination: ",
            clinical_var,
            " and cluster: ",
            cluster_label,
            " due to insufficient variation in '",
            clinical_var,
            "'."
          )
          next
        }
        
        # Check if there are enough observations for the test
        if (nrow(temp_data) < 3) { # Adjust threshold as needed
          message(
            "Skipping lm() for combination: ",
            clinical_var,
            " and cluster: ",
            cluster_label,
            " due to insufficient observations (",
            nrow(temp_data),
            ")."
          )
          next
        }
        
        # Check if there are enough observations for the test
        if (nrow(temp_data) > 1) {
          # Construct the formula for linear regression
          lm_formula <- as.formula(paste0("percent_100 ~ `", clinical_var, "`"))
          
          # Fit the linear model using tryCatch to handle errors
          lm_summary <- tryCatch({
            summary(lm(formula = lm_formula, data = temp_data))
          }, error = function(e) {
            message(
              "Error in lm() for combination: ",
              clinical_var,
              " and cluster: ",
              cluster_label
            )
            return(NULL)
          })
          
          # If the linear model was successfully fitted
          if (!is.null(lm_summary)) {
            # Extract p-value and estimate from the linear model summary
            res_temp <- data.frame(
              cluster = cluster_label,
              clinical_par = clinical_var,
              p_value = lm_summary$coefficients[2, 'Pr(>|t|)'],
              lm_estimate = lm_summary$coefficients[2, 'Estimate']
            )
            # Append the results to 'lm_res'
            lm_res <- rbind(lm_res, res_temp)
          } else {
            message(
              "No successful results for lm() for combination: ",
              clinical_var,
              " and cluster: ",
              cluster_label
            )
          }
        }
      }
    }
    
    # Filter results for p-values less than 0.2
    lm_res <- lm_res[lm_res$p_value < 0.2, ]
    
    if (nrow(lm_res) == 0) {
      warning(
        "No valid linear models were fitted. Returning an empty list or appropriate default values."
      )
      # Depending on desired behavior, you can return an empty list or a list with default matrices
      return(list(matrix = NULL, heatmap = NULL))
    }
    
    # Create a complete grid of clusters and clinical parameters
    all_clusters <- unique(spe_sub[[var_analysis]])
    all_combinations <- expand.grid(
      cluster = all_clusters,
      clinical_par = clinical_vars,
      stringsAsFactors = FALSE
    )
    
    # Merge with lm_res to ensure all combinations are present
    lm_res <- merge(all_combinations, lm_res, by = c("cluster", "clinical_par"), all.x = TRUE)
    
    # Assign non-significant p-values to missing combinations
    lm_res$p_value[is.na(lm_res$p_value)] <- 1  # or another appropriate default
    
    # Compute negative log10 of p-values
    lm_res$log_p_value <- -log10(lm_res$p_value)
    
    # Convert 'clinical_par' to a factor with levels ordered as in 'clinical_vars'
    lm_res$clinical_par <- factor(lm_res$clinical_par, levels = clinical_vars)
    
    # Reshape the data to create a matrix for plotting or further analysis
    mt <- lm_res %>%
      tidyr::pivot_wider(
        id_cols = c('cluster'),
        names_from = 'clinical_par',
        values_from = 'log_p_value'
      ) %>%
      column_to_rownames('cluster') %>%
      dplyr::select(-donor_var)%>%
      as.matrix()
    
    result_list[["matrix"]] <- mt
    
    # Iterate over the list and generate heatmaps
    heatmap <- Heatmap(
      mt,
      name = '-log(p-value)',
      cluster_columns = FALSE,
      cluster_rows = FALSE,
      rect_gp = gpar(col = "white", lwd = 2),
      col = colorRamp2(c(0, seq(0.7, 3, length = 15)), viridis::inferno(16))
    )
    
    # Draw the heatmap with title
    #draw(heatmap, column_title = var_analysis)
    
    result_list[["heatmap"]] <- heatmap
    
    return(result_list)
  }
}


plot_spearman_clinical_data_fx <- function(spe = spe,
                                           subset_vars = NULL,
                                           var_analysis = NULL,
                                           value_analysis = NULL,
                                           disease_var = "Disease",
                                           donor_var = "Donor",
                                           clinical_data = NULL,
                                           clinical_var = NULL) {
  # Function to plot the Spearman correlation between a clinical parameter and
  # the percentage of a specific cell type label in your data.
  #
  # Args:
  #   spe: Object whose colData() holds the per-cell annotations. The default
  #     `spe = spe` refers to itself, so the argument has to be supplied by
  #     the caller.
  #   subset_vars: Named list of variables and their values used to subset the
  #     spe object. Each entry may hold one or several accepted values; a cell
  #     is kept when it matches every entry. NULL keeps all cells.
  #   var_analysis: Variable to use in dittoFreqPlot (required).
  #   value_analysis: The value of 'label' to filter the data by (required).
  #   disease_var: Disease variable in spe (passed as `group.by`).
  #   donor_var: Donor variable in spe (passed as `sample.by`); also the join
  #     key with clinical_data.
  #   clinical_data: Data frame containing clinical data.
  #   clinical_var: Variable from clinical_data to plot on the x-axis
  #     (required).
  #
  # Returns:
  #   A ggplot object showing the scatter plot with Spearman correlation.
  #
  # Raises:
  #   An error when a required argument is NULL, when a required column is
  #   missing from colData(spe) or clinical_data, or when a requested
  #   subset/analysis value does not occur in the data.

  # Load required libraries
  library(dittoSeq)
  library(ggpubr)
  library(dplyr)
  library(tidyr)

  # Fail early with an explicit message instead of the cryptic
  # "argument is of length zero" raised further down when these are NULL.
  required_args <- list(
    var_analysis = var_analysis,
    value_analysis = value_analysis,
    clinical_var = clinical_var
  )
  null_args <- names(required_args)[vapply(required_args, is.null, logical(1))]
  if (length(null_args) > 0) {
    stop(
      "The following arguments must be provided: ",
      paste(null_args, collapse = ", ")
    )
  }

  # Check that disease_var and donor_var are in colData(spe)
  required_spe_columns <- c(disease_var, donor_var)
  missing_spe_columns <- setdiff(required_spe_columns, colnames(colData(spe)))
  if (length(missing_spe_columns) > 0) {
    stop(
      "The following variables are missing in colData(spe): ",
      paste(missing_spe_columns, collapse = ", ")
    )
  }

  # Check that donor_var and clinical_var are in clinical_data
  required_clinical_columns <- c(donor_var, clinical_var)
  missing_clinical_columns <- setdiff(required_clinical_columns, colnames(clinical_data))
  if (length(missing_clinical_columns) > 0) {
    stop(
      "The following variables are missing in clinical_data: ",
      paste(missing_clinical_columns, collapse = ", ")
    )
  }

  # Check that variables and values in subset_vars are in colData(spe).
  # setdiff() keeps the condition scalar even when an entry lists several
  # values, which a bare `if (!(values %in% ...))` would not.
  for (var_name in names(subset_vars)) {
    if (!(var_name %in% colnames(colData(spe)))) {
      stop(paste0(
        "Variable '",
        var_name,
        "' is not present in colData(spe)."
      ))
    }
    missing_values <- setdiff(subset_vars[[var_name]], unique(spe[[var_name]]))
    if (length(missing_values) > 0) {
      stop(paste0(
        "Value '",
        paste(missing_values, collapse = ", "),
        "' not found in spe[['",
        var_name,
        "']]."
      ))
    }
  }

  # Subset the spe object based on the provided variables and values.
  # %in% never yields NA, so cells with a missing annotation are dropped
  # instead of injecting NA into the column index.
  subset_indices <- rep(TRUE, ncol(spe))
  for (var_name in names(subset_vars)) {
    subset_indices <- subset_indices &
      (spe[[var_name]] %in% subset_vars[[var_name]])
  }
  spe_subset <- spe[, subset_indices]

  # Check that var_analysis is in colData(spe_subset)
  if (!(var_analysis %in% colnames(colData(spe_subset)))) {
    stop(
      paste0(
        "The var_analysis '",
        var_analysis,
        "' is not present in colData(spe_subset)."
      )
    )
  }

  # Check that value_analysis is in unique(spe_subset[[var_analysis]])
  if (!(value_analysis %in% unique(spe_subset[[var_analysis]]))) {
    stop(
      paste0(
        "The value_analysis '",
        value_analysis,
        "' is not present in spe_subset[[var_analysis]]."
      )
    )
  }

  # Generate the data using dittoFreqPlot
  data <- dittoSeq::dittoFreqPlot(
    spe_subset,
    var = var_analysis,
    group.by = disease_var,
    sample.by = donor_var,
    data.out = TRUE
  )$data

  # Add percent_100 column
  data$percent_100 <- data$percent * 100

  # Filter the data based on data$label == value_analysis
  data <- data[data$label == value_analysis, ]

  # Merge with clinical_data to ensure all donors are included
  data <- full_join(
    clinical_data,
    data[, c(donor_var, "percent_100")],
    by = donor_var
  )

  # Replace NAs in percent_100 with zeros: donors present in clinical_data
  # but without a frequency row are plotted at 0 %.
  data$percent_100[is.na(data$percent_100)] <- 0

  # Create the scatter plot
  p <- ggscatter(
    data,
    x = clinical_var,
    y = "percent_100",
    color = "black",
    shape = 21,
    size = 3,
    add = "reg.line",
    add.params = list(color = "blue", fill = "lightgray"),
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.coeff.args = list(method = "spearman", size = 4),
    xlab = clinical_var,
    ylab = paste0("% (", value_analysis, ")"),
    ylim = c(0, max(data[["percent_100"]], na.rm = TRUE)),
    title = paste0(
      "Spearman correlation \n",
      paste(unlist(subset_vars, use.names = FALSE), collapse = ",")
    )
  ) +
    scale_x_continuous(expand = c(0, 0.25))

  return(p)
}


plot_categorical_clinical_data_fx <- function(spe = spe,
                                              subset_vars = NULL,
                                              var_analysis = NULL,
                                              value_analysis = NULL,
                                              disease_var = "Disease",
                                              donor_var = "Donor",
                                              clinical_data = NULL,
                                              clinical_var = NULL,
                                              color = RColorBrewer::brewer.pal(8, "Set2")) {
  # Function to plot, as boxplots, the per-donor percentage of one cell type
  # label across the groups of a categorical clinical variable, annotated
  # with a test chosen by the number of groups.
  #
  # Args:
  #   spe: Object whose colData() holds the per-cell annotations. The default
  #     `spe = spe` refers to itself, so the argument has to be supplied by
  #     the caller.
  #   subset_vars: Named list of variables and accepted values used to subset
  #     spe; a cell is kept when it matches every entry. NULL keeps all cells.
  #   var_analysis: Variable to use in dittoFreqPlot (required).
  #   value_analysis: The value of 'label' to filter the data by (required).
  #   disease_var: Disease variable in spe (passed as `group.by`).
  #   donor_var: Donor variable in spe (passed as `sample.by`); also the join
  #     key with clinical_data.
  #   clinical_data: Data frame containing clinical data.
  #   clinical_var: Grouping variable from clinical_data shown on the x-axis
  #     (required).
  #   color: Fill colours handed to scale_fill_manual().
  #
  # Returns:
  #   A ggplot object:
  #     - 0 groups: an empty plot titled "No data for these conditions."
  #     - 1 group: the boxplot only.
  #     - 2 groups: boxplot + Wilcoxon comparison.
  #     - >2 groups: boxplot + Kruskal-Wallis label, plus Dunn pairwise
  #       brackets when every group has at least two distinct values.
  #   NULL (with a warning) when joining clinical_data with the frequency
  #   table fails.
  #
  # Raises:
  #   An error when a required argument is NULL, when a required column is
  #   missing from colData(spe) or clinical_data, or when a requested
  #   subset/analysis value does not occur in the data.

  # Load required libraries
  # Ensure these are loaded in your environment
  library(dittoSeq)
  library(ggpubr)
  library(dplyr)
  library(rstatix)
  library(RColorBrewer)

  # --- Input Checks ---
  # Fail early with an explicit message instead of a cryptic downstream error
  # when one of these is left at its NULL default.
  required_args <- list(
    var_analysis = var_analysis,
    value_analysis = value_analysis,
    clinical_var = clinical_var
  )
  null_args <- names(required_args)[vapply(required_args, is.null, logical(1))]
  if (length(null_args) > 0) {
    stop(
      "The following arguments must be provided: ",
      paste(null_args, collapse = ", ")
    )
  }

  required_spe_columns <- c(disease_var, donor_var)
  missing_spe_columns <- setdiff(required_spe_columns, colnames(colData(spe)))
  if (length(missing_spe_columns) > 0) {
    stop(
      "The following variables are missing in colData(spe): ",
      paste(missing_spe_columns, collapse = ", ")
    )
  }

  required_clinical_columns <- c(donor_var, clinical_var)
  missing_clinical_columns <- setdiff(required_clinical_columns, colnames(clinical_data))
  if (length(missing_clinical_columns) > 0) {
    stop(
      "The following variables are missing in clinical_data: ",
      paste(missing_clinical_columns, collapse = ", ")
    )
  }

  # names(NULL) is NULL, so both loops below are no-ops when subset_vars is
  # NULL and no separate is.null() guard is needed.
  for (var_name in names(subset_vars)) {
    if (!(var_name %in% colnames(colData(spe)))) {
      stop(paste0("Variable '", var_name, "' is not present in colData(spe)."))
    }
    missing_values <- setdiff(subset_vars[[var_name]], unique(spe[[var_name]]))
    if (length(missing_values) > 0) {
      stop(paste0(
        "Value(s) '", paste(missing_values, collapse = ", "),
        "' not found in spe[['", var_name, "']]."
      ))
    }
  }

  # Subset the spe object
  subset_indices <- rep(TRUE, ncol(spe))
  for (var_name in names(subset_vars)) {
    subset_indices <- subset_indices & (spe[[var_name]] %in% subset_vars[[var_name]])
  }
  spe_subset <- spe[, subset_indices]

  # Check var_analysis and value_analysis
  if (!(var_analysis %in% colnames(colData(spe_subset)))) {
    stop("The var_analysis '", var_analysis, "' is not present in colData(spe_subset).")
  }

  if (!(value_analysis %in% unique(spe_subset[[var_analysis]]))) {
    stop("The value_analysis '", value_analysis, "' is not present in spe_subset[[var_analysis]].")
  }

  # Generate frequency data
  freq_plot_result <- dittoSeq::dittoFreqPlot(
    spe_subset,
    var = var_analysis,
    group.by = disease_var,
    sample.by = donor_var,
    data.out = TRUE
  )

  if (is.null(freq_plot_result$data)) {
    stop("dittoFreqPlot did not return data.")
  }

  freq_data <- freq_plot_result$data
  freq_data$percent_100 <- freq_data$percent * 100
  freq_data <- freq_data[freq_data$label == value_analysis, ]

  # Merge with clinical_data. A failure still yields NULL (callers may rely
  # on that), but the reason is surfaced as a warning instead of being lost.
  data <- tryCatch(
    dplyr::full_join(
      clinical_data,
      freq_data[, c(donor_var, "percent_100")],
      by = donor_var
    ),
    error = function(e) {
      warning(
        "Joining clinical_data with the frequency data failed: ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  if (is.null(data)) {
    return(NULL)
  }

  # Replace NA percent_100 with 0: donors present in clinical_data but
  # without a frequency row are plotted at 0 %.
  data$percent_100[is.na(data$percent_100)] <- 0

  # Filter out NAs in clinical_var to consider only relevant groups
  data <- dplyr::filter(data, !is.na(.data[[clinical_var]]))

  unique_levels <- unique(data[[clinical_var]])
  n_groups <- length(unique_levels)

  if (n_groups == 0) {
    # No groups - no data
    return(ggplot() + labs(title = "No data for these conditions."))
  }

  # Dunn's post-hoc test is only attempted with more than two groups and at
  # least two distinct values in every group.
  run_dunn <- FALSE
  if (n_groups > 2) {
    variability <- data %>%
      group_by(.data[[clinical_var]]) %>%
      summarise(unique_values = n_distinct(percent_100), .groups = "drop")
    run_dunn <- all(variability$unique_values >= 2)
  }

  # Calculate base y-axis metrics. The baseline is a 30% buffer above the
  # tallest value. Each pairwise bracket needs 5% of max_y, so with more than
  # three brackets the buffer grows; otherwise the brackets would be clamped
  # onto the same height and overlap each other and the Kruskal-Wallis label.
  max_y <- max(data$percent_100, na.rm = TRUE)
  increment <- 0.05 * max_y
  n_brackets <- if (run_dunn) choose(n_groups, 2) else 0
  y_limit <- max_y * max(1.3, 1.15 + 0.05 * n_brackets)
  top_label_y <- y_limit - increment  # topmost annotation slot

  # Initialize the plot
  boxplot_plot <- ggplot(data, aes(x = .data[[clinical_var]], y = percent_100, fill = .data[[clinical_var]])) +
    geom_boxplot(outlier.shape = NA) +
    scale_fill_manual(values = color) +
    scale_y_continuous(limits = c(0, y_limit), expand = expansion(mult = c(0, 0))) +
    coord_cartesian(clip = "off") +  # Allow annotations outside the plot area
    get_common_boxplot_components_fx() +
    labs(
      x = clinical_var,
      y = paste0("% (", value_analysis, ")"),
      title = paste0(
        "Frequency distribution \n",
        if (!is.null(subset_vars)) paste(unlist(subset_vars, use.names = FALSE), collapse = ", ") else ""
      )
    ) +
    theme(
      plot.title = element_text(
        hjust = 0.5,        # Center the title horizontally
        vjust = 1.2,        # Move the title upward
        size = 16,          # Increase title text size
        face = "bold",      # Make the title bold
        margin = margin(t = 10, b = 20)  # Add top and bottom margins
      ),
      plot.margin = margin(t = 30, r = 10, b = 10, l = 10)  # Add extra top margin
    )

  if (n_groups == 1) {
    # One group: just boxplot
    return(boxplot_plot)
  }

  if (n_groups == 2) {
    # Two groups: Wilcoxon
    comp <- list(c(as.character(unique_levels[1]), as.character(unique_levels[2])))
    boxplot_plot <- boxplot_plot +
      stat_compare_means(
        comparisons = comp,
        method = "wilcox.test",
        hide.ns = TRUE,
        label.y = top_label_y,  # Position annotation 5% below y_limit
        size = 6  # Increase text size for better visibility
      )
    return(boxplot_plot)
  }

  # More than two groups: Kruskal-Wallis (+ Dunn when variability allows)
  if (!run_dunn) {
    # Insufficient variability in some group: only Kruskal-Wallis test
    boxplot_plot <- boxplot_plot +
      stat_compare_means(method = "kruskal", label.y = top_label_y, size = 6)
    return(boxplot_plot)
  }

  # Perform Dunn's test without p-value adjustment. The plot above already
  # holds its own copy of `data`, so the factor conversion only affects the
  # test input.
  data[[clinical_var]] <- as.factor(data[[clinical_var]])
  formula_obj <- as.formula(paste("percent_100 ~", clinical_var))

  dunn_res <- dunn_test(
    formula = formula_obj,
    data = data,
    p.adjust.method = "none"
  )

  # Drop levels again if needed
  data[[clinical_var]] <- droplevels(data[[clinical_var]])
  unique_levels <- levels(data[[clinical_var]])

  # Ensure group1 and group2 are factors aligned with unique_levels; rows
  # naming a group outside those levels become NA and are removed.
  dunn_res <- dunn_res %>%
    mutate(
      group1 = factor(group1, levels = unique_levels),
      group2 = factor(group2, levels = unique_levels)
    ) %>%
    filter(!is.na(group1) & !is.na(group2))

  # If no pairwise comparisons remain, just do Kruskal-Wallis
  if (nrow(dunn_res) == 0) {
    boxplot_plot <- boxplot_plot +
      stat_compare_means(method = "kruskal", label.y = top_label_y, size = 6)
    return(boxplot_plot)
  }

  # Calculate y positions for comparisons: start 5% above max_y and stack
  # one increment per comparison. pmin() keeps every bracket inside the
  # y-scale limits; with the headroom computed above it is only a safeguard.
  num_comparisons <- nrow(dunn_res)
  base_annotation <- max_y + increment

  dunn_res <- dunn_res %>%
    arrange(desc(p)) %>%  # Optional: arrange comparisons by p-value
    mutate(
      y.position = pmin(
        base_annotation + seq_len(num_comparisons) * increment,
        top_label_y
      )
    )

  # Add Dunn's test annotations
  # NOTE(review): `mapping` and `inherit.aes` are forwarded through `...` of
  # stat_pvalue_manual(); confirm ggpubr honours them rather than its own
  # label / y.position / xmin / xmax arguments.
  boxplot_plot <- boxplot_plot +
    stat_pvalue_manual(
      data = dunn_res,
      inherit.aes = FALSE,
      mapping = aes(
        x = group1,
        xend = group2,
        y.position = y.position,
        label = p
      ),
      bracket.size = 0.5,
      tip.length = 0.01,
      size = 6  # Increase text size for better visibility
    ) +
    # Add Kruskal-Wallis test annotation
    stat_compare_means(method = "kruskal", label.y = top_label_y, size = 6)

  return(boxplot_plot)
}