diff --git a/DESCRIPTION b/DESCRIPTION index c128580..cd07877 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: Ulisse Type: Package Title: Pathway, pathway cross-talk and cell-cell communication -Version: 1.2.5 -Date: 2022-03-28 +Version: 1.3.0 +Date: 2023-11-27 Authors@R: c(person(given = "Alice", family = "Chiodi", diff --git a/NAMESPACE b/NAMESPACE index 2da8615..acc2849 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,14 +18,12 @@ export(gsea) export(ora) export(ora2enrich) export(ora_clusters) -export(ora_pipeline) export(pathway_data) export(pathway_sim_comm) export(pathway_similarity) export(perm_link) export(plot_functional_relevance) export(plot_network_CT) -export(plot_ora_comparison) export(preparing_DEG_list) export(preparing_expr_list) export(preparing_gs_list) @@ -41,12 +39,15 @@ import(grid) import(igraph) import(kit) import(msigdbr) +import(openxlsx) import(pals) import(parallel) import(plotrix) import(stats) import(stringr) importClassesFrom(DOSE,enrichResult) +importClassesFrom(DOSE,gseaResult) +importFrom(ComplexHeatmap,Heatmap) importFrom(RColorBrewer,brewer.pal) importFrom(circlize,colorRamp2) importFrom(ggforce,geom_mark_rect) @@ -55,18 +56,19 @@ importFrom(grDevices,adjustcolor) importFrom(grDevices,dev.off) importFrom(grDevices,jpeg) importFrom(grDevices,rainbow) +importFrom(grid,gpar) +importFrom(grid,grid.text) importFrom(gtools,permutations) importFrom(methods,as) importFrom(methods,is) importFrom(methods,new) -importFrom(parallel,mclapply) +importFrom(pals,brewer.purples) importFrom(qvalue,qvalue) importFrom(reshape2,acast) importFrom(scales,rescale) importFrom(stats,dhyper) +importFrom(stats,p.adjust) importFrom(stats,phyper) importFrom(stats,setNames) importFrom(stringi,stri_c) importFrom(utils,write.table) -importFrom(viridis,turbo) -importFrom(viridis,viridis) diff --git a/R/calc_gs_perm.R b/R/calc_gs_perm.R index fbb3981..265ca52 100644 --- a/R/calc_gs_perm.R +++ b/R/calc_gs_perm.R @@ -1,11 +1,10 @@ #' Internal function of pathtools -#' @param rll numeric matrix of genes-by-ranking criteria; -#' each column contains numeric values; rownames are mandatory +#' @param rll list of named ranked vectors #' @param perm vector of permuted names #' @param gs gene set -calc_gs_perm <- function(rll, perm, gs){ - out <- unlist(lapply(rll, function(x) es(which(perm %in% gs), - array(x, dimnames=list(perm))))) +calc_gs_perm <- function(rll=NULL, perm=NULL, gs=NULL){ + + out <- unlist(lapply(rll, function(x) es(idx = which(perm %in% gs), array(x, dimnames=list(perm)))[, 1])) return(out) diff --git a/R/es.R b/R/es.R index bc74269..836bccb 100644 --- a/R/es.R +++ b/R/es.R @@ -2,19 +2,18 @@ #' #' @param idx vector of indices a subset of elements of x #' @param x named vector, ranked list -#' @param le logical -#' @return enrichment score +#' @return data.frame with as, enrichmet score; tags, leading edge size; tags_perc, leading edge size percent over gene set; list_top, rank of the ES; list_top_perc, rank of the ES percent over full ranked list; lead_edge, gene names of the leading edge. #' @export #' -es <- function(idx, x, le=F){ - +es <- function(idx=NULL, x=NULL){ + #idx: indexes of a subset of elements of x #x: array of elements ## ES score #Phit(S,i) <- SUM_{i in idx, j=1..i}( r_j / Nr) #r: score #Nr: total score in x[idx] - + N <- length(x) Nh <- length(idx) hits <- rep(0, length(x)) @@ -23,60 +22,52 @@ es <- function(idx, x, le=F){ misses[idx] <- 0 hits.cumsum <- cumsum(abs(hits)) Nr <- sum(abs(hits)) #equal to sum(abs(x[idx])) - + if(Nr==0){ #nothing to do - + es <- 0 - if(le){ - deviation <- 0 - tags <- 0 - tags_perc <- 0 - list_top <- 0 - list_top_perc <- 0 - lead_edge <- 0 - lead_edge_subset <- 0 - } - + + deviation <- 0 + tags <- 0 + tags_perc <- 0 + list_top <- 0 + list_top_perc <- 0 + lead_edge <- 0 + lead_edge_subset <- 0 + }else{ - + misses.cumsum <- cumsum(misses) Phits <- hits.cumsum / Nr Pmiss <- misses.cumsum / (N - Nh) deviation <- Phits - Pmiss wm <- which.max(abs(deviation)) - + #es <- deviation[which.max(abs(deviation))] es <- deviation[wm] - - if(le){ - if(es >=0){ - - tags <- sum(idx <= wm) - list_top <- wm - lead_edge_subset <- paste0(names(x)[intersect(1:wm, idx)], collapse = ";") - - }else{ - - tags <- sum(idx > wm) - list_top <- N - wm - lead_edge_subset <- paste0(names(x)[intersect(wm:N, idx)], collapse = ";") - - } - - tags_perc <- tags / length(idx) - list_top_perc <- list_top / length(x) + + + if(es >=0){ + + tags <- sum(idx <= wm) + list_top <- wm + lead_edge_subset <- paste0(names(x)[intersect(1:wm, idx)], collapse = ";") + + }else{ + + tags <- sum(idx >= wm) + list_top <- N - (wm-1) + lead_edge_subset <- paste0(names(x)[intersect(wm:N, idx)], collapse = ";") + } + + tags_perc <- tags / length(idx) + list_top_perc <- list_top / length(x) + } - - if(le){ - - lead_edge <- tags_perc * (1-list_top_perc) * N / (N - Nh) - - return(list(es=es, deviation=deviation, lea=data.frame(tags=tags, tags_perc=tags_perc, list_top=list_top, list_top_perc=list_top_perc, lead_edge=lead_edge, lead_edge_subset=lead_edge_subset, stringsAsFactors = F))) - - }else{ - - return(es) - } - + + lead_edge <- tags_perc * (1-list_top_perc) * N / (N - Nh) + + return(data.frame(es=es, tags=tags, tags_perc=tags_perc, list_top=list_top, list_top_perc=list_top_perc, lead_edge=lead_edge, lead_edge_subset=lead_edge_subset, stringsAsFactors = F)) + } diff --git a/R/filter_gsl.R b/R/filter_gsl.R index b142249..ffd6890 100644 --- a/R/filter_gsl.R +++ b/R/filter_gsl.R @@ -6,11 +6,14 @@ #' @export -filter_gsl <- function(gsl, universe, min_size=5, max_size=500){ +filter_gsl <- function(gsl=NULL, universe=NULL, min_size=5, max_size=500){ ans <- lapply(gsl, function(x) x[x %in% universe]) ans <- lapply(ans, unique) - idx_keep <- unlist(lapply(ans, function(x) length(x) >= min_size & length(x) <= max_size)) + #idx_keep <- unlist(lapply(ans, function(x) length(x) >= min_size & length(x) <= max_size)) + + gs_size <- lengths(ans) + idx_keep <- gs_size >= min_size & gs_size <= max_size ans <- ans[idx_keep] diff --git a/R/get_genes.R b/R/get_genes.R new file mode 100644 index 0000000..7c73435 --- /dev/null +++ b/R/get_genes.R @@ -0,0 +1,13 @@ +#' Internal function + +get_genes <- function(gene_set=NULL, wb=NULL){ + + ans <- gene_set[gene_set %in% wb] + if(!is.null(eg2sym)){ + ans <- sort(eg2sym$symbol[eg2sym$gene_id %in% ans]) + } + + ans <- paste0(ans, collapse = ";") + return(ans) + +} diff --git a/R/gsea.R b/R/gsea.R index a64329d..3a2af0c 100644 --- a/R/gsea.R +++ b/R/gsea.R @@ -2,80 +2,92 @@ #' @param rl numeric matrix of genes-by-ranking criteria; each column contains numeric values; rownames are mandatory #' @param gsl named list of gene sets #' @param k integer, number of permutations -#' @param ord.mode ordering mode: -1 -> descending; 1 ascending; must be of lenght equal to ncol(rl) +#' @param min_size minimum gene set size +#' @param max_size maximum gene set size +#' @param decreasing TRUE/FALSE vector that specifies whether to order each column of rl decreasingly or not; must be of length equal to `ncol(rl)`. If NULL, all columns will be ranked in decreasing order #' @param mc_cores_path number of cores to use for parallel calculation of gene set lists; the total number of cpu used will be mc_cores_path x mc_cores_perm #' @param mc_cores_perm number of cores to use for parallel calculation of ranked list permutations; the total number of cpu used will be mc_cores_path x mc_cores_perm -#' @import parallel -#' @return data.frame with es, nes, p-value, adjusted p-value and FDR q-value +#' @param description optional named vector with gene set description; names must be gene seet identifiers +#' @param out_file_prefix prefix for .xlsx and .txt output files +#' @param min.k minimum number of permutations to obtain valid permutation-based statistics +#' @import parallel openxlsx +#' @importFrom qvalue qvalue +#' @importFrom utils write.table +#' @return data.frame with: es, enrichment score; nes normalized enrichment score; nperm, number of permutations actually used; p-value, empirical p-value; adjusted p-value, BH FDR; q_val: q-value estimnated from p-values using qvalue package; FDR q-value, empirical FDR; tags, leading edge size; tags_perc, leading edge size percent over gene set; list_top, rank of the ES; list_top_perc, rank of the ES percent over full ranked list; lead_edge, gene names of the leading edge #' @export -gsea <- function(rl, gsl, k=100, ord.mode=-1, mc_cores_path=1, mc_cores_perm=1){ - - #cheks +gsea <- function(rl=NULL, gsl=NULL, k=99, min_size=5, max_size=500, decreasing=NULL, mc_cores_path=1, mc_cores_perm=1, description=NULL, out_file_prefix="gsea_res", min.k=20){ + + #checks if(!is.matrix(rl) | !is.numeric(rl)){ stop("rl must be a numeric matrix") } - - if(length(ord.mode) != ncol(rl)){ - stop("length(ord.mode) must be equal to ncol(rl)") + + if(is.null(decreasing)){ + decreasing <- rep(TRUE, ncol(rl)) } - + + if(length(decreasing) != ncol(rl)){ + stop("length(decreasing) must be equal to ncol(rl)") + } + + #gene set size + cat("Checking gene sets\n") + #gsl <- lapply(gsl, function(x) x[ x %in% rownames(rl)]) + #gsl_size <- lengths(gsl) + #gsl <- gsl[gsl_size >= min_size & gsl_size <= max_size] + #print(gsl_size) + + gsl <- filter_gsl(gsl = gsl, universe = rownames(rl), min_size = min_size, max_size = max_size) + gsl_size <- lengths(gsl) + + if(length(gsl)==0){ + stop("Problem with gene sets. Check identifiers and sizes.\n") + } + + cat("#{gene sets} with genes occurring in the ranked lists and size in [", min_size, ",", max_size, "]:", length(gsl), "\n") + + + if(!is.null(description)){ + description <- description[match(names(gsl), names(description))] + }else{ + description <- setNames(rep(NA, length(gsl)), names(gsl)) + } + if(length(description)==0){ + description <- setNames(rep(NA, length(gsl)), names(gsl)) + } + #create the list of ranked vectors + cat("Decreasing:", decreasing, "\n") rll <- vector('list', ncol(rl)) names(rll) <- colnames(rl) for(i in 1:length(rll)){ - rll[[i]] <- sort(array(rl[, i], dimnames = list(rownames(rl))), decreasing = ord.mode[i]==-1) + #rll[[i]] <- sort(array(rl[, i], dimnames = list(rownames(rl))), decreasing = decreasing[i]) + rll[[i]] <- sort(setNames(rl[, i], rownames(rl)), decreasing = decreasing[i]) + if(length(unique(rll[[i]])) != length(rll[[i]])){ + cat("Found ties in ranked list ", names(rll)[i], "!!!\n") + } } - + #permutation of gene ids cat('generating', k, 'permutations\n') x_perm <- lapply(1:k, function(x) sample(rownames(rl), nrow(rl))) - + #real es - print("ES...") - #real_es <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y))) - #real_es <- do.call(rbind, real_es) - - real_es_data <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y, le=T))) + cat("ES of input data...\n") + real_es_data <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y))) real_es <- do.call(rbind, lapply(real_es_data, function(x) unlist(lapply(x, function(y) y$es)))) - + leading_edge <- vector("list", length(rll)) names(leading_edge) <- names(rll) for(i in 1:length(leading_edge)){ - leading_edge[[i]] <- do.call(rbind, lapply(real_es_data, function(x) x[[i]]$lea)) + leading_edge[[i]] <- do.call(rbind, lapply(real_es_data, function(x) x[[i]][, -1])) #rm the enrichment score } - + #permutations - print("calculating permutations") - if(mc_cores_path==1){ - if(mc_cores_perm == 1){ - #res <- lapply(gsl, function(x) unlist(lapply(x_perm, function(y) calc_gs_perm(rl, y, x)))) - res <- lapply(gsl, function(x) do.call(rbind, lapply(x_perm, function(y) calc_gs_perm(rll, y, x)))) - }else{ - cat(k, "permutations on", mc_cores_perm, "cores\n") - #res <- lapply(gsl, function(x) unlist(mclapply(x_perm, function(y) calc_gs_perm(rl, y, x), mc.cores=mc_cores_perm))) - res <- lapply(gsl, function(x) do.call(rbind, parallel::mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm))) - } - }else{ - cat(length(gsl), "gene sets on", mc_cores_path, "cores\n") - if(mc_cores_perm == 1){ - #res <- parallel::mclapply(gsl, function(x) unlist(lapply(x_perm, function(y) calc_gs_perm(rl, y, x))), mc.cores = mc_cores_path) - res <- parallel::mclapply(gsl, function(x) do.call(rbind, lapply(x_perm, function(y) calc_gs_perm(rll, y, x))), mc.cores = mc_cores_path) - }else{ - cat(k, "permutations on", mc_cores_perm, "cores\n") - #res <- parallel::mclapply(gsl, function(x) unlist(parallel::mclapply(x_perm, function(y) calc_gs_perm(rl, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path) - res <- parallel::mclapply(gsl, function(x) do.call(rbind, parallel::mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path) - } - } - - - #list of pathwa-by-k matrices of permuted es - #res <- do.call(rbind, res) - # if(nrow(res) != length(gsl) | ncol(res) != k){ - # stop("not all the permutations returned a correct value\n") - # } - #res <- cbind(real_es, res) - + cat("ES of permutations...\n") + res <- mclapply(gsl, function(x) do.call(rbind, mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path) + temp <- vector('list', length(rll)) names(temp) <- colnames(rl) for(i in 1:length(rll)){ @@ -83,52 +95,127 @@ gsea <- function(rl, gsl, k=100, ord.mode=-1, mc_cores_path=1, mc_cores_perm=1){ } res <- temp rm(temp) - - - - + + #statistics print("calculating statistics...") - + #the first column is the real value, and it is included in the calculation of p - #if ES* > 0 -> p = # (ESp >= ES*) / (k+1) - #if ES* < 0 -> p = # (ESp <= ES*) / (k+1) + #if ES* > 0 -> p = # (ESp >= ES*) / (positive null) + #if ES* < 0 -> p = # (ESp <= ES*) / (negative null) out <- res - + for(i in 1:length(rll)){ - - p_val <- apply(res[[i]], 1, function(x) ifelse(x[1] >= 0, sum(x >= x[1]) / length(x), sum(x <= x[1]) / length(x))) - + + cat("Proceeding with", names(rll)[i], "...\n") + n_pos_perm <- rowSums(res[[i]]>0) + n_neg_perm <- rowSums(res[[i]]<0) + + p_val <- apply(res[[i]], 1, function(x) ifelse(x[1] >= 0, sum(x >= x[1]) / length(x[x>=0]), sum(x <= x[1]) / length(x[x<=0]))) + + idx_na <- which(res[[i]][, 1] == 0) + if(length(idx_na)>0){ + cat("\tES==0:\n") + cat("\t", rownames(res[[i]])[idx_na], "\n") + p_val[idx_na] <- NA ### + } + idx_na <- which(res[[i]][, 1] > 0 & n_pos_perm < min.k) + if(length(idx_na)>0){ + cat("\tES>0 but less than", min.k, " positive ES in permutations\n") + cat("\t", rownames(res[[i]])[idx_na], "\n") + p_val[idx_na] <- NA ### + } + idx_na <- which(res[[i]][, 1] < 0 & n_neg_perm < min.k) + if(length(idx_na)>0){ + cat("\tES<0 but less than", min.k, " negative ES in permutations\n") + cat("\t", rownames(res[[i]])[idx_na], "\n") + p_val[idx_na] <- NA ### + } + #normalized ES means <- t(apply(res[[i]], 1, function(x) c(mean(x[x>0]), abs(mean(x[x<0]))))) #positive, negative + means[is.nan(means)] <- NA #NaN values are caused by the absence of any positive or negative value nes <- res[[i]] / means[, 1] nes_neg <- res[[i]] / means[, 2] nes[res[[i]] < 0] <- nes_neg[res[[i]] < 0] + nes[is.nan(nes)] <- NA #NaN values are caused by 0/0 rm(means, nes_neg) - + + nes[nes > 0 & n_pos_perm < min.k] <- NA + nes[nes < 0 & n_neg_perm < min.k] <- NA + #calculate FDR all_nes <- as.numeric(nes) - n_nes_pos <- sum(nes>=0) - n_nes_neg <- sum(nes<=0) - n_real_nes_pos <- sum(nes[,1] >= 0) - n_real_nes_neg <- sum(nes[,1] <= 0) - if((n_nes_pos + n_nes_neg) != (k*length(gsl) + nrow(nes))){ - warning('some nes value is equal to zero') - } - - #FDR: NES* > 0: fdrq = #(all positive NESp >= NES*) / #(all positive NESp) / [ #(all NES* >= NES*) / (all positive NES*) ] - #FDR: NES* < 0: fdrq = #(all negative NESp <= NES*) / #(all negative NESp) / [ #(all NES* <= NES*) / (all negative NES*) ] - fdrq <- sapply(nes[, 1], function(x) ifelse(x>0, sum(all_nes >= x) / n_nes_pos, sum(all_nes <= x) / n_nes_neg)) - fdrq <- fdrq / sapply(nes[, 1], function(x) ifelse(x>0, sum(nes[, 1] >= x) / n_real_nes_pos, sum(nes[, 1] <= x) / n_real_nes_neg)) + nes_pos <- all_nes[which(all_nes > 0)] + nes_neg <- all_nes[which(all_nes < 0)] + n_nes_pos <- length(nes_pos) + n_nes_neg <- length(nes_neg) + + real_nes <- nes[, 1] + real_nes <- real_nes[!is.na(real_nes)] + real_nes_pos <- real_nes[which(real_nes>0)] + real_nes_neg <- real_nes[which(real_nes<0)] + n_real_nes_pos <- length(real_nes_pos) + n_real_nes_neg <- length(real_nes_neg) + + #FDR: NES* > 0: fdrq = [#(all positive NESp >= NES*) / #(all positive NESp)] / [ #(all positive NES* >= NES*) / (all positive NES*) ] + #FDR: NES* < 0: fdrq = [#(all negative NESp <= NES*) / #(all negative NESp)] / [ #(all negative NES* <= NES*) / (all negative NES*) ] + #fdrq <- sapply(nes[, 1], function(x) ifelse(x>0, sum(all_nes >= x, na.rm = T) / n_nes_pos, sum(all_nes <= x, na.rm = T) / n_nes_neg)) + #fdrq <- fdrq / sapply(nes[, 1], function(x) ifelse(x>0, sum(nes[, 1] >= x, na.rm = T) / n_real_nes_pos, sum(nes[, 1] <= x, na.rm = T) / n_real_nes_neg)) + + fdrq <- sapply(nes[, 1], function(x) ifelse(x>0, sum(nes_pos >= x, na.rm = T) / n_nes_pos, sum(nes_neg <= x, na.rm = T) / n_nes_neg)) + fdrq <- fdrq / sapply(nes[, 1], function(x) ifelse(x>0, sum(real_nes_pos >= x, na.rm = T) / n_real_nes_pos, sum(real_nes_neg <= x, na.rm = T) / n_real_nes_neg)) + rm(all_nes) - + + fdrq[fdrq>1] <- 1 + idx_replace <- which(fdrq abs(min.ES) ) { + ES <- max.ES + } else { + ES <- min.ES + } + + df <- data.frame(x=seq_along(runningES), + runningScore=runningES, + position=as.integer(hits) + ) + + if(fortify==TRUE) { + return(df) + } + + df$gene = names(geneList) + res <- list(ES=ES, runningES = df) + + return(res) + } + + leading_edge <- function(observed_info) { + core_enrichment <- lapply(observed_info, function(x) { + runningES <- x$runningES + runningES <- runningES[runningES$position == 1,] + ES <- x$ES + if (ES >= 0) { + i <- which.max(runningES$runningScore) + leading_gene <- runningES$gene[1:i] + } else { + i <- which.min(runningES$runningScore) + leading_gene <- runningES$gene[-c(1:(i-1))] + } + return(leading_gene) + }) + + rank <- sapply(observed_info, function(x) { + runningES <- x$runningES + ES <- x$ES + if (ES >= 0) { + rr <- which.max(runningES$runningScore) + } else { + i <- which.min(runningES$runningScore) + rr <- nrow(runningES) - i + 1 + } + return(rr) + }) + + tags <- sapply(observed_info, function(x) { + runningES <- x$runningES + runningES <- runningES[runningES$position == 1,] + ES <- x$ES + if (ES >= 0) { + i <- which.max(runningES$runningScore) + res <- i/nrow(runningES) + } else { + i <- which.min(runningES$runningScore) + res <- (nrow(runningES) - i + 1)/nrow(runningES) + } + return(res) + }) + + ll <- sapply(observed_info, function(x) { + runningES <- x$runningES + ES <- x$ES + if (ES >= 0) { + i <- which.max(runningES$runningScore) + res <- i/nrow(runningES) + } else { + i <- which.min(runningES$runningScore) + res <- (nrow(runningES) - i + 1)/nrow(runningES) + } + return(res) + }) + + N <- nrow(observed_info[[1]]$runningES) + setSize <- sapply(observed_info, function(x) sum(x$runningES$position)) + signal <- tags * (1-ll) * (N / (N - setSize)) + + tags <- paste0(round(tags * 100), "%") + ll <- paste0(round(ll * 100), "%") + signal <- paste0(round(signal * 100), "%") + leading_edge <- paste0('tags=', tags, ", list=", ll, ", signal=", signal) + + res <- list(rank = rank, + tags = tags, + list = ll, + signal = signal, + leading_edge = leading_edge, + core_enrichment = core_enrichment) + return(res) + } + + params <- list(pvalueCutoff = 1, + nPerm = NA, + pAdjustMethod = "BH", + exponent = 1, + minGSSize = NA, + maxGSSize = NA + ) + #checks + if(!is.matrix(rl) | !is.numeric(rl)){ + stop("rl must be a numeric matrix") + } + + if(is.null(decreasing)){ + decreasing <- rep(TRUE, ncol(rl)) + } + + if(length(decreasing) != ncol(rl)){ + stop("length(decreasing) must be equal to ncol(rl)") + } + + #gene set size + gsl <- filter_gsl(gsl = gsl, universe = rownames(rl), min_size = min_size, max_size = max_size) + gsl_size <- lengths(gsl) + + #create the list of ranked vectors + cat("Decreasing:", decreasing, "\n") + rll <- vector('list', ncol(rl)) + names(rll) <- colnames(rl) + ans <- rll + + for(i in 1:length(rll)){ + + rll[[i]] <- sort(setNames(rl[, i], rownames(rl)), decreasing = decreasing[i]) + if(length(unique(rll[[i]])) != length(rll[[i]])){ + cat("Found ties in ranked list ", names(rll)[i], "!!!\n") + } + + ans[[i]] <- data.frame( + ID = as.character(gsea_res[[i]]$id), + Description = as.character(gsea_res[[i]]$description), + setSize = gsl_size[match(gsea_res[[i]]$id, names(gsl_size))], + enrichmentScore = gsea_res[[i]]$es, + NES = gsea_res[[i]]$nes, + pvalue = gsea_res[[i]]$p_val, + p.adjust = gsea_res[[i]]$adj_p_val, + qvalue = gsea_res[[i]]$q_val, + stringsAsFactors = FALSE + ) + + idx <- order(ans[[i]]$p.adjust, -abs(ans[[i]]$NES), decreasing = FALSE) + ans[[i]] <- ans[[i]][idx, ] + + rownames(ans[[i]]) <- ans[[i]]$ID + + observed_info <- lapply(gsl[ans[[i]]$ID], function(gs) gseaScores(geneSet=gs, geneList=rll[[i]])) + + ledge <- leading_edge(observed_info) + + ans[[i]]$rank <- ledge$rank + ans[[i]]$leading_edge <- ledge$leading_edge + ans[[i]]$core_enrichment <- sapply(ledge$core_enrichment, paste0, collapse='/') + + ans[[i]] <- new("gseaResult", + result = ans[[i]], + geneSets = gsl, + geneList = rll[[i]], + permScores = matrix(), + params = params, + readable = FALSE + ) + + } + + return(ans) + +} \ No newline at end of file diff --git a/R/ora.R b/R/ora.R index c40a80b..e6f5e0f 100644 --- a/R/ora.R +++ b/R/ora.R @@ -1,24 +1,98 @@ #' Over Representation Analysis -#' @param wb hits (white balls) -#' @param bb other elements (black balls) +#' @param wb list of character vectors with hits (white balls) +#' @param universe universe, character vector +#' @param wbd_min minimum number of white balls drawn #' @param gsl named list of sets #' @param p_adj_method p value adjustment method, see p.adjust.methods +#' @param min_size minimum gene set size +#' @param max_size maximum gene set size +#' @param out_file_prefix prefix for .xlsx and .txt output files +#' @import openxlsx #' @importFrom qvalue qvalue +#' @importFrom stats p.adjust +#' @importFrom utils write.table #' @export -ora <- function(wb, bb, gsl, p_adj_method='fdr'){ - - - out <- lapply(gsl, function(x) ora1gs(wb, bb, x)) - - out <- as.data.frame(do.call(rbind, out), stringsAsFactors = FALSE) - out$N <- length(wb) + length(bb) - out$exp <- out$wb * out$bd / out$N - out$id <- rownames(out) - out$p_adj <- stats::p.adjust(out$p, method = p_adj_method) - #out$q_val <- qvalue::qvalue(p=out$p, lambda=0.05, pi0.method="bootstrap")$qvalues - out$er <- out$wbd / out$exp - - return(out[, c('id', 'N', 'wb', 'bb', 'bd', 'wbd', 'exp', 'er', 'p', 'p_adj')]) - +ora <- function(wb=NULL, universe=NULL, gsl=NULL, p_adj_method='fdr', description=NULL, wbd_min=1, min_size=5, max_size=500, out_file_prefix="ora_res"){ + + #gene set size + cat("Checking gene sets\n") + + gsl <- filter_gsl(gsl = gsl, universe = universe, min_size = min_size, max_size = max_size) + gsl_size <- lengths(gsl) + + if(length(gsl)==0){ + stop("Problem with gene sets. Check identifiers and sizes.\n") + } + + if(!is.null(description)){ + description <- description[match(names(gsl), names(description))] + }else{ + description <- setNames(names(gsl), names(gsl)) + } + if(length(description)==0){ + description <- setNames(names(gsl), names(gsl)) + } + + + cat("#{gene sets} that overlap with the universe and have size in [", min_size, ",", max_size, "]:", length(gsl), "\n") + + out <- wb + + for(i in 1:length(wb)){ + + bb <- universe[!universe %in% wb[[i]]] + out[[i]] <- lapply(gsl, function(x) ora1gs(wb[[i]], bb, x)) + out[[i]] <- as.data.frame(do.call(rbind, out[[i]]), stringsAsFactors = FALSE) + + if(wbd_min>0){ + out[[i]] <- out[[i]][out[[i]]$wbd >= wbd_min, ] + } + + cat("#{gene sets} with wbd >=", wbd_min, ":", nrow(out[[i]]), "\n") + + out[[i]]$N <- length(wb) + length(bb) + out[[i]]$exp <- out[[i]]$wb * out[[i]]$bd / out[[i]]$N + out[[i]]$id <- rownames(out[[i]]) + out[[i]]$p_adj <- p.adjust(out[[i]]$p, method = p_adj_method) + out[[i]]$genes <- unlist(lapply(gsl[match(out[[i]]$id, names(gsl))], function(x) paste0(sort(x[x %in% wb[[i]]]), collapse = ";"))) + + #### FROM DOSE + qobj <- tryCatch(qvalue(out[[i]]$p, lambda=0.05, pi0.method="bootstrap"), error=function(e) NULL) + if (inherits(qobj, "qvalue")) { + qvalues <- qobj$qvalues + } else { + qvalues <- NA + } + + out[[i]]$q_val <- qvalues + out[[i]]$er <- out[[i]]$wbd / out[[i]]$exp + out[[i]]$description <- description[match(out[[i]]$id, names(description))] + + out[[i]] <- out[[i]][order(out[[i]]$p_adj, -out[[i]]$er), c('id', 'N', 'wb', 'bb', 'bd', 'wbd', 'exp', 'er', 'p', 'p_adj', 'q_val', 'description', 'genes')] + } + + #writing output to file + if(!is.null(out_file_prefix)){ + + out_file_xlsx <- paste0(out_file_prefix, ".xlsx") + cat("Writing output to", out_file_xlsx, " and ", paste0(out_file_prefix, ".[run].txt"), "...\n") + wb <- createWorkbook() + + for(i in 1:length(out)){ + + addWorksheet(wb, names(out)[i]) + writeData(wb, names(out)[i], out[[i]]) + + write.table(out[[i]], file = paste0(out_file_prefix, ".", names(out)[i], ".txt"), row.names = F, sep="\t") + + } + + saveWorkbook(wb, out_file_xlsx, TRUE) + + } + + + return(out) + } diff --git a/R/ora1gs.R b/R/ora1gs.R index 9e479ab..b837cea 100644 --- a/R/ora1gs.R +++ b/R/ora1gs.R @@ -5,10 +5,10 @@ #' @importFrom stats phyper dhyper #' -ora1gs <- function(wb, bb, bd){ +ora1gs <- function(wb=NULL, bb=NULL, bd=NULL){ wbd <- bd[bd %in% wb] - p_out <- stats::phyper(length(wbd), length(wb), length(bb), length(bd), lower.tail = FALSE) + stats::dhyper(length(wbd), length(wb), length(bb), length(bd)) + p_out <- phyper(length(wbd), length(wb), length(bb), length(bd), lower.tail = FALSE) + dhyper(length(wbd), length(wb), length(bb), length(bd)) return(data.frame(wb=length(wb), bb=length(bb), bd=length(bd), wbd=length(wbd), p=p_out, stringsAsFactors = F)) } diff --git a/R/ora2enrich.R b/R/ora2enrich.R index db628d8..17b0cb1 100644 --- a/R/ora2enrich.R +++ b/R/ora2enrich.R @@ -1,12 +1,12 @@ #' ora2enrich #' this function translates the result of an ORA into the DOSE class "enrichResult" -#' @param ulisse_res ora analysis result +#' @param ora_res ora analysis result #' @param pvalueCutoff cut off over adjusted p value= 0.25 #' @param pAdjustMethod method for p value adjustement #' @param qvalueCutoff cut off over q values -#' @param gene input gene list +#' @param wb input gene list #' @param universe universe of genes -#' @param geneSets list of gene sets +#' @param gsl list of gene sets #' @param organism organism #' @param keytype keytype #' @param ontology ontology @@ -18,28 +18,38 @@ #' @importFrom methods new -ora2enrich <- function(ulisse_res, pvalueCutoff = 0.25, pAdjustMethod = "BH", qvalueCutoff = 0.25, gene = NULL, universe = NULL, geneSets = NULL, organism = "UNKNOWN", keytype = "UNKNOWN", ontology = "UNKNOWN", readable = FALSE){ - - geneID <- unlist(sapply(ulisse_res$wbd_symbols, function(x) gsub(";", "/", x))) - - if(is.null(ulisse_res$q_val)){ - q_val <- qvalue::qvalue(p=ulisse_res$p, lambda=0.05, pi0.method="bootstrap")$qvalues - }else{ - q_val <- ulisse_res$q_val +ora2enrich <- function(ora_res=NULL, pvalueCutoff = 1, pAdjustMethod = "BH", qvalueCutoff = 1, wb = NULL, universe = NULL, gsl = NULL, organism = "UNKNOWN", keytype = "UNKNOWN", ontology = "UNKNOWN", readable = FALSE, min_size=5, max_size=500){ + + gsl <- filter_gsl(gsl = gsl, universe = universe, min_size = min_size, max_size = max_size) + gsl_size <- lengths(gsl) + + if(length(gsl)==0){ + stop("Problem with gene sets. Check identifiers and sizes.\n") } - - Over <- data.frame(ID = as.character(ulisse_res$gsid), GeneRatio = paste0(ulisse_res$wbd, "/", ulisse_res$bd), BgRatio = paste0(ulisse_res$wb, "/", ulisse_res$N), pvalue = ulisse_res$p, p.adjust=ulisse_res$p_adj, qvalue=q_val, Count=ulisse_res$wbd, geneID=geneID, stringsAsFactors = FALSE) - - Over$Description <- ulisse_res$name - row.names(Over) <- as.character(Over$ID) - - x <- new("enrichResult", result = Over, pvalueCutoff = pvalueCutoff, - pAdjustMethod = pAdjustMethod, qvalueCutoff = qvalueCutoff, - gene = as.character(gene), universe = universe, geneSets = geneSets, - organism = "UNKNOWN", keytype = "UNKNOWN", ontology = "UNKNOWN", - readable = FALSE) - - return(x) - - + + ora_res_gsl <- lapply(ora_res, function(x) x$id) + gsl <- gsl[names(gsl) %in% ora_res_gsl] + + ans <- ora_res + for(i in 1:length(ora_res)){ + + geneID <- unlist(sapply(ora_res[[i]]$genes, function(x) gsub(";", "/", x))) + + Over <- data.frame(ID = as.character(ora_res[[i]]$id), GeneRatio = paste0(ora_res[[i]]$wbd, "/", ora_res[[i]]$bd), BgRatio = paste0(ora_res[[i]]$wb, "/", ora_res[[i]]$N), pvalue = ora_res[[i]]$p, p.adjust=ora_res[[i]]$p_adj, qvalue=ora_res[[i]]$q_val, Count=ora_res[[i]]$wbd, geneID=geneID, stringsAsFactors = FALSE) + + Over$Description <- ora_res[[i]]$description + row.names(Over) <- as.character(Over$ID) + + ans[[i]] <- new("enrichResult", result = Over, pvalueCutoff = pvalueCutoff, + pAdjustMethod = pAdjustMethod, qvalueCutoff = qvalueCutoff, + gene = as.character(wb[[i]]), universe = universe, geneSets = gsl, + organism = "UNKNOWN", keytype = "UNKNOWN", ontology = "UNKNOWN", + readable = FALSE) + + + } + + return(ans) + + } diff --git a/R/ora_pipeline.R b/R/ora_pipeline.R deleted file mode 100644 index 485db1d..0000000 --- a/R/ora_pipeline.R +++ /dev/null @@ -1,60 +0,0 @@ -#' ora pipeline -#' @param deg_list gene list of interest -#' @param universe all genes -#' @param gs list of gene lists -#' @param gsid2name data.frame for annotation of gene sets. It must contain the column "gsid" with gene set ids that match those in gs -#' @param mc.cores number of cores -#' @param eg2sym data.frame for translating gene ids into symbol. It must contain the columncs gene_id and symbol -#' @param min_size minimum gene set size -#' @param max_size maximum gene set size -#' @param out_dir output directory -#' @param write_tables whether to write or not the output -#' @param id project name -#' @importFrom parallel mclapply -#' @export - -ora_pipeline <- function(deg_list=NULL, universe=NULL, gs=NULL, gsid2name=NULL, mc.cores=1, eg2sym=NULL, min_size = 5, max_size = 500, out_dir="./", write_tables=FALSE, id="ora"){ - - #### ORA - get_genes <- function(gene_set=NULL, wb=NULL, eg2sym=NULL){ - - ans <- gene_set[gene_set %in% wb] - if(!is.null(eg2sym)){ - ans <- sort(eg2sym$symbol[eg2sym$gene_id %in% ans]) - } - - ans <- paste0(ans, collapse = ";") - return(ans) - - } - - #filter gene set lists according to the universe - gs_i <- lapply(gs, function(x) filter_gsl(x, universe, min_size = min_size, max_size = max_size)) - - #ORA - ora_res <- mclapply(gs_i, function(x) ora(deg_list, universe[!universe %in% deg_list], gsl = x, p_adj_method = "fdr"), mc.cores = mc.cores) - - #add pathwway info - ora_res <- lapply(ora_res, function(x) merge(gsid2name, x, by.x="gsid", by.y="id", all.y=T)) - - #add gene info - cat("adding gene symbols...\n") - #k are the pathway dbs - for(k in 1:length(ora_res)){ - ora_res[[k]]$wbd_symbols <- unlist(parallel::mclapply(ora_res[[k]]$gsid, function(x) get_genes(gs_i[[k]][names(gs_i[[k]]) == x][[1]], deg_list, eg2sym), mc.cores=mc.cores)) - } - - #re-order and attach the project id - ora_res <- lapply(ora_res, function(x) data.frame(id=id, x[order(x$p_adj, x$p, -x$er), ], stringsAsFactors = F)) - - #write the output - if(write_tables){ - dir.create(out_dir, recursive = T) - for(k in 1:length(ora_res)){ - file <- paste0(out_dir, "/", id, "_", names(ora_res)[k], ".txt") - utils::write.table(ora_res[[k]], file=file, sep="\t", row.names = F) - } - } - - return(ora_res) -} diff --git a/R/plot_gsea_heatmap.R b/R/plot_gsea_heatmap.R new file mode 100644 index 0000000..f863295 --- /dev/null +++ b/R/plot_gsea_heatmap.R @@ -0,0 +1,73 @@ +#' Plot heatmap of GSEA results for multiple runs +#' @param gsea_res result of function gsea() +#' @param nes_sign whether to print nes sign +#' @param a alpha over FDR +#' @param na_col color for FDR > a +#' @param max_gs maximum number of gene sets that will be plotted +#' @param ... further arguments to ComplexHeatmap::Heatmap() +#' @importFrom ComplexHeatmap Heatmap +#' @importFrom pals brewer.purples +#' @importFrom grid gpar grid.text + +plot_gsea_heatmap <- function(gsea_res=NULL, nes_sign=TRUE, a=0.25, na_col="khaki", max_gs=50, ...){ + + if(length(gsea_res)<2){ + stop("This function requires at least two GSEAs.\n") + } + + cat("Size: ", unlist(lapply(gsea_res, nrow)), "\n") + + X_matrix <- merge(gsea_res[[1]][, c("id", "FDRq")], gsea_res[[2]][, c("id", "FDRq")], by=1, all=T, sort=F) + colnames(X_matrix)[2:3] <- names(gsea_res)[1:2] + + if(length(gsea_res)>2){ + for(i in 3:length(gsea_res)){ + X_matrix <- merge(X_matrix, gsea_res[[i]][, c("id", "FDRq")], by=1, all = T, sort=F) + colnames(X_matrix)[i+1] <- names(gsea_res)[i] + } + } + + rownames(X_matrix) <- X_matrix$id + X_matrix$id <- NULL + #rownames(X_matrix) <- gss$gs_name[match(rownames(X_matrix), gss$gs_id)] + X_matrix <- -log10(X_matrix) + X_matrix[X_matrix < -log10(a)] <- NA + X_matrix <- as.matrix(X_matrix) + + if(nrow(X_matrix)>max_gs){ + cat("Only the top", max_gs, "gene sets will be considered...\n") + X_matrix <- X_matrix[1:max_gs, ] + } + + if(any(is.na(X_matrix))){ + warning("The presence of many NA values may cause errors in hclust(). To avoid this error, (i) increase 'a' or (ii) set cluster_columns = F and/or cluster_rows = F.\n") + } + + if(nes_sign){ + + nes_sign <- merge(gsea_res[[1]][, c("id", "nes")], gsea_res[[2]][, c("id", "nes")], by=1, all = T, sort=F) + colnames(nes_sign)[2:3] <- names(gsea_res)[1:2] + + if(length(gsea_res)>2){ + for(i in 3:length(gsea_res)){ + nes_sign <- merge(nes_sign, gsea_res[[i]][, c("id", "nes")], by=1, all = T, sort=F) + colnames(nes_sign)[1+i] <- names(gsea_res)[i] + } + } + rownames(nes_sign) <- nes_sign$id + nes_sign$id <- NULL + nes_sign <- sign(nes_sign) + nes_sign[which(nes_sign>0, arr.ind = T)] <- "+" + nes_sign[which(nes_sign==-1, arr.ind = T)] <- "-" + nes_sign[which(is.na(nes_sign), arr.ind = T)] <- "" + }else{ + nes_sign <- X_matrix + nes_sign[] <- "" + } + + nes_sign <- nes_sign[match(rownames(X_matrix), rownames(nes_sign)), ] + nes_sign[is.na(X_matrix)] <- "" + + Heatmap(X_matrix, col=brewer.purples(5), na_col = na_col, name="-log10(q)", rect_gp = gpar(col = "black", lwd = 1), cell_fun = function(j, i, x, y, width, height, fill) { grid.text(nes_sign[i, j], x, y, gp = gpar(fontsize = 10, fontface = "bold", col=ifelse(nes_sign[i, j]=="+", "firebrick", "limegreen")))}, ...) + +} \ No newline at end of file diff --git a/R/plot_ora_comparison.R b/R/plot_ora_comparison.R deleted file mode 100644 index 4bdf366..0000000 --- a/R/plot_ora_comparison.R +++ /dev/null @@ -1,112 +0,0 @@ -#' Plot a comparion between two ORAs -#' @param ora_res_list list of results of ora_pipeline -#' @param p_sig thresold for significant pathways -#' @param p_max maximum p-value allowed for marginally significant pathways -#' @param dir_out output directory -#' @param mar mar parameter -#' @param mgp mgp parameter -#' @param cex.axis cex.axis -#' @param use_nominal_p whether to use or not nominal p-values -#' @param col_pal color palette function -#' @return data.frame with merged results only for pathways that satisfy p_sig in at least one condition -#' @importFrom viridis turbo viridis -#' @export - -plot_ora_comparison <- function(ora_res_list=NULL, p_sig=0.001, p_max=0.1, dir_out="./", mar=c(2.5, 20, 1, 1), mgp=c(1.2, 0.3, 0), cex.axis=0.6, use_nominal_p=TRUE, col_pal=NULL){ - - p_type <- "p" - if(!use_nominal_p){ - p_type <- "p_adj" - } - if(is.null(names(ora_res_list))){ - ora_names <- paste0("id", 1:length(ora_res_list)) - }else{ - ora_names <- names(ora_res_list) - } - - - ora_res_merged <- merge(ora_res_list[[1]][, c("gsid", "name", "er", p_type)], ora_res_list[[2]][, c("gsid", "name", "er", p_type)], by=c("gsid", "name"), sort=F, suffixes = c(paste0("_", ora_names[1:2]))) - if(length(ora_res_list) > 2){ - for(i in 3:length(ora_res_list)){ - ora_res_merged <- merge(ora_res_merged, ora_res_list[[i]][, c("gsid", "name", "er", p_type)], by=c("gsid", "name"), sort=F) - } - } - colnames(ora_res_merged)[match(c("er", "p"), colnames(ora_res_merged))] <- c(paste0(c("er", "p"), "_", ora_names[i])) - - p_columns <- which(grepl("^p_", colnames(ora_res_merged))) - er_columns <- which(grepl("^er_", colnames(ora_res_merged))) - - ora_res_merged_filt <- ora_res_merged[apply(ora_res_merged[, p_columns], 1, function(x) any(x", p_max), paste0("<", p_max), paste0("<", p_sig), ora_names), cex=cex.axis, col=c(1, 1, 1, col_pal1)) - - grDevices::dev.off() - - - grDevices::jpeg(paste0(dir_out, "/ora_res_dotplot2.jpg"), width = 200, height = 200, units="mm", res=300) - - graphics::par(mar=mar) - graphics::par(mgp=mgp) - graphics::layout(matrix(1:2, ncol = 2), widths = c(0.85, 0.15)) - - plot(rep(1, nrow(ora_res_merged_filt)), 1:nrow(ora_res_merged_filt), cex=c(1, 1.3, 1.6, 2)[as.numeric(er_fact)[1:nrow(ora_res_merged_filt)]], pch=16, col=grDevices::adjustcolor(col_pal2[as.numeric(p_fact)[1:nrow(ora_res_merged_filt)]], 0.8), xaxt="none", yaxt="none", ylab="", xlab="", cex.axis=cex.axis, xlim = c(0.5, length(ora_res_list)+0.5)) - - for(i in 2:length(ora_res_list)){ - graphics::points(rep(i, nrow(ora_res_merged_filt)), 1:nrow(ora_res_merged_filt), cex=c(1, 1.3, 1.6, 2)[as.numeric(er_fact)[(((i-1)*nrow(ora_res_merged_filt))+1):(nrow(ora_res_merged_filt)*i)]], pch=16, col=grDevices::adjustcolor(col_pal2[as.numeric(p_fact)[(((i-1)*nrow(ora_res_merged_filt))+1):(nrow(ora_res_merged_filt)*i)]], 0.8), cex.axis=cex.axis) - } - - graphics::abline(h=1:nrow(ora_res_merged_filt), col="gray", lty=3) - graphics::abline(v=1:length(ora_res_list), col="gray", lty=3) - - graphics::axis(1, at=1:length(ora_res_list), ora_names, cex.axis=cex.axis, las=2, tick = F) - graphics::axis(2, at=1:nrow(ora_res_merged_filt), ora_res_merged_filt$name, cex.axis=cex.axis, las=2, tick = F) - graphics::axis(2, at=(1:nrow(ora_res_merged_filt))[all_ok], ora_res_merged_filt$name[all_ok], cex.axis=cex.axis, las=2, tick = F, font=2) - - graphics::par(mar=c(mar[1], 0, mar[3], 1)) - graphics::plot.new() - graphics::legend("center", pt.cex=c(1, 1.3, 1.6, 2), legend = levels(er_fact), cex=cex.axis, pch=1) - graphics::legend("bottom", pch=16, legend=c(paste0(">", p_max), paste0("<", p_max), paste0("<", p_sig)), cex=cex.axis, col=col_pal2) - - grDevices::dev.off() - - - return(ora_res_merged_filt) - -} diff --git a/R/plot_ora_heatmap.R b/R/plot_ora_heatmap.R new file mode 100644 index 0000000..f0fab09 --- /dev/null +++ b/R/plot_ora_heatmap.R @@ -0,0 +1,52 @@ +#' Plot heatmap of ORA results for multiple runs +#' @param ora_res result of function gsea() +#' @param nes_sign whether to print nes sign +#' @param a alpha over FDR +#' @param na_col color for FDR > a +#' @param max_gs maximum number of gene sets that will be plotted +#' @param ... further arguments to ComplexHeatmap::Heatmap() +#' @importFrom ComplexHeatmap Heatmap +#' @importFrom pals brewer.purples +#' @importFrom grid gpar grid.text + +plot_ora_heatmap <- function(ora_res=NULL, p.stat="p_adj", a=0.25, na_col="khaki", max_gs=50, ...){ + + if(length(ora_res)<2){ + stop("This function requires at least two ORAs.\n") + } + + cat("Size: ", unlist(lapply(ora_res, nrow)), "\n") + + if(!p.stat %in% colnames(ora_res[[1]])){ + stop("p.stat", p.stat, "is not in", colnames(ora_res[[1]]), ".\n") + } + + X_matrix <- merge(ora_res[[1]][, c("id", p.stat)], ora_res[[2]][, c("id", p.stat)], by=1, all=T, sort=F) + colnames(X_matrix)[2:3] <- names(ora_res)[1:2] + + if(length(ora_res)>2){ + for(i in 3:length(ora_res)){ + X_matrix <- merge(X_matrix, ora_res[[i]][, c("id", p.stat)], by=1, all = T, sort=F) + colnames(X_matrix)[i+1] <- names(ora_res)[i] + } + } + + rownames(X_matrix) <- X_matrix$id + X_matrix$id <- NULL + #rownames(X_matrix) <- gss$gs_name[match(rownames(X_matrix), gss$gs_id)] + X_matrix <- -log10(X_matrix) + X_matrix[X_matrix < -log10(a)] <- NA + X_matrix <- as.matrix(X_matrix) + + if(nrow(X_matrix)>max_gs){ + cat("Only the top", max_gs, "gene sets will be considered...\n") + X_matrix <- X_matrix[1:max_gs, ] + } + + if(any(is.na(X_matrix))){ + warning("The presence of many NA values may cause errors in hclust(). To avoid this error, (i) increase 'a' or (ii) set cluster_columns = F and/or cluster_rows = F.\n") + } + + Heatmap(X_matrix, col=brewer.purples(5), na_col = na_col, name="-log10(p)", rect_gp = gpar(col = "black", lwd = 1), ...) + +} diff --git a/README.md b/README.md index 8174859..f3eeb36 100644 --- a/README.md +++ b/README.md @@ -7,21 +7,21 @@ Cross-talks between pathways are relevant to dissect regulatory mechanisms, iden Ulisse provide the tools to perform: -- Gene-set enrichment analysis: - - ORA (Over Representation Analysis) - - GSEA (Gene set enrichment analysis) - - Enrichment map to help interpretation of ORA or GSEA results - Cross-talk analysis: - Gene-set cross-talk - Gene-set connected components - Gene-Set Topological-Module cross-talk - Functional relevance analysis to reconstruct gene role in Gene-set cross-talk analysis in terms of number of interactor genes and processes/cell types involved +- Gene-set enrichment analysis: + - ORA (Over Representation Analysis) + - GSEA (Gene set enrichment analysis) + - Enrichment map to help interpretation of ORA or GSEA results Typical application of Ulisse include: -- Pathway enrichment analysis of omics data obtained from bulk or single-cell samples - Pathway cross-talk analysis of omics data obtained from bulk or single-cell samples - Communication analysis between clusters or cell-type in single-cell samples +- Pathway enrichment analysis of omics data obtained from bulk or single-cell samples Source code: https://github.com/emosca-cnr/Ulisse diff --git a/man/calc_gs_perm.Rd b/man/calc_gs_perm.Rd index 4264500..8754496 100644 --- a/man/calc_gs_perm.Rd +++ b/man/calc_gs_perm.Rd @@ -4,11 +4,10 @@ \alias{calc_gs_perm} \title{Internal function of pathtools} \usage{ -calc_gs_perm(rll, perm, gs) +calc_gs_perm(rll = NULL, perm = NULL, gs = NULL) } \arguments{ -\item{rll}{numeric matrix of genes-by-ranking criteria; -each column contains numeric values; rownames are mandatory} +\item{rll}{list of named ranked vectors} \item{perm}{vector of permuted names} diff --git a/man/es.Rd b/man/es.Rd index 0d18a8f..7271c90 100644 --- a/man/es.Rd +++ b/man/es.Rd @@ -4,17 +4,15 @@ \alias{es} \title{Enrichment Score} \usage{ -es(idx, x, le = F) +es(idx = NULL, x = NULL) } \arguments{ \item{idx}{vector of indices a subset of elements of x} \item{x}{named vector, ranked list} - -\item{le}{logical} } \value{ -enrichment score +data.frame with as, enrichmet score; tags, leading edge size; tags_perc, leading edge size percent over gene set; list_top, rank of the ES; list_top_perc, rank of the ES percent over full ranked list; lead_edge, gene names of the leading edge. } \description{ Enrichment Score diff --git a/man/filter_gsl.Rd b/man/filter_gsl.Rd index 3632026..0c29ea2 100644 --- a/man/filter_gsl.Rd +++ b/man/filter_gsl.Rd @@ -4,7 +4,7 @@ \alias{filter_gsl} \title{Filter a gene set list} \usage{ -filter_gsl(gsl, universe, min_size = 5, max_size = 500) +filter_gsl(gsl = NULL, universe = NULL, min_size = 5, max_size = 500) } \arguments{ \item{gsl}{gene set list (named list)} diff --git a/man/get_genes.Rd b/man/get_genes.Rd new file mode 100644 index 0000000..ec54ffb --- /dev/null +++ b/man/get_genes.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_genes.R +\name{get_genes} +\alias{get_genes} +\title{Internal function} +\usage{ +get_genes(gene_set = NULL, wb = NULL) +} +\description{ +Internal function +} diff --git a/man/gsea.Rd b/man/gsea.Rd index 6903074..08c2ef1 100644 --- a/man/gsea.Rd +++ b/man/gsea.Rd @@ -4,7 +4,19 @@ \alias{gsea} \title{Gene Sert Enrichment Analysis} \usage{ -gsea(rl, gsl, k = 100, ord.mode = -1, mc_cores_path = 1, mc_cores_perm = 1) +gsea( + rl = NULL, + gsl = NULL, + k = 99, + min_size = 5, + max_size = 500, + decreasing = NULL, + mc_cores_path = 1, + mc_cores_perm = 1, + description = NULL, + out_file_prefix = "gsea_res", + min.k = 20 +) } \arguments{ \item{rl}{numeric matrix of genes-by-ranking criteria; each column contains numeric values; rownames are mandatory} @@ -13,14 +25,24 @@ gsea(rl, gsl, k = 100, ord.mode = -1, mc_cores_path = 1, mc_cores_perm = 1) \item{k}{integer, number of permutations} -\item{ord.mode}{ordering mode: -1 -> descending; 1 ascending; must be of lenght equal to ncol(rl)} +\item{min_size}{minimum gene set size} + +\item{max_size}{maximum gene set size} + +\item{decreasing}{TRUE/FALSE vector that specifies whether to order each column of rl decreasingly or not; must be of length equal to `ncol(rl)`. If NULL, all columns will be ranked in decreasing order} \item{mc_cores_path}{number of cores to use for parallel calculation of gene set lists; the total number of cpu used will be mc_cores_path x mc_cores_perm} \item{mc_cores_perm}{number of cores to use for parallel calculation of ranked list permutations; the total number of cpu used will be mc_cores_path x mc_cores_perm} + +\item{description}{optional named vector with gene set description; names must be gene seet identifiers} + +\item{out_file_prefix}{prefix for .xlsx and .txt output files} + +\item{min.k}{minimum number of permutations to obtain valid permutation-based statistics} } \value{ -data.frame with es, nes, p-value, adjusted p-value and FDR q-value +data.frame with: es, enrichment score; nes normalized enrichment score; nperm, number of permutations actually used; p-value, empirical p-value; adjusted p-value, BH FDR; q_val: q-value estimnated from p-values using qvalue package; FDR q-value, empirical FDR; tags, leading edge size; tags_perc, leading edge size percent over gene set; list_top, rank of the ES; list_top_perc, rank of the ES percent over full ranked list; lead_edge, gene names of the leading edge } \description{ Gene Sert Enrichment Analysis diff --git a/man/gsea2enrich.Rd b/man/gsea2enrich.Rd new file mode 100644 index 0000000..2748794 --- /dev/null +++ b/man/gsea2enrich.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gsea2enrich.R +\name{gsea2enrich} +\alias{gsea2enrich} +\title{Create a gseaResult instance from Ulisse GSEA result} +\usage{ +gsea2enrich( + gsea_res = NULL, + rl = NULL, + gsl = NULL, + min_size = 5, + max_size = 500, + decreasing = NULL +) +} +\arguments{ +\item{gsea_res}{output of function gsea()} + +\item{rl}{the input used to obtain gsea_res; numeric matrix of genes-by-ranking criteria; each column contains numeric values; rownames are mandatory;} + +\item{gsl}{named list of gene sets used to obtain gsea_res} + +\item{min_size}{minimum gene set size} + +\item{max_size}{maximum gene set size} + +\item{decreasing}{TRUE/FALSE vector that specifies whether to order each column of rl decreasingly or not; must be of length equal to `ncol(rl)`. If NULL, all #' @export} +} +\description{ +Create a gseaResult instance from Ulisse GSEA result +} diff --git a/man/ora.Rd b/man/ora.Rd index 784fab6..9da7105 100644 --- a/man/ora.Rd +++ b/man/ora.Rd @@ -4,16 +4,34 @@ \alias{ora} \title{Over Representation Analysis} \usage{ -ora(wb, bb, gsl, p_adj_method = "fdr") +ora( + wb = NULL, + universe = NULL, + gsl = NULL, + p_adj_method = "fdr", + description = NULL, + wbd_min = 1, + min_size = 5, + max_size = 500, + out_file_prefix = "ora_res" +) } \arguments{ -\item{wb}{hits (white balls)} +\item{wb}{list of character vectors with hits (white balls)} -\item{bb}{other elements (black balls)} +\item{universe}{universe, character vector} \item{gsl}{named list of sets} \item{p_adj_method}{p value adjustment method, see p.adjust.methods} + +\item{wbd_min}{minimum number of white balls drawn} + +\item{min_size}{minimum gene set size} + +\item{max_size}{maximum gene set size} + +\item{out_file_prefix}{prefix for .xlsx and .txt output files} } \description{ Over Representation Analysis diff --git a/man/ora1gs.Rd b/man/ora1gs.Rd index 5b414e9..03f00ba 100644 --- a/man/ora1gs.Rd +++ b/man/ora1gs.Rd @@ -4,7 +4,7 @@ \alias{ora1gs} \title{Hypergeometric test on 1 dataset} \usage{ -ora1gs(wb, bb, bd) +ora1gs(wb = NULL, bb = NULL, bd = NULL) } \arguments{ \item{wb}{white balls} diff --git a/man/ora2enrich.Rd b/man/ora2enrich.Rd index 6ee9c84..0b84b48 100644 --- a/man/ora2enrich.Rd +++ b/man/ora2enrich.Rd @@ -6,21 +6,23 @@ this function translates the result of an ORA into the DOSE class "enrichResult"} \usage{ ora2enrich( - ulisse_res, - pvalueCutoff = 0.25, + ora_res = NULL, + pvalueCutoff = 1, pAdjustMethod = "BH", - qvalueCutoff = 0.25, - gene = NULL, + qvalueCutoff = 1, + wb = NULL, universe = NULL, - geneSets = NULL, + gsl = NULL, organism = "UNKNOWN", keytype = "UNKNOWN", ontology = "UNKNOWN", - readable = FALSE + readable = FALSE, + min_size = 5, + max_size = 500 ) } \arguments{ -\item{ulisse_res}{ora analysis result} +\item{ora_res}{ora analysis result} \item{pvalueCutoff}{cut off over adjusted p value= 0.25} @@ -28,11 +30,11 @@ ora2enrich( \item{qvalueCutoff}{cut off over q values} -\item{gene}{input gene list} +\item{wb}{input gene list} \item{universe}{universe of genes} -\item{geneSets}{list of gene sets} +\item{gsl}{list of gene sets} \item{organism}{organism} diff --git a/man/ora_pipeline.Rd b/man/ora_pipeline.Rd deleted file mode 100644 index f1d581f..0000000 --- a/man/ora_pipeline.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ora_pipeline.R -\name{ora_pipeline} -\alias{ora_pipeline} -\title{ora pipeline} -\usage{ -ora_pipeline( - deg_list = NULL, - universe = NULL, - gs = NULL, - gsid2name = NULL, - mc.cores = 1, - eg2sym = NULL, - min_size = 5, - max_size = 500, - out_dir = "./", - write_tables = FALSE, - id = "ora" -) -} -\arguments{ -\item{deg_list}{gene list of interest} - -\item{universe}{all genes} - -\item{gs}{list of gene lists} - -\item{gsid2name}{data.frame for annotation of gene sets. It must contain the column "gsid" with gene set ids that match those in gs} - -\item{mc.cores}{number of cores} - -\item{eg2sym}{data.frame for translating gene ids into symbol. It must contain the columncs gene_id and symbol} - -\item{min_size}{minimum gene set size} - -\item{max_size}{maximum gene set size} - -\item{out_dir}{output directory} - -\item{write_tables}{whether to write or not the output} - -\item{id}{project name} -} -\description{ -ora pipeline -} diff --git a/man/plot_gsea_heatmap.Rd b/man/plot_gsea_heatmap.Rd new file mode 100644 index 0000000..935f48b --- /dev/null +++ b/man/plot_gsea_heatmap.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_gsea_heatmap.R +\name{plot_gsea_heatmap} +\alias{plot_gsea_heatmap} +\title{Plot heatmap of GSEA results for multiple runs} +\usage{ +plot_gsea_heatmap( + gsea_res = NULL, + nes_sign = TRUE, + a = 0.25, + na_col = "khaki", + max_gs = 50, + ... +) +} +\arguments{ +\item{gsea_res}{result of function gsea()} + +\item{nes_sign}{whether to print nes sign} + +\item{a}{alpha over FDR} + +\item{na_col}{color for FDR > a} + +\item{max_gs}{maximum number of gene sets that will be plotted} + +\item{...}{further arguments to ComplexHeatmap::Heatmap()} +} +\description{ +Plot heatmap of GSEA results for multiple runs +} diff --git a/man/plot_ora_comparison.Rd b/man/plot_ora_comparison.Rd deleted file mode 100644 index 5d76c2f..0000000 --- a/man/plot_ora_comparison.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot_ora_comparison.R -\name{plot_ora_comparison} -\alias{plot_ora_comparison} -\title{Plot a comparion between two ORAs} -\usage{ -plot_ora_comparison( - ora_res_list = NULL, - p_sig = 0.001, - p_max = 0.1, - dir_out = "./", - mar = c(2.5, 20, 1, 1), - mgp = c(1.2, 0.3, 0), - cex.axis = 0.6, - use_nominal_p = TRUE, - col_pal = NULL -) -} -\arguments{ -\item{ora_res_list}{list of results of ora_pipeline} - -\item{p_sig}{thresold for significant pathways} - -\item{p_max}{maximum p-value allowed for marginally significant pathways} - -\item{dir_out}{output directory} - -\item{mar}{mar parameter} - -\item{mgp}{mgp parameter} - -\item{cex.axis}{cex.axis} - -\item{use_nominal_p}{whether to use or not nominal p-values} - -\item{col_pal}{color palette function} -} -\value{ -data.frame with merged results only for pathways that satisfy p_sig in at least one condition -} -\description{ -Plot a comparion between two ORAs -} diff --git a/man/plot_ora_heatmap.Rd b/man/plot_ora_heatmap.Rd new file mode 100644 index 0000000..a52854f --- /dev/null +++ b/man/plot_ora_heatmap.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_ora_heatmap.R +\name{plot_ora_heatmap} +\alias{plot_ora_heatmap} +\title{Plot heatmap of ORA results for multiple runs} +\usage{ +plot_ora_heatmap( + ora_res = NULL, + p.stat = "p_adj", + a = 0.25, + na_col = "khaki", + max_gs = 50, + ... +) +} +\arguments{ +\item{ora_res}{result of function gsea()} + +\item{a}{alpha over FDR} + +\item{na_col}{color for FDR > a} + +\item{max_gs}{maximum number of gene sets that will be plotted} + +\item{...}{further arguments to ComplexHeatmap::Heatmap()} + +\item{nes_sign}{whether to print nes sign} +} +\description{ +Plot heatmap of ORA results for multiple runs +}