diff --git a/DESCRIPTION b/DESCRIPTION index 6acbbd7..9207f6c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,5 +44,5 @@ Suggests: vdiffr, quanteda.textmodels Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.3 LinkingTo: Rcpp diff --git a/R/plot.R b/R/plot.R index 826d9f3..1aa1701 100644 --- a/R/plot.R +++ b/R/plot.R @@ -1,13 +1,12 @@ ## Generate a "terms bar plot", based on terms keyness for a group keyness_barplot <- function(tab, range = NULL, title = "", title_color = "firebrick3", - stat_col = "chi2", n_terms = NULL, text_size = 10, top_margin = 0, - keyness_plot_xlab = NULL) { - + stat_col = "chi2", n_terms = NULL, text_size = 10, top_margin = 0, + keyness_plot_xlab = NULL) { if (is.null(keyness_plot_xlab)) { keyness_plot_xlab <- stat_col } - + ## Column with statistic values if (!is.null(range)) { stat_max <- max(range) @@ -34,7 +33,7 @@ keyness_barplot <- function(tab, range = NULL, title = "", title_color = "firebr panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.background = element_rect(fill = grDevices::rgb(.9, .9, .9, .2), - colour = "transparent")) + colour = "transparent")) ## Fix x limits if necessary and remove horizontal axis values if (!is.null(range)) { g <- g + ggplot2::scale_y_continuous(keyness_plot_xlab, limits = range, breaks = NULL) @@ -49,7 +48,7 @@ keyness_barplot <- function(tab, range = NULL, title = "", title_color = "firebr } else { g <- g + ggplot2::scale_x_discrete(breaks = NULL) } - + ## Align title element to the left to center it with hjust g <- ggplot2::ggplotGrob(g) g$layout$l[g$layout$name == "title"] <- 1 @@ -62,8 +61,8 @@ keyness_barplot <- function(tab, range = NULL, title = "", title_color = "firebr #' @import ggwordcloud keyness_worcloud <- function(tab, range = NULL, title = "", title_color = "firebrick3", - stat_col = "chi2", max_size = 15, top_margin = 0) { - + stat_col = "chi2", max_size = 15, top_margin = 0) { + ## Plot g <- ggplot(data = tab) + #geom_hline(yintercept = 0, color = "grey70") + @@ -79,14 +78,14 @@ keyness_worcloud <- function(tab, range = NULL, title = "", title_color = "fireb panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.background = element_rect(fill = grDevices::rgb(.9, .9, .9, .3), - colour = "transparent")) + colour = "transparent")) ## Fix x limits if necessary and remove horizontal axis values if (!is.null(range)) { g <- g + ggplot2::scale_size_area(limits = range, max_size = max_size) } else { g <- g + ggplot2::scale_size_area(max_size = max_size) } - + ## Align title element to the left to center it with hjust g <- ggplot2::ggplotGrob(g) g$layout$l[g$layout$name == "title"] <- 1 @@ -107,7 +106,7 @@ groups_colors <- function(k, i = NULL) { "#EDC948", "#B07AA1", "#FF9DA7", "#9C755F", "#BAB0AC" ) col <- rep_len(col, k) - + if (!is.null(i)) { return(col[i]) } else { @@ -120,14 +119,15 @@ groups_colors <- function(k, i = NULL) { ## Generate a list of terms plots from a list of keyness statistic tables keyness_plots <- function(tabs, groups, type = "bar", - free_scales = FALSE, stat_col = "chi2", n_terms, text_size, top_margin = 0, - cluster_label = NULL, keyness_plot_xlab = NULL) { - + free_scales = FALSE, stat_col = "chi2", n_terms, text_size, top_margin = 0, + cluster_label = NULL, keyness_plot_xlab = NULL, colors=NULL) { + ## Frequency and proportion of each cluster clust_n <- table(groups) clust_prop <- round(clust_n / sum(clust_n) * 100, 1) k <- length(tabs) - + if(is.null(colors[1])){colors=purrr::map_chr(.x=1:k,~groups_colors(k,.x))} + ## Min and max statistics to fix x axis in terms plots if (stat_col == "docprop") { range <- c(0, 1) @@ -135,45 +135,52 @@ keyness_plots <- function(tabs, groups, type = "bar", range <- NULL if (!free_scales) { max_stat <- max(purrr::map_dbl(tabs, function(tab) { - v <- tab[[stat_col]] - if (length(v) == 0) return(0) - max(v) + v <- tab[[stat_col]] + if (length(v) == 0) return(0) + max(v) })) range <- c(0, max_stat) } } - + purrr::map(1:k, function(i) { + # in case label for class is provided use it and do not show number of class + cluster_num=paste0(" ",i) + if(length(cluster_label)>1){ + cluster_label=cluster_label[i] + cluster_num="" + } if (k <= 6) { label <- ifelse(is.null(cluster_label), "Cluster", cluster_label) - title <- paste0(label, " ", i, "\nn = ", clust_n[i], " (", clust_prop[i], "%)") + title <- paste0(label, cluster_num, "\nn = ", clust_n[i], " (", clust_prop[i], "%)") } else if (k <= 8) { label <- ifelse(is.null(cluster_label), "Cluster", cluster_label) - title <- paste0(label, " ", i, "\nn = ", clust_n[i]) + title <- paste0(label, cluster_num, "\nn = ", clust_n[i]) } else { label <- ifelse(is.null(cluster_label), "Cl.", cluster_label) - title <- paste0(label, " ", i, "\nn = ", clust_n[i]) + title <- paste0(label, cluster_num, "\nn = ", clust_n[i]) } if (type == "bar") { if (is.null(text_size)) text_size <- 10 - keyness_barplot(tabs[[i]], range, title = title, title_color = groups_colors(k, i), - stat_col = stat_col, n_terms, text_size = text_size, top_margin, - keyness_plot_xlab = keyness_plot_xlab) + keyness_barplot(tabs[[i]], range, title = title, title_color = colors[i], + stat_col = stat_col, n_terms, text_size = text_size, top_margin, + keyness_plot_xlab = keyness_plot_xlab) } else { if (is.null(text_size)) text_size <- 15 - keyness_worcloud(tabs[[i]], range, title = title, title_color = groups_colors(k, i), - stat_col = stat_col, max_size = text_size, top_margin) + keyness_worcloud(tabs[[i]], range, title = title, title_color = colors[i], + stat_col = stat_col, max_size = text_size, top_margin) } }) } # Generate the dendrogram plot for rainette_plot() -dendrogram_plot <- function(res, k, groups, text_size, show_na_title) { - +dendrogram_plot <- function(res, k, groups, text_size, show_na_title,colors=NULL) { + dend <- stats::as.dendrogram(res) max_k <- max(res$group, na.rm = TRUE) - + + if(is.null(colors[1])){colors=purrr::map_chr(.x=1:k,~groups_colors(k,.x))} ## Cut the dendrogram if necessary if (k < max_k) { dend <- cut(dend, res$height[max_k - k])$upper @@ -181,13 +188,13 @@ dendrogram_plot <- function(res, k, groups, text_size, show_na_title) { dend <- stats::as.dendrogram(stats::as.hclust(dend)) dend <- dendextend::set(dend, "labels", 1:k) } - + ## Style labels and branches - dendextend::labels_colors(dend) <- groups_colors(k) + dendextend::labels_colors(dend) <- colors dend <- dend %>% - dendextend::color_branches(k = k, col = groups_colors(k)) %>% + dendextend::color_branches(k = k, col = colors) %>% dendextend::set("branches_lwd", 0.4) - + ## Generate plot dend <- dendextend::as.ggdend(dend) margin <- ifelse(k >= 7, 0, 0.175 - k * 0.025) @@ -196,16 +203,16 @@ dendrogram_plot <- function(res, k, groups, text_size, show_na_title) { scale_y_continuous(breaks = NULL) + theme(plot.margin = grid::unit(c(0.05, margin, 0, margin), "npc"), plot.title = element_text(hjust = 0.5, size = title_size)) - + ## Add NA number and proportion as title if (show_na_title) { ## Compute number of NA na_n <- sum(is.na(groups)) na_prop <- round(na_n / length(groups) * 100, 1) g <- g + - ggtitle(paste0("NA : ", na_n, " (", na_prop, "%)")) + ggtitle(paste0("NA : ", na_n, " (", na_prop, "%)")) } - + g } @@ -225,6 +232,7 @@ dendrogram_plot <- function(res, k, groups, text_size, show_na_title) { #' @param cluster_label define a specific term for clusters identification in keyness plots. #' Default is "Cluster" or "Cl." depending on the number of groups. #' @param keyness_plot_xlab define a specific x label for keyness plots. +#' @param colors vector of custom colors for cluster titles and branches (in the order of the clusters) #' #' @seealso [quanteda.textstats::textstat_keyness()], [rainette_explor()], [rainette_stats()] #' @@ -244,6 +252,7 @@ dendrogram_plot <- function(res, k, groups, text_size, show_na_title) { #' dtm <- dfm_trim(dtm, min_docfreq = 3) #' res <- rainette(dtm, k = 3, min_segment_size = 15) #' rainette_plot(res, dtm) +#' rainette_plot(res,dtm,cluster_label=c("Assets","Future","Values"), colors=c("red","slateblue","forestgreen")) #' } #' #' @import ggplot2 @@ -257,11 +266,14 @@ rainette_plot <- function(res, dtm, k = NULL, text_size = NULL, show_na_title = TRUE, cluster_label = NULL, - keyness_plot_xlab = NULL) { - + keyness_plot_xlab = NULL, + colors=NULL) { + + + if (!inherits(res, "rainette")) stop("res must be a rainette result object") - + type <- match.arg(type) measure <- match.arg(measure) stat_col <- stat_col(measure) @@ -269,10 +281,10 @@ rainette_plot <- function(res, dtm, k = NULL, warning("wordcloud plots will soon be deprecated. Please use type = \"bar\" instead.") show_negative <- FALSE } - + ## Maximum number of clusters max_k <- max(res$group, na.rm = TRUE) - + ## Get groups if (is.null(k)) { groups <- res$group @@ -281,51 +293,55 @@ rainette_plot <- function(res, dtm, k = NULL, if (k < 2 || k > max_k) stop("k must be between 2 and ", max_k) groups <- rainette::cutree_rainette(res, k) } - + + if(is.null(colors[1])){colors=purrr::map_chr(.x=1:k,~groups_colors(k,.x))} ## Keyness statistics tabs <- rainette::rainette_stats(groups, dtm, measure, n_terms, show_negative) - + ## Graph layout lay <- matrix( c(rep(1, k), - rep(2:(k + 1), 2)), + rep(2:(k + 1), 2)), nrow = 3, ncol = k, byrow = TRUE ) plots <- list() - + ## Dendrogram - plots[[1]] <- dendrogram_plot(res, k, groups, text_size, show_na_title) - + plots[[1]] <- rainette:::dendrogram_plot(res, k, groups, text_size, show_na_title, colors=colors) + ## Add terms plots plots <- c( plots, - keyness_plots( + rainette:::keyness_plots( tabs, groups, type, free_scales, stat_col, n_terms, text_size, cluster_label = cluster_label, - keyness_plot_xlab = keyness_plot_xlab + keyness_plot_xlab = keyness_plot_xlab, + colors=colors ) ) - + ## Generate grid gridExtra::grid.arrange(grobs = plots, layout_matrix = lay) - + } ## Generate barplot of cluster sizes -frequency_barplot <- function(groups, k, text_size) { - +frequency_barplot <- function(groups, k, text_size, colors=NULL) { + + if(is.null(colors[1])){colors=purrr::map_chr(.x=1:k,~groups_colors(k,.x))} + # Compute cluster sizes and number of NA freq <- data.frame(table(groups, exclude = NULL)) n_na <- sum(is.na(groups)) title <- paste0("Clusters size\n(NA = ", n_na, ")") colnames(freq) <- c("Group", "n") - + # Generate barplot g <- ggplot(freq) + geom_col(aes(x = .data$Group, y = .data$n, fill = .data$Group)) + - scale_fill_manual(values = c(groups_colors(k)), na.value = "grey20") + + scale_fill_manual(values = colors, na.value = "grey20") + guides(fill = "none") + ggtitle(title) + theme( @@ -362,48 +378,48 @@ frequency_barplot <- function(groups, k, text_size) { #' @import ggplot2 rainette2_plot <- function(res, dtm, k = NULL, criterion = c("chi2", "n"), - complete_groups = FALSE, - type = c("bar", "cloud"), n_terms = 15, - free_scales = FALSE, measure = c("chi2", "lr", "frequency", "docprop"), - show_negative = FALSE, - text_size = 10) { - + complete_groups = FALSE, + type = c("bar", "cloud"), n_terms = 15, + free_scales = FALSE, measure = c("chi2", "lr", "frequency", "docprop"), + show_negative = FALSE, + text_size = 10) { + if (!inherits(res, "rainette2")) stop("res must be a rainette2 result object") - + type <- match.arg(type) measure <- match.arg(measure) criterion <- match.arg(criterion) stat_col <- stat_col(measure) - + if (type == "cloud") { warning("wordcloud plots will soon be deprecated. Please use type = \"bar\" instead.") show_negative <- FALSE } - + ## Stop if not full and criterion == "n" if ((is.null(attr(res, "full")) || !attr(res, "full")) && criterion != "chi2") { stop("if rainette2 has been computed with full=FALSE, only 'chi2' criterion is available") } - + ## Maximum number of clusters max_k <- max(res$k, na.rm = TRUE) - + ## Get groups if (is.null(k) || k < 2 || k > max_k) stop("k must be between 2 and ", max_k) groups <- rainette::cutree_rainette2(res, k, criterion) if (complete_groups) { groups <- rainette::rainette2_complete_groups(dtm, groups) } - + ## Keyness statistics tabs <- rainette::rainette_stats(groups, dtm, measure, n_terms, show_negative) - + plots <- list() - + ## Barplot of clusters size plots[[1]] <- frequency_barplot(groups, k, text_size) - + ## Add terms plots plots <- c( plots, @@ -412,7 +428,7 @@ rainette2_plot <- function(res, dtm, k = NULL, criterion = c("chi2", "n"), stat_col, n_terms, text_size, top_margin = 0.05 ) ) - + ## Graph layout nrow <- ifelse(k > 5, 2, 1) index <- 1:(k + 1) @@ -420,8 +436,8 @@ rainette2_plot <- function(res, dtm, k = NULL, criterion = c("chi2", "n"), index <- c(index, NA) } lay <- matrix(index, nrow = nrow, byrow = TRUE) - + ## Generate grid gridExtra::grid.arrange(grobs = plots, layout_matrix = lay) - + } diff --git a/man/rainette_plot.Rd b/man/rainette_plot.Rd index 0f73e43..8ccdc77 100644 --- a/man/rainette_plot.Rd +++ b/man/rainette_plot.Rd @@ -16,7 +16,8 @@ rainette_plot( text_size = NULL, show_na_title = TRUE, cluster_label = NULL, - keyness_plot_xlab = NULL + keyness_plot_xlab = NULL, + colors = NULL ) } \arguments{ @@ -44,6 +45,8 @@ rainette_plot( Default is "Cluster" or "Cl." depending on the number of groups.} \item{keyness_plot_xlab}{define a specific x label for keyness plots.} + +\item{colors}{vector of custom colors for cluster titles and branches (in the order of the clusters)} } \value{ A gtable object.