From 94613427b320d327b2d007f2456f717173a9ecd4 Mon Sep 17 00:00:00 2001 From: Stefan Moog Date: Sun, 19 May 2024 12:40:25 +0200 Subject: [PATCH] feat: add functionality to get information on formatting properties of runs * Add functionality to get information on formatting properties of runs. * Make bold and italic booleans. Account for 0/1 and off/on. --- NEWS.md | 6 +++ R/fortify_docx.R | 76 ++++++++++++++++++++++++++++--- man/docx_summary.Rd | 8 +++- tests/testthat/test-doc-summary.R | 69 ++++++++++++++++++++++++++++ 4 files changed, 152 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6254ef95..1968dcaa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,13 @@ - Fix. `docx_summary` preserves non-breaking hyphens. Non-breaking hyphens are replaced with a hyphen-minus (Unicode character 002D). Closes #573. +## Features +- `docx_summary()` gains parameter 'detailed' which allows to get a detailed +summary including formatting properties of runs in a paragraph. Formatting +properties are stored in a list column `run`, where each element +is a dataframe with rows corresponding to a single +run and columns containing the information on formatting properties. # officer 0.6.6 diff --git a/R/fortify_docx.R b/R/fortify_docx.R index f762af5f..330c11aa 100644 --- a/R/fortify_docx.R +++ b/R/fortify_docx.R @@ -109,7 +109,7 @@ docxtable_as_tibble <- function(node, styles, preserve = FALSE) { } #' @importFrom xml2 xml_has_attr -par_as_tibble <- function(node, styles) { +par_as_tibble <- function(node, styles, detailed = FALSE) { style_node <- xml_child(node, "w:pPr/w:pStyle") if (inherits(style_node, "xml_missing")) { style_name <- NA @@ -129,14 +129,71 @@ par_as_tibble <- function(node, styles) { stringsAsFactors = FALSE ) + if (detailed) { + nodes_run <- xml_find_all(node, "w:r") + run_data <- lapply(nodes_run, run_as_tibble) + + run_data <- mapply(function(x, id) { + x$id <- id + x + }, run_data, seq_along(run_data), SIMPLIFY = FALSE) + run_data <- rbind_match_columns(run_data) + + par_data$run <- I(list(run_data)) + } + par_data$content_type <- rep("paragraph", nrow(par_data)) par_data } +#' @importFrom xml2 xml_has_attr +val_child <- function(node, child_path, attr = "val", default = NULL) { + child_node <- xml_child(node, child_path) + if (inherits(child_node, "xml_missing")) return(NA_character_) + if (!xml_has_attr(child_node, attr)) default + else xml_attr(child_node, attr) +} + +val_child_lgl <- function(node, child_path, attr = "val", default = NULL) { + val <- val_child(node = node, child_path = child_path, attr = attr, default = default) + if (is.na(val)) return(NA) + else (val %in% c("1", "on", "true")) +} + +val_child_int <- function(node, child_path, attr = "val", default = NULL) { + as.integer( + val_child(node = node, child_path = child_path, attr = attr, default = default) + ) +} + +run_as_tibble <- function(node, styles) { + style_node <- xml_child(node, "w:rPr/w:rStyle") + if (inherits(style_node, "xml_missing")) { + style_name <- NA + } else { + style_id <- xml_attr(style_node, "val") + style_name <- styles$style_name[styles$style_id %in% style_id] + } + run_data <- data.frame( + text = xml_text(node), + bold = val_child_lgl(node, "w:rPr/w:b", default = "true"), + italic = val_child_lgl(node, "w:rPr/w:i", default = "true"), + underline = val_child(node, "w:rPr/w:u"), + sz = val_child_int(node, "w:rPr/w:sz"), + szCs = val_child_int(node, "w:rPr/w:szCs"), + color = val_child(node, "w:rPr/w:color"), + shading = val_child(node, "w:rPr/w:shd"), + shading_color = val_child(node, "w:rPr/w:shd", attr = "color"), + shading_fill = val_child(node, "w:rPr/w:shd", attr = "fill"), + stringsAsFactors = FALSE + ) + + run_data +} -node_content <- function(node, x, preserve = FALSE) { +node_content <- function(node, x, preserve = FALSE, detailed = FALSE) { node_name <- xml_name(node) switch(node_name, - p = par_as_tibble(node, styles_info(x)), + p = par_as_tibble(node, styles_info(x), detailed = detailed), tbl = docxtable_as_tibble(node, styles_info(x), preserve = preserve), NULL ) @@ -158,6 +215,12 @@ node_content <- function(node, x, preserve = FALSE) { #' the `{docxtractr}` package by Bob Rudis. #' @param remove_fields if TRUE, prevent field codes from appearing in the #' returned data.frame. +#' @param detailed Should information on runs be included in summary dataframe? +#' Defaults to `FALSE`. If `TRUE` a list column `run` is added to the summary +#' containing a summary of formatting properties of runs as a dataframe with +#' rows corresponding to a single run and columns containing the information +#' on formatting properties. +#' #' @examples #' example_docx <- system.file( #' package = "officer", @@ -169,7 +232,7 @@ node_content <- function(node, x, preserve = FALSE) { #' #' docx_summary(doc, preserve = TRUE)[28, ] #' @export -docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) { +docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE, detailed = FALSE) { if (remove_fields) { instrText_nodes <- xml_find_all(x$doc_obj$get(), "//w:instrText") xml_remove(instrText_nodes) @@ -178,18 +241,19 @@ docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) { all_nodes <- xml_find_all(x$doc_obj$get(), "/w:document/w:body/*[self::w:p or self::w:tbl]") - data <- lapply(all_nodes, node_content, x = x, preserve = preserve) + data <- lapply(all_nodes, node_content, x = x, preserve = preserve, detailed = detailed) data <- mapply(function(x, id) { x$doc_index <- id x }, data, seq_along(data), SIMPLIFY = FALSE) + data <- rbind_match_columns(data) colnames <- c( "doc_index", "content_type", "style_name", "text", "level", "num_id", "row_id", "is_header", "cell_id", - "col_span", "row_span" + "col_span", "row_span", "run" ) colnames <- intersect(colnames, names(data)) data[, colnames] diff --git a/man/docx_summary.Rd b/man/docx_summary.Rd index f8410b5a..e388b826 100644 --- a/man/docx_summary.Rd +++ b/man/docx_summary.Rd @@ -4,7 +4,7 @@ \alias{docx_summary} \title{Get Word content in a data.frame} \usage{ -docx_summary(x, preserve = FALSE, remove_fields = FALSE) +docx_summary(x, preserve = FALSE, remove_fields = FALSE, detailed = FALSE) } \arguments{ \item{x}{an rdocx object} @@ -17,6 +17,12 @@ the \code{{docxtractr}} package by Bob Rudis.} \item{remove_fields}{if TRUE, prevent field codes from appearing in the returned data.frame.} + +\item{detailed}{Should information on runs be included in summary dataframe? +Defaults to \code{FALSE}. If \code{TRUE} a list column \code{run} is added to the summary +containing a summary of formatting properties of runs as a dataframe with +rows corresponding to a single run and columns containing the information +on formatting properties.} } \description{ read content of a Word document and diff --git a/tests/testthat/test-doc-summary.R b/tests/testthat/test-doc-summary.R index 371248a4..b59a2a0a 100644 --- a/tests/testthat/test-doc-summary.R +++ b/tests/testthat/test-doc-summary.R @@ -64,6 +64,75 @@ test_that("preserves non breaking hyphens", { ) }) +test_that("detailed summary", { + doc <- read_docx() + + fpar_ <- fpar( + ftext("Formatted ", prop = fp_text(bold = TRUE, color = "red")), + ftext("paragraph ", prop = fp_text( + shading.color = "blue" + )), + ftext("with multiple runs.", + prop = fp_text(italic = TRUE, font.size = 20, font.family = "Arial") + ) + ) + + doc <- body_add_fpar(doc, fpar_, style = "Normal") + + fpar_ <- fpar( + "Unformatted ", + "paragraph ", + "with multiple runs." + ) + + doc <- body_add_fpar(doc, fpar_, style = "Normal") + + doc <- body_add_par(doc, "Single Run", style = "Normal") + + doc <- body_add_fpar(doc, + fpar( + "Single formatetd run ", + fp_t = fp_text(bold = TRUE, color = "red") + ) + ) + + xml_elt <- paste0( + officer:::wp_ns_yes, + "", + "NA", + "toggle", + "0", + "1", + "false", + "true", + "off", + "on", + "" + ) + + doc <- officer:::body_add_xml( + x = doc, str = xml_elt + ) + + doc_sum <- docx_summary(doc, detailed = TRUE) + + expect_true("run" %in% names(doc_sum)) + expect_type(doc_sum$run, "list") + expect_equal(lengths(doc_sum$run), rep(11, 5)) + expect_equal(sapply(doc_sum$run, nrow), c(3, 3, 1, 1, 8)) + + expect_true(all(sapply(doc_sum$run$bold, is.logical))) + expect_true(all(sapply(doc_sum$run$italic, is.logical))) + expect_true(all(sapply(doc_sum$run$sz, is.integer))) + expect_true(all(sapply(doc_sum$run$szCs, is.integer))) + expect_true(all(sapply(doc_sum$run$underline, is_character))) + expect_true(all(sapply(doc_sum$run$color, is_character))) + expect_true(all(sapply(doc_sum$run$shading, is_character))) + expect_true(all(sapply(doc_sum$run$shading_color, is_character))) + expect_true(all(sapply(doc_sum$run$shading_fill, is_character))) +}) + + test_that("pptx summary", {