From 94613427b320d327b2d007f2456f717173a9ecd4 Mon Sep 17 00:00:00 2001
From: Stefan Moog <moogs@gmx.de>
Date: Sun, 19 May 2024 12:40:25 +0200
Subject: [PATCH] feat: add functionality to get information on formatting
 properties of runs

* Add functionality to get information on formatting properties of runs.
* Make bold and italic booleans. Account for 0/1 and off/on.
---
 NEWS.md                           |  6 +++
 R/fortify_docx.R                  | 76 ++++++++++++++++++++++++++++---
 man/docx_summary.Rd               |  8 +++-
 tests/testthat/test-doc-summary.R | 69 ++++++++++++++++++++++++++++
 4 files changed, 152 insertions(+), 7 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 6254ef95..1968dcaa 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,7 +5,13 @@
 - Fix. `docx_summary` preserves non-breaking hyphens. Non-breaking hyphens are 
 replaced with a hyphen-minus (Unicode character 002D). Closes #573.
 
+## Features
 
+- `docx_summary()` gains parameter 'detailed' which allows to get a detailed 
+summary including formatting properties of runs in a paragraph. Formatting 
+properties are stored in a list column `run`, where each element 
+is a dataframe with rows corresponding to a single 
+run and columns containing the information on formatting properties.
 
 # officer 0.6.6
 
diff --git a/R/fortify_docx.R b/R/fortify_docx.R
index f762af5f..330c11aa 100644
--- a/R/fortify_docx.R
+++ b/R/fortify_docx.R
@@ -109,7 +109,7 @@ docxtable_as_tibble <- function(node, styles, preserve = FALSE) {
 }
 
 #' @importFrom xml2 xml_has_attr
-par_as_tibble <- function(node, styles) {
+par_as_tibble <- function(node, styles, detailed = FALSE) {
   style_node <- xml_child(node, "w:pPr/w:pStyle")
   if (inherits(style_node, "xml_missing")) {
     style_name <- NA
@@ -129,14 +129,71 @@ par_as_tibble <- function(node, styles) {
     stringsAsFactors = FALSE
   )
 
+  if (detailed) {
+    nodes_run <- xml_find_all(node, "w:r")
+    run_data <- lapply(nodes_run, run_as_tibble)
+
+    run_data <- mapply(function(x, id) {
+      x$id <- id
+      x
+    }, run_data, seq_along(run_data), SIMPLIFY = FALSE)
+    run_data <- rbind_match_columns(run_data)
+
+    par_data$run <- I(list(run_data))
+  }
+
   par_data$content_type <- rep("paragraph", nrow(par_data))
   par_data
 }
+#' @importFrom xml2 xml_has_attr
+val_child <- function(node, child_path, attr = "val", default = NULL) {
+  child_node <- xml_child(node, child_path)
+  if (inherits(child_node, "xml_missing")) return(NA_character_)
+  if (!xml_has_attr(child_node, attr)) default
+  else xml_attr(child_node, attr)
+}
+
+val_child_lgl <- function(node, child_path, attr = "val", default = NULL) {
+  val <- val_child(node = node, child_path = child_path, attr = attr, default = default)
+  if (is.na(val)) return(NA)
+  else (val %in% c("1", "on", "true"))
+}
+
+val_child_int <- function(node, child_path, attr = "val", default = NULL) {
+  as.integer(
+    val_child(node = node, child_path = child_path, attr = attr, default = default)
+  )
+}
+
+run_as_tibble <- function(node, styles) {
+  style_node <- xml_child(node, "w:rPr/w:rStyle")
+  if (inherits(style_node, "xml_missing")) {
+    style_name <- NA
+  } else {
+    style_id <- xml_attr(style_node, "val")
+    style_name <- styles$style_name[styles$style_id %in% style_id]
+  }
+  run_data <- data.frame(
+    text = xml_text(node),
+    bold = val_child_lgl(node, "w:rPr/w:b", default = "true"),
+    italic = val_child_lgl(node, "w:rPr/w:i", default = "true"),
+    underline = val_child(node, "w:rPr/w:u"),
+    sz = val_child_int(node, "w:rPr/w:sz"),
+    szCs = val_child_int(node, "w:rPr/w:szCs"),
+    color = val_child(node, "w:rPr/w:color"),
+    shading = val_child(node, "w:rPr/w:shd"),
+    shading_color = val_child(node, "w:rPr/w:shd", attr = "color"),
+    shading_fill = val_child(node, "w:rPr/w:shd", attr = "fill"),
+    stringsAsFactors = FALSE
+  )
+
+  run_data
+}
 
-node_content <- function(node, x, preserve = FALSE) {
+node_content <- function(node, x, preserve = FALSE, detailed = FALSE) {
   node_name <- xml_name(node)
   switch(node_name,
-    p = par_as_tibble(node, styles_info(x)),
+    p = par_as_tibble(node, styles_info(x), detailed = detailed),
     tbl = docxtable_as_tibble(node, styles_info(x), preserve = preserve),
     NULL
   )
@@ -158,6 +215,12 @@ node_content <- function(node, x, preserve = FALSE) {
 #'   the `{docxtractr}` package by Bob Rudis.
 #' @param remove_fields if TRUE, prevent field codes from appearing in the
 #' returned data.frame.
+#' @param detailed Should information on runs be included in summary dataframe?
+#'   Defaults to `FALSE`. If `TRUE` a list column `run` is added to the summary
+#'   containing a summary of formatting properties of runs as a dataframe with
+#'   rows corresponding to a single run and columns containing the information
+#'   on formatting properties.
+#'
 #' @examples
 #' example_docx <- system.file(
 #'   package = "officer",
@@ -169,7 +232,7 @@ node_content <- function(node, x, preserve = FALSE) {
 #'
 #' docx_summary(doc, preserve = TRUE)[28, ]
 #' @export
-docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) {
+docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE, detailed = FALSE) {
   if (remove_fields) {
     instrText_nodes <- xml_find_all(x$doc_obj$get(), "//w:instrText")
     xml_remove(instrText_nodes)
@@ -178,18 +241,19 @@ docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) {
   all_nodes <- xml_find_all(x$doc_obj$get(), "/w:document/w:body/*[self::w:p or self::w:tbl]")
 
 
-  data <- lapply(all_nodes, node_content, x = x, preserve = preserve)
+  data <- lapply(all_nodes, node_content, x = x, preserve = preserve, detailed = detailed)
 
   data <- mapply(function(x, id) {
     x$doc_index <- id
     x
   }, data, seq_along(data), SIMPLIFY = FALSE)
+
   data <- rbind_match_columns(data)
 
   colnames <- c(
     "doc_index", "content_type", "style_name", "text",
     "level", "num_id", "row_id", "is_header", "cell_id",
-    "col_span", "row_span"
+    "col_span", "row_span", "run"
   )
   colnames <- intersect(colnames, names(data))
   data[, colnames]
diff --git a/man/docx_summary.Rd b/man/docx_summary.Rd
index f8410b5a..e388b826 100644
--- a/man/docx_summary.Rd
+++ b/man/docx_summary.Rd
@@ -4,7 +4,7 @@
 \alias{docx_summary}
 \title{Get Word content in a data.frame}
 \usage{
-docx_summary(x, preserve = FALSE, remove_fields = FALSE)
+docx_summary(x, preserve = FALSE, remove_fields = FALSE, detailed = FALSE)
 }
 \arguments{
 \item{x}{an rdocx object}
@@ -17,6 +17,12 @@ the \code{{docxtractr}} package by Bob Rudis.}
 
 \item{remove_fields}{if TRUE, prevent field codes from appearing in the
 returned data.frame.}
+
+\item{detailed}{Should information on runs be included in summary dataframe?
+Defaults to \code{FALSE}. If \code{TRUE} a list column \code{run} is added to the summary
+containing a summary of formatting properties of runs as a dataframe with
+rows corresponding to a single run and columns containing the information
+on formatting properties.}
 }
 \description{
 read content of a Word document and
diff --git a/tests/testthat/test-doc-summary.R b/tests/testthat/test-doc-summary.R
index 371248a4..b59a2a0a 100644
--- a/tests/testthat/test-doc-summary.R
+++ b/tests/testthat/test-doc-summary.R
@@ -64,6 +64,75 @@ test_that("preserves non breaking hyphens", {
   )
 })
 
+test_that("detailed summary", {
+  doc <- read_docx()
+
+  fpar_ <- fpar(
+    ftext("Formatted ", prop = fp_text(bold = TRUE, color = "red")),
+    ftext("paragraph ", prop = fp_text(
+      shading.color = "blue"
+    )),
+    ftext("with multiple runs.",
+          prop = fp_text(italic = TRUE, font.size = 20, font.family = "Arial")
+    )
+  )
+
+  doc <- body_add_fpar(doc, fpar_, style = "Normal")
+
+  fpar_ <- fpar(
+    "Unformatted ",
+    "paragraph ",
+    "with multiple runs."
+  )
+
+  doc <- body_add_fpar(doc, fpar_, style = "Normal")
+
+  doc <- body_add_par(doc, "Single Run", style = "Normal")
+
+  doc <- body_add_fpar(doc,
+                       fpar(
+                         "Single formatetd run ",
+                         fp_t = fp_text(bold = TRUE, color = "red")
+                       )
+  )
+
+  xml_elt <- paste0(
+    officer:::wp_ns_yes,
+    "<w:pPr><w:pStyle w:val=\"Normal\"/></w:pPr>",
+    "<w:r><w:rPr></w:rPr><w:t>NA</w:t></w:r>",
+    "<w:r><w:rPr><w:b/><w:i/></w:rPr><w:t>toggle</w:t></w:r>",
+    "<w:r><w:rPr><w:b w:val=\"0\"/><w:i w:val=\"0\"/></w:rPr><w:t>0</w:t></w:r>",
+    "<w:r><w:rPr><w:b w:val=\"1\"/><w:i w:val=\"1\"/></w:rPr><w:t>1</w:t></w:r>",
+    "<w:r><w:rPr><w:b w:val=\"false\"/><w:i w:val=\"false\"/></w:rPr><w:t>false</w:t></w:r>",
+    "<w:r><w:rPr><w:b w:val=\"true\"/><w:i w:val=\"true\"/></w:rPr><w:t>true</w:t></w:r>",
+    "<w:r><w:rPr><w:b w:val=\"off\"/><w:i w:val=\"off\"/></w:rPr><w:t>off</w:t></w:r>",
+    "<w:r><w:rPr><w:b w:val=\"on\"/><w:i w:val=\"on\"/></w:rPr><w:t>on</w:t></w:r>",
+    "</w:p>"
+  )
+
+  doc <- officer:::body_add_xml(
+    x = doc, str = xml_elt
+  )
+
+  doc_sum <- docx_summary(doc, detailed = TRUE)
+
+  expect_true("run" %in% names(doc_sum))
+  expect_type(doc_sum$run, "list")
+  expect_equal(lengths(doc_sum$run), rep(11, 5))
+  expect_equal(sapply(doc_sum$run, nrow), c(3, 3, 1, 1, 8))
+
+  expect_true(all(sapply(doc_sum$run$bold, is.logical)))
+  expect_true(all(sapply(doc_sum$run$italic, is.logical)))
+  expect_true(all(sapply(doc_sum$run$sz, is.integer)))
+  expect_true(all(sapply(doc_sum$run$szCs, is.integer)))
+  expect_true(all(sapply(doc_sum$run$underline, is_character)))
+  expect_true(all(sapply(doc_sum$run$color, is_character)))
+  expect_true(all(sapply(doc_sum$run$shading, is_character)))
+  expect_true(all(sapply(doc_sum$run$shading_color, is_character)))
+  expect_true(all(sapply(doc_sum$run$shading_fill, is_character)))
+})
+
+
 
 
 test_that("pptx summary", {