Skip to content

Commit

Permalink
feat: add functionality to get information on formatting properties o…
Browse files Browse the repository at this point in the history
…f runs

* Add functionality to get information on formatting properties of runs.
* Make bold and italic booleans. Account for 0/1 and off/on.
  • Loading branch information
trekonom authored May 19, 2024
1 parent f326c31 commit 9461342
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 7 deletions.
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@
- Fix. `docx_summary` preserves non-breaking hyphens. Non-breaking hyphens are
replaced with a hyphen-minus (Unicode character 002D). Closes #573.

## Features

- `docx_summary()` gains parameter 'detailed' which allows to get a detailed
summary including formatting properties of runs in a paragraph. Formatting
properties are stored in a list column `run`, where each element
is a dataframe with rows corresponding to a single
run and columns containing the information on formatting properties.

# officer 0.6.6

Expand Down
76 changes: 70 additions & 6 deletions R/fortify_docx.R
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ docxtable_as_tibble <- function(node, styles, preserve = FALSE) {
}

#' @importFrom xml2 xml_has_attr
par_as_tibble <- function(node, styles) {
par_as_tibble <- function(node, styles, detailed = FALSE) {
style_node <- xml_child(node, "w:pPr/w:pStyle")
if (inherits(style_node, "xml_missing")) {
style_name <- NA
Expand All @@ -129,14 +129,71 @@ par_as_tibble <- function(node, styles) {
stringsAsFactors = FALSE
)

if (detailed) {
nodes_run <- xml_find_all(node, "w:r")
run_data <- lapply(nodes_run, run_as_tibble)

run_data <- mapply(function(x, id) {
x$id <- id
x
}, run_data, seq_along(run_data), SIMPLIFY = FALSE)
run_data <- rbind_match_columns(run_data)

par_data$run <- I(list(run_data))
}

par_data$content_type <- rep("paragraph", nrow(par_data))
par_data
}
#' @importFrom xml2 xml_has_attr
val_child <- function(node, child_path, attr = "val", default = NULL) {
child_node <- xml_child(node, child_path)
if (inherits(child_node, "xml_missing")) return(NA_character_)
if (!xml_has_attr(child_node, attr)) default
else xml_attr(child_node, attr)
}

val_child_lgl <- function(node, child_path, attr = "val", default = NULL) {
val <- val_child(node = node, child_path = child_path, attr = attr, default = default)
if (is.na(val)) return(NA)
else (val %in% c("1", "on", "true"))
}

val_child_int <- function(node, child_path, attr = "val", default = NULL) {
as.integer(
val_child(node = node, child_path = child_path, attr = attr, default = default)
)
}

run_as_tibble <- function(node, styles) {
style_node <- xml_child(node, "w:rPr/w:rStyle")
if (inherits(style_node, "xml_missing")) {
style_name <- NA
} else {
style_id <- xml_attr(style_node, "val")
style_name <- styles$style_name[styles$style_id %in% style_id]
}
run_data <- data.frame(
text = xml_text(node),
bold = val_child_lgl(node, "w:rPr/w:b", default = "true"),
italic = val_child_lgl(node, "w:rPr/w:i", default = "true"),
underline = val_child(node, "w:rPr/w:u"),
sz = val_child_int(node, "w:rPr/w:sz"),
szCs = val_child_int(node, "w:rPr/w:szCs"),
color = val_child(node, "w:rPr/w:color"),
shading = val_child(node, "w:rPr/w:shd"),
shading_color = val_child(node, "w:rPr/w:shd", attr = "color"),
shading_fill = val_child(node, "w:rPr/w:shd", attr = "fill"),
stringsAsFactors = FALSE
)

run_data
}

node_content <- function(node, x, preserve = FALSE) {
node_content <- function(node, x, preserve = FALSE, detailed = FALSE) {
node_name <- xml_name(node)
switch(node_name,
p = par_as_tibble(node, styles_info(x)),
p = par_as_tibble(node, styles_info(x), detailed = detailed),
tbl = docxtable_as_tibble(node, styles_info(x), preserve = preserve),
NULL
)
Expand All @@ -158,6 +215,12 @@ node_content <- function(node, x, preserve = FALSE) {
#' the `{docxtractr}` package by Bob Rudis.
#' @param remove_fields if TRUE, prevent field codes from appearing in the
#' returned data.frame.
#' @param detailed Should information on runs be included in summary dataframe?
#' Defaults to `FALSE`. If `TRUE` a list column `run` is added to the summary
#' containing a summary of formatting properties of runs as a dataframe with
#' rows corresponding to a single run and columns containing the information
#' on formatting properties.
#'
#' @examples
#' example_docx <- system.file(
#' package = "officer",
Expand All @@ -169,7 +232,7 @@ node_content <- function(node, x, preserve = FALSE) {
#'
#' docx_summary(doc, preserve = TRUE)[28, ]
#' @export
docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) {
docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE, detailed = FALSE) {
if (remove_fields) {
instrText_nodes <- xml_find_all(x$doc_obj$get(), "//w:instrText")
xml_remove(instrText_nodes)
Expand All @@ -178,18 +241,19 @@ docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) {
all_nodes <- xml_find_all(x$doc_obj$get(), "/w:document/w:body/*[self::w:p or self::w:tbl]")


data <- lapply(all_nodes, node_content, x = x, preserve = preserve)
data <- lapply(all_nodes, node_content, x = x, preserve = preserve, detailed = detailed)

data <- mapply(function(x, id) {
x$doc_index <- id
x
}, data, seq_along(data), SIMPLIFY = FALSE)

data <- rbind_match_columns(data)

colnames <- c(
"doc_index", "content_type", "style_name", "text",
"level", "num_id", "row_id", "is_header", "cell_id",
"col_span", "row_span"
"col_span", "row_span", "run"
)
colnames <- intersect(colnames, names(data))
data[, colnames]
Expand Down
8 changes: 7 additions & 1 deletion man/docx_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

69 changes: 69 additions & 0 deletions tests/testthat/test-doc-summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,75 @@ test_that("preserves non breaking hyphens", {
)
})

test_that("detailed summary", {
doc <- read_docx()

fpar_ <- fpar(
ftext("Formatted ", prop = fp_text(bold = TRUE, color = "red")),
ftext("paragraph ", prop = fp_text(
shading.color = "blue"
)),
ftext("with multiple runs.",
prop = fp_text(italic = TRUE, font.size = 20, font.family = "Arial")
)
)

doc <- body_add_fpar(doc, fpar_, style = "Normal")

fpar_ <- fpar(
"Unformatted ",
"paragraph ",
"with multiple runs."
)

doc <- body_add_fpar(doc, fpar_, style = "Normal")

doc <- body_add_par(doc, "Single Run", style = "Normal")

doc <- body_add_fpar(doc,
fpar(
"Single formatetd run ",
fp_t = fp_text(bold = TRUE, color = "red")
)
)

xml_elt <- paste0(
officer:::wp_ns_yes,
"<w:pPr><w:pStyle w:val=\"Normal\"/></w:pPr>",
"<w:r><w:rPr></w:rPr><w:t>NA</w:t></w:r>",
"<w:r><w:rPr><w:b/><w:i/></w:rPr><w:t>toggle</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"0\"/><w:i w:val=\"0\"/></w:rPr><w:t>0</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"1\"/><w:i w:val=\"1\"/></w:rPr><w:t>1</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"false\"/><w:i w:val=\"false\"/></w:rPr><w:t>false</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"true\"/><w:i w:val=\"true\"/></w:rPr><w:t>true</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"off\"/><w:i w:val=\"off\"/></w:rPr><w:t>off</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"on\"/><w:i w:val=\"on\"/></w:rPr><w:t>on</w:t></w:r>",
"</w:p>"
)

doc <- officer:::body_add_xml(
x = doc, str = xml_elt
)

doc_sum <- docx_summary(doc, detailed = TRUE)

expect_true("run" %in% names(doc_sum))
expect_type(doc_sum$run, "list")
expect_equal(lengths(doc_sum$run), rep(11, 5))
expect_equal(sapply(doc_sum$run, nrow), c(3, 3, 1, 1, 8))

expect_true(all(sapply(doc_sum$run$bold, is.logical)))
expect_true(all(sapply(doc_sum$run$italic, is.logical)))
expect_true(all(sapply(doc_sum$run$sz, is.integer)))
expect_true(all(sapply(doc_sum$run$szCs, is.integer)))
expect_true(all(sapply(doc_sum$run$underline, is_character)))
expect_true(all(sapply(doc_sum$run$color, is_character)))
expect_true(all(sapply(doc_sum$run$shading, is_character)))
expect_true(all(sapply(doc_sum$run$shading_color, is_character)))
expect_true(all(sapply(doc_sum$run$shading_fill, is_character)))
})




test_that("pptx summary", {
Expand Down

0 comments on commit 9461342

Please sign in to comment.