Merge pull request #76 from ropensci/curly

feat: protect_curly()
ropensci · Sep 20, 2022 · b87da44 · b87da44
2 parents a577e9b + 631a99d
commit b87da44
Show file tree

Hide file tree

Showing 16 changed files with 375 additions and 37 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tinkr
 Title: Cast '(R)Markdown' Files to 'XML' and Back Again
-Version: 0.1.0
+Version: 0.1.0.9000
 Authors@R: 
     c(person(given = "Maëlle",
              family = "Salmon",
@@ -56,5 +56,5 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.1.9000
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(find_between)
 export(md_ns)
+export(protect_curly)
 export(protect_math)
 export(stylesheet)
 export(to_md)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# tinkr (development version)
+
+* New `protect_curly()` function will add a `curly='true'` attribute to text wrapped in curly braces ('{', '}') to allow parsing of the XML for sending to external APIs. This function will also parse alt text and place it in an attribute.
+* New `$protect_curly()` method implements `protect_curly()` on yarn objects
+
 # tinkr 0.1.0
 
 First version on CRAN

diff --git a/R/asis-nodes.R b/R/asis-nodes.R
@@ -4,13 +4,13 @@
 #' @param ns an XML namespace object (defaults: [md_ns()]).
 #' @return a copy of the modified XML object
 #' @details Commonmark does not know what LaTeX is and will LaTeX equations as
-#' normal text. This means that content surrounded by underscores are 
+#' normal text. This means that content surrounded by underscores are
 #' interpreted as `<emph>` elements and all backslashes are escaped by default.
 #' This function protects inline and block math elements that use `$` and `$$`
-#' for delimiters, respectively. 
-#' 
+#' for delimiters, respectively.
+#'
 #' @note this function is also a method in the [tinkr::yarn] object.
-#' 
+#'
 #' @export
 #' @examples
 #' m <- tinkr::to_xml(system.file("extdata", "math-example.md", package = "tinkr"))
@@ -47,11 +47,11 @@ inline_dollars_regex <- function(type = c("start", "stop", "full")) {
   # any space
   ace   <- "[:space:]"
   punks <- glue::glue("[{ace}[:punct:]]")
-  # Note about this regex: the first part is a lookahead (?=...) that searches 
+  # Note about this regex: the first part is a lookahead (?=...) that searches
   # for the line start, space, or punctuation. Importantly about lookaheads,
-  # they do not consume the string 
+  # they do not consume the string
   # (https://junli.netlify.app/en/overlapping-regular-expression-in-python/)
-  # 
+  #
   # This looks for a potetial minus sign followed by maybe a space to allow for
   # $\beta, $$\beta, $-\beta, $- \beta
   minus_maybe <- glue::glue("(?=([-][{ace}]?)?")
@@ -87,7 +87,7 @@ find_broken_math <- function(math) {
 }
 
 #' Find and protect all inline math elements
-#' 
+#'
 #' @param body an XML document
 #' @param ns an XML namespace
 #' @return a modified _copy_ of the original XML document
@@ -144,10 +144,10 @@ protect_inline_math <- function(body, ns) {
     le <- length(bmath[endless])
     lh <- length(bmath[headless])
     if (le != lh) {
-      unbalanced_math_error(bmath, endless, headless, le, lh) 
+      unbalanced_math_error(bmath, endless, headless, le, lh)
     }
     # assign sequential tags to the pairs of inline math elements
-    tags <- seq(length(bmath[endless]))  
+    tags <- seq(length(bmath[endless]))
     xml2::xml_set_attr(bmath[endless], "latex-pair", tags)
     xml2::xml_set_attr(bmath[headless], "latex-pair", tags)
     for (i in tags) {
@@ -157,13 +157,13 @@ protect_inline_math <- function(body, ns) {
   copy_xml(body)
 }
 
-# Partial inline math are math elements that are not entirely embedded in a 
-# single `<text>` element. There are two reasons for this: 
+# Partial inline math are math elements that are not entirely embedded in a
+# single `<text>` element. There are two reasons for this:
 #
 # 1. Math is split across separate lines in the markdown document
 # 2. There are elements like `_` that are interpreted as markdown elements.
 #
-# To use this function, an inline pair needs to be first tagged with a 
+# To use this function, an inline pair needs to be first tagged with a
 # `latex-pair` attribute that uniquely identifies that pair of tags. It assumes
 # that all of the content between that pair of tags belongs to the math element.
 fix_partial_inline <- function(tag, body, ns) {
@@ -181,7 +181,7 @@ fix_partial_inline <- function(tag, body, ns) {
   char[[n]] <- sub("[<]text ", "<text asis='true' ", char[[n]])
   nodes <- paste(char, collapse = "")
   nodes <- make_text_nodes(nodes)
-  # add the new nodes to the bottom of the existing math lines 
+  # add the new nodes to the bottom of the existing math lines
   last_line <- math_lines[n]
   to_remove <- math_lines[-n]
   add_node_siblings(last_line, nodes, remove = TRUE)
@@ -197,7 +197,7 @@ fix_fully_inline <- function(math) {
   # <text>this is </text><text asis='true'>$\LaTeX$</text><text> text</text>
   char <- gsub(
     pattern = inline_dollars_regex("full"),
-    replacement = "</text><text asis='true'>\\1</text><text>", 
+    replacement = "</text><text asis='true'>\\1</text><text>",
     x = char,
     perl = TRUE
   )
@@ -209,7 +209,7 @@ fix_fully_inline <- function(math) {
 #' This is useful in the case where we want to modify some text content to
 #' split it and label a portion of it 'asis' to protect it from commonmark's
 #' escape processing.
-#' 
+#'
 #' `fix_fully_inline()` uses `make_text_nodes()` to modify a single text node
 #' into several text nodes. It first takes a string of a single text node like
 #' below...
@@ -228,15 +228,15 @@ fix_fully_inline <- function(math) {
 #' The `make_text_nodes()` function takes the above text string and converts it
 #' into nodes so that the original text node can be replaced.
 #'
-#' @param a character vector of modified text nodes 
+#' @param a character vector of modified text nodes
 #' @return a nodeset with no associated namespace
 #' @noRd
 make_text_nodes <- function(txt) {
   # We are hijacking commonmark here to produce an XML markdown document with
   # a single element: {paste(txt, collapse = ''). This gets passed to glue where
-  # it is expanded into nodes that we can read in via {xml2}, strip the 
+  # it is expanded into nodes that we can read in via {xml2}, strip the
   # namespace, and extract all nodes below
-  doc <- glue::glue(commonmark::markdown_xml("{paste(txt, collapse = '')}")) 
+  doc <- glue::glue(commonmark::markdown_xml("{paste(txt, collapse = '')}"))
   nodes <- xml2::xml_ns_strip(xml2::read_xml(doc))
   xml2::xml_find_all(nodes, ".//paragraph/text/*")
 }
@@ -257,7 +257,7 @@ protect_block_math <- function(body, ns) {
   bm <- find_block_math(body, ns)
   # get all of the internal nodes
   bm <- xml2::xml_find_all(bm, ".//descendant-or-self::md:*", ns = ns)
-  set_asis(bm) 
+  set_asis(bm)
 }
 
 # TICK BOXES -------------------------------------------------------------------
@@ -298,7 +298,7 @@ fix_footnotes <- function(feet) {
   char <- as.character(feet)
   char <- gsub(
     pattern = "([\\[][\\^]|\\])",
-    replacement = "</text><text asis='true'>\\1</text><text>", 
+    replacement = "</text><text asis='true'>\\1</text><text>",
     x = char,
     perl = TRUE
   )
@@ -307,7 +307,7 @@ fix_footnotes <- function(feet) {
 
 protect_footnotes <- function(body, ns = md_ns()) {
   body <- copy_xml(body)
-  
+
   feet <- footnote_check(body, ns)
   if (length(feet) == 0) {
     return(body)

diff --git a/R/attr-nodes.R b/R/attr-nodes.R
@@ -0,0 +1,82 @@
+# CURLY ------------------
+find_curly <- function(body, ns) {
+  i <- ".//md:text[not(@asis) and contains(text(), '{')]"
+  curlies <- xml2::xml_find_all(body, i, ns = ns)
+  # https://github.com/carpentries/pegboard/blob/a8db02ab037f2ffeab6e13cc3b662ea8c42822ad/R/get_images.R#L84
+  attr_texts <- xml2::xml_text(curlies)
+  no_closing <- !grepl("[}]", attr_texts)
+  if (any(no_closing)) {
+    close_xpath <- "self::*/following-sibling::md:text[contains(text(), '}')]"
+    for (not_closed in curlies[no_closing]) {
+      closing <- xml2::xml_find_all(
+        not_closed,
+        glue::glue("./{close_xpath}"),
+        ns
+      )
+      xml2::xml_text(not_closed) <- paste(
+        xml2::xml_text(not_closed),
+        xml2::xml_text(closing),
+        sep = "\n"
+      )
+      xml2::xml_remove(closing)
+    }
+  }
+  curlies
+}
+
+digest_curly <- function(curly, ns) {
+  char <- as.character(curly)
+  curlies <- regmatches(char, gregexpr("\\{.*?\\}", char))[[1]]
+  for (curl in curlies) {
+    attributes <- "curly='true'"
+
+    alt_fragment <- regmatches(curl, gregexpr("alt=['\"].*?['\"]", curl))[[1]]
+    if (length(alt_fragment) > 0) {
+      alt_text <- sub("^alt=", "", alt_fragment)
+      attributes <- sprintf("%s alt=%s", attributes, alt_text)
+    }
+
+    char <- sub(
+      curl,
+      sprintf("</text><text %s>%s</text><text>", attributes, curl),
+      char,
+      fixed = TRUE
+    )
+  }
+  make_text_nodes(char)
+}
+
+#' Protect curly elements for further processing
+#'
+#' @inheritParams protect_math
+#' @return a copy of the modified XML object
+#' @details Commonmark will render text such as `{.unnumbered}`
+#' (Pandoc/Quarto option) or
+#' `{#hello .greeting .message style="color: red;"}`
+#' (Markdown custom block)
+#' as normal text which might be problematic if trying to extract
+#' real text from the XML.
+#'
+#' If sending the XML to, say, a translation API that allows some tags
+#' to be ignored, you could first transform the text tags with the
+#' attribute `curly` to `curly` tags, and then transform them back
+#' to text tags before using `to_md()`.
+#'
+#' @note this function is also a method in the [tinkr::yarn] object.
+#'
+#' @export
+#' @examples
+#' m <- tinkr::to_xml(system.file("extdata", "basic-curly.md", package = "tinkr"))
+#' xml2::xml_child(m$body)
+#' m$body <- protect_curly(m$body)
+#' xml2::xml_child(m$body)
+protect_curly <- function(body, ns = md_ns()) {
+  body  <- copy_xml(body)
+  curly  <- find_curly(body, ns)
+  new_nodes <- purrr::map(curly, digest_curly, ns = ns)
+  # since we split up the nodes, we have to do this node by node
+  for (i in seq(new_nodes)) {
+    add_node_siblings(curly[[i]], new_nodes[[i]], remove = TRUE)
+  }
+  copy_xml(body)
+}
diff --git a/R/class-yarn.R b/R/class-yarn.R
@@ -6,9 +6,9 @@
 #' @details
 #' This class is a fancy wrapper around the results of [tinkr::to_xml()] and
 #' has methods that make it easier to add, analyze, remove, or write elements
-#' of your markdown document. 
+#' of your markdown document.
 #' @export
-yarn <- R6::R6Class("yarn", 
+yarn <- R6::R6Class("yarn",
   portable = TRUE,
   public = list(
     #' @field path \[`character`\] path to file on disk
@@ -17,21 +17,21 @@ yarn <- R6::R6Class("yarn",
     #' @field yaml \[`character`\] text block at head of file
     yaml = NULL,
 
-    #' @field body \[`xml_document`\] an xml document of the (R)Markdown file. 
+    #' @field body \[`xml_document`\] an xml document of the (R)Markdown file.
     body = NULL,
 
     #' @field ns \[`xml_document`\] an xml namespace object defining "md" to
     #'   commonmark.
     ns = NULL,
-    #' @description Create a new yarn document 
-    #' 
+    #' @description Create a new yarn document
+    #'
     #' @param path \[`character`\] path to a markdown episode file on disk
     #' @param encoding \[`character`\] encoding passed to [readLines()]
     #' @param sourcepos passed to [commonmark::markdown_xml()]. If `TRUE`, the
     #'   source position of the file will be included as a "sourcepos" attribute.
     #'   Defaults to `FALSE`.
     #' @param ... arguments passed on to [to_xml()].
-    #' @return A new yarn object containing an XML representation of a 
+    #' @return A new yarn object containing an XML representation of a
     #' (R)Markdown file.
     #'
     #' @examples
@@ -123,12 +123,12 @@ yarn <- R6::R6Class("yarn",
     #' @param n the number of elements to show from the bottom. Negative numbers
     #' @param stylesheet_path path to the xsl stylesheet to convert XML to markdown.
     #' exclude lines from the top
-    #' 
+    #'
     #' @return a character vector with `n` elements
     tail = function(n = 6L, stylesheet_path = stylesheet()) {
       show_user(tail(private$md_lines(stylesheet = stylesheet_path), n))
     },
-    
+
     #' @description add an arbitrary Markdown element to the document
     #'
     #' @param md a string of markdown formatted text.
@@ -153,13 +153,13 @@ yarn <- R6::R6Class("yarn",
     #' xml2::xml_find_all(ex$body, "md:list", ex$ns)
     #' tmp <- tempfile()
     #' ex$write(tmp)
-    #' readLines(tmp, n = 20) 
+    #' readLines(tmp, n = 20)
     add_md = function(md, where = 0L) {
       self$body <- add_md(self$body, md, where)
       invisible(self)
     },
     #' @description Protect math blocks from being escaped
-    #' 
+    #'
     #' @examples
     #' path <- system.file("extdata", "math-example.md", package = "tinkr")
     #' ex <- tinkr::yarn$new(path)
@@ -168,6 +168,16 @@ yarn <- R6::R6Class("yarn",
     protect_math = function() {
       self$body <- protect_math(self$body, self$ns)
       invisible(self)
+    },
+    #' @description Protect curly phrases `{likethat}` from being escaped
+    #'
+    #' @examples
+    #' path <- system.file("extdata", "basic-curly.md", package = "tinkr")
+    #' ex <- tinkr::yarn$new(path)
+    #' ex$protect_curly()$head()
+    protect_curly = function() {
+      self$body <- protect_curly(self$body, self$ns)
+      invisible(self)
     }
   ),
   private = list(

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -19,4 +19,5 @@ reference:
   - '`find_between`'
   - '`md_ns`'
   - '`protect_math`'
+  - '`protect_curly`'
   - '`stylesheet`'
diff --git a/inst/extdata/basic-curly.md b/inst/extdata/basic-curly.md
@@ -0,0 +1,19 @@
+---
+title: basic curly
+---
+
+# preface {#pre-face .unnumbered}
+
+hello
+
+I like {xml2} but of course {tinkr} is even cooler!
+
+Images that use pandoc style will have curlies with content that should be translated and should be protected.
+
+![a pretty kitten](https://placekitten.com/200/300){#kitteh alt='a picture of a kitten'}
+
+![a pretty puppy](https://placedog.net/200/300){#dog alt="a picture 
+of a dog"}
+
+[a span with attributes]{.span-with-attributes 
+style='color: red;'}
diff --git a/inst/extdata/basic-curly2.Rmd b/inst/extdata/basic-curly2.Rmd
@@ -0,0 +1,26 @@
+---
+title: "Untitled"
+author: "M. Salmon"
+date: "September 6, 2018"
+output: html_document
+---
+
+```{r setup, include=FALSE, eval = TRUE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# preface {#pre-face .unnumbered}
+
+hello
+
+I like {xml2} but of course {tinkr} is even cooler!
+
+Images that use pandoc style will have curlies with content that should be translated and should be protected.
+
+![a pretty kitten](https://placekitten.com/200/300){#kitteh alt='a picture of a kitten'}
+
+![a pretty puppy](https://placedog.net/200/300){#dog alt="a picture 
+of a dog"}
+
+[a span with attributes]{.span-with-attributes 
+style='color: red;'}