Skip to content

Commit

Permalink
Merge pull request #76 from ropensci/curly
Browse files Browse the repository at this point in the history
feat: protect_curly()
  • Loading branch information
zkamvar authored Sep 20, 2022
2 parents a577e9b + 631a99d commit b87da44
Show file tree
Hide file tree
Showing 16 changed files with 375 additions and 37 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tinkr
Title: Cast '(R)Markdown' Files to 'XML' and Back Again
Version: 0.1.0
Version: 0.1.0.9000
Authors@R:
c(person(given = "Maëlle",
family = "Salmon",
Expand Down Expand Up @@ -56,5 +56,5 @@ Config/testthat/edition: 3
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
RoxygenNote: 7.2.1.9000
VignetteBuilder: knitr
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(find_between)
export(md_ns)
export(protect_curly)
export(protect_math)
export(stylesheet)
export(to_md)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# tinkr (development version)

* New `protect_curly()` function will add a `curly='true'` attribute to text wrapped in curly braces ('{', '}') to allow parsing of the XML for sending to external APIs. This function will also parse alt text and place it in an attribute.
* New `$protect_curly()` method implements `protect_curly()` on yarn objects

# tinkr 0.1.0

First version on CRAN
Expand Down
44 changes: 22 additions & 22 deletions R/asis-nodes.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
#' @param ns an XML namespace object (defaults: [md_ns()]).
#' @return a copy of the modified XML object
#' @details Commonmark does not know what LaTeX is and will LaTeX equations as
#' normal text. This means that content surrounded by underscores are
#' normal text. This means that content surrounded by underscores are
#' interpreted as `<emph>` elements and all backslashes are escaped by default.
#' This function protects inline and block math elements that use `$` and `$$`
#' for delimiters, respectively.
#'
#' for delimiters, respectively.
#'
#' @note this function is also a method in the [tinkr::yarn] object.
#'
#'
#' @export
#' @examples
#' m <- tinkr::to_xml(system.file("extdata", "math-example.md", package = "tinkr"))
Expand Down Expand Up @@ -47,11 +47,11 @@ inline_dollars_regex <- function(type = c("start", "stop", "full")) {
# any space
ace <- "[:space:]"
punks <- glue::glue("[{ace}[:punct:]]")
# Note about this regex: the first part is a lookahead (?=...) that searches
# Note about this regex: the first part is a lookahead (?=...) that searches
# for the line start, space, or punctuation. Importantly about lookaheads,
# they do not consume the string
# they do not consume the string
# (https://junli.netlify.app/en/overlapping-regular-expression-in-python/)
#
#
# This looks for a potetial minus sign followed by maybe a space to allow for
# $\beta, $$\beta, $-\beta, $- \beta
minus_maybe <- glue::glue("(?=([-][{ace}]?)?")
Expand Down Expand Up @@ -87,7 +87,7 @@ find_broken_math <- function(math) {
}

#' Find and protect all inline math elements
#'
#'
#' @param body an XML document
#' @param ns an XML namespace
#' @return a modified _copy_ of the original XML document
Expand Down Expand Up @@ -144,10 +144,10 @@ protect_inline_math <- function(body, ns) {
le <- length(bmath[endless])
lh <- length(bmath[headless])
if (le != lh) {
unbalanced_math_error(bmath, endless, headless, le, lh)
unbalanced_math_error(bmath, endless, headless, le, lh)
}
# assign sequential tags to the pairs of inline math elements
tags <- seq(length(bmath[endless]))
tags <- seq(length(bmath[endless]))
xml2::xml_set_attr(bmath[endless], "latex-pair", tags)
xml2::xml_set_attr(bmath[headless], "latex-pair", tags)
for (i in tags) {
Expand All @@ -157,13 +157,13 @@ protect_inline_math <- function(body, ns) {
copy_xml(body)
}

# Partial inline math are math elements that are not entirely embedded in a
# single `<text>` element. There are two reasons for this:
# Partial inline math are math elements that are not entirely embedded in a
# single `<text>` element. There are two reasons for this:
#
# 1. Math is split across separate lines in the markdown document
# 2. There are elements like `_` that are interpreted as markdown elements.
#
# To use this function, an inline pair needs to be first tagged with a
# To use this function, an inline pair needs to be first tagged with a
# `latex-pair` attribute that uniquely identifies that pair of tags. It assumes
# that all of the content between that pair of tags belongs to the math element.
fix_partial_inline <- function(tag, body, ns) {
Expand All @@ -181,7 +181,7 @@ fix_partial_inline <- function(tag, body, ns) {
char[[n]] <- sub("[<]text ", "<text asis='true' ", char[[n]])
nodes <- paste(char, collapse = "")
nodes <- make_text_nodes(nodes)
# add the new nodes to the bottom of the existing math lines
# add the new nodes to the bottom of the existing math lines
last_line <- math_lines[n]
to_remove <- math_lines[-n]
add_node_siblings(last_line, nodes, remove = TRUE)
Expand All @@ -197,7 +197,7 @@ fix_fully_inline <- function(math) {
# <text>this is </text><text asis='true'>$\LaTeX$</text><text> text</text>
char <- gsub(
pattern = inline_dollars_regex("full"),
replacement = "</text><text asis='true'>\\1</text><text>",
replacement = "</text><text asis='true'>\\1</text><text>",
x = char,
perl = TRUE
)
Expand All @@ -209,7 +209,7 @@ fix_fully_inline <- function(math) {
#' This is useful in the case where we want to modify some text content to
#' split it and label a portion of it 'asis' to protect it from commonmark's
#' escape processing.
#'
#'
#' `fix_fully_inline()` uses `make_text_nodes()` to modify a single text node
#' into several text nodes. It first takes a string of a single text node like
#' below...
Expand All @@ -228,15 +228,15 @@ fix_fully_inline <- function(math) {
#' The `make_text_nodes()` function takes the above text string and converts it
#' into nodes so that the original text node can be replaced.
#'
#' @param a character vector of modified text nodes
#' @param a character vector of modified text nodes
#' @return a nodeset with no associated namespace
#' @noRd
make_text_nodes <- function(txt) {
# We are hijacking commonmark here to produce an XML markdown document with
# a single element: {paste(txt, collapse = ''). This gets passed to glue where
# it is expanded into nodes that we can read in via {xml2}, strip the
# it is expanded into nodes that we can read in via {xml2}, strip the
# namespace, and extract all nodes below
doc <- glue::glue(commonmark::markdown_xml("{paste(txt, collapse = '')}"))
doc <- glue::glue(commonmark::markdown_xml("{paste(txt, collapse = '')}"))
nodes <- xml2::xml_ns_strip(xml2::read_xml(doc))
xml2::xml_find_all(nodes, ".//paragraph/text/*")
}
Expand All @@ -257,7 +257,7 @@ protect_block_math <- function(body, ns) {
bm <- find_block_math(body, ns)
# get all of the internal nodes
bm <- xml2::xml_find_all(bm, ".//descendant-or-self::md:*", ns = ns)
set_asis(bm)
set_asis(bm)
}

# TICK BOXES -------------------------------------------------------------------
Expand Down Expand Up @@ -298,7 +298,7 @@ fix_footnotes <- function(feet) {
char <- as.character(feet)
char <- gsub(
pattern = "([\\[][\\^]|\\])",
replacement = "</text><text asis='true'>\\1</text><text>",
replacement = "</text><text asis='true'>\\1</text><text>",
x = char,
perl = TRUE
)
Expand All @@ -307,7 +307,7 @@ fix_footnotes <- function(feet) {

protect_footnotes <- function(body, ns = md_ns()) {
body <- copy_xml(body)

feet <- footnote_check(body, ns)
if (length(feet) == 0) {
return(body)
Expand Down
82 changes: 82 additions & 0 deletions R/attr-nodes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# CURLY ------------------
find_curly <- function(body, ns) {
i <- ".//md:text[not(@asis) and contains(text(), '{')]"
curlies <- xml2::xml_find_all(body, i, ns = ns)
# https://github.com/carpentries/pegboard/blob/a8db02ab037f2ffeab6e13cc3b662ea8c42822ad/R/get_images.R#L84
attr_texts <- xml2::xml_text(curlies)
no_closing <- !grepl("[}]", attr_texts)
if (any(no_closing)) {
close_xpath <- "self::*/following-sibling::md:text[contains(text(), '}')]"
for (not_closed in curlies[no_closing]) {
closing <- xml2::xml_find_all(
not_closed,
glue::glue("./{close_xpath}"),
ns
)
xml2::xml_text(not_closed) <- paste(
xml2::xml_text(not_closed),
xml2::xml_text(closing),
sep = "\n"
)
xml2::xml_remove(closing)
}
}
curlies
}

digest_curly <- function(curly, ns) {
char <- as.character(curly)
curlies <- regmatches(char, gregexpr("\\{.*?\\}", char))[[1]]
for (curl in curlies) {
attributes <- "curly='true'"

alt_fragment <- regmatches(curl, gregexpr("alt=['\"].*?['\"]", curl))[[1]]
if (length(alt_fragment) > 0) {
alt_text <- sub("^alt=", "", alt_fragment)
attributes <- sprintf("%s alt=%s", attributes, alt_text)
}

char <- sub(
curl,
sprintf("</text><text %s>%s</text><text>", attributes, curl),
char,
fixed = TRUE
)
}
make_text_nodes(char)
}

#' Protect curly elements for further processing
#'
#' @inheritParams protect_math
#' @return a copy of the modified XML object
#' @details Commonmark will render text such as `{.unnumbered}`
#' (Pandoc/Quarto option) or
#' `{#hello .greeting .message style="color: red;"}`
#' (Markdown custom block)
#' as normal text which might be problematic if trying to extract
#' real text from the XML.
#'
#' If sending the XML to, say, a translation API that allows some tags
#' to be ignored, you could first transform the text tags with the
#' attribute `curly` to `curly` tags, and then transform them back
#' to text tags before using `to_md()`.
#'
#' @note this function is also a method in the [tinkr::yarn] object.
#'
#' @export
#' @examples
#' m <- tinkr::to_xml(system.file("extdata", "basic-curly.md", package = "tinkr"))
#' xml2::xml_child(m$body)
#' m$body <- protect_curly(m$body)
#' xml2::xml_child(m$body)
protect_curly <- function(body, ns = md_ns()) {
body <- copy_xml(body)
curly <- find_curly(body, ns)
new_nodes <- purrr::map(curly, digest_curly, ns = ns)
# since we split up the nodes, we have to do this node by node
for (i in seq(new_nodes)) {
add_node_siblings(curly[[i]], new_nodes[[i]], remove = TRUE)
}
copy_xml(body)
}
30 changes: 20 additions & 10 deletions R/class-yarn.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
#' @details
#' This class is a fancy wrapper around the results of [tinkr::to_xml()] and
#' has methods that make it easier to add, analyze, remove, or write elements
#' of your markdown document.
#' of your markdown document.
#' @export
yarn <- R6::R6Class("yarn",
yarn <- R6::R6Class("yarn",
portable = TRUE,
public = list(
#' @field path \[`character`\] path to file on disk
Expand All @@ -17,21 +17,21 @@ yarn <- R6::R6Class("yarn",
#' @field yaml \[`character`\] text block at head of file
yaml = NULL,

#' @field body \[`xml_document`\] an xml document of the (R)Markdown file.
#' @field body \[`xml_document`\] an xml document of the (R)Markdown file.
body = NULL,

#' @field ns \[`xml_document`\] an xml namespace object defining "md" to
#' commonmark.
ns = NULL,
#' @description Create a new yarn document
#'
#' @description Create a new yarn document
#'
#' @param path \[`character`\] path to a markdown episode file on disk
#' @param encoding \[`character`\] encoding passed to [readLines()]
#' @param sourcepos passed to [commonmark::markdown_xml()]. If `TRUE`, the
#' source position of the file will be included as a "sourcepos" attribute.
#' Defaults to `FALSE`.
#' @param ... arguments passed on to [to_xml()].
#' @return A new yarn object containing an XML representation of a
#' @return A new yarn object containing an XML representation of a
#' (R)Markdown file.
#'
#' @examples
Expand Down Expand Up @@ -123,12 +123,12 @@ yarn <- R6::R6Class("yarn",
#' @param n the number of elements to show from the bottom. Negative numbers
#' @param stylesheet_path path to the xsl stylesheet to convert XML to markdown.
#' exclude lines from the top
#'
#'
#' @return a character vector with `n` elements
tail = function(n = 6L, stylesheet_path = stylesheet()) {
show_user(tail(private$md_lines(stylesheet = stylesheet_path), n))
},

#' @description add an arbitrary Markdown element to the document
#'
#' @param md a string of markdown formatted text.
Expand All @@ -153,13 +153,13 @@ yarn <- R6::R6Class("yarn",
#' xml2::xml_find_all(ex$body, "md:list", ex$ns)
#' tmp <- tempfile()
#' ex$write(tmp)
#' readLines(tmp, n = 20)
#' readLines(tmp, n = 20)
add_md = function(md, where = 0L) {
self$body <- add_md(self$body, md, where)
invisible(self)
},
#' @description Protect math blocks from being escaped
#'
#'
#' @examples
#' path <- system.file("extdata", "math-example.md", package = "tinkr")
#' ex <- tinkr::yarn$new(path)
Expand All @@ -168,6 +168,16 @@ yarn <- R6::R6Class("yarn",
protect_math = function() {
self$body <- protect_math(self$body, self$ns)
invisible(self)
},
#' @description Protect curly phrases `{likethat}` from being escaped
#'
#' @examples
#' path <- system.file("extdata", "basic-curly.md", package = "tinkr")
#' ex <- tinkr::yarn$new(path)
#' ex$protect_curly()$head()
protect_curly = function() {
self$body <- protect_curly(self$body, self$ns)
invisible(self)
}
),
private = list(
Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ reference:
- '`find_between`'
- '`md_ns`'
- '`protect_math`'
- '`protect_curly`'
- '`stylesheet`'
19 changes: 19 additions & 0 deletions inst/extdata/basic-curly.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
title: basic curly
---

# preface {#pre-face .unnumbered}

hello

I like {xml2} but of course {tinkr} is even cooler!

Images that use pandoc style will have curlies with content that should be translated and should be protected.

![a pretty kitten](https://placekitten.com/200/300){#kitteh alt='a picture of a kitten'}

![a pretty puppy](https://placedog.net/200/300){#dog alt="a picture
of a dog"}

[a span with attributes]{.span-with-attributes
style='color: red;'}
26 changes: 26 additions & 0 deletions inst/extdata/basic-curly2.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
title: "Untitled"
author: "M. Salmon"
date: "September 6, 2018"
output: html_document
---

```{r setup, include=FALSE, eval = TRUE}
knitr::opts_chunk$set(echo = TRUE)
```

# preface {#pre-face .unnumbered}

hello

I like {xml2} but of course {tinkr} is even cooler!

Images that use pandoc style will have curlies with content that should be translated and should be protected.

![a pretty kitten](https://placekitten.com/200/300){#kitteh alt='a picture of a kitten'}

![a pretty puppy](https://placedog.net/200/300){#dog alt="a picture
of a dog"}

[a span with attributes]{.span-with-attributes
style='color: red;'}
Loading

0 comments on commit b87da44

Please sign in to comment.