Skip to content

Commit

Permalink
Merge pull request #81 from volhovm/master
Browse files Browse the repository at this point in the history
Adding minimal LaTeX support
  • Loading branch information
Michael-F-Bryan authored May 9, 2024
2 parents bed5ebb + 8cccfc8 commit e6f83ea
Show file tree
Hide file tree
Showing 12 changed files with 346 additions and 34 deletions.
3 changes: 2 additions & 1 deletion src/bin/mdbook-linkcheck.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ struct Args {
#[structopt(
short = "f",
long = "files",
help = "Check only the given files (check all files if omitted)."
help = "Check only the given files (check all files if omitted).
Paths must be relative to the book root, e.g. 'chapter1/section1.md'."
)]
selected_files: Option<Vec<String>>,
#[structopt(
Expand Down
6 changes: 6 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ pub struct Config {
pub follow_web_links: bool,
/// Are we allowed to link to files outside of the book's source directory?
pub traverse_parent_directories: bool,
/// Turns on support for LaTeX. If true, LaTeX fragments are cut out of
/// each file before it is scanned for links.
pub latex_support: bool,
/// A list of URL patterns to ignore when checking remote links.
#[serde(default)]
pub exclude: Vec<HashedRegex>,
Expand Down Expand Up @@ -124,6 +127,7 @@ impl Default for Config {
Config {
follow_web_links: false,
traverse_parent_directories: false,
latex_support: false,
exclude: Vec::new(),
user_agent: default_user_agent(),
http_headers: HashMap::new(),
Expand Down Expand Up @@ -279,6 +283,7 @@ mod tests {

const CONFIG: &str = r#"follow-web-links = true
traverse-parent-directories = true
latex-support = true
exclude = ["google\\.com"]
user-agent = "Internet Explorer"
cache-timeout = 3600
Expand Down Expand Up @@ -306,6 +311,7 @@ https = ["accept: html/text", "authorization: Basic $TOKEN"]
],
)]),
cache_timeout: 3600,
latex_support: true,
};

let got: Config = toml::from_str(CONFIG).unwrap();
Expand Down
168 changes: 168 additions & 0 deletions src/latex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
//! This module provides experimental, ad-hoc support for LaTeX in
//! `mdbook-linkcheck`.
use std::collections::HashSet;

/// Maps byte positions in a modified text (file B) back to the original
/// text (file A).
///
/// It is used to map error positions back to A after A has been altered
/// into B by the regexes that cut out latex fragments.
pub(crate) struct ByteIndexMap {
    /// Anchor points `(b_i, a_i)` from B to A, one per replaced chunk.
    ///
    /// For each chunk recorded by [`ByteIndexMap::update`], `a_i` is the end
    /// of the chunk in A and `b_i` is the end of its replacement in B.
    /// Entries are kept ordered so that neither component ever decreases,
    /// i.e. `b_{i+1} >= b_i && a_{i+1} >= a_i`.
    mapping: Vec<(u32, u32)>,
    /// Every byte offset of A that lies inside an already recorded chunk.
    /// Only used to detect chunks that overlap each other.
    inserted_ranges_a: HashSet<u32>,
}

impl ByteIndexMap {
    /// Creates an empty map, under which every position resolves to itself.
    pub fn new() -> Self {
        ByteIndexMap {
            mapping: Vec::new(),
            inserted_ranges_a: HashSet::new(),
        }
    }

    // Internal consistency check function. It can be turned off for
    // efficiency if latex support becomes too slow. But for now I prefer to
    // leave it here @volhovm.
    //
    // Panics if any anchor is smaller (in either component) than the one
    // before it; `s` only labels the panic message.
    fn consistency_check(&self, s: &str) {
        let mut prev: (u32, u32) = (0, 0);
        for (ix, &(b, a)) in self.mapping.iter().enumerate() {
            if b < prev.0 || a < prev.1 {
                panic!(
                    "Inconsistent {}, ix {:?}, value {:?}, prev values {:?}",
                    s,
                    ix,
                    (b, a),
                    prev
                );
            }
            prev = (b, a);
        }
    }

    /// Records that bytes `start..end` of A were replaced by `len_b` bytes
    /// in B.
    ///
    /// Chunks may be recorded in any order; anchors located after the new
    /// chunk are shifted by the change in length.
    ///
    /// # Panics
    ///
    /// Panics if `end < start`, if the chunk overlaps a previously recorded
    /// one, or if the internal ordering invariant is violated.
    pub fn update(&mut self, start: u32, end: u32, len_b: u32) {
        assert!(end >= start);
        // `insert` returns `false` when the offset is already present, which
        // means this chunk overlaps one that was recorded earlier.
        for offset in start..end {
            assert!(
                self.inserted_ranges_a.insert(offset),
                "Collision on {:?}",
                offset
            );
        }
        self.consistency_check("Before update");

        // The new anchor goes right before the first anchor located after
        // `start` in A (or at the very end if there is none).
        let insert_ix = match self
            .mapping
            .iter()
            .position(|&(_, pos_a)| pos_a > start)
        {
            Some(ix) => {
                // chunks must not overlap
                assert!(end < self.mapping[ix].1);
                ix
            },
            None => self.mapping.len(),
        };

        // Closest anchor before the chunk; the start of both files if none.
        let (pos_b, pos_a) = if insert_ix > 0 {
            self.mapping[insert_ix - 1]
        } else {
            (0, 0)
        };
        assert!(start >= pos_a);
        // Length of the unmodified text between the previous anchor and
        // the chunk; it is identical in A and B.
        let delta_same = start - pos_a;
        // A: (start, end)
        // ... maps to
        // B: (pos_b + delta_same, pos_b + delta_same + len_b)
        let new_a = end;
        let new_b = pos_b + (delta_same + len_b);
        assert!(new_a >= pos_a);
        assert!(new_b >= pos_b);
        self.mapping.insert(insert_ix, (new_b, new_a));

        // Remap all the following pieces: everything after the chunk moved
        // in B by the difference between replacement and chunk length.
        let len_a = end - start;
        let mut prev_b = new_b;
        for entry in self.mapping[insert_ix + 1..].iter_mut() {
            let updated_b = entry.0 - len_a + len_b;
            entry.0 = updated_b;
            assert!(updated_b >= prev_b);
            prev_b = updated_b;
        }
        self.consistency_check("After update");
    }

    /// Given a position in file B, returns a corresponding position in file A.
    ///
    /// The position is translated relative to the last anchor whose B
    /// component is not greater than `input_b` (or relative to the start of
    /// the file when there is none). Positions inside replacement text have
    /// no exact counterpart in A and are simply offset from that anchor.
    pub fn resolve(&self, input_b: u32) -> u32 {
        // Index of the first anchor strictly after `input_b`.
        let ix = self
            .mapping
            .iter()
            .position(|&(pos_b, _)| pos_b > input_b)
            .unwrap_or(self.mapping.len());
        let (pos_b, pos_a) = if ix > 0 { self.mapping[ix - 1] } else { (0, 0) };

        pos_a + (input_b - pos_b)
    }
}

/// Filters out latex code snippets from md files to avoid false link
/// matches.
///
/// Every latex fragment found in `src` is replaced by a fixed placeholder.
/// Returns the filtered text together with a [`ByteIndexMap`] that maps byte
/// positions in the filtered text back to positions in `src`.
///
/// Four kinds of fragments are recognised, processed in this order:
/// `$$...$$` (may span lines), `$...$` (single line), `\(...\)` (single
/// line) and `\[...\]` (may span lines). The `\(...\)` and `\[...\]`
/// patterns match lazily, so a fragment ends at the *nearest* closing
/// delimiter and ordinary text (including links) between two separate
/// fragments is preserved.
///
/// # Panics
///
/// Panics if the recorded chunks are inconsistent, e.g. when a fragment
/// matched by a later pattern encloses a placeholder inserted by an earlier
/// one (see [`ByteIndexMap::update`]).
pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) {
    use regex::Regex;

    let mut byte_index_map = ByteIndexMap::new();
    let mut src: String = src.to_string();

    let mut process_regex = |regex_expr: &str, replacement: &str| {
        let reg = Regex::new(regex_expr)
            .expect("latex patterns are hard-coded and must be valid");
        let repl_length = replacement.len() as u32;

        // Translate every match into coordinates of the original file
        // *before* touching the map: all matches of this pass refer to the
        // same (current) text, which is what the map describes right now.
        // The end is derived from the match length, which assumes the match
        // does not contain text substituted by an earlier pass.
        let byte_index_map_upds: Vec<(u32, u32, u32)> = reg
            .find_iter(&src)
            .map(|mtch| {
                let start_a = byte_index_map.resolve(mtch.start() as u32);
                let match_len = (mtch.end() - mtch.start()) as u32;
                (start_a, start_a + match_len, repl_length)
            })
            .collect();

        // update source and byte_index_map
        for (start, end, length) in byte_index_map_upds {
            byte_index_map.update(start, end, length);
        }
        src = reg.replace_all(&src, replacement).into_owned();
    };

    // Everything between a pair of $$ including newlines
    process_regex(r"\$\$[^\$]*\$\$", "LATEX_DOUBLE_DOLLAR_SUBSTITUTED");
    // Everything between a pair of $ excluding newlines
    process_regex(r"\$[^\$\n\r]*\$", "LATEX_SINGLE_DOLLAR_SUBSTITUTED");
    // Everything between \( and the nearest \) excluding newlines
    process_regex(
        r"\\\([^\n\r]*?\\\)",
        "LATEX_ESCAPED_PARENTHESIS_SUBSTITUTED",
    );
    // Everything between \[ and the nearest \] including newlines
    process_regex(
        r"(?s)\\\[.*?\\\]",
        "LATEX_ESCAPED_SQUARE_BRACKET_SUBSTITUTED",
    );

    (src, byte_index_map)
}
14 changes: 7 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pub const COMPATIBLE_MDBOOK_VERSIONS: &str = "^0.4.0";
mod config;
mod context;
mod hashed_regex;
mod latex;
mod links;
mod validate;

Expand Down Expand Up @@ -153,11 +154,10 @@ where
match item {
BookItem::Chapter(ref ch) => {
if let Some(ref path) = ch.path {
if filter(&path) {
let id = dest.add(
path.display().to_string(),
ch.content.clone(),
);
if filter(path) {
let path_str = path.display().to_string();
let content = ch.content.clone();
let id = dest.add(path_str, content);
ids.push(id);
}
}
Expand Down Expand Up @@ -194,11 +194,11 @@ where
F: Fn(&Path) -> bool,
{
log::info!("Scanning book for links");
let mut files = Files::new();
let mut files: Files<String> = Files::new();
let file_ids =
crate::load_files_into_memory(&ctx.book, &mut files, file_filter);
let (links, incomplete_links) =
crate::extract_links(file_ids.clone(), &files);
crate::extract_links(cfg, file_ids.clone(), &files);
log::info!(
"Found {} links ({} incomplete links)",
links.len(),
Expand Down
70 changes: 53 additions & 17 deletions src/links.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
use codespan::{FileId, Files, Span};
use crate::config::Config;
use crate::latex::{filter_out_latex, ByteIndexMap};
use codespan::{ByteIndex, FileId, Files, Span};
use linkcheck::Link;
use pulldown_cmark::{BrokenLink, CowStr};
use std::{cell::RefCell, fmt::Debug};

/// Search every file in the [`Files`] and collate all the links that are
/// found.
pub fn extract<I>(
cfg: &Config,
target_files: I,
files: &Files<String>,
) -> (Vec<Link>, Vec<IncompleteLink>)
Expand All @@ -17,25 +20,58 @@ where

for file_id in target_files {
let src = files.source(file_id);

let (src, byte_index_map) = if cfg.latex_support {
filter_out_latex(src)
} else {
(src.clone(), ByteIndexMap::new())
};

log::debug!("Scanning {}", files.name(file_id).to_string_lossy());

links.extend(scan_links(file_id, &*src, &mut |broken_link| {
let BrokenLink {
reference, span, ..
} = broken_link;
log::debug!(
"Found a (possibly) broken link to [{}] at {:?}",
reference,
span
);
let mapspan = |span: Span| {
Span::new(
ByteIndex(
byte_index_map.resolve(span.start().to_usize() as u32),
),
ByteIndex(byte_index_map.resolve(span.end().to_usize() as u32)),
)
};

links.extend(
scan_links(file_id, &src, &mut |broken_link| {
let BrokenLink {
reference, span, ..
} = broken_link;
log::debug!(
"Found a (possibly) broken link to [{}] at {:?}",
reference,
span
);

////assert!(false, "kek panic, unreachable?");
//println!(
// "start {:?} end {:?} res_a {:?} res_b {:?}",
// span.start,
// span.end,
// ByteIndex(byte_index_map.resolve(span.start as u32)),
// ByteIndex(byte_index_map.resolve(span.end as u32))
//);
let origspan = Span::new(
ByteIndex(span.start as u32),
ByteIndex(span.end as u32),
);
let span = mapspan(origspan);

broken_links.borrow_mut().push(IncompleteLink {
reference: broken_link.reference.to_string(),
span: Span::new(span.start as u32, span.end as u32),
file: file_id,
});
None
}));
broken_links.borrow_mut().push(IncompleteLink {
reference: broken_link.reference.to_string(),
span,
file: file_id,
});
None
})
.map(|link| Link::new(link.href, mapspan(link.span), link.file)),
);
}

(links, broken_links.into_inner())
Expand Down
3 changes: 2 additions & 1 deletion tests/broken-links/src/chapter_1.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@

[incomplete link]

![Missing Image](./asdf.png)
Also if latex support is not enabled, as here, this math expression $[math_var]_5$ \([math_var_2](latex_with_latex_support_disabled)_5\) be parsed as another issue

![Missing Image](./asdf.png)
5 changes: 5 additions & 0 deletions tests/latex-support-links/book.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[book]
authors = ["Michael Bryan"]
multilingual = false
src = "src"
title = "Broken Links"
4 changes: 4 additions & 0 deletions tests/latex-support-links/src/SUMMARY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Summary

- [Chapter 1](./chapter_1.md)
- [Second Directory](second/directory.md)
35 changes: 35 additions & 0 deletions tests/latex-support-links/src/chapter_1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Chapter 1

Here is some test $x + y$ that includes latex fragments \(z + x\).

[Some links work](./chapter_1.md)

$$
\begin{align*}
log_k(s) = d
\end{align*}
$$

Some of these fragments $(a,b,c,d,e)$ may contain something that looks like links, e.g. \([x]_5\) or $[x]_5$ or $[x](some_latex_value)$ but is, in fact, not a link at all.

[but linking to a nonexistent domain fails](http://this-doesnt-exist.com.au.nz.us/)

\[
\begin{align*}
log_k(a) = d+5 [also_not_a_link]_5 [also_not_a_link](latex_number)
\end{align*}
\]

[This chapter doesn't exist](./foo/bar/baz.html)

And sometimes the LaTeX environment is actually broken! For example, single dollar must capture only single-line latex pieces. Therefore if I'm talking about 5$ [and](first_broken_link_nonlatex)
with a dollar $ on the other line, this link should be still considered broken, and must not be erroneously cut out as a latex fragment.

Same goes for the \( single escaped parenthesis, when talking about 1000$ [this](second_broken_link_nonlatex) and [this_incomplete_link_inside_nonlatex]
must not be cut out, no matter how many $ we talk about.

[It would be bad if this worked...](../../../../../../../../../../../../etc/shadow)

[incomplete link]

![Missing Image](./asdf.png)
1 change: 1 addition & 0 deletions tests/latex-support-links/src/second/directory.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Linking to [files not in `SUMMARY.md`](sibling.md) is an error.
Loading

0 comments on commit e6f83ea

Please sign in to comment.