Skip to content

Commit

Permalink
Add latex support via regex filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
volhovm committed Dec 10, 2023
1 parent bed5ebb commit ab65a4f
Show file tree
Hide file tree
Showing 6 changed files with 264 additions and 32 deletions.
1 change: 1 addition & 0 deletions rust-toolchain
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1.72
16 changes: 14 additions & 2 deletions src/bin/mdbook-linkcheck.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,13 @@ fn main() -> Result<(), Error> {
} else {
Some(cache_file.as_path())
};
mdbook_linkcheck::run(cache_file, args.colour, &ctx, args.selected_files)
mdbook_linkcheck::run(
cache_file,
args.colour,
&ctx,
args.selected_files,
args.latex_support,
)
}

#[derive(Debug, Clone, StructOpt)]
Expand Down Expand Up @@ -55,14 +61,20 @@ struct Args {
#[structopt(
short = "f",
long = "files",
help = "Check only the given files (check all files if omitted)."
help = "Check only the given files (check all files if omitted).
Paths must be relative to the book root, e.g. 'chapter1/section1.md'."
)]
selected_files: Option<Vec<String>>,
#[structopt(
long = "no-cache",
help = "Ignore any existing cache, neither using nor updating it."
)]
no_cache: bool,
#[structopt(
long = "latex",
help = "Turn on support for latex: ignores the segments between $, $$, \\[ \\] and \\( \\). Experimental feature."
)]
latex_support: bool,
}

fn parse_colour(raw: &str) -> Result<ColorChoice, Error> {
Expand Down
24 changes: 19 additions & 5 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ pub struct Config {
/// the list of HTTP headers that must be sent to matching sites.
#[serde(default)]
pub http_headers: HashMap<HashedRegex, Vec<HttpHeader>>,
/// Turns on support for latex. If true, then the latex fragments will be cut off
/// before the file is processed for link consistency.
pub latex_support: bool,
}

#[derive(Serialize, Deserialize, Debug, PartialEq, Clone)]
Expand Down Expand Up @@ -129,6 +132,7 @@ impl Default for Config {
http_headers: HashMap::new(),
warning_policy: WarningPolicy::Warn,
cache_timeout: Config::DEFAULT_CACHE_TIMEOUT.as_secs(),
latex_support: false,
}
}
}
Expand Down Expand Up @@ -158,7 +162,9 @@ impl FromStr for HttpHeader {
impl TryFrom<&'_ str> for HttpHeader {
type Error = Error;

fn try_from(s: &'_ str) -> Result<Self, Error> { HttpHeader::from_str(s) }
fn try_from(s: &'_ str) -> Result<Self, Error> {
HttpHeader::from_str(s)
}
}

impl TryFrom<String> for HttpHeader {
Expand All @@ -176,13 +182,19 @@ impl Into<String> for HttpHeader {
}
}

fn default_cache_timeout() -> u64 { Config::DEFAULT_CACHE_TIMEOUT.as_secs() }
fn default_user_agent() -> String { Config::DEFAULT_USER_AGENT.to_string() }
/// Serde default for the cache timeout, in whole seconds.
fn default_cache_timeout() -> u64 {
    let timeout = Config::DEFAULT_CACHE_TIMEOUT;
    timeout.as_secs()
}

/// Serde default for the HTTP `User-Agent` header value.
fn default_user_agent() -> String {
    String::from(Config::DEFAULT_USER_AGENT)
}

fn interpolate_env(value: &str) -> Result<HeaderValue, Error> {
use std::{iter::Peekable, str::CharIndices};

fn is_ident(ch: char) -> bool { ch.is_ascii_alphanumeric() || ch == '_' }
fn is_ident(ch: char) -> bool {
ch.is_ascii_alphanumeric() || ch == '_'
}

fn ident_end(start: usize, iter: &mut Peekable<CharIndices>) -> usize {
let mut end = start;
Expand Down Expand Up @@ -268,7 +280,9 @@ impl WarningPolicy {
}

impl Default for WarningPolicy {
fn default() -> WarningPolicy { WarningPolicy::Warn }
fn default() -> WarningPolicy {
WarningPolicy::Warn
}
}

#[cfg(test)]
Expand Down
164 changes: 164 additions & 0 deletions src/latex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/// This module provides an (experimental ad-hoc) functionality of
/// supporting latex in `mdbook-linkcheck`.
use std::collections::HashSet;

/// A struct that maps text changes from file B to file A, where file
/// A is original and B is modified. It is used to map back error
/// positions after A is altered into B by regexes that cut out latex
/// fragments.
pub(crate) struct ByteIndexMap {
    /// Mapping from B to A stored as `(b_i, a_i)` pairs.
    ///
    /// The pairs are strictly monotonically increasing in both
    /// components, i.e. it always holds that `b_{i+1} > b_i` and
    /// `a_{i+1} > a_i`.
    mapping: Vec<(u32, u32)>,
    /// Byte positions of A that belong to some replaced range; used to
    /// detect overlapping updates.
    inserted_ranges_a: HashSet<u32>,
}

impl ByteIndexMap {
    /// Creates an empty mapping: B is (so far) identical to A.
    pub fn new() -> Self {
        ByteIndexMap {
            mapping: vec![],
            inserted_ranges_a: HashSet::new(),
        }
    }

    // Internal consistency check function. It can be turned off for
    // efficiency if latex support becomes too slow. But for now I prefer to
    // leave it here @volhovm.
    fn consistency_check(&self, s: &str) {
        let mut prev_b: u32 = 0;
        let mut prev_a: u32 = 0;
        for (ix, (b, a)) in self.mapping.iter().enumerate() {
            if b < &prev_b || a < &prev_a {
                panic!(
                    "Inconsistent {}, ix {:?}, value {:?}, prev values {:?}",
                    s,
                    ix,
                    (b, a),
                    (prev_b, prev_a)
                );
            }
            prev_b = *b;
            prev_a = *a;
        }
    }

    /// Records that the byte range `[start, end)` of file A has been
    /// replaced by `len_b` bytes in file B.
    ///
    /// # Panics
    ///
    /// Panics if `end < start`, if the range overlaps a previously
    /// recorded one, or if the internal consistency check fails.
    pub fn update(&mut self, start: u32, end: u32, len_b: u32) {
        assert!(end >= start);
        // Mark every byte of A in the range as replaced. `insert`
        // returns `false` when the position was already recorded, which
        // means two replaced ranges overlap.
        for i in start..end {
            assert!(self.inserted_ranges_a.insert(i), "Collision on {:?}", i);
        }
        self.consistency_check("Before update");
        // Find where the new pair belongs so `mapping` stays sorted by
        // A-position.
        let insert_ix = match self
            .mapping
            .iter()
            .enumerate()
            .find(|(_ix, (_pos_b, pos_a))| pos_a > &start)
        {
            Some((ix, (_, pos_a))) => {
                // chunks must not overlap
                assert!(end < *pos_a);
                ix
            },
            None => self.mapping.len(),
        };
        // Closest recorded pair at or before `start` (or the origin).
        let (pos_b, pos_a) = if insert_ix > 0 {
            self.mapping[insert_ix - 1]
        } else {
            (0, 0)
        };
        assert!(start >= pos_a);
        let delta_same = start - pos_a;
        // A: (start,end)
        // ... maps to
        // B: (cur_b + delta_same, cur_b + delta_same + repl_length)
        let new_a = end;
        let new_b = pos_b + (delta_same + len_b);
        assert!(new_a >= pos_a);
        assert!(new_b >= pos_b);
        self.mapping.insert(insert_ix, (new_b, new_a));

        // Remap all the following pieces: their B positions shift by
        // the difference between the replaced and replacement lengths.
        let mut prev_b: u32 = new_b;
        let len_a = end - start;
        for i in insert_ix + 1..self.mapping.len() {
            let (b, a) = self.mapping[i];
            let updated_b = b - len_a + len_b;
            self.mapping[i] = (updated_b, a);
            assert!(updated_b >= prev_b);
            prev_b = updated_b;
        }
        self.consistency_check("After update");
    }

    /// Given a position in file B, returns a corresponding position in file A.
    pub fn resolve(&self, input_b: u32) -> u32 {
        // Find the first recorded pair strictly after `input_b` ...
        let ix = match self
            .mapping
            .iter()
            .enumerate()
            .find(|(_ix, (pos_b, _pos_a))| pos_b > &input_b)
        {
            Some((ix, _)) => ix,
            None => self.mapping.len(),
        };
        // ... then offset from the pair just before it (or the origin).
        let (pos_b, pos_a) = if ix > 0 { self.mapping[ix - 1] } else { (0, 0) };

        pos_a + (input_b - pos_b)
    }
}

/// Filters out latex code snippets from md files to avoid false link
/// matches.
///
/// Returns the filtered source together with a `ByteIndexMap` that maps
/// byte positions in the filtered text back to positions in the
/// original text, so diagnostics can point at the right place.
pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) {
    use regex::Regex;

    let mut byte_index_map = ByteIndexMap::new();
    let mut src: String = src.to_string();

    // Replaces every match of `regex_expr` in `src` with `replacement`,
    // recording each replaced span in `byte_index_map`.
    let mut process_regex = |regex_expr: &str, replacement: &str| {
        let reg = Regex::new(regex_expr).unwrap();
        let repl_length = replacement.len() as u32;

        // Collect the updates first: `byte_index_map` must not be
        // mutated while we are still resolving positions against it.
        let mut byte_index_map_upds = vec![];
        for mtch in reg.find_iter(&src) {
            let start = mtch.start() as u32;
            let end = mtch.end() as u32;
            // Positions are relative to the current (already partially
            // filtered) text; map them back to the original first.
            let orig_start = byte_index_map.resolve(start);
            byte_index_map_upds.push((
                orig_start,
                orig_start + (end - start),
                repl_length,
            ));
        }

        // update source and byte_index_map
        for (start, end, length) in byte_index_map_upds {
            byte_index_map.update(start, end, length);
        }
        src = reg.replace_all(&src, replacement).to_string();
    };

    // Everything between a pair of $$ including newlines
    process_regex(r"\$\$[^\$]*\$\$", "LATEX_DOUBLE_DOLLAR_SUBSTITUTED");
    // Everything between a pair of $ excluding newlines
    process_regex(r"\$[^\$\n\r]*\$", "LATEX_SINGLE_DOLLAR_SUBSTITUTED");
    // Everything between \( and \) excluding newlines. The `*?` is
    // lazy on purpose: a greedy `*` would merge `\(a\) text \(b\)`
    // into one match and wrongly cut out the text in between.
    process_regex(
        r"\\\([^\n\r]*?\\\)",
        "LATEX_ESCAPED_PARENTHESIS_SUBSTITUTED",
    );
    // Everything between \[ and \] including newlines (`(?s)` makes
    // `.` match newlines too); lazy for the same reason as above.
    process_regex(
        r"(?s)\\\[.*?\\\]",
        "LATEX_ESCAPED_SQUARE_BRACKET_SUBSTITUTED",
    );

    (src, byte_index_map)
}
21 changes: 13 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pub const COMPATIBLE_MDBOOK_VERSIONS: &str = "^0.4.0";
mod config;
mod context;
mod hashed_regex;
mod latex;
mod links;
mod validate;

Expand Down Expand Up @@ -65,6 +66,7 @@ pub fn run(
colour: ColorChoice,
ctx: &RenderContext,
selected_files: Option<Vec<String>>,
latex_support: bool,
) -> Result<(), Error> {
let mut cache = if let Some(cache_file) = cache_file {
load_cache(cache_file)
Expand All @@ -75,7 +77,11 @@ pub fn run(
log::info!("Started the link checker");
log::debug!("Selected file: {:?}", selected_files);

let cfg = crate::get_config(&ctx.config)?;
let cfg = {
let mut cfg = crate::get_config(&ctx.config)?;
cfg.latex_support = latex_support;
cfg
};
crate::version_check(&ctx.version)?;

if log::log_enabled!(log::Level::Trace) {
Expand Down Expand Up @@ -153,11 +159,10 @@ where
match item {
BookItem::Chapter(ref ch) => {
if let Some(ref path) = ch.path {
if filter(&path) {
let id = dest.add(
path.display().to_string(),
ch.content.clone(),
);
if filter(path) {
let path_str = path.display().to_string();
let content = ch.content.clone();
let id = dest.add(path_str, content);
ids.push(id);
}
}
Expand Down Expand Up @@ -194,11 +199,11 @@ where
F: Fn(&Path) -> bool,
{
log::info!("Scanning book for links");
let mut files = Files::new();
let mut files: Files<String> = Files::new();
let file_ids =
crate::load_files_into_memory(&ctx.book, &mut files, file_filter);
let (links, incomplete_links) =
crate::extract_links(file_ids.clone(), &files);
crate::extract_links(cfg, file_ids.clone(), &files);
log::info!(
"Found {} links ({} incomplete links)",
links.len(),
Expand Down
Loading

0 comments on commit ab65a4f

Please sign in to comment.