From 603437957c42927dacc20b280ea726e85c310f48 Mon Sep 17 00:00:00 2001 From: Mikhail Volkhov Date: Thu, 30 Nov 2023 12:38:43 +0000 Subject: [PATCH 1/2] Add latex support via regex filtering --- src/bin/mdbook-linkcheck.rs | 3 +- src/config.rs | 6 ++ src/latex.rs | 164 ++++++++++++++++++++++++++++++++++++ src/lib.rs | 14 +-- src/links.rs | 70 +++++++++++---- tests/smoke_tests.rs | 26 ++++-- 6 files changed, 252 insertions(+), 31 deletions(-) create mode 100644 src/latex.rs diff --git a/src/bin/mdbook-linkcheck.rs b/src/bin/mdbook-linkcheck.rs index 3df775088..0582b54ed 100644 --- a/src/bin/mdbook-linkcheck.rs +++ b/src/bin/mdbook-linkcheck.rs @@ -55,7 +55,8 @@ struct Args { #[structopt( short = "f", long = "files", - help = "Check only the given files (check all files if omitted)." + help = "Check only the given files (check all files if omitted). +Paths must be relative to the book root, e.g. 'chapter1/section1.md'." )] selected_files: Option>, #[structopt( diff --git a/src/config.rs b/src/config.rs index 67efdc890..73e969b1b 100644 --- a/src/config.rs +++ b/src/config.rs @@ -22,6 +22,9 @@ pub struct Config { pub follow_web_links: bool, /// Are we allowed to link to files outside of the book's source directory? pub traverse_parent_directories: bool, + /// Turns on support for latex. If true, then the latex fragments will be + /// cut off before the file is processed for link consistency. + pub latex_support: bool, /// A list of URL patterns to ignore when checking remote links. 
#[serde(default)] pub exclude: Vec, @@ -124,6 +127,7 @@ impl Default for Config { Config { follow_web_links: false, traverse_parent_directories: false, + latex_support: false, exclude: Vec::new(), user_agent: default_user_agent(), http_headers: HashMap::new(), @@ -279,6 +283,7 @@ mod tests { const CONFIG: &str = r#"follow-web-links = true traverse-parent-directories = true +latex-support = true exclude = ["google\\.com"] user-agent = "Internet Explorer" cache-timeout = 3600 @@ -306,6 +311,7 @@ https = ["accept: html/text", "authorization: Basic $TOKEN"] ], )]), cache_timeout: 3600, + latex_support: true, }; let got: Config = toml::from_str(CONFIG).unwrap(); diff --git a/src/latex.rs b/src/latex.rs new file mode 100644 index 000000000..5449bbdec --- /dev/null +++ b/src/latex.rs @@ -0,0 +1,164 @@ +/// This module provides an (experimental ad-hoc) functionality of +/// supporting latex in `mdbook-linkcheck`. +use std::collections::HashSet; + +/// A struct that maps text changes from file B to file A, where file +/// A is original and B is modified. It is used to map back error +/// positions after A is altered into B by regexes that cut out latex +/// fragments. +pub(crate) struct ByteIndexMap { + /// Mapping from B to A stored as (b_i,a_i), in + /// monotonically increasing order. + /// + /// I.e. it always holds that b_{i+1} > b_{i} && a_{i+1} > a_i. + mapping: Vec<(u32, u32)>, + /// Ranges in A that are altered. + inserted_ranges_a: HashSet, +} + +impl ByteIndexMap { + pub fn new() -> Self { + ByteIndexMap { + mapping: vec![], + inserted_ranges_a: HashSet::new(), + } + } + + // Internal consistency check function. It can be turned off for + // efficiency if latex support becomes too slow. But for now I prefer to + // leave it here @volhovm. 
+ fn consistency_check(&self, s: &str) { + let mut prev_b: u32 = 0; + let mut prev_a: u32 = 0; + for (ix, (b, a)) in self.mapping.iter().enumerate() { + if b < &prev_b || a < &prev_a { + panic!( + "Inconsistent {}, ix {:?}, value {:?}, prev values {:?}", + s, + ix, + (b, a), + (prev_b, prev_a) + ); + } + prev_b = *b; + prev_a = *a; + } + } + + pub fn update(&mut self, start: u32, end: u32, len_b: u32) { + assert!(end >= start); + let start_end_range: Vec = (start..end).collect(); + for i in start_end_range.iter() { + assert!( + !self.inserted_ranges_a.contains(i), + "Collision on {:?}", + i + ); + self.inserted_ranges_a.insert(*i); + } + self.consistency_check("Before update"); + let insert_ix = match self + .mapping + .iter() + .enumerate() + .find(|(_ix, (_pos_b, pos_a))| pos_a > &start) + { + Some((ix, (_, pos_a))) => { + // chunks must not overlap + assert!(end < *pos_a); + ix + }, + None => self.mapping.len(), + }; + let (pos_b, pos_a) = if insert_ix > 0 { + self.mapping[insert_ix - 1] + } else { + (0, 0) + }; + assert!(start >= pos_a); + let delta_same = start - pos_a; + // A: (start,end) + // ... maps to + // B: (cur_b + delta_same, cur_b + delta_same + repl_length) + let new_a = end; + let new_b = pos_b + (delta_same + len_b); + assert!(new_a >= pos_a); + assert!(new_b >= pos_b); + self.mapping.insert(insert_ix, (new_b, new_a)); + + // Remap all the following pieces. + let mut prev_b: u32 = new_b; + let len_a = end - start; + for i in insert_ix + 1..self.mapping.len() { + let (b, a) = self.mapping[i]; + let updated_b = b - len_a + len_b; + self.mapping[i] = (updated_b, a); + assert!(updated_b >= prev_b); + prev_b = updated_b; + } + self.consistency_check("After update"); + } + + /// Given a position in file B, returns a corresponding position in file A. 
+ pub fn resolve(&self, input_b: u32) -> u32 { + let ix = match self + .mapping + .iter() + .enumerate() + .find(|(_ix, (pos_b, _pos_a))| pos_b > &input_b) + { + Some((ix, _)) => ix, + None => self.mapping.len(), + }; + let (pos_b, pos_a) = if ix > 0 { self.mapping[ix - 1] } else { (0, 0) }; + + pos_a + (input_b - pos_b) + } +} + +/// Filters out latex code snippets from md files to avoid false link +/// matches. +pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) { + use regex::Regex; + + let mut byte_index_map = ByteIndexMap::new(); + let mut src: String = src.to_string(); + + let mut process_regex = |regex_expr: &str, replacement: &str| { + let mut byte_index_map_upds = vec![]; + let reg = Regex::new(regex_expr).unwrap(); + for captures in reg.captures_iter(&src) { + if let Some(mtch) = captures.get(0) { + let start = mtch.start() as u32; + let end = mtch.end() as u32; + + let repl_length = replacement.len() as u32; + byte_index_map_upds.push(( + byte_index_map.resolve(start), + byte_index_map.resolve(start) + end - start, + repl_length, + )); + } + } + + // update source and byte_index_map + for (start, end, length) in byte_index_map_upds { + byte_index_map.update(start, end, length); + } + src = reg.replace_all(&src, replacement).to_string(); + }; + + // Everything between a pair of $$ including newlines + process_regex(r"\$\$[^\$]*\$\$", "LATEX_DOUBLE_DOLLAR_SUBSTITUTED"); + // Everything between a pair of $ excluding newlines + process_regex(r"\$[^\$\n\r]*\$", "LATEX_SINGLE_DOLLAR_SUBSTITUTED"); + // Everything between \( and \) excluding newlines + process_regex(r"\\\([^\n\r]*\\\)", "LATEX_ESCAPED_PARENTHESIS_SUBSTITUTED"); + // Everything between \[ and \] including newlines + process_regex( + r"\\\[(.|\r\n|\r|\n)*\\\]", + "LATEX_ESCAPED_SQUARE_BRACKET_SUBSTITUTED", + ); + + (src.to_string(), byte_index_map) +} diff --git a/src/lib.rs b/src/lib.rs index 76f361917..3cef84a83 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,7 @@ pub 
const COMPATIBLE_MDBOOK_VERSIONS: &str = "^0.4.0"; mod config; mod context; mod hashed_regex; +mod latex; mod links; mod validate; @@ -153,11 +154,10 @@ where match item { BookItem::Chapter(ref ch) => { if let Some(ref path) = ch.path { - if filter(&path) { - let id = dest.add( - path.display().to_string(), - ch.content.clone(), - ); + if filter(path) { + let path_str = path.display().to_string(); + let content = ch.content.clone(); + let id = dest.add(path_str, content); ids.push(id); } } @@ -194,11 +194,11 @@ where F: Fn(&Path) -> bool, { log::info!("Scanning book for links"); - let mut files = Files::new(); + let mut files: Files = Files::new(); let file_ids = crate::load_files_into_memory(&ctx.book, &mut files, file_filter); let (links, incomplete_links) = - crate::extract_links(file_ids.clone(), &files); + crate::extract_links(cfg, file_ids.clone(), &files); log::info!( "Found {} links ({} incomplete links)", links.len(), diff --git a/src/links.rs b/src/links.rs index 77aac841d..8ba6d1590 100644 --- a/src/links.rs +++ b/src/links.rs @@ -1,4 +1,6 @@ -use codespan::{FileId, Files, Span}; +use crate::config::Config; +use crate::latex::{filter_out_latex, ByteIndexMap}; +use codespan::{ByteIndex, FileId, Files, Span}; use linkcheck::Link; use pulldown_cmark::{BrokenLink, CowStr}; use std::{cell::RefCell, fmt::Debug}; @@ -6,6 +8,7 @@ use std::{cell::RefCell, fmt::Debug}; /// Search every file in the [`Files`] and collate all the links that are /// found. pub fn extract( + cfg: &Config, target_files: I, files: &Files, ) -> (Vec, Vec) @@ -17,25 +20,58 @@ where for file_id in target_files { let src = files.source(file_id); + + let (src, byte_index_map) = if cfg.latex_support { + filter_out_latex(src) + } else { + (src.clone(), ByteIndexMap::new()) + }; + log::debug!("Scanning {}", files.name(file_id).to_string_lossy()); - links.extend(scan_links(file_id, &*src, &mut |broken_link| { - let BrokenLink { - reference, span, .. 
- } = broken_link; - log::debug!( - "Found a (possibly) broken link to [{}] at {:?}", - reference, - span - ); + let mapspan = |span: Span| { + Span::new( + ByteIndex( + byte_index_map.resolve(span.start().to_usize() as u32), + ), + ByteIndex(byte_index_map.resolve(span.end().to_usize() as u32)), + ) + }; + + links.extend( + scan_links(file_id, &src, &mut |broken_link| { + let BrokenLink { + reference, span, .. + } = broken_link; + log::debug!( + "Found a (possibly) broken link to [{}] at {:?}", + reference, + span + ); + + ////assert!(false, "kek panic, unreachable?"); + //println!( + // "start {:?} end {:?} res_a {:?} res_b {:?}", + // span.start, + // span.end, + // ByteIndex(byte_index_map.resolve(span.start as u32)), + // ByteIndex(byte_index_map.resolve(span.end as u32)) + //); + let origspan = Span::new( + ByteIndex(span.start as u32), + ByteIndex(span.end as u32), + ); + let span = mapspan(origspan); - broken_links.borrow_mut().push(IncompleteLink { - reference: broken_link.reference.to_string(), - span: Span::new(span.start as u32, span.end as u32), - file: file_id, - }); - None - })); + broken_links.borrow_mut().push(IncompleteLink { + reference: broken_link.reference.to_string(), + span, + file: file_id, + }); + None + }) + .map(|link| Link::new(link.href, mapspan(link.span), link.file)), + ); } (links, broken_links.into_inner()) diff --git a/tests/smoke_tests.rs b/tests/smoke_tests.rs index 947e47e7d..b5fc81693 100644 --- a/tests/smoke_tests.rs +++ b/tests/smoke_tests.rs @@ -4,11 +4,22 @@ extern crate pretty_assertions; use anyhow::Error; use codespan::{FileId, Files}; use linkcheck::validation::{Cache, Reason}; -use mdbook::{renderer::{RenderContext, Renderer}, MDBook}; +use mdbook::{ + renderer::{RenderContext, Renderer}, + MDBook, +}; use mdbook_linkcheck::{Config, HashedRegex, ValidationOutcome, WarningPolicy}; -use std::{cell::Cell, collections::HashMap, convert::TryInto, iter::FromIterator, path::{Path, PathBuf}}; - -fn test_dir() -> PathBuf { 
Path::new(env!("CARGO_MANIFEST_DIR")).join("tests") } +use std::{ + cell::Cell, + collections::HashMap, + convert::TryInto, + iter::FromIterator, + path::{Path, PathBuf}, +}; + +fn test_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("tests") +} #[test] fn check_all_links_in_a_valid_book() { @@ -282,8 +293,11 @@ impl Renderer for TestRun { &mut files, noop_filter, ); - let (links, incomplete) = - mdbook_linkcheck::extract_links(file_ids.clone(), &files); + let (links, incomplete) = mdbook_linkcheck::extract_links( + &Default::default(), + file_ids.clone(), + &files, + ); let mut cache = Cache::default(); let outcome = mdbook_linkcheck::validate( From 8cccfc8fee397092ecdf1236a42871c5c980672e Mon Sep 17 00:00:00 2001 From: Mikhail Volkhov Date: Thu, 14 Dec 2023 18:52:02 +0000 Subject: [PATCH 2/2] Add smoke tests for latex --- src/latex.rs | 4 ++ tests/broken-links/src/chapter_1.md | 3 +- tests/latex-support-links/book.toml | 5 +++ tests/latex-support-links/src/SUMMARY.md | 4 ++ tests/latex-support-links/src/chapter_1.md | 35 +++++++++++++++ .../src/second/directory.md | 1 + .../latex-support-links/src/second/sibling.md | 3 ++ tests/smoke_tests.rs | 44 +++++++++++++++++-- 8 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 tests/latex-support-links/book.toml create mode 100644 tests/latex-support-links/src/SUMMARY.md create mode 100644 tests/latex-support-links/src/chapter_1.md create mode 100644 tests/latex-support-links/src/second/directory.md create mode 100644 tests/latex-support-links/src/second/sibling.md diff --git a/src/latex.rs b/src/latex.rs index 5449bbdec..250ac7f50 100644 --- a/src/latex.rs +++ b/src/latex.rs @@ -124,6 +124,8 @@ pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) { let mut byte_index_map = ByteIndexMap::new(); let mut src: String = src.to_string(); + //println!("\n\n\nFile: {}", src); + let mut process_regex = |regex_expr: &str, replacement: &str| { let mut byte_index_map_upds = vec![]; let 
reg = Regex::new(regex_expr).unwrap(); @@ -160,5 +162,7 @@ pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) { "LATEX_ESCAPED_SQUARE_BRACKET_SUBSTITUTED", ); + //println!("\n\n\nFile after: {}", src); + (src.to_string(), byte_index_map) } diff --git a/tests/broken-links/src/chapter_1.md b/tests/broken-links/src/chapter_1.md index 448412d2f..c8f1bb797 100644 --- a/tests/broken-links/src/chapter_1.md +++ b/tests/broken-links/src/chapter_1.md @@ -10,5 +10,6 @@ [incomplete link] -![Missing Image](./asdf.png) +Also if latex support is not enabled, as here, this math expression $[math_var]_5$ \([math_var_2](latex_with_latex_support_disabled)_5\) be parsed as another issue +![Missing Image](./asdf.png) diff --git a/tests/latex-support-links/book.toml b/tests/latex-support-links/book.toml new file mode 100644 index 000000000..7200c8d0f --- /dev/null +++ b/tests/latex-support-links/book.toml @@ -0,0 +1,5 @@ +[book] +authors = ["Michael Bryan"] +multilingual = false +src = "src" +title = "Broken Links" diff --git a/tests/latex-support-links/src/SUMMARY.md b/tests/latex-support-links/src/SUMMARY.md new file mode 100644 index 000000000..ee42c8ac3 --- /dev/null +++ b/tests/latex-support-links/src/SUMMARY.md @@ -0,0 +1,4 @@ +# Summary + +- [Chapter 1](./chapter_1.md) +- [Second Directory](second/directory.md) diff --git a/tests/latex-support-links/src/chapter_1.md b/tests/latex-support-links/src/chapter_1.md new file mode 100644 index 000000000..563cde4f8 --- /dev/null +++ b/tests/latex-support-links/src/chapter_1.md @@ -0,0 +1,35 @@ +# Chapter 1 + +Here is some test $x + y$ that includes latex fragments \(z + x\). + +[Some links work](./chapter_1.md) + +$$ +\begin{align*} +log_k(s) = d +\end{align*} +$$ + +Some of these fragments $(a,b,c,d,e)$ may contain something that looks like links, e.g. \([x]_5\) or $[x]_5$ or $[x](some_latex_value)$ but is, in fact, not a link at all. 
+ +[but linking to a nonexistent domain fails](http://this-doesnt-exist.com.au.nz.us/) + +\[ +\begin{align*} +log_k(a) = d+5 [also_not_a_link]_5 [also_not_a_link](latex_number) +\end{align*} +\] + +[This chapter doesn't exist](./foo/bar/baz.html) + +And sometimes the LaTeX environment is actually broken! For example, single dollar must capture only single-line latex pieces. Therefore if I'm talking about 5$ [and](first_broken_link_nonlatex) +with a dollar $ on the other line, this link should be still considered broken, and must not be erroneously cut out as a latex fragment. + +Same goes for the \( single escaped parenthesis, when talking about 1000$ [this](second_broken_link_nonlatex) and [this_incomplete_link_inside_nonlatex] +must not be cut out, no matter how many $ we talk about. + +[It would be bad if this worked...](../../../../../../../../../../../../etc/shadow) + +[incomplete link] + +![Missing Image](./asdf.png) diff --git a/tests/latex-support-links/src/second/directory.md b/tests/latex-support-links/src/second/directory.md new file mode 100644 index 000000000..458fef4f4 --- /dev/null +++ b/tests/latex-support-links/src/second/directory.md @@ -0,0 +1 @@ +Linking to [files not in `SUMMARY.md`](sibling.md) is an error. diff --git a/tests/latex-support-links/src/second/sibling.md b/tests/latex-support-links/src/second/sibling.md new file mode 100644 index 000000000..ac996cf48 --- /dev/null +++ b/tests/latex-support-links/src/second/sibling.md @@ -0,0 +1,3 @@ +# Sibling + +This file exists on disk, but wasn't included in `SUMMARY.md`. 
diff --git a/tests/smoke_tests.rs index b5fc81693..6dda5bbe9 100644 --- a/tests/smoke_tests.rs +++ b/tests/smoke_tests.rs @@ -64,6 +64,7 @@ fn correctly_find_broken_links() { "./chapter_1.md", "./second/directory.md", "http://this-doesnt-exist.com.au.nz.us/", + "latex_with_latex_support_disabled", "sibling.md", ]; @@ -75,9 +76,46 @@ fn correctly_find_broken_links() { .map(|invalid| invalid.link.href.to_string()) .collect(); assert_same_links(broken, expected); - // we also have one incomplete link - assert_eq!(output.incomplete_links.len(), 1); + // we also have two incomplete links (one normal, one latex) + assert_eq!(output.incomplete_links.len(), 2); assert_eq!(output.incomplete_links[0].reference, "incomplete link"); + assert_eq!(output.incomplete_links[1].reference, "math_var"); +} + +#[test] +fn correctly_find_links_with_latex() { + let root = test_dir().join("latex-support-links"); + let expected = &[ + "./foo/bar/baz.html", + "../../../../../../../../../../../../etc/shadow", + "./asdf.png", + "http://this-doesnt-exist.com.au.nz.us/", + "sibling.md", + "first_broken_link_nonlatex", + "second_broken_link_nonlatex", + ]; + + let config = Config { + follow_web_links: true, + latex_support: true, + ..Default::default() + }; + let output = run_link_checker_with_config(&root, config).unwrap(); + + let broken: Vec<_> = output + .invalid_links + .iter() + .map(|invalid| invalid.link.href.to_string()) + .collect(); + assert_same_links(broken, expected); + + // we also have two incomplete links + assert_eq!(output.incomplete_links.len(), 2); + assert_eq!( + output.incomplete_links[0].reference, + "this_incomplete_link_inside_nonlatex" + ); + assert_eq!(output.incomplete_links[1].reference, "incomplete link"); } #[test] @@ -294,7 +332,7 @@ impl Renderer for TestRun { noop_filter, ); let (links, incomplete) = mdbook_linkcheck::extract_links( - &Default::default(), + &self.config, file_ids.clone(), &files, );