From 603437957c42927dacc20b280ea726e85c310f48 Mon Sep 17 00:00:00 2001 From: Mikhail Volkhov Date: Thu, 30 Nov 2023 12:38:43 +0000 Subject: [PATCH 1/2] Add latex support via regex filtering --- src/bin/mdbook-linkcheck.rs | 3 +- src/config.rs | 6 ++ src/latex.rs | 164 ++++++++++++++++++++++++++++++++++++ src/lib.rs | 14 +-- src/links.rs | 70 +++++++++++---- tests/smoke_tests.rs | 26 ++++-- 6 files changed, 252 insertions(+), 31 deletions(-) create mode 100644 src/latex.rs diff --git a/src/bin/mdbook-linkcheck.rs b/src/bin/mdbook-linkcheck.rs index 3df775088..0582b54ed 100644 --- a/src/bin/mdbook-linkcheck.rs +++ b/src/bin/mdbook-linkcheck.rs @@ -55,7 +55,8 @@ struct Args { #[structopt( short = "f", long = "files", - help = "Check only the given files (check all files if omitted)." + help = "Check only the given files (check all files if omitted). +Paths must be relative to the book root, e.g. 'chapter1/section1.md'." )] selected_files: Option>, #[structopt( diff --git a/src/config.rs b/src/config.rs index 67efdc890..73e969b1b 100644 --- a/src/config.rs +++ b/src/config.rs @@ -22,6 +22,9 @@ pub struct Config { pub follow_web_links: bool, /// Are we allowed to link to files outside of the book's source directory? pub traverse_parent_directories: bool, + /// Turns on support for latex. If true, then the latex fragments will be + /// cut off before the file is processed for link consistency. + pub latex_support: bool, /// A list of URL patterns to ignore when checking remote links. 
#[serde(default)] pub exclude: Vec, @@ -124,6 +127,7 @@ impl Default for Config { Config { follow_web_links: false, traverse_parent_directories: false, + latex_support: false, exclude: Vec::new(), user_agent: default_user_agent(), http_headers: HashMap::new(), @@ -279,6 +283,7 @@ mod tests { const CONFIG: &str = r#"follow-web-links = true traverse-parent-directories = true +latex-support = true exclude = ["google\\.com"] user-agent = "Internet Explorer" cache-timeout = 3600 @@ -306,6 +311,7 @@ https = ["accept: html/text", "authorization: Basic $TOKEN"] ], )]), cache_timeout: 3600, + latex_support: true, }; let got: Config = toml::from_str(CONFIG).unwrap(); diff --git a/src/latex.rs b/src/latex.rs new file mode 100644 index 000000000..5449bbdec --- /dev/null +++ b/src/latex.rs @@ -0,0 +1,164 @@ +/// This module provides an (experimental ad-hoc) functionality of +/// supporting latex in `mdbook-linkcheck`. +use std::collections::HashSet; + +/// A struct that maps text changes from file B to file A, where file +/// A is original and B is modified. It is used to map back error +/// positions after A is altered into B by regexes that cut out latex +/// fragments. +pub(crate) struct ByteIndexMap { + /// Mapping from B to A stored as (b_i,a_i), in + /// monotonically increasing order. + /// + /// I.e. it always holds that b_{i+1} > b_{i} && a_{i+1} > a_i. + mapping: Vec<(u32, u32)>, + /// Ranges in A that are altered. + inserted_ranges_a: HashSet, +} + +impl ByteIndexMap { + pub fn new() -> Self { + ByteIndexMap { + mapping: vec![], + inserted_ranges_a: HashSet::new(), + } + } + + // Internal consistency check function. It can be turned off for + // efficiency if latex support becomes too slow. But for now I prefer to + // leave it here @volhovm. 
+ fn consistency_check(&self, s: &str) { + let mut prev_b: u32 = 0; + let mut prev_a: u32 = 0; + for (ix, (b, a)) in self.mapping.iter().enumerate() { + if b < &prev_b || a < &prev_a { + panic!( + "Inconsistent {}, ix {:?}, value {:?}, prev values {:?}", + s, + ix, + (b, a), + (prev_b, prev_a) + ); + } + prev_b = *b; + prev_a = *a; + } + } + + pub fn update(&mut self, start: u32, end: u32, len_b: u32) { + assert!(end >= start); + let start_end_range: Vec = (start..end).collect(); + for i in start_end_range.iter() { + assert!( + !self.inserted_ranges_a.contains(i), + "Collision on {:?}", + i + ); + self.inserted_ranges_a.insert(*i); + } + self.consistency_check("Before update"); + let insert_ix = match self + .mapping + .iter() + .enumerate() + .find(|(_ix, (_pos_b, pos_a))| pos_a > &start) + { + Some((ix, (_, pos_a))) => { + // chunks must not overlap + assert!(end < *pos_a); + ix + }, + None => self.mapping.len(), + }; + let (pos_b, pos_a) = if insert_ix > 0 { + self.mapping[insert_ix - 1] + } else { + (0, 0) + }; + assert!(start >= pos_a); + let delta_same = start - pos_a; + // A: (start,end) + // ... maps to + // B: (cur_b + delta_same, cur_b + delta_same + repl_length) + let new_a = end; + let new_b = pos_b + (delta_same + len_b); + assert!(new_a >= pos_a); + assert!(new_b >= pos_b); + self.mapping.insert(insert_ix, (new_b, new_a)); + + // Remap all the following pieces. + let mut prev_b: u32 = new_b; + let len_a = end - start; + for i in insert_ix + 1..self.mapping.len() { + let (b, a) = self.mapping[i]; + let updated_b = b - len_a + len_b; + self.mapping[i] = (updated_b, a); + assert!(updated_b >= prev_b); + prev_b = updated_b; + } + self.consistency_check("After update"); + } + + /// Given a position in file B, returns a corresponding position in file A. 
+ pub fn resolve(&self, input_b: u32) -> u32 { + let ix = match self + .mapping + .iter() + .enumerate() + .find(|(_ix, (pos_b, _pos_a))| pos_b > &input_b) + { + Some((ix, _)) => ix, + None => self.mapping.len(), + }; + let (pos_b, pos_a) = if ix > 0 { self.mapping[ix - 1] } else { (0, 0) }; + + pos_a + (input_b - pos_b) + } +} + +/// Filters out latex code snippets from md files to avoid false link +/// matches. +pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) { + use regex::Regex; + + let mut byte_index_map = ByteIndexMap::new(); + let mut src: String = src.to_string(); + + let mut process_regex = |regex_expr: &str, replacement: &str| { + let mut byte_index_map_upds = vec![]; + let reg = Regex::new(regex_expr).unwrap(); + for captures in reg.captures_iter(&src) { + if let Some(mtch) = captures.get(0) { + let start = mtch.start() as u32; + let end = mtch.end() as u32; + + let repl_length = replacement.len() as u32; + byte_index_map_upds.push(( + byte_index_map.resolve(start), + byte_index_map.resolve(start) + end - start, + repl_length, + )); + } + } + + // update source and byte_index_map + for (start, end, length) in byte_index_map_upds { + byte_index_map.update(start, end, length); + } + src = reg.replace_all(&src, replacement).to_string(); + }; + + // Everything between a pair of $$ including newlines + process_regex(r"\$\$[^\$]*\$\$", "LATEX_DOUBLE_DOLLAR_SUBSTITUTED"); + // Everything between a pair of $ excluding newlines + process_regex(r"\$[^\$\n\r]*\$", "LATEX_SINGLE_DOLLAR_SUBSTITUTED"); + // Everything between \( and \) excluding newlines + process_regex(r"\\\([^\n\r]*\\\)", "LATEX_ESCAPED_PARENTHESIS_SUBSTITUTED"); + // Everything between \[ and \] including newlines + process_regex( + r"\\\[(.|\r\n|\r|\n)*\\\]", + "LATEX_ESCAPED_SQUARE_BRACKET_SUBSTITUTED", + ); + + (src.to_string(), byte_index_map) +} diff --git a/src/lib.rs b/src/lib.rs index 76f361917..3cef84a83 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,7 @@ pub 
const COMPATIBLE_MDBOOK_VERSIONS: &str = "^0.4.0"; mod config; mod context; mod hashed_regex; +mod latex; mod links; mod validate; @@ -153,11 +154,10 @@ where match item { BookItem::Chapter(ref ch) => { if let Some(ref path) = ch.path { - if filter(&path) { - let id = dest.add( - path.display().to_string(), - ch.content.clone(), - ); + if filter(path) { + let path_str = path.display().to_string(); + let content = ch.content.clone(); + let id = dest.add(path_str, content); ids.push(id); } } @@ -194,11 +194,11 @@ where F: Fn(&Path) -> bool, { log::info!("Scanning book for links"); - let mut files = Files::new(); + let mut files: Files = Files::new(); let file_ids = crate::load_files_into_memory(&ctx.book, &mut files, file_filter); let (links, incomplete_links) = - crate::extract_links(file_ids.clone(), &files); + crate::extract_links(cfg, file_ids.clone(), &files); log::info!( "Found {} links ({} incomplete links)", links.len(), diff --git a/src/links.rs b/src/links.rs index 77aac841d..8ba6d1590 100644 --- a/src/links.rs +++ b/src/links.rs @@ -1,4 +1,6 @@ -use codespan::{FileId, Files, Span}; +use crate::config::Config; +use crate::latex::{filter_out_latex, ByteIndexMap}; +use codespan::{ByteIndex, FileId, Files, Span}; use linkcheck::Link; use pulldown_cmark::{BrokenLink, CowStr}; use std::{cell::RefCell, fmt::Debug}; @@ -6,6 +8,7 @@ use std::{cell::RefCell, fmt::Debug}; /// Search every file in the [`Files`] and collate all the links that are /// found. pub fn extract( + cfg: &Config, target_files: I, files: &Files, ) -> (Vec, Vec) @@ -17,25 +20,58 @@ where for file_id in target_files { let src = files.source(file_id); + + let (src, byte_index_map) = if cfg.latex_support { + filter_out_latex(src) + } else { + (src.clone(), ByteIndexMap::new()) + }; + log::debug!("Scanning {}", files.name(file_id).to_string_lossy()); - links.extend(scan_links(file_id, &*src, &mut |broken_link| { - let BrokenLink { - reference, span, .. 
- } = broken_link; - log::debug!( - "Found a (possibly) broken link to [{}] at {:?}", - reference, - span - ); + let mapspan = |span: Span| { + Span::new( + ByteIndex( + byte_index_map.resolve(span.start().to_usize() as u32), + ), + ByteIndex(byte_index_map.resolve(span.end().to_usize() as u32)), + ) + }; + + links.extend( + scan_links(file_id, &src, &mut |broken_link| { + let BrokenLink { + reference, span, .. + } = broken_link; + log::debug!( + "Found a (possibly) broken link to [{}] at {:?}", + reference, + span + ); + + ////assert!(false, "kek panic, unreachable?"); + //println!( + // "start {:?} end {:?} res_a {:?} res_b {:?}", + // span.start, + // span.end, + // ByteIndex(byte_index_map.resolve(span.start as u32)), + // ByteIndex(byte_index_map.resolve(span.end as u32)) + //); + let origspan = Span::new( + ByteIndex(span.start as u32), + ByteIndex(span.end as u32), + ); + let span = mapspan(origspan); - broken_links.borrow_mut().push(IncompleteLink { - reference: broken_link.reference.to_string(), - span: Span::new(span.start as u32, span.end as u32), - file: file_id, - }); - None - })); + broken_links.borrow_mut().push(IncompleteLink { + reference: broken_link.reference.to_string(), + span, + file: file_id, + }); + None + }) + .map(|link| Link::new(link.href, mapspan(link.span), link.file)), + ); } (links, broken_links.into_inner()) diff --git a/tests/smoke_tests.rs b/tests/smoke_tests.rs index 947e47e7d..b5fc81693 100644 --- a/tests/smoke_tests.rs +++ b/tests/smoke_tests.rs @@ -4,11 +4,22 @@ extern crate pretty_assertions; use anyhow::Error; use codespan::{FileId, Files}; use linkcheck::validation::{Cache, Reason}; -use mdbook::{renderer::{RenderContext, Renderer}, MDBook}; +use mdbook::{ + renderer::{RenderContext, Renderer}, + MDBook, +}; use mdbook_linkcheck::{Config, HashedRegex, ValidationOutcome, WarningPolicy}; -use std::{cell::Cell, collections::HashMap, convert::TryInto, iter::FromIterator, path::{Path, PathBuf}}; - -fn test_dir() -> PathBuf { 
Path::new(env!("CARGO_MANIFEST_DIR")).join("tests") } +use std::{ + cell::Cell, + collections::HashMap, + convert::TryInto, + iter::FromIterator, + path::{Path, PathBuf}, +}; + +fn test_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("tests") +} #[test] fn check_all_links_in_a_valid_book() { @@ -282,8 +293,11 @@ impl Renderer for TestRun { &mut files, noop_filter, ); - let (links, incomplete) = - mdbook_linkcheck::extract_links(file_ids.clone(), &files); + let (links, incomplete) = mdbook_linkcheck::extract_links( + &Default::default(), + file_ids.clone(), + &files, + ); let mut cache = Cache::default(); let outcome = mdbook_linkcheck::validate( From 8cccfc8fee397092ecdf1236a42871c5c980672e Mon Sep 17 00:00:00 2001 From: Mikhail Volkhov Date: Thu, 14 Dec 2023 18:52:02 +0000 Subject: [PATCH 2/2] Add smoke tests for latex --- src/latex.rs | 4 ++ tests/broken-links/src/chapter_1.md | 3 +- tests/latex-support-links/book.toml | 5 +++ tests/latex-support-links/src/SUMMARY.md | 4 ++ tests/latex-support-links/src/chapter_1.md | 35 +++++++++++++++ .../src/second/directory.md | 1 + .../latex-support-links/src/second/sibling.md | 3 ++ tests/smoke_tests.rs | 44 +++++++++++++++++-- 8 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 tests/latex-support-links/book.toml create mode 100644 tests/latex-support-links/src/SUMMARY.md create mode 100644 tests/latex-support-links/src/chapter_1.md create mode 100644 tests/latex-support-links/src/second/directory.md create mode 100644 tests/latex-support-links/src/second/sibling.md diff --git a/src/latex.rs b/src/latex.rs index 5449bbdec..250ac7f50 100644 --- a/src/latex.rs +++ b/src/latex.rs @@ -124,6 +124,8 @@ pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) { let mut byte_index_map = ByteIndexMap::new(); let mut src: String = src.to_string(); + //println!("\n\n\nFile: {}", src); + let mut process_regex = |regex_expr: &str, replacement: &str| { let mut byte_index_map_upds = vec![]; let 
reg = Regex::new(regex_expr).unwrap(); @@ -160,5 +162,7 @@ pub(crate) fn filter_out_latex(src: &str) -> (String, ByteIndexMap) { "LATEX_ESCAPED_SQUARE_BRACKET_SUBSTITUTED", ); + //println!("\n\n\nFile after: {}", src); + (src.to_string(), byte_index_map) } diff --git a/tests/broken-links/src/chapter_1.md b/tests/broken-links/src/chapter_1.md index 448412d2f..c8f1bb797 100644 --- a/tests/broken-links/src/chapter_1.md +++ b/tests/broken-links/src/chapter_1.md @@ -10,5 +10,6 @@ [incomplete link] -![Missing Image](./asdf.png) +Also if latex support is not enabled, as here, this math expression $[math_var]_5$ \([math_var_2](latex_with_latex_support_disabled)_5\) be parsed as another issue +![Missing Image](./asdf.png) diff --git a/tests/latex-support-links/book.toml b/tests/latex-support-links/book.toml new file mode 100644 index 000000000..7200c8d0f --- /dev/null +++ b/tests/latex-support-links/book.toml @@ -0,0 +1,5 @@ +[book] +authors = ["Michael Bryan"] +multilingual = false +src = "src" +title = "Broken Links" diff --git a/tests/latex-support-links/src/SUMMARY.md b/tests/latex-support-links/src/SUMMARY.md new file mode 100644 index 000000000..ee42c8ac3 --- /dev/null +++ b/tests/latex-support-links/src/SUMMARY.md @@ -0,0 +1,4 @@ +# Summary + +- [Chapter 1](./chapter_1.md) +- [Second Directory](second/directory.md) diff --git a/tests/latex-support-links/src/chapter_1.md b/tests/latex-support-links/src/chapter_1.md new file mode 100644 index 000000000..563cde4f8 --- /dev/null +++ b/tests/latex-support-links/src/chapter_1.md @@ -0,0 +1,35 @@ +# Chapter 1 + +Here is some test $x + y$ that includes latex fragments \(z + x\). + +[Some links work](./chapter_1.md) + +$$ +\begin{align*} +log_k(s) = d +\end{align*} +$$ + +Some of these fragments $(a,b,c,d,e)$ may contain something that looks like links, e.g. \([x]_5\) or $[x]_5$ or $[x](some_latex_value)$ but is, in fact, not a link at all. 
+ +[but linking to a nonexistent domain fails](http://this-doesnt-exist.com.au.nz.us/) + +\[ +\begin{align*} +log_k(a) = d+5 [also_not_a_link]_5 [also_not_a_link](latex_number) +\end{align*} +\] + +[This chapter doesn't exist](./foo/bar/baz.html) + +And sometimes the LaTeX environment is actually broken! For example, single dollar must capture only single-line latex pieces. Therefore if I'm talking about 5$ [and](first_broken_link_nonlatex) +with a dollar $ on the other line, this link should be still considered broken, and must not be erroneously cut out as a latex fragment. + +Same goes for the \( single escaped parenthesis, when talking about 1000$ [this](second_broken_link_nonlatex) and [this_incomplete_link_inside_nonlatex] +must not be cut out, no matter how many $ we talk about. + +[It would be bad if this worked...](../../../../../../../../../../../../etc/shadow) + +[incomplete link] + +![Missing Image](./asdf.png) diff --git a/tests/latex-support-links/src/second/directory.md b/tests/latex-support-links/src/second/directory.md new file mode 100644 index 000000000..458fef4f4 --- /dev/null +++ b/tests/latex-support-links/src/second/directory.md @@ -0,0 +1 @@ +Linking to [files not in `SUMMARY.md`](sibling.md) is an error. diff --git a/tests/latex-support-links/src/second/sibling.md b/tests/latex-support-links/src/second/sibling.md new file mode 100644 index 000000000..ac996cf48 --- /dev/null +++ b/tests/latex-support-links/src/second/sibling.md @@ -0,0 +1,3 @@ +# Sibling + +This file exists on disk, but wasn't included in `SUMMARY.md`. 
diff --git a/tests/smoke_tests.rs index b5fc81693..6dda5bbe9 100644 --- a/tests/smoke_tests.rs +++ b/tests/smoke_tests.rs @@ -64,6 +64,7 @@ fn correctly_find_broken_links() { "./chapter_1.md", "./second/directory.md", "http://this-doesnt-exist.com.au.nz.us/", + "latex_with_latex_support_disabled", "sibling.md", ]; @@ -75,9 +76,46 @@ fn correctly_find_broken_links() { .map(|invalid| invalid.link.href.to_string()) .collect(); assert_same_links(broken, expected); - // we also have one incomplete link - assert_eq!(output.incomplete_links.len(), 1); + // we also have two incomplete links (one normal, one latex) + assert_eq!(output.incomplete_links.len(), 2); assert_eq!(output.incomplete_links[0].reference, "incomplete link"); + assert_eq!(output.incomplete_links[1].reference, "math_var"); +} + +#[test] +fn correctly_find_links_with_latex() { + let root = test_dir().join("latex-support-links"); + let expected = &[ + "./foo/bar/baz.html", + "../../../../../../../../../../../../etc/shadow", + "./asdf.png", + "http://this-doesnt-exist.com.au.nz.us/", + "sibling.md", + "first_broken_link_nonlatex", + "second_broken_link_nonlatex", + ]; + + let config = Config { + follow_web_links: true, + latex_support: true, + ..Default::default() + }; + let output = run_link_checker_with_config(&root, config).unwrap(); + + let broken: Vec<_> = output + .invalid_links + .iter() + .map(|invalid| invalid.link.href.to_string()) + .collect(); + assert_same_links(broken, expected); + + // we also have two incomplete links + assert_eq!(output.incomplete_links.len(), 2); + assert_eq!( + output.incomplete_links[0].reference, + "this_incomplete_link_inside_nonlatex" + ); + assert_eq!(output.incomplete_links[1].reference, "incomplete link"); } #[test] @@ -294,7 +332,7 @@ impl Renderer for TestRun { noop_filter, ); let (links, incomplete) = mdbook_linkcheck::extract_links( - &Default::default(), + &self.config, file_ids.clone(), &files, );