diff --git a/Cargo.lock b/Cargo.lock index dfef4c97..f606ef73 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -442,6 +442,7 @@ dependencies = [ "pretty_assertions", "pulldown-cmark", "pulldown-cmark-to-cmark", + "regex", "semver", "serde_json", "tempfile", @@ -600,9 +601,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.3" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" +checksum = "12de2eff854e5fa4b1295edd650e227e9d8fb0c9e90b12e7f36d6a6811791a29" dependencies = [ "aho-corasick", "memchr", @@ -612,9 +613,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" +checksum = "49530408a136e16e5b486e883fbb6ba058e8e4e8ae6621a77b048b314336e629" dependencies = [ "aho-corasick", "memchr", @@ -623,9 +624,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "rustix" diff --git a/Cargo.toml b/Cargo.toml index 4f1ff209..898c91d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ mdbook = { version = "0.4.25", default-features = false } polib = "0.2.0" pulldown-cmark = { version = "0.9.2", default-features = false } pulldown-cmark-to-cmark = "10.0.4" +regex = "1.9.4" semver = "1.0.16" serde_json = "1.0.91" diff --git a/USAGE.md b/USAGE.md index c62fd638..eeaa8bbf 100644 --- a/USAGE.md +++ b/USAGE.md @@ -182,6 +182,34 @@ Please see the [`publish.yml`] workflow in the Comprehensive Rust 🦀 repositor [`publish.yml`]: 
https://github.com/google/comprehensive-rust/blob/main/.github/workflows/publish.yml +## Marking Sections to be Skipped for Translation + +A block can be marked to be skipped for translation by prepending a special HTML +comment `<!-- mdbook-xgettext: skip -->` to it. + +For example: + +````markdown +The following code block should not be translated. + +<!-- mdbook-xgettext: skip --> + +``` +fn hello() { + println!("Hello world!"); +} +``` + +Itemized list: + +- A should be translated. + +<!-- mdbook-xgettext: skip --> + +- B should be skipped. +- C should be translated. +```` + ## Normalizing Existing PO Files When mdbook-i18n-helpers change, the generated PO files change as well. This can diff --git a/src/lib.rs b/src/lib.rs index a3263d51..e369d08c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,8 @@ use polib::catalog::Catalog; use pulldown_cmark::{Event, LinkType, Tag}; use pulldown_cmark_to_cmark::{cmark_resume_with_options, Options, State}; +use regex::Regex; +use std::sync::OnceLock; /// Like `mdbook::utils::new_cmark_parser`, but also passes a /// `BrokenLinkCallback`. @@ -183,6 +185,20 @@ pub enum Group<'a> { pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec> { let mut groups = Vec::new(); + #[derive(Debug)] + struct GroupingContext { + skip_next_group: bool, + // TODO: this struct is planned to expand with translator + // comments and message contexts. + } + impl GroupingContext { + fn clear_skip_next_group(self) -> Self { + Self { + skip_next_group: false, + } + } + } + #[derive(Debug)] enum State { Translate(usize), @@ -190,15 +206,33 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec> { } impl State { - fn into_group<'a>(self, idx: usize, events: &'a [(usize, Event<'a>)]) -> Group<'a> { + /// Creates a group based on the capturing state and context. 
+ fn into_group<'a>( + self, + idx: usize, + events: &'a [(usize, Event<'a>)], + ctx: GroupingContext, + ) -> (Group<'a>, GroupingContext) { match self { - State::Translate(start) => Group::Translate(&events[start..idx]), - State::Skip(start) => Group::Skip(&events[start..idx]), + State::Translate(start) => { + if ctx.skip_next_group { + ( + Group::Skip(&events[start..idx]), + ctx.clear_skip_next_group(), + ) + } else { + (Group::Translate(&events[start..idx]), ctx) + } + } + State::Skip(start) => (Group::Skip(&events[start..idx]), ctx), } } } let mut state = State::Skip(0); + let mut ctx = GroupingContext { + skip_next_group: false, + }; for (idx, (_, event)) in events.iter().enumerate() { match event { @@ -207,13 +241,19 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec> { // make the group self-contained. Event::Start(Tag::Paragraph | Tag::CodeBlock(..)) => { // A translatable group starts here. - groups.push(state.into_group(idx, events)); + let next_group; + (next_group, ctx) = state.into_group(idx, events, ctx); + groups.push(next_group); + state = State::Translate(idx); } Event::End(Tag::Paragraph | Tag::CodeBlock(..)) => { // A translatable group ends after `idx`. let idx = idx + 1; - groups.push(state.into_group(idx, events)); + let next_group; + (next_group, ctx) = state.into_group(idx, events, ctx); + groups.push(next_group); + state = State::Skip(idx); } @@ -231,17 +271,41 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec> { | Event::HardBreak => { // If we're currently skipping, then a new // translatable group starts here. - if let State::Skip(start) = state { - groups.push(Group::Skip(&events[start..idx])); + if let State::Skip(_) = state { + let next_group; + (next_group, ctx) = state.into_group(idx, events, ctx); + groups.push(next_group); + state = State::Translate(idx); } } + // An HTML comment directive to skip the next translation + // group. 
+ Event::Html(s) if is_comment_skip_directive(s) => { + // If in the middle of translation, finish it. + if let State::Translate(_) = state { + let next_group; + (next_group, ctx) = state.into_group(idx, events, ctx); + groups.push(next_group); + + // Restart translation: subtle but should be + // needed to handle the skipping of the rest of + // the inlined content. + state = State::Translate(idx); + } + + ctx.skip_next_group = true; + } + // All other block-level events start or continue a // skipping group. _ => { - if let State::Translate(start) = state { - groups.push(Group::Translate(&events[start..idx])); + if let State::Translate(_) = state { + let next_group; + (next_group, ctx) = state.into_group(idx, events, ctx); + groups.push(next_group); + state = State::Skip(idx); } } @@ -256,6 +320,15 @@ pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec> { groups } +/// Check whether the HTML is a directive to skip the next translation group. +fn is_comment_skip_directive(html: &str) -> bool { + static RE: OnceLock = OnceLock::new(); + + let re = + RE.get_or_init(|| Regex::new(r"").unwrap()); + re.is_match(html.trim()) +} + /// Render a slice of Markdown events back to Markdown. 
/// /// # Examples @@ -365,6 +438,7 @@ pub fn extract_messages(document: &str) -> Vec<(usize, String)> { let events = extract_events(document, None); let mut messages = Vec::new(); let mut state = None; + for group in group_events(&events) { match group { Group::Translate(events) => { @@ -578,6 +652,19 @@ mod tests { ); } + #[test] + fn extract_events_comments() { + assert_eq!( + extract_events("\nHello", None), + vec![ + (1, Html("\n".into())), + (2, Start(Paragraph)), + (2, Text("Hello".into())), + (2, End(Paragraph)), + ] + ); + } + #[test] fn extract_messages_empty() { assert_extract_messages("", vec![]); @@ -951,4 +1038,166 @@ BOB ], ); } + + #[test] + fn test_is_comment_skip_directive_simple() { + assert_eq!( + is_comment_skip_directive(""), + true + ); + } + + #[test] + fn test_is_comment_skip_directive_tolerates_spaces() { + assert_eq!( + is_comment_skip_directive(""), + true + ); + } + + #[test] + fn test_is_comment_skip_directive_tolerates_dashes() { + assert_eq!( + is_comment_skip_directive(""), + true + ); + } + + #[test] + fn test_is_comment_skip_directive_needs_skip() { + assert_eq!( + is_comment_skip_directive(""), + false + ); + } + #[test] + fn test_is_comment_skip_directive_needs_to_be_a_comment() { + assert_eq!( + is_comment_skip_directive("
mdbook-xgettext: skip
"), + false + ); + } + + #[test] + fn extract_messages_skip_simple() { + assert_extract_messages( + r#" + +This is a paragraph."#, + vec![], + ); + } + + #[test] + fn extract_messages_skip_next_paragraph_ok() { + assert_extract_messages( + r#" +This is a paragraph. + +This should be translated. +"#, + vec![(4, "This should be translated.")], + ); + } + + #[test] + fn extract_messages_skip_next_codeblock() { + assert_extract_messages( + r#" +``` +def f(x): return x * x +``` +This should be translated. +"#, + vec![(5, "This should be translated.")], + ); + } + + #[test] + fn extract_messages_skip_back_to_back() { + assert_extract_messages( + r#" +``` +def f(x): return x * x +``` + +This should not translated. + +But *this* should! +"#, + vec![(8, "But _this_ should!")], + ); + } + + #[test] + fn extract_messages_inline_skips() { + assert_extract_messages( + " +this should be translated but not this. +... nor this. + +But *this* should!", + vec![(2, "this should be translated "), (5, "But _this_ should!")], + ); + } + + #[test] + fn extract_messages_skipping_second_item() { + assert_extract_messages( + " +* A + +* B +* C +", + vec![(2, "A"), (5, "C")], + ); + } + + #[test] + fn extract_messages_skipping_second_paragraphed_item() { + assert_extract_messages( + " +* A + + +* B + +* C +", + vec![(2, "A"), (7, "C")], + ); + } + + #[test] + fn extract_messages_skipping_inline_second_item_buggy() { + // This isn't great: we lose text following a HTML comment. + // Very similar to the failure mode of the + // `extract_messages_details` test. + // + // The root cause appears to be a bug in the Markdown parser + // because it's not separating HTML element from text that + // immediately follows it. 
+ // + // Related: https://github.com/raphlinus/pulldown-cmark/issues/712 + assert_extract_messages( + " +* A +* B +* C +", + vec![(2, "A")], + ); + } + + #[test] + fn extract_messages_inline_skip_to_end_of_block() { + assert_extract_messages( + "foo **bold** bar +still skipped + +not-skipped", + vec![(1, "foo "), (4, "not-skipped")], + ); + } }