From f3f9b0c98ee8150ee0582b0ffab94c8761907326 Mon Sep 17 00:00:00 2001 From: Michael Schierl Date: Tue, 27 Aug 2024 23:12:00 +0200 Subject: [PATCH] Add ParatextStripped export format --- README.md | 4 +- .../MainModuleRegistry.java | 1 + .../format/paratext/ParatextStripped.java | 547 ++++++++++++++++++ .../format/paratext/USX.java | 92 +-- .../StandardExportWarningMessages.java | 18 + .../format/paratext/USFM3AllTagsTest.java | 132 ++++- 6 files changed, 696 insertions(+), 98 deletions(-) create mode 100644 biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/ParatextStripped.java diff --git a/README.md b/README.md index f07dcbe..e400e7b 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,9 @@ tagged OT/NT/Deuterocanonical content from such a file. It can also be used to c from/to **ParatextDump** (which is a diffable plain text dump of the internal Paratext structure and useful for comparing different Paratext formats), **ParatextCompact* (a more compact representation intended for archival which will remain forward compatible) and **ParatextVPL** (which is -a different diffable format that looks more like VPL, but uses Paratext tags) formats. +a different diffable format that looks more like VPL, but uses Paratext tags) formats. In combination +with the **ParatextStripped** format, various features of the file can be stripped or it can be +made compatible with an older USFM version. 
The **MyBibleZoneListDownloader** tool (part of SQLite edition) can be used to download the list of available MyBible.Zone modules from the module registry (that is also queried diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java b/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java index 6b74595..1290591 100644 --- a/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java +++ b/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java @@ -32,6 +32,7 @@ public Collection> getExportFormats() { result.add(new Module("StrongConcordance", "Add concordance information to a Strong dictionary", StrongConcordance.HELP_TEXT, StrongConcordance.class)); result.add(new Module("ScrambledDiffable", "Like Diffable, but with scrambled text; for tests with non-free bibles.", ScrambledDiffable.HELP_TEXT, ScrambledDiffable.class)); result.add(new Module("ScrambledParatextDump", "Like ParatextDump, but with scrambled text; for tests with non-free bibles.", ScrambledParatextDump.HELP_TEXT, ScrambledParatextDump.class)); + result.add(new Module("ParatextStripped", "Export parts of a Paratext bible", ParatextStripped.HELP_TEXT, ParatextStripped.class)); result.add(new Module("Volksbibel2000", "Export format for reimporting into Volksbibel 2000", Volksbibel2000.HELP_TEXT, Volksbibel2000.class)); result.add(new Module("OnLineBible", "Export format for importing into OnLine Bible", OnLineBible.HELP_TEXT, OnLineBible.class)); result.add(new Module("BrowserBible", "Export format for The Browser Bible 3 by Digital Bible Society", BrowserBible.HELP_TEXT, BrowserBible.class)); diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/ParatextStripped.java b/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/ParatextStripped.java new file mode 100644 index 0000000..e1c2192 --- /dev/null +++ 
b/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/ParatextStripped.java @@ -0,0 +1,547 @@ +package biblemulticonverter.format.paratext; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; + +import biblemulticonverter.Main; +import biblemulticonverter.data.Utils; +import biblemulticonverter.format.paratext.ParatextBook.Figure; +import biblemulticonverter.format.paratext.ParatextBook.ParagraphKind; +import biblemulticonverter.format.paratext.ParatextBook.ParagraphStart; +import biblemulticonverter.format.paratext.ParatextBook.ParatextBookContentPart; +import biblemulticonverter.format.paratext.ParatextBook.ParatextBookContentVisitor; +import biblemulticonverter.format.paratext.ParatextBook.ParatextCharacterContentContainer; +import biblemulticonverter.format.paratext.ParatextBook.PeripheralStart; +import biblemulticonverter.format.paratext.ParatextBook.Remark; +import biblemulticonverter.format.paratext.ParatextBook.SidebarEnd; +import biblemulticonverter.format.paratext.ParatextBook.SidebarStart; +import biblemulticonverter.format.paratext.ParatextBook.TableCellStart; +import biblemulticonverter.format.paratext.ParatextCharacterContent.AutoClosingFormatting; +import biblemulticonverter.format.paratext.ParatextCharacterContent.AutoClosingFormattingKind; +import biblemulticonverter.format.paratext.ParatextCharacterContent.FootnoteXref; +import biblemulticonverter.format.paratext.ParatextCharacterContent.FootnoteXrefKind; +import biblemulticonverter.format.paratext.ParatextCharacterContent.KeepIf; +import biblemulticonverter.format.paratext.ParatextCharacterContent.ParatextCharacterContentPart; +import biblemulticonverter.format.paratext.ParatextCharacterContent.ParatextCharacterContentVisitor; +import 
biblemulticonverter.format.paratext.ParatextCharacterContent.Reference; +import biblemulticonverter.format.paratext.model.ChapterIdentifier; +import biblemulticonverter.format.paratext.model.VerseIdentifier; +import biblemulticonverter.format.paratext.model.Version; + +/** + * Exporter that only exports parts of a Paratext bible. + */ +public class ParatextStripped extends AbstractParatextFormat { + + public static final String[] HELP_TEXT = { + "Export parts of a Paratext bible", + "", + "Usage (export): ParatextStripped [[=]...] -- [...]", + "", + "Supported modes:", + "", + "StripStudyContent", + "StripCustomMarkup", + "StripIntroductions", + "StripTagAttributes", + "StripRemarks", + "StripReferences", + "StripPart={OT|NT|DC}", + "StripStudyCategory=", + "StripParagraph=[#]", + "CompatibleVersion=[version]", + }; + + public ParatextStripped() { + super("ParatextStripped"); + } + + @Override + protected List doImportAllBooks(File inputFile) throws Exception { + throw new UnsupportedOperationException(); + } + + @Override + protected ParatextBook doImportBook(File inputFile) throws Exception { + throw new UnsupportedOperationException(); + } + + @Override + public void doExportBooks(List books, String... exportArgs) throws Exception { + int formatArg = stripBooks(books, exportArgs); + AbstractParatextFormat exportFormat = (AbstractParatextFormat) Main.exportFormats.get(exportArgs[formatArg]).getImplementationClass().newInstance(); + exportFormat.doExportBooks(books, Arrays.copyOfRange(exportArgs, formatArg + 1, exportArgs.length)); + } + + protected int stripBooks(List books, String... 
exportArgs) throws Exception { + Set featuresToStrip = EnumSet.noneOf(ParatextFeature.class); + EnumSet partsToKeep = EnumSet.allOf(KeepIf.class); + Set categoriesToStrip = new HashSet<>(); + Set paragraphKindsToStrip = EnumSet.noneOf(ParagraphKind.class); + Version compatibleVersionInit = Version.V3; + int formatArg = 0; + while (formatArg < exportArgs.length) { + if (exportArgs[formatArg].equals("--")) { + formatArg++; + break; + } else if (exportArgs[formatArg].startsWith("StripStudyCategory=")) { + categoriesToStrip.add(exportArgs[formatArg].substring(19)); + } else if (exportArgs[formatArg].startsWith("StripPart=")) { + partsToKeep.remove(KeepIf.valueOf(exportArgs[formatArg].substring(10))); + } else if (exportArgs[formatArg].startsWith("StripParagraph=")) { + String kind = exportArgs[formatArg].substring(15).replaceFirst("#$", ".*"); + for (ParagraphKind pk : ParagraphKind.values()) { + if (pk.getTag().matches(kind)) + paragraphKindsToStrip.add(pk); + } + } else if (exportArgs[formatArg].startsWith("CompatibleVersion=")) { + String versionString = exportArgs[formatArg].substring(18); + compatibleVersionInit = Version.V1; + for (Version v : Version.values()) { + if (v.toString().compareTo(versionString) <= 0 && compatibleVersionInit.isLowerOrEqualTo(v)) { + compatibleVersionInit = v; + } + } + System.out.println("NOTE: Set compatible version to " + compatibleVersionInit.toString()); + } else { + featuresToStrip.add(ParatextFeature.valueOf(exportArgs[formatArg])); + } + formatArg++; + } + if (featuresToStrip.contains(ParatextFeature.StripIntroductions)) { + for (ParagraphKind pk : ParagraphKind.values()) { + if (pk.name().startsWith("INTRO_")) + paragraphKindsToStrip.add(pk); + } + } + Version compatibleVersion = compatibleVersionInit; + for (int i = 0; i < books.size(); i++) { + ParatextBook book = books.get(i); + boolean keep = true; + if (book.getId().getId().isDeuterocanonical()) { + keep = partsToKeep.contains(KeepIf.DC); + } else if 
(book.getId().getId().isNT()) { + keep = partsToKeep.contains(KeepIf.NT); + } else if (book.getId().getId().getZefID() > 0) { + keep = partsToKeep.contains(KeepIf.OT); + } + if (!keep) { + books.remove(i); + i--; + continue; + } + book.getAttributes().entrySet().removeIf(e -> !ParatextBook.getMinVersionForAttribute(e.getKey()).isLowerOrEqualTo(compatibleVersion)); + if (featuresToStrip.contains(ParatextFeature.StripRemarks)) + book.getAttributes().entrySet().removeIf(e -> e.getKey().equals("rem") || e.getKey().startsWith("rem@")); + for (int j = 0; j < book.getContent().size(); j++) { + boolean[] skipSidebarHolder = { false }, skipParagraphHolder = { false }; + List partHolder = new ArrayList<>(); + partHolder.add(book.getContent().get(j)); + partHolder.get(0).acceptThis(new ParatextBookContentVisitor() { + + @Override + public void visitChapterStart(ChapterIdentifier location) { + } + + @Override + public void visitChapterEnd(ChapterIdentifier location) { + } + + @Override + public void visitRemark(String content) { + if (featuresToStrip.contains(ParatextFeature.StripRemarks)) { + partHolder.clear(); + } + } + + @Override + public void visitParagraphStart(ParagraphKind kind) { + if (paragraphKindsToStrip.contains(kind)) + skipParagraphHolder[0] = true; + else if (!kind.getVersion().isLowerOrEqualTo(compatibleVersion)) { + ParagraphKind replacement = replaceParagraphKind(kind); + partHolder.set(0, new ParagraphStart(replacement)); + } + } + + @Override + public void visitTableCellStart(String tag) { + if (tag.contains("-") && !Version.V3.isLowerOrEqualTo(compatibleVersion)) { + if (tag.contains("-")) { + Matcher m = Utils.compilePattern("(t[hcr]+)([0-9]+)-([0-9]+)").matcher(tag); + if (!m.matches()) + throw new IllegalStateException(); + String prefix = m.group(1); + int min = Integer.parseInt(m.group(2)); + int max = Integer.parseInt(m.group(3)); + partHolder.clear(); + for (int i = min; i <= max; i++) { + partHolder.add(new TableCellStart(prefix + i)); + } + 
return; + } + } + } + + @Override + public void visitSidebarStart(String[] categories) throws RuntimeException { + HashSet cats = new HashSet<>(Arrays.asList(categories)); + cats.retainAll(categoriesToStrip); + if (!cats.isEmpty() || featuresToStrip.contains(ParatextFeature.StripStudyContent) || !Version.V2_1.isLowerOrEqualTo(compatibleVersion)) { + skipSidebarHolder[0] = true; + } + } + + @Override + public void visitSidebarEnd() throws RuntimeException { + } + + public void visitPeripheralStart(String title, String id) throws RuntimeException { + if (id != null && !Version.V3.isLowerOrEqualTo(compatibleVersion)) { + partHolder.set(0, new PeripheralStart(((PeripheralStart) partHolder.get(0)).getTitle(), null)); + } + }; + + @Override + public void visitVerseStart(VerseIdentifier location, String verseNumber) throws RuntimeException { + } + + @Override + public void visitVerseEnd(VerseIdentifier verseLocation) throws RuntimeException { + } + + @Override + public void visitFigure(String caption, Map attributes) throws RuntimeException { + if (!Version.V3.isLowerOrEqualTo(compatibleVersion)) { + attributes.entrySet().removeIf(me -> !Arrays.asList(Figure.FIGURE_PROVIDED_ATTRIBUTES).contains(me.getKey())); + } + } + + @Override + public void visitParatextCharacterContent(ParatextCharacterContent content) { + filterContents(content.getContent(), featuresToStrip, partsToKeep, categoriesToStrip, compatibleVersion); + } + }); + if (skipSidebarHolder[0] && partHolder.get(0) instanceof SidebarStart) { + while (j < book.getContent().size()) { + ParatextBookContentPart removedPart = book.getContent().remove(j); + if (removedPart instanceof SidebarEnd) + break; + } + j--; + } else if (skipParagraphHolder[0] && partHolder.get(0) instanceof ParagraphStart) { + book.getContent().remove(j); + while (j < book.getContent().size() && (book.getContent().get(j) instanceof Figure || + book.getContent().get(j) instanceof Remark || + book.getContent().get(j) instanceof 
ParatextCharacterContent || + book.getContent().get(j) instanceof TableCellStart)) { + book.getContent().remove(j); + } + j--; + } else { + book.getContent().remove(j); + j--; + for (ParatextBookContentPart part : partHolder) { + j++; + book.getContent().add(j, part); + } + } + } + } + return formatArg; + } + + private void filterContents(List parts, Set featuresToStrip, EnumSet partsToKeep, Set categoriesToStrip, Version compatibleVersion) { + for (int i = 0; i < parts.size(); i++) { + boolean[] skipPartHolder = { false, false }; + ParatextCharacterContentPart[] partHolder = { parts.get(i) }; + partHolder[0].acceptThis(new ParatextCharacterContentVisitor() { + + @Override + public ParatextCharacterContentVisitor visitFootnoteXref(FootnoteXrefKind kind, String caller, String[] categories) throws RuntimeException { + if (featuresToStrip.contains(ParatextFeature.StripStudyContent) && (kind == FootnoteXrefKind.STUDY_EXTENDED_FOOTNOTE || kind == FootnoteXrefKind.STUDY_EXTENDED_XREF)) { + skipPartHolder[0] = true; + return null; + } + if (!Version.V2_1.isLowerOrEqualTo(compatibleVersion) && kind == FootnoteXrefKind.STUDY_EXTENDED_FOOTNOTE) { + ParatextCharacterContent.FootnoteXref fx = new FootnoteXref(FootnoteXrefKind.FOOTNOTE, caller, categories); + fx.getContent().addAll(((FootnoteXref) partHolder[0]).getContent()); + partHolder[0] = fx; + } else if (!Version.V2_3.isLowerOrEqualTo(compatibleVersion) && kind == FootnoteXrefKind.STUDY_EXTENDED_XREF) { + ParatextCharacterContent.FootnoteXref fx = new FootnoteXref(FootnoteXrefKind.XREF, caller, categories); + fx.getContent().addAll(((FootnoteXref) partHolder[0]).getContent()); + partHolder[0] = fx; + } + Set cats = new HashSet<>(Arrays.asList(categories)); + cats.retainAll(categoriesToStrip); + if (!cats.isEmpty()) { + skipPartHolder[0] = true; + return null; + } + filterContents(((FootnoteXref) partHolder[0]).getContent(), featuresToStrip, partsToKeep, categoriesToStrip, compatibleVersion); + return null; + } + + 
@Override + public ParatextCharacterContentVisitor visitAutoClosingFormatting(AutoClosingFormattingKind kind, Map attributes) throws RuntimeException { + if (kind.getKeepIf() != null && !partsToKeep.contains(kind.getKeepIf())) { + skipPartHolder[0] = true; + return null; + } + filterContents(((ParatextCharacterContentContainer) partHolder[0]).getContent(), featuresToStrip, partsToKeep, categoriesToStrip, compatibleVersion); + if (featuresToStrip.contains(ParatextFeature.StripTagAttributes)) { + attributes.clear(); + } else if (!Version.V3.isLowerOrEqualTo(compatibleVersion)) { + attributes.keySet().removeIf(k -> !k.equals("lemma")); + } + if (!kind.getVersion().isLowerOrEqualTo(compatibleVersion)) { + List<AutoClosingFormattingKind> replacementList = new ArrayList<>(); + replacementList.add(kind); + for (int i = 0; i < replacementList.size(); i++) { + if (!replacementList.get(i).getVersion().isLowerOrEqualTo(compatibleVersion)) { + AutoClosingFormattingKind[] replacements = replaceAutoClosingFormattingKind(replacementList.get(i)); + if (replacements == null) { + skipPartHolder[0] = true; + return null; + } + replacementList.remove(i); + replacementList.addAll(i, Arrays.asList(replacements)); + i--; + } + } + if (replacementList.size() == 0) { + skipPartHolder[1] = true; + } else { + AutoClosingFormatting acfRoot = new AutoClosingFormatting(replacementList.get(0)); + acfRoot.getAttributes().putAll(attributes); + AutoClosingFormatting acf = acfRoot; + for (int i = 1; i < replacementList.size(); i++) { + AutoClosingFormatting acfParent = acf; + acf = new AutoClosingFormatting(replacementList.get(i)); // nest the i-th replacement kind (was always index 1) + acf.getAttributes().putAll(attributes); + acfParent.getContent().add(acf); + } + acf.getContent().addAll(((AutoClosingFormatting) partHolder[0]).getContent()); + partHolder[0] = acfRoot; + } + } + return null; + } + + @Override + public void visitMilestone(String tag, Map attributes) throws RuntimeException { + if (tag.startsWith("z") && 
featuresToStrip.contains(ParatextFeature.StripCustomMarkup)) { + skipPartHolder[0] = true; + } else if (!Version.V3.isLowerOrEqualTo(compatibleVersion)) { + skipPartHolder[0] = true; + } else if (featuresToStrip.contains(ParatextFeature.StripTagAttributes)) { + attributes.clear(); + } + } + + @Override + public void visitReference(Reference reference) throws RuntimeException { + if (featuresToStrip.contains(ParatextFeature.StripReferences)) { + partHolder[0] = ParatextCharacterContent.Text.from(((Reference) partHolder[0]).getContent()); + } + } + + @Override + public void visitCustomMarkup(String tag, boolean ending) throws RuntimeException { + if (featuresToStrip.contains(ParatextFeature.StripCustomMarkup)) + skipPartHolder[0] = true; + } + + @Override + public void visitSpecialSpace(boolean nonBreakSpace, boolean optionalLineBreak) throws RuntimeException { + } + + @Override + public void visitText(String text) throws RuntimeException { + } + + @Override + public void visitEnd() throws RuntimeException { + } + + }); + if (skipPartHolder[0]) { + parts.remove(i); + i--; + } else if (skipPartHolder[1]) { + ParatextCharacterContentContainer pcc = (ParatextCharacterContentContainer) parts.remove(i); + parts.addAll(i, pcc.getContent()); + i--; + } else { + parts.set(i, partHolder[0]); + } + } + } + + @Override + protected void doExportBook(ParatextBook book, File outFile) throws IOException { + throw new UnsupportedOperationException(); + } + + private static enum ParatextFeature { + StripStudyContent, StripCustomMarkup, StripIntroductions, StripTagAttributes, StripRemarks, StripReferences; + } + + public static ParagraphKind replaceParagraphKind(ParagraphKind kind) { + switch (kind) { + case SECTION_REFERENCE: + return ParagraphKind.MAJOR_SECTION_REFERENCE; + case PARAGRAPH_PMO: + return ParagraphKind.PARAGRAPH_MI; + case PARAGRAPH_PM: + return ParagraphKind.PARAGRAPH_PI; + case PARAGRAPH_PMC: + return ParagraphKind.PARAGRAPH_MI; + case PARAGRAPH_PMR: + return 
ParagraphKind.PARAGRAPH_RIGHT; + case PARAGRAPH_QM: + return ParagraphKind.PARAGRAPH_Q; + case PARAGRAPH_QM1: + return ParagraphKind.PARAGRAPH_Q1; + case PARAGRAPH_QM2: + return ParagraphKind.PARAGRAPH_Q2; + case PARAGRAPH_QM3: + return ParagraphKind.PARAGRAPH_Q3; + case PARAGRAPH_QM4: + return ParagraphKind.PARAGRAPH_Q4; + case HEBREW_NOTE: + // According to documentation this is very similar to `d` + // (ParagraphKind.DESCRIPTIVE_TITLE) + return ParagraphKind.DESCRIPTIVE_TITLE; + case SEMANTIC_DIVISION: + // TODO maybe add more than 1 blank line? + return ParagraphKind.BLANK_LINE; + case SEMANTIC_DIVISION_1: + return ParagraphKind.BLANK_LINE; + case SEMANTIC_DIVISION_2: + return ParagraphKind.BLANK_LINE; + case SEMANTIC_DIVISION_3: + return ParagraphKind.BLANK_LINE; + case SEMANTIC_DIVISION_4: + return ParagraphKind.BLANK_LINE; + case PARAGRAPH_PO: + return ParagraphKind.PARAGRAPH_P; + case PARAGRAPH_LH: + return ParagraphKind.PARAGRAPH_P; + case PARAGRAPH_LF: + return ParagraphKind.PARAGRAPH_M; + case PARAGRAPH_LIM: + // Documentation is not entirely clear on what the exact difference + // is between `lim#` and `li#` + // one is "embedded" the other is not: + // https://ubsicap.github.io/usfm/lists/index.html#lim + // The assumption is made here that `lim#` is directly replaceable + // with `li#` + return ParagraphKind.PARAGRAPH_LI; + case PARAGRAPH_LIM1: + return ParagraphKind.PARAGRAPH_LI1; + case PARAGRAPH_LIM2: + return ParagraphKind.PARAGRAPH_LI2; + case PARAGRAPH_LIM3: + return ParagraphKind.PARAGRAPH_LI3; + case PARAGRAPH_LIM4: + return ParagraphKind.PARAGRAPH_LI4; + + default: + throw new IllegalStateException("No replacement given for " + kind); + } + } + + public static AutoClosingFormattingKind[] replaceAutoClosingFormattingKind(AutoClosingFormattingKind kind) { + switch (kind) { + case INTRODUCTION_QUOTED_TEXT: + return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.QUOTED_TEXT }; + case FOOTNOTE_LABEL: + case FOOTNOTE_PARAGRAPH: + 
return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.FOOTNOTE_TEXT }; + case XREF_OT_CONTENT: + case XREF_NT_CONTENT: + return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.XREF_TARGET_REFERENCES }; + case ADDED_PROPER_NAME: + return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.ADDITION, AutoClosingFormattingKind.PROPER_NAME }; + case RUBY: + return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.PRONUNCIATION }; + case PRONUNCIATION: + return new AutoClosingFormattingKind[0]; + case STUDY_CONTENT_CATEGORY: + return null; + case WORDS_OF_JESUS: + return new AutoClosingFormattingKind[0]; + case EMPHASIS: + return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.ITALIC }; + case LIST_TOTAL: + case LIST_KEY: + case LIST_VALUE: + // It should not be too much of an issue to just skip these list + // tags + // E.g. + // \li1 \lik Reuben\lik* \liv1 Eliezer son of Zichri\liv1* + // Will become: + // \li1 Reuben Eliezer son of Zichri + return new AutoClosingFormattingKind[0]; + case FOOTNOTE_WITNESS_LIST: + // The Footnote witness list is just extra markup found within a + // footnote, however according to + // documentation found here: + // https://ubsicap.github.io/usfm/v3.0.rc1/notes_basic/fnotes.html + // Each element within a footnote must start with it's appropriate + // tag. So we can't just skip this tag + // since it could contain text. It would be better to turn this into + // a text entry `ft`. + return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.FOOTNOTE_TEXT }; + case XREF_PUBLISHED_ORIGIN: + // Published cross reference origin texts do not exist in USFM 2.x + // There is not really a nice way to downgrade these, we cannot put + // the `xop` tag into `xo` because it + // might not follow the usual `` pattern. + // TODO, maybe we can just write the contents to the parent target, + // just like FOOTNOTE_WITNESS_LIST? 
+ return null; + case XREF_TARGET_REFERENCES_TEXT: + // "Target reference(s) extra / added text" does not exist in USFM + // 2.x + // We should be able to get away with just adding the raw content + // directly `target`. + return new AutoClosingFormattingKind[0]; + case SUPERSCRIPT: + // There is not really a good way to represent superscript in USFM + // 2.x + // To avoid losing data, we skip the tag and just add the content + // directly to `target`. + // TODO, maybe we can use `sc` (Small caps) or `ord` (Ordinal) + // instead? + return new AutoClosingFormattingKind[0]; + case ARAMAIC_WORD: + // There is not really a good way to represent Aramaic words in USFM + // 2.x + // To avoid losing data, we skip the tag and just add the content + // directly to `target`. + return new AutoClosingFormattingKind[0]; + case PROPER_NAME_GEOGRAPHIC: + // This marker just gives geographic names a different presentation, + // replace by PROPER_NAME. + return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.PROPER_NAME }; + case LINK: + // just strip the tag and keep the contents. 
+ return new AutoClosingFormattingKind[0]; + case EXTENDED_FOOTNOTE_MARK: + // Use non-extended footnote mark instead + return new AutoClosingFormattingKind[] { AutoClosingFormattingKind.FOOTNOTE_MARK }; + default: + throw new IllegalStateException("No replacement given for " + kind); + } + } +} diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/USX.java b/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/USX.java index a623bf7..4c06a23 100644 --- a/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/USX.java +++ b/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/USX.java @@ -549,34 +549,9 @@ public void visitParagraphStart(ParagraphKind kind) throws IOException { currentContent = para.getContent(); currentTable = null; } else { - visitUnsupportedParagraphStart(kind); - } - } - - private void visitUnsupportedParagraphStart(ParagraphKind kind) throws IOException { - if (kind == ParagraphKind.HEBREW_NOTE) { - // According to documentation this is very similar to `d` (ParagraphKind.DESCRIPTIVE_TITLE) - visitParagraphStart(ParagraphKind.DESCRIPTIVE_TITLE); - logger.logReplaceWarning(kind, ParagraphKind.DESCRIPTIVE_TITLE); - } else if (kind.isSameBase(ParagraphKind.SEMANTIC_DIVISION)) { - // TODO maybe add more than 1 blank line? 
- visitParagraphStart(ParagraphKind.BLANK_LINE); - logger.logReplaceWarning(kind, ParagraphKind.BLANK_LINE); - } else if (kind == ParagraphKind.PARAGRAPH_PO || kind == ParagraphKind.PARAGRAPH_LH || kind == ParagraphKind.PARAGRAPH_LF) { - logger.logReplaceWarning(kind, ParagraphKind.PARAGRAPH_P); - visitParagraphStart(ParagraphKind.PARAGRAPH_P); - } else if (kind == ParagraphKind.PARAGRAPH_LF) { - logger.logReplaceWarning(kind, ParagraphKind.PARAGRAPH_M); - visitParagraphStart(ParagraphKind.PARAGRAPH_M); - } else if (kind.getTag().startsWith(ParagraphKind.PARAGRAPH_LIM.getTag())) { - // Documentation is not entirely clear on what the exact difference is between `lim#` and `li#` - // one is "embedded" the other is not: https://ubsicap.github.io/usfm/lists/index.html#lim - // The assumption is made here that `lim#` is directly replaceable with `li#` - ParagraphKind replacement = ParagraphKind.PARAGRAPH_LI.getWithNumber(kind.getNumber()); + ParagraphKind replacement = ParatextStripped.replaceParagraphKind(kind); logger.logReplaceWarning(kind, replacement); visitParagraphStart(replacement); - } else { - throw new RuntimeException("Could not export to USX 2 because an unhandled paragraph type `" + kind + "` from a newer USFM/USX version was found."); } } @@ -737,34 +712,7 @@ public ParatextCharacterContentVisitor visitAutoClosingFormatting(A } private ParatextCharacterContentVisitor visitUnsupportedAutoClosingFormatting(AutoClosingFormattingKind kind, Map attributes) throws IOException { - if (kind == AutoClosingFormattingKind.LIST_TOTAL || kind == AutoClosingFormattingKind.LIST_KEY || kind.isSameBase(AutoClosingFormattingKind.LIST_VALUE)) { - // It should not be too much of an issue to just skip these list tags - // E.g. 
- // \li1 \lik Reuben\lik* \liv1 Eliezer son of Zichri\liv1* - // Wil become: - // \li1 Reuben Eliezer son of Zichri - USX.this.logger.logSkippedWarning(kind); - return new USXCharacterContentVisitor(USX.this.logger, target); - } else if (kind == AutoClosingFormattingKind.FOOTNOTE_WITNESS_LIST) { - // The Footnote witness list is just extra markup found within a footnote, however according to - // documentation found here: https://ubsicap.github.io/usfm/v3.0.rc1/notes_basic/fnotes.html - // Each element within a footnote must start with it's appropriate tag. So we can't just skip this tag - // since it could contain text. It would be better to turn this into a text entry `ft`. - USX.this.logger.logReplaceWarning(kind, AutoClosingFormattingKind.FOOTNOTE_TEXT); - return visitAutoClosingFormatting(AutoClosingFormattingKind.FOOTNOTE_TEXT, attributes); - } else if (kind == AutoClosingFormattingKind.XREF_PUBLISHED_ORIGIN) { - // Published cross reference origin texts do not exist in USFM 2.x - // There is not really a nice way to downgrade these, we cannot put the `xop` tag into `xo` because it - // might not follow the usual `` pattern. - // TODO, maybe we can just write the contents to the parent target, just like FOOTNOTE_WITNESS_LIST? - USX.this.logger.logRemovedWarning(kind); - return null; - } else if (kind == AutoClosingFormattingKind.XREF_TARGET_REFERENCES_TEXT) { - // "Target reference(s) extra / added text" does not exist in USFM 2.x - // We should be able to get away with just adding the raw content directly `target`. - USX.this.logger.logSkippedWarning(kind); - return new USXCharacterContentVisitor(USX.this.logger, target); - } else if (kind == AutoClosingFormattingKind.SUPERSCRIPT) { + if (kind == AutoClosingFormattingKind.SUPERSCRIPT) { // There is not really a good way to represent superscript in USFM 2.x // To avoid losing data, we skip the tag and just add the content directly to `target`. // TODO, maybe we can use `sc` (Small caps) instead? 
@@ -772,15 +720,6 @@ private ParatextCharacterContentVisitor visitUnsupportedAutoClosing "separated by whitespace, since the previous text and superscript text may not have had been" + "separated by whitespace."); return new USXCharacterContentVisitor(USX.this.logger, target); - } else if (kind == AutoClosingFormattingKind.ARAMAIC_WORD) { - // There is not really a good way to represent Aramaic words in USFM 2.x - // To avoid losing data, we skip the tag and just add the content directly to `target`. - USX.this.logger.logSkippedWarning(kind); - return new USXCharacterContentVisitor(USX.this.logger, target); - } else if (kind == AutoClosingFormattingKind.PROPER_NAME_GEOGRAPHIC) { - // This marker just gives geographic names a different presentation, replace by PROPER_NAME. - USX.this.logger.logReplaceWarning(kind, AutoClosingFormattingKind.PROPER_NAME); - return visitAutoClosingFormatting(AutoClosingFormattingKind.PROPER_NAME, attributes); } else if (kind == AutoClosingFormattingKind.RUBY) { // Replace by putting the gloss into PRONUNCIATION. USX.this.logger.logReplaceWarning(kind, AutoClosingFormattingKind.PRONUNCIATION); @@ -790,16 +729,25 @@ public void visitEnd() throws IOException { outer.visitAutoClosingFormatting(AutoClosingFormattingKind.PRONUNCIATION, new HashMap<>(3)).visitText(attributes.get("gloss")); } }; - } else if (kind == AutoClosingFormattingKind.LINK) { - // just strip the tag and keep the contents. 
- USX.this.logger.logSkippedWarning(kind); - return new USXCharacterContentVisitor(USX.this.logger, target); - } else if (kind == AutoClosingFormattingKind.EXTENDED_FOOTNOTE_MARK) { - // Use non-extended footnote mark instead - USX.this.logger.logReplaceWarning(kind, AutoClosingFormattingKind.FOOTNOTE_MARK); - return visitAutoClosingFormatting(AutoClosingFormattingKind.FOOTNOTE_MARK, attributes); } else { - throw new RuntimeException("Could not export to USX 2 because an unhandled char type `" + kind + "` from a newer USFM/USX version was found."); + AutoClosingFormattingKind[] replacement = ParatextStripped.replaceAutoClosingFormattingKind(kind); + if (replacement == null) { + USX.this.logger.logRemovedWarning(kind); + return null; + } else if (replacement.length == 0) { + USX.this.logger.logSkippedWarning(kind); + return new USXCharacterContentVisitor(USX.this.logger, target); + } else if (replacement.length == 1) { + USX.this.logger.logReplaceWarning(kind, replacement[0]); + return visitAutoClosingFormatting(replacement[0], attributes); + } else { + USX.this.logger.logReplaceWarning(kind, replacement); + ParatextCharacterContentVisitor v = this; + for(AutoClosingFormattingKind r: replacement) { + v = v.visitAutoClosingFormatting(r, attributes); + } + return v; + } } } diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/utilities/StandardExportWarningMessages.java b/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/utilities/StandardExportWarningMessages.java index 7982398..d9d7045 100644 --- a/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/utilities/StandardExportWarningMessages.java +++ b/biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/utilities/StandardExportWarningMessages.java @@ -1,5 +1,8 @@ package biblemulticonverter.format.paratext.utilities; +import java.util.Arrays; +import java.util.stream.Collectors; + import 
biblemulticonverter.format.paratext.ParatextBook; import biblemulticonverter.format.paratext.ParatextCharacterContent; @@ -49,6 +52,21 @@ public void logReplaceWarning(ParatextCharacterContent.AutoClosingFormattingKind replacement.getTag() + "`, because this char style cannot be represented in " + targetFormat + "."); } + /** + * Log a message to indicate that a specific char style marker was replaced + * with a list of other char style markers during the export. + * + * @param original + * the char style marker that was replaced. + * @param replacement + * the char style markers that were used as replacement. + */ + public void logReplaceWarning(ParatextCharacterContent.AutoClosingFormattingKind original, ParatextCharacterContent.AutoClosingFormattingKind[] replacement) { + System.out.println("WARNING: Replaced char style marker `" + original.getTag() + "` with " + + Arrays.stream(replacement).map(r ->"`"+r.getTag()+"`").collect(Collectors.joining(" and ")) + + ", because this char style cannot be represented in " + targetFormat + "."); + } + /** * @see #logSkippedWarning(ParatextCharacterContent.AutoClosingFormattingKind, * String) diff --git a/biblemulticonverter/src/test/java/biblemulticonverter/format/paratext/USFM3AllTagsTest.java b/biblemulticonverter/src/test/java/biblemulticonverter/format/paratext/USFM3AllTagsTest.java index 47673da..9448523 100644 --- a/biblemulticonverter/src/test/java/biblemulticonverter/format/paratext/USFM3AllTagsTest.java +++ b/biblemulticonverter/src/test/java/biblemulticonverter/format/paratext/USFM3AllTagsTest.java @@ -2,35 +2,19 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assume.assumeTrue; import java.io.File; import java.io.IOException; -import java.io.StringWriter; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.Arrays; import java.util.List; -import javax.xml.bind.JAXBContext; -import javax.xml.bind.Marshaller; 
-import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - import org.junit.BeforeClass; -import org.junit.Ignore; import org.junit.Test; -import org.w3c.dom.Document; import biblemulticonverter.data.Bible; -import biblemulticonverter.format.paratext.ParatextBook.ParatextBookContentPart; -import biblemulticonverter.schema.usfx.ObjectFactory; -import biblemulticonverter.schema.usfx.PType; -import biblemulticonverter.schema.usfx.Usfx; +import biblemulticonverter.format.paratext.model.Version; import biblemulticonverter.tools.Validate; public class USFM3AllTagsTest { @@ -122,17 +106,38 @@ private void testSingleFormat(AbstractParatextFormat format, String extraArg, St } } - @Test - public void testUSX() throws Exception { - String expectedContent = testBookContent.replaceAll("\n\\\\toca.*", "").replaceAll("\n\\\\usfm 3.0", "").replaceAll("\\\\zCustomTag[* ]", "") - .replace("\\sd2", "\\b").replace("\\ph", "\\li").replace("\\po", "\\p").replace("\\pr ", "\\pmr ").replace("\\pr*", "\\pmr*").replace("\\qd", "\\d") - .replace("\\lh", "\\p").replace("\\lf", "\\p").replace("\\lim", "\\li").replaceAll("\\\\li([kv]|tl)\\*", " ").replaceAll("\\\\li([kv]|tl) ", "") + private String toVersion2_2(String version3) { + return version3.replace("\\usfm 3.0\n", "").replaceAll("\n\\\\toca[123].*", "").replace("\\qd", "\\d") + .replace("\\efm", "\\fm").replace("\\sd2", "\\b").replace("\\po", "\\p").replace("\\lh", "\\p") + .replace("\\lf", "\\m").replace("\\lim", "\\li").replaceAll("\\\\li([kv]|tl)[ *]", "") .replace(" strong=\"G0123\" srcloc=\"version:1.2.3.4\"\\w*", "\\w*").replace("\\png", "\\pn").replaceAll("\\\\wa[ *]", "").replace("\\efm", "\\fm") - .replace("\\sup ", "").replace("\\sup*", " ").replace("~", " ").replace("\\rb Ruby|gloss=\"Roo:bee\"\\rb*", "Ruby\\pro 
Roo:bee\\pro*") + .replace("\\rb Ruby|gloss=\"Roo:bee\"\\rb*", "\\pro Ruby\\pro*").replace("\\sup ", "").replace("\\sup*", "") .replace("\\tc1-2", "\\tc1 \\tc2").replace("\\th2-3", "\\th2 \\th3").replace("\\fw", "\\ft").replace("\\xop ibidem:\\xop*", "").replaceAll("\\\\xta[ *]", "").replace("|link-href=\"GEN 9:8\"\\xt*", "\\xt*") .replace("|id=\"measures\"", "").replace("|id=\"x-custom\"", "") - .replace("\\jmp to nowhere|link-href=\"https://schierlm.github.io\" x-why=\"That's me\"\\jmp*. Here is \\jmp |link-id=\"a-loop\"\\jmp*\\jmp A loop|link-href=\"#a-loop\" link-title=\"Loop\"\\jmp*.\\ts \\*", "to nowhere . Here is A loop .") - .replace("\\zmyMilestone \\* Milestone,\\qt-s |sid=\"qqA\" who=\"Nobody\"\\*\" Fake Quote:\\qt2-s |sid=\"qqB\" who=\"Nobodier\"\\* Nobody said this!\\qt-e |eid=\"qqB\"\\*\"\\qt-e |eid=\"qqA\"\\*\\ts \\*", " Milestone, \" Fake Quote: Nobody said this! \""); + .replace("\\jmp to nowhere|link-href=\"https://schierlm.github.io\" x-why=\"That's me\"\\jmp*. Here is \\jmp |link-id=\"a-loop\"\\jmp*\\jmp A loop|link-href=\"#a-loop\" link-title=\"Loop\"\\jmp*.\\ts \\*", "to nowhere. 
Here is A loop.") + .replace("\\zmyMilestone \\* Milestone,\\qt-s |sid=\"qqA\" who=\"Nobody\"\\*\" Fake Quote:\\qt2-s |sid=\"qqB\" who=\"Nobodier\"\\* Nobody said this!\\qt-e |eid=\"qqB\"\\*\"\\qt-e |eid=\"qqA\"\\*\\ts \\*", " Milestone,\" Fake Quote: Nobody said this!\""); + } + + private String toVersion2_1(String version3) { + return toVersion2_2(version3).replace("\\iqt", "\\qt").replace("\\xot", "\\xt").replace("\\xnt", "\\xt"); + } + + private String toVersion2_0_4(String version3) { + return toVersion2_1(version3).replace("\\ef", "\\f").replaceFirst("\\\\esb(.|\n)*?\\\\esbe\n", ""); + } + + private String toVersion2(String version3) { + return toVersion2_0_4(version3).replaceAll("\n\\\\toc[123] .*", "").replaceAll("\\\\f[pl]", "\\\\ft"); + } + + @Test + public void testUSX() throws Exception { + String expectedContent = toVersion2_2(testBookContent.replace("\\sup*", " ").replaceAll("\\\\li([kv]|tl)\\*", " ") + .replace("\\rb Ruby|gloss=\"Roo:bee\"\\rb*", "Ruby\\pro Roo:bee\\pro*")) + .replace("\\pr ", "\\pmr ").replace("\\ph", "\\li") + .replace("~", " ").replaceAll("\\\\zCustomTag[* ]", "") + .replace("nowhere. Here is A loop.", "nowhere . Here is A loop .") + .replace("Milestone,\" Fake Quote: Nobody said this!\"", "Milestone, \" Fake Quote: Nobody said this! 
\""); testSingleFormat(new USX(), "#-*.usx", expectedContent); } @@ -175,6 +180,83 @@ public void testParatextUSFM() throws Exception { testSingleFormat(new USFM(), "#-*.usfm", null); } + private void testParatextStripped(String[] exportArgs, String expectedContent, Version expectedVersion) throws Exception { + File originalFile = USX3Test.getResource("/usfm3allTags/01-MAT.usfm"); + ParatextBook testBookCopy = new USFM().doImportBook(originalFile); + new ParatextStripped().stripBooks(Arrays.asList(testBookCopy), exportArgs); + USFM usfm = new USFM(); + assertEquals(expectedVersion, usfm.getMinRequiredVersion(testBookCopy)); + File resultFile = USX3Test.createTempFile("export", ".usfm"); + usfm.doExportBook(testBookCopy, resultFile); + String actualContent = new String(Files.readAllBytes(resultFile.toPath()), StandardCharsets.UTF_8); + assertEquals(expectedContent, actualContent); + usfm.doExportBook(usfm.doImportBook(resultFile), resultFile); + assertEquals(expectedContent, new String(Files.readAllBytes(resultFile.toPath()), StandardCharsets.UTF_8)); + } + + @Test + public void testConvertVersion3() throws Exception { + testParatextStripped(new String[0], testBookContent, Version.V3); + testParatextStripped(new String[] { "CompatibleVersion=" + Version.V3.toString() }, testBookContent, Version.V3); + } + + @Test + public void testConvertVersion2_2() throws Exception { + String expectedContent = toVersion2_2(testBookContent); + testParatextStripped(new String[] { "CompatibleVersion=" + Version.V2_3.toString() }, expectedContent, Version.V2_2); + testParatextStripped(new String[] { "CompatibleVersion=" + Version.V2_2.toString() }, expectedContent, Version.V2_2); + } + + @Test + public void testConvertVersion2_1() throws Exception { + String expectedContent = toVersion2_1(testBookContent); + testParatextStripped(new String[] { "CompatibleVersion=" + Version.V2_1.toString() }, expectedContent, Version.V2_1); + } + + @Test + public void testConvertVersion2_0_4() throws 
Exception { + String expectedContent = toVersion2_0_4(testBookContent); + testParatextStripped(new String[] { "CompatibleVersion=" + Version.V2_0_4.toString() }, expectedContent, Version.V2_0_4); + } + + @Test + public void testConvertVersion2_0_3() throws Exception { + String expectedContent = toVersion2_0_4(testBookContent).replaceFirst("\n\\\\toc3 .*", ""); + testParatextStripped(new String[] { "CompatibleVersion=" + Version.V2_0_3.toString() }, expectedContent, Version.V2_0_3); + } + + @Test + public void testConvertVersion2() throws Exception { + String expectedContent = toVersion2(testBookContent); + testParatextStripped(new String[] { "CompatibleVersion=" + Version.V2.toString() }, expectedContent, Version.V2); + } + + @Test + public void testConvertVersion1() throws Exception { + String expectedContent = toVersion2(testBookContent).replace("\\sr ", "\\mr ").replace("\\pmo ", "\\mi ") + .replace("\\pm ", "\\pi ").replace("\\pmr ", "\\pr ").replace("\\pmc ", "\\mi ").replace("\\em", "\\it") + .replace("\\qm", "\\q").replace("\\addpn The Great\\addpn*", "\\add \\+pn The Great\\+pn*\\add*") + .replaceAll("\\\\(wj|pro)[ *]", ""); + testParatextStripped(new String[] { "CompatibleVersion=" + Version.V1.toString() }, expectedContent, Version.V1); + } + + @Test + public void testStripEverything() throws Exception { + String expectedContent = testBookContent.replaceAll("\n\\\\(i[mspbloeq]|rem ).*?(?=\n)", "") + .replace("\\ef † \\cat Etymology\\cat*\\fr 1.1:\\fr*\\fq First:\\fq*\\ft Fore-est\\ft*\\ef*", "") + .replaceFirst("\\\\esb (.|\n)*?\\\\esbe\n", "") + .replaceAll("\\\\zCustomTag[ *]", "").replace("\\zmyMilestone \\*", "") + .replace("\\fdc This bible contains deuterocanonical content.\\fdc*", "") + .replace("\\dc in deuterocanonical books\\dc*", "") + .replace("\\xot Exo 1.1\\xot*", "").replace("\\xdc Sir 2.3\\xdc*", "") + .replaceAll("\\|(lemma|link|gloss|[se]id).*?\\\\", "\\\\"); + testParatextStripped(new String[] { + "StripStudyContent", 
"StripCustomMarkup", "StripIntroductions", "StripTagAttributes", + "StripRemarks", "StripReferences", "StripPart=OT", "StripPart=DC", + "StripStudyCategory=Sidey", "StripStudyCategory=Etymology", "StripParagraph=qxq", + }, expectedContent, Version.V3); + } + private static void deleteRecursively(File f) throws IOException { if (!f.exists()) return;