Skip to content

Commit

Permalink
version 1.1
Browse files Browse the repository at this point in the history
  • Loading branch information
jbiermann committed Dec 6, 2017
1 parent 8aa3dea commit 00df9e1
Show file tree
Hide file tree
Showing 37 changed files with 597 additions and 298 deletions.
21 changes: 18 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ Usage: Finereader XML to basic TEI converter [command] [command options]
file Process a single XML File
Usage: file [options]
Options:
-f, --foreign
Add foreign elements to output
Default: false
-h, --help
print this message
* -i, --input
Expand All @@ -28,10 +31,13 @@ Usage: Finereader XML to basic TEI converter [command] [command options]
Options:
* -d, --dir
The start path of the directory for processing the files
-f, --foreign
Add foreign elements to output
Default: false
-h, --help
print this message
-lb, --linebreak
Add line breaks to p elements
-lb, --nolinebreak
Add line break to p elements
Default: true
-p, --pattern
The regex pattern for matching the OCR xml files
Expand Down Expand Up @@ -73,7 +79,6 @@ The vendor provides a zip file with some [samples](https://abbyy.technology/_med

This tool works only if the OCR XML has been generated in character-by-character mode, not in the basic format.


Currently only text elements are processed; image and table elements are ignored. The following text attributes are transferred to TEI:

* italic -> `<hi rend="italic"> italic></hi>`
Expand All @@ -82,6 +87,8 @@ Currently only text elements get processed, image and table elements are ignored
* strikethrough -> `<hi rend="strikethrough">strikethrough></hi>`
* super `<hi rend="super">super</hi>`
* unclear characters -> `<unclear cert="unknown" reason="illegible">l</unclear>`
* document language -> `<TEI xml:lang="en">`
* other languages as foreign elements if the -f parameter has been set -> `<foreign xml:lang="la">...</foreign>`


# Building
Expand All @@ -100,6 +107,14 @@ The output does not contain the whole TEI schema. Instead a simple outputTemplat

Afterwards you have to adapt the classes under de.sub.goettingen.arendt.ocrmapping for the changed structure.

# New for v1.1

* Support for XPath expressions before processing
* Extraction of the main document language
* Each TEI document gets an ID
* Support for foreign elements if specified via the command line option
* Improved whitespace handling: the OCR output is missing some whitespace, and this version tries its best to restore it

# Further information

The documentation of the Finereader XML file can be found [here](https://abbyy.technology/en:features:ocr:xml), and an explanation of the schema is located [here](https://ocrsdk.com/documentation/specifications/xml-scheme-recognized-document/).
Expand Down
11 changes: 10 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,29 @@
<modelVersion>4.0.0</modelVersion>
<groupId>de.sub.goettingen.arendt</groupId>
<artifactId>ocr2tei</artifactId>
<version>1.0-SNAPSHOT</version>
<version>1.1-SNAPSHOT</version>
<name>OCR2XML</name>
<dependencies>
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
<version>1.72</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.8.0-6</version>
</dependency>
</dependencies>
<description>This is a CLI utility which allows to read the vendor specific Finereader XML files created with the Finereader cloud service or SDK and convert them into a basic TEI output document.</description>
<organization>
<name>SUB Göttingen</name>
<url>http://www.sub.uni-goettingen.de/</url>
</organization>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<build>
<defaultGoal>package</defaultGoal>
<plugins>
Expand Down
26 changes: 19 additions & 7 deletions resources/outputTemplate.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<?xml-model href="transcriptionsSchema.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>

<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="asd" xml:lang="de">
<teiHeader>
<fileDesc>
<titleStmt>
Expand Down Expand Up @@ -43,12 +45,22 @@
</teiHeader>
<text>
<body>
<pb n="000001"/>

<p>A paragraph with unc<unclear cert="unknown" reason="illegible">l</unclear>ear recognition character.
I am <hi rend="italic">italic></hi></p>
<p>and I am <hi rend="bold">bold></hi> <hi rend="underline">underline></hi> <hi rend="strikethrough">strikethrough></hi>
<hi rend="super">super></hi>


<pb n="XY-III"/>

<p>Another Paragraph</p>

<pb n="XY-IV"/>

<ab>Anonymous Block</ab><ab>Anonymous Block</ab>
<ab><hi rend="bold">Bold</hi><hi rend="center">center</hi><hi rend="italic">italic</hi><hi rend="super">super</hi><hi rend="underline">underline</hi></ab>
<p>
<foreign xml:lang="en">I am foreign</foreign>
A paragraph with unc<unclear cert="unknown" reason="illegible">l</unclear>ear recognition character.
I am <hi rend="italic">italic></hi><foreign xml:lang="en"><hi rend="italic">I am foreign</hi> and also</foreign></p>
<p>and I am <hi rend="bold">bold></hi> <hi rend="underline">underline></hi> <hi rend="center">strikethrough></hi>
<hi rend="super">sup<unclear cert="unknown" reason="illegible">This is unclear</unclear>r></hi>
</p>
</body>
</text>
Expand Down
27 changes: 14 additions & 13 deletions resources/outputTemplate.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<xs:element ref="ns1:teiHeader"/>
<xs:element ref="ns1:text"/>
</xs:sequence>
<xs:attribute ref="xml:id" use="required"/>
<xs:attribute ref="xml:lang" use="required"/>
</xs:complexType>
</xs:element>
Expand Down Expand Up @@ -123,7 +124,6 @@
<xs:sequence>
<xs:element ref="ns1:body"/>
</xs:sequence>
<xs:attribute ref="xml:lang" use="required"/>
</xs:complexType>
</xs:element>
<xs:element name="body">
Expand All @@ -137,16 +137,14 @@
</xs:element>
<xs:element name="ab">
<xs:complexType mixed="true">
<xs:choice minOccurs="0" maxOccurs="unbounded">
<xs:element ref="ns1:hi"/>
<xs:element ref="ns1:unclear"/>
</xs:choice>
<xs:attribute name="type" type="xs:NCName"/>
<xs:sequence>
<xs:element minOccurs="0" maxOccurs="unbounded" ref="ns1:hi"/>
</xs:sequence>
</xs:complexType>
</xs:element>
<xs:element name="pb">
<xs:complexType>
<xs:attribute name="n" use="required" type="xs:NMTOKEN"/>
<xs:attribute name="n" use="required" type="xs:NCName"/>
</xs:complexType>
</xs:element>
<xs:element name="p">
Expand All @@ -160,15 +158,12 @@
</xs:element>
<xs:element name="foreign">
<xs:complexType mixed="true">
<xs:sequence>
<xs:element minOccurs="0" maxOccurs="unbounded" ref="ns1:hi"/>
</xs:sequence>
<xs:attribute ref="xml:lang" use="required"/>
</xs:complexType>
</xs:element>
<xs:element name="unclear">
<xs:complexType mixed="true">
<xs:attribute name="cert" use="required" type="xs:NCName"/>
<xs:attribute name="reason" use="required" type="xs:NCName"/>
</xs:complexType>
</xs:element>
<xs:element name="hi">
<xs:complexType mixed="true">
<xs:sequence>
Expand All @@ -177,4 +172,10 @@
<xs:attribute name="rend" use="required" type="xs:NCName"/>
</xs:complexType>
</xs:element>
<xs:element name="unclear">
<xs:complexType mixed="true">
<xs:attribute name="cert" use="required" type="xs:NCName"/>
<xs:attribute name="reason" use="required" type="xs:NCName"/>
</xs:complexType>
</xs:element>
</xs:schema>
3 changes: 2 additions & 1 deletion resources/xml.xsd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" targetNamespace="http://www.w3.org/XML/1998/namespace" xmlns:ns1="http://www.tei-c.org/ns/1.0">
<xs:import namespace="http://www.tei-c.org/ns/1.0" schemaLocation="template.xsd"/>
<xs:import namespace="http://www.tei-c.org/ns/1.0" schemaLocation="outputTemplate.xsd"/>
<xs:attribute name="id" type="xs:NCName"/>
<xs:attribute name="lang" type="xs:NCName"/>
</xs:schema>
38 changes: 38 additions & 0 deletions src/main/java/de/sub/goettingen/arendt/beans/PreprocessInfo.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package de.sub.goettingen.arendt.beans;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Simple mutable bean holding a language count, a main language and a
 * language-name mapping.
 *
 * <p>NOTE(review): judging by the class name, the values are presumably
 * collected in a preprocessing pass over the OCR input; confirm against
 * the callers.
 */
public class PreprocessInfo {

    /** Mapping returned by {@link #getLanguageMapping()}; seeded in the constructor. */
    private final Map<String, String> languageMapping = new HashMap<>();

    /** Main language; {@code null} until {@link #setMainLanguage(String)} is called. */
    private String mainLanguage;

    /** Number of languages; starts at zero. */
    private int numberOfLanguages;

    /**
     * Creates an instance whose mapping holds the single entry
     * {@code "Latin" -> "la"}.
     */
    public PreprocessInfo() {
        this.languageMapping.put("Latin", "la");
    }

    /**
     * @return the live internal mapping (not a copy), so changes made by
     *         the caller are visible to this object
     */
    public Map<String, String> getLanguageMapping() {
        return this.languageMapping;
    }

    /** @return the stored main language, or {@code null} if none was set */
    public String getMainLanguage() {
        return this.mainLanguage;
    }

    /** @param mainLanguage the main language to store */
    public void setMainLanguage(String mainLanguage) {
        this.mainLanguage = mainLanguage;
    }

    /** @return the stored number of languages */
    public int getNumberOfLanguages() {
        return this.numberOfLanguages;
    }

    /** @param numberOfLanguages the number of languages to store */
    public void setNumberOfLanguages(int numberOfLanguages) {
        this.numberOfLanguages = numberOfLanguages;
    }
}
33 changes: 27 additions & 6 deletions src/main/java/de/sub/goettingen/arendt/ocrmapping/Cli.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@ public static class CommandSingle {
@Parameter(names = { "-o", "--output" }, description = "The path and filename to the generated TEI output file", required = true)
private String outputFile = null;

@Parameter(names = { "-lb", "--linebreak" }, description = "Add line breaks to p elements", required = false, arity = 1)
private boolean lineBreak = true;
@Parameter(names = { "-lb", "--linebreak" }, description = "Add line breaks to p elements", required = false)
private boolean lineBreak = true;

@Parameter(names = { "-f", "--foreign" }, description = "Add foreign elements to output", required = false)
private boolean foreignElements = false;

@Parameter(names = { "-h", "--help" }, description = "print this message", help = true)
private boolean help;
Expand All @@ -60,9 +63,12 @@ public static class CommandDir {
@Parameter(names = { "-d", "--dir" }, description = "The start path of the directory for processing the files", required = true)
private String path = null;

@Parameter(names = { "-lb", "--linebreak" }, description = "Add line breaks to p elements", required = false, arity = 1)
@Parameter(names = { "-lb", "--nolinebreak" }, description = "Add line break to p elements", required = false)
private boolean lineBreak = true;

@Parameter(names = { "-f", "--foreign" }, description = "Add foreign elements to output", required = false)
private boolean foreignElements = false;

@Parameter(names = { "-s", "--specific" }, description = "Do very project specific directory scanning (outside of this project do NOT set this option to true)", required = false)
private boolean specificWalk = false;

Expand Down Expand Up @@ -100,7 +106,7 @@ public static void main(String[] args) {
}

if (jc.getParsedCommand() != null) {
if (jc.getParsedCommand().equals("single")) {
if (jc.getParsedCommand().equals("file")) {
main.runSingle();
}else if (jc.getParsedCommand().equals("dir")) {
main.rundDirectory();
Expand All @@ -117,7 +123,9 @@ private void runSingle() {
System.out.println("Start to process Input File " + Cli.commandSingle.inputFile + " into " + Cli.commandSingle.outputFile);

try {
TEI tei = ReadXML.convert2Tei(Cli.commandSingle.inputFile, Cli.commandSingle.lineBreak);
TEI tei = ReadXML.convert2Tei(Cli.commandSingle.inputFile,
Cli.commandSingle.lineBreak,
null, Cli.commandSingle.foreignElements);
boolean success = OutputXML.writeXMLOutput(tei, Cli.commandSingle.outputFile);
if (success == false) {
System.out.println("Failure on writing XML output");
Expand Down Expand Up @@ -154,6 +162,7 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IO
System.out.println("file found at path: " + file.toAbsolutePath());
System.out.println("\tStart to process Input File " + file.getFileName().toString() + " into TEI");
String teiFilePath = null;
String documentId = "";

// project specific Filename and path setting
// dir is structured like
Expand All @@ -164,6 +173,7 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IO
// with file IV-NUMBER_TEI_Originalfilename.xml
if (Cli.commandDir.specificWalk) {
String newTEIDirname = file.getParent().getFileName().toString().replace("_xml", "_TEI");
documentId = "Volume-"+newTEIDirname;
Path newTEIDir= file.getParent().resolveSibling(newTEIDirname);

String teiFilename = newTEIDirname +"_"+ file.getFileName().toString().replace("ocr_", "");
Expand All @@ -183,12 +193,14 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IO
// generic dir traverse, just append TEI and use the original file name
// and save it into the same dir
String teiFilename = "TEI-"+file.getFileName().toString();
documentId = getBaseName(file.getFileName().toString());
teiFilePath = Paths.get(file.getParent().toString(), teiFilename).toString();
}


try {
TEI tei = ReadXML.convert2Tei(file.toAbsolutePath().toString(), true);
TEI tei = ReadXML.convert2Tei(file.toAbsolutePath().toString(), Cli.commandDir.lineBreak,
documentId, Cli.commandDir.foreignElements);

boolean success = OutputXML.writeXMLOutput(tei, teiFilePath);
if (success == false) {
Expand Down Expand Up @@ -228,5 +240,14 @@ public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOEx
}
System.out.println("Finished, processed " + cntr + " files");
}

/**
 * Strips the extension from a file name.
 *
 * <p>Everything from the last {@code '.'} onwards is removed. A name
 * without any dot is returned unchanged; a name whose only dot is the
 * first character (e.g. {@code ".hidden"}) yields the empty string.
 *
 * @param fileName the file name to shorten; must not be {@code null}
 * @return the part of {@code fileName} before its last dot, or
 *         {@code fileName} itself when it contains no dot
 */
public static String getBaseName(String fileName) {
    final int lastDot = fileName.lastIndexOf('.');
    return lastDot < 0 ? fileName : fileName.substring(0, lastDot);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,14 @@ public class OutputXML {
* like in the TEI header
* @return
*/
public static TEI getTEIHeader(String mainLang) {
public static TEI getTEIHeader(String mainLang, String documentId) {
TEI tei = new TEI();

if (mainLang != null)
tei.setLang(mainLang);
TeiHeader teiHeader = new TeiHeader();

if (documentId != null)
tei.setId(documentId);
FileDesc fileDesc = new FileDesc();
TitleStmt titleStmt = new TitleStmt();
titleStmt.setTitle("Automatic OCR transformation, please change title");
Expand Down
Loading

0 comments on commit 00df9e1

Please sign in to comment.