Skip to content

Commit

Permalink
Change workflows and fixes so that no triples are needed #462
Browse files Browse the repository at this point in the history
  • Loading branch information
TobiasNx committed Jul 19, 2023
1 parent 8bc3d80 commit 81bf8e2
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 32 deletions.
10 changes: 0 additions & 10 deletions app/transformation/TransformDbs.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@
import org.metafacture.csv.CsvDecoder;
import org.metafacture.json.JsonEncoder;
import org.metafacture.io.LineReader;
import org.metafacture.triples.StreamToTriples;
import org.metafacture.triples.TripleFilter;
import org.metafacture.triples.TripleCollect;
import org.metafacture.io.ObjectWriter;
import org.metafacture.io.FileOpener;
import org.metafacture.metafix.Metafix;
Expand All @@ -23,22 +20,15 @@
public class TransformDbs {
static void process(final String outputPath, String geoLookupServer) throws FileNotFoundException {
final FileOpener opener = new FileOpener();
StreamToTriples streamToTriples = new StreamToTriples();
streamToTriples.setRedirect(true);
opener.setEncoding("UTF-8");
final CsvDecoder decoder = new CsvDecoder(',');
decoder.setHasHeader(true);
final TripleFilter tripleFilter = new TripleFilter();
tripleFilter.setSubjectPattern(".+"); // Remove entries without id
JsonEncoder encodeJson = new JsonEncoder();
encodeJson.setPrettyPrinting(true);
opener//
.setReceiver(new LineReader())//
.setReceiver(decoder)//
.setReceiver(new Metafix("conf/fix-dbs.fix"))//
.setReceiver(streamToTriples)//
.setReceiver(tripleFilter)//
.setReceiver(new TripleCollect())//
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
Expand Down
21 changes: 2 additions & 19 deletions app/transformation/TransformSigel.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,11 @@
import org.metafacture.framework.helpers.DefaultObjectPipe;
import org.metafacture.json.JsonEncoder;
import org.metafacture.metafix.Metafix;
import org.metafacture.triples.StreamToTriples;
import org.metafacture.biblio.pica.PicaXmlHandler;
import org.metafacture.io.LineReader;
import org.metafacture.biblio.pica.PicaDecoder;
import org.metafacture.xml.XmlDecoder;
import org.metafacture.triples.TripleFilter;
import org.metafacture.xml.XmlElementSplitter;
import org.metafacture.triples.TripleCollect;
import org.metafacture.io.ObjectWriter;
import org.metafacture.xml.XmlFilenameWriter;
import org.metafacture.io.FileOpener;
Expand Down Expand Up @@ -55,10 +52,6 @@ public class TransformSigel {
static void processBulk(String startOfUpdates, int intervalSize,
final String outputPath, String geoLookupServer) throws IOException {
final FileOpener dumpOpener = new FileOpener();
StreamToTriples streamToTriples = new StreamToTriples();
streamToTriples.setRedirect(true);
final TripleFilter tripleFilter = new TripleFilter();
tripleFilter.setSubjectPattern(".+"); // Remove entries without id
PicaDecoder picaDecoder = new PicaDecoder();
picaDecoder.setNormalizeUTF8(true);
JsonEncoder encodeJson = new JsonEncoder();
Expand All @@ -67,35 +60,25 @@ static void processBulk(String startOfUpdates, int intervalSize,
.setReceiver(new LineReader())//
.setReceiver(picaDecoder)//
.setReceiver(new Metafix("conf/fix-sigel.fix"))//
.setReceiver(streamToTriples)//
.setReceiver(tripleFilter)//
.setReceiver(new TripleCollect())//
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
dumpOpener.process(TransformAll.DATA_INPUT_DIR + "sigil.dat");
dumpOpener.process(TransformAll.DATA_INPUT_DIR + "sigel.dat");
dumpOpener.closeStream();
}

static void processUpdates(String startOfUpdates, int intervalSize,
final String outputPath, String geoLookupServer) throws IOException {
final FileOpener splitFileOpener = new FileOpener();
StreamToTriples streamToTriples = new StreamToTriples();
streamToTriples.setRedirect(true);
final TripleFilter tripleFilter = new TripleFilter();
tripleFilter.setSubjectPattern(".+"); // Remove entries without id
JsonEncoder encodeJson = new JsonEncoder();
encodeJson.setPrettyPrinting(true);
ObjectWriter objectWriter = new ObjectWriter<>(outputPath);
objectWriter.setAppendIfFileExists(true);
splitFileOpener//
.setReceiver(new XmlDecoder())//
.setReceiver(new PicaXmlHandler())//
.setReceiver(new Metafix("conf/fix-sigel.fix"))//
.setReceiver(streamToTriples)//
.setReceiver(tripleFilter)//
.setReceiver(new TripleCollect())//
.setReceiver(new Metafix("conf/fix-sigel.fix"))//fix also kicks out all records without _id
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
Expand Down
5 changes: 4 additions & 1 deletion conf/fix-dbs.fix
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ do once("maps")
end

replace_all("inr"," ","")
copy_field("inr","_id")

replace_all("isil"," ","")
replace_all("isil","/","-")
Expand Down Expand Up @@ -34,3 +33,7 @@ end


vacuum()

unless exists("inr")
reject()
end
3 changes: 2 additions & 1 deletion conf/fix-enriched.fix
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,8 @@ end
## <!-- Telephone and email -->

if exists ("phonenr")
paste("telephone","~00","countryCode","prefix","phonenr")
paste("@countryCode","~00","countryCode",join_char:"")
paste("telephone","@countryCode","prefix","phonenr")
end
if exists("email")
paste("email","~mailto:","email", join_char:"")
Expand Down
7 changes: 6 additions & 1 deletion conf/fix-sigel.fix
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ copy_field("008H.g","worldcatRegistryId")
replace_all("@dbsId"," ","")
replace_all("isil"," ","")
if any_match("@dbsId","[A-Z]{2}\\d{3}")
copy_field("@dbsId","_id")
copy_field("@dbsId","inr")
else
copy_field("isil","_id")
Expand Down Expand Up @@ -71,3 +70,9 @@ end


vacuum()

unless exists("inr")
unless exists("isil")
reject()
end
end

0 comments on commit 81bf8e2

Please sign in to comment.