diff --git a/pom.xml b/pom.xml index c783bac..7f460b9 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.ohnlp.medtagger medtagger - 1.0.71 + 1.0.72 The MedTagger biomedical information extraction pipeline diff --git a/src/main/java/org/ohnlp/medtagger/backbone/CleanMedTaggerDictOutputTransform.java b/src/main/java/org/ohnlp/medtagger/backbone/CleanMedTaggerDictOutputTransform.java new file mode 100644 index 0000000..1695a86 --- /dev/null +++ b/src/main/java/org/ohnlp/medtagger/backbone/CleanMedTaggerDictOutputTransform.java @@ -0,0 +1,70 @@ +package org.ohnlp.medtagger.backbone; + +import org.apache.beam.sdk.coders.BigEndianLongCoder; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.RowCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.transforms.Select; +import org.apache.beam.sdk.transforms.*; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; +import org.joda.time.Duration; +import org.ohnlp.backbone.api.annotations.ComponentDescription; +import org.ohnlp.backbone.api.components.OneToOneTransform; +import org.ohnlp.backbone.api.exceptions.ComponentInitializationException; +import org.ohnlp.medtagger.lvg.LvgLookup; + +@ComponentDescription( + name = "Get Dict Freqs", + desc = "Gets Frequency of Dictionary Terms (Useful for Cleaning Noise from Autogenerated Dictionary Entries)" +) +public class CleanMedTaggerDictOutputTransform extends OneToOneTransform { + private final Schema schema = Schema.of( + Schema.Field.of("matched_text", Schema.FieldType.STRING), + Schema.Field.of("freq", Schema.FieldType.INT64) + ); + + @Override + public Schema calculateOutputSchema(Schema schema) { + return this.schema; + } + + @Override + public PCollection expand(PCollection input) { + return input.apply(Select.fieldNames("matched_text", "note_source_value")).apply(ParDo.of( + new DoFn>() { + private LvgLookup lvg; + @ProcessElement + public void process(ProcessContext pc) { + Row input = pc.element(); + String text = input.getString("matched_text"); + text = lvg.getNorm(text).replaceAll("\\s", "\t"); + pc.output(KV.of(Row.withSchema(schema).addValues(text, 1L).build(), input.getString("note_source_value"))); + } + @Setup + public void init() { + this.lvg = new LvgLookup(); + lvg.localInitialize(CleanMedTaggerDictOutputTransform.class.getResourceAsStream("/medtaggerresources/lvg/LRAGR_2021AB"), CleanMedTaggerDictOutputTransform.class.getResourceAsStream("/medtaggerresources/lvg/openclasswords.txt")); + } + } + )).setCoder(KvCoder.of(RowCoder.of(this.schema), StringUtf8Coder.of()) + ).apply(Distinct.create() + ).apply(Count.perKey() + ).setCoder(KvCoder.of(RowCoder.of(this.schema), BigEndianLongCoder.of()) + ).apply(MapElements.via(new SimpleFunction, Row>() { + @Override + public Row apply(KV input) { + return Row.withSchema(schema).addValues(input.getKey().getValue("matched_text"), input.getValue()).build(); + } + })); + } + + @Override + public void init() throws ComponentInitializationException { + } +} diff --git a/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java b/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java index 1c145d1..ed6ebad 100644 --- a/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java +++ b/src/main/java/org/ohnlp/medtagger/backbone/MedTaggerBackboneTransform.java @@ -140,7 +140,7 @@ public PCollection expand(PCollection input) { } private static class MedTaggerPipelineFunction extends DoFn { - private transient static final ReentrantLock INIT_MUTEX_LOCK = new ReentrantLock(); +// private transient static final ReentrantLock INIT_MUTEX_LOCK = new ReentrantLock(); private final String resourceFolder; private final String textField; @@ -168,7 +168,6 @@ public MedTaggerPipelineFunction(String textField, String resourceFolder, RunMod @Setup public void init() throws IOException, InvalidXMLException, URISyntaxException, ResourceInitializationException { try { - INIT_MUTEX_LOCK.lock(); AggregateBuilder ae = new AggregateBuilder(); // Tokenization, Sentence Splitting, Section Detection, etc. if (this.secTag.equalsIgnoreCase("DEFAULT")) { @@ -288,7 +287,6 @@ public void init() throws IOException, InvalidXMLException, URISyntaxException, this.cas = CasCreationUtils.createCas(Collections.singletonList(aae.getMetaData()), null, resMgr); } finally { - INIT_MUTEX_LOCK.unlock(); } } diff --git a/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml b/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml index 40b4356..5dce935 100644 --- a/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml +++ b/src/main/resources/desc/backbone/aes/PreConceptExtractionAE.xml @@ -37,7 +37,6 @@ SentenceDetectorAE TokenizerAE - ChunkerAE POSTaggerAE LineSentenceDetectorAE LvgLookupAE