Skip to content

Commit

Permalink
Don't run chunker for perf and add dictionary cleaning transform
Browse files Browse the repository at this point in the history
  • Loading branch information
qqndrew committed Oct 9, 2023
1 parent 5431742 commit a1fc87c
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 5 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>org.ohnlp.medtagger</groupId>
<artifactId>medtagger</artifactId>
<version>1.0.71</version>
<version>1.0.72</version>
<description>The MedTagger biomedical information extraction pipeline</description>


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package org.ohnlp.medtagger.backbone;

import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Pattern;

import org.apache.beam.sdk.coders.BigEndianLongCoder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.RowCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.transforms.Select;
import org.apache.beam.sdk.transforms.*;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
import org.checkerframework.checker.initialization.qual.Initialized;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.UnknownKeyFor;
import org.joda.time.Duration;
import org.ohnlp.backbone.api.annotations.ComponentDescription;
import org.ohnlp.backbone.api.components.OneToOneTransform;
import org.ohnlp.backbone.api.exceptions.ComponentInitializationException;
import org.ohnlp.medtagger.lvg.LvgLookup;

@ComponentDescription(
name = "Get Dict Freqs",
desc = "Gets Frequency of Dictionary Terms (Useful for Cleaning Noise from Autogenerated Dictionary Entries)"
)
public class CleanMedTaggerDictOutputTransform extends OneToOneTransform {
private final Schema schema = Schema.of(
Schema.Field.of("matched_text", Schema.FieldType.STRING),
Schema.Field.of("freq", Schema.FieldType.INT64)
);

@Override
public Schema calculateOutputSchema(Schema schema) {
return this.schema;
}

@Override
public PCollection<Row> expand(PCollection<Row> input) {
return input.apply(Select.fieldNames("matched_text", "note_source_value")).apply(ParDo.of(
new DoFn<Row, KV<Row, String>>() {
private LvgLookup lvg;
@ProcessElement
public void process(ProcessContext pc) {
Row input = pc.element();
String text = input.getString("matched_text");
text = lvg.getNorm(text).replaceAll("\\s", "\t");
pc.output(KV.of(Row.withSchema(schema).addValues(text, 1L).build(), input.getString("note_source_value")));
}
@Setup
public void init() {
this.lvg = new LvgLookup();
lvg.localInitialize(CleanMedTaggerDictOutputTransform.class.getResourceAsStream("/medtaggerresources/lvg/LRAGR_2021AB"), CleanMedTaggerDictOutputTransform.class.getResourceAsStream("/medtaggerresources/lvg/openclasswords.txt"));
}
}
)).setCoder(KvCoder.of(RowCoder.of(this.schema), StringUtf8Coder.of())
).apply(Distinct.create()
).apply(Count.perKey()
).setCoder(KvCoder.of(RowCoder.of(this.schema), BigEndianLongCoder.of())
).apply(MapElements.via(new SimpleFunction<KV<Row, Long>, Row>() {
@Override
public Row apply(KV<Row, Long> input) {
return Row.withSchema(schema).addValues(input.getKey().getValue("matched_text"), input.getValue()).build();
}
}));
}

@Override
public void init() throws ComponentInitializationException {
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ public PCollection<Row> expand(PCollection<Row> input) {
}

private static class MedTaggerPipelineFunction extends DoFn<Row, Row> {
private transient static final ReentrantLock INIT_MUTEX_LOCK = new ReentrantLock();
// private transient static final ReentrantLock INIT_MUTEX_LOCK = new ReentrantLock();

private final String resourceFolder;
private final String textField;
Expand Down Expand Up @@ -168,7 +168,6 @@ public MedTaggerPipelineFunction(String textField, String resourceFolder, RunMod
@Setup
public void init() throws IOException, InvalidXMLException, URISyntaxException, ResourceInitializationException {
try {
INIT_MUTEX_LOCK.lock();
AggregateBuilder ae = new AggregateBuilder();
// Tokenization, Sentence Splitting, Section Detection, etc.
if (this.secTag.equalsIgnoreCase("DEFAULT")) {
Expand Down Expand Up @@ -288,7 +287,6 @@ public void init() throws IOException, InvalidXMLException, URISyntaxException,
this.cas = CasCreationUtils.createCas(Collections.singletonList(aae.getMetaData()),
null, resMgr);
} finally {
INIT_MUTEX_LOCK.unlock();
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
<fixedFlow>
<node>SentenceDetectorAE</node>
<node>TokenizerAE</node>
<node>ChunkerAE</node>
<node>POSTaggerAE</node>
<node>LineSentenceDetectorAE</node>
<node>LvgLookupAE</node>
Expand Down

0 comments on commit a1fc87c

Please sign in to comment.