Skip to content

Commit

Permalink
fix: handle eds.measurements on many docs better + doc
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed Jul 19, 2024
1 parent 667dcab commit 32ecb89
Showing 1 changed file with 140 additions and 112 deletions.
252 changes: 140 additions & 112 deletions edsnlp/pipes/misc/measurements/measurements.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,10 @@ def convert_to(self, other_unit):
return self.value

def __getattr__(self, other_unit):
return self.convert_to(other_unit)
try:
return self.convert_to(other_unit)
except KeyError:
raise AttributeError(f"Unit {other_unit} not found")

@classmethod
def verify(cls, ent):
Expand Down Expand Up @@ -346,6 +349,7 @@ class MeasurementsMatcher(BaseNERComponent):
1. 100-110mg, 2 à 4 jours ...
2. If True `eds.tables` must be called
3. Which measurements are available ? See [Availability](#availability)
Scope
-----
The `eds.measurements` matcher can extract simple (e.g. `3cm`) measurements.
Expand Down Expand Up @@ -431,6 +435,83 @@ class MeasurementsMatcher(BaseNERComponent):
# Out: [178.0, 0.12, 0.24, 1.25]
```
To extract the measurements from many texts, you can use the following snippet:
```python
import edsnlp, edsnlp.pipes as eds
nlp = edsnlp.blank("eds")
nlp.add_pipe(
eds.measurements(measurements="weight", extract_ranges=True, as_ents=True),
)
texts = ["Le patient mesure 40000,0 g (aussi noté 40 kg)"]
docs = edsnlp.data.from_iterable(texts)
docs = docs.map_pipeline(nlp)
docs.to_pandas(
converter="ents",
span_attributes={"value.unit": "original_unit", "value.kg": "kg"},
)
# note_id start end label lexical_variant span_type original_unit kg
# 0 None 18 27 weight 40000,0 g ents g 40.0
# 1 None 40 45 weight 40 kg ents kg 40.0
```
Available units and measurements
--------------------------------
Feel free to propose any missing raw unit or predefined measurement.
Raw units and their derivations (g, mg, mgr ...) and their
compositions (g/ml, cac/j ...) can be detected.
__Available raw units :__
`g, m, m2, m3, mol, ui, Pa, %, log, mmHg, s/min/h/d/w/m/y,
arc-second, °, °C, cac, goutte, l, x10*4, x10*5`
__Available predefined measurements :__
| measurement_name | Example |
|------------------|------------------------|
| `size` | `1m50`, `1.50m`... |
| `weight` | `1kg`, `Poids : 65`... |
| `bmi` | `BMI: 24`, `24 kg.m-2` |
| `volume` | `2 cac`, `8ml`... |
See the [patterns](https://github.com/aphp/edsnlp/blob/master/edsnlp/pipes/misc/measurements/patterns.py)
for exhaustive definition.
Customization
-------------
You can declare custom measurements by altering the patterns:
```python
import edsnlp, edsnlp.pipes as eds
nlp = edsnlp.blank("eds")
nlp.add_pipe(
eds.measurements(
measurements={
"my_custom_surface_measurement": {
# This measurement unit is homogenous to square meters
"unit": "m2",
# Handle cases like "surface: 1.8" (implied m2),
# vs "surface: 50" (implied cm2)
"unitless_patterns": [
{
"terms": ["surface", "aire"],
"ranges": [
{"unit": "m2", "min": 0, "max": 9},
{"unit": "cm2", "min": 10, "max": 100},
],
}
],
},
}
),
)
```
Extensions
----------
The `eds.measurements` pipeline declares its extensions dynamically, depending
Expand Down Expand Up @@ -502,94 +583,39 @@ class MeasurementsMatcher(BaseNERComponent):
instead, and assign all the parsed information (`._.date` / `._.duration`)
to it. Otherwise, don't return the date.
Availability
------------
Feel free to propose any missing raw unit or predefined measurement.
Raw units and their derivations (g, mg, mgr ...) and their
compositions (g/ml, cac/j ...) can be detected.
__Available raw units :__
`g, m, m2, m3, mol, ui, Pa, %, log, mmHg, s/min/h/d/w/m/y,
arc-second, °, °C, cac, goutte, l, x10*4, x10*5`
__Available predefined measurements :__
| measurement_name | Example |
|------------------|------------------------|
| `size` | `1m50`, `1.50m`... |
| `weight` | `1kg`, `Poids : 65`... |
| `bmi` | `BMI: 24`, `24 kg.m-2` |
| `volume` | `2 cac`, `8ml`... |
See `edsnlp.pipes.misc.measurements.patterns` for exhaustive definition.
Customization
-------------
You can declare custom measurements by altering the patterns:
```python
import edsnlp, edsnlp.pipes as eds
nlp = edsnlp.blank("eds")
nlp.add_pipe(
eds.measurements(
measurements={
"my_custom_surface_measurement": {
# This measurement unit is homogenous to square meters
"unit": "m2",
# Handle cases like "surface: 1.8" (implied m2),
# vs "surface: 50" (implied cm2)
"unitless_patterns": [
{
"terms": ["surface", "aire"],
"ranges": [
{"unit": "m2", "min": 0, "max": 9},
{"unit": "cm2", "min": 10, "max": 100},
],
}
],
},
}
),
)
```
Authors and citation
--------------------
The `eds.measurements` pipeline was developed by AP-HP's Data Science team.
'''
''' # noqa: E501

# fmt: off
def __init__(
self,
nlp: PipelineProtocol,
name: str = "measurements",
*,
measurements: Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] = list(patterns.common_measurements.keys()), # noqa: E501
units_config: Dict[str, UnitConfig] = patterns.units_config,
number_terms: Dict[str, List[str]] = patterns.number_terms,
number_regex: str = patterns.number_regex,
stopwords: List[str] = patterns.stopwords,
unit_divisors: List[str] = patterns.unit_divisors,
ignore_excluded: bool = True,
compose_units: bool = True,
attr: str = "NORM",
extract_ranges: bool = False,
range_patterns: List[Tuple[Optional[str], Optional[str]]] = patterns.range_patterns, # noqa: E501
after_snippet_limit: int = 6,
before_snippet_limit: int = 10,
span_getter: Optional[SpanGetterArg] = None,
merge_mode: Literal["intersect", "align"] = "intersect",
as_ents: bool = False,
span_setter: Optional[SpanSetterArg] = None,
use_tables : bool = True
self,
nlp: PipelineProtocol,
name: str = "measurements",
*,
measurements: Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] = list(patterns.common_measurements.keys()), # noqa: E501
units_config: Dict[str, UnitConfig] = patterns.units_config,
number_terms: Dict[str, List[str]] = patterns.number_terms,
number_regex: str = patterns.number_regex,
stopwords: List[str] = patterns.stopwords,
unit_divisors: List[str] = patterns.unit_divisors,
ignore_excluded: bool = True,
compose_units: bool = True,
attr: str = "NORM",
extract_ranges: bool = False,
range_patterns: List[Tuple[Optional[str], Optional[str]]] = patterns.range_patterns, # noqa: E501
after_snippet_limit: int = 6,
before_snippet_limit: int = 10,
span_getter: Optional[SpanGetterArg] = None,
merge_mode: Literal["intersect", "align"] = "intersect",
as_ents: bool = False,
span_setter: Optional[SpanSetterArg] = None,
use_tables: bool = True
):

self.use_tables = use_tables and (
"eds.tables" in nlp.pipe_names or "tables" in nlp.pipe_names
"eds.tables" in nlp.pipe_names or "tables" in nlp.pipe_names
)
if use_tables and not self.use_tables:
logger.warning(
Expand Down Expand Up @@ -688,7 +714,7 @@ def __init__(
self.unitless_patterns[pattern_name] = {"name": name, **pattern}

# NUMBER PATTERNS
self.regex_matcher.add("number",[number_regex])
self.regex_matcher.add("number", [number_regex])
self.number_label_hashes = {nlp.vocab.strings["number"]}
for number, terms in number_terms.items():
self.term_matcher.build_patterns(nlp, {number: terms})
Expand Down Expand Up @@ -746,15 +772,15 @@ def extract_units(self, term_matches: Iterable[Span]) -> Iterable[Span]:
if unit_part.label not in self.unit_part_label_hashes:
continue
if last is not None and (
(
unit_part.doc[last.end: unit_part.start].text.strip() != ""
and len(current)
)
or (
not self.compose_units
and len(current)
and current[-1].label_ != "per"
)
(
unit_part.doc[last.end: unit_part.start].text.strip() != ""
and len(current)
)
or (
not self.compose_units
and len(current)
and current[-1].label_ != "per"
)
):
doc = current[0].doc
# Last non "per" match: we don't want our units to be like `g_per`
Expand Down Expand Up @@ -784,10 +810,10 @@ def extract_units(self, term_matches: Iterable[Span]) -> Iterable[Span]:

@classmethod
def make_pseudo_sentence(
cls,
doclike: Union[Doc, Span],
matches: List[Tuple[Span, bool]],
pseudo_mapping: Dict[int, str],
cls,
doclike: Union[Doc, Span],
matches: List[Tuple[Span, bool]],
pseudo_mapping: Dict[int, str],
) -> Tuple[str, List[int]]:
"""
Creates a pseudo sentence (one letter per entity)
Expand Down Expand Up @@ -816,8 +842,8 @@ def make_pseudo_sentence(
offsets = []
for ent, is_sent_split in matches:
if (
ent.start != last
and not doclike.doc[last: ent.start].text.strip() == ""
ent.start != last
and not doclike.doc[last: ent.start].text.strip() == ""
):
pseudo.append("w")
offsets.append(len(pseudo))
Expand Down Expand Up @@ -909,6 +935,7 @@ def extract_measurements(self, doclike: Doc):
table_matches = list(doc.spans["tables"])

matches, unit_label_hashes = self.get_matches(doclike)

# Make match slice function to query them
def get_matches_after(i):
anchor = matches[i][0]
Expand Down Expand Up @@ -986,7 +1013,7 @@ def get_matches_before(i):
pass

# Check if number is in table with a unit in the same row
if unit_norm is None :
if unit_norm is None:
for table in table_matches:
if (number.start >= table.start) and (number.end <= table.end):
table_pd = table._.to_pd_table(as_spans=True)
Expand All @@ -998,6 +1025,7 @@ def get_matches_before(i):
if item is not None), None)
if start_line is None:
continue

def is_within_row(x):
return (x.start >= start_line) and (x.end <= end_line)

Expand Down Expand Up @@ -1041,8 +1069,8 @@ def is_within_row(x):
)
unit_norm = None
if re.fullmatch(
r"[,:n]*",
pseudo[offsets[unitless_idx] + 1: offsets[number_idx]],
r"[,:n]*",
pseudo[offsets[unitless_idx] + 1: offsets[number_idx]],
):
unitless_pattern = self.unitless_patterns[unitless_text.label_]
unit_norm = next(
Expand All @@ -1062,15 +1090,15 @@ def is_within_row(x):
# TODO: handle this part better without .text.strip(), with cases for
# stopwords, etc
if (
unit_text
and number.start <= unit_text.end
and doc[number.end: unit_text.start].text.strip() == ""
unit_text
and number.start <= unit_text.end
and doc[number.end: unit_text.start].text.strip() == ""
):
ent = doc[number.start: unit_text.end]
elif (
unit_text
and unit_text.start <= number.end
and doc[unit_text.end: number.start].text.strip() == ""
unit_text
and unit_text.start <= number.end
and doc[unit_text.end: number.start].text.strip() == ""
):
ent = doc[unit_text.start: number.end]
else:
Expand Down Expand Up @@ -1109,10 +1137,10 @@ def is_within_row(x):
unmatched = []
for idx, (match, _) in enumerate(matches):
if (
match.label in unit_label_hashes
and idx not in matched_unit_indices
or match.label in self.number_label_hashes
and idx not in matched_number_indices
match.label in unit_label_hashes
and idx not in matched_unit_indices
or match.label in self.number_label_hashes
and idx not in matched_number_indices
):
unmatched.append(match)

Expand Down Expand Up @@ -1200,9 +1228,9 @@ def merge_measurements_in_ranges(self, measurements: List[Span]) -> List[Span]:
return merged

def merge_with_existing(
self,
extracted: List[Span],
existing: List[Span],
self,
extracted: List[Span],
existing: List[Span],
) -> List[Span]:
"""
Merges the extracted measurements with the existing measurements in the
Expand Down

0 comments on commit 32ecb89

Please sign in to comment.