Skip to content

Commit

Permalink
fix: various dates related fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed May 17, 2024
1 parent 7be3dd3 commit 9185b3d
Show file tree
Hide file tree
Showing 13 changed files with 241 additions and 104 deletions.
3 changes: 3 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,14 @@
- Trainable embedding components now all use `foldedtensor` to return embeddings, instead of returning a tensor of floats and a mask tensor.
- :boom: TorchComponent `__call__` no longer applies the end to end method, and instead calls the `forward` method directly, like all torch modules.
- The trainable `eds.span_qualifier` component has been renamed to `eds.span_classifier` to reflect its general purpose (it doesn't only predict qualifiers, but any attribute of a span using its context or not).
- `omop` converter now takes the `note_datetime` field into account by default when building a document
- `span._.date.to_datetime()` and `span._.date.to_duration()` now automatically take the `note_datetime` into account

### Fixed

- `edsnlp.data.read_json` now correctly reads the files from the directory passed as an argument, and not from the parent directory.
- Overwrite spacy's Doc, Span and Token pickling utils to allow recursively storing Doc, Span and Token objects in the extension values (in particular, span._.date.doc)
- Removed pendulum dependency, solving various pickling, multiprocessing and missing attributes errors

## v0.11.2

Expand Down
34 changes: 27 additions & 7 deletions docs/tutorials/multiple-texts.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ for doc in docs:
end=date.end_char,
label="date",
entity_text=date.text,
datetime=date._.date.datetime,
)
rows.append(d)
df = pd.DataFrame(rows)
Expand Down Expand Up @@ -160,7 +161,8 @@ def convert_doc_to_rows(doc):
begin=date.start_char,
end=date.end_char,
label="date",
entity_text=date.text,
lexical_variant=date.text,
datetime=date._.date.datetime,
)
entities.append(d)

Expand All @@ -172,7 +174,13 @@ df = docs.to_pandas(converter=convert_doc_to_rows)
df = docs.to_pandas(
converter="ents",
span_getter=["ents", "dates"],
span_attributes=["negation", "hypothesis", "family"],
span_attributes={
# span._.*** name: column name
"negation": "negation",
"hypothesis": "hypothesis",
"family": "family",
"date.datetime": "datetime",
},
)
```

Expand Down Expand Up @@ -263,10 +271,14 @@ note_nlp = docs.to_pandas(
# Below are the arguments to the converter
span_getter=["ents", "dates"],
span_attributes={ # (1)
# span._.*** name: column name
"negation": "negation",
"hypothesis": "hypothesis",
"family": "family",
"date.day": "date_day", # slugified name
"date.datetime": "datetime",
# having individual columns for each date part
# can be useful for incomplete dates (eg, "in May")
"date.day": "date_day",
"date.month": "date_month",
"date.year": "date_year",
},
Expand Down Expand Up @@ -308,9 +320,13 @@ note_nlp = docs.to_pandas(
"negation": "negation",
"hypothesis": "hypothesis",
"family": "family",
"date.day": "date_day", # slugify the extension name
"date.datetime": "datetime",
# having individual columns for each date part
# can be useful for incomplete dates (eg, "in May")
"date.day": "date_day",
"date.month": "date_month",
"date.year": "date_year"
"date.year": "date_year",
},
)
```
Expand All @@ -336,9 +352,13 @@ note_nlp = docs.to_spark(
"negation": "negation",
"hypothesis": "hypothesis",
"family": "family",
"date.day": "date_day", # slugify the extension name
"date.datetime": "datetime",
# having individual columns for each date part
# can be useful for incomplete dates (eg, "in May")
"date.day": "date_day",
"date.month": "date_month",
"date.year": "date_year"
"date.year": "date_year",
},
dtypes=None, # (1)
)
Expand Down
2 changes: 1 addition & 1 deletion edsnlp/data/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def __init__(
*,
tokenizer: Optional[PipelineProtocol] = None,
span_setter: SpanSetterArg = {"ents": True, "*": True},
doc_attributes: AttributesMappingArg = {},
doc_attributes: AttributesMappingArg = {"note_datetime": "note_datetime"},
span_attributes: Optional[AttributesMappingArg] = None,
default_attributes: AttributesMappingArg = {},
bool_attributes: SequenceStr = [],
Expand Down
37 changes: 20 additions & 17 deletions edsnlp/extensions.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,32 @@
import warnings
from datetime import date, datetime

import pendulum
from dateutil.parser import parse as parse_date
from spacy.tokens import Doc

if not Doc.has_extension("note_id"):
Doc.set_extension("note_id", default=None)


def set_note_datetime(span, dt):
if type(dt) is datetime:
dt = pendulum.instance(dt)
elif isinstance(dt, pendulum.DateTime):
def set_note_datetime(doc, dt):
try:
if type(dt) is datetime:
pass
elif isinstance(dt, str):
dt = parse_date(dt)
elif isinstance(dt, (int, float)):
dt = datetime.fromtimestamp(dt)
elif isinstance(dt, date):
dt = datetime(dt.year, dt.month, dt.day)
elif dt is None:
pass
key = doc._._get_key("note_datetime")
doc.doc.user_data[key] = dt
return
except Exception:
pass
elif isinstance(dt, str):
dt = pendulum.parse(dt)
elif isinstance(dt, (int, float)):
dt = pendulum.from_timestamp(dt)
elif isinstance(dt, date):
dt = pendulum.instance(datetime.fromordinal(dt.toordinal()))
elif dt is None:
pass
else:
raise ValueError(f"Cannot cast {dt} as a datetime")
key = span._._get_key("note_datetime")
span.doc.user_data[key] = dt

warnings.warn(f"Cannot cast {dt} as a note datetime", UserWarning)


def get_note_datetime(doc):
Expand Down
2 changes: 1 addition & 1 deletion edsnlp/pipes/misc/consultation_dates/consultation_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class ConsultationDatesMatcher(GenericMatcher):
# Out: [Consultation du 03/10/2018]
doc.spans["consultation_dates"][0]._.consultation_date.to_datetime()
# Out: DateTime(2018, 10, 3, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
# Out: DateTime(2018, 10, 3, 0, 0, 0)
```
Extensions
Expand Down
30 changes: 26 additions & 4 deletions edsnlp/pipes/misc/dates/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ class DatesMatcher(BaseNERComponent):
```python
import edsnlp, edsnlp.pipes as eds
import pendulum
import datetime
import pytz
nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.dates())
Expand All @@ -70,9 +70,10 @@ class DatesMatcher(BaseNERComponent):
dates[1]._.date.to_datetime()
# Out: None
note_datetime = pendulum.datetime(2021, 8, 27, tz="Europe/Paris")
note_datetime = datetime.datetime(2021, 8, 27, tzinfo=pytz.timezone("Europe/Paris"))
doc._.note_datetime = note_datetime
dates[1]._.date.to_datetime(note_datetime=note_datetime)
dates[1]._.date.to_datetime()
# Out: 2020-08-27T00:00:00+02:00
date_2_output = dates[2]._.date.to_datetime(
Expand All @@ -88,6 +89,26 @@ class DatesMatcher(BaseNERComponent):
# Out: [pendant une semaine]
```
Example on a collection of documents stored in the OMOP schema:
```{ .python .no-check }
import edsnlp, edsnlp.pipes as eds
# with cols "note_id", "note_text" and optionally "note_datetime"
my_omop_df = ...
nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.dates(as_ents=True))
docs = edsnlp.data.from_pandas(my_omop_df)
docs = docs.map_pipeline(nlp)
docs = docs.to_pandas(
converter="ents",
span_attributes={"date.datetime": "datetime"},
)
print(docs)
# note_id start end label lexical_variant span_type datetime
# ...
```
Extensions
----------
The `eds.dates` pipeline declares two extensions on the `Span` object:
Expand Down Expand Up @@ -371,6 +392,7 @@ def parse(
if v is not None and "_" in k:
key, value = k.split("_")
date_cfg.update({key: value})
date_cfg["doc"] = span.doc
if span.label_ == "relative":
parsed = RelativeDate.parse_obj(date_cfg)
span.label_ = self.date_label
Expand Down
Loading

0 comments on commit 9185b3d

Please sign in to comment.