Skip to content

Commit

Permalink
fix: various dates related fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed May 17, 2024
1 parent 7be3dd3 commit 9185b3d
Show file tree
Hide file tree
Showing 13 changed files with 241 additions and 104 deletions.
3 changes: 3 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,14 @@
- Trainable embedding components now all use `foldedtensor` to return embeddings, instead of returning a tensor of floats and a mask tensor.
- :boom: TorchComponent `__call__` no longer applies the end to end method, and instead calls the `forward` method directly, like all torch modules.
- The trainable `eds.span_qualifier` component has been renamed to `eds.span_classifier` to reflect its general purpose (it doesn't only predict qualifiers, but any attribute of a span using its context or not).
- `omop` converter now takes the `note_datetime` field into account by default when building a document
- `span._.date.to_datetime()` and `span._.date.to_duration()` now automatically take the `note_datetime` into account

### Fixed

- `edsnlp.data.read_json` now correctly reads the files from the directory passed as an argument, and not from the parent directory.
- Overwrite spacy's Doc, Span and Token pickling utils to allow recursively storing Doc, Span and Token objects in the extension values (in particular, span._.date.doc)
- Removed pendulum dependency, solving various pickling, multiprocessing and missing attributes errors

## v0.11.2

Expand Down
34 changes: 27 additions & 7 deletions docs/tutorials/multiple-texts.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ for doc in docs:
end=date.end_char,
label="date",
entity_text=date.text,
datetime=date._.date.datetime,
)
rows.append(d)
df = pd.DataFrame(rows)
Expand Down Expand Up @@ -160,7 +161,8 @@ def convert_doc_to_rows(doc):
begin=date.start_char,
end=date.end_char,
label="date",
entity_text=date.text,
lexical_variant=date.text,
datetime=date._.date.datetime,
)
entities.append(d)

Expand All @@ -172,7 +174,13 @@ df = docs.to_pandas(converter=convert_doc_to_rows)
df = docs.to_pandas(
converter="ents",
span_getter=["ents", "dates"],
span_attributes=["negation", "hypothesis", "family"],
span_attributes={
# span._.*** name: column name
"negation": "negation",
"hypothesis": "hypothesis",
"family": "family",
"date.datetime": "datetime",
},
)
```

Expand Down Expand Up @@ -263,10 +271,14 @@ note_nlp = docs.to_pandas(
# Below are the arguments to the converter
span_getter=["ents", "dates"],
span_attributes={ # (1)
# span._.*** name: column name
"negation": "negation",
"hypothesis": "hypothesis",
"family": "family",
"date.day": "date_day", # slugified name
"date.datetime": "datetime",
# having individual columns for each date part
# can be useful for incomplete dates (eg, "in May")
"date.day": "date_day",
"date.month": "date_month",
"date.year": "date_year",
},
Expand Down Expand Up @@ -308,9 +320,13 @@ note_nlp = docs.to_pandas(
"negation": "negation",
"hypothesis": "hypothesis",
"family": "family",
"date.day": "date_day", # slugify the extension name
"date.datetime": "datetime",
# having individual columns for each date part
# can be useful for incomplete dates (eg, "in May")
"date.day": "date_day",
"date.month": "date_month",
"date.year": "date_year"
"date.year": "date_year",
},
)
```
Expand All @@ -336,9 +352,13 @@ note_nlp = docs.to_spark(
"negation": "negation",
"hypothesis": "hypothesis",
"family": "family",
"date.day": "date_day", # slugify the extension name
"date.datetime": "datetime",
# having individual columns for each date part
# can be useful for incomplete dates (eg, "in May")
"date.day": "date_day",
"date.month": "date_month",
"date.year": "date_year"
"date.year": "date_year",
},
dtypes=None, # (1)
)
Expand Down
2 changes: 1 addition & 1 deletion edsnlp/data/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def __init__(
*,
tokenizer: Optional[PipelineProtocol] = None,
span_setter: SpanSetterArg = {"ents": True, "*": True},
doc_attributes: AttributesMappingArg = {},
doc_attributes: AttributesMappingArg = {"note_datetime": "note_datetime"},
span_attributes: Optional[AttributesMappingArg] = None,
default_attributes: AttributesMappingArg = {},
bool_attributes: SequenceStr = [],
Expand Down
37 changes: 20 additions & 17 deletions edsnlp/extensions.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,32 @@
import warnings
from datetime import date, datetime

import pendulum
from dateutil.parser import parse as parse_date
from spacy.tokens import Doc

if not Doc.has_extension("note_id"):
Doc.set_extension("note_id", default=None)


def set_note_datetime(span, dt):
if type(dt) is datetime:
dt = pendulum.instance(dt)
elif isinstance(dt, pendulum.DateTime):
def set_note_datetime(doc, dt):
try:
if type(dt) is datetime:
pass
elif isinstance(dt, str):
dt = parse_date(dt)
elif isinstance(dt, (int, float)):
dt = datetime.fromtimestamp(dt)
elif isinstance(dt, date):
dt = datetime(dt.year, dt.month, dt.day)
elif dt is None:
pass
key = doc._._get_key("note_datetime")
doc.doc.user_data[key] = dt
return
except Exception:
pass
elif isinstance(dt, str):
dt = pendulum.parse(dt)
elif isinstance(dt, (int, float)):
dt = pendulum.from_timestamp(dt)
elif isinstance(dt, date):
dt = pendulum.instance(datetime.fromordinal(dt.toordinal()))
elif dt is None:
pass
else:
raise ValueError(f"Cannot cast {dt} as a datetime")
key = span._._get_key("note_datetime")
span.doc.user_data[key] = dt

warnings.warn(f"Cannot cast {dt} as a note datetime", UserWarning)


def get_note_datetime(doc):
Expand Down
2 changes: 1 addition & 1 deletion edsnlp/pipes/misc/consultation_dates/consultation_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class ConsultationDatesMatcher(GenericMatcher):
# Out: [Consultation du 03/10/2018]
doc.spans["consultation_dates"][0]._.consultation_date.to_datetime()
# Out: DateTime(2018, 10, 3, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
# Out: DateTime(2018, 10, 3, 0, 0, 0)
```
Extensions
Expand Down
30 changes: 26 additions & 4 deletions edsnlp/pipes/misc/dates/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ class DatesMatcher(BaseNERComponent):
```python
import edsnlp, edsnlp.pipes as eds
import pendulum
import datetime
import pytz
nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.dates())
Expand All @@ -70,9 +70,10 @@ class DatesMatcher(BaseNERComponent):
dates[1]._.date.to_datetime()
# Out: None
note_datetime = pendulum.datetime(2021, 8, 27, tz="Europe/Paris")
note_datetime = datetime.datetime(2021, 8, 27, tzinfo=pytz.timezone("Europe/Paris"))
doc._.note_datetime = note_datetime
dates[1]._.date.to_datetime(note_datetime=note_datetime)
dates[1]._.date.to_datetime()
# Out: 2020-08-27T00:00:00+02:00
date_2_output = dates[2]._.date.to_datetime(
Expand All @@ -88,6 +89,26 @@ class DatesMatcher(BaseNERComponent):
# Out: [pendant une semaine]
```
Example on a collection of documents stored in the OMOP schema:
```{ .python .no-check }
import edsnlp, edsnlp.pipes as eds
# with cols "note_id", "note_text" and optionally "note_datetime"
my_omop_df = ...
nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.dates(as_ents=True))
docs = edsnlp.data.from_pandas(my_omop_df)
docs = docs.map_pipeline(nlp)
docs = docs.to_pandas(
converter="ents",
span_attributes={"date.datetime": "datetime"},
)
print(docs)
# note_id start end label lexical_variant span_type datetime
# ...
```
Extensions
----------
The `eds.dates` pipeline declares two extensions on the `Span` object:
Expand Down Expand Up @@ -371,6 +392,7 @@ def parse(
if v is not None and "_" in k:
key, value = k.split("_")
date_cfg.update({key: value})
date_cfg["doc"] = span.doc
if span.label_ == "relative":
parsed = RelativeDate.parse_obj(date_cfg)
span.label_ = self.date_label
Expand Down
Loading

0 comments on commit 9185b3d

Please sign in to comment.