From a8f75676ebd4e947bef3d4cb3c88014a5ba659de Mon Sep 17 00:00:00 2001 From: antoinefa Date: Tue, 7 May 2024 21:52:06 -0400 Subject: [PATCH 1/2] fix side effect of the bug correction on NUM_DECES - permalink --- projects/deces-dataprep/datasets/deces_src.yml | 3 +-- projects/deces-dataprep/recipes/deces_dataprep.yml | 7 +++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/projects/deces-dataprep/datasets/deces_src.yml b/projects/deces-dataprep/datasets/deces_src.yml index 22d9bf0..e813c08 100644 --- a/projects/deces-dataprep/datasets/deces_src.yml +++ b/projects/deces-dataprep/datasets/deces_src.yml @@ -8,7 +8,7 @@ datasets: compression: gzip type: fwf encoding: latin1 - widths: [80, 1, 8, 5, 30, 30, 8, 5, 9, 1] + widths: [80, 1, 8, 5, 30, 30, 8, 5, 10] names: - NOM_PRENOMS - SEXE @@ -19,4 +19,3 @@ datasets: - DATE_DECES - CODE_INSEE_DECES - NUM_DECES - - END_OF_LINE diff --git a/projects/deces-dataprep/recipes/deces_dataprep.yml b/projects/deces-dataprep/recipes/deces_dataprep.yml index 2a137a8..87061be 100644 --- a/projects/deces-dataprep/recipes/deces_dataprep.yml +++ b/projects/deces-dataprep/recipes/deces_dataprep.yml @@ -7,8 +7,6 @@ recipes: dataset: deces_index threads: !ENV ${RECIPE_THREADS} steps: - - delete: - select: END_OF_LINE - normalize: select: (NOM_PRENOMS|COMMUNE_NAISSANCE|PAYS_NAISSANCE|NUM_DECES)$ - exec: @@ -17,6 +15,11 @@ recipes: - df['_id'] = df['UID'] - df['SOURCE'] = str(desc['source']['name']) - df['SOURCE_LINE'] = 1+df.index + - replace: + select: NUM_DECES + regex: + - '(.{9}).*': '\1' + - '\s*$': '' - replace: select: SOURCE regex: From 3a6ade915cee34694d7d215034845b19fff8bada Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sat, 20 Jul 2024 19:48:00 -0400 Subject: [PATCH 2/2] Fix: The support for using Y (years) and M (months) in np.timedelta64 was removed in newer versions of NumPy because these units are ambiguous - use seconds instead of years --- projects/deces-dataprep/recipes/deces_dataprep.yml | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/deces-dataprep/recipes/deces_dataprep.yml b/projects/deces-dataprep/recipes/deces_dataprep.yml index 87061be..4447f81 100644 --- a/projects/deces-dataprep/recipes/deces_dataprep.yml +++ b/projects/deces-dataprep/recipes/deces_dataprep.yml @@ -96,8 +96,9 @@ recipes: select: DATE.*NORM format: "%Y%m%d" - exec: + # Year = 365.25 * 24 * 60 * 60 seconds - df['AGE_DECES'] = np.where(df['DATE_DECES'] > df['DATE_NAISSANCE'], - (df['DATE_DECES_NORM'] - df['DATE_NAISSANCE_NORM']).astype('