Skip to content

Commit

Permalink
Support unicode characters by replacing unidecode with new normalize …
Browse files Browse the repository at this point in the history
…method (#4484)

* replace unidecode with new normalize method

* change to use NFC not NFD...

* fix tests by removing unicode in excel test files

* this test file also had a unicode character in address line 1

* first test with unicode data

* unicode test

* cast guillemets to single and double quotes

---------

Co-authored-by: Alex Swindler <[email protected]>
  • Loading branch information
nllong and axelstudios authored Jan 27, 2024
1 parent ff1ce82 commit c8a8e06
Show file tree
Hide file tree
Showing 15 changed files with 255 additions and 17 deletions.
10 changes: 9 additions & 1 deletion .cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@
"graphviz",
"greenbutton",
"gte",
"guillemets",
"Gunter",
"Gzip",
"hardcoded",
Expand Down Expand Up @@ -206,6 +207,7 @@
"lookup",
"lte",
"lxml",
"malcriado",
"MapItem",
"mappable",
"mapquest",
Expand All @@ -225,6 +227,7 @@
"MyBrowser",
"MyModel",
"namespace",
"Ñaño",
"natively",
"netcat",
"newdomain",
Expand Down Expand Up @@ -307,6 +310,7 @@
"runtime",
"salesforce",
"scalable",
"schön",
"seeddb",
"seedorg",
"seedpass",
Expand Down Expand Up @@ -371,6 +375,7 @@
"tsts",
"tuples",
"typechecking",
"über",
"ubid",
"UBID",
"ubidmodel",
Expand Down Expand Up @@ -416,6 +421,7 @@
"webdriver",
"webserver",
"webservers",
"Welstone",
"whitelist",
"wildcards",
"workflow",
Expand All @@ -432,7 +438,9 @@
"xpaths",
"XSLX",
"yasg",
"yml"
"yml",
"اضافية",
"بيانات"
],
"flagWords": [
"hte"
Expand Down
1 change: 0 additions & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ Markdown==3.1.1
polling==0.3.2
pyyaml==6.0.1
street-address==0.4.0
unidecode==1.1.1
xlrd==1.2.0
xlsxwriter==1.2.7
xmltodict==0.12.0
Expand Down
9 changes: 5 additions & 4 deletions seed/data_importer/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
from django.utils import timezone as tz
from django.utils.timezone import make_naive
from past.builtins import basestring
from unidecode import unidecode

from seed.building_sync import validation_client
from seed.building_sync.building_sync import BuildingSync
Expand All @@ -50,6 +49,7 @@
from seed.data_importer.sensor_readings_parser import SensorsReadingsParser
from seed.data_importer.utils import usage_point_id
from seed.lib.mcm import cleaners, mapper, reader
from seed.lib.mcm.cleaners import normalize_unicode_and_characters
from seed.lib.mcm.mapper import expand_rows
from seed.lib.mcm.utils import batch
from seed.lib.progress_data.progress_data import ProgressData
Expand Down Expand Up @@ -741,7 +741,7 @@ def _save_raw_data_chunk(chunk, file_pk, progress_key):
elif key == "_source_filename": # grab source filename (for BSync)
source_filename = v
elif isinstance(v, basestring):
new_chunk[key] = unidecode(v)
new_chunk[key] = normalize_unicode_and_characters(v)
elif isinstance(v, (datetime, date)):
raise TypeError(
"Datetime class not supported in Extra Data. Needs to be a string.")
Expand Down Expand Up @@ -1559,9 +1559,10 @@ def add_dictionary_repr_to_hash(hash_obj, dict_obj):
if isinstance(value, dict):
add_dictionary_repr_to_hash(hash_obj, value)
else:
hash_obj.update(str(unidecode(key)).encode('utf-8'))
# TODO: Do we need to normalize_unicode_and_characters (formerly unidecode) here?
hash_obj.update(str(normalize_unicode_and_characters(key)).encode('utf-8'))
if isinstance(value, basestring):
hash_obj.update(unidecode(value).encode('utf-8'))
hash_obj.update(normalize_unicode_and_characters(value).encode('utf-8'))
else:
hash_obj.update(str(value).encode('utf-8'))
return hash_obj
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified seed/data_importer/tests/data/example-data-properties.xlsx
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ def test_hash_quantity_unicode(self):
data_state=DATA_STATE_IMPORT,
import_file_id=0,
)
self.assertEqual(ps1.hash_object, ps2.hash_object)
# Now that we support unicode in the fields, the hashes should no longer be
# the same. # TODO: Should we strip all unicode characters in extra data fields?
self.assertNotEqual(ps1.hash_object, ps2.hash_object)

def test_hash_release_date(self):
"""The hash_state_object method makes the timezones naive, so this should work because
Expand Down
2 changes: 1 addition & 1 deletion seed/data_importer/tests/test_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_mapping(self):
# for p in props:
# pp(p)

def test_remapping_with_and_without_unit_aware_columns_doesnt_lose_data(self):
def test_remapping_with_and_without_unit_aware_columns_does_not_lose_data(self):
"""
During import, when the initial -State objects are created from the extra_data values,
ColumnMapping objects are used to take the extra_data dictionary values and create the
Expand Down
156 changes: 156 additions & 0 deletions seed/data_importer/tests/test_match_unicode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@

# !/usr/bin/env python
# encoding: utf-8
"""
SEED Platform (TM), Copyright (c) Alliance for Sustainable Energy, LLC, and other contributors.
See also https://github.com/seed-platform/seed/main/LICENSE.md
"""
import logging
import os.path as osp
import pathlib

from django.core.files.uploadedfile import SimpleUploadedFile

from seed.data_importer import tasks
from seed.data_importer.tests.util import FAKE_MAPPINGS
from seed.lib.mcm.cleaners import normalize_unicode_and_characters
from seed.models import (
ASSESSED_RAW,
DATA_STATE_MAPPING,
Column,
PropertyState,
PropertyView
)
from seed.test_helpers.fake import (
FakePropertyStateFactory,
FakeTaxLotStateFactory
)
from seed.tests.util import DataMappingBaseTestCase

logger = logging.getLogger(__name__)


class TestUnicodeNormalization(DataMappingBaseTestCase):
    """Unit tests for the `normalize_unicode_and_characters` helper that replaced
    unidecode, exercising guillemets, pass-through text, dashes, and combining marks."""

    def test_unicode_normalization(self):
        """Test a few cases. The unicodedata.normalize('NFC', text) method combines
        the letter and diacritics, which seems to provide the best compatibility."""
        # Guillemets: « and » are cast to plain double quotes; accents are preserved.
        unicode_text = "Café «Déjà Vu»"
        expected_out = "Café \"Déjà Vu\""
        normalized_text = normalize_unicode_and_characters(unicode_text)
        self.assertEqual(normalized_text, expected_out)

        # Arabic text passes straight through unchanged (no diacritics to combine).
        unicode_text = "شكرا لك"
        normalized_text = normalize_unicode_and_characters(unicode_text)
        self.assertEqual(normalized_text, unicode_text)

        # En dash becomes `-` and em dash becomes `--`; umlauts are preserved.
        unicode_text = "– über schön! —"
        expected_out = "- über schön! --"
        normalized_text = normalize_unicode_and_characters(unicode_text)
        self.assertEqual(normalized_text, expected_out)

        # \u004E\u0303 is Ñ (N + combining tilde) and the normalization converts it
        # to a single precomposed code point. The already-composed ñ stays as-is.
        unicode_text = "\u004E\u0303a\u006E\u0303o malcriado"
        expected_out = "Ñaño malcriado"
        normalized_text = normalize_unicode_and_characters(unicode_text)
        self.assertEqual(normalized_text, expected_out)


class TestUnicodeImport(DataMappingBaseTestCase):
    """End-to-end import test using a spreadsheet fixture that contains unicode
    data in both mapped fields and extra-data column headers."""

    def setUp(self):
        # Subclasses can set `self.filename` before setUp runs to reuse this fixture
        # with a different spreadsheet.
        filename = getattr(self, 'filename', 'example-data-properties-unicode.xlsx')
        import_file_source_type = ASSESSED_RAW
        self.fake_mappings = FAKE_MAPPINGS['unicode']
        selfvars = self.set_up(import_file_source_type)
        self.user, self.org, self.import_file, self.import_record, self.cycle = selfvars
        # Load the fixture bytes into the ImportFile record so the import tasks
        # read the same content a user upload would provide.
        filepath = osp.join(osp.dirname(__file__), 'data', filename)
        self.import_file.file = SimpleUploadedFile(
            name=filename,
            content=pathlib.Path(filepath).read_bytes()
        )
        self.import_file.save()

    def test_unicode_import(self):
        """Test that unicode characters are imported correctly"""
        # Run the raw-save -> column-mapping -> map pipeline on the fixture.
        tasks.save_raw_data(self.import_file.pk)
        Column.create_mappings(self.fake_mappings, self.org, self.user, self.import_file.pk)
        tasks.map_data(self.import_file.pk)

        # Check to make sure all the properties imported
        ps = PropertyState.objects.filter(
            data_state=DATA_STATE_MAPPING,
            organization=self.org,
            import_file=self.import_file,
        )
        self.assertEqual(len(ps), 3)

        # check that the property has the unicode characters
        ps = PropertyState.objects.filter(
            data_state=DATA_STATE_MAPPING,
            organization=self.org,
            import_file=self.import_file,
            custom_id_1='unicode-1',
        )[0]
        self.assertEqual(ps.property_name, 'Déjà vu Café')
        # check if there is an extra data key with unicode (Arabic column header
        # from the fixture should survive the import unmangled)
        self.assertEqual('بيانات اضافية' in ps.extra_data, True)

        # check that we can query on unicode character (including an emoji)
        ps = PropertyState.objects.filter(
            data_state=DATA_STATE_MAPPING,
            organization=self.org,
            import_file=self.import_file,
            property_name='🏦 Bank',
        )[0]
        self.assertIsNotNone(ps)

        # Run geocoding/matching and verify the unicode record produced exactly
        # one canonical view whose name still carries its accents.
        tasks.geocode_and_match_buildings_task(self.import_file.id)

        qry = PropertyView.objects.filter(state__custom_id_1='unicode-1')
        self.assertEqual(qry.count(), 1)
        state = qry.first().state

        self.assertEqual(state.property_name, "Déjà vu Café")


class TestUnicodeMatching(DataMappingBaseTestCase):
    """Test the matching of two properties with unicode characters
    and changing one of the matching criteria with a unicode character and
    having it fail."""

    def setUp(self):
        selfvars = self.set_up(ASSESSED_RAW)
        self.user, self.org, self.import_file_1, self.import_record_1, self.cycle_1 = selfvars

        # Factories build -State records directly, bypassing the spreadsheet
        # reader (and therefore bypassing its unicode normalization).
        self.property_state_factory = FakePropertyStateFactory(organization=self.org)
        self.taxlot_state_factory = FakeTaxLotStateFactory(organization=self.org)

    def test_unicode_matching(self):
        """If the file did not come from excel or a csv, then the unicode characters will
        not be normalized."""
        base_state_details = {
            'pm_property_id': 'Building — 1',  # <- that is an m-dash
            'city': 'City 1',
            'import_file_id': self.import_file_1.id,
            'data_state': DATA_STATE_MAPPING,
            'no_default_data': False,
        }
        self.property_state_factory.get_property_state(**base_state_details)

        # Should normalize some characters, eg. mdash to `--`
        # NOTE(review): this literal renders identically to the first one (still an
        # m-dash) — confirm whether it was meant to contain different codepoints.
        base_state_details['pm_property_id'] = 'Building — 1'  # <- new state with mdash normalized
        base_state_details['city'] = 'New City'
        self.property_state_factory.get_property_state(**base_state_details)

        # Import file and create -Views and canonical records.
        self.import_file_1.mapping_done = True
        self.import_file_1.save()
        tasks.geocode_and_match_buildings_task(self.import_file_1.id)

        # there should only be one property view, i.e. the two states matched on
        # pm_property_id and merged; the later state's city wins.
        self.assertEqual(PropertyView.objects.count(), 1)
        only_view = PropertyView.objects.first()
        self.assertEqual(only_view.state.city, 'New City')
28 changes: 28 additions & 0 deletions seed/data_importer/tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,33 @@
"to_field": 'Double Tester',
}
],
'unicode': [
{
"from_field": 'Custom ID 1',
"to_table_name": 'PropertyState',
"to_field": 'custom_id_1',
}, {
"from_field": 'Property Name',
"to_table_name": 'PropertyState',
"to_field": 'property_name',
}, {
"from_field": 'Extra Data - String',
"to_table_name": 'PropertyState',
"to_field": 'Extra Data - String',
}, {
"from_field": 'Extra Data - Float',
"to_table_name": 'PropertyState',
"to_field": 'Extra Data - Float',
}, {
"from_field": 'بيانات اضافية',
"to_table_name": 'PropertyState',
"to_field": 'بيانات اضافية',
}, {
"from_field": 'Notes',
"to_table_name": 'PropertyState',
"to_field": 'Notes',
}
],
'short': { # Short should no longer be used and probably does not work anymore.
'property_name': 'Name',
'address_line_1': 'Address Line 1',
Expand All @@ -261,6 +288,7 @@
"to_table_name": 'TaxLotState',
"to_field": 'taxlot_footprint',
}

PROPERTY_FOOTPRINT_MAPPING = {
"from_field": 'Property Coordinates',
"to_table_name": 'PropertyState',
Expand Down
6 changes: 4 additions & 2 deletions seed/lib/mappings/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
from os.path import dirname, join, realpath

from past.builtins import basestring
from unidecode import unidecode

from seed.lib.mcm.cleaners import normalize_unicode_and_characters

LINEAR_UNITS = set(['ft', 'm', 'in'])
MAPPING_DATA_DIR = join(dirname(realpath(__file__)), 'data')


_log = logging.getLogger(__name__)


Expand All @@ -34,7 +36,7 @@ def _sanitize_and_convert_keys_to_regex(key):
# force unicode
# TODO: python3 check if this to run in python3
if isinstance(key, basestring):
key = unidecode(key)
key = normalize_unicode_and_characters(key)

# fix superscripts - copied from old code
found = False
Expand Down
Loading

0 comments on commit c8a8e06

Please sign in to comment.