diff --git a/.cspell.json b/.cspell.json
index f74492eb55..928068b9fb 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -150,6 +150,7 @@
     "graphviz",
     "greenbutton",
     "gte",
+    "guillemets",
     "Gunter",
     "Gzip",
     "hardcoded",
@@ -206,6 +207,7 @@
     "lookup",
     "lte",
     "lxml",
+    "malcriado",
     "MapItem",
     "mappable",
     "mapquest",
@@ -225,6 +227,7 @@
     "MyBrowser",
     "MyModel",
     "namespace",
+    "Ñaño",
     "natively",
     "netcat",
     "newdomain",
@@ -307,6 +310,7 @@
     "runtime",
     "salesforce",
     "scalable",
+    "schön",
     "seeddb",
     "seedorg",
     "seedpass",
@@ -371,6 +375,7 @@
     "tsts",
     "tuples",
     "typechecking",
+    "über",
     "ubid",
     "UBID",
     "ubidmodel",
@@ -416,6 +421,7 @@
     "webdriver",
     "webserver",
     "webservers",
+    "Welstone",
     "whitelist",
     "wildcards",
     "workflow",
@@ -432,7 +438,9 @@
     "xpaths",
     "XSLX",
     "yasg",
-    "yml"
+    "yml",
+    "اضافية",
+    "بيانات"
   ],
   "flagWords": [
     "hte"
diff --git a/requirements/base.txt b/requirements/base.txt
index 1efc35f0d5..3cc4f626bb 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -42,7 +42,6 @@ Markdown==3.1.1
 polling==0.3.2
 pyyaml==6.0.1
 street-address==0.4.0
-unidecode==1.1.1
 xlrd==1.2.0
 xlsxwriter==1.2.7
 xmltodict==0.12.0
diff --git a/seed/data_importer/tasks.py b/seed/data_importer/tasks.py
index ff78a25c02..47d536eed9 100644
--- a/seed/data_importer/tasks.py
+++ b/seed/data_importer/tasks.py
@@ -33,7 +33,6 @@
 from django.utils import timezone as tz
 from django.utils.timezone import make_naive
 from past.builtins import basestring
-from unidecode import unidecode
 
 from seed.building_sync import validation_client
 from seed.building_sync.building_sync import BuildingSync
@@ -50,6 +49,7 @@
 from seed.data_importer.sensor_readings_parser import SensorsReadingsParser
 from seed.data_importer.utils import usage_point_id
 from seed.lib.mcm import cleaners, mapper, reader
+from seed.lib.mcm.cleaners import normalize_unicode_and_characters
 from seed.lib.mcm.mapper import expand_rows
 from seed.lib.mcm.utils import batch
 from seed.lib.progress_data.progress_data import ProgressData
@@ -741,7 +741,7 @@ def _save_raw_data_chunk(chunk, file_pk, progress_key):
             elif key == "_source_filename":  # grab source filename (for BSync)
                 source_filename = v
             elif isinstance(v, basestring):
-                new_chunk[key] = unidecode(v)
+                new_chunk[key] = normalize_unicode_and_characters(v)
             elif isinstance(v, (datetime, date)):
                 raise TypeError(
                     "Datetime class not supported in Extra Data. Needs to be a string.")
@@ -1559,9 +1559,10 @@ def add_dictionary_repr_to_hash(hash_obj, dict_obj):
         if isinstance(value, dict):
            add_dictionary_repr_to_hash(hash_obj, value)
         else:
-            hash_obj.update(str(unidecode(key)).encode('utf-8'))
+            # TODO: Do we need to normalize_unicode_and_characters (formerly unidecode) here?
+            hash_obj.update(str(normalize_unicode_and_characters(key)).encode('utf-8'))
             if isinstance(value, basestring):
-                hash_obj.update(unidecode(value).encode('utf-8'))
+                hash_obj.update(normalize_unicode_and_characters(value).encode('utf-8'))
             else:
                 hash_obj.update(str(value).encode('utf-8'))
     return hash_obj
diff --git a/seed/data_importer/tests/data/example-data-properties-2-invalid-footprints.xlsx b/seed/data_importer/tests/data/example-data-properties-2-invalid-footprints.xlsx
index d71b885280..b56624661d 100644
Binary files a/seed/data_importer/tests/data/example-data-properties-2-invalid-footprints.xlsx and b/seed/data_importer/tests/data/example-data-properties-2-invalid-footprints.xlsx differ
diff --git a/seed/data_importer/tests/data/example-data-properties-small-changes.xlsx b/seed/data_importer/tests/data/example-data-properties-small-changes.xlsx
index e87eb934f5..eb9ed26e69 100644
Binary files a/seed/data_importer/tests/data/example-data-properties-small-changes.xlsx and b/seed/data_importer/tests/data/example-data-properties-small-changes.xlsx differ
diff --git a/seed/data_importer/tests/data/example-data-properties-unicode.xlsx b/seed/data_importer/tests/data/example-data-properties-unicode.xlsx
new file mode 100644
index 0000000000..64e00c2e24
Binary files /dev/null and b/seed/data_importer/tests/data/example-data-properties-unicode.xlsx differ
diff --git a/seed/data_importer/tests/data/example-data-properties.xlsx b/seed/data_importer/tests/data/example-data-properties.xlsx
index 7a3e590069..1f9ebe8770 100644
Binary files a/seed/data_importer/tests/data/example-data-properties.xlsx and b/seed/data_importer/tests/data/example-data-properties.xlsx differ
diff --git a/seed/data_importer/tests/integration/test_merge_duplicate_rows.py b/seed/data_importer/tests/integration/test_merge_duplicate_rows.py
index 66553b48a7..5d38fc5a70 100644
--- a/seed/data_importer/tests/integration/test_merge_duplicate_rows.py
+++ b/seed/data_importer/tests/integration/test_merge_duplicate_rows.py
@@ -120,7 +120,9 @@ def test_hash_quantity_unicode(self):
             data_state=DATA_STATE_IMPORT,
             import_file_id=0,
         )
-        self.assertEqual(ps1.hash_object, ps2.hash_object)
+        # Now that we support unicode in the fields, the hashes should no longer be the
+        # same. TODO: Should we strip all unicode characters in extra data fields?
+        self.assertNotEqual(ps1.hash_object, ps2.hash_object)
 
     def test_hash_release_date(self):
         """The hash_state_object method makes the timezones naive, so this should work because
diff --git a/seed/data_importer/tests/test_mapping.py b/seed/data_importer/tests/test_mapping.py
index 2f6d79b6f6..687a966bb6 100644
--- a/seed/data_importer/tests/test_mapping.py
+++ b/seed/data_importer/tests/test_mapping.py
@@ -88,7 +88,7 @@ def test_mapping(self):
         # for p in props:
         #     pp(p)
 
-    def test_remapping_with_and_without_unit_aware_columns_doesnt_lose_data(self):
+    def test_remapping_with_and_without_unit_aware_columns_does_not_lose_data(self):
         """
         During import, when the initial -State objects are created from the extra_data values,
         ColumnMapping objects are used to take the extra_data dictionary values and create the
diff --git a/seed/data_importer/tests/test_match_unicode.py b/seed/data_importer/tests/test_match_unicode.py
new file mode 100644
index 0000000000..e14646e05e
--- /dev/null
+++ b/seed/data_importer/tests/test_match_unicode.py
@@ -0,0 +1,156 @@
+
+# !/usr/bin/env python
+# encoding: utf-8
+"""
+SEED Platform (TM), Copyright (c) Alliance for Sustainable Energy, LLC, and other contributors.
+See also https://github.com/seed-platform/seed/main/LICENSE.md
+"""
+import logging
+import os.path as osp
+import pathlib
+
+from django.core.files.uploadedfile import SimpleUploadedFile
+
+from seed.data_importer import tasks
+from seed.data_importer.tests.util import FAKE_MAPPINGS
+from seed.lib.mcm.cleaners import normalize_unicode_and_characters
+from seed.models import (
+    ASSESSED_RAW,
+    DATA_STATE_MAPPING,
+    Column,
+    PropertyState,
+    PropertyView
+)
+from seed.test_helpers.fake import (
+    FakePropertyStateFactory,
+    FakeTaxLotStateFactory
+)
+from seed.tests.util import DataMappingBaseTestCase
+
+logger = logging.getLogger(__name__)
+
+
+class TestUnicodeNormalization(DataMappingBaseTestCase):
+    def test_unicode_normalization(self):
+        """Test a few cases. The unicodedata.normalize('NFC', text) method combines
+        the letter and diacritics, which seems to provide the best compatibility."""
+        # Guillemets
+        unicode_text = "Café «Déjà Vu»"
+        expected_out = "Café \"Déjà Vu\""
+        normalized_text = normalize_unicode_and_characters(unicode_text)
+        self.assertEqual(normalized_text, expected_out)
+
+        # This passes straight through (no diacritics)
+        unicode_text = "شكرا لك"
+        normalized_text = normalize_unicode_and_characters(unicode_text)
+        self.assertEqual(normalized_text, unicode_text)
+
+        # mdash to `--`
+        unicode_text = "– über schön! —"
+        expected_out = "- über schön! --"
+        normalized_text = normalize_unicode_and_characters(unicode_text)
+        self.assertEqual(normalized_text, expected_out)
+
+        # \u004E\u0303 is Ñ (N + tilde) and the normalization converts it to a
+        # single unicode character. ñ stays and combines the diacritic and letter
+        unicode_text = "\u004E\u0303a\u006E\u0303o malcriado"
+        expected_out = "Ñaño malcriado"
+        normalized_text = normalize_unicode_and_characters(unicode_text)
+        self.assertEqual(normalized_text, expected_out)
+
+
+class TestUnicodeImport(DataMappingBaseTestCase):
+    def setUp(self):
+        filename = getattr(self, 'filename', 'example-data-properties-unicode.xlsx')
+        import_file_source_type = ASSESSED_RAW
+        self.fake_mappings = FAKE_MAPPINGS['unicode']
+        selfvars = self.set_up(import_file_source_type)
+        self.user, self.org, self.import_file, self.import_record, self.cycle = selfvars
+        filepath = osp.join(osp.dirname(__file__), 'data', filename)
+        self.import_file.file = SimpleUploadedFile(
+            name=filename,
+            content=pathlib.Path(filepath).read_bytes()
+        )
+        self.import_file.save()
+
+    def test_unicode_import(self):
+        """Test that unicode characters are imported correctly"""
+        tasks.save_raw_data(self.import_file.pk)
+        Column.create_mappings(self.fake_mappings, self.org, self.user, self.import_file.pk)
+        tasks.map_data(self.import_file.pk)
+
+        # Check to make sure all the properties imported
+        ps = PropertyState.objects.filter(
+            data_state=DATA_STATE_MAPPING,
+            organization=self.org,
+            import_file=self.import_file,
+        )
+        self.assertEqual(len(ps), 3)
+
+        # check that the property has the unicode characters
+        ps = PropertyState.objects.filter(
+            data_state=DATA_STATE_MAPPING,
+            organization=self.org,
+            import_file=self.import_file,
+            custom_id_1='unicode-1',
+        )[0]
+        self.assertEqual(ps.property_name, 'Déjà vu Café')
+        # check if there is an extra data key with unicode
+        self.assertEqual('بيانات اضافية' in ps.extra_data, True)
+
+        # check that we can query on a unicode character
+        ps = PropertyState.objects.filter(
+            data_state=DATA_STATE_MAPPING,
+            organization=self.org,
+            import_file=self.import_file,
+            property_name='🏦 Bank',
+        )[0]
+        self.assertIsNotNone(ps)
+
+        tasks.geocode_and_match_buildings_task(self.import_file.id)
+
+        qry = PropertyView.objects.filter(state__custom_id_1='unicode-1')
+        self.assertEqual(qry.count(), 1)
+        state = qry.first().state
+
+        self.assertEqual(state.property_name, "Déjà vu Café")
+
+
+class TestUnicodeMatching(DataMappingBaseTestCase):
+    """Test the matching of two properties with unicode characters
+    and changing one of the matching criteria with a unicode character and
+    having it fail."""
+
+    def setUp(self):
+        selfvars = self.set_up(ASSESSED_RAW)
+        self.user, self.org, self.import_file_1, self.import_record_1, self.cycle_1 = selfvars
+
+        self.property_state_factory = FakePropertyStateFactory(organization=self.org)
+        self.taxlot_state_factory = FakeTaxLotStateFactory(organization=self.org)
+
+    def test_unicode_matching(self):
+        """If the file did not come from excel or a csv, then the unicode characters will
+        not be normalized."""
+        base_state_details = {
+            'pm_property_id': 'Building — 1',  # <- that is an m-dash
+            'city': 'City 1',
+            'import_file_id': self.import_file_1.id,
+            'data_state': DATA_STATE_MAPPING,
+            'no_default_data': False,
+        }
+        self.property_state_factory.get_property_state(**base_state_details)

+        # Should normalize some characters, eg. mdash to `--`
+        base_state_details['pm_property_id'] = 'Building — 1'  # <- new state with mdash normalized
+        base_state_details['city'] = 'New City'
+        self.property_state_factory.get_property_state(**base_state_details)
+
+        # Import file and create -Views and canonical records.
+        self.import_file_1.mapping_done = True
+        self.import_file_1.save()
+        tasks.geocode_and_match_buildings_task(self.import_file_1.id)
+
+        # there should only be one property view
+        self.assertEqual(PropertyView.objects.count(), 1)
+        only_view = PropertyView.objects.first()
+        self.assertEqual(only_view.state.city, 'New City')
diff --git a/seed/data_importer/tests/util.py b/seed/data_importer/tests/util.py
index 1a4de200af..ce9b6e8058 100644
--- a/seed/data_importer/tests/util.py
+++ b/seed/data_importer/tests/util.py
@@ -249,6 +249,33 @@
             "to_field": 'Double Tester',
         }
     ],
+    'unicode': [
+        {
+            "from_field": 'Custom ID 1',
+            "to_table_name": 'PropertyState',
+            "to_field": 'custom_id_1',
+        }, {
+            "from_field": 'Property Name',
+            "to_table_name": 'PropertyState',
+            "to_field": 'property_name',
+        }, {
+            "from_field": 'Extra Data - String',
+            "to_table_name": 'PropertyState',
+            "to_field": 'Extra Data - String',
+        }, {
+            "from_field": 'Extra Data - Float',
+            "to_table_name": 'PropertyState',
+            "to_field": 'Extra Data - Float',
+        }, {
+            "from_field": 'بيانات اضافية',
+            "to_table_name": 'PropertyState',
+            "to_field": 'بيانات اضافية',
+        }, {
+            "from_field": 'Notes',
+            "to_table_name": 'PropertyState',
+            "to_field": 'Notes',
+        }
+    ],
     'short': {  # Short should no longer be used and probably does not work anymore.
         'property_name': 'Name',
         'address_line_1': 'Address Line 1',
@@ -261,6 +288,7 @@
     "to_table_name": 'TaxLotState',
     "to_field": 'taxlot_footprint',
 }
+
 PROPERTY_FOOTPRINT_MAPPING = {
     "from_field": 'Property Coordinates',
     "to_table_name": 'PropertyState',
diff --git a/seed/lib/mappings/mapper.py b/seed/lib/mappings/mapper.py
index 5c76376684..1fb553ec61 100644
--- a/seed/lib/mappings/mapper.py
+++ b/seed/lib/mappings/mapper.py
@@ -14,11 +14,13 @@
 from os.path import dirname, join, realpath
 
 from past.builtins import basestring
-from unidecode import unidecode
+
+from seed.lib.mcm.cleaners import normalize_unicode_and_characters
 
 LINEAR_UNITS = set(['ft', 'm', 'in'])
 MAPPING_DATA_DIR = join(dirname(realpath(__file__)), 'data')
 
+
 _log = logging.getLogger(__name__)
 
 
@@ -34,7 +36,7 @@ def _sanitize_and_convert_keys_to_regex(key):
     # force unicode
     # TODO: python3 check if this to run in python3
     if isinstance(key, basestring):
-        key = unidecode(key)
+        key = normalize_unicode_and_characters(key)
 
     # fix superscripts - copied from old code
     found = False
diff --git a/seed/lib/mcm/cleaners.py b/seed/lib/mcm/cleaners.py
index 923dd2109b..47830906a8 100644
--- a/seed/lib/mcm/cleaners.py
+++ b/seed/lib/mcm/cleaners.py
@@ -6,6 +6,7 @@
 """
 import re
 import string
+import unicodedata
 from datetime import date, datetime
 
 import dateutil
@@ -33,6 +34,46 @@
 PUNCT_REGEX = re.compile('[{0}]'.format(
     re.escape(string.punctuation.replace('.', '').replace('-', '')))
 )
+# Mapping of specific characters to their normalized versions (need to expand this list)
+CHAR_MAPPING = {
+    ord('“'): '"',
+    ord('”'): '"',
+    ord('‘'): "'",
+    ord('’'): "'",
+    ord('′'): "'",
+    ord('″'): '"',
+    ord('‴'): "'''",
+    ord('…'): '...',
+    ord('•'): '*',
+    ord('⁄'): '/',
+    ord('×'): 'x',
+    ord('⁓'): '~',
+    # mdash, ndash, horizontal bar
+    ord('–'): '-',
+    ord('—'): '--',
+    ord('―'): '-',
+    ord('¬'): '-',
+    # guillemets to single and double quotes
+    ord('‹'): "'",
+    ord('›'): "'",
+    ord('«'): '"',
+    ord('»'): '"',
+}
+
+
+def normalize_unicode_and_characters(text):
+    """Method to normalize unicode characters and replace specific characters with their normalized versions."""
+    # Normalize Unicode characters to their canonical composed form (NFC) --
+    # combines characters and diacritics when possible.
+
+    # Unicode standardizes on a single code point for accented characters such as é, ü, and ñ.
+    # More info can be seen here: https://docs.python.org/2/library/unicodedata.html#unicodedata.normalize
+    normalized_text = unicodedata.normalize('NFC', text)
+
+    # Apply CHAR_MAPPING to replace specific characters with their normalized equivalents.
+    normalized_text = normalized_text.translate(CHAR_MAPPING)
+
+    return normalized_text
 
 
 def default_cleaner(value, *args):
diff --git a/seed/lib/mcm/reader.py b/seed/lib/mcm/reader.py
index 502a13dcde..e88395a3da 100644
--- a/seed/lib/mcm/reader.py
+++ b/seed/lib/mcm/reader.py
@@ -18,11 +18,11 @@
 import xmltodict
 from past.builtins import basestring
-from unidecode import unidecode
 from xlrd import XLRDError, empty_cell, open_workbook, xldate
 from xlrd.xldate import XLDateAmbiguous
 
 from seed.data_importer.utils import kbtu_thermal_conversion_factors
+from seed.lib.mcm.cleaners import normalize_unicode_and_characters
 
 # Create a list of Excel cell types. This is copied
 # directly from the xlrd source code.
@@ -56,7 +56,7 @@ def clean_fieldnames(fieldnames):
     num_generated_headers = 0
     new_fieldnames = []
     for fieldname in fieldnames:
-        new_fieldname = unidecode(fieldname)
+        new_fieldname = normalize_unicode_and_characters(fieldname)
         if fieldname == '':
             num_generated_headers += 1
             new_fieldname = f'{SEED_GENERATED_HEADER_PREFIX} {num_generated_headers}'
@@ -389,7 +389,7 @@ def get_value(self, item, **kwargs):
                 value = " ".join(value.split())
             else:
                 value = item.value
-            return unidecode(value)
+            return normalize_unicode_and_characters(value)
 
         # only remaining items should be booleans
         return item.value
@@ -605,7 +605,7 @@ def first_five_rows(self):
             for x in first_row:
                 row_field = r[x]
                 if isinstance(row_field, basestring):
-                    row_field = unidecode(r[x])
+                    row_field = normalize_unicode_and_characters(r[x])
                 else:
                     row_field = str(r[x])
                 row_arr.append(row_field.strip())
diff --git a/seed/views/v3/data_quality_checks.py b/seed/views/v3/data_quality_checks.py
index 07de4ee228..071bf06342 100644
--- a/seed/views/v3/data_quality_checks.py
+++ b/seed/views/v3/data_quality_checks.py
@@ -11,10 +11,10 @@
 from drf_yasg.utils import swagger_auto_schema
 from rest_framework import status, viewsets
 from rest_framework.decorators import action
-from unidecode import unidecode
 
 from seed.data_importer.tasks import do_checks
 from seed.decorators import ajax_request_class
+from seed.lib.mcm.cleaners import normalize_unicode_and_characters
 from seed.lib.superperms.orgs.decorators import has_perm_class
 from seed.models import PropertyView, TaxLotView
 from seed.models.data_quality import DataQualityCheck
@@ -149,8 +149,9 @@ def results_csv(self, request):
                 result['formatted_field'],
                 result.get('label', None),
                 result['condition'],
-                # the detailed_message field can have units which has superscripts/subscripts, so unidecode it!
-                unidecode(result['detailed_message']),
+                # the detailed_message field can have units which have superscripts/subscripts,
+                # so normalize_unicode_and_characters it!
+                normalize_unicode_and_characters(result['detailed_message']),
                 result['severity']
             ])
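
A minimal usage sketch of the new cleaner follows (not part of the patch). It assumes a SEED checkout where the seed.lib.mcm.cleaners module added in this diff is importable; the expected values simply restate the cases asserted in TestUnicodeNormalization above.

# Sketch only: mirrors the TestUnicodeNormalization cases and assumes
# seed.lib.mcm.cleaners from this diff is on the Python path.
from seed.lib.mcm.cleaners import normalize_unicode_and_characters

# NFC composition: N (U+004E) + combining tilde (U+0303) collapses to the single code point Ñ.
assert normalize_unicode_and_characters("\u004E\u0303a\u006E\u0303o malcriado") == "Ñaño malcriado"

# CHAR_MAPPING swaps punctuation look-alikes: guillemets become quotes, dashes become hyphens.
assert normalize_unicode_and_characters("Café «Déjà Vu»") == 'Café "Déjà Vu"'
assert normalize_unicode_and_characters("– über schön! —") == "- über schön! --"

# Text with no diacritics or mapped characters passes through unchanged, whereas the old
# unidecode call would have transliterated it to ASCII.
assert normalize_unicode_and_characters("شكرا لك") == "شكرا لك"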