Skip to content

Commit

Permalink
Support unicode characters by replacing unidecode with new normalize …
Browse files Browse the repository at this point in the history
…method (#4484)

* replace unidecode with new normalize method

* change to use NFC not NFD...

* fix tests by removing unicode in excel test files

* this test file also had a unicode character in address line 1

* first test with unicode data

* unicode test

* cast guillemets to single and double quotes

---------

Co-authored-by: Alex Swindler <[email protected]>
  • Loading branch information
nllong and axelstudios authored Jan 27, 2024
1 parent ff1ce82 commit c8a8e06
Show file tree
Hide file tree
Showing 15 changed files with 255 additions and 17 deletions.
10 changes: 9 additions & 1 deletion .cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@
"graphviz",
"greenbutton",
"gte",
"guillemets",
"Gunter",
"Gzip",
"hardcoded",
Expand Down Expand Up @@ -206,6 +207,7 @@
"lookup",
"lte",
"lxml",
"malcriado",
"MapItem",
"mappable",
"mapquest",
Expand All @@ -225,6 +227,7 @@
"MyBrowser",
"MyModel",
"namespace",
"Ñaño",
"natively",
"netcat",
"newdomain",
Expand Down Expand Up @@ -307,6 +310,7 @@
"runtime",
"salesforce",
"scalable",
"schön",
"seeddb",
"seedorg",
"seedpass",
Expand Down Expand Up @@ -371,6 +375,7 @@
"tsts",
"tuples",
"typechecking",
"über",
"ubid",
"UBID",
"ubidmodel",
Expand Down Expand Up @@ -416,6 +421,7 @@
"webdriver",
"webserver",
"webservers",
"Welstone",
"whitelist",
"wildcards",
"workflow",
Expand All @@ -432,7 +438,9 @@
"xpaths",
"XSLX",
"yasg",
"yml"
"yml",
"اضافية",
"بيانات"
],
"flagWords": [
"hte"
Expand Down
1 change: 0 additions & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ Markdown==3.1.1
polling==0.3.2
pyyaml==6.0.1
street-address==0.4.0
unidecode==1.1.1
xlrd==1.2.0
xlsxwriter==1.2.7
xmltodict==0.12.0
Expand Down
9 changes: 5 additions & 4 deletions seed/data_importer/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
from django.utils import timezone as tz
from django.utils.timezone import make_naive
from past.builtins import basestring
from unidecode import unidecode

from seed.building_sync import validation_client
from seed.building_sync.building_sync import BuildingSync
Expand All @@ -50,6 +49,7 @@
from seed.data_importer.sensor_readings_parser import SensorsReadingsParser
from seed.data_importer.utils import usage_point_id
from seed.lib.mcm import cleaners, mapper, reader
from seed.lib.mcm.cleaners import normalize_unicode_and_characters
from seed.lib.mcm.mapper import expand_rows
from seed.lib.mcm.utils import batch
from seed.lib.progress_data.progress_data import ProgressData
Expand Down Expand Up @@ -741,7 +741,7 @@ def _save_raw_data_chunk(chunk, file_pk, progress_key):
elif key == "_source_filename": # grab source filename (for BSync)
source_filename = v
elif isinstance(v, basestring):
new_chunk[key] = unidecode(v)
new_chunk[key] = normalize_unicode_and_characters(v)
elif isinstance(v, (datetime, date)):
raise TypeError(
"Datetime class not supported in Extra Data. Needs to be a string.")
Expand Down Expand Up @@ -1559,9 +1559,10 @@ def add_dictionary_repr_to_hash(hash_obj, dict_obj):
if isinstance(value, dict):
add_dictionary_repr_to_hash(hash_obj, value)
else:
hash_obj.update(str(unidecode(key)).encode('utf-8'))
# TODO: Do we need to normalize_unicode_and_characters (formerly unidecode) here?
hash_obj.update(str(normalize_unicode_and_characters(key)).encode('utf-8'))
if isinstance(value, basestring):
hash_obj.update(unidecode(value).encode('utf-8'))
hash_obj.update(normalize_unicode_and_characters(value).encode('utf-8'))
else:
hash_obj.update(str(value).encode('utf-8'))
return hash_obj
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified seed/data_importer/tests/data/example-data-properties.xlsx
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ def test_hash_quantity_unicode(self):
data_state=DATA_STATE_IMPORT,
import_file_id=0,
)
self.assertEqual(ps1.hash_object, ps2.hash_object)
# Now that we support unicode in the fields, the hashes should no longer be
# the same. # TODO: Should we strip all unicode characters in extra data fields?
self.assertNotEqual(ps1.hash_object, ps2.hash_object)

def test_hash_release_date(self):
"""The hash_state_object method makes the timezones naive, so this should work because
Expand Down
2 changes: 1 addition & 1 deletion seed/data_importer/tests/test_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_mapping(self):
# for p in props:
# pp(p)

def test_remapping_with_and_without_unit_aware_columns_doesnt_lose_data(self):
def test_remapping_with_and_without_unit_aware_columns_does_not_lose_data(self):
"""
During import, when the initial -State objects are created from the extra_data values,
ColumnMapping objects are used to take the extra_data dictionary values and create the
Expand Down
156 changes: 156 additions & 0 deletions seed/data_importer/tests/test_match_unicode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@

# !/usr/bin/env python
# encoding: utf-8
"""
SEED Platform (TM), Copyright (c) Alliance for Sustainable Energy, LLC, and other contributors.
See also https://github.com/seed-platform/seed/main/LICENSE.md
"""
import logging
import os.path as osp
import pathlib

from django.core.files.uploadedfile import SimpleUploadedFile

from seed.data_importer import tasks
from seed.data_importer.tests.util import FAKE_MAPPINGS
from seed.lib.mcm.cleaners import normalize_unicode_and_characters
from seed.models import (
ASSESSED_RAW,
DATA_STATE_MAPPING,
Column,
PropertyState,
PropertyView
)
from seed.test_helpers.fake import (
FakePropertyStateFactory,
FakeTaxLotStateFactory
)
from seed.tests.util import DataMappingBaseTestCase

logger = logging.getLogger(__name__)


class TestUnicodeNormalization(DataMappingBaseTestCase):
    """Unit tests for the `normalize_unicode_and_characters` helper that replaced
    unidecode, exercising guillemets, pass-through text, dashes, and combining marks."""

    def test_unicode_normalization(self):
        """Test a few cases. The unicodedata.normalize('NFC', text) method combines
        the letter and diacritics, which seems to provide the best compatibility."""
        # Guillemets: « and » are cast to plain double quotes; accents are preserved.
        unicode_text = "Café «Déjà Vu»"
        expected_out = "Café \"Déjà Vu\""
        normalized_text = normalize_unicode_and_characters(unicode_text)
        self.assertEqual(normalized_text, expected_out)

        # Arabic text passes straight through unchanged (no diacritics to combine).
        unicode_text = "شكرا لك"
        normalized_text = normalize_unicode_and_characters(unicode_text)
        self.assertEqual(normalized_text, unicode_text)

        # En dash becomes `-` and em dash becomes `--`; umlauts are preserved.
        unicode_text = "– über schön! —"
        expected_out = "- über schön! --"
        normalized_text = normalize_unicode_and_characters(unicode_text)
        self.assertEqual(normalized_text, expected_out)

        # \u004E\u0303 is Ñ (N + combining tilde) and the normalization converts it
        # to a single precomposed code point. The already-composed ñ stays as-is.
        unicode_text = "\u004E\u0303a\u006E\u0303o malcriado"
        expected_out = "Ñaño malcriado"
        normalized_text = normalize_unicode_and_characters(unicode_text)
        self.assertEqual(normalized_text, expected_out)


class TestUnicodeImport(DataMappingBaseTestCase):
    """End-to-end import test using a spreadsheet fixture that contains unicode
    data in both mapped fields and extra-data column headers."""

    def setUp(self):
        # Subclasses can set `self.filename` before setUp runs to reuse this fixture
        # with a different spreadsheet.
        filename = getattr(self, 'filename', 'example-data-properties-unicode.xlsx')
        import_file_source_type = ASSESSED_RAW
        self.fake_mappings = FAKE_MAPPINGS['unicode']
        selfvars = self.set_up(import_file_source_type)
        self.user, self.org, self.import_file, self.import_record, self.cycle = selfvars
        # Load the fixture bytes into the ImportFile record so the import tasks
        # read the same content a user upload would provide.
        filepath = osp.join(osp.dirname(__file__), 'data', filename)
        self.import_file.file = SimpleUploadedFile(
            name=filename,
            content=pathlib.Path(filepath).read_bytes()
        )
        self.import_file.save()

    def test_unicode_import(self):
        """Test that unicode characters are imported correctly"""
        # Run the raw-save -> column-mapping -> map pipeline on the fixture.
        tasks.save_raw_data(self.import_file.pk)
        Column.create_mappings(self.fake_mappings, self.org, self.user, self.import_file.pk)
        tasks.map_data(self.import_file.pk)

        # Check to make sure all the properties imported
        ps = PropertyState.objects.filter(
            data_state=DATA_STATE_MAPPING,
            organization=self.org,
            import_file=self.import_file,
        )
        self.assertEqual(len(ps), 3)

        # check that the property has the unicode characters
        ps = PropertyState.objects.filter(
            data_state=DATA_STATE_MAPPING,
            organization=self.org,
            import_file=self.import_file,
            custom_id_1='unicode-1',
        )[0]
        self.assertEqual(ps.property_name, 'Déjà vu Café')
        # check if there is an extra data key with unicode (Arabic column header
        # from the fixture should survive the import unmangled)
        self.assertEqual('بيانات اضافية' in ps.extra_data, True)

        # check that we can query on unicode character (including an emoji)
        ps = PropertyState.objects.filter(
            data_state=DATA_STATE_MAPPING,
            organization=self.org,
            import_file=self.import_file,
            property_name='🏦 Bank',
        )[0]
        self.assertIsNotNone(ps)

        # Run geocoding/matching and verify the unicode record produced exactly
        # one canonical view whose name still carries its accents.
        tasks.geocode_and_match_buildings_task(self.import_file.id)

        qry = PropertyView.objects.filter(state__custom_id_1='unicode-1')
        self.assertEqual(qry.count(), 1)
        state = qry.first().state

        self.assertEqual(state.property_name, "Déjà vu Café")


class TestUnicodeMatching(DataMappingBaseTestCase):
    """Test the matching of two properties with unicode characters
    and changing one of the matching criteria with a unicode character and
    having it fail."""

    def setUp(self):
        selfvars = self.set_up(ASSESSED_RAW)
        self.user, self.org, self.import_file_1, self.import_record_1, self.cycle_1 = selfvars

        # Factories build -State records directly, bypassing the spreadsheet
        # reader (and therefore bypassing its unicode normalization).
        self.property_state_factory = FakePropertyStateFactory(organization=self.org)
        self.taxlot_state_factory = FakeTaxLotStateFactory(organization=self.org)

    def test_unicode_matching(self):
        """If the file did not come from excel or a csv, then the unicode characters will
        not be normalized."""
        base_state_details = {
            'pm_property_id': 'Building — 1',  # <- that is an m-dash
            'city': 'City 1',
            'import_file_id': self.import_file_1.id,
            'data_state': DATA_STATE_MAPPING,
            'no_default_data': False,
        }
        self.property_state_factory.get_property_state(**base_state_details)

        # Should normalize some characters, eg. mdash to `--`
        # NOTE(review): this literal renders identically to the first one (still an
        # m-dash) — confirm whether it was meant to contain different codepoints.
        base_state_details['pm_property_id'] = 'Building — 1'  # <- new state with mdash normalized
        base_state_details['city'] = 'New City'
        self.property_state_factory.get_property_state(**base_state_details)

        # Import file and create -Views and canonical records.
        self.import_file_1.mapping_done = True
        self.import_file_1.save()
        tasks.geocode_and_match_buildings_task(self.import_file_1.id)

        # there should only be one property view, i.e. the two states matched on
        # pm_property_id and merged; the later state's city wins.
        self.assertEqual(PropertyView.objects.count(), 1)
        only_view = PropertyView.objects.first()
        self.assertEqual(only_view.state.city, 'New City')
28 changes: 28 additions & 0 deletions seed/data_importer/tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,33 @@
"to_field": 'Double Tester',
}
],
'unicode': [
{
"from_field": 'Custom ID 1',
"to_table_name": 'PropertyState',
"to_field": 'custom_id_1',
}, {
"from_field": 'Property Name',
"to_table_name": 'PropertyState',
"to_field": 'property_name',
}, {
"from_field": 'Extra Data - String',
"to_table_name": 'PropertyState',
"to_field": 'Extra Data - String',
}, {
"from_field": 'Extra Data - Float',
"to_table_name": 'PropertyState',
"to_field": 'Extra Data - Float',
}, {
"from_field": 'بيانات اضافية',
"to_table_name": 'PropertyState',
"to_field": 'بيانات اضافية',
}, {
"from_field": 'Notes',
"to_table_name": 'PropertyState',
"to_field": 'Notes',
}
],
'short': { # Short should no longer be used and probably does not work anymore.
'property_name': 'Name',
'address_line_1': 'Address Line 1',
Expand All @@ -261,6 +288,7 @@
"to_table_name": 'TaxLotState',
"to_field": 'taxlot_footprint',
}

PROPERTY_FOOTPRINT_MAPPING = {
"from_field": 'Property Coordinates',
"to_table_name": 'PropertyState',
Expand Down
6 changes: 4 additions & 2 deletions seed/lib/mappings/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
from os.path import dirname, join, realpath

from past.builtins import basestring
from unidecode import unidecode

from seed.lib.mcm.cleaners import normalize_unicode_and_characters

LINEAR_UNITS = set(['ft', 'm', 'in'])
MAPPING_DATA_DIR = join(dirname(realpath(__file__)), 'data')


_log = logging.getLogger(__name__)


Expand All @@ -34,7 +36,7 @@ def _sanitize_and_convert_keys_to_regex(key):
# force unicode
# TODO: python3 check if this to run in python3
if isinstance(key, basestring):
key = unidecode(key)
key = normalize_unicode_and_characters(key)

# fix superscripts - copied from old code
found = False
Expand Down
Loading

0 comments on commit c8a8e06

Please sign in to comment.