diff --git a/DataScience/Analytics/data/data.py b/DataScience/Analytics/data/data.py
index 1719613f8..da52f210d 100644
--- a/DataScience/Analytics/data/data.py
+++ b/DataScience/Analytics/data/data.py
@@ -5,12 +5,17 @@
 A simple script containing methods to query or modify the ONS AddressBase data.
 
+Uses Dask for larger-than-memory computations. Can also use distributed to spread
+the computations over a cluster if needed.
+
 Requirements
 ------------
 
-:requires: numpy
-:requires: pandas
+:requires: dask (tested with 0.14.0)
+:requires: distributed (tested with 1.16.0)
+:requires: numpy (tested with 1.12.0)
+:requires: pandas (tested with 0.19.2)
 :requires: sqlalchemy
 :requires: tqdm (https://github.com/tqdm/tqdm)
 
@@ -24,15 +29,19 @@
 Version
 -------
 
-:version: 0.9
-:date: 23-Jan-2016
+:version: 1.0
+:date: 2-Mar-2017
 """
 import glob
 import os
 import re
 import sqlite3
+
+import dask.dataframe as dd
 import numpy as np
 import pandas as pd
+from dask.diagnostics import ProgressBar
+from distributed import Client, LocalCluster
 from sqlalchemy import create_engine
 from tqdm import tqdm
 
@@ -74,8 +83,9 @@ def getPostcode(string):
     """
     try:
         tmp = \
-            re.findall(r'[A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA',
-                       string)[0]
+            re.findall(
+                r'[A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA',
+                string)[0]
     except ValueError:
         tmp = None
 
@@ -190,65 +200,81 @@ def combineMiniABtestingData():
     data.to_csv(path + 'ABmini.csv', index=0)
 
 
-def combineAddressBaseData(path='/Users/saminiemi/Projects/ONS/AddressIndex/data/ADDRESSBASE/',
-                           filename='AB.csv'):
+def combine_address_base_data(path='/Users/saminiemi/Projects/ONS/AddressIndex/data/ADDRESSBASE/',
+                              filename='AB.csv', distributed=False):
     """
     Read in all the Address Base Epoch 39 CSV files and combine to a single CSV file.
     Only relevant information is retained to compress the AB for easier handling.
 
+    .. Note:: Uses Dask so that the datasets do not need to fit in memory. This is not very
+              efficient, because the joins use a column rather than the index. However, as UPRN
+              is not unique, using it as the index carries a penalty too.
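+
+    A minimal usage sketch (the default path is machine-specific, so pass one appropriate
+    for your own system)::
+
+        combine_address_base_data(path='/path/to/ADDRESSBASE/', filename='AB.csv', distributed=False)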
+ :param path: location of the AddressBase CSV files :type path: str :param filename: name of the output file :type filename: str + :param distributed: + :type distributed: bool :return: None """ - files = glob.glob(path + 'ABP_E39_*.csv') + if distributed: + cluster = LocalCluster(n_workers=4, threads_per_worker=1) + client = Client(cluster) + print(client) - for file in tqdm(files): - - # skip a few addresses not used - if 'CLASSIFICATION' in file or 'STREET.csv' in file: - pass - - print('Reading in', file) - tmp = pd.read_csv(file, dtype=str) + all_files = glob.glob(path + 'ABP_E39_*.csv') + files = [file for file in all_files if ('STREET.csv' not in file)] + data_container = dict() + for file in files: if 'BLPU' in file: - BLPU = tmp[['UPRN', 'POSTCODE_LOCATOR']] + columns = ['UPRN', 'POSTCODE_LOCATOR'] + id = 'BLPU' if 'DELIVERY_POINT' in file: - DP = tmp[['UPRN', 'ORGANISATION_NAME', 'DEPARTMENT_NAME', 'SUB_BUILDING_NAME', - 'BUILDING_NAME', 'BUILDING_NUMBER', 'THROUGHFARE', 'DEPENDENT_LOCALITY', - 'POST_TOWN', 'POSTCODE']] + columns = ['UPRN', 'ORGANISATION_NAME', 'DEPARTMENT_NAME', 'SUB_BUILDING_NAME', + 'BUILDING_NAME', 'BUILDING_NUMBER', 'THROUGHFARE', 'DEPENDENT_LOCALITY', + 'POST_TOWN', 'POSTCODE'] + id = 'DP' if 'LPI' in file: - LPI = tmp[['UPRN', 'USRN', 'LANGUAGE', - 'PAO_TEXT', 'PAO_START_NUMBER', 'PAO_START_SUFFIX', 'PAO_END_NUMBER', 'PAO_END_SUFFIX', - 'SAO_TEXT', 'SAO_START_NUMBER', 'SAO_START_SUFFIX']] + columns = ['UPRN', 'USRN', 'LANGUAGE', 'PAO_TEXT', 'PAO_START_NUMBER', 'PAO_START_SUFFIX', + 'PAO_END_NUMBER', 'PAO_END_SUFFIX', 'SAO_TEXT', 'SAO_START_NUMBER', 'SAO_START_SUFFIX', + 'SAO_END_NUMBER', 'SAO_END_SUFFIX'] + id = 'LPI' if 'STREET_DESC' in file: - ST = tmp[['USRN', 'STREET_DESCRIPTOR', 'TOWN_NAME', 'LANGUAGE', 'LOCALITY']] + columns = ['USRN', 'STREET_DESCRIPTOR', 'TOWN_NAME', 'LANGUAGE', 'LOCALITY'] + id = 'ST' if 'ORGANISATION' in file: - ORG = tmp[['UPRN', 'ORGANISATION']] + columns = ['UPRN', 'ORGANISATION'] + id = 'ORG' - print('joining the individual dataframes...') - data = pd.merge(BLPU, DP, how='left', on='UPRN') - data = pd.merge(data, LPI, how='left', on='UPRN') - data = pd.merge(data, ORG, how='left', on=['UPRN']) - data = pd.merge(data, ST, how='left', on=['USRN', 'LANGUAGE']) + print('Reading in', file) + data_container[id] = dd.read_csv(file, dtype=str, usecols=columns) - # drop if all null - data.dropna(inplace=True, how='all') + print('joining the individual data frames to form a single hybrid index...') + data = dd.merge(data_container['BLPU'], data_container['DP'], how='left', on='UPRN') + data = dd.merge(data, data_container['LPI'], how='left', on='UPRN') + data = dd.merge(data, data_container['ORG'], how='left', on=['UPRN']) + data = dd.merge(data, data_container['ST'], how='left', on=['USRN', 'LANGUAGE']) - # change uprn to int + if distributed: + data = dd.compute(data)[0] + else: + with ProgressBar(): + data = dd.compute(data)[0] + + print('change the uprn type to int...') data['UPRN'] = data['UPRN'].astype(int) - # drop if no UPRN + print('drop all entries with no UPRN...') data = data[np.isfinite(data['UPRN'].values)] - # drop some that are not needed + print('drop unnecessary columns...') data.drop(['LANGUAGE', 'USRN'], axis=1, inplace=True) print(data.info()) @@ -339,7 +365,7 @@ def create_NLP_index(path='/Users/saminiemi/Projects/ONS/AddressIndex/data/ADDRE columns = {'BLPU': ['UPRN', 'POSTCODE_LOCATOR'], 'LPI': ['UPRN', 'USRN', 'LANGUAGE', 'PAO_TEXT', 'PAO_START_NUMBER', 'PAO_START_SUFFIX', 'PAO_END_NUMBER', - 
'PAO_END_SUFFIX','SAO_TEXT', 'SAO_START_NUMBER', 'SAO_START_SUFFIX', 'OFFICIAL_FLAG'], + 'PAO_END_SUFFIX', 'SAO_TEXT', 'SAO_START_NUMBER', 'SAO_START_SUFFIX', 'OFFICIAL_FLAG'], 'STREET_DESC': ['USRN', 'STREET_DESCRIPTOR', 'TOWN_NAME', 'LANGUAGE', 'LOCALITY'], 'ORGANISATION': ['UPRN', 'ORGANISATION']} @@ -382,8 +408,9 @@ def create_NLP_index(path='/Users/saminiemi/Projects/ONS/AddressIndex/data/ADDRE data.to_csv(path + filename, index=False) -def create_random_sample_of_delivery_point_addresses(path='/Users/saminiemi/Projects/ONS/AddressIndex/data/ADDRESSBASE/', - size=100000): +def create_random_sample_of_delivery_point_addresses( + path='/Users/saminiemi/Projects/ONS/AddressIndex/data/ADDRESSBASE/', + size=100000): """ :param path: @@ -391,7 +418,7 @@ def create_random_sample_of_delivery_point_addresses(path='/Users/saminiemi/Proj :return: """ # read in delivery point table - delivery_point = pd.read_csv(path + 'ABP_E39_DELIVERY_POINT.csv', dtype=str) + delivery_point = pd.read_csv(path + 'ABP_E39_DELIVERY_POINT.csv', dtype=str) delivery_point['UPRN'] = delivery_point['UPRN'].astype(np.int64) print(len(delivery_point.index), 'delivery point addresses...') @@ -411,7 +438,7 @@ def create_random_sample_of_delivery_point_addresses(path='/Users/saminiemi/Proj # write to a file - UPRN and a single string address from the delivery point data = data.fillna('') data['ADDRESS'] = data["ORGANISATION_NAME"] + ' ' + data["DEPARTMENT_NAME"] + ' ' + data["SUB_BUILDING_NAME"] + ' '\ - + data["BUILDING_NAME"] + ' ' + data["BUILDING_NUMBER"] + ' ' + data["THROUGHFARE"] + ' ' +\ + + data["BUILDING_NAME"] + ' ' + data["BUILDING_NUMBER"] + ' ' + data["THROUGHFARE"] + ' ' + \ data["POST_TOWN"] + ' ' + data["POSTCODE"] data = data[['UPRN', 'ADDRESS']] @@ -419,6 +446,129 @@ def create_random_sample_of_delivery_point_addresses(path='/Users/saminiemi/Proj data.to_csv(path + 'delivery_point_addresses.csv', index=False) +def create_final_hybrid_index(path='/Users/saminiemi/Projects/ONS/AddressIndex/data/ADDRESSBASE/', filename='AB.csv', + output_filename='AB_processed.csv'): + """ + A function to load an initial version of hybrid index as produced by combine_address_base_data + and to process it to the final hybrid index used in matching. + + .. Warning: this method modifies the original AB information by e.g. combining different tables. Such + activities are undertaken because of the aggressive blocking the prototype linking code uses. + The actual production system should take AB as it is and the linking should not perform blocking + but rather be flexible and take into account that in NAG the information can be stored in various + fields. 
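+
+    :param path: location of the combined AddressBase file
+    :type path: str
+    :param filename: name of the input file created by combine_address_base_data
+    :type filename: str
+    :param output_filename: name of the file to which the processed index will be written
+    :type output_filename: str
+
+    :return: None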
+    """
+    address_base = pd.read_csv(path + filename,
+                               dtype={'UPRN': np.int64, 'POSTCODE_LOCATOR': str, 'ORGANISATION_NAME': str,
+                                      'DEPARTMENT_NAME': str, 'SUB_BUILDING_NAME': str, 'BUILDING_NAME': str,
+                                      'BUILDING_NUMBER': str, 'THROUGHFARE': str, 'DEPENDENT_LOCALITY': str,
+                                      'POST_TOWN': str, 'POSTCODE': str, 'PAO_TEXT': str,
+                                      'PAO_START_NUMBER': str, 'PAO_START_SUFFIX': str, 'PAO_END_NUMBER': str,
+                                      'PAO_END_SUFFIX': str, 'SAO_TEXT': str, 'SAO_START_NUMBER': np.float64,
+                                      'SAO_START_SUFFIX': str, 'ORGANISATION': str, 'STREET_DESCRIPTOR': str,
+                                      'TOWN_NAME': str, 'LOCALITY': str, 'SAO_END_NUMBER': np.float64,
+                                      'SAO_END_SUFFIX': str})
+    print('Found {} addresses from the combined AddressBase file...'.format(len(address_base.index)))
+
+    # remove street records from the list of potential matches - this makes the search space slightly smaller
+    exclude = 'STREET RECORD|ELECTRICITY SUB STATION|PUMPING STATION|POND \d+M FROM|PUBLIC TELEPHONE|'
+    exclude += 'PART OF OS PARCEL|DEMOLISHED BUILDING|CCTV CAMERA|TANK \d+M FROM|SHELTER \d+M FROM|TENNIS COURTS|'
+    exclude += 'PONDS \d+M FROM|SUB STATION'
+    msk = address_base['PAO_TEXT'].str.contains(exclude, na=False, case=False)
+    address_base = address_base.loc[~msk]
+
+    # combine information - could be done differently, but for now using some of these for blocking
+    msk = address_base['THROUGHFARE'].isnull()
+    address_base.loc[msk, 'THROUGHFARE'] = address_base.loc[msk, 'STREET_DESCRIPTOR']
+
+    msk = address_base['BUILDING_NUMBER'].isnull()
+    address_base.loc[msk, 'BUILDING_NUMBER'] = address_base.loc[msk, 'PAO_START_NUMBER']
+
+    msk = address_base['BUILDING_NAME'].isnull()
+    address_base.loc[msk, 'BUILDING_NAME'] = address_base.loc[msk, 'PAO_TEXT']
+
+    msk = address_base['ORGANISATION_NAME'].isnull()
+    address_base.loc[msk, 'ORGANISATION_NAME'] = address_base.loc[msk, 'ORGANISATION']
+
+    msk = address_base['POSTCODE'].isnull()
+    address_base.loc[msk, 'POSTCODE'] = address_base.loc[msk, 'POSTCODE_LOCATOR']
+
+    msk = address_base['SUB_BUILDING_NAME'].isnull()
+    address_base.loc[msk, 'SUB_BUILDING_NAME'] = address_base.loc[msk, 'SAO_TEXT']
+
+    msk = address_base['POST_TOWN'].isnull()
+    address_base.loc[msk, 'POST_TOWN'] = address_base.loc[msk, 'TOWN_NAME']
+
+    msk = address_base['LOCALITY'].isnull()
+    address_base.loc[msk, 'LOCALITY'] = address_base.loc[msk, 'DEPENDENT_LOCALITY']
+
+    # sometimes addressbase does not have SAO_START_NUMBER even if SAO_TEXT clearly has a number
+    # take the digits from SAO_TEXT and place them to SAO_START_NUMBER if this is empty
+    msk = address_base['SAO_START_NUMBER'].isnull() & (address_base['SAO_TEXT'].notnull())
+    address_base.loc[msk, 'SAO_START_NUMBER'] = pd.to_numeric(
+        address_base.loc[msk, 'SAO_TEXT'].str.extract('(\d+)'))
+
+    # normalise street names so that st. is always st and 's is always s
+    msk = address_base['THROUGHFARE'].str.contains('ST\.\s', na=False, case=False)
+    address_base.loc[msk, 'THROUGHFARE'] = address_base.loc[msk, 'THROUGHFARE'].str.replace('ST\. ', 'ST ')
+    msk = address_base['THROUGHFARE'].str.contains("'S\s", na=False, case=False)
+    address_base.loc[msk, 'THROUGHFARE'] = address_base.loc[msk, 'THROUGHFARE'].str.replace("'S\s", 'S ')
+
+    # drop some that are not needed - in the future versions these might be useful
+    address_base.drop(['DEPENDENT_LOCALITY', 'POSTCODE_LOCATOR', 'ORGANISATION'], axis=1, inplace=True)
+
+    # split postcode to in and outcode - useful for doing blocking in different ways
+    postcodes = address_base['POSTCODE'].str.split(' ', expand=True)
+    postcodes.rename(columns={0: 'postcode_in', 1: 'postcode_out'}, inplace=True)
+    address_base = pd.concat([address_base, postcodes], axis=1)
+
+    print('Using {} addresses from the final hybrid index...'.format(len(address_base.index)))
+
+    print(address_base.info(verbose=True, memory_usage=True, null_counts=True))
+    address_base.to_csv(path + output_filename, index=False)
+
+
+def create_test_hybrid_index(path='/Users/saminiemi/Projects/ONS/AddressIndex/data/ADDRESSBASE/',
+                             filename='AB_processed.csv', output_filename='ABtest.csv'):
+    """
+    Updates an existing test index to reflect changes made to the processed final hybrid index.
+
+    :param path: location of the hybrid index and the test index
+    :param filename: name of the processed hybrid index file
+    :param output_filename: name of the test index file to update
+
+    :return: None
+    """
+    address_base = pd.read_csv(path + filename,
+                               dtype={'UPRN': np.int64, 'POSTCODE_LOCATOR': str, 'ORGANISATION_NAME': str,
+                                      'DEPARTMENT_NAME': str, 'SUB_BUILDING_NAME': str, 'BUILDING_NAME': str,
+                                      'BUILDING_NUMBER': str, 'THROUGHFARE': str, 'DEPENDENT_LOCALITY': str,
+                                      'POST_TOWN': str, 'POSTCODE': str, 'PAO_TEXT': str,
+                                      'PAO_START_NUMBER': str, 'PAO_START_SUFFIX': str, 'PAO_END_NUMBER': str,
+                                      'PAO_END_SUFFIX': str, 'SAO_TEXT': str, 'SAO_START_NUMBER': np.float64,
+                                      'SAO_START_SUFFIX': str, 'ORGANISATION': str, 'STREET_DESCRIPTOR': str,
+                                      'TOWN_NAME': str, 'LOCALITY': str, 'SAO_END_NUMBER': np.float64,
+                                      'SAO_END_SUFFIX': str})
+    print('Found {} addresses from the hybrid index...'.format(len(address_base.index)))
+
+    test_index_uprns = pd.read_csv(path + output_filename, usecols=['UPRN'], dtype={'UPRN': np.int64})['UPRN'].values
+    print('Found {} addresses from the test index...'.format(len(test_index_uprns)))
+
+    # find the overlap
+    mask = np.in1d(address_base['UPRN'].values, test_index_uprns)
+
+    # output to a file - overwrites the old test index
+    address_base_test_index = address_base.loc[mask]
+    address_base_test_index.to_csv(path + output_filename, index=False)
+    print(address_base_test_index.info())
+
+
 if __name__ == "__main__":
-    create_NLP_index()
-    create_random_sample_of_delivery_point_addresses()
\ No newline at end of file
+    combine_address_base_data()
+    create_final_hybrid_index()
+    convertCSVtoSQLite()
+
+    create_test_hybrid_index()
diff --git a/DataScience/Analytics/data/extractSAOinformation.py b/DataScience/Analytics/data/extractSAOinformation.py
new file mode 100644
index 000000000..78db3e812
--- /dev/null
+++ b/DataScience/Analytics/data/extractSAOinformation.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+"""
+ONS Address Index - Extracting SAO Information
+==============================================
+
+A simple script to check the logic for extracting SAO information from an input string.
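+
+The input is expected to contain addresses in which both PAO_START_NUMBER and SAO_END_SUFFIX are
+populated; the parsed output is written to a CSV file for manual inspection.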
+ + +Running +------- + +When all requirements are satisfied the script can be run from command line using CPython:: + + python extractSAOinformation.py + + +Requirements +------------ + +:requires: pandas +:requires: ProbabilisticParser (a CRF model specifically build for ONS) + + +Author +------ + +:author: Sami Niemi (sami.niemi@valtech.co.uk) + + +Version +------- + +:version: 0.1 +:date: 28-Feb-2017 +""" +import pandas as pd +from Analytics.linking import addressParser + + +def read_data(path='/Users/saminiemi/Projects/ONS/AddressIndex/data/', filename='SAO_END_SUFFIX.xlsx'): + """ + Read in the data containing addresses with both PAO_START_NUMBER and SAO_END_SUFFIX present. + + :param path: location of the input file + :type path: str + :param filename: name of the input file + :type filename: str + + :return: input data in a single data frame + :rtype: pandas.DataFrame + """ + df = pd.read_excel(path + filename) + return df + + +def parse_SAO_information(path, output): + """ + Parse SAO information from a special file containing addresses with both PAO_START_NUMBER and SAO_END_SUFFIX + populated. + + :param path: location to which the output will be stored + :type path: str + :param output: name of the output file + :type output: str + + :return: None + """ + input_data = read_data() + + address_parser = addressParser.AddressParser() + output_data = address_parser.parse(input_data) + + output_data.to_csv(path + output) + + +if __name__ == "__main__": + parse_SAO_information('/Users/saminiemi/Projects/ONS/AddressIndex/data/', 'SAO_END_SUFFIX_parsed.csv') diff --git a/DataScience/Analytics/linking/addressLinking.py b/DataScience/Analytics/linking/addressLinking.py index 2dcc97950..094a7ac47 100644 --- a/DataScience/Analytics/linking/addressLinking.py +++ b/DataScience/Analytics/linking/addressLinking.py @@ -24,12 +24,12 @@ Requirements ------------ -:requires: ProbabilisticParser (a CRF model specifically build for ONS) -:requires: pandas (0.19.1) -:requires: numpy (1.11.2) +:requires: addressParser (a CRF model specifically build for ONS) +:requires: pandas (tested with 0.19.2) +:requires: numpy (tested with 1.12.0) :requires: tqdm (4.10.0: https://github.com/tqdm/tqdm) :requires: recordlinkage (0.6.0: https://pypi.python.org/pypi/recordlinkage/) -:requires: matplotlib (1.5.3) +:requires: matplotlib (2.0.0) Author @@ -41,23 +41,23 @@ Version ------- -:version: 0.91 -:date: 22-Feb-2017 +:version: 0.92 +:date: 2-Mar-2017 """ import datetime import os -import re import sqlite3 import time import warnings + +import matplotlib import numpy as np import pandas as pd import pandas.util.testing as pdt import recordlinkage as rl +from Analytics.linking import addressParser from Analytics.linking import logger -from ProbabilisticParser import parser from tqdm import tqdm -import matplotlib matplotlib.use('Agg') # to prevent Tkinter crashing on cdhut-d03 import matplotlib.pyplot as plt @@ -65,7 +65,7 @@ warnings.simplefilter(action="ignore", category=FutureWarning) pd.options.mode.chained_assignment = None -__version__ = '0.91' +__version__ = '0.92' class AddressLinker: @@ -73,12 +73,15 @@ class AddressLinker: This class forms the backbone of the Address Linking prototype developed for ONS as a part of the Address Index project. - The class implements methods to read in AddressBase, to normalise and parse address strings, - link input data against AddressBase, and finally to merge the test data with the AddressBase - information. 
It should be noted that the load_data method should be overwritten and made
-    appropriate for each input test file which maybe be in different formats. In addition, the
-    check_performance method should also be overwritten because some datasets may or may not
-    contain already attached UPRNs and different confidences may have been attached to these UPRNs.
+    The class implements methods to read in AddressBase, link input data against AddressBase,
+    and finally to merge the test data with the AddressBase information. It should be noted that
+    the load_data method should be overwritten and made appropriate for each input test file, which
+    may be in a different format. In addition, the check_performance method should also be overwritten
+    because some datasets may or may not contain already attached UPRNs and different confidences may
+    have been attached to these UPRNs.
+
+    .. Note:: Address parsing is now implemented in the AddressParser class. This linking class relies on
+              the parser class.
     """
 
     def __init__(self, **kwargs):
@@ -201,10 +204,6 @@ def load_data(self):
 
             # rename postcode to postcode_orig and locality to locality_orig
             self.toLinkAddressData.rename(columns={'UPRNs_matched_to_date': 'UPRN_old'}, inplace=True)
-
-            # convert original UPRN to numeric
-            self.toLinkAddressData['UPRN_old'] = self.toLinkAddressData['UPRN_old'].convert_objects(
-                convert_numeric=True)
         else:
             self.log.info('ERROR - please overwrite the method and make it relevant for the actual test data...')
             raise NotImplementedError
@@ -224,6 +223,8 @@ def check_loaded_data(self):
         self.log.info('Found {} addresses...'.format(n_addresses))
 
         if 'UPRN_old' in self.toLinkAddressData.columns:
+            # cast the UPRNs as float64 for comparison purposes as int64 does not support NaNs
+            self.toLinkAddressData['UPRN_old'] = self.toLinkAddressData['UPRN_old'].astype(np.float64)
             self.nExistingUPRN = len(self.toLinkAddressData.loc[self.toLinkAddressData['UPRN_old'].notnull()].index)
         else:
             self.log.warning('No existing UPRNs found')
@@ -247,113 +248,35 @@ def check_loaded_data(self):
 
     def load_addressbase(self):
         """
-        A method to load a compressed version of the full AddressBase file.
+        A method to load a compressed version of the full AddressBase hybrid index.
 
         The information being used has been processed from the AB Epoch 39 files provided by ONS.
 
         .. Note: this method assumes that all modifications have already been carried out. This method
                  allows the prototype to be run on the ONS utility node as the memory requirements are
-                 reduced compared to when using the load_and_process_addressbase method.
+                 reduced.
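+
+        .. Note: UPRN is read in as np.float64 rather than int64 because int64 cannot represent
+                 the missing values that unmatched or incomplete input data can introduce when
+                 UPRNs are compared later in the linking.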
""" self.log.info('Reading in Modified Address Base Data...') - self.addressBase = pd.read_csv(self.settings['ABpath'] + self.settings['ABfilename'], - dtype={'UPRN': np.int64, 'ORGANISATION_NAME': str, - 'DEPARTMENT_NAME': str, 'SUB_BUILDING_NAME': str, 'BUILDING_NAME': str, - 'BUILDING_NUMBER': str, 'THROUGHFARE': str, - 'POST_TOWN': str, 'POSTCODE': str, 'PAO_TEXT': str, - 'PAO_START_NUMBER': str, 'PAO_START_SUFFIX': str, - 'PAO_END_SUFFIX': str, 'PAO_END_NUMBER': str, 'SAO_START_SUFFIX': str, - 'SAO_TEXT': str, 'SAO_START_NUMBER': np.float64, - 'STREET_DESCRIPTOR': str, 'TOWN_NAME': str, 'LOCALITY': str, - 'postcode_in': str, 'postcode_out': str}) - - self.addressBase['PAO_START_NUMBER'] = self.addressBase['PAO_START_NUMBER'].fillna('-12345') - self.addressBase['PAO_START_NUMBER'] = self.addressBase['PAO_START_NUMBER'].astype(np.int32) - - self.addressBase['PAO_END_NUMBER'] = self.addressBase['PAO_END_NUMBER'].fillna('-12345') - self.addressBase['PAO_END_NUMBER'] = self.addressBase['PAO_END_NUMBER'].astype(np.int32) - - # normalise street names so that st. is always st and 's is always s - PAF and NLP has differences - msk = self.addressBase['THROUGHFARE'].str.contains('ST\.\s', na=False, case=False) - self.addressBase.loc[msk, 'THROUGHFARE'] = self.addressBase.loc[msk, 'THROUGHFARE'].str.replace('ST\. ', 'ST ') - msk = self.addressBase['THROUGHFARE'].str.contains("'S\s", na=False, case=False) - self.addressBase.loc[msk, 'THROUGHFARE'] = self.addressBase.loc[msk, 'THROUGHFARE'].str.replace("'S\s", 'S ') - - self.log.info('Using {} addresses from AddressBase for matching...'.format(len(self.addressBase.index))) - - # set index name - needed later for merging / duplicate removal - self.addressBase.index.name = 'AddressBase_Index' - - def load_and_process_addressbase(self): - """ - A method to load a compressed version of the full AddressBase file and to process it. - - .. Note: this method modifies the original AB information by e.g. combining different tables. Such - activities are undertaken because of the aggressive blocking the prototype linking code uses. - The actual production system should take AB as it is and the linking should not perform blocking - but rather be flexible and take into account that in NAG the information can be stored in various - fields. 
- """ - self.log.info('Reading in Address Base Data...') - if self.settings['test']: self.log.warning('Using Test Data...') self.settings['ABfilename'] = 'ABtest.csv' + # for comparison purposes cast the UPRN as float as int64 does not support missing values self.addressBase = pd.read_csv(self.settings['ABpath'] + self.settings['ABfilename'], - dtype={'UPRN': np.int64, 'POSTCODE_LOCATOR': str, 'ORGANISATION_NAME': str, + dtype={'UPRN': np.float64, 'ORGANISATION_NAME': str, 'DEPARTMENT_NAME': str, 'SUB_BUILDING_NAME': str, 'BUILDING_NAME': str, - 'BUILDING_NUMBER': str, 'THROUGHFARE': str, 'DEPENDENT_LOCALITY': str, + 'BUILDING_NUMBER': str, 'THROUGHFARE': str, 'POST_TOWN': str, 'POSTCODE': str, 'PAO_TEXT': str, - 'PAO_START_NUMBER': str, 'PAO_START_SUFFIX': str, 'PAO_END_NUMBER': str, - 'PAO_END_SUFFIX': str, 'SAO_TEXT': str, 'SAO_START_NUMBER': np.float64, - 'SAO_START_SUFFIX': str, 'ORGANISATION': str, 'STREET_DESCRIPTOR': str, - 'TOWN_NAME': str, 'LOCALITY': str}) - self.log.info('Found {} addresses from AddressBase...'.format(len(self.addressBase.index))) - - # remove street records from the list of potential matches - this makes the search space slightly smaller - exclude = 'STREET RECORD|ELECTRICITY SUB STATION|PUMPING STATION|POND \d+M FROM|PUBLIC TELEPHONE|' - exclude += 'PART OF OS PARCEL|DEMOLISHED BUILDING|CCTV CAMERA|TANK \d+M FROM|SHELTER \d+M FROM|TENNIS COURTS|' - exclude += 'PONDS \d+M FROM|SUB STATION' - msk = self.addressBase['PAO_TEXT'].str.contains(exclude, na=False, case=False) - self.addressBase = self.addressBase.loc[~msk] - - # combine information - could be done differently, but for now using some of these for blocking - msk = self.addressBase['THROUGHFARE'].isnull() - self.addressBase.loc[msk, 'THROUGHFARE'] = self.addressBase.loc[msk, 'STREET_DESCRIPTOR'] - - msk = self.addressBase['BUILDING_NUMBER'].isnull() - self.addressBase.loc[msk, 'BUILDING_NUMBER'] = self.addressBase.loc[msk, 'PAO_START_NUMBER'] - - msk = self.addressBase['BUILDING_NAME'].isnull() - self.addressBase.loc[msk, 'BUILDING_NAME'] = self.addressBase.loc[msk, 'PAO_TEXT'] - - msk = self.addressBase['ORGANISATION_NAME'].isnull() - self.addressBase.loc[msk, 'ORGANISATION_NAME'] = self.addressBase.loc[msk, 'ORGANISATION'] - - msk = self.addressBase['POSTCODE'].isnull() - self.addressBase.loc[msk, 'POSTCODE'] = self.addressBase.loc[msk, 'POSTCODE_LOCATOR'] - - msk = self.addressBase['SUB_BUILDING_NAME'].isnull() - self.addressBase.loc[msk, 'SUB_BUILDING_NAME'] = self.addressBase.loc[msk, 'SAO_TEXT'] - - msk = self.addressBase['POST_TOWN'].isnull() - self.addressBase.loc[msk, 'POST_TOWN'] = self.addressBase.loc[msk, 'TOWN_NAME'] - - msk = self.addressBase['POSTCODE'].isnull() - self.addressBase.loc[msk, 'POSTCODE'] = self.addressBase.loc[msk, 'POSTCODE_LOCATOR'] - - msk = self.addressBase['LOCALITY'].isnull() - self.addressBase.loc[msk, 'LOCALITY'] = self.addressBase.loc[msk, 'DEPENDENT_LOCALITY'] + 'PAO_START_NUMBER': str, 'PAO_START_SUFFIX': str, + 'PAO_END_SUFFIX': str, 'PAO_END_NUMBER': str, 'SAO_START_SUFFIX': str, + 'SAO_TEXT': str, 'SAO_START_NUMBER': np.float64, 'LOCALITY': str, + 'STREET_DESCRIPTOR': str, 'postcode_in': str, 'postcode_out': str, + 'SAO_END_SUFFIX': str, 'SAO_END_NUMBER': np.float64}) - # sometimes addressbase does not have SAO_START_NUMBER even if SAO_TEXT clearly has a number - # take the digits from SAO_TEXT and place them to SAO_START_NUMBER if this is empty - msk = self.addressBase['SAO_START_NUMBER'].isnull() & (self.addressBase['SAO_TEXT'].notnull()) - 
self.addressBase.loc[msk, 'SAO_START_NUMBER'] = pd.to_numeric( - self.addressBase.loc[msk, 'SAO_TEXT'].str.extract('(\d+)')) - self.addressBase['SAO_START_NUMBER'].fillna(value=-12345, inplace=True) - self.addressBase['SAO_START_NUMBER'] = self.addressBase['SAO_START_NUMBER'].astype(np.int32) + # remove those with former in the sao_text + msk = self.addressBase['SAO_TEXT'].str.contains('FORMER', na=False, case=False) + self.addressBase = self.addressBase.loc[~msk] self.addressBase['PAO_START_NUMBER'] = self.addressBase['PAO_START_NUMBER'].fillna('-12345') self.addressBase['PAO_START_NUMBER'] = self.addressBase['PAO_START_NUMBER'].astype(np.int32) @@ -361,413 +284,23 @@ def load_and_process_addressbase(self): self.addressBase['PAO_END_NUMBER'] = self.addressBase['PAO_END_NUMBER'].fillna('-12345') self.addressBase['PAO_END_NUMBER'] = self.addressBase['PAO_END_NUMBER'].astype(np.int32) - # normalise street names so that st. is always st and 's is always s - msk = self.addressBase['THROUGHFARE'].str.contains('ST\.\s', na=False, case=False) - self.addressBase.loc[msk, 'THROUGHFARE'] = self.addressBase.loc[msk, 'THROUGHFARE'].str.replace('ST\. ', 'ST ') - msk = self.addressBase['THROUGHFARE'].str.contains("'S\s", na=False, case=False) - self.addressBase.loc[msk, 'THROUGHFARE'] = self.addressBase.loc[msk, 'THROUGHFARE'].str.replace("'S\s", 'S ') + self.addressBase['SAO_START_NUMBER'] = self.addressBase['SAO_START_NUMBER'].fillna('-12345') + self.addressBase['SAO_START_NUMBER'] = self.addressBase['SAO_START_NUMBER'].astype(np.int32) - # drop some that are not needed - in the future versions these might be useful - self.addressBase.drop(['DEPENDENT_LOCALITY', 'POSTCODE_LOCATOR', 'ORGANISATION'], - axis=1, inplace=True) + self.addressBase['SAO_END_NUMBER'] = self.addressBase['SAO_END_NUMBER'].fillna('-12345') + self.addressBase['SAO_END_NUMBER'] = self.addressBase['SAO_END_NUMBER'].astype(np.int32) - # split postcode to in and outcode - useful for doing blocking in different ways - if self.settings['expandPostcode']: - postcodes = self.addressBase['POSTCODE'].str.split(' ', expand=True) - postcodes.rename(columns={0: 'postcode_in', 1: 'postcode_out'}, inplace=True) - self.addressBase = pd.concat([self.addressBase, postcodes], axis=1) + for dummies_columns in ('PAO_START_SUFFIX', 'PAO_END_SUFFIX', 'SAO_START_SUFFIX', 'SAO_END_SUFFIX', 'SAO_TEXT'): + # if field is empty add dummy - helps when comparing against None + msk = self.addressBase[dummies_columns].isnull() + self.addressBase.loc[msk, dummies_columns] = 'N/A' self.log.info('Using {} addresses from AddressBase for matching...'.format(len(self.addressBase.index))) # set index name - needed later for merging / duplicate removal self.addressBase.index.name = 'AddressBase_Index' - if self.settings['verbose']: - print('AddressBase:') - print(self.addressBase.info(verbose=True, memory_usage=True, null_counts=True)) - if not self.settings['test']: - self.addressBase.to_csv(self.settings['ABpath'] + 'AB_processed.csv', index=False) - - @staticmethod - def _extract_postcode(string): - """ - A static private method to extract a postcode from address string. - - Uses a rather loose regular expression, so may get some strings that are not completely valid postcodes. - Should not be used to validate whether a postcode conforms to the UK postcode standards. 
- - The regular expression was taken from: - http://stackoverflow.com/questions/164979/uk-postcode-regex-comprehensive - - :param string: string to be parsed - :type string: str - - :return: postcode - :rtype: str - """ - regx = r'(([gG][iI][rR] {0,}0[aA]{2})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][abehmnprv-yABEHMNPRV-Y]))) {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2}))' - try: - potential_postcode = re.findall(regx, string)[0][0] - potential_postcode = potential_postcode.lower().strip() - except IndexError: - potential_postcode = None - - # above regex gives also those without space between, add if needed - if potential_postcode is not None: - if ' ' not in potential_postcode: - inc = potential_postcode[-3:] - out = potential_postcode.replace(inc, '') - potential_postcode = out + ' ' + inc - - return potential_postcode - - def _normalize_input_data(self): - """ - A private method to normalize address information. - - Removes white spaces, commas, and backslashes. Can also be used to expand common synonyms such - as RD or BERKS. Finally parses counties as the an early version of the probabilistic parser was - not trained to parse counties. - """ - self.log.info('Normalising input addresses') - - # make a copy of the actual address field and run the parsing against it - self.toLinkAddressData['ADDRESS_norm'] = self.toLinkAddressData['ADDRESS'].copy() - - # remove white spaces if present - self.toLinkAddressData['ADDRESS_norm'] = self.toLinkAddressData['ADDRESS_norm'].str.strip() - - # remove commas - self.toLinkAddressData['ADDRESS_norm'] = self.toLinkAddressData['ADDRESS_norm'].str.replace(', ', ' ') - self.toLinkAddressData['ADDRESS_norm'] = self.toLinkAddressData['ADDRESS_norm'].str.replace(',', ' ') - - # remove backslash if present and replace with space - self.toLinkAddressData['ADDRESS_norm'] = self.toLinkAddressData['ADDRESS_norm'].str.replace('\\', ' ') - - # remove spaces around hyphens as this causes ranges to be interpreted incorrectly - # e.g. 
FLAT 15 191 - 193 NEWPORT ROAD CARDIFF CF24 1AJ is parsed incorrectly if there - # is space around the hyphen - self.toLinkAddressData['ADDRESS_norm'] = \ - self.toLinkAddressData['ADDRESS_norm'].str.replace(r'(\d+)(\s*-\s*)(\d+)', r'\1-\3', case=False) - - # some addresses have number TO number, while this should be with hyphen, replace TO with - in those cases - # note: using \1 for group 1 and \3 for group 3 as I couldn't make non-capturing groups work - self.toLinkAddressData['ADDRESS_norm'] = \ - self.toLinkAddressData['ADDRESS_norm'].str.replace(r'(\d+)(\s*TO\s*)(\d+)', r'\1-\3', case=False) - - # some addresses have number/number rather than - as the range separator - self.toLinkAddressData['ADDRESS_norm'] = \ - self.toLinkAddressData['ADDRESS_norm'].str.replace(r'(\d+)(\s*/\s*)(\d+)', r'\1-\3', case=False) - - # synonyms to expand - read from a file with format (from, to) - synonyms = pd.read_csv(os.path.join(self.currentDirectory, '../../data/') + 'synonyms.csv').values - - # expand common synonyms to help with parsing - if self.settings['expandSynonyms']: - self.log.info('Expanding synonyms as a part of normalisation...') - for fro, to in synonyms: - self.toLinkAddressData['ADDRESS_norm'] = self.toLinkAddressData['ADDRESS_norm'].str.replace(fro, to) - - # parsing gets really confused if region or county is in the line - get known counties from a file - counties = pd.read_csv(os.path.join(self.currentDirectory, '../../data/') + 'counties.csv')['county'] - - # use this for the counties so that e.g. ESSEX ROAD does not become just ROAD... - # todo: the regex is getting ridiculous, maybe do other way around i.e. country must be followed by postcode or - # be the last component. - addRegex = r'(?:\s|$)(?!ROAD|LANE|STREET|CLOSE|DRIVE|AVENUE|SQUARE|COURT|PARK|CRESCENT|WAY|WALK|HEOL|FFORDD|HILL|GARDENS|GATE|GROVE|HOUSE|VIEW|BUILDING|VILLAS|LODGE|PLACE|ROW|WHARF|RISE|TERRACE|CROSS|ENTERPRISE|HATCH|&)' - - # remove county from address but add a column for it - self.toLinkAddressData['County'] = None - for county in counties: - msk = self.toLinkAddressData['ADDRESS_norm'].str.contains(county + addRegex, regex=True, na=False) - self.toLinkAddressData.loc[msk, 'County'] = county - self.toLinkAddressData['ADDRESS_norm'] = self.toLinkAddressData['ADDRESS_norm'].str.replace(county + - addRegex, '', - case=False) - - @staticmethod - def _fix_london_boroughs(parsed, directory, datafile='localities.csv'): - """ - A static private method to address incorrectly parsed London boroughs. - - If the street name contains London borough then move it to locality and remove from the street name. - - :param parsed: a dictionary containing the address tokens that have been parsed - :type parsed: dict - :param directory: location of the data file - :type directory: str - :param datafile: name of the data file containing a column locality - :type datafile: str - - :return: a dictionary containing the address tokens with updated information - :rtype: dict - """ - london_localities = pd.read_csv(directory + datafile)['locality'] - - for LondonLocality in london_localities: - if parsed['StreetName'].strip().endswith(LondonLocality): - parsed['Locality'] = LondonLocality - # take the last part out, so that e.g. CHINGFORD AVENUE CHINGFORD is correctly processed - # need to be careful with e.g. 
WESTERN GATEWAY ROYAL VICTORIA DOCK (3 parts to remove) - parsed['StreetName'] = parsed['StreetName'].strip()[:-len(LondonLocality)].strip() - - return parsed - - def parse_input_addresses_to_tokens(self): - """ - Parses the address information from the input data. - - Uses a combination of a probabilistic Conditional Random Fields model trained on PAF data and some rules. - Can perform address string normalisation i.e. remove punctuation and e.g. expand synonyms. - """ - self.log.info('Start parsing address data...') - - # normalise data so that the parser has the best possible chance of getting things right - self._normalize_input_data() - - # get addresses and store separately as an vector - addresses = self.toLinkAddressData['ADDRESS_norm'].values - self.log.info('{} addresses to parse...'.format(len(addresses))) - - # temp data storage lists - organisation = [] - department = [] - sub_building = [] - flat_number = [] - building_name = [] - building_number = [] - pao_start_number = [] - pao_end_number = [] - building_suffix = [] - street = [] - locality = [] - town = [] - postcode = [] - - # loop over addresses - quite inefficient, should avoid a loop - for address in tqdm(addresses): - parsed = parser.tag(address.upper()) # probabilistic parser - possible_postcode = self._extract_postcode(address) # regular expression extraction - - # if both parsers found postcode then check that they are the same - if parsed.get('Postcode', None) is not None and possible_postcode is not None: - if parsed['Postcode'] != possible_postcode: - # not the same, use possible_postcode - parsed['Postcode'] = possible_postcode - - # if the probabilistic parser did not find postcode but regular expression did, then use that - if parsed.get('Postcode', None) is None and possible_postcode is not None: - parsed['Postcode'] = possible_postcode - - if parsed.get('Postcode', None) is not None: - # check that there is space, if not then add if the parsed postcode is long enough to contain a complete - # postcode. Some users have partial postcodes to which one should not add a space. - if ' ' not in parsed['Postcode'] and len(parsed['Postcode']) > 4: - in_code = parsed['Postcode'][-3:] - out_code = parsed['Postcode'].replace(in_code, '') - parsed['Postcode'] = out_code + ' ' + in_code - - # change to all capitals - parsed['Postcode'] = parsed['Postcode'].upper() - - # if Hackney etc. in StreetName then remove and move to locality if town name contains London - # Probabilistic parser should see more cases with london localities, parsed incorrectly at the mo - if parsed.get('StreetName', None) is not None and parsed.get('TownName', None) is not None: - if 'LONDON' in parsed['TownName']: - parsed = self._fix_london_boroughs(parsed, os.path.join(self.currentDirectory, '../../data/')) - - # if delivery point address is e.g. "5 BEST HOUSE", then the "5" refers likely to FLAT 5 - if parsed.get('BuildingNumber', None) is None and parsed.get('BuildingName', None) is not None: - tmp = parsed['BuildingName'].split(' ') - if len(tmp) > 1: - try: - _ = int(tmp[0]) - parsed['FlatNumber'] = tmp[0] - except ValueError: - pass - - # if BuildingName is e.g. 
55A then should get the number and suffix separately - if parsed.get('BuildingName', None) is not None: - - parsed['pao_end_number'] = None - - if '-' in parsed['BuildingName']: - tmp = parsed['BuildingName'].split('-') - parsed['pao_start_number'] = ''.join([x for x in tmp[0] if x.isdigit()]) - parsed['pao_end_number'] = ''.join([x for x in tmp[-1] if x.isdigit()]) - else: - parsed['pao_start_number'] = ''.join([x for x in parsed['BuildingName'] if x.isdigit()]) - - if len(parsed['pao_start_number']) < 1: - parsed['pao_start_number'] = None - - parsed['BuildingSuffix'] = ''.join([x for x in parsed['BuildingName'] if not x.isdigit()]) - - # accept suffixes that are only maximum two chars and if not hyphen - if len(parsed['BuildingSuffix']) > 2 or parsed['BuildingSuffix'] == '-' or \ - parsed['BuildingSuffix'] == '/': - parsed['BuildingSuffix'] = None - - # some addresses contain place CO place, where the CO is not part of the actual name - remove these - # same is true for IN e.g. Road Marton IN Cleveland - if parsed.get('Locality', None) is not None: - if parsed['Locality'].strip().endswith(' CO'): - parsed['Locality'] = parsed['Locality'].replace(' CO', '') - if parsed['Locality'].strip().endswith(' IN'): - parsed['Locality'] = parsed['Locality'].replace(' IN', '') - - # sometimes building number gets placed at building name, take it and add to building name - if parsed.get('BuildingNumber', None) is None and parsed.get('BuildingName', None) is not None: - tmp = parsed['BuildingName'].split(' ') - if len(tmp) > 1: - try: - _ = int(tmp[0]) - parsed['BuildingNumber'] = tmp[0] - except ValueError: - pass - - # if pao_start_number is Null then add BuildingNumber to it - if parsed.get('pao_start_number', None) is None and parsed.get('BuildingNumber', None) is not None: - parsed['pao_start_number'] = parsed['BuildingNumber'] - - # parser sometimes places house to organisation name, while it is likelier that it should be subBuilding - if parsed.get('OrganisationName') == 'HOUSE' and parsed.get('SubBuildingName', None) is None: - parsed['SubBuildingName'] = parsed.get('OrganisationName') - - # store the parsed information to separate lists - organisation.append(parsed.get('OrganisationName', None)) - department.append(parsed.get('DepartmentName', None)) - sub_building.append(parsed.get('SubBuildingName', None)) - building_name.append(parsed.get('BuildingName', None)) - building_number.append(parsed.get('BuildingNumber', None)) - street.append(parsed.get('StreetName', None)) - locality.append(parsed.get('Locality', None)) - town.append(parsed.get('TownName', None)) - postcode.append(parsed.get('Postcode', None)) - building_suffix.append(parsed.get('BuildingSuffix', None)) - pao_start_number.append(parsed.get('pao_start_number', None)) - pao_end_number.append(parsed.get('pao_end_number', None)) - flat_number.append(parsed.get('FlatNumber', None)) - - # add the parsed information to the dataframe - self.toLinkAddressData['OrganisationName'] = organisation - self.toLinkAddressData['DepartmentName'] = department - self.toLinkAddressData['SubBuildingName'] = sub_building - self.toLinkAddressData['BuildingName'] = building_name - self.toLinkAddressData['BuildingNumber'] = building_number - self.toLinkAddressData['StreetName'] = street - self.toLinkAddressData['Locality'] = locality - self.toLinkAddressData['TownName'] = town - self.toLinkAddressData['Postcode'] = postcode - self.toLinkAddressData['BuildingSuffix'] = building_suffix - self.toLinkAddressData['BuildingStartNumber'] = pao_start_number - 
self.toLinkAddressData['BuildingEndNumber'] = pao_end_number - self.toLinkAddressData['FlatNumber'] = flat_number - self.toLinkAddressData['PAOText'] = self.toLinkAddressData['BuildingName'].copy() - self.toLinkAddressData['SAOText'] = self.toLinkAddressData['SubBuildingName'].copy() - - if self.settings['expandPostcode']: - # if valid postcode information found then split between in and outcode - if self.toLinkAddressData['Postcode'].count() > 0: - postcodes = self.toLinkAddressData['Postcode'].str.split(' ', expand=True) - postcodes.rename(columns={0: 'postcode_in', 1: 'postcode_out'}, inplace=True) - self.toLinkAddressData = pd.concat([self.toLinkAddressData, postcodes], axis=1) - else: - self.toLinkAddressData['postcode_in'] = None - self.toLinkAddressData['postcode_out'] = None - - # if building number is empty and subBuildingName is a only number, add to BuildingStartNumber - msk = self.toLinkAddressData['SubBuildingName'].str.contains('\d+', na=False, case=False) & \ - self.toLinkAddressData['BuildingStartNumber'].isnull() - self.toLinkAddressData.loc[msk, 'BuildingStartNumber'] = self.toLinkAddressData.loc[msk, 'SubBuildingName'] - - # split flat or apartment number as separate for numerical comparison - compare e.g. SAO number - msk = self.toLinkAddressData['SubBuildingName'].str.contains('flat|apartment|unit', na=False, case=False) - self.toLinkAddressData.loc[msk, 'FlatNumber'] = self.toLinkAddressData.loc[msk, 'SubBuildingName'] - self.toLinkAddressData.loc[msk, 'FlatNumber'] = \ - self.toLinkAddressData.loc[msk].apply(lambda x: x['FlatNumber'].strip(). - replace('FLAT', '').replace('APARTMENT', '').replace('UNIT', ''), - axis=1) - - # sometimes subBuildingName is e.g. C2 where to number refers to the flat number - msk = self.toLinkAddressData['FlatNumber'].str.contains('[A-Z]\d+', na=False, case=False) - self.toLinkAddressData.loc[msk, 'FlatNumber'] = \ - self.toLinkAddressData.loc[msk, 'FlatNumber'].str.replace('[A-Z]', '') - - # deal with addresses that are of type 5/7 4 whatever road... - msk = self.toLinkAddressData['SubBuildingName'].str.contains('\d+\/\d+', na=False, case=False) & \ - self.toLinkAddressData['FlatNumber'].isnull() & self.toLinkAddressData['BuildingNumber'].notnull() - self.toLinkAddressData.loc[msk, 'FlatNumber'] = \ - self.toLinkAddressData.loc[msk, 'SubBuildingName'].str.replace('\/\d+', '') - - # some addresses have / as the separator for buildings and flats, when matching against NLP, needs "FLAT" - msk = self.toLinkAddressData['SubBuildingName'].str.contains('\d+\/\d+', na=False, case=False) - self.toLinkAddressData.loc[msk, 'SubBuildingName'] = 'FLAT ' + \ - self.toLinkAddressData.loc[msk, 'SubBuildingName'] - - # if SubBuildingName contains only numbers, then place also to the flat number field as likely to be flat - msk = self.toLinkAddressData['SubBuildingName'].str.isnumeric() & self.toLinkAddressData['FlatNumber'].isnull() - msk[msk.isnull()] = False - self.toLinkAddressData.loc[msk, 'FlatNumber'] = self.toLinkAddressData.loc[msk, 'SubBuildingName'] - - # some addresses, e.g. 
"5B ELIZABETH AVENUE", have FLAT implicitly even if not spelled -> add "FLAT X" - msk = (self.toLinkAddressData['BuildingSuffix'].notnull()) & \ - (self.toLinkAddressData['SubBuildingName'].isnull()) - self.toLinkAddressData.loc[msk, 'SubBuildingName'] = 'FLAT ' + self.toLinkAddressData.loc[msk, 'BuildingSuffix'] - - # in some other cases / is in the BuildingName field - now this separates the building and flat - # the first part refers to the building number and the second to the flat - msk = self.toLinkAddressData['BuildingName'].str.contains('\d+\/\d+', na=False, case=False) & \ - self.toLinkAddressData['FlatNumber'].isnull() - self.toLinkAddressData.loc[msk, 'FlatNumber'] = self.toLinkAddressData.loc[msk, 'BuildingName'] - self.toLinkAddressData.loc[msk, 'FlatNumber'] = \ - self.toLinkAddressData.loc[msk, 'FlatNumber'].str.replace('\d+\/', '') - self.toLinkAddressData['FlatNumber'] = pd.to_numeric(self.toLinkAddressData['FlatNumber'], errors='coerce') - self.toLinkAddressData['FlatNumber'].fillna(-12345, inplace=True) - self.toLinkAddressData['FlatNumber'] = self.toLinkAddressData['FlatNumber'].astype(np.int32) - - self.toLinkAddressData.loc[msk, 'BuildingStartNumber'] = self.toLinkAddressData.loc[msk, 'BuildingName'] - self.toLinkAddressData.loc[msk, 'BuildingStartNumber'] = \ - self.toLinkAddressData.loc[msk, 'BuildingStartNumber'].str.replace('\/\d+', '') - self.toLinkAddressData['BuildingStartNumber'] = pd.to_numeric(self.toLinkAddressData['BuildingStartNumber'], - errors='coerce') - self.toLinkAddressData['BuildingStartNumber'].fillna(-12345, inplace=True) - self.toLinkAddressData['BuildingStartNumber'] = self.toLinkAddressData['BuildingStartNumber'].astype(np.int32) - - self.toLinkAddressData['BuildingEndNumber'] = pd.to_numeric(self.toLinkAddressData['BuildingEndNumber'], - errors='coerce') - self.toLinkAddressData['BuildingEndNumber'].fillna(-12345, inplace=True) - self.toLinkAddressData['BuildingEndNumber'] = self.toLinkAddressData['BuildingEndNumber'].astype(np.int32) - - # if SubBuilding name or BuildingSuffix is empty add dummy - helps when comparing against None - msk = self.toLinkAddressData['SubBuildingName'].isnull() - self.toLinkAddressData.loc[msk, 'SubBuildingName'] = 'N/A' - msk = self.toLinkAddressData['BuildingSuffix'].isnull() - self.toLinkAddressData.loc[msk, 'BuildingSuffix'] = 'N/A' - msk = self.toLinkAddressData['PAOText'].isnull() - # for some welsh addresses the building name is parsed as organisation name, so place to PAOtext if empty - self.toLinkAddressData.loc[msk, 'PAOText'] = self.toLinkAddressData['OrganisationName'] - msk = self.toLinkAddressData['PAOText'].isnull() - self.toLinkAddressData.loc[msk, 'PAOText'] = '' - msk = self.toLinkAddressData['SAOText'].isnull() - self.toLinkAddressData.loc[msk, 'SAOText'] = 'N/A' - - # fill columns that are often NA with empty strings - helps when doing string comparisons against Nones - columns_to_add_empty_strings = ['OrganisationName', 'DepartmentName', 'SubBuildingName'] - self.toLinkAddressData[columns_to_add_empty_strings].fillna('', inplace=True) - - # save for inspection - self.toLinkAddressData.to_csv(self.settings['outpath'] + self.settings['outname'] + '_parsed_addresses.csv', - index=False) - - # drop the temp info - self.toLinkAddressData.drop(['ADDRESS_norm', ], axis=1, inplace=True) - - if self.settings['verbose']: - print('Parsed:') - print(self.toLinkAddressData.info(verbose=True, memory_usage=True, null_counts=True)) - - def link_all_addresses(self, blocking_modes=(1, 2, 3, 4, 5, 6, 7, 8, 9, 
10)): + def link_all_addresses(self, blocking_modes=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)): """ A method to link addresses against AddressBase. @@ -803,7 +336,7 @@ def _find_likeliest_address(self, addresses_to_be_linked, blocking=1): :param addresses_to_be_linked: dataframe holding the address information that is to be matched against a source :type addresses_to_be_linked: pandas.DataFrame - :param blocking: the mode of blocking, ranging from 1 to 8 + :param blocking: the mode of blocking, ranging from 1 to 11 :type blocking: int :return: dataframe of matches, dataframe of non-matched addresses @@ -816,33 +349,36 @@ def _find_likeliest_address(self, addresses_to_be_linked, blocking=1): # block on both postcode and house number, street name can have typos and therefore is not great for blocking self.log.info('Start matching with blocking mode {}'.format(blocking)) if blocking == 1: + pairs = pcl.block(left_on=['OrganisationName', 'Postcode'], + right_on=['ORGANISATION_NAME', 'POSTCODE']) + elif blocking == 2: pairs = pcl.block(left_on=['OrganisationName', 'TownName', 'BuildingNumber'], right_on=['ORGANISATION_NAME', 'POST_TOWN', 'BUILDING_NUMBER']) - elif blocking == 2: + elif blocking == 3: pairs = pcl.block(left_on=['OrganisationName', 'TownName'], right_on=['ORGANISATION_NAME', 'POST_TOWN']) - elif blocking == 3: + elif blocking == 4: pairs = pcl.block(left_on=['Postcode', 'BuildingName'], right_on=['POSTCODE', 'BUILDING_NAME']) - elif blocking == 4: + elif blocking == 5: pairs = pcl.block(left_on=['Postcode', 'BuildingNumber'], right_on=['POSTCODE', 'BUILDING_NUMBER']) - elif blocking == 5: + elif blocking == 6: pairs = pcl.block(left_on=['Postcode', 'StreetName'], right_on=['POSTCODE', 'THROUGHFARE']) - elif blocking == 6: + elif blocking == 7: pairs = pcl.block(left_on=['Postcode', 'TownName'], right_on=['POSTCODE', 'POST_TOWN']) - elif blocking == 7: + elif blocking == 8: pairs = pcl.block(left_on=['Postcode'], right_on=['POSTCODE']) - elif blocking == 8: + elif blocking == 9: pairs = pcl.block(left_on=['BuildingName', 'StreetName'], right_on=['BUILDING_NAME', 'THROUGHFARE']) - elif blocking == 9: + elif blocking == 10: pairs = pcl.block(left_on=['BuildingNumber', 'StreetName'], right_on=['BUILDING_NUMBER', 'THROUGHFARE']) - elif blocking == 10: + elif blocking == 11: pairs = pcl.block(left_on=['StreetName', 'TownName'], right_on=['THROUGHFARE', 'POST_TOWN']) else: @@ -865,12 +401,15 @@ def _find_likeliest_address(self, addresses_to_be_linked, blocking=1): missing_value=0.8) compare.string('BUILDING_NUMBER', 'BuildingNumber', method='jarowinkler', name='building_number_dl', missing_value=0.5) - compare.numeric('PAO_START_NUMBER', 'BuildingStartNumber', threshold=0.1, method='linear', + compare.numeric('PAO_START_NUMBER', 'PAOstartNumber', threshold=0.1, method='linear', name='pao_number_dl') - compare.numeric('PAO_END_NUMBER', 'BuildingEndNumber', threshold=0.1, method='linear', + compare.numeric('PAO_END_NUMBER', 'PAOendNumber', threshold=0.1, method='linear', name='building_end_number_dl') - compare.string('THROUGHFARE', 'StreetName', method='jarowinkler', name='street_dl', - missing_value=0.7) + if blocking not in (6, 9, 10): + compare.string('THROUGHFARE', 'StreetName', method='jarowinkler', name='street_dl', + missing_value=0.7) + compare.string('STREET_DESCRIPTOR', 'StreetName', method='jarowinkler', name='street_desc_dl', + missing_value=0.6) compare.string('POST_TOWN', 'TownName', method='jarowinkler', name='town_dl', missing_value=0.2) compare.string('LOCALITY', 'Locality', 
method='jarowinkler', name='locality_dl', @@ -883,18 +422,25 @@ def _find_likeliest_address(self, addresses_to_be_linked, blocking=1): compare.string('postcode_out', 'postcode_out', method='jarowinkler', name='outcode_dl', missing_value=0.0) - if blocking in (1, 2, 8, 9, 10): + if blocking in (2, 3, 9, 10, 11): compare.string('POSTCODE', 'Postcode', method='jarowinkler', name='postcode_dl', missing_value=0.0) # use to separate e.g. 55A from 55 - compare.string('PAO_START_SUFFIX', 'BuildingSuffix', method='jarowinkler', name='pao_suffix_dl', + compare.string('PAO_START_SUFFIX', 'PAOstartSuffix', method='jarowinkler', name='pao_suffix_dl', + missing_value=0.5) + compare.string('PAO_END_SUFFIX', 'PAOendSuffix', method='jarowinkler', name='pao_suffix_dl2', missing_value=0.5) # the following is good for flats and apartments, which have been numbered compare.string('SUB_BUILDING_NAME', 'SubBuildingName', method='jarowinkler', name='flatw_dl', missing_value=0.6) - compare.numeric('SAO_START_NUMBER', 'FlatNumber', threshold=0.1, method='linear', name='sao_number_dl') + compare.numeric('SAO_START_NUMBER', 'SAOStartNumber', threshold=0.1, method='linear', name='sao_number_dl') + compare.numeric('SAO_END_NUMBER', 'SAOEndNumber', threshold=0.1, method='linear', name='sao_number_dl2') + compare.string('SAO_START_SUFFIX', 'SAOStartSuffix', method='jarowinkler', name='sao_suffix_dl', + missing_value=0.5) + compare.string('SAO_END_SUFFIX', 'SAOEndSuffix', method='jarowinkler', name='sao_suffix_dl2', + missing_value=0.5) # set rules for organisations such as care homes and similar type addresses compare.string('ORGANISATION_NAME', 'OrganisationName', method='jarowinkler', name='organisation_dl', @@ -902,23 +448,21 @@ def _find_likeliest_address(self, addresses_to_be_linked, blocking=1): compare.string('DEPARTMENT_NAME', 'DepartmentName', method='jarowinkler', name='department_dl', missing_value=0.6) - # Extras - compare.string('STREET_DESCRIPTOR', 'StreetName', method='jarowinkler', name='street_desc_dl', - missing_value=0.6) - # execute the comparison model compare.run() # remove those matches that are not close enough - requires e.g. 
street name to be close enough - if blocking in (1, 2): + if blocking in (2, 3): compare.vectors = compare.vectors.loc[compare.vectors['incode_dl'] >= 0.8] compare.vectors = compare.vectors.loc[compare.vectors['outcode_dl'] >= 0.5] compare.vectors = compare.vectors.loc[compare.vectors['street_dl'] >= 0.7] - elif blocking in (3, 4): - compare.vectors = compare.vectors.loc[compare.vectors['street_dl'] >= 0.5] - elif blocking in (5, 6, 7): + elif blocking in (4,): + compare.vectors = compare.vectors.loc[compare.vectors['street_dl'] >= 0.6] + elif blocking in (6,): + compare.vectors = compare.vectors.loc[compare.vectors['pao_number_dl'] > 0.9] + elif blocking in (7, 8): + compare.vectors = compare.vectors.loc[compare.vectors['street_dl'] >= 0.6] compare.vectors = compare.vectors.loc[compare.vectors['pao_number_dl'] > 0.9] - elif blocking in (6, 7): msk = (compare.vectors['street_dl'] >= 0.7) | (compare.vectors['organisation_dl'] > 0.3) compare.vectors = compare.vectors.loc[msk] @@ -1015,7 +559,7 @@ def check_performance(self): """ self.log.info('Checking Performance...') - # count the number of matches and the total number of addresses and write to the lod + # count the number of matches and the total number of addresses total = len(self.matching_results.index) # save matched to a file for inspection @@ -1213,16 +757,16 @@ def run_all(self): self.log.info('finished in {} seconds...'.format(round((stop - start), 1))) start = time.clock() - if self.settings['test']: - self.load_and_process_addressbase() - else: - self.load_addressbase() - + self.load_addressbase() stop = time.clock() self.log.info('finished in {} seconds...'.format(round((stop - start), 1))) start = time.clock() - self.parse_input_addresses_to_tokens() + address_parser = addressParser.AddressParser(log=self.log, **self.settings) + self.toLinkAddressData = address_parser.parse(self.toLinkAddressData) + self.toLinkAddressData = address_parser.convert_to_numeric_and_add_dummies(self.toLinkAddressData) + self.toLinkAddressData.to_csv(self.settings['outpath'] + self.settings['outname'] + '_parsed_addresses.csv', + index=False) stop = time.clock() self.log.info('finished in {} seconds...'.format(round((stop - start), 1))) diff --git a/DataScience/Analytics/linking/addressParser.py b/DataScience/Analytics/linking/addressParser.py new file mode 100644 index 000000000..4c2bd6c94 --- /dev/null +++ b/DataScience/Analytics/linking/addressParser.py @@ -0,0 +1,558 @@ +""" +ONS Address Index - Complete Address Parser +=========================================== + +This file contains an Address Parser class that can perform normalisation, probabilistic parsing, and post-processing. + + +Requirements +------------ + +:requires: pandas (tested with 0.19.2) +:requires: numpy (tested with 1.12.0) +:requires: ProbabilisticParser (a CRF model specifically build for ONS) +:requires: tqdm (4.10.0: https://github.com/tqdm/tqdm) + + +Author +------ + +:author: Sami Niemi (sami.niemi@valtech.co.uk) + + +Version +------- + +:version: 0.1 +:date: 28-Feb-2017 +""" +import logging +import os +import re +import sys +import warnings + +import numpy as np +import pandas as pd +from ProbabilisticParser import parser +from tqdm import tqdm + +# suppress pandas warnings +warnings.simplefilter(action="ignore", category=FutureWarning) +warnings.simplefilter(action="ignore", category=UserWarning) + + +class AddressParser: + """ + Address Parser class that implements the probabilistic parser and required pre- and post-processing steps. 
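+
+    A minimal usage sketch, mirroring how run_all in addressLinking.py invokes this class
+    (the logger and input data frame names are illustrative)::
+
+        address_parser = AddressParser(log=my_logger, expandSynonyms=True)
+        parsed_data = address_parser.parse(input_data_frame)
+        parsed_data = address_parser.convert_to_numeric_and_add_dummies(parsed_data)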
+ """ + + def __init__(self, log=None, **kwargs): + if log is None: + log = logging.getLogger() + log.addHandler(logging.StreamHandler(sys.stdout).setLevel(logging.DEBUG)) + + self.log = log + + # relative path when referring to data files + self.currentDirectory = os.path.dirname(__file__) # for relative path definitions + self.settings = dict(expandSynonyms=True) + self.settings.update(kwargs) + + @staticmethod + def _extract_postcode(string): + """ + A static private method to extract a postcode from address string. + + Uses a rather loose regular expression, so may get some strings that are not completely valid postcodes. + Should not be used to validate whether a postcode conforms to the UK postcode standards. + + The regular expression was taken from: + http://stackoverflow.com/questions/164979/uk-postcode-regex-comprehensive + + :param string: string to be parsed + :type string: str + + :return: postcode + :rtype: str + """ + regx = r'(([gG][iI][rR] {0,}0[aA]{2})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|' + \ + '(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9]' + \ + '[abehmnprv-yABEHMNPRV-Y]))) {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2}))' + try: + potential_postcode = re.findall(regx, string)[0][0] + potential_postcode = potential_postcode.lower().strip() + except IndexError: + potential_postcode = None + + # above regex gives also those without space between, add if needed + if potential_postcode is not None: + if ' ' not in potential_postcode: + inc = potential_postcode[-3:] + out = potential_postcode.replace(inc, '') + potential_postcode = out + ' ' + inc + + return potential_postcode + + @staticmethod + def _fix_london_boroughs(parsed, directory, datafile='localities.csv'): + """ + A static private method to address incorrectly parsed London boroughs. + + If the street name contains London borough then move it to locality and remove from the street name. + + :param parsed: a dictionary containing the address tokens that have been parsed + :type parsed: dict + :param directory: location of the data file + :type directory: str + :param datafile: name of the data file containing a column locality + :type datafile: str + + :return: a dictionary containing the address tokens with updated information + :rtype: dict + """ + london_localities = pd.read_csv(directory + datafile)['locality'] + + for LondonLocality in london_localities: + if parsed['StreetName'].strip().endswith(LondonLocality): + parsed['Locality'] = LondonLocality + # take the last part out, so that e.g. CHINGFORD AVENUE CHINGFORD is correctly processed + # need to be careful with e.g. WESTERN GATEWAY ROYAL VICTORIA DOCK (3 parts to remove) + parsed['StreetName'] = parsed['StreetName'].strip()[:-len(LondonLocality)].strip() + + return parsed + + def _normalize_input_data(self, data, normalised_field_name='ADDRESS_norm'): + """ + Normalise input address information. + + This includes removal of commas and backslashes and whitespaces around numerical ranges. 
+
+    @staticmethod
+    def _fix_london_boroughs(parsed, directory, datafile='localities.csv'):
+        """
+        A static private method to address incorrectly parsed London boroughs.
+
+        If the street name ends with a London borough, then the borough is moved to the locality
+        and removed from the street name.
+
+        :param parsed: a dictionary containing the address tokens that have been parsed
+        :type parsed: dict
+        :param directory: location of the data file
+        :type directory: str
+        :param datafile: name of the data file containing a column locality
+        :type datafile: str
+
+        :return: a dictionary containing the address tokens with updated information
+        :rtype: dict
+        """
+        london_localities = pd.read_csv(directory + datafile)['locality']
+
+        for LondonLocality in london_localities:
+            if parsed['StreetName'].strip().endswith(LondonLocality):
+                parsed['Locality'] = LondonLocality
+                # take the last part out, so that e.g. CHINGFORD AVENUE CHINGFORD is correctly processed
+                # need to be careful with e.g. WESTERN GATEWAY ROYAL VICTORIA DOCK (3 parts to remove)
+                parsed['StreetName'] = parsed['StreetName'].strip()[:-len(LondonLocality)].strip()
+
+        return parsed
+
+    def _normalize_input_data(self, data, normalised_field_name='ADDRESS_norm'):
+        """
+        Normalise input address information.
+
+        This includes removing commas and backslashes, and stripping whitespace around numerical ranges.
+
+        :param data: address data containing a column 'ADDRESS' to normalise
+        :type data: pandas.DataFrame
+        :param normalised_field_name: name of the new field to contain normalised address data
+        :type normalised_field_name: str
+
+        :return: normalised data containing a new column named as given by normalised_field_name
+        :rtype: pandas.DataFrame
+        """
+        # make a copy of the actual address field and run the parsing against it
+        data[normalised_field_name] = data['ADDRESS'].copy()
+
+        # remove white spaces from the end and the beginning if present
+        data[normalised_field_name] = data[normalised_field_name].str.strip()
+
+        # remove commas if present as they are not useful for matching
+        data[normalised_field_name] = data[normalised_field_name].str.replace(', ', ' ')
+        data[normalised_field_name] = data[normalised_field_name].str.replace(',', ' ')
+
+        # remove backslash if present and replace with space
+        data[normalised_field_name] = data[normalised_field_name].str.replace('\\', ' ')
+
+        # remove spaces around hyphens as this causes ranges to be interpreted incorrectly
+        # e.g. FLAT 15 191 - 193 NEWPORT ROAD CARDIFF CF24 1AJ is parsed incorrectly if there
+        # is space around the hyphen
+        data[normalised_field_name] = \
+            data[normalised_field_name].str.replace(r'(\d+)(\s*-\s*)(\d+)', r'\1-\3', case=False)
+
+        # some addresses have number TO number, while this should be with a hyphen; replace TO with - in those cases
+        # note: using \1 for group 1 and \3 for group 3 as I couldn't make non-capturing groups work
+        data[normalised_field_name] = \
+            data[normalised_field_name].str.replace(r'(\d+)(\s*TO\s*)(\d+)', r'\1-\3', case=False)
+
+        # some addresses have number/number rather than - as the range separator
+        data[normalised_field_name] = \
+            data[normalised_field_name].str.replace(r'(\d+)(\s*/\s*)(\d+)', r'\1-\3', case=False)
+
+        # some addresses have number+suffix - number+suffix; remove the potential whitespace around the hyphen
+        data[normalised_field_name] = \
+            data[normalised_field_name].str.replace(r'(\d+[a-z])(\s*-\s*)(\d+[a-z])', r'\1-\3', case=False)
+
+        # synonyms to expand - read from a file with format (from, to)
+        synonyms = pd.read_csv(os.path.join(self.currentDirectory, '../../data/') + 'synonyms.csv').values
+
+        # expand common synonyms to help with parsing
+        if self.settings['expandSynonyms']:
+            self.log.info('Expanding synonyms as a part of normalisation...')
+            for fro, to in synonyms:
+                # use normalised_field_name rather than a hard-coded column so that custom field names work too
+                data[normalised_field_name] = data[normalised_field_name].str.replace(fro, to)
+
+        # parsing gets really confused if a region or county is in the line - get known counties from a file
+        counties = pd.read_csv(os.path.join(self.currentDirectory, '../../data/') + 'counties.csv')['county']
+
+        # use this for the counties so that e.g. ESSEX ROAD does not become just ROAD...
+        # todo: the regex is getting ridiculous, maybe do it the other way around i.e. the county must be
+        #       followed by a postcode or be the last component
+        addRegex = r'(?:\s|$)(?!ROAD|LANE|STREET|CLOSE|DRIVE|AVENUE|SQUARE|COURT|PARK|CRESCENT|WAY|WALK|HEOL|FFORDD|HILL|GARDENS|GATE|GROVE|HOUSE|VIEW|BUILDING|VILLAS|LODGE|PLACE|ROW|WHARF|RISE|TERRACE|CROSS|ENTERPRISE|HATCH|&)'
+
+        # remove county from address but add a column for it
+        data['County'] = None
+        for county in counties:
+            msk = data[normalised_field_name].str.contains(county + addRegex, regex=True, na=False)
+            data.loc[msk, 'County'] = county
+            data[normalised_field_name] = data[normalised_field_name].str.replace(county + addRegex, '', case=False)
+
+        return data
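To make the hyphen-range normalisation rules above concrete, this is how they behave on a few made-up addresses. A sketch only; it follows the pandas 0.19 str.replace signature used throughout this file (newer pandas would additionally need regex=True)::

    import pandas as pd

    s = pd.Series(['FLAT 15 191 - 193 NEWPORT ROAD',
                   '5 TO 7 KING STREET',
                   '12/14 HIGH STREET',
                   '24a - 24b OLD LANE'])
    s = s.str.replace(r'(\d+)(\s*-\s*)(\d+)', r'\1-\3', case=False)   # 191 - 193 -> 191-193
    s = s.str.replace(r'(\d+)(\s*TO\s*)(\d+)', r'\1-\3', case=False)  # 5 TO 7    -> 5-7
    s = s.str.replace(r'(\d+)(\s*/\s*)(\d+)', r'\1-\3', case=False)   # 12/14     -> 12-14
    s = s.str.replace(r'(\d+[a-z])(\s*-\s*)(\d+[a-z])', r'\1-\3', case=False)  # 24a - 24b -> 24a-24b

    assert s.tolist() == ['FLAT 15 191-193 NEWPORT ROAD', '5-7 KING STREET',
                          '12-14 HIGH STREET', '24a-24b OLD LANE']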
+
+    def parse(self, data, normalised_field_name='ADDRESS_norm'):
+        """
+        Parse the address information given in the data.
+
+        Assumes that the address information is stored in a column named 'ADDRESS'.
+
+        :param data: address data containing a column 'ADDRESS' to parse
+        :type data: pandas.DataFrame
+        :param normalised_field_name: name of the new field to contain normalised address data
+        :type normalised_field_name: str
+
+        :return: parsed address data
+        :rtype: pandas.DataFrame
+        """
+        self.log.info('Start parsing address data...')
+
+        data = self._normalize_input_data(data, normalised_field_name=normalised_field_name)
+
+        addresses = data[normalised_field_name].values
+        self.log.info('{} addresses to parse...'.format(len(addresses)))
+
+        # temporary data storage lists
+        organisation = []
+        department = []
+        sub_building = []
+        building_name = []
+        building_number = []
+        street = []
+        locality = []
+        town = []
+        postcode = []
+
+        # loop over addresses and use the probabilistic parser to tag the address components - should avoid a loop
+        for address in tqdm(addresses):
+            parsed = parser.tag(address.upper())
+            possible_postcode = self._extract_postcode(address)  # regular expression extraction
+
+            # if both parsers found a postcode, then check that they agree (ignoring case)
+            if parsed.get('Postcode', None) is not None and possible_postcode is not None:
+                if parsed['Postcode'].lower() != possible_postcode:
+                    # not the same, so trust the regular expression extraction
+                    parsed['Postcode'] = possible_postcode
+
+            # if the probabilistic parser did not find a postcode but the regular expression did, then use the latter
+            if parsed.get('Postcode', None) is None and possible_postcode is not None:
+                parsed['Postcode'] = possible_postcode
+
+            if parsed.get('Postcode', None) is not None:
+                # check that there is a space; if not, add one, but only if the parsed postcode is long enough to
+                # be a complete postcode. Some users have partial postcodes to which one should not add a space.
+                if ' ' not in parsed['Postcode'] and len(parsed['Postcode']) > 4:
+                    in_code = parsed['Postcode'][-3:]
+                    out_code = parsed['Postcode'].replace(in_code, '')
+                    parsed['Postcode'] = out_code + ' ' + in_code
+
+                # change to all capitals
+                parsed['Postcode'] = parsed['Postcode'].upper()
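The two postcode sources are reconciled with a simple rule: whenever the CRF parser and the regular expression disagree, the regular expression wins. A compressed sketch of just that rule (the helper name and sample values are illustrative; the regex result is assumed to be lower case, as _extract_postcode returns it)::

    def reconcile_postcode(crf_postcode, regex_postcode):
        """Prefer the regex extraction whenever the two sources disagree."""
        if regex_postcode is None:
            return crf_postcode          # may also be None
        if crf_postcode is None or crf_postcode.lower() != regex_postcode:
            return regex_postcode        # regex result wins on any disagreement
        return crf_postcode

    assert reconcile_postcode('SW1A 1AA', 'sw1a 1aa') == 'SW1A 1AA'
    assert reconcile_postcode('SW1A 1AB', 'sw1a 1aa') == 'sw1a 1aa'
    assert reconcile_postcode(None, 'sw1a 1aa') == 'sw1a 1aa'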
+
+            # if a London borough (e.g. HACKNEY) appears in StreetName, then remove it and move it to the
+            # locality when the town name contains LONDON. The probabilistic parser should be shown more
+            # cases with London localities, as these are parsed incorrectly at the moment.
+            if parsed.get('StreetName', None) is not None and parsed.get('TownName', None) is not None:
+                if 'LONDON' in parsed['TownName']:
+                    parsed = self._fix_london_boroughs(parsed, os.path.join(self.currentDirectory, '../../data/'))
+
+            # sometimes the building number gets placed in the building name; take it and add it to the building number
+            if parsed.get('BuildingNumber', None) is None and parsed.get('BuildingName', None) is not None:
+                tmp = parsed['BuildingName'].split(' ')
+                if len(tmp) > 1:
+                    try:
+                        _ = int(tmp[0])
+                        parsed['BuildingNumber'] = tmp[0]
+                    except ValueError:
+                        pass
+
+            # some addresses contain place CO place, where the CO is not part of the actual name - remove these
+            # same is true for IN e.g. Road Marton IN Cleveland
+            # (slicing is used rather than str.replace, which would remove the first occurrence anywhere in the name)
+            if parsed.get('Locality', None) is not None:
+                if parsed['Locality'].strip().endswith(' CO'):
+                    parsed['Locality'] = parsed['Locality'].strip()[:-3]
+                if parsed['Locality'].strip().endswith(' IN'):
+                    parsed['Locality'] = parsed['Locality'].strip()[:-3]
+
+            # the parser sometimes places HOUSE in the organisation name, while it is more likely a sub-building name
+            if parsed.get('OrganisationName') == 'HOUSE' and parsed.get('SubBuildingName', None) is None:
+                parsed['SubBuildingName'] = parsed.get('OrganisationName')
+
+            # store the parsed information to separate lists
+            organisation.append(parsed.get('OrganisationName', None))
+            department.append(parsed.get('DepartmentName', None))
+            sub_building.append(parsed.get('SubBuildingName', None))
+            building_name.append(parsed.get('BuildingName', None))
+            building_number.append(parsed.get('BuildingNumber', None))
+            street.append(parsed.get('StreetName', None))
+            locality.append(parsed.get('Locality', None))
+            town.append(parsed.get('TownName', None))
+            postcode.append(parsed.get('Postcode', None))
+
+        # add the parsed information to the dataframe
+        data['OrganisationName'] = organisation
+        data['DepartmentName'] = department
+        data['SubBuildingName'] = sub_building
+        data['BuildingName'] = building_name
+        data['BuildingNumber'] = building_number
+        data['StreetName'] = street
+        data['Locality'] = locality
+        data['TownName'] = town
+        data['Postcode'] = postcode
+        data['PAOText'] = data['BuildingName'].copy()
+        data['SAOText'] = data['SubBuildingName'].copy()
+
+        data = self._parser_postprocessing(data)
+
+        return data
+
+    @staticmethod
+    def _parser_postprocessing(data):
+        """
+        Parser post-processing steps.
+
+        Extracts e.g. PAO_START, END, SAO_START, and END information from the parser tokens.
+ + :param data: parsed address data ready for post-processing + :type data: pandas.DataFrame + + :return: parsed address data, which have gone through the post-processing steps + :rtype: pandas.DataFrame + """ + # if valid postcode information found then split between in and outcode + if data['Postcode'].count() > 0: + postcodes = data['Postcode'].str.split(' ', expand=True) + postcodes.rename(columns={0: 'postcode_in', 1: 'postcode_out'}, inplace=True) + data = pd.concat([data, postcodes], axis=1) + else: + data['postcode_in'] = None + data['postcode_out'] = None + + # data containers for those components not parsed, but derived during post-processing + data['PAOstartNumber'] = None + data['PAOendNumber'] = None + data['PAOstartSuffix'] = None + data['PAOendSuffix'] = None + data['SAOStartNumber'] = None + data['SAOEndNumber'] = None + data['SAOStartSuffix'] = None + data['SAOEndSuffix'] = None + + # if building number is present, then copy it to start number + data['PAOstartNumber'] = data['BuildingNumber'].copy() + + # in some other cases / is in the BuildingName field - now this separates the building and flat + # the first part refers to the building number and the second to the flat + tmp = r'(\d+)\/(\d+)' + msk = data['BuildingName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp) + data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[0] + data.loc[msk & data['SAOStartNumber'].isnull(), 'SAOStartNumber'] = extracted_components[1] + + # some cases the SAO components end up in the organisation name field, need to be separated + tmp = r'(\d+)([A-Z])-(\d+)([A-Z])' + msk = data['OrganisationName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'OrganisationName'].str.extract(tmp) + data.loc[msk & data['SAOStartNumber'].isnull(), 'SAOStartNumber'] = extracted_components[0] + data.loc[msk & data['SAOStartSuffix'].isnull(), 'SAOStartSuffix'] = extracted_components[1] + data.loc[msk & data['SAOEndNumber'].isnull(), 'SAOEndNumber'] = extracted_components[2] + data.loc[msk & data['SAOEndSuffix'].isnull(), 'SAOEndSuffix'] = extracted_components[3] + + # some cases the SAO components end up in the organisation name field, need to be separated + tmp = r'(\d+)-(\d+)([A-Z])' + msk = data['OrganisationName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'OrganisationName'].str.extract(tmp) + data.loc[msk & data['SAOStartNumber'].isnull(), 'SAOStartNumber'] = extracted_components[0] + data.loc[msk & data['SAOEndNumber'].isnull(), 'SAOEndNumber'] = extracted_components[1] + data.loc[msk & data['SAOEndSuffix'].isnull(), 'SAOEndSuffix'] = extracted_components[2] + + # sometimes both PAO and SAO range is in the BuildingName e.g. 
"35A-35D 35A-35F" + tmp = r'(\d+)([A-Z])-(\d+)([A-Z]).*?(\d+)([A-Z])-(\d+)([A-Z])' + msk = data['BuildingNumber'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp) + data.loc[msk & data['SAOStartNumber'].isnull(), 'SAOStartNumber'] = extracted_components[0] + data.loc[msk & data['SAOStartSuffix'].isnull(), 'SAOStartSuffix'] = extracted_components[1] + data.loc[msk & data['SAOEndNumber'].isnull(), 'SAOEndNumber'] = extracted_components[2] + data.loc[msk & data['SAOEndSuffix'].isnull(), 'SAOEndSuffix'] = extracted_components[3] + data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[4] + data.loc[msk & data['PAOstartSuffix'].isnull(), 'PAOstartSuffix'] = extracted_components[5] + data.loc[msk & data['PAOendNumber'].isnull(), 'PAOendNumber'] = extracted_components[6] + data.loc[msk & data['PAOendSuffix'].isnull(), 'PAOendSuffix'] = extracted_components[7] + + # sometimes both PAO and SAO range is in the BuildingName e.g. "28A-28F PICCADILLY COURT 457-463" + tmp = r'(\d+)([A-Z])-(\d+)([A-Z]).*?(\d+)-(\d+)' + msk = data['BuildingNumber'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp) + data.loc[msk & data['SAOStartNumber'].isnull(), 'SAOStartNumber'] = extracted_components[0] + data.loc[msk & data['SAOStartSuffix'].isnull(), 'SAOStartSuffix'] = extracted_components[1] + data.loc[msk & data['SAOEndNumber'].isnull(), 'SAOEndNumber'] = extracted_components[2] + data.loc[msk & data['SAOEndSuffix'].isnull(), 'SAOEndSuffix'] = extracted_components[3] + data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[4] + data.loc[msk & data['PAOendNumber'].isnull(), 'PAOendNumber'] = extracted_components[5] + + # sometimes both PAO and SAO range is in the BuildingName e.g. "3-3A CHURCHILL COURT 112-144" + tmp = r'(\d+)-(\d+)([A-Z]).*?(\d+)-(\d+)' + msk = data['BuildingNumber'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp) + data.loc[msk & data['SAOStartNumber'].isnull(), 'SAOStartNumber'] = extracted_components[0] + data.loc[msk & data['SAOEndNumber'].isnull(), 'SAOEndNumber'] = extracted_components[1] + data.loc[msk & data['SAOEndSuffix'].isnull(), 'SAOEndSuffix'] = extracted_components[2] + data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[3] + data.loc[msk & data['PAOendNumber'].isnull(), 'PAOendNumber'] = extracted_components[4] + + # sometimes both building number and flat range are stored in BuildingName (e.g. 
9B-9C 65A), separate these + tmp = r'(\d+)([A-Z])-(\d+)([A-Z])\s.*?(\d+)([A-Z])' + msk = data['BuildingNumber'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp) + data.loc[msk & data['SAOStartNumber'].isnull(), 'SAOStartNumber'] = extracted_components[0] + data.loc[msk & data['SAOStartSuffix'].isnull(), 'SAOStartSuffix'] = extracted_components[1] + data.loc[msk & data['SAOEndNumber'].isnull(), 'SAOEndNumber'] = extracted_components[2] + data.loc[msk & data['SAOEndSuffix'].isnull(), 'SAOEndSuffix'] = extracted_components[3] + data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[4] + data.loc[msk & data['PAOstartSuffix'].isnull(), 'PAOstartSuffix'] = extracted_components[5] + + # if building number is not present, try to extract from building name if appropriate type + # deal with cases where buildingName contains a suffix range: 24D-24E + tmp = r'(\d+)([A-Z])-(\d+)([A-Z])' + msk = data['PAOstartNumber'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp) + data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[0] + data.loc[msk & data['PAOstartSuffix'].isnull(), 'PAOstartSuffix'] = extracted_components[1] + data.loc[msk & data['PAOendNumber'].isnull(), 'PAOendNumber'] = extracted_components[2] + data.loc[msk & data['PAOendSuffix'].isnull(), 'PAOendSuffix'] = extracted_components[3] + # deal with cases where buildingName contains a suffix range: 24-24E + tmp = r'(\d+)-(\d+)([A-Z])' + msk = data['PAOstartNumber'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp) + data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[0] + data.loc[msk & data['PAOendNumber'].isnull(), 'PAOendNumber'] = extracted_components[1] + data.loc[msk & data['PAOendSuffix'].isnull(), 'PAOendSuffix'] = extracted_components[2] + # deal with cases where buildingName is a range: 120-122 + tmp = r'(\d+)-(\d+)' + msk = data['PAOstartNumber'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False) + extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp) + data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[0] + data.loc[msk & data['PAOendNumber'].isnull(), 'PAOendNumber'] = extracted_components[1] + # deal with cases where buildingName is 54A or 65B but not part of a range e.g. 65A-65B + tmp = r'(? 
<!-)(\d+)([A-Z])(?!-)'
+        msk = data['PAOstartNumber'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False)
+        extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp)
+        data.loc[msk & data['PAOstartNumber'].isnull(), 'PAOstartNumber'] = extracted_components[0]
+        data.loc[msk & data['PAOstartSuffix'].isnull(), 'PAOstartSuffix'] = extracted_components[1]
+
+        # if SubBuildingName is empty but BuildingName contains a flat or apartment number, then use it
+        tmp = r'(FLAT \d+|APARTMENT \d+)'
+        msk = data['SubBuildingName'].isnull() & data['BuildingName'].str.contains(tmp, na=False, case=False)
+        extracted_components = data.loc[msk, 'BuildingName'].str.extract(tmp)
+        if len(extracted_components.index) > 0:
+            data.loc[msk, 'SubBuildingName'] = extracted_components.values
+
+        # deal with addresses of the type 5/7 4 Whatever Road - the assumed format is sao_start/sao_end pao_start
+        tmp = r'(\d+)\/(\d+)'
+        msk = data['SubBuildingName'].str.contains(tmp, na=False, case=False) & \
+              data['SAOStartNumber'].isnull() & data['BuildingNumber'].notnull()
+        extracted_components = data.loc[msk, 'SubBuildingName'].str.extract(tmp)
+        data.loc[msk & data['SAOStartNumber'].isnull(), 'SAOStartNumber'] = extracted_components[0]
+        data.loc[msk & data['SAOEndNumber'].isnull(), 'SAOEndNumber'] = extracted_components[1]
+
+        # if SubBuildingName contains only numbers, then place it also in the SAO start number field (likely a flat)
+        msk = data['SubBuildingName'].str.isnumeric() & data['SAOStartNumber'].isnull()
+        msk[msk.isnull()] = False
+        data.loc[msk, 'SAOStartNumber'] = data.loc[msk, 'SubBuildingName']
+
+        # if the street name contains a number and BuildingNumber is empty, then place the number in
+        # BuildingNumber and PAOstartNumber
+        tmp = r'(\d+)'
+        msk = data['BuildingNumber'].isnull() & data['StreetName'].str.contains(tmp, na=False, case=False)
+        extracted_components = data.loc[msk, 'StreetName'].str.extract(tmp)
+        if len(extracted_components.index) > 0:
+            data.loc[msk, 'BuildingNumber'] = extracted_components.values
+            data.loc[msk, 'PAOstartNumber'] = extracted_components.values
+
+        # store the flat or apartment number separately to allow numerical comparison of e.g. the SAO number
+        # todo: rewrite
+        msk = data['SubBuildingName'].str.contains('flat|apartment|unit', na=False, case=False)
+        data.loc[msk, 'SAOStartNumber'] = data.loc[msk, 'SubBuildingName']
+        data.loc[msk, 'SAOStartNumber'] = \
+            data.loc[msk].apply(lambda x: x['SAOStartNumber'].strip().
+                                replace('FLAT', '').replace('APARTMENT', '').replace('UNIT', ''),
+                                axis=1)
+
+        return data
+
+    @staticmethod
+    def convert_to_numeric_and_add_dummies(data):
+        """
+        Convert the PAO and SAO number fields to numeric data types and add dummy values to empty component fields.
+
+        :param data: parsed address data
+        :type data: pandas.DataFrame
+
+        :return: address data with numeric number fields and dummy values in empty component fields
+        :rtype: pandas.DataFrame
+        """
+        for numeric_column in ('PAOstartNumber', 'PAOendNumber', 'SAOStartNumber', 'SAOEndNumber'):
+            # convert to numeric; if NA then set to a dummy value
+            data[numeric_column] = pd.to_numeric(data[numeric_column], errors='coerce')
+            # assign rather than fillna(inplace=True), which operates on a temporary and may not propagate
+            data[numeric_column] = data[numeric_column].fillna(-12345)
+            data[numeric_column] = data[numeric_column].astype(np.int32)
+
+        for dummies_column in ('PAOstartSuffix', 'PAOendSuffix', 'SAOStartSuffix', 'SAOEndSuffix', 'SAOText'):
+            # if the field is empty, add a dummy - helps when comparing against None
+            msk = data[dummies_column].isnull()
+            data.loc[msk, dummies_column] = 'N/A'
+
+        # for some Welsh addresses the building name is parsed as the organisation name, so place it to PAOText if empty
+        msk = data['PAOText'].isnull()
+        data.loc[msk, 'PAOText'] = data['OrganisationName']
+        msk = data['PAOText'].isnull()
+        data.loc[msk, 'PAOText'] = ''
+
+        # fill columns that are often NA with empty strings - helps when doing string comparisons against Nones
+        columns_to_add_empty_strings = ['OrganisationName', 'DepartmentName', 'SubBuildingName']
+        data[columns_to_add_empty_strings] = data[columns_to_add_empty_strings].fillna('')
+
+        return data
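The dummy values matter downstream: -12345 keeps numeric comparisons well-defined when a component is absent, and 'N/A' does the same for string comparisons. A small sketch of the conversion behaviour on made-up column values::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'PAOstartNumber': ['12', None, '7'],
                       'SAOStartSuffix': ['A', None, None]})

    # numeric component: coerce, then dummy-fill missing entries
    df['PAOstartNumber'] = pd.to_numeric(df['PAOstartNumber'], errors='coerce')
    df['PAOstartNumber'] = df['PAOstartNumber'].fillna(-12345).astype(np.int32)

    # string component: dummy-fill so later comparisons never see None
    df.loc[df['SAOStartSuffix'].isnull(), 'SAOStartSuffix'] = 'N/A'

    assert df['PAOstartNumber'].tolist() == [12, -12345, 7]
    assert df['SAOStartSuffix'].tolist() == ['A', 'N/A', 'N/A']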
diff --git a/DataScience/Analytics/prototype/saoSuffixAddresses.py b/DataScience/Analytics/prototype/saoSuffixAddresses.py
new file mode 100644
index 000000000..4e29bf785
--- /dev/null
+++ b/DataScience/Analytics/prototype/saoSuffixAddresses.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+"""
+ONS Address Index - Secondary Address Object End Suffix Test Data
+=================================================================
+
+A simple script to attach UPRNs to a dataset with SAO end suffixes.
+
+As the dataset is synthetic, it contains AddressBase UPRNs, enabling automatic
+performance computations.
+
+This is prototype code aimed at experimentation and testing. There are no unit tests.
+The code has been written for speed rather than accuracy; it therefore uses fairly aggressive
+blocking. As the final solution will likely use ElasticSearch, the aim of this prototype is
+not the highest accuracy but to quickly test different ideas, which can inform the final
+ElasticSearch solution.
+
+
+Running
+-------
+
+After all requirements are satisfied, the script can be invoked using the CPython interpreter::
+
+    python saoSuffixAddresses.py
+
+
+Requirements
+------------
+
+:requires: numpy (tested with 1.12.0)
+:requires: pandas (tested with 0.19.2)
+:requires: addressLinking (and all the requirements within it)
+
+
+Author
+------
+
+:author: Sami Niemi (sami.niemi@valtech.co.uk)
+
+
+Version
+-------
+
+:version: 0.1
+:date: 1-Mar-2017
+"""
+import numpy as np
+import pandas as pd
+from Analytics.linking import addressLinking
+
+
+class SAOsuffixLinker(addressLinking.AddressLinker):
+    """
+    Address Linker for the SAO Suffix dataset. Inherits the AddressLinker and overwrites the load_data method.
+    """
+
+    def load_data(self):
+        """
+        Read in the SAO Suffix address test data. Overwrites the method in the AddressLinker.
+        """
+        self.toLinkAddressData = pd.read_excel(self.settings['inputPath'] + self.settings['inputFilename'])
+        self.toLinkAddressData['ID'] = np.arange(len(self.toLinkAddressData.index))
+
+        self.toLinkAddressData.rename(columns={'UPRN': 'UPRN_old'}, inplace=True)
+
+
+def run_sao_suffix_linker(**kwargs):
+    """
+    A simple wrapper that allows running the SAO Suffix address linker.
+
+    :return: None
+    """
+    settings = dict(inputFilename='SAO_END_SUFFIX.xlsx',
+                    inputPath='/Users/saminiemi/Projects/ONS/AddressIndex/data/',
+                    outname='SAOsuffix')
+    settings.update(kwargs)
+
+    linker = SAOsuffixLinker(**settings)
+    linker.run_all()
+    del linker
+
+
+if __name__ == "__main__":
+    run_sao_suffix_linker()
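Because the settings are plumbed through **kwargs, any AddressLinker option (for example the outpath and outname used by run_all) can be overridden at call time without editing the script. The paths below are illustrative::

    from Analytics.prototype.saoSuffixAddresses import run_sao_suffix_linker

    # point the linker at a different copy of the test file and results folder
    run_sao_suffix_linker(inputPath='/tmp/AddressIndex/data/',
                          outpath='/tmp/AddressIndex/results/',
                          outname='SAOsuffix_test')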