Skip to content

Commit

Permalink
HDXDSYS-765 Parent name mapping (#42)
Browse files Browse the repository at this point in the history
* Allow admin name mappings to be restricted by parent admin or country.

* Make separate method for readability

* Do same for admin_name_replacements
  • Loading branch information
mcarans authored May 23, 2024
1 parent b93f464 commit 92ffeb7
Show file tree
Hide file tree
Showing 4 changed files with 279 additions and 24 deletions.
17 changes: 13 additions & 4 deletions documentation/main.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,19 @@ The administration level mappings takes input configuration dictionary,

*admin_config* can have the following optional keys:

*countries_fuzzy_try* are countries (iso3 codes) for which to try fuzzy matching. Default is all countries.
*admin_name_mappings* is a dictionary of mappings from name to pcode (for where fuzzy matching fails)
*admin_name_replacements* is a dictionary of textual replacements to try when fuzzy matching
*admin_fuzzy_dont* is a list of names for which fuzzy matching should not be tried
*countries_fuzzy_try* are countries (iso3 codes) for which to try fuzzy
matching. Default is all countries.
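*admin_name_mappings* is a dictionary of mappings from name to pcode. These can
be global or they can be restricted by country or parent (if the AdminLevel
object has been set up with parents). Keys take the form "MAPPING",
"AFG|MAPPING" or "AF01|MAPPING". When more than one form is present for the
same name, the parent-restricted mapping is tried first, then the
country-restricted mapping and finally the global mapping.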
*admin_name_replacements* is a dictionary of textual replacements to try when
fuzzy matching. It maps from string to string replacement. The replacements can
be global or they can be restricted by country or parent (if the AdminLevel
object has been set up with parents). Keys take the form "STRING_TO_REPLACE",
"AFG|STRING_TO_REPLACE" or "AF01|STRING_TO_REPLACE".
*admin_fuzzy_dont* is a list of names for which fuzzy matching should not be
tried

Once an AdminLevel object is constructed, one of three setup methods must be
called: *setup_from_admin_info*, *setup_from_libhxl_dataset* or
Expand Down
120 changes: 103 additions & 17 deletions src/hdx/location/adminlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def get_admin_level(self, countryiso3: str) -> int:
"""Get admin level for country
Args:
countryiso3 (str): Iso3 country code
countryiso3 (str): ISO3 country code
Returns:
int: Admin level
Expand All @@ -320,7 +320,7 @@ def get_pcode_length(self, countryiso3: str) -> Optional[int]:
"""Get pcode length for country
Args:
countryiso3 (str): Iso3 country code
countryiso3 (str): ISO3 country code
Returns:
Optional[int]: Country's pcode length or None
Expand Down Expand Up @@ -517,6 +517,41 @@ def convert_admin1_pcode_length(
return pcode
return None

def get_admin_name_replacements(
    self, countryiso3: str, parent: Optional[str]
) -> Dict[str, str]:
    """Get the admin name replacements that apply to a country and parent.

    The configured admin name replacements are a dictionary mapping a
    string to its replacement. Entries can be global or restricted by
    country or parent (if the AdminLevel object has been set up with
    parents). Keys take the form "STRING_TO_REPLACE",
    "AFG|STRING_TO_REPLACE" or "AF01|STRING_TO_REPLACE".

    Only the first "|" in a key separates the prefix from the string to
    replace, so the string to replace may itself contain "|". If several
    applicable keys yield the same string to replace, the one that occurs
    first in the configured dictionary is kept.

    Args:
        countryiso3 (str): ISO3 country code
        parent (Optional[str]): Parent admin code

    Returns:
        Dict[str, str]: Relevant admin name replacements
    """
    relevant_name_replacements = {}
    for key, value in self.admin_name_replacements.items():
        # partition (unlike an unpacked split) cannot fail when the string
        # to replace contains further "|" characters.
        prefix, separator, name = key.partition("|")
        if not separator:
            # No prefix: a global replacement that always applies.
            name = key
        elif prefix != countryiso3 and not (parent and prefix == parent):
            # Restricted to a different country or parent.
            continue
        # Keep the first applicable entry for each string to replace.
        relevant_name_replacements.setdefault(name, value)
    return relevant_name_replacements

def fuzzy_pcode(
self,
countryiso3: str,
Expand All @@ -526,7 +561,7 @@ def fuzzy_pcode(
"""Fuzzy match name to pcode
Args:
countryiso3 (str): Iso3 country code
countryiso3 (str): ISO3 country code
name (str): Name to match
**kwargs:
parent (Optional[str]): Parent admin code
Expand All @@ -543,8 +578,17 @@ def fuzzy_pcode(
if logname:
self.ignored.add((logname, countryiso3))
return None
if self.use_parent and "parent" in kwargs:
parent = kwargs["parent"]
if self.use_parent:
parent = kwargs.get("parent")
else:
parent = None
if parent is None:
name_to_pcode = self.name_to_pcode.get(countryiso3)
if not name_to_pcode:
if logname:
self.errors.add((logname, countryiso3))
return None
else:
name_parent_to_pcode = self.name_parent_to_pcode.get(countryiso3)
if not name_parent_to_pcode:
if logname:
Expand All @@ -555,15 +599,10 @@ def fuzzy_pcode(
if logname:
self.errors.add((logname, countryiso3, parent))
return None
else:
name_to_pcode = self.name_to_pcode.get(countryiso3)
if not name_to_pcode:
if logname:
self.errors.add((logname, countryiso3))
return None
adm_name_lookup = clean_name(name)
adm_name_lookup2 = multiple_replace(
adm_name_lookup, self.admin_name_replacements
adm_name_lookup,
self.get_admin_name_replacements(countryiso3, parent),
)
pcode = name_to_pcode.get(
adm_name_lookup, name_to_pcode.get(adm_name_lookup2)
Expand Down Expand Up @@ -643,6 +682,33 @@ def al_transform_2(name):
)
return pcode

def get_name_mapped_pcode(
    self, countryiso3: str, name: str, parent: Optional[str]
) -> Optional[str]:
    """Look up a name in the admin name mappings.

    The admin name mappings are a dictionary from name to pcode. Entries can
    be global or restricted by country or parent (if the AdminLevel object
    has been set up with parents). Keys take the form "MAPPING",
    "AFG|MAPPING" or "AF01|MAPPING". The most specific key is tried first:
    parent (when one is given), then country, then the bare name.

    Args:
        countryiso3 (str): ISO3 country code
        name (str): Name to match
        parent (Optional[str]): Parent admin code

    Returns:
        Optional[str]: P code match from admin name mappings or None if no match
    """
    # Candidate keys ordered from most to least specific.
    candidate_keys = []
    if parent:
        candidate_keys.append(f"{parent}|{name}")
    candidate_keys.append(f"{countryiso3}|{name}")
    candidate_keys.append(name)

    for candidate in candidate_keys:
        mapped = self.admin_name_mappings.get(candidate)
        if mapped is not None:
            return mapped
    return None

def get_pcode(
self,
countryiso3: str,
Expand All @@ -653,7 +719,7 @@ def get_pcode(
"""Get pcode for a given name
Args:
countryiso3 (str): Iso3 country code
countryiso3 (str): ISO3 country code
name (str): Name to match
fuzzy_match (bool): Whether to try fuzzy matching. Defaults to True.
**kwargs:
Expand All @@ -663,9 +729,17 @@ def get_pcode(
Returns:
Tuple[Optional[str], bool]: (Matched P code or None if no match, True if exact match or False if not)
"""
pcode = self.admin_name_mappings.get(name)
if self.use_parent:
parent = kwargs.get("parent")
else:
parent = None
pcode = self.get_name_mapped_pcode(countryiso3, name, parent)
if pcode and self.pcode_to_iso3[pcode] == countryiso3:
return pcode, True
if parent:
if self.pcode_to_parent[pcode] == parent:
return pcode, True
else:
return pcode, True
if self.looks_like_pcode(name):
pcode = name.upper()
if pcode in self.pcodes: # name is a p-code
Expand All @@ -677,8 +751,7 @@ def get_pcode(
)
return pcode, True
else:
if self.use_parent and "parent" in kwargs:
parent = kwargs["parent"]
if parent:
name_parent_to_pcode = self.name_parent_to_pcode.get(
countryiso3
)
Expand Down Expand Up @@ -756,3 +829,16 @@ def output_admin_name_mappings(self) -> List[str]:
logger.info(line)
output.append(line)
return output

def output_admin_name_replacements(self) -> List[str]:
    """Log the configured admin name replacements and return them as text.

    Each entry of the admin name replacements dictionary is rendered as
    "key: replacement" (keys are emitted as configured, including any
    country or parent prefix) and logged at info level.

    Returns:
        List[str]: List of name replacements
    """
    lines = [
        f"{key}: {substitute}"
        for key, substitute in self.admin_name_replacements.items()
    ]
    for entry in lines:
        logger.info(entry)
    return lines
25 changes: 25 additions & 0 deletions tests/fixtures/adminlevelparent.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
admin_info_with_parent:
- {pcode: AF0101, name: Kabul, iso3: AFG, parent: AF01}
- {pcode: AF0102, name: Paghman, iso3: AFG, parent: AF01}
- {pcode: AF0201, name: Kabul, iso3: AFG, parent: AF02} # testing purposes
- {pcode: AF0301, name: Charikar, iso3: AFG, parent: AF03}
- {pcode: AF0401, name: Maydan Shahr, iso3: AFG, parent: AF04}
- {pcode: AF0501, name: Pul-e-Alam, iso3: AFG, parent: AF05}
- {pcode: AF0501, name: Pul-e-Alam, iso3: AFG, parent: AF05}
- {pcode: CD2013, name: Mbanza-Ngungu, iso3: COD, parent: CD20}
- {pcode: CD3102, name: Kenge, iso3: COD, parent: CD31}
- {pcode: MW305, name: Blantyre, iso3: MWI, parent: MW3}

admin_name_mappings:
"MyMapping": "AF0301"
"AFG|MyMapping2": "AF0401"
"AF05|MyMapping3": "AF0501"

admin_name_replacements:
" city": ""

alt1_admin_name_replacements:
"COD| city": ""

alt2_admin_name_replacements:
"CD20| city": ""
Loading

0 comments on commit 92ffeb7

Please sign in to comment.