Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance nickname processing #122

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nameparser/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def __init__(self,
self.first_name_titles = SetManager(first_name_titles)
self.conjunctions = SetManager(conjunctions)
self.capitalization_exceptions = TupleManager(capitalization_exceptions)
self.regexes = TupleManager(regexes)
self.regexes = TupleManager([tpl[:2] for tpl in REGEXES])
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should use the local variable regexes to preserve the ability to pass it as an attribute to a new instance (not that anyone is doing that). ([tpl[:2] for tpl in regexes]).

What is the slice doing here? It's not clear to me.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a tag/label to some of the tuples. The slice returns the first two items in the tuples, omitting the tag/label data. The TupleManager object can still be used in the code. The regexes variable in the constants() is no longer a set object, just a list of tuples. I did this to preserve the order of the regex patterns.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. But if someone tries to instantiate passing a TupleManager to regexes, it will have no effect because you are using the global variable instead of the local one. You need to replace REGEXES with regexes.

ex: name = HumanName(regexes=myTupleManager) would fail. (I guess I should have some tests for those instantiation attributes.)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand. The list of compiled regex patterns for nicknames is different than the regexes that is fed into tuplemanager. I thought I'd left the tuplemanager-based regexes alone. I might have gotten a little confused by variables/functions with the same name. I'll take another look at it.

Some clarification would be helpful.

Copy link
Owner

@derek73 derek73 May 18, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a simple mistake of using the module constant instead of the attribute passed to the class' init function.

change:
self.regexes = TupleManager([tpl[:2] for tpl in REGEXES])
to
self.regexes = TupleManager([tpl[:2] for tpl in regexes])

Here's a test that should pass but will fail with your code above.

    def test_custom_regex_constant(self):
        t = {'test': 'test'}
        c = Constants(regexes=t)
        self.assertEqual(c.regexes, t)

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I edited the last line of my test to fix the equals test)

self._pst = None

@property
Expand Down
29 changes: 24 additions & 5 deletions nameparser/config/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,39 @@
'[\u2600-\u26FF\u2700-\u27BF])+',
re.UNICODE)

REGEXES = set([
REGEXES = [
("spaces", re.compile(r"\s+", re.U)),
("word", re.compile(r"(\w|\.)+", re.U)),
("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)),
("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)),
("quoted_word", re.compile(r'(?<!\w)\'([^\s]*?)\'(?!\w)', re.U)),
("double_quotes", re.compile(r'\"(.*?)\"', re.U)),
("parenthesis", re.compile(r'\((.*?)\)', re.U)),
("double_apostrophe_ASCII", re.compile(r"(?!\w)''(\w[^']*?)''(?!\w)", re.U), 'nickname'),
("smart_quote", re.compile(r"(?!\w)“(\w[^”]*?)”(?!\w)", re.U), 'nickname'),
("smart_single_quote", re.compile(r"(?!\w)‘(\w[^’]*?)’(?!\w)", re.U), 'nickname'),
("grave_accent", re.compile(r'(?!\w)`(\w[^`]*?)`(?!\w)', re.U), 'nickname'),
("grave_acute", re.compile(r'(?!\w)`(\w[^´]*?)´(?!\w)', re.U), 'nickname'),
("apostrophe_ASCII", re.compile(r"(?!\w)'(\w[^']*?)'(?!\w)", re.U), 'nickname'),
("quote_ASCII", re.compile(r'(?!\w)"(\w[^"]*?)"(?!\w)', re.U), 'nickname'),
("parenthesis", re.compile(r'(?!\w)\((\w[^)]*?)\)(?!\w)', re.U), 'nickname'),
("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)),
("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)),
("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)),
("emoji",re_emoji),
("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)),
])
("nn_sep_safe", re.compile(r'[^ ,]', re.U)),
("paren_suffix", re.compile(r'(?!\w)(\((?:ret|vet)\.?\))(?!\w)', re.I | re.U)),
]
"""
All regular expressions used by the parser are precompiled and stored in the config.

REGEX tuple positions are:
[0] - name of the pattern, used in code as named attribute
[1] - compiled pattern
[2] - (optional) label/tag of the pattern, used in code for
filtering patterns

All nickname patterns should follow this pattern:
(?!\w)leading_delim(\w[^trailing_delim]*?)trailing_delim(?!\w)

Nicknames are assumed to be delimited by non-word characters.

"""
2 changes: 2 additions & 0 deletions nameparser/config/suffixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
'esq',
'esquire',
'jr',
'jr.',
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some of these suffixes are abbreviations that could also end in a period, eg "esq". Some of them are not abbreviations, eg "iii". Do we need to enter period versions of all of them that are abbreviations?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd have to verify how you are processing the suffixes to respond with 100% certainty. Since the suffixes are in a list, I think the code is most likely doing a set inclusion test, using the in operator. Part of the answer depends on how you are parsing the text prior to doing the set inclusion test. If your parsing picks up periods, then my answer is "yes" - you will need to have both the plain and period-ending versions of the suffixes. Also, be aware that the set inclusion test is case sensitive.

In the new suffix preprocessing routine I added, the regex pattern has a "\.?", which will match the prior text with and without a trailing period character. The question mark makes the period character optional.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you want to happen?

The parsing code does a split on space characters, so any trailing punctuation (comma, period, semicolon, etc.) will remain with the word.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the utility function lc() removes periods.

Looking at the is_suffix method in the parser,

        # suffixes may have periods inside them like "M.D."
        return ((lc(piece).replace('.','') in self.C.suffix_acronyms) \
            or (lc(piece) in self.C.suffix_not_acronyms)) \
            and not self.is_an_initial(piece)

Looks like it was already a bit unclear before you started because .replace('.','') is unnecessary in that first line.

So, I think you should remove the "jr." entry because I don't expect it is making anything work. Probably there's something not quite right there but it's not related to any changes you made and we can fix it after merging your changes in if we want.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is period removal sufficient? I'm thinking about other punctuation characters that may separate suffixes, such as comma.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first step of the parser is to split the entire string on commas. Many suffixes listed at the end of a name are always treated as suffixes. These are the 2 supported formats for commas:

Lastname [Suffix], Title Firstname (Nickname) Middle Middle[,] Suffix [, Suffix]
Title Firstname M Lastname [Suffix], Suffix [Suffix] [, Suffix]

'jnr',
'junior',
'sr',
Expand All @@ -25,6 +26,7 @@
"""
SUFFIX_ACRONYMS = set([
'(ret)',
'(ret.)',
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The SUFFIX_ACRONYMS set was intended to be acronyms that could optionally be separated by periods. I'm not sure how (ret) got in there because it's surprising to see parentheses in there, but it doesn't seem to make sense to add a period to one of the items in this set.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "(ret)" was there when I started coding. I didn't know how you might want to parse the "ret" and "vet" suffixes, so I tried to keep the parentheses in the parsed result. I think I added trailing-period versions to the list before I realized that any parenthesis-delimited string would be parsed by the nickname routine. I can certainly change how I'm processing these two as well as add other suffix strings to the preprocessing pattern. Just let me know.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you want to happen with ret and vet?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does something fail if you remove the one with the period?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. I'm using a regex pattern for the (ret) and (vet) suffixes. The regex pattern ignores any trailing period. Using the existing list of suffixes doesn't work, since the nickname processing picks these up, matching the parenthesis pattern. Nickname processing happens before suffix processing, which is why I added a suffix preprocessor routine for these two parenthesis-delimited suffixes.

I can retain the parentheses or drop them. Let me know and I'll make sure the code does what you want, relative to these two suffixes.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you have changed the way that suffixes are detected then I think you need to modify the is_suffix method to implement that test using your new regexes.

'(vet)',
'8-vsb',
'aas',
Expand Down
2 changes: 2 additions & 0 deletions nameparser/config/titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@
'chef',
'chemist',
'chief',
'chief justice',
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm kind of surprised if this works, but I guess I could see it because of how the titles chain together. "Justice" is a somewhat common first name so we couldn't just add that as its own title, so if this works it's a nice workaround.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't realize that "Justice" was a common first name. I don't think my "Chief Justice" string is being matched due to the prior parsing actions. I'm not sure what qualifies as "common first name". "Justice" is around the 580th most common first name in America. However, I think it is probably more common than the number of judges.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I spoke with a judge and asked him about the use of the title "Justice". He said it was rare. I'll undo this change, since it was based on a false assumption.

The judge expressed some dismay that titles were being used as first names. He has encountered people with first names, such as "King" and "Queen", in his courtroom.

We might want to include the parser's bias for first names over titles.

If someone is parsing names of titled people (think UN delegations), what should they do?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A bias for first names over titles is already a feature of the parser, and why there are no potential first names in the titles constant. First job of a name parser is to parse names, then it can optionally parse titles but not if it messes up names.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll do some testing after removing "Justice" and "Chief Justice" from the list. I might add tests for "David Justice" and "Justice, David", the baseball player.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I work in legal data, and I'll note that there are very few "Justices" though I suppose "justice of the peace" is a title. But in my experience, "justice" is reserved pretty much exclusively for the SCOTUS justices. You can see the way this shakes out on this page (though it doesn't discuss this topic): https://www.uscourts.gov/judges-judgeships/about-federal-judges

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, sorry, meant to say that relatedly, "J." is a very common title among judges. I'm guessing it can't be added b/c it's one letter, but I thought I'd throw that out there.

'chieftain',
'choreographer',
'civil',
Expand Down Expand Up @@ -339,6 +340,7 @@
'judicial',
'junior',
'jurist',
'justice',
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My friend Justice would be upset that the parser would not recognize his first name.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are about the same number of people named "Justice" as there are judges in America (~33000). What does the name parser do, or what should it do, when it encounters several names? What if "Justice" is one of the first of the words in a multi-name string?

This is a question similar to the one that I posed for myself when I first approached the problem of nicknames that might also be suffixes. I didn't have a good answer, so I abandoned that original approach. It is still an unanswered question.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The simplest use case for this parser is just Firstname Lastname. I feel like when there is conflict with other things the parser should/could do (ex: recognize titles), those other things should be sacrificed to preserve its ability to split up a simple name. There is a fairly simple workaround if someone using the parser wants to change it, and a human interacting with the parser could add their first name and the parser would then figure out that it's a title, kind of like if you were interacting with a human and they had the same confusion.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hadn't noticed a test for this. I'll look at it and alter my John Roberts test accordingly.

I'll remove "Justice" from the titles list.

'keyboardist',
'kingdom',
'knowledge',
Expand Down
103 changes: 80 additions & 23 deletions nameparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from nameparser.config import CONSTANTS
from nameparser.config import Constants
from nameparser.config import DEFAULT_ENCODING
from nameparser.config.regexes import REGEXES

ENCODING = 'utf-8'

Expand Down Expand Up @@ -70,7 +71,7 @@ class HumanName(object):
_members = ['title','first','middle','last','suffix','nickname']
unparsable = True
_full_name = ''

def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,
string_format=None):
self.C = constants
Expand All @@ -79,7 +80,17 @@ def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,

self.encoding = encoding
self.string_format = string_format or self.C.string_format
self._nickname_regexes = [tpl[1]
for tpl in REGEXES
if isinstance(tpl[-1], str)
and 'nickname' in tpl[-1]
]
# full_name setter triggers the parse
#========================================================
#IMPORTANT NOTE:
# The following statement must be the last one in the
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be "The following statement...", also could combine with the existing comment:

The following statement must be the last line in __init__ because it triggers the parse using :py:func:`full_name.setter`.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. These two statements can be combined. I was 'bitten' by that when I started to change the code. I wanted to add text to really draw the attention of future collaborators.

# __init__ function
#========================================================
self.full_name = full_name

def __iter__(self):
Expand Down Expand Up @@ -243,7 +254,11 @@ def nickname(self):
The person's nicknames. Any text found inside of quotes (``""``) or
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This description of what constitutes a nickname should probably be updated.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. We've exchanged thoughts on this via email.

I added a couple of new nickname patterns and 'standardized' the existing patterns. Do you still have those emails?

parenthesis (``()``)
"""
return " ".join(self.nickname_list) or self.C.empty_attribute_default
if len(self.nickname_list) <= 1:
f_string = '{0}'
else:
f_string = '"{0}"'
return ", ".join([f_string.format(nn) for nn in self.nickname_list]) or self.C.empty_attribute_default

@property
def surnames_list(self):
Expand Down Expand Up @@ -387,18 +402,24 @@ def pre_process(self):
This method happens at the beginning of the :py:func:`parse_full_name`
before any other processing of the string aside from unicode
normalization, so it's a good place to do any custom handling in a
subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`.
subclass. Runs
:py:func:`fix_phd`
:py:func:`parse_parenthesized_suffixes`
:py:func:`parse_nicknames`
:py:func:`squash_emoji`.

"""
self.fix_phd()
self.parse_parenthesized_suffixes()
self.parse_nicknames()
self.squash_emoji()

def post_process(self):
"""
This happens at the end of the :py:func:`parse_full_name` after
all other processing has taken place. Runs :py:func:`handle_firstnames`
and :py:func:`handle_capitalization`.
all other processing has taken place. Runs
:py:func:`handle_firstnames`
:py:func:`handle_capitalization`
"""
self.handle_firstnames()
self.handle_capitalization()
Expand All @@ -412,25 +433,49 @@ def fix_phd(self):

def parse_nicknames(self):
    """
    Extract nicknames from the name before any other processing.

    Every pattern in ``self._nickname_regexes`` is applied, in list order,
    to the full name. Text matched by an earlier pattern is blanked out
    (replaced by the same number of spaces) before the later patterns run,
    so the same text is not claimed twice and every match offset stays
    relative to the original string.

    Group 1 of each match is appended to ``nickname_list`` in order of
    position in the name, and the matched text (delimiters included) is
    replaced by a single space in ``_full_name``.

    Text between two neighbouring nicknames is collapsed to a single space
    when the ``nn_sep_safe`` regex finds nothing in it, i.e. when it holds
    only spaces and commas, so a comma that merely separates nicknames
    does not remain in the name.

    Some basic rules for nickname patterns:
    * Nicknames must begin with a word character.
    * Nickname patterns should include an outer (not processed)
      delimiter that excludes word characters.
    """
    original = self._full_name
    nn_sep = self.C.regexes.nn_sep_safe

    # Pass 1: collect matches. Blanking keeps the string length constant,
    # which is what keeps the recorded spans valid for `original`.
    matches = []
    scratch = original
    for _re in self._nickname_regexes:
        found = list(_re.finditer(scratch))
        if not found:
            continue
        matches.extend(found)
        parts = []
        cursor = 0
        for match in found:
            parts.append(scratch[cursor:match.start()])
            parts.append(' ' * (match.end() - match.start()))
            cursor = match.end()
        parts.append(scratch[cursor:])
        scratch = ''.join(parts)

    if not matches:
        return

    matches.sort(key=lambda match: match.span())

    # Pass 2: rebuild the name in a single left-to-right sweep over the
    # original string. Nothing is sliced from a partially edited string,
    # so offsets cannot go stale however many nicknames there are.
    pieces = []
    cursor = 0
    seen_match = False
    for match in matches:
        self.nickname_list.append(match.group(1))
        if match.start() < cursor:
            # A later pattern can span text that an earlier pattern
            # already blanked; the enclosed match is already removed.
            cursor = max(cursor, match.end())
            continue
        gap = original[cursor:match.start()]
        if seen_match and nn_sep.search(gap) is None:
            # Only spaces/commas between two nicknames: safe to drop.
            gap = ' '
        pieces.append(gap)
        pieces.append(' ')
        cursor = match.end()
        seen_match = True
    pieces.append(original[cursor:])
    self._full_name = ''.join(pieces)

def squash_emoji(self):
"""
Expand All @@ -452,6 +497,18 @@ def handle_firstnames(self):
and not lc(self.title) in self.C.first_name_titles:
self.last, self.first = self.first, self.last

def parse_parenthesized_suffixes(self):
    """
    Extract any parenthesized suffixes: (ret. | ret | vet. | vet)

    Uses the ``paren_suffix`` regex from the constants. Group 1 of every
    match is appended to ``suffix_list`` and each match is replaced by a
    single space in ``_full_name``. When nothing matches, neither
    attribute is modified.
    """
    _re = self.C.regexes.paren_suffix
    # One scan collects every suffix; no separate search() is needed.
    suffixes = [match.group(1) for match in _re.finditer(self._full_name)]
    if not suffixes:
        return
    self.suffix_list.extend(suffixes)
    self._full_name = _re.sub(' ', self._full_name)


def parse_full_name(self):
"""

Expand Down
38 changes: 38 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from nameparser import HumanName
from nameparser.util import u
from nameparser.config import Constants
import re

log = logging.getLogger('HumanName')

Expand Down Expand Up @@ -1491,7 +1492,36 @@ def test_nickname_and_last_name_with_title(self):
self.m(hn.last, "Edmonds", hn)
self.m(hn.nickname, "Rick", hn)

def test_append_nickname(self):
hn = HumanName()
new_rgx = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE)
hn._nickname_regexes.append(new_rgx)
self.assertEqual(hn._nickname_regexes[-1], new_rgx)
hn.full_name = r"Benjamin (_openBen):close Franklin"
self.m(hn.first, "Benjamin", hn)
self.m(hn.middle, ":close", hn)
self.m(hn.last, "Franklin", hn)
self.m(hn.nickname, "_openBen", hn)

def test_prepend_nickname(self):
hn = HumanName()
new_rgx = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE)
hn._nickname_regexes.insert(0, new_rgx)
self.assertEqual(hn._nickname_regexes[0], new_rgx)
hn.full_name = r"Benjamin (_openBen):close Franklin"
self.m(hn.first, "Benjamin", hn)
self.m(hn.middle, "", hn)
self.m(hn.last, "Franklin", hn)
self.m(hn.nickname, "Ben", hn)

def test_multiple_nicknames(self):
hn = HumanName('Chief Justice John (JR), "No Glove, No Love" Glover Roberts, Jr.')
self.m(hn.title, 'Chief Justice', hn)
self.m(hn.first, "John", hn)
self.m(hn.middle, "Glover", hn)
self.m(hn.last, "Roberts", hn)
self.m(hn.suffix, "Jr.", hn)
self.m(hn.nickname, '"JR", "No Glove, No Love"', hn)
derek73 marked this conversation as resolved.
Show resolved Hide resolved

# class MaidenNameTestCase(HumanNameTestBase):
#
Expand Down Expand Up @@ -1766,6 +1796,14 @@ def test_suffix_with_periods_with_lastname_comma(self):
self.m(hn.last, "Doe", hn)
self.m(hn.suffix, "Msc.Ed.", hn)

def test_suffix_parenthesized_with_nickname(self):
hn = HumanName("Gen Dwight David (Ike) Eisenhower (ret.) KG")
self.m(hn.title, "Gen", hn)
self.m(hn.first, "Dwight", hn)
self.m(hn.middle, "David", hn)
self.m(hn.last, "Eisenhower", hn)
self.m(hn.suffix, "(ret.), KG", hn)
self.m(hn.nickname, "Ike", hn)

class TitleTestCase(HumanNameTestBase):

Expand Down