derek73 · aikimark · Mar 20, 2021 · Mar 21, 2021 · Mar 22, 2021 · Mar 25, 2021
diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py
@@ -231,7 +231,7 @@ def __init__(self,
         self.first_name_titles   = SetManager(first_name_titles)
         self.conjunctions        = SetManager(conjunctions)
         self.capitalization_exceptions = TupleManager(capitalization_exceptions)
-        self.regexes             = TupleManager(regexes)
+        self.regexes             = TupleManager([tpl[:2] for tpl in regexes])  # honour the caller's argument; [:2] drops the optional tag
         self._pst = None
 
     @property

diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py
@@ -18,20 +18,39 @@
         '[\u2600-\u26FF\u2700-\u27BF])+', 
         re.UNICODE)
 
-REGEXES = set([
+REGEXES = [
     ("spaces", re.compile(r"\s+", re.U)),
     ("word", re.compile(r"(\w|\.)+", re.U)),
     ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)),
     ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)),
-    ("quoted_word", re.compile(r'(?<!\w)\'([^\s]*?)\'(?!\w)', re.U)),
-    ("double_quotes", re.compile(r'\"(.*?)\"', re.U)),
-    ("parenthesis", re.compile(r'\((.*?)\)', re.U)),
+    ("double_apostrophe_ASCII", re.compile(r"(?<!\w)''(\w[^']*?)''(?!\w)", re.U), 'nickname'),
+    ("smart_quote", re.compile(r"(?<!\w)“(\w[^”]*?)”(?!\w)", re.U), 'nickname'),
+    ("smart_single_quote", re.compile(r"(?<!\w)‘(\w[^’]*?)’(?!\w)", re.U), 'nickname'),
+    ("grave_accent", re.compile(r'(?<!\w)`(\w[^`]*?)`(?!\w)', re.U), 'nickname'),
+    ("grave_acute", re.compile(r'(?<!\w)`(\w[^´]*?)´(?!\w)', re.U), 'nickname'),
+    ("apostrophe_ASCII", re.compile(r"(?<!\w)'(\w[^']*?)'(?!\w)", re.U), 'nickname'),
+    ("quote_ASCII", re.compile(r'(?<!\w)"(\w[^"]*?)"(?!\w)', re.U), 'nickname'),
+    ("parenthesis", re.compile(r'(?<!\w)\((\w[^)]*?)\)(?!\w)', re.U), 'nickname'),
     ("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)),
     ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)),
     ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)),
     ("emoji",re_emoji),
     ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)),
-])
+    ("nn_sep_safe", re.compile(r'[^ ,]', re.U)),
+    ("paren_suffix", re.compile(r'(?!\w)(\((?:ret|vet)\.?\))(?!\w)', re.I | re.U)),
+]
 """
 All regular expressions used by the parser are precompiled and stored in the config.
+
+REGEX tuple positions are:
+    [0] - name of the pattern, used in code as named attribute
+    [1] - compiled pattern
+    [2] - (optional) label/tag of the pattern, used in code for
+          filtering patterns
+
+All nickname patterns should follow this pattern (backslashes are doubled
+here only because this docstring is not a raw string):
+    (?<!\\w)leading_delim(\\w[^trailing_delim]*?)trailing_delim(?!\\w)
+
+Nicknames are assumed to be delimited by non-word characters.
 """
diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py
@@ -6,6 +6,7 @@
     'esq',
     'esquire',
     'jr',
+    'jr.',
     'jnr',
     'junior',
     'sr',
@@ -25,6 +26,7 @@
 """
 SUFFIX_ACRONYMS = set([
     '(ret)',
+    '(ret.)',
     '(vet)',
     '8-vsb',
     'aas',

diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py
@@ -166,6 +166,7 @@
     'chef',
     'chemist',
     'chief',
+    'chief justice',
     'chieftain',
     'choreographer',
     'civil',
@@ -339,6 +340,7 @@
     'judicial',
     'junior',
     'jurist',
+    'justice',
     'keyboardist',
     'kingdom',
     'knowledge',

diff --git a/nameparser/parser.py b/nameparser/parser.py
@@ -12,6 +12,7 @@
 from nameparser.config import CONSTANTS
 from nameparser.config import Constants
 from nameparser.config import DEFAULT_ENCODING
+from nameparser.config.regexes import REGEXES
 
 ENCODING = 'utf-8'
 
@@ -70,7 +71,7 @@ class HumanName(object):
     _members = ['title','first','middle','last','suffix','nickname']
     unparsable = True
     _full_name = ''
-
+    
     def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,
                 string_format=None):
         self.C = constants
@@ -79,7 +80,17 @@ def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,
 
         self.encoding = encoding
         self.string_format = string_format or self.C.string_format
+        self._nickname_regexes = [tpl[1] 
+                                  for tpl in REGEXES
+                                  if isinstance(tpl[-1], str)
+                                  and 'nickname' in tpl[-1]
+                                 ]
         # full_name setter triggers the parse
+        #========================================================
+        #IMPORTANT NOTE:
+        #  The following statement must be the last one in the 
+        #  __init__ function
+        #========================================================
         self.full_name = full_name
 
     def __iter__(self):
@@ -243,7 +254,11 @@ def nickname(self):
         The person's nicknames. Any text found inside of quotes (``""``) or
         parenthesis (``()``)
         """
-        return " ".join(self.nickname_list) or self.C.empty_attribute_default
+        if len(self.nickname_list) <= 1:
+            f_string = '{0}'
+        else:
+            f_string = '"{0}"'
+        return ", ".join([f_string.format(nn) for nn in self.nickname_list]) or self.C.empty_attribute_default
 
     @property
     def surnames_list(self):
@@ -387,18 +402,24 @@ def pre_process(self):
         This method happens at the beginning of the :py:func:`parse_full_name`
         before any other processing of the string aside from unicode
         normalization, so it's a good place to do any custom handling in a
-        subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`.
+        subclass. Runs, in order:
+            :py:func:`fix_phd`,
+            :py:func:`parse_parenthesized_suffixes`,
+            :py:func:`parse_nicknames` and
+            :py:func:`squash_emoji`.
 
         """
         self.fix_phd()
+        self.parse_parenthesized_suffixes()
         self.parse_nicknames()
         self.squash_emoji()
 
     def post_process(self):
         """
         This happens at the end of the :py:func:`parse_full_name` after
-        all other processing has taken place. Runs :py:func:`handle_firstnames`
-        and :py:func:`handle_capitalization`.
+        all other processing has taken place. Runs, in order,
+        :py:func:`handle_firstnames` and
+        :py:func:`handle_capitalization`.
         """
         self.handle_firstnames()
         self.handle_capitalization()
@@ -412,25 +433,49 @@ def fix_phd(self):
 
     def parse_nicknames(self):
         """
-        The content of parenthesis or quotes in the name will be added to the
+        The content of defined nickname regex patterns in the name will be added to the
         nicknames list. This happens before any other processing of the name.
-
-        Single quotes cannot span white space characters and must border
-        white space to allow for quotes in names like O'Connor and Kawai'ae'a.
-        Double quotes and parenthesis can span white space.
-
-        Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`;
-        `quoted_word`, `double_quotes` and `parenthesis`.
-        """
-
-        re_quoted_word = self.C.regexes.quoted_word
-        re_double_quotes = self.C.regexes.double_quotes
-        re_parenthesis = self.C.regexes.parenthesis
-
-        for _re in (re_quoted_word, re_double_quotes, re_parenthesis):
-            if _re.search(self._full_name):
-                self.nickname_list += [x for x in _re.findall(self._full_name)]
-                self._full_name = _re.sub('', self._full_name)
+
+        Some basic rules for nickname processing:
+          * Nicknames must begin with a word character.
+          * Nickname patterns should include an outer (not processed)
+            delimiter that excludes word characters.
+
+        Loops through the patterns in ``self._nickname_regexes`` (the
+        :py:data:`~nameparser.config.regexes.REGEXES` tagged "nickname").
+        A run of only spaces/commas between two nicknames is dropped.
+        """
+        nn_sep = self.C.regexes.nn_sep_safe
+        original = self._full_name
+        masked = original
+        nn_matches = []
+        for _re in self._nickname_regexes:
+            found = list(_re.finditer(masked))
+            nn_matches.extend(found)
+            # Blank out each match with same-length padding so later
+            # patterns cannot re-match it and all spans stay valid
+            # offsets into the original string.
+            for _match in found:
+                masked = (masked[:_match.start()]
+                          + ' ' * (_match.end() - _match.start())
+                          + masked[_match.end():])
+        if not nn_matches:
+            return
+
+        nn_matches.sort(key=lambda m: m.span())
+        # Rebuild the name once from the untouched original so no span is
+        # ever applied to an already-shifted string.
+        pieces, pos, prev_was_match = [], 0, False
+        for nn_match in nn_matches:
+            self.nickname_list.append(nn_match.group(1))
+            if nn_match.start() < pos:
+                continue  # nested inside a span that is already removed
+            gap = original[pos:nn_match.start()]
+            # separators between adjacent nicknames go if only ' ' and ','
+            removable = prev_was_match and nn_sep.search(gap) is None
+            pieces += [' ' if removable else gap, ' ']
+            pos, prev_was_match = nn_match.end(), True
+        self._full_name = ''.join(pieces) + original[pos:]
 
     def squash_emoji(self):
         """
@@ -452,6 +497,18 @@ def handle_firstnames(self):
                 and not lc(self.title) in self.C.first_name_titles:
             self.last, self.first = self.first, self.last
 
+    def parse_parenthesized_suffixes(self):
+        """
+        Move parenthesized suffixes such as ``(ret.)`` or ``(vet)`` out of
+        the name and into ``suffix_list``, leaving a space in their place.
+        Matching is delegated to the ``paren_suffix`` config regex.
+        """
+        paren_suffix = self.C.regexes.paren_suffix
+        found = [hit.group(1) for hit in paren_suffix.finditer(self._full_name)]
+        if found:
+            self.suffix_list.extend(found)
+            self._full_name = paren_suffix.sub(' ', self._full_name)
+
     def parse_full_name(self):
         """
 

diff --git a/tests.py b/tests.py
@@ -27,6 +27,7 @@
 from nameparser import HumanName
 from nameparser.util import u
 from nameparser.config import Constants
+import re
 
 log = logging.getLogger('HumanName')
 
@@ -1491,7 +1492,36 @@ def test_nickname_and_last_name_with_title(self):
         self.m(hn.last, "Edmonds", hn)
         self.m(hn.nickname, "Rick", hn)
 
+    def test_append_nickname(self):
+        # Appended last, the custom pattern loses to the built-in parenthesis one.
+        custom_pattern = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE)
+        hn = HumanName()
+        hn._nickname_regexes.append(custom_pattern)
+        self.assertEqual(hn._nickname_regexes[-1], custom_pattern)
+        hn.full_name = r"Benjamin (_openBen):close Franklin"
+        expected = {'first': "Benjamin", 'middle': ":close", 'last': "Franklin", 'nickname': "_openBen"}
+        for member in ('first', 'middle', 'last', 'nickname'):
+            self.m(getattr(hn, member), expected[member], hn)
 
+    def test_prepend_nickname(self):
+        # Inserted first, the custom pattern wins and consumes the ":close" tail.
+        custom_pattern = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE)
+        hn = HumanName()
+        hn._nickname_regexes.insert(0, custom_pattern)
+        self.assertEqual(hn._nickname_regexes[0], custom_pattern)
+        hn.full_name = r"Benjamin (_openBen):close Franklin"
+        expected = {'first': "Benjamin", 'middle': "", 'last': "Franklin", 'nickname': "Ben"}
+        for member in ('first', 'middle', 'last', 'nickname'):
+            self.m(getattr(hn, member), expected[member], hn)
+
+    def test_multiple_nicknames(self):
+        # Two nicknames separated by a comma; each is quoted in the joined output.
+        hn = HumanName('Chief Justice John (JR), "No Glove, No Love" Glover Roberts, Jr.')
+        expected = [('title', 'Chief Justice'), ('first', "John"), ('middle', "Glover"),
+                    ('last', "Roberts"), ('suffix', "Jr."),
+                    ('nickname', '"JR", "No Glove, No Love"')]
+        for member, value in expected:
+            self.m(getattr(hn, member), value, hn)
 
 # class MaidenNameTestCase(HumanNameTestBase):
 #
@@ -1766,6 +1796,14 @@ def test_suffix_with_periods_with_lastname_comma(self):
         self.m(hn.last, "Doe", hn)
         self.m(hn.suffix, "Msc.Ed.", hn)
 
+    def test_suffix_parenthesized_with_nickname(self):
+        # "(ret.)" must land in the suffix while "(Ike)" is still a nickname.
+        hn = HumanName("Gen Dwight David (Ike) Eisenhower (ret.) KG")
+        expected = [('title', "Gen"), ('first', "Dwight"), ('middle', "David"),
+                    ('last', "Eisenhower"), ('suffix', "(ret.), KG"),
+                    ('nickname', "Ike")]
+        for member, value in expected:
+            self.m(getattr(hn, member), value, hn)
 
 class TitleTestCase(HumanNameTestBase):