expand_dcml.split_labels() replaces '42' inversions with the default '2'

johentsch · Aug 29, 2023 · 125174f
1 parent 87d5c68
commit 125174f
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 13 deletions.
diff --git a/src/ms3/expand_dcml.py b/src/ms3/expand_dcml.py
@@ -5,6 +5,7 @@
 import re
 import sys
 from collections import defaultdict
+from typing import Optional
 
 import pandas as pd
 
@@ -169,6 +170,7 @@ def expand_labels(
         skip_checks=skip_checks,
         logger=logger,
     )
+
     df["chord_type"] = transform(
         df,
         features2type,
@@ -224,15 +226,17 @@ def expand_labels(
     return df
 
 
-def extract_features_from_labels(S, regex=None):
+def extract_features_from_labels(
+    S: pd.Series, regex: Optional[re.Pattern | str] = None
+) -> pd.DataFrame:
     """Applies .str.extract(regex) on the Series and returns a DataFrame with all named capturing groups."""
     if regex is None:
         regex = DCML_REGEX
     if regex.__class__ != re.compile("").__class__:
         regex = re.compile(regex, re.VERBOSE)
     features = list(regex.groupindex.keys())
     extracted = S.str.extract(regex, expand=True)
-    return extracted[features].copy()
+    return extracted[features].copy()  # removes superfluous columns
 
 
 def split_labels(
@@ -284,6 +288,19 @@ def split_labels(
     if len(rename) > 0:
         spl.rename(columns=rename, inplace=True)
     df = values_into_df(df, spl)
+
+    # replace '42' chord inversion with '2'. It is equivalent and allowed for convenience but must be harmonized
+    replace_42_mask = (df.figbass == "42").fillna(False)
+    if replace_42_mask.any():
+
+        def replace_42(S: pd.Series) -> pd.Series:
+            return S.str.replace("42", "2", n=1, regex=False)
+
+        replace_cols = ["label", "chord", "figbass"]
+        df.loc[replace_42_mask, replace_cols] = df.loc[
+            replace_42_mask, replace_cols
+        ].apply(replace_42)
+
     if not skip_checks:
         syntax_errors = spl.isna().all(axis=1) & df[label_column].notna()
         if syntax_errors.any():
@@ -294,7 +311,11 @@ def split_labels(
         return df
 
 
-def values_into_df(df, new_values):
+def values_into_df(df: pd.DataFrame, new_values: pd.DataFrame) -> pd.DataFrame:
+    """Updates the given DataFrame with the values from the other DataFrame by updating existing columns and
+    concatenating new columns. The returned DataFrame has the columns of ``new_values`` on the right-hand side as if
+    they had been concatenated.
+    """
     features = list(new_values.columns)
     update_columns = [col for col in features if col in df.columns]
     new_columns = [col for col in features if col not in df.columns]

diff --git a/src/ms3/utils/functions.py b/src/ms3/utils/functions.py
@@ -5173,21 +5173,26 @@ def path2parent_corpus(path):
         return None
 
 
-def chord2tpcs(chord, regex=None, logger=None, **kwargs):
+def chord2tpcs(
+    chord: str,
+    regex: Optional[re.Pattern] = None,
+    logger: Optional[logging.Logger] = None,
+    **kwargs,
+):
     """
     Split a chord label into its features and apply features2tpcs().
 
     Uses: features2tpcs()
 
-    Parameters
-    ----------
-    chord : :obj:`str`
-        Chord label that can be split into the features ['numeral', 'form', 'figbass', 'changes', 'relativeroot'].
-    regex : :obj:`re.Pattern`, optional
-        Compiled regex with named groups for the five features. By default, the current version of the DCML harmony
-        annotation standard is used.
-    **kwargs :
-        arguments for features2tpcs (pass MC to show it in warnings!)
+    Args:
+        chord:
+            Chord label that can be split into the features ['numeral', 'form', 'figbass',
+            'changes', 'relativeroot'].
+        regex:
+            Compiled regex with named groups for the five features. By default, the current
+            version of the DCML harmony annotation standard is used.
+        **kwargs:
+            arguments for features2tpcs (pass mc=MC to show it in warnings!)
     """
     if logger is None:
         logger = module_logger