Skip to content

Commit

Permalink
expand_dcml.split_labels() replaces '42' inversions with the default '2'
Browse files Browse the repository at this point in the history
  • Loading branch information
johentsch committed Aug 29, 2023
1 parent 87d5c68 commit 125174f
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 13 deletions.
27 changes: 24 additions & 3 deletions src/ms3/expand_dcml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
import sys
from collections import defaultdict
from typing import Optional

import pandas as pd

Expand Down Expand Up @@ -169,6 +170,7 @@ def expand_labels(
skip_checks=skip_checks,
logger=logger,
)

df["chord_type"] = transform(
df,
features2type,
Expand Down Expand Up @@ -224,15 +226,17 @@ def expand_labels(
return df


def extract_features_from_labels(S, regex=None):
def extract_features_from_labels(
S: pd.Series, regex: Optional[re.Pattern | str] = None
) -> pd.DataFrame:
"""Applies .str.extract(regex) on the Series and returns a DataFrame with all named capturing groups."""
if regex is None:
regex = DCML_REGEX
if regex.__class__ != re.compile("").__class__:
regex = re.compile(regex, re.VERBOSE)
features = list(regex.groupindex.keys())
extracted = S.str.extract(regex, expand=True)
return extracted[features].copy()
return extracted[features].copy() # removes superfluous columns


def split_labels(
Expand Down Expand Up @@ -284,6 +288,19 @@ def split_labels(
if len(rename) > 0:
spl.rename(columns=rename, inplace=True)
df = values_into_df(df, spl)

# Replace the '42' chord inversion with '2': both spellings are equivalent and '42' is accepted for convenience, but the values are normalized to the default '2'.
replace_42_mask = (df.figbass == "42").fillna(False)
if replace_42_mask.any():

def replace_42(S: pd.Series) -> pd.Series:
return S.str.replace("42", "2", n=1, regex=False)

replace_cols = ["label", "chord", "figbass"]
df.loc[replace_42_mask, replace_cols] = df.loc[
replace_42_mask, replace_cols
].apply(replace_42)

if not skip_checks:
syntax_errors = spl.isna().all(axis=1) & df[label_column].notna()
if syntax_errors.any():
Expand All @@ -294,7 +311,11 @@ def split_labels(
return df


def values_into_df(df, new_values):
def values_into_df(df: pd.DataFrame, new_values: pd.DataFrame) -> pd.DataFrame:
"""Updates the given DataFrame with the values from the other DataFrame by updating existing columns and
concatenating new columns. The returned DataFrame has the columns of ``new_values`` on the right-hand side as if
they had been concatenated.
"""
features = list(new_values.columns)
update_columns = [col for col in features if col in df.columns]
new_columns = [col for col in features if col not in df.columns]
Expand Down
25 changes: 15 additions & 10 deletions src/ms3/utils/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5173,21 +5173,26 @@ def path2parent_corpus(path):
return None


def chord2tpcs(chord, regex=None, logger=None, **kwargs):
def chord2tpcs(
chord: str,
regex: Optional[re.Pattern] = None,
logger: Optional[logging.Logger] = None,
**kwargs,
):
"""
Split a chord label into its features and apply features2tpcs().
Uses: features2tpcs()
Parameters
----------
chord : :obj:`str`
Chord label that can be split into the features ['numeral', 'form', 'figbass', 'changes', 'relativeroot'].
regex : :obj:`re.Pattern`, optional
Compiled regex with named groups for the five features. By default, the current version of the DCML harmony
annotation standard is used.
**kwargs :
arguments for features2tpcs (pass MC to show it in warnings!)
Args:
chord:
Chord label that can be split into the features ['numeral', 'form', 'figbass',
'changes', 'relativeroot'].
regex:
Compiled regex with named groups for the five features. By default, the current
version of the DCML harmony annotation standard is used.
**kwargs:
arguments for features2tpcs (pass mc=MC to show it in warnings!)
"""
if logger is None:
logger = module_logger
Expand Down

0 comments on commit 125174f

Please sign in to comment.