Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

326 forward link #13

Merged
merged 12 commits into from
May 22, 2024
61 changes: 61 additions & 0 deletions src/forward_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import numpy as np
import pandas as pd


def calculate_imputation_link(
    df: pd.DataFrame,
    period: str,
    strata: str,
    match_col: str,
    target_variable: str,
    predictive_variable: str,
) -> pd.Series:
    """
    Compute the ratio-of-sums link for each (strata, period) group.

    Within every group the link is the sum of ``target_variable`` divided by
    the sum of ``predictive_variable``, where both sums only count rows
    flagged by ``match_col``. The group value is broadcast back to every row
    of the group, so the result has one entry per row of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe; it is copied and never modified.
    period : str
        Column name containing the time period.
    strata : str
        Column name containing the strata.
    match_col : str
        Column name of the matched pair flag, this column should be bool.
        Rows where the flag is False contribute zero to both sums.
    target_variable : str
        Column name of the target variable (numerator of the link).
    predictive_variable : str
        Column name of the predictive variable (denominator of the link).

    Returns
    -------
    pd.Series
        The link for every row of ``df``. Groups whose summed predictive
        values equal zero get NaN instead of a division by zero.
    """
    # Work on a copy so the caller's dataframe is left untouched.
    masked = df.copy()

    # Multiplying by the match flag zeroes out the rows that are not matched
    # pairs, which removes them from the group sums below.
    for column in (target_variable, predictive_variable):
        masked[column] = masked[column] * masked[match_col]

    grouped = masked.groupby([strata, period])
    target_totals = grouped[target_variable].transform("sum")
    predictive_totals = grouped[predictive_variable].transform("sum")

    # A zero total is swapped for NaN so the ratio is NaN rather than the
    # result of dividing by zero.
    return target_totals / predictive_totals.replace(0, np.nan)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other functions return a DataFrame, but I think the point about whether to return a DataFrame or a Series is a good one and should probably be discussed (i.e. how all of the functions will link together). Happy for this to be left as is, with a ticket added to the backlog, as other functions might need to be refactored.

16 changes: 16 additions & 0 deletions tests/calculate_links_test_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
,identifier,period,group,question,f_predictive_question,b_predictive_question,f_matched_pair,b_matched_pair,f_link,b_link
0,10001,202001,1,547.0,,362.0,False,True,,0.9925133689839573
1,10001,202002,1,362.0,547.0,895.0,True,True,1.0075431034482758,0.8431018935978359
2,10001,202003,1,895.0,362.0,,True,False,1.186096256684492,
3,10002,202001,1,381.0,,573.0,False,True,,0.9925133689839573
4,10002,202002,1,573.0,381.0,214.0,True,True,1.0075431034482758,0.8431018935978359
5,10002,202003,1,214.0,573.0,,True,False,1.186096256684492,
6,10001,202001,2,961.0,,267.0,False,True,,1.693854748603352
7,10001,202002,2,267.0,961.0,314.0,True,True,0.5903693931398417,0.8523809523809524
8,10001,202003,2,314.0,267.0,,True,False,1.1731843575418994,
9,10002,202001,2,555.0,,628.0,False,True,,1.693854748603352
10,10002,202002,2,628.0,555.0,736.0,True,True,0.5903693931398417,0.8523809523809524
11,10002,202003,2,736.0,628.0,,True,False,1.1731843575418994,
12,10005,202001,1,,,,False,False,,0.9925133689839573
13,10005,202002,2,,,100.0,False,False,0.5903693931398417,0.8523809523809524
14,10005,202003,2,100.0,,,False,False,1.1731843575418994,
46 changes: 46 additions & 0 deletions tests/test_forward_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pytest
from helper_functions import load_and_format
from pandas.testing import assert_series_equal

from src.forward_link import calculate_imputation_link

# Names of the CSV fixtures (without the "tests/" prefix or ".csv" suffix)
# that every test in TestLinks is parametrized over.
scenarios = ["calculate_links_test_data"]
AntonZogk marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize("scenario", scenarios)
class TestLinks:
    """Compare calculate_imputation_link with the link columns of a fixture."""

    def test_forward_links(self, scenario):
        """The forward inputs must reproduce the fixture's f_link column."""
        fixture = load_and_format(f"tests/{scenario}.csv")
        expected = fixture["f_link"]

        actual = calculate_imputation_link(
            df=fixture,
            period="period",
            strata="group",
            match_col="f_matched_pair",
            target_variable="question",
            predictive_variable="f_predictive_question",
        )

        # The fixture column is named "f_link", so names are not compared.
        assert_series_equal(actual, expected, check_names=False)

    def test_back_links(self, scenario):
        """The backward inputs must reproduce the fixture's b_link column."""
        fixture = load_and_format(f"tests/{scenario}.csv")
        expected = fixture["b_link"]

        actual = calculate_imputation_link(
            df=fixture,
            period="period",
            strata="group",
            match_col="b_matched_pair",
            target_variable="question",
            predictive_variable="b_predictive_question",
        )

        # The fixture column is named "b_link", so names are not compared.
        assert_series_equal(actual, expected, check_names=False)
Loading