Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updating codebase to produce selective editing outputs #160

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions mbs_results/imputation/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,22 @@ def impute(dataframe: pd.DataFrame, config: dict) -> pd.DataFrame:
question_no=config["question_no"],
spp_form_id=config["form_id_spp"],
)
target = config["target"]

post_constrain["imputed_and_derived_flag"] = post_constrain.apply(
lambda row: (
"d"
if "sum" in str(row["constrain_marker"]).lower()
else row[f"imputation_flags_{target}"]
else row[f"imputation_flags_{config['target']}"]
),
axis=1,
)

# Added reverse mapping for IDBR formtype. Needed for selective editing (SE) and other outputs
spp_to_idbr_mapping = {value: key for key, value in config["idbr_to_spp"].items()}
post_constrain.loc[post_constrain["formtype"].isnull(), "formtype"] = (
post_constrain.loc[post_constrain["formtype"].isnull(), "form_type_spp"].map(
spp_to_idbr_mapping
)
)

return post_constrain
4 changes: 3 additions & 1 deletion mbs_results/outputs/pivot_imputation_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ def merge_counts(
return df_merge.drop(columns=[count_cell, count_date])


def create_imputation_link_output(additional_outputs_df: pd.DataFrame) -> pd.DataFrame:
def create_imputation_link_output(
additional_outputs_df: pd.DataFrame, **config
) -> pd.DataFrame:
"""
A wrapper function that runs the necessary functions for creating the
imputation_link output.
Expand Down
2 changes: 2 additions & 0 deletions mbs_results/outputs/produce_additional_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def get_additional_outputs_df(
"response",
"froempment",
"cell_no",
"imputation_class",
"imputation_flags_adjustedresponse",
"f_link_adjustedresponse",
"b_link_adjustedresponse",
Expand Down Expand Up @@ -108,3 +109,4 @@ def produce_additional_outputs(config: dict, additional_outputs_df: pd.DataFrame
for output in additional_outputs:
filename = f"{output}_v{file_version_mbs}_{snapshot_name}.csv"
additional_outputs[output].to_csv(config["output_path"] + filename, index=False)
print(config["output_path"] + filename + " saved")
31 changes: 11 additions & 20 deletions mbs_results/outputs/selective_editing.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pandas as pd

from mbs_results.utilities.utils import convert_column_to_datetime


def calculate_predicted_value(
dataframe: pd.DataFrame,
Expand Down Expand Up @@ -93,24 +91,24 @@ def create_standardising_factor(
each reference.

"""
questions_selected = [40, 49]
current_df = dataframe[(dataframe[period] == period_selected)]
current_df = current_df[current_df[question_no].isin(questions_selected)]
# questions_selected = [40, 49]
# current_df = dataframe[(dataframe[period] == period_selected)]
# current_df = current_df[current_df[question_no].isin(questions_selected)]

# The standardising factor is created for each record before summing for each
# domain-question grouping.
current_df["unit_standardising_factor"] = (
current_df[predicted_value]
* current_df[a_weight]
* current_df[o_weight]
* current_df[g_weight]
dataframe["unit_standardising_factor"] = (
dataframe[predicted_value]
* dataframe[a_weight]
* dataframe[o_weight]
* dataframe[g_weight]
)

current_df["standardising_factor"] = current_df.groupby([domain, question_no])[
dataframe["standardising_factor"] = dataframe.groupby([domain, question_no])[
"unit_standardising_factor"
].transform("sum")

output_df = current_df[
output_df = dataframe[
[
period,
reference,
Expand Down Expand Up @@ -173,8 +171,6 @@ def calculate_auxiliary_value(
# convert register turnover from annual pounds-thousands to monthly pounds
dataframe[frozen_turnover] = dataframe[frozen_turnover] * 1000 / 12

period_selected = pd.to_datetime(period_selected, format="%Y%m")
dataframe[period] = convert_column_to_datetime(dataframe[period])
current_df = dataframe[(dataframe[period] == period_selected)]

q40 = current_df[current_df[question_no] == 40]
Expand All @@ -183,12 +179,7 @@ def calculate_auxiliary_value(
q40["auxiliary_value"] = q40[frozen_turnover]
q49["auxiliary_value"] = q49[frozen_turnover] * q49[construction_link]

keep_cols = [
reference,
period,
question_no,
"auxiliary_value",
]
keep_cols = [reference, period, question_no, "auxiliary_value", imputation_class]

output_df = pd.concat([q40[keep_cols], q49[keep_cols]])

Expand Down
28 changes: 18 additions & 10 deletions mbs_results/outputs/selective_editing_contributer_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,35 +55,43 @@ def get_selective_editing_contributer_output(
input_data = additional_outputs_df.loc[
additional_outputs_df[question_no].isin(questions_selected)
]
input_data = additional_outputs_df[
input_data = input_data[
[period, reference, "design_weight", "frosic2007", "formtype"]
]

input_data["frosic2007"] = input_data["frosic2007"].astype(str)

domain_data = pd.read_csv(
sic_domain_mapping_path, dtype={"sic_5_digit": str, "domain": str}
)
threshold_mapping = pd.read_csv(
threshold_filepath, dtype={"formtype": str, "domain": str, "threshold": float}
)
# Threshold file contains multiple duplicate rows
threshold_mapping.drop_duplicates(inplace=True)

selective_editing_contributer_output = merge_domain(
selective_editing_contributor_output = merge_domain(
input_data, domain_data, "frosic2007", "sic_5_digit"
)

selective_editing_contributer_output = pd.merge(
selective_editing_contributer_output,
selective_editing_contributor_output = pd.merge(
selective_editing_contributor_output,
threshold_mapping,
on=["formtype", "domain"],
how="left",
).drop(columns=["formtype"])

selective_editing_contributer_output = selective_editing_contributer_output.rename(
selective_editing_contributor_output = selective_editing_contributor_output.rename(
columns={"reference": "ruref", "domain": "domain_group"}
)
).drop(columns="frosic2007")

# Survey code is requested on this output; 009 is the MBS survey code
selective_editing_contributer_output["survey_code"] = "009"
selective_editing_contributor_output["survey_code"] = "009"

return selective_editing_contributer_output.loc[
selective_editing_contributer_output["period"] == period_selected
]
# Dropping duplicates as we expect the same contributor for q40 and q49 in some form
# types. Selecting only needed period
contributor_output_without_dupes = selective_editing_contributor_output.loc[
selective_editing_contributor_output["period"] == period_selected
].drop_duplicates()

return contributor_output_without_dupes
14 changes: 10 additions & 4 deletions mbs_results/outputs/selective_editing_question_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,15 @@ def create_selective_editing_question_output(
>> period_selected=202201,
>> )
"""
questions_selected = [40, 49]
input_data = additional_outputs_df.loc[
additional_outputs_df["questioncode"].isin(questions_selected)
]
input_data = input_data[(input_data["period"] == period_selected)]
sic_domain_mapping = pd.read_csv(sic_domain_mapping_path).astype(str)

df_with_domain = merge_domain(
input_df=additional_outputs_df,
input_df=input_data,
domain_mapping=sic_domain_mapping,
sic_input="frosic2007",
sic_mapping="sic_5_digit",
Expand All @@ -68,7 +73,7 @@ def create_selective_editing_question_output(
)

auxiliary_value = calculate_auxiliary_value(
dataframe=additional_outputs_df,
dataframe=input_data,
reference="reference",
period="period",
question_no="questioncode",
Expand All @@ -81,7 +86,7 @@ def create_selective_editing_question_output(
question_output = pd.merge(
standardising_factor,
auxiliary_value,
on=["reference", "imputation_class", "questioncode"],
on=["period", "reference", "imputation_class", "questioncode"],
how="left",
).drop("imputation_class", axis=1)

Expand All @@ -97,7 +102,8 @@ def create_selective_editing_question_output(
"reference": "ruref",
"domain": "domain_group",
"frotover": "auxiliary_value",
"imputation_flags_adjusted_value": "imputation_marker",
"imputation_flags_adjustedresponse": "imputation_marker",
"adjustedresponse": "predicted_value",
"questioncode": "question_code",
}
)
Expand Down
8 changes: 7 additions & 1 deletion mbs_results/staging/back_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def read_back_data(config: dict) -> pd.DataFrame:
finalsel = read_colon_separated_file(
config["back_data_finalsel_path"], config["sample_column_names"]
)
finalsel["formtype"] = "0" + finalsel["formtype"].astype(str)

qv_and_cp = pd.merge(
qv_df, cp_df, how="left", on=[config["period"], config["reference"]]
Expand Down Expand Up @@ -136,7 +137,10 @@ def append_back_data(staged_data: pd.DataFrame, config: dict) -> pd.DataFrame:
# they are loaded as int

back_data.insert(0, imp_marker_col, back_data[type_col].astype(str).map(map_type))

idbr_to_spp_mapping = config["idbr_to_spp"]
back_data[config["form_id_spp"]] = back_data[config["form_id_idbr"]].map(
idbr_to_spp_mapping
)
common_cols = list(staged_data.columns.intersection(back_data.columns))

common_cols.append(imp_marker_col)
Expand All @@ -156,6 +160,8 @@ def append_back_data(staged_data: pd.DataFrame, config: dict) -> pd.DataFrame:
config["revision_period"],
)

back_data["cellnumber"] = back_data["cell_no"]

staged_and_back_data = pd.concat([back_data, staged_data], ignore_index=True)

return staged_and_back_data
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
reference,period,question_no,auxiliary_value
1,202001,40,250000
2,202001,49,9375000
3,202001,49,12500000
4,202001,49,625000
5,202001,49,7000000
reference,period,question_no,auxiliary_value,imputation_class
1,202001,40,250000,500
2,202001,49,9375000,600
3,202001,49,12500000,600
4,202001,49,625000,700
5,202001,49,7000000,500
4 changes: 2 additions & 2 deletions tests/data/staging/back_data/test_append_back_expected.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
adjustedresponse,period,questioncode,reference,response,formtype,cell_no,froempment,frosic2007,frotover,imputation_marker,cellnumber,frozenemployees,frozensic,frozenturnover,status,statusencoded,form_type_spp,frozen_error
333333333,202202,40,1,444444,88888,55555,99999,99999,99999,r,,,,,,,,
666666666,202202,40,2,55555,88888,55555,99999,99999,99999,r,,,,,,,,
333333333,202202,40,1,444444,88888,55555,99999,99999,99999,r,55555,,,,,,,
666666666,202202,40,2,55555,88888,55555,99999,99999,99999,r,55555,,,,,,,
99999,202303,90,1,99999,999,99999,99999,99999,99999,,77777.0,8888.0,888888.0,777777.0,Clear,99999.0,6666.0,
88888,202303,110,2,88888,999,99999,99999,99999,99999,,77777.0,8888.0,888888.0,777777.0,Clear,99999.0,6666.0,
77777,202302,40,3,77777,999,99999,99999,99999,99999,,7777.0,8888.0,888888.0,777777.0,Clear,99999.0,6666.0,
Expand Down
5 changes: 4 additions & 1 deletion tests/data/staging/back_data/test_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,8 @@
"returned_value":"response",
"adjusted_value":"adjustedresponse",
"question_no":"questioncode"
}
},
"idbr_to_spp":{"0111": 1},
"form_id_idbr": "formtype",
"form_id_spp": "form_type_spp"
}
4 changes: 0 additions & 4 deletions tests/outputs/test_selective_editing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
calculate_predicted_value,
create_standardising_factor,
)
from mbs_results.utilities.utils import convert_column_to_datetime


@pytest.fixture(scope="class")
Expand Down Expand Up @@ -104,9 +103,6 @@ def test_calculate_auxiliary_value(
input_data = calculate_auxiliary_value_input

expected_output = calculate_auxiliary_value_output
expected_output["period"] = convert_column_to_datetime(
expected_output["period"]
)
expected_output["auxiliary_value"] = expected_output["auxiliary_value"].astype(
float
)
Expand Down
3 changes: 3 additions & 0 deletions tests/staging/test_back_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,8 @@ def test_append_back_data(mock_read_back_data, filepath):
actual_output = append_back_data(staged, config)

mock_read_back_data.assert_called_once_with(config)
order = actual_output.columns
expected_output = expected_output[order]
expected_output["cellnumber"] = expected_output["cellnumber"].astype(int)

assert_frame_equal(actual_output, expected_output)
4 changes: 4 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os

from helper_functions import create_testing_config

from mbs_results.main import run_mbs_main
Expand Down Expand Up @@ -34,3 +36,5 @@ def test_main():
create_testing_config(test_config)

run_mbs_main()

os.remove("config.json")
Loading