Skip to content

Commit

Permalink
Make the default missing value imputation 'mean' (#731)
Browse files (browse the repository at this point in the history)
  • Loading branch information
R-Palazzo authored Oct 26, 2023
1 parent 98c50d0 commit 3738f9e
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 31 deletions.
2 changes: 1 addition & 1 deletion rdt/transformers/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class BinaryEncoder(BaseTransformer):
INPUT_SDTYPE = 'boolean'
null_transformer = None

def __init__(self, missing_value_replacement='random', model_missing_values=None,
def __init__(self, missing_value_replacement='mode', model_missing_values=None,
missing_value_generation='random'):
super().__init__()
self._set_missing_value_generation(missing_value_generation)
Expand Down
8 changes: 4 additions & 4 deletions rdt/transformers/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class UnixTimestampEncoder(BaseTransformer):
Indicate what to replace the null values with. If the strings ``'mean'`` or ``'mode'``
are given, replace them with the corresponding aggregation; if ``'random'`` is given,
use random values from the dataset to fill the null values.
Defaults to ``random``.
Defaults to ``mean``.
model_missing_values (bool):
**DEPRECATED** Whether to create a new column to indicate which values were null or
not. The column will be created only if there are null values. If ``True``, create
Expand All @@ -44,10 +44,10 @@ class UnixTimestampEncoder(BaseTransformer):
INPUT_SDTYPE = 'datetime'
null_transformer = None

def __init__(self, missing_value_replacement='random', model_missing_values=None,
def __init__(self, missing_value_replacement='mean', model_missing_values=None,
datetime_format=None, missing_value_generation='random'):
super().__init__()
self._set_missing_value_replacement('random', missing_value_replacement)
self._set_missing_value_replacement('mean', missing_value_replacement)
self._set_missing_value_generation(missing_value_generation)
if model_missing_values is not None:
self._set_model_missing_values(model_missing_values)
Expand Down Expand Up @@ -190,7 +190,7 @@ class OptimizedTimestampEncoder(UnixTimestampEncoder):
Indicate what to replace the null values with. If the strings ``'mean'`` or ``'mode'``
are given, replace them with the corresponding aggregation; if ``'random'`` is given,
use random values from the dataset to fill the null values.
Defaults to ``random``.
Defaults to ``mean``.
model_missing_values (bool):
**DEPRECATED** Whether to create a new column to indicate which values were null or
not. The column will be created only if there are null values. If ``True``, create
Expand Down
6 changes: 3 additions & 3 deletions rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class FloatFormatter(BaseTransformer):
Indicate what to replace the null values with. If an integer or float is given,
replace them with the given value. If the strings ``'mean'`` or ``'mode'``
are given, replace them with the corresponding aggregation and if ``'random'``
replace each null value with a random value in the data range. Defaults to ``random``.
replace each null value with a random value in the data range. Defaults to ``mean``.
model_missing_values (bool):
**DEPRECATED** Whether to create a new column to indicate which values were null or
not. The column will be created only if there are null values. If ``True``, create
Expand Down Expand Up @@ -77,13 +77,13 @@ class FloatFormatter(BaseTransformer):
_min_value = None
_max_value = None

def __init__(self, missing_value_replacement='random', model_missing_values=None,
def __init__(self, missing_value_replacement='mean', model_missing_values=None,
learn_rounding_scheme=False, enforce_min_max_values=False,
computer_representation='Float', missing_value_generation='random'):
super().__init__()
self.missing_value_replacement = missing_value_replacement
self._set_missing_value_generation(missing_value_generation)
self._set_missing_value_replacement('random', missing_value_replacement)
self._set_missing_value_replacement('mean', missing_value_replacement)
if model_missing_values is not None:
self._set_model_missing_values(model_missing_values)

Expand Down
33 changes: 14 additions & 19 deletions tests/integration/test_hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,16 +178,7 @@ def test_default_inputs(self):
# Assert
expected_transformed = pd.DataFrame({
'integer': [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 2.0, 3.0],
'float': [
0.100000,
0.200000,
0.100000,
0.325355,
0.100000,
0.400000,
0.299855,
0.300000
],
'float': [0.1, 0.2, 0.1, 0.20000000000000004, 0.1, 0.4, 0.20000000000000004, 0.3],
'categorical': [
0.239836,
0.233842,
Expand All @@ -209,14 +200,14 @@ def test_default_inputs(self):
0.450609,
],
'datetime': [
1.263287e+18,
1.264982e+18,
1.262304e+18,
1.2630692571428572e+18,
1.2649824e+18,
1.262304e+18,
1.262304e+18,
1.264982e+18,
1.262304e+18,
1.2649824e+18,
1.262304e+18,
1.262304e+18
],
'names': [
0.159704,
Expand All @@ -232,7 +223,7 @@ def test_default_inputs(self):
pd.testing.assert_frame_equal(transformed, expected_transformed)

reversed_datetimes = pd.to_datetime([
'2010-01-12',
'2010-01-09',
'2010-02-01',
'2010-01-01',
'2010-01-01',
Expand All @@ -247,7 +238,7 @@ def test_default_inputs(self):
0.100000,
np.nan,
np.nan,
0.3253550514650646,
0.20000000000000004,
0.100000,
0.400000,
np.nan,
Expand Down Expand Up @@ -1095,7 +1086,9 @@ def test_reset_randomization(self):
# Test transforming multiple times with different transformers
expected_first_transformed = pd.DataFrame({
'age': [18.0, 25.0, 54.0, 60.0, 31.0],
'signup_day': [1.577837e+18, 1.260712e+18, 1.554077e+18, 1.228090e+18, 1.463357e+18],
'signup_day': [
1.5778368e+18, 1.45584e+18, 1.5540768e+18, 1.2280896e+18, 1.4633568e+18
],
'balance.normalized': [
-2.693016e-01,
-2.467182e-01,
Expand All @@ -1114,7 +1107,9 @@ def test_reset_randomization(self):
})
expected_second_transformed = pd.DataFrame({
'age': [18.0, 25.0, 54.0, 60.0, 31.0],
'signup_day': [1.577837e+18, 1.333605e+18, 1.554077e+18, 1.228090e+18, 1.463357e+18],
'signup_day': [
1.5778368e+18, 1.45584e+18, 1.5540768e+18, 1.2280896e+18, 1.4633568e+18
],
'balance.normalized': [
-2.693016e-01,
-2.467182e-01,
Expand Down Expand Up @@ -1153,7 +1148,7 @@ def test_reset_randomization(self):
],
'age': [18, 25, 54, 60, 31],
'name': ['AAAAA', 'AAAAB', 'AAAAC', 'AAAAD', 'AAAAE'],
'signup_day': ['01/01/2020', '12/13/2009', '04/01/2019', np.nan, np.nan],
'signup_day': ['01/01/2020', '02/19/2016', '04/01/2019', np.nan, np.nan],
'balance': [250, 5400, 150000, 61662.5, 91000],
'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa']
})
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_int_nan_default_missing_value_generation(self):

reverse = nt.reverse_transform(transformed)
assert len(reverse) == 6
assert reverse['a'][5] == 1.9583798838965891 or np.isnan(reverse['a'][5])
assert reverse['a'][5] == 1.4 or np.isnan(reverse['a'][5])
for value in reverse['a'][:5]:
assert value in {1, 2} or np.isnan(value)

Expand Down Expand Up @@ -95,7 +95,7 @@ def test_missing_value_generation_none(self):
# Assert
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (6, 1)
assert transformed['a'].iloc[5] == 1.9583798838965891
assert transformed['a'].iloc[5] == 1.4

def test_model_missing_value(self):
"""Test that we are still able to use ``model_missing_value``."""
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/transformers/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def test___init__(self):
# Asserts
error_message = 'Unexpected missing_value_replacement'
error_generation = 'Unexpected missing_value_generation'
assert transformer.missing_value_replacement == 'random', error_message
assert transformer.missing_value_replacement == 'mode', error_message
assert transformer.missing_value_generation == 'random', error_generation

def test___init___model_missing_value_passed(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/transformers/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def test__fit(self, null_transformer_mock):
transformer._fit(data)

# Assert
null_transformer_mock.assert_called_once_with('random', 'random')
null_transformer_mock.assert_called_once_with('mean', 'random')
assert null_transformer_mock.return_value.fit.call_count == 1
np.testing.assert_allclose(
null_transformer_mock.return_value.fit.call_args_list[0][0][0],
Expand Down

0 comments on commit 3738f9e

Please sign in to comment.