Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix _set_seed of BaseTransformer #917

Merged
merged 2 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions rdt/transformers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,8 +367,8 @@ def _fit(self, columns_data):

def _set_seed(self, data):
    """Derive ``self.random_seed`` from the leading values of the first column.

    The seed is the SHA-256 digest of the first column name followed by the
    string form of that column's value in each of the first five rows,
    reduced modulo ``2**32 - 1``.

    Args:
        data (pandas.DataFrame):
            Data containing the column named by ``self.columns[0]``.
    """
    column = self.columns[0]
    hash_value = column
    # Walk the rows rather than the frame itself: iterating a DataFrame yields
    # its column labels, which would make the seed independent of the values.
    for _, row in data.head(5).iterrows():
        hash_value += str(row[column])

    hash_value = int(hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16)
    self.random_seed = hash_value % ((2**32) - 1)  # maximum value for a seed
Expand Down
162 changes: 69 additions & 93 deletions tests/integration/test_hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,35 +154,35 @@ def get_transformed_data():
'integer': [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 2.0, 3.0],
'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3],
'categorical': [
0.239836,
0.233842,
0.654524,
0.994903,
0.371298,
0.659559,
0.270355,
0.120638,
0.567084448793377,
0.6093336225506188,
0.7615294627334518,
0.7399456934698997,
0.14955684499269545,
0.6892412387800234,
0.2983218715022131,
0.2666832097799694,
],
'bool': [
0.667087,
0.238123,
0.345841,
0.842023,
0.478896,
0.495079,
0.775272,
0.675913,
0.6706619134407177,
0.7141489973331722,
0.500122612214152,
0.9370483545902434,
0.131531110786568,
0.6767797543043805,
0.9996445530632698,
0.03411262483361044,
],
'datetime': datetimes,
'names': [
0.159704,
0.684242,
0.719619,
0.458355,
0.536445,
0.991478,
0.078868,
0.575187,
0.29313498219418804,
0.8305802896312711,
0.6384738989349088,
0.051422702116981564,
0.3512323081205742,
0.8890855245423491,
0.5731709189162542,
0.22257809518154342,
],
},
index=TEST_DATA_INDEX,
Expand All @@ -205,7 +205,6 @@ def test_default_inputs(self):

Input:
- A dataframe with every sdtype.
- A fixed random seed to guarantee the same values are null.

Expected behavior:
- The transformed data should contain all the ML ready data.
Expand Down Expand Up @@ -274,24 +273,24 @@ def test_default_inputs(self):
0.3,
],
'categorical': [
0.239836,
0.233842,
0.634841,
0.996602,
0.371298,
0.773039,
0.270355,
0.120638,
0.2805901104976278,
0.5376704648586699,
0.6607380757145003,
0.9546983806500966,
0.13763987779608952,
0.9144775150685426,
0.5660759260383321,
0.47380265437109054,
],
'bool': [
0.444725,
0.579374,
0.230561,
0.842023,
0.319264,
0.665026,
0.775272,
0.450609,
0.4722867116608579,
0.6243189355674945,
0.3976966355650366,
0.896655767208367,
0.0748079759632943,
0.5930938196358954,
0.7692382682525273,
0.3024284729840169,
],
'datetime': [
1.2630692571428572e18,
Expand All @@ -304,14 +303,14 @@ def test_default_inputs(self):
1.262304e18,
],
'names': [
0.159704,
0.684242,
0.719619,
0.458355,
0.536445,
0.991478,
0.078868,
0.575187,
0.29313498219418804,
0.8305802896312711,
0.6384738989349088,
0.051422702116981564,
0.3512323081205742,
0.8890855245423491,
0.5731709189162542,
0.22257809518154342,
],
},
index=TEST_DATA_INDEX,
Expand All @@ -320,7 +319,7 @@ def test_default_inputs(self):

reversed_datetimes = pd.to_datetime([
'2010-01-09',
'2010-02-01',
np.nan,
'2010-01-01',
'2010-01-01',
'2010-01-01',
Expand All @@ -331,18 +330,9 @@ def test_default_inputs(self):
expected_reversed = pd.DataFrame(
{
'integer': [1, 2, 1, 3, 1, 4, 2, 3],
'float': [
0.100000,
np.nan,
np.nan,
0.20000000000000004,
0.100000,
0.400000,
np.nan,
0.300000,
],
'float': [0.1, 0.2, 0.1, np.nan, 0.1, 0.4, 0.20000000000000004, 0.3],
'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'],
'bool': [False, False, False, True, False, False, True, False],
'bool': [False, np.nan, False, True, False, np.nan, True, False],
'datetime': reversed_datetimes,
'names': [
'Jon',
Expand Down Expand Up @@ -1237,11 +1227,11 @@ def test_reset_randomization(self):
],
'balance.component': [0.0, 0, 0, 0, 0],
'card_type': [
0.413995,
0.224306,
0.639794,
0.862760,
0.263703,
0.17901105796558806,
0.3582933494588839,
0.6532481234958804,
0.8859678246550227,
0.4245315684590038,
],
})
expected_second_transformed = pd.DataFrame({
Expand All @@ -1262,11 +1252,11 @@ def test_reset_randomization(self):
],
'balance.component': [0.0, 0, 0, 0, 0],
'card_type': [
0.314403,
0.287979,
0.714735,
0.939781,
0.251442,
0.3012879880691509,
0.2678513907358402,
0.7060422948755574,
0.9270899473086737,
0.3107417744890652,
],
})

Expand All @@ -1291,14 +1281,8 @@ def test_reset_randomization(self):
],
'age': [18, 25, 54, 60, 31],
'name': ['AAAAA', 'AAAAB', 'AAAAC', 'AAAAD', 'AAAAE'],
'signup_day': [
'01/01/2020',
'02/19/2016',
'04/01/2019',
np.nan,
np.nan,
],
'balance': [250, 5400, 150000, 61662.5, 91000],
'signup_day': ['01/01/2020', '02/19/2016', '04/01/2019', '12/01/2008', '05/16/2016'],
'balance': [np.nan, 5400.0, 150000.0, 61662.5, np.nan],
'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'],
})
expected_second_reverse = pd.DataFrame({
Expand All @@ -1313,20 +1297,20 @@ def test_reset_randomization(self):
'name': ['AAAAF', 'AAAAG', 'AAAAH', 'AAAAI', 'AAAAJ'],
'signup_day': [
'01/01/2020',
np.nan,
'02/19/2016',
'04/01/2019',
'12/01/2008',
np.nan,
'05/16/2016',
],
'balance': [np.nan, 5400, np.nan, 61662.5, 91000],
'balance': [np.nan, 5400, 150000, 61662, 91000],
'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'],
})
first_reverse1 = ht1.reverse_transform(first_transformed1)
first_reverse2 = ht2.reverse_transform(first_transformed1)
second_reverse1 = ht1.reverse_transform(first_transformed1)
pd.testing.assert_frame_equal(first_reverse1, expected_first_reverse)
pd.testing.assert_frame_equal(first_reverse2, expected_first_reverse)
pd.testing.assert_frame_equal(expected_second_reverse, second_reverse1)
pd.testing.assert_frame_equal(second_reverse1, expected_second_reverse)

# Test resetting randomization
ht1.reset_randomization()
Expand Down Expand Up @@ -1603,11 +1587,7 @@ def test_hypertransformer_with_mutli_column_transformer_end_to_end(self):
expected_transformed_data = pd.DataFrame({
'A': [1.0, 2.0, 3.0],
'B': [4.0, 5.0, 6.0],
'C': [
0.10333535312718026,
0.6697388922326716,
0.18775548909503287,
],
'C': [0.30137162079637486, 0.8122604974222468, 0.04281698463499313],
})

pd.testing.assert_frame_equal(transformed_data, expected_transformed_data)
Expand Down Expand Up @@ -1647,11 +1627,7 @@ def test_hypertransformer_with_mutli_column_transformer_and_single_column(
expected_transformed_data = pd.DataFrame({
'A': [1.0, 2.0, 3.0],
'B2': [4.0, 5.0, 6.0],
'C': [
0.04206197607326308,
0.8000968077312287,
0.06325519846695522,
],
'C': [0.30137162079637486, 0.8122604974222468, 0.04281698463499313],
})

pd.testing.assert_frame_equal(transformed_data, expected_transformed_data)
Expand Down
24 changes: 24 additions & 0 deletions tests/integration/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import numpy as np
import pandas as pd
import pytest

from rdt.hyper_transformer import HyperTransformer
from rdt.transformers import (
Expand Down Expand Up @@ -194,6 +195,29 @@ def test_transform_with_nans(self):
# Assert
pd.testing.assert_frame_equal(out, data)

def test_fit_transform_random_seeds(self):
    """Check that fit_transform output follows the fitted data.

    Two fresh encoders fitted on identical frames must produce identical
    output, while an encoder fitted on a frame with one differing value
    must not.
    """
    # Setup
    column = 'a'
    baseline = pd.DataFrame({column: [1, 2, 3]})
    altered = pd.DataFrame({column: [1, 2, 4]})
    first_encoder, twin_encoder, altered_encoder = (UniformEncoder() for _ in range(3))

    # Run
    first_output = first_encoder.fit_transform(baseline, column)
    twin_output = twin_encoder.fit_transform(baseline, column)
    altered_output = altered_encoder.fit_transform(altered, column)

    # Assert
    pd.testing.assert_frame_equal(first_output, twin_output)
    with pytest.raises(AssertionError):
        pd.testing.assert_frame_equal(first_output, altered_output)


class TestOrderedUniformEncoder:
"""Test class for the OrderedUniformEncoder."""
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/transformers/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,10 @@ def test_unixtimestampencoder_with_missing_value_replacement_random(self):

# Asserts
expect_transformed = pd.DataFrame({
'column': [-7.007396e16, 845510400000000000, -145497600000000000]
'column': [7.896217487028026e17, 8.455104e17, -1.454976e17]
})
expected_reversed = pd.DataFrame({'column': [np.nan, 'Oct 17, 1996', 'May 23, 1965']})
pd.testing.assert_frame_equal(expect_transformed, transformed)
pd.testing.assert_frame_equal(transformed, expect_transformed)
pd.testing.assert_frame_equal(reverted, expected_reversed)

def test_unixtimestampencoder_with_model_missing_values(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/transformers/test_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,5 +359,5 @@ def test_end_to_end_scrambled(self):
reverse_transformed = ht.reverse_transform(transformed)

# Assert
expected_id = pd.Series(['id_b', 'id_a', 'id_c', 'id_e', 'id_d'], name='id')
expected_id = pd.Series(['id_b', 'id_c', 'id_a', 'id_d', 'id_e'], name='id')
pd.testing.assert_series_equal(reverse_transformed['id'], expected_id)
4 changes: 2 additions & 2 deletions tests/integration/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def test_missing_value_replacement_set_to_random_and_model_missing_values(

# Assert
expected_transformed = pd.DataFrame({
'a': [1.0, 2.0, 3.0, 2.617107, 1.614805, 4.0],
'a': [1.0, 2.0, 3.0, 3.465976493452848, 1.5297519377926643, 4.0],
'a.is_null': [0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
})
pd.testing.assert_frame_equal(transformed, expected_transformed)
Expand Down Expand Up @@ -356,7 +356,7 @@ def test_missing_value_generation_random(self):

reverse = ct.reverse_transform(transformed)
expected = pd.DataFrame(
[1.0, 1.9999999510423996, 1.0, 1.9999999510423996, 1.4, 1.0],
[1.0, 1.9999999510423996, 1.0, 1.9999999510423996, 1.4, np.nan],
columns=['a'],
)
pd.testing.assert_frame_equal(reverse, expected)
Expand Down
Loading