From 58c77be86d510917dcd71dc993808cad45d90760 Mon Sep 17 00:00:00 2001 From: Avery Chan Date: Tue, 12 Oct 2021 18:15:33 -0500 Subject: [PATCH 1/2] add __init__.py file --- mastml/__init__.py | 0 mastml/data_splitters.py | 59 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 mastml/__init__.py diff --git a/mastml/__init__.py b/mastml/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mastml/data_splitters.py b/mastml/data_splitters.py index 222ffdd2..56b6e371 100644 --- a/mastml/data_splitters.py +++ b/mastml/data_splitters.py @@ -57,6 +57,8 @@ import warnings import shutil from scipy.spatial.distance import minkowski + +from mastml.feature_generators import ElementalFractionGenerator try: import keras except: @@ -1415,6 +1417,63 @@ def split(self, X, y=None, groups=None): def get_n_splits(self, X=None, y=None, groups=None): return len(X) +class LeaveOutCloseCompositions_INPROGRESS(BaseSplitter): + """ + Leave-P-out where you exclude materials with compositions close to those the test set + + Args: + composition_df (pd.DataFrame): dataframe containing the vector of material compositions to analyze + + dist_threshold (float): Entries must be farther than this distance to be included in the training set + + nn_kwargs (dict): Keyword arguments for the scikit-learn NearestNeighbor class used to find nearest points + + """ + + def __init__(self, composition_df, dist_threshold=0.1, nn_kwargs=None): + super(LeaveCloseCompositionsOut, self).__init__() + if nn_kwargs is None: + nn_kwargs = {} + self.composition_df = composition_df + self.dist_threshold = dist_threshold + self.nn_kwargs = nn_kwargs + + # composition_df willl be generated by + # ElementalFractionGenerator() + + pass + + def split(self, X, y=None, groups=None): + + # # Generate the composition vectors + # frac_computer = ElementFraction() + # elem_fracs = frac_computer.featurize_many(list(map(Composition, self.composition_df[self.composition_df.columns[0]])), pbar=False) + + # # Generate the nearest-neighbor lookup tool + # neigh = NearestNeighbors(**self.nn_kwargs) + # neigh.fit(elem_fracs) + + # # Generate a list of all entries + # all_inds = np.arange(0, self.composition_df.shape[0], 1) + + # # Loop through each entry in X + # trains_tests = list() + # for i, x in enumerate(elem_fracs): + + # # Get all the entries within the threshold distance of the test point + # too_close, = neigh.radius_neighbors([x], self.dist_threshold, return_distance=False) + + # # Get the training set as "not these points" + # train_inds = np.setdiff1d(all_inds, too_close) + # test_inds = np.setdiff1d(all_inds, train_inds) + + # trains_tests.append((np.asarray(train_inds), np.asarray(test_inds))) + # return trains_tests + return [] + + def get_n_splits(self, X=None, y=None, groups=None): + return len(X) + class LeaveOutPercent(BaseSplitter): """ From f2b1129ca1f243fd9391c63f023c52f7c740502b Mon Sep 17 00:00:00 2001 From: Avery Chan Date: Tue, 12 Oct 2021 18:26:23 -0500 Subject: [PATCH 2/2] undo wrong changes --- mastml/data_splitters.py | 59 ---------------------------------------- 1 file changed, 59 deletions(-) diff --git a/mastml/data_splitters.py b/mastml/data_splitters.py index 56b6e371..222ffdd2 100644 --- a/mastml/data_splitters.py +++ b/mastml/data_splitters.py @@ -57,8 +57,6 @@ import warnings import shutil from scipy.spatial.distance import minkowski - -from mastml.feature_generators import ElementalFractionGenerator try: import keras except: @@ -1417,63 +1415,6 @@ def split(self, X, y=None, groups=None): def get_n_splits(self, X=None, y=None, groups=None): return len(X) -class LeaveOutCloseCompositions_INPROGRESS(BaseSplitter): - """ - Leave-P-out where you exclude materials with compositions close to those the test set - - Args: - composition_df (pd.DataFrame): dataframe containing the vector of material compositions to analyze - - dist_threshold (float): Entries must be farther than this distance to be included in the training set - - nn_kwargs (dict): Keyword arguments for the scikit-learn NearestNeighbor class used to find nearest points - - """ - - def __init__(self, composition_df, dist_threshold=0.1, nn_kwargs=None): - super(LeaveCloseCompositionsOut, self).__init__() - if nn_kwargs is None: - nn_kwargs = {} - self.composition_df = composition_df - self.dist_threshold = dist_threshold - self.nn_kwargs = nn_kwargs - - # composition_df willl be generated by - # ElementalFractionGenerator() - - pass - - def split(self, X, y=None, groups=None): - - # # Generate the composition vectors - # frac_computer = ElementFraction() - # elem_fracs = frac_computer.featurize_many(list(map(Composition, self.composition_df[self.composition_df.columns[0]])), pbar=False) - - # # Generate the nearest-neighbor lookup tool - # neigh = NearestNeighbors(**self.nn_kwargs) - # neigh.fit(elem_fracs) - - # # Generate a list of all entries - # all_inds = np.arange(0, self.composition_df.shape[0], 1) - - # # Loop through each entry in X - # trains_tests = list() - # for i, x in enumerate(elem_fracs): - - # # Get all the entries within the threshold distance of the test point - # too_close, = neigh.radius_neighbors([x], self.dist_threshold, return_distance=False) - - # # Get the training set as "not these points" - # train_inds = np.setdiff1d(all_inds, too_close) - # test_inds = np.setdiff1d(all_inds, train_inds) - - # trains_tests.append((np.asarray(train_inds), np.asarray(test_inds))) - # return trains_tests - return [] - - def get_n_splits(self, X=None, y=None, groups=None): - return len(X) - class LeaveOutPercent(BaseSplitter): """