diff --git a/python/tests/test_divmat.py b/python/tests/test_divmat.py index c95a9ab345..e65fcf3756 100644 --- a/python/tests/test_divmat.py +++ b/python/tests/test_divmat.py @@ -24,6 +24,7 @@ """ import array import collections +import functools import msprime import numpy as np @@ -258,8 +259,23 @@ def divergence_matrix( ) -def stats_api_divergence_matrix( - ts, windows=None, samples=None, sample_sets=None, mode="site", span_normalise=True +def stats_api_divergence_matrix(ts, *args, **kwargs): + return stats_api_matrix_method(ts, ts.divergence, *args, **kwargs) + + +def stats_api_genetic_relatedness_matrix(ts, *args, **kwargs): + method = functools.partial(ts.genetic_relatedness, proportion=False) + return stats_api_matrix_method(ts, method, *args, **kwargs) + + +def stats_api_matrix_method( + ts, + method, + windows=None, + samples=None, + sample_sets=None, + mode="site", + span_normalise=True, ): if samples is not None and sample_sets is not None: raise ValueError("Cannot specify both") @@ -282,19 +298,6 @@ def stats_api_divergence_matrix( else: return np.zeros(shape=(0, 0)) - # # Make sure that all the specified samples have the sample flag set, otherwise - # # the library code will complain - # tables = ts.dump_tables() - # flags = tables.nodes.flags - # # NOTE: this is a shortcut, setting all flags unconditionally to zero, so don't - # # use this tree sequence outside this method. - # flags[:] = 0 - # for sample_set in sample_sets: - # for u in sample_set: - # flags[u] = tskit.NODE_IS_SAMPLE - # tables.nodes.flags = flags - # ts = tables.tree_sequence() - # FIXME We have to go through this annoying rigmarole because windows must start and # end with 0 and L. We should relax this requirement to just making the windows # contiguous, so that we just look at specific sections of the genome. @@ -308,7 +311,7 @@ def stats_api_divergence_matrix( n = len(sample_sets) indexes = [(i, j) for i in range(n) for j in range(n)] - X = ts.divergence( + X = method( sample_sets, indexes=indexes, mode=mode, @@ -1282,7 +1285,7 @@ def test_good_args(self, arg, flattened, sizes): assert isinstance(f, np.ndarray) assert f.dtype == np.int32 assert isinstance(s, np.ndarray) - assert s.dtype == np.uint32 + assert s.dtype == np.uint64 np.testing.assert_array_equal(f, flattened) np.testing.assert_array_equal(s, sizes) @@ -1341,3 +1344,40 @@ def test_dict_args(self, arg): def test_bad_arg_types(self, arg): with pytest.raises(TypeError): tskit.TreeSequence._parse_stat_matrix_ids_arg(arg) + + +class TestGeneticRelatednessMatrix: + def check(self, ts, mode, windows=None): + G1 = stats_api_genetic_relatedness_matrix(ts, mode=mode, windows=windows) + # Seem to be out by a factor of -2, quick hack just to check + G2 = ts.genetic_relatedness_matrix(mode=mode, windows=windows) + np.testing.assert_array_almost_equal(G1, G2) + + @pytest.mark.parametrize("mode", DIVMAT_MODES) + def test_single_tree(self, mode): + # 2.00┊ 6 ┊ + # ┊ ┏━┻━┓ ┊ + # 1.00┊ 4 5 ┊ + # ┊ ┏┻┓ ┏┻┓ ┊ + # 0.00┊ 0 1 2 3 ┊ + # 0 1 + ts = tskit.Tree.generate_balanced(4).tree_sequence + ts = tsutil.insert_branch_sites(ts) + self.check(ts, mode) + + @pytest.mark.parametrize("mode", DIVMAT_MODES) + def test_single_tree_windows(self, mode): + # 2.00┊ 6 ┊ + # ┊ ┏━┻━┓ ┊ + # 1.00┊ 4 5 ┊ + # ┊ ┏┻┓ ┏┻┓ ┊ + # 0.00┊ 0 1 2 3 ┊ + # 0 1 + ts = tskit.Tree.generate_balanced(4).tree_sequence + ts = tsutil.insert_branch_sites(ts) + self.check(ts, mode, windows=[0, 0.5, 1]) + + @pytest.mark.parametrize("ts", get_example_tree_sequences()) + @pytest.mark.parametrize("mode", DIVMAT_MODES) + def test_suite_defaults(self, ts, mode): + self.check(ts, mode=mode) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index b86654339b..f347fe8b4a 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -7900,7 +7900,7 @@ def _parse_stat_matrix_ids_arg(ids): def divergence_matrix( self, - ids, + ids=None, *, windows=None, num_threads=0, @@ -8064,6 +8064,45 @@ def genetic_relatedness( return out + def genetic_relatedness_matrix( + self, + ids=None, + *, + windows=None, + num_threads=0, + mode=None, + span_normalise=True, + ): + D = self.divergence_matrix( + ids, + windows=windows, + num_threads=num_threads, + mode=mode, + span_normalise=span_normalise, + ) + + def _normalise(B): + if len(B) == 0: + return B + K = np.zeros_like(B) + N = K.shape[0] + B_mean = np.mean(B) + Bi_mean = np.mean(B, axis=0) + # TODO numpify - should be easy enough by creating full matrices + # for the row and column means. + for i in range(N): + for j in range(N): + K[i, j] = B[i, j] - Bi_mean[i] - Bi_mean[j] + B_mean + # FIXME I don't know what this factor -2 is about + return K / -2 + + if windows is None: + return _normalise(D) + else: + for j in range(D.shape[0]): + D[j] = _normalise(D[j]) + return D + def genetic_relatedness_weighted( self, W,