Skip to content

Commit

Permalink
Normalise pair coalescence counts by nonmissing span per window
Browse files Browse the repository at this point in the history
Update changelog

Update changelog

Docstring fix, more tests
  • Loading branch information
nspope authored and mergify[bot] committed Nov 21, 2024
1 parent b31d4d1 commit 8cd494a
Show file tree
Hide file tree
Showing 7 changed files with 193 additions and 13 deletions.
23 changes: 19 additions & 4 deletions c/tests/test_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -316,12 +316,15 @@ verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options)
const double *breakpoints = tsk_treeseq_get_breakpoints(ts);
const tsk_size_t P = 2;
const tsk_size_t I = P * (P + 1) / 2;
const tsk_size_t B = 8;
tsk_id_t sample_sets[n];
tsk_size_t sample_set_sizes[P];
tsk_id_t index_tuples[2 * I];
tsk_id_t node_bin_map[N];
tsk_size_t dim = T * N * I;
double C[dim];
double C_B[T * B * I];
double C_Nh[T * (N / 2) * I];
tsk_size_t i, j, k;

for (i = 0; i < n; i++) {
Expand All @@ -332,7 +335,7 @@ verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options)
sample_set_sizes[i] = 0;
}
for (j = 0; j < n; j++) {
i = j / (n / P);
i = j / ((n + P - 1) / P);
sample_set_sizes[i]++;
}

Expand All @@ -345,17 +348,17 @@ verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options)

/* test various bin assignments */
for (i = 0; i < N; i++) {
node_bin_map[i] = ((tsk_id_t) i) % 8;
node_bin_map[i] = ((tsk_id_t) (i % B));
}
ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, 8, node_bin_map, options, C);
index_tuples, T, breakpoints, B, node_bin_map, options, C_B);
CU_ASSERT_EQUAL_FATAL(ret, 0);

for (i = 0; i < N; i++) {
node_bin_map[i] = i < N / 2 ? ((tsk_id_t) i) : TSK_NULL;
}
ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I,
index_tuples, T, breakpoints, N / 2, node_bin_map, options, C);
index_tuples, T, breakpoints, N / 2, node_bin_map, options, C_Nh);
CU_ASSERT_EQUAL_FATAL(ret, 0);

for (i = 0; i < N; i++) {
Expand Down Expand Up @@ -3687,6 +3690,17 @@ test_pair_coalescence_counts(void)
tsk_treeseq_free(&ts);
}

static void
test_pair_coalescence_counts_missing(void)
{
tsk_treeseq_t ts;
tsk_treeseq_from_text(&ts, 5, missing_ex_nodes, missing_ex_edges, NULL,
NULL, NULL, NULL, NULL, 0);
verify_pair_coalescence_counts(&ts, 0);
verify_pair_coalescence_counts(&ts, TSK_STAT_SPAN_NORMALISE);
tsk_treeseq_free(&ts);
}

static void
test_pair_coalescence_quantiles(void)
{
Expand Down Expand Up @@ -3816,6 +3830,7 @@ main(int argc, char **argv)
{ "test_multiroot_divergence_matrix", test_multiroot_divergence_matrix },

{ "test_pair_coalescence_counts", test_pair_coalescence_counts },
{ "test_pair_coalescence_counts_missing", test_pair_coalescence_counts_missing },
{ "test_pair_coalescence_quantiles", test_pair_coalescence_quantiles },
{ "test_pair_coalescence_rates", test_pair_coalescence_rates },

Expand Down
27 changes: 27 additions & 0 deletions c/tests/testlib.c
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,33 @@ const char *empty_ex_nodes = "1 0.0 0 -1\n"
"1 0.0 0 -1\n";
const char *empty_ex_edges = "";

/*** An example of a tree sequence with missing marginal trees. ***/
/*
| 4 | | 4 |
| / \ | | / \ |
| 3 \ | | / 3 |
| / \ \ | | / / \ |
| 0 1 2 | | 0 1 2 |
|-|-----------|-|-----------|-|
0 1 2 3 4 5
*/
const char *missing_ex_nodes =
"1 0.0 0 -1\n"
"1 0.0 0 -1\n"
"1 0.0 0 -1\n"
"0 1.0 0 -1\n"
"0 2.0 0 -1\n";

const char *missing_ex_edges =
"1.0 2.0 3 0\n"
"1.0 2.0 3 1\n"
"3.0 4.0 3 1\n"
"3.0 4.0 3 2\n"
"3.0 4.0 4 0\n"
"1.0 2.0 4 2\n"
"1.0 2.0 4 3\n"
"3.0 4.0 4 3\n";

/* Simple utilities to parse text so we can write declaritive
* tests. This is not intended as a robust general input mechanism.
*/
Expand Down
3 changes: 3 additions & 0 deletions c/tests/testlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,7 @@ extern const char *paper_ex_sites;
extern const char *paper_ex_mutations;
extern const char *paper_ex_individuals;

extern const char *missing_ex_nodes;
extern const char *missing_ex_edges;

#endif
26 changes: 22 additions & 4 deletions c/tskit/trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -9334,9 +9334,9 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
void *summary_func_args, tsk_flags_t options, double *result)
{
int ret = 0;
double left, right, remaining_span, window_span, denominator, x, t;
double left, right, remaining_span, missing_span, window_span, denominator, x, t;
tsk_id_t e, p, c, u, v, w, i, j;
tsk_size_t num_samples;
tsk_size_t num_samples, num_edges;
tsk_tree_position_t tree_pos;
const tsk_table_collection_t *tables = self->tables;
const tsk_size_t num_nodes = tables->nodes.num_rows;
Expand Down Expand Up @@ -9449,6 +9449,8 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
goto out;
}

num_edges = 0;
missing_span = 0.0;
w = 0;
while (true) {
tsk_tree_position_next(&tree_pos);
Expand Down Expand Up @@ -9494,6 +9496,7 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
}
p = nodes_parent[p];
}
num_edges -= 1;
}

for (u = tree_pos.in.start; u != tree_pos.in.stop; u++) {
Expand Down Expand Up @@ -9530,6 +9533,11 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
c = p;
p = nodes_parent[c];
}
num_edges += 1;
}

if (num_edges == 0) {
missing_span += right - left;
}

/* flush windows */
Expand Down Expand Up @@ -9589,7 +9597,13 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
}
/* normalise weights */
if (options & (TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE)) {
window_span = windows[w + 1] - windows[w];
window_span = windows[w + 1] - windows[w] - missing_span;
missing_span = 0.0;
if (num_edges == 0) {
remaining_span = right - windows[w + 1];
window_span -= remaining_span;
missing_span += remaining_span;
}
for (i = 0; i < (tsk_id_t) num_set_indexes; i++) {
denominator = 1.0;
if (options & TSK_STAT_SPAN_NORMALISE) {
Expand All @@ -9600,7 +9614,7 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp
}
weight = GET_2D_ROW(bin_weight, num_bins, i);
for (v = 0; v < (tsk_id_t) num_bins; v++) {
weight[v] /= denominator;
weight[v] *= denominator == 0.0 ? 0.0 : 1 / denominator;
}
}
}
Expand Down Expand Up @@ -9668,6 +9682,9 @@ pair_coalescence_quantiles(tsk_size_t input_dim, const double *weight,
j = 0;
coalesced = 0.0;
timepoint = TSK_UNKNOWN_TIME;
for (i = 0; i < output_dim; i++) {
output[i] = NAN;
}
for (i = 0; i < input_dim; i++) {
if (weight[i] > 0) {
coalesced += weight[i];
Expand Down Expand Up @@ -9806,6 +9823,7 @@ pair_coalescence_rates(tsk_size_t input_dim, const double *weight, const double
} else {
rate = log(1 - weight[i] / (1 - coalesced)) / (a - b);
}
// avoid tiny negative values from fp error
output[i] = rate > 0 ? rate : 0;
coalesced += weight[i];
}
Expand Down
9 changes: 8 additions & 1 deletion python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@
[0.6.1] - 2024-XX-XX
--------------------

**Bufixes**
**Bugfixes**

- Fix to ``TreeSequence.pair_coalescence_counts`` output dimension when
provided with time windows containing no nodes (:user:`nspope`,
:issue:`3046`, :pr:`3058`)

- Fix to ``TreeSequence.pair_coalescence_counts`` to normalise by non-missing
span if ``span_normalise=True``. This resolves a bug where
``TreeSequence.pair_coalescence_rates`` would return incorrect values for
intervals with missing trees. (:user:`natep`, :issue:`3053`, :pr:`3059`)

- Fix to ``TreeSequence.pair_coalescence_rates`` causing an
assertion to be triggered by floating point error, when all coalescence events are inside a single time window (:user:`natep`, :issue:`3035`, :pr:`3038`)

--------------------
[0.6.0] - 2024-10-16
Expand Down
112 changes: 111 additions & 1 deletion python/tests/test_coalrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,36 @@ def _single_tree_example(L, T):
# --- prototype --- #


def _nonmissing_window_span(ts, windows):
num_windows = windows.size - 1
sequence_length = ts.sequence_length
missing_span = np.zeros(num_windows)
missing = 0.0
num_edges = 0
w = 0
position = tsutil.TreePosition(ts)
while position.interval.right < sequence_length:
position.next()
left, right = position.interval.left, position.interval.right
out_range, in_range = position.out_range, position.in_range
for _ in range(out_range.start, out_range.stop): # edges_out
num_edges -= 1
for _ in range(in_range.start, in_range.stop): # edges_out
num_edges += 1
if num_edges == 0:
missing += right - left
while w < num_windows and windows[w + 1] <= right: # flush window
missing_span[w] = missing
missing = 0.0
if num_edges == 0:
x = max(0, right - windows[w + 1])
missing_span[w] -= x
missing += x
w += 1
window_span = np.diff(windows) - missing_span
return window_span


def _pair_coalescence_weights(
coalescing_pairs,
nodes_time,
Expand Down Expand Up @@ -234,6 +264,9 @@ def _pair_coalescence_stat(
else:
total_pairs[i] = sizes[j] * sizes[k]

if span_normalise:
window_span = _nonmissing_window_span(ts, windows)

for i, s in enumerate(sample_sets): # initialize
nodes_sample[s, i] = 1
sample_counts = nodes_sample.copy()
Expand Down Expand Up @@ -327,7 +360,7 @@ def _pair_coalescence_stat(
nodes_values[nonzero, i] /= nodes_weight[nonzero, i]
nodes_values[~nonzero, i] = np.nan
if span_normalise:
nodes_weight /= windows[w + 1] - windows[w]
nodes_weight /= window_span[w]
if pair_normalise:
nodes_weight /= total_pairs[np.newaxis, :]
for i in range(num_indexes): # apply function to empirical distribution
Expand Down Expand Up @@ -1261,6 +1294,40 @@ def test_span_normalise(self):
proto = proto_pair_coalescence_counts(ts, windows=windows, span_normalise=False)
np.testing.assert_allclose(proto, check)

def test_span_normalise_with_missing(self):
"""
test case where span is normalised and there are intervals without trees
"""
ts = self.example_ts()
missing = np.array([[0.0, 0.1], [0.8, 1.0]]) * ts.sequence_length
ts = ts.delete_intervals(missing)
windows = np.array([0.0, 0.33, 1.0]) * ts.sequence_length
window_size = np.diff(windows) - np.diff(missing, axis=1).flatten()
check = (
ts.pair_coalescence_counts(windows=windows, span_normalise=False)
/ window_size[:, np.newaxis]
)
implm = ts.pair_coalescence_counts(windows=windows, span_normalise=True)
np.testing.assert_allclose(implm, check)
# TODO: remove with prototype
proto = proto_pair_coalescence_counts(ts, windows=windows, span_normalise=True)
np.testing.assert_allclose(proto, check)

def test_empty_windows(self):
"""
test that windows without nodes contain zeros
"""
ts = self.example_ts()
missing = np.array([[0.0, 0.1], [0.8, 1.0]]) * ts.sequence_length
ts = ts.delete_intervals(missing)
windows = np.concatenate(missing)
check = ts.pair_coalescence_counts(windows=windows, span_normalise=False)
implm = ts.pair_coalescence_counts(windows=windows, span_normalise=True)
np.testing.assert_allclose(check[0], 0.0)
np.testing.assert_allclose(check[2], 0.0)
np.testing.assert_allclose(implm[0], 0.0)
np.testing.assert_allclose(implm[2], 0.0)

def test_pair_normalise(self):
ts = self.example_ts()
windows = np.array([0.0, 0.33, 1.0]) * ts.sequence_length
Expand Down Expand Up @@ -1638,6 +1705,19 @@ def test_long_sequence(self):
)
np.testing.assert_allclose(quants, np.tile(ck_quants, (windows.size - 1, 1)))

def test_empty_windows(self):
"""
test case where a window has no nodes
"""
ts = self.example_ts()
missing = np.array([[0.0, 0.1], [0.8, 1.0]]) * ts.sequence_length
ts = ts.delete_intervals(missing)
windows = np.concatenate(missing)
quantiles = np.linspace(0, 1, 10)
check = ts.pair_coalescence_quantiles(windows=windows, quantiles=quantiles)
assert np.all(np.isnan(check[0]))
assert np.all(np.isnan(check[2]))


class TestPairCoalescenceRates:
"""
Expand Down Expand Up @@ -1785,3 +1865,33 @@ def test_extra_time_windows(self):
assert implm.shape == (time_windows.size - 1,)
max_idx = np.searchsorted(time_windows, max_time, side="right")
assert np.all(np.isnan(implm[max_idx:]))

def test_missing_sequence(self):
"""
test that missing intervals are ignored when calculating rates
"""
ts = self.example_ts()
missing = np.array([[0.0, 0.1], [0.9, 1.0]]) * ts.sequence_length
ts = ts.delete_intervals(missing)
windows = np.array([0.0, 0.5, 1.0]) * ts.sequence_length
ts_trim = ts.trim()
windows_trim = np.array([0.0, 0.5, 1.0]) * ts_trim.sequence_length
time_windows = np.linspace(0, ts.nodes_time.max() * 2, 10)
time_windows[-1] = np.inf
implm = ts.pair_coalescence_rates(time_windows, windows=windows)
check = ts_trim.pair_coalescence_rates(time_windows, windows=windows_trim)
np.testing.assert_allclose(implm, check)

def test_empty_windows(self):
"""
test case where a window has no nodes
"""
ts = self.example_ts()
missing = np.array([[0.0, 0.1], [0.8, 1.0]]) * ts.sequence_length
ts = ts.delete_intervals(missing)
windows = np.concatenate(missing)
time_windows = np.linspace(0, ts.nodes_time.max() * 2, 10)
time_windows[-1] = np.inf
check = ts.pair_coalescence_rates(time_windows, windows=windows)
assert np.all(np.isnan(check[0]))
assert np.all(np.isnan(check[2]))
6 changes: 3 additions & 3 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -9718,7 +9718,7 @@ def pair_coalescence_counts(
:param list windows: An increasing list of breakpoints between the
sequence windows to compute the statistic in, or None.
:param bool span_normalise: Whether to divide the result by the span of
the window (defaults to True).
non-missing sequence in the window (defaults to True).
:param bool pair_normalise: Whether to divide the result by the total
number of pairs for a given index (defaults to False).
:param time_windows: Either a string "nodes" or an increasing
Expand Down Expand Up @@ -9893,8 +9893,8 @@ def pair_coalescence_rates(
The first breakpoint in `time_windows` must start at the age of the
samples, and the last must end at infinity. In the output array, any
time windows that do not contain coalescence events will have `NaN`
values.
time windows where all pairs have coalesced by start of the window will
contain `NaN` values.
Pair coalescence rates may be calculated within or between the
non-overlapping lists of samples contained in `sample_sets`. In the
Expand Down

0 comments on commit 8cd494a

Please sign in to comment.