Skip to content

Commit

Permalink
Implement draft "ids" arg
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Aug 23, 2023
1 parent f0e69f0 commit 7c5d1d5
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 133 deletions.
2 changes: 1 addition & 1 deletion c/tests/test_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -2176,7 +2176,7 @@ test_simplest_divergence_matrix_internal_sample(void)
{
const char *nodes = "1 0 0\n"
"1 0 0\n"
"0 1 0\n";
"1 1 0\n";
const char *edges = "0 1 2 0,1\n";
tsk_treeseq_t ts;
tsk_id_t sample_ids[] = { 0, 1, 2 };
Expand Down
68 changes: 45 additions & 23 deletions c/tskit/trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -6516,7 +6516,7 @@ tsk_treeseq_divergence_matrix_branch(const tsk_treeseq_t *self,
double tu, tv, d, span, left, right, span_left, span_right;
double *restrict D;
sv_tables_t sv;
tsk_size_t *ss_offsets = malloc((num_sample_sets + 1) * sizeof(*ss_offsets));
tsk_size_t *ss_offsets = tsk_malloc((num_sample_sets + 1) * sizeof(*ss_offsets));

memset(&sv, 0, sizeof(sv));
ret = tsk_tree_init(&tree, self, 0);
Expand Down Expand Up @@ -6557,11 +6557,18 @@ tsk_treeseq_divergence_matrix_branch(const tsk_treeseq_t *self,
span = span_right - span_left;
sv_tables_build(&sv, &tree);
for (sj = 0; sj < N; sj++) {
/* printf("set sj = %d\n", (int) sj); */
for (j = ss_offsets[sj]; j < ss_offsets[sj + 1]; j++) {
u = sample_sets[j];
/* printf("Set %d offset %d = %d\n", (int) sj, (int) j, u); */
for (sk = sj; sk < N; sk++) {
/* printf("\tset sk = %d\n", (int) sk); */
for (k = ss_offsets[sk]; k < ss_offsets[sk + 1]; k++) {
v = sample_sets[k];
/* printf("\t\tSet %d offset %d = %d\n", (int) sk, (int) k,
* v); */
/* printf("\t\tu = %d v = %d\n", u, v); */

if (u == v) {
/* This case contributes zero to divergence, so
* short-circuit to save time.
Expand All @@ -6581,7 +6588,7 @@ tsk_treeseq_divergence_matrix_branch(const tsk_treeseq_t *self,
tu = nodes_time[u_root] - nodes_time[u];
tv = nodes_time[v_root] - nodes_time[v];
d = (tu + tv) * span;
D[j * N + k] += d;
D[sj * N + sk] += d;
}
}
}
Expand Down Expand Up @@ -6614,6 +6621,7 @@ update_site_divergence(const tsk_variant_t *var, const tsk_id_t *restrict A,
const tsk_size_t num_alleles = var->num_alleles;
tsk_size_t a, b, j, k;
tsk_id_t u, v;
double increment;

for (a = 0; a < num_alleles; a++) {
for (b = a + 1; b < num_alleles; b++) {
Expand All @@ -6627,7 +6635,11 @@ update_site_divergence(const tsk_variant_t *var, const tsk_id_t *restrict A,
u = A[k];
v = A[j];
}
D[u * (tsk_id_t) num_sample_sets + v]++;
increment = 1;
if (u == v) {
increment = 2;
}
D[u * (tsk_id_t) num_sample_sets + v] += increment;
}
}
}
Expand Down Expand Up @@ -6662,9 +6674,10 @@ remap_to_sample_sets(const tsk_size_t num_samples, const tsk_id_t *restrict samp
{
tsk_size_t j;
tsk_id_t u;

for (j = 0; j < num_samples; j++) {
u = samples[A[j]];
tsk_bug_assert(u >= 0);
tsk_bug_assert(sample_set_index_map[u] >= 0);
A[j] = sample_set_index_map[u];
}
}
Expand All @@ -6689,6 +6702,8 @@ tsk_treeseq_divergence_matrix_site(const tsk_treeseq_t *self, tsk_size_t num_sam
tsk_size_t *allele_offsets = NULL;
tsk_variant_t variant;

/* FIXME it's not clear that using TSK_ISOLATED_NOT_MISSING is
* correct here */
ret = tsk_variant_init(
&variant, self, samples, num_samples, NULL, TSK_ISOLATED_NOT_MISSING);
if (ret != 0) {
Expand Down Expand Up @@ -6747,39 +6762,37 @@ tsk_treeseq_divergence_matrix_site(const tsk_treeseq_t *self, tsk_size_t num_sam
* set.
*/
static int
get_sample_set_index_map(const tsk_size_t num_nodes, const tsk_size_t num_sample_sets,
get_sample_set_index_map(const tsk_treeseq_t *self, const tsk_size_t num_sample_sets,
const tsk_size_t *restrict sample_set_sizes, const tsk_id_t *restrict sample_sets,
tsk_size_t *ret_total_samples, tsk_id_t **ret_sample_set_index_map)
tsk_size_t *ret_total_samples, tsk_id_t *restrict node_index_map)
{
int ret = 0;
tsk_size_t i, j, k;
tsk_id_t u;
tsk_size_t total_samples = 0;
tsk_id_t *restrict node_index_map = tsk_malloc(num_nodes * sizeof(*node_index_map));

if (node_index_map == NULL) {
ret = TSK_ERR_NO_MEMORY;
goto out;
}
/* Assign the output pointer here so that it will be freed in the case
* of an error raised in the input checking */
*ret_sample_set_index_map = node_index_map;
const tsk_size_t num_nodes = self->tables->nodes.num_rows;
const tsk_flags_t *restrict node_flags = self->tables->nodes.flags;

for (j = 0; j < num_nodes; j++) {
node_index_map[j] = TSK_NULL;
}
/* printf("START\n"); */
i = 0;
for (j = 0; j < num_sample_sets; j++) {
total_samples += sample_set_sizes[j];
for (k = 0; k < sample_set_sizes[j]; k++) {
/* printf("j = %d k = %d i = %d\n", (int) j, (int) k, (int) i); */
u = sample_sets[i];
i++;
if (u < 0 || u >= (tsk_id_t) num_nodes) {
ret = TSK_ERR_NODE_OUT_OF_BOUNDS;
goto out;
}
/* Note: we require nodes to be samples because we have to think
* about how to normalise by the length of genome that the node
* is 'in' the tree for each window otherwise. */
if (!(node_flags[u] & TSK_NODE_IS_SAMPLE)) {
ret = TSK_ERR_BAD_SAMPLES;
goto out;
}
if (node_index_map[u] != TSK_NULL) {
ret = TSK_ERR_DUPLICATE_SAMPLE;
goto out;
Expand All @@ -6793,17 +6806,24 @@ get_sample_set_index_map(const tsk_size_t num_nodes, const tsk_size_t num_sample
}

static void
fill_lower_triangle(
double *restrict result, const tsk_size_t n, const tsk_size_t num_windows)
fill_lower_triangle_count_normalise(const tsk_size_t num_windows, const tsk_size_t n,
const tsk_size_t *set_sizes, double *restrict result)
{
tsk_size_t i, j, k;
double denom;
double *restrict D;

/* TODO there's probably a better striding pattern that could be used here */
for (i = 0; i < num_windows; i++) {
D = result + i * n * n;
for (j = 0; j < n; j++) {
denom = (double) set_sizes[j] * (double) (set_sizes[j] - 1);
if (denom != 0) {
D[j * n + j] /= denom;
}
for (k = j + 1; k < n; k++) {
denom = (double) set_sizes[j] * (double) set_sizes[k];
D[j * n + k] /= denom;
D[k * n + j] = D[j * n + k];
}
}
Expand All @@ -6825,7 +6845,8 @@ tsk_treeseq_divergence_matrix(const tsk_treeseq_t *self, tsk_size_t num_sample_s
bool stat_site = !!(options & TSK_STAT_SITE);
bool stat_branch = !!(options & TSK_STAT_BRANCH);
bool stat_node = !!(options & TSK_STAT_NODE);
tsk_id_t *sample_set_index_map = NULL;
tsk_id_t *sample_set_index_map
= tsk_malloc(num_nodes * sizeof(*sample_set_index_map));
tsk_size_t j;

if (stat_node) {
Expand Down Expand Up @@ -6880,9 +6901,10 @@ tsk_treeseq_divergence_matrix(const tsk_treeseq_t *self, tsk_size_t num_sample_s
}
sample_set_sizes = tmp_sample_set_sizes;
}

/* printf("N = %d\n", (int) N); */
ret = get_sample_set_index_map(num_nodes, N, sample_set_sizes, sample_sets,
&total_samples, &sample_set_index_map);
ret = get_sample_set_index_map(
self, N, sample_set_sizes, sample_sets, &total_samples, sample_set_index_map);
if (ret != 0) {
goto out;
}
Expand All @@ -6900,7 +6922,7 @@ tsk_treeseq_divergence_matrix(const tsk_treeseq_t *self, tsk_size_t num_sample_s
if (ret != 0) {
goto out;
}
fill_lower_triangle(result, N, num_windows);
fill_lower_triangle_count_normalise(num_windows, N, sample_set_sizes, result);

if (options & TSK_STAT_SPAN_NORMALISE) {
span_normalise(num_windows, windows, N * N, result);
Expand Down
6 changes: 3 additions & 3 deletions python/_tskitmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -9794,9 +9794,9 @@ TreeSequence_divergence_matrix(TreeSequence *self, PyObject *args, PyObject *kwd
if (span_normalise) {
options |= TSK_STAT_SPAN_NORMALISE;
}
printf("num sets = %d\n", (int) num_sample_sets);
printf("sizes = %p\n", (void *) sample_set_sizes);
printf("sample_sets = %p\n", (void *) sample_sets);
/* printf("num sets = %d\n", (int) num_sample_sets); */
/* printf("sizes = %p\n", (void *) sample_set_sizes); */
/* printf("sample_sets = %p\n", (void *) sample_sets); */

// clang-format off
Py_BEGIN_ALLOW_THREADS
Expand Down
Loading

0 comments on commit 7c5d1d5

Please sign in to comment.