#!/usr/bin/env python
# Provenance: Scripts/compareLineages.py (new file, index 0000000..3f40a32) as
# added by patch fe6419ff1391d0867821eafdfa10b8c8bca86266, "[PATCH] Adding
# script files", From: mish-su <74434157+mish-su@users.noreply.github.com>,
# Date: Thu, 15 Jun 2023 13:54:35 -0400.  The same patch (6 files changed,
# 649 insertions) also creates Scripts/expected.13.14.tsv,
# Scripts/expected.14.15.tsv, Scripts/expected.15.16.tsv,
# Scripts/expected.2021-11-09_v1.2.133.tsv and Scripts/sankey_plots.R.
"""Compare maple and pangolin lineage calls for mismatching samples.

Reads ``unique_mismatches_maple_vs_pangolin.tsv`` (tab-separated, with a
header row containing ``maple`` and ``pangolin`` columns) from the current
working directory and prints one tab-separated row per input row to stdout,
preceded by the header ``maple  pangolin  isAncestral  distance``:

* ``isAncestral`` is ``"1"`` when one uncompressed lineage name is a
  segment-wise prefix of (or equal to) the other, otherwise ``"0"``.
* ``distance`` is the number of parent/child steps between the two
  uncompressed lineage names, or ``"R"`` when either name starts with ``X``
  (presumably a recombinant lineage -- confirm against the Pango rules).

Lineage aliases are expanded with ``pango_aliasor`` before comparison.
"""

import os
import csv
import itertools
from typing import Sequence, Tuple

INPUT_PATH = 'unique_mismatches_maple_vs_pangolin.tsv'


def shared_prefix_length(left: Sequence[str], right: Sequence[str]) -> int:
    """Return how many leading segments *left* and *right* have in common."""
    matching = itertools.takewhile(
        lambda pair: pair[0] == pair[1], zip(left, right)
    )
    return sum(1 for _ in matching)


def is_ancestral(maple_unc: str, pangolin_unc: str) -> bool:
    """Return True when one lineage is an ancestor of (or equal to) the other.

    The comparison is made per dot-separated segment.  A plain string
    ``startswith`` is wrong here: ``"B.1.10".startswith("B.1.1")`` is true
    although ``B.1.1`` is a sibling of ``B.1.10``, not its ancestor.
    """
    m_parts = maple_unc.split('.')
    p_parts = pangolin_unc.split('.')
    shared = shared_prefix_length(m_parts, p_parts)
    return shared == min(len(m_parts), len(p_parts))


def lineage_distance(maple_unc: str, pangolin_unc: str) -> int:
    """Return the number of steps between two uncompressed lineage names.

    The distance is the number of segments each name has beyond their common
    segment-wise prefix, summed over both names.  Segments that happen to be
    equal *after* the names have diverged still count, because they sit under
    different parents (``B.1.2.5`` vs ``B.1.3.5`` is 4 steps apart, not 2).
    """
    m_parts = maple_unc.split('.')
    p_parts = pangolin_unc.split('.')
    shared = shared_prefix_length(m_parts, p_parts)
    return (len(m_parts) - shared) + (len(p_parts) - shared)


def compare_lineages(maple_unc: str, pangolin_unc: str) -> Tuple[str, str]:
    """Return the ``(isAncestral, distance)`` output fields for one pair.

    Both arguments must already be uncompressed (alias-free) lineage names.
    ``isAncestral`` is ``"1"`` or ``"0"``; ``distance`` is the decimal step
    count, or ``"R"`` when either name starts with ``"X"``.
    """
    is_anc = "1" if is_ancestral(maple_unc, pangolin_unc) else "0"
    if maple_unc.startswith("X") or pangolin_unc.startswith("X"):
        dist = "R"
    else:
        dist = str(lineage_distance(maple_unc, pangolin_unc))
    return is_anc, dist


def main() -> None:
    """Read the mismatch table and print the comparison table to stdout."""
    # Imported here so the pure helpers above stay importable without the
    # third-party pango_aliasor package being installed.
    from pango_aliasor.aliasor import Aliasor

    aliasor = Aliasor()

    print("\t".join(["maple", "pangolin", "isAncestral", "distance"]))
    # newline="" is what the csv module documentation asks for on file objects.
    with open(INPUT_PATH, 'r', newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            is_anc, dist = compare_lineages(
                aliasor.uncompress(row["maple"]),
                aliasor.uncompress(row["pangolin"]),
            )
            print("\t".join([row["maple"], row["pangolin"], is_anc, dist]))


if __name__ == "__main__":
    main()

# The enclosing patch continues with Scripts/expected.13.14.tsv
# (new file mode 100644, index 0000000..ee0825c).
b/Scripts/expected.13.14.tsv @@ -0,0 +1,10 @@ +source target reason +AY.4 AY.4.1 new .78 +B.1.351 B.1.351.5 new .77 +B.1.617.2 AY.33 new .77 +B.1.617.2 AY.34 new .78 +B.1.617.2 AY.35 new .78 +B.1.617.2 AY.36 new .80 +B.1.617.2 AY.37 new .80 +B.1.617.2 AY.38 new .80 +P.1 P.1.12 new .77 \ No newline at end of file diff --git a/Scripts/expected.14.15.tsv b/Scripts/expected.14.15.tsv new file mode 100644 index 0000000..c7ccafa --- /dev/null +++ b/Scripts/expected.14.15.tsv @@ -0,0 +1,44 @@ +source target reason +AY.23 AY.23.1 new .86 +AY.29 AY.29.1 new .84 +AY.4 AY.4.2 new .82 +AY.4 AY.4.3 new .83 +AY.4 AY.4.4 new .86 +AY.4 AY.4.5 new .86 +B.1.617.2 AY.39 new .84 +B.1.617.2 AY.39.1 new .84 +B.1.617.2 AY.39.1.1 new .84 +B.1.617.2 AY.40 new .86 +B.1.617.2 AY.41 new .86 +B.1.617.2 AY.42 new .87 +B.1.617.2 AY.43 new .87 +B.1.617.2 AY.44 new .87 +B.1.617.2 AY.45 new .87 +B.1.617.2 AY.46 new .88 +B.1.617.2 AY.46.1 new .88 +B.1.617.2 AY.46.2 new .88 +B.1.617.2 AY.46.3 new .88 +B.1.617.2 AY.46.4 new .88 +B.1.617.2 AY.46.5 new .88 +B.1.617.2 AY.46.6 new .88 +B.1.617.2 AY.47 new .88 +P.1 P.1.1 updated .82 +P.1 P.1.2 updated .82 +P.1 P.1.3 updated .82 +P.1 P.1.4 updated .82 +P.1 P.1.5 updated .82 +P.1 P.1.6 updated .82 +P.1 P.1.7 updated .82 +P.1 P.1.8 updated .85 +P.1 P.1.9 updated .85 +P.1 P.1.10 updated .85 +P.1 P.1.11 updated .85 +P.1 P.1.12 updated .85 +P.1 P.1.13 new .85 +P.1 P.1.14 new .85 +P.1 P.1.15 new .85 +P.1 P.1.16 new .85 +P.1 P.1.17 new .85 +P.1 P.1.17.1 new .85 +P.1.10 P.1.10.1 updated .85 +P.1.10 P.1.10.2 updated .85 \ No newline at end of file diff --git a/Scripts/expected.15.16.tsv b/Scripts/expected.15.16.tsv new file mode 100644 index 0000000..3f5bc88 --- /dev/null +++ b/Scripts/expected.15.16.tsv @@ -0,0 +1,92 @@ +source target reason +AY.16 AY.16.1 new .89 +AY.34 AY.34.1 new .92 +AY.39 AY.39.2 new .91 +AY.4.2 AY.4.2.1 new .93 +AY.9 AY.9.1 new .90 +AY.9 AY.9.2 new .90 +AY.9 AY.9.2.1 new .90 +B.1 B.1.639 new .93 +B.1.617.2 AY.48 new .89 +B.1.617.2 AY.49 new 
.89 +B.1.617.2 AY.50 new .89 +B.1.617.2 AY.51 new .89 +B.1.617.2 AY.52 new .89 +B.1.617.2 AY.53 new .89 +B.1.617.2 AY.54 new .89 +B.1.617.2 AY.55 new .89 +B.1.617.2 AY.56 new .89 +B.1.617.2 AY.57 new .89 +B.1.617.2 AY.58 new .89 +B.1.617.2 AY.59 new .89 +B.1.617.2 AY.60 new .89 +B.1.617.2 AY.61 new .89 +B.1.617.2 AY.62 new .89 +B.1.617.2 AY.63 new .89 +B.1.617.2 AY.64 new .89 +B.1.617.2 AY.65 new .89 +B.1.617.2 AY.66 new .89 +B.1.617.2 AY.67 new .89 +B.1.617.2 AY.68 new .89 +B.1.617.2 AY.69 new .89 +B.1.617.2 AY.70 new .89 +B.1.617.2 AY.71 new .89 +B.1.617.2 AY.72 new .89 +B.1.617.2 AY.73 new .89 +B.1.617.2 AY.74 new .89 +B.1.617.2 AY.75 new .89 +B.1.617.2 AY.75.1 new .89 +B.1.617.2 AY.76 new .90 +B.1.617.2 AY.77 new .90 +B.1.617.2 AY.78 new .90 +B.1.617.2 AY.79 new .90 +B.1.617.2 AY.80 new .90 +B.1.617.2 AY.81 new .90 +B.1.617.2 AY.82 new .90 +B.1.617.2 AY.83 new .90 +B.1.617.2 AY.84 new .90 +B.1.617.2 AY.85 new .90 +B.1.617.2 AY.86 new .90 +B.1.617.2 AY.87 new .90 +B.1.617.2 AY.88 new .90 +B.1.617.2 AY.89 new .90 +B.1.617.2 AY.90 new .90 +B.1.617.2 AY.91 new .90 +B.1.617.2 AY.91.1 new .90 +B.1.617.2 AY.92 new .90 +B.1.617.2 AY.93 new .90 +B.1.617.2 AY.94 new .90 +B.1.617.2 AY.95 new .90 +B.1.617.2 AY.96 new .90 +B.1.617.2 AY.97 new .90 +B.1.617.2 AY.98 new .90 +B.1.617.2 AY.98.1 new .90 +B.1.617.2 AY.99 new .90 +B.1.617.2 AY.99.1 new .90 +B.1.617.2 AY.99.2 new .90 +B.1.617.2 AY.100 new .90 +B.1.617.2 AY.101 new .90 +B.1.617.2 AY.102 new .90 +B.1.617.2 AY.103 new .92 +B.1.617.2 AY.104 new .92 +B.1.617.2 AY.105 new .92 +B.1.617.2 AY.106 new .92 +B.1.617.2 AY.107 new .92 +B.1.617.2 AY.108 new .92 +B.1.617.2 AY.109 new .92 +B.1.617.2 AY.110 new .92 +B.1.617.2 AY.111 new .92 +B.1.617.2 AY.112 new .92 +B.1.617.2 AY.113 new .92 +B.1.617.2 AY.114 new .92 +B.1.617.2 AY.115 new .92 +B.1.617.2 AY.116 new .92 +B.1.617.2 AY.116.1 new .92 +B.1.617.2 AY.117 new .92 +B.1.617.2 AY.118 new .92 +B.1.617.2 AY.119 new .92 +B.1.617.2 AY.120 new .92 +B.1.617.2 AY.120.1 new .92 
+B.1.617.2 AY.120.2 new .92 +B.1.617.2 AY.120.2.1 new .92 +B.1.617.2 AY.121 new .93 \ No newline at end of file diff --git a/Scripts/expected.2021-11-09_v1.2.133.tsv b/Scripts/expected.2021-11-09_v1.2.133.tsv new file mode 100644 index 0000000..7f4100f --- /dev/null +++ b/Scripts/expected.2021-11-09_v1.2.133.tsv @@ -0,0 +1,106 @@ +source target +AY.102 AY.102.1 +AY.102 AY.102.2 +AY.103 AY.103.1 +AY.103 AY.103.2 +AY.112 AY.112.1 +AY.115 AY.116 +AY.119 AY.119.1 +AY.119 AY.119.2 +AY.12 AY.121.1 +AY.122 AY.122.2 +AY.122 AY.122.3 +AY.122 AY.122.4 +AY.122 AY.122.5 +AY.124 AY.124.1 +AY.20 AY.20.1 +AY.23 AY.23.2 +AY.24 AY.24.1 +AY.25 AY.25.1 +AY.25 AY.25.1.1 +AY.25 AY.25.1.2 +AY.25 AY.25.2 +AY.25 AY.25.3 +AY.26 AY.26.1 +AY.3 AY.3.2 +AY.3 AY.3.3 +AY.3 AY.3.4 +AY.33 AY.33.1 +AY.34 AY.34.1 +AY.34 AY.34.2 +AY.36 AY.36.1 +AY.39 AY.39.3 +AY.39.1 AY.39.1.2 +AY.39.1 AY.39.1.3 +AY.4 AY.4.10 +AY.4 AY.4.11 +AY.4 AY.4.12 +AY.4 AY.4.13 +AY.4 AY.4.14 +AY.4 AY.4.15 +AY.4 AY.4.16 +AY.4 AY.4.17 +AY.4 AY.4.6 +AY.4 AY.4.7 +AY.4 AY.4.8 +AY.4 AY.4.9 +AY.4.2 AY.4.2.1 +AY.4.2 AY.4.2.2 +AY.4.2 AY.4.2.3 +AY.4.2 AY.4.2.4 +AY.42 AY.42.1 +AY.43 AY.43.1 +AY.43 AY.43.2 +AY.43 AY.43.3 +AY.43 AY.43.4 +AY.43 AY.43.5 +AY.43 AY.43.6 +AY.43 AY.43.7 +AY.43 AY.43.8 +AY.46.6 AY.46.6.1 +AY.5 AY.5.6 +AY.75 AY.75.2 +AY.75 AY.75.3 +AY.75.1 AY.75 +AY.75.1 AY.75.2 +AY.75.1 AY.75.3 +AY.89 AY.4 +AY.9.1 AY.9.2 +AY.9.2 AY.9.2.2 +AY.96 AY.46 +AY.96 AY.46.2 +AY.96 AY.46.6 +AY.96 B.1.617.2 +AY.97 AY.5.5 +B.1 B.1.639 +B.1 B.1.640 +B.1 B.1.640.1 +B.1 B.1.640.2 +B.1.177.79 B.1.177.60 +B.1.617.2 AY.112 +B.1.617.2 AY.121 +B.1.617.2 AY.121.1 +B.1.617.2 AY.122 +B.1.617.2 AY.123 +B.1.617.2 AY.123.1 +B.1.617.2 AY.124 +B.1.617.2 AY.124.1 +B.1.617.2 AY.124.1.1 +B.1.617.2 AY.125 +B.1.617.2 AY.125.1 +B.1.617.2 AY.126 +B.1.617.2 AY.127 +B.1.617.2 AY.127.1 +B.1.617.2 AY.127.2 +B.1.617.2 AY.128 +B.1.617.2 AY.129 +B.1.617.2 AY.130 +B.1.617.2 AY.131 +B.1.617.2 AY.132 +B.1.617.2 AY.133 +B.1.621 B.1.621.2 +B.1.621.1 BB.2 +B.1.628 XB +B.1.637 
# Provenance: Scripts/sankey_plots.R (new file mode 100644, index
# 0000000..b795aff) from the patch "Adding script files".
# NOTE: in the flattened patch, the source line on which this script starts
# also carries the final rows of Scripts/expected.2021-11-09_v1.2.133.tsv
# (source -> target):
#   B.1.637 -> B.1.637.1   (the source value ends the preceding line)
#   P.1.12  -> P.1.12.1
#   P.1.7   -> P.1.7.1

##### NECESSARY FILES #####
# To run this script the user needs 6 files in the working directory:
#  * Two files with the sequence assignments for all versions: dataset_v3.csv
#    (all four pangolin v3 versions) and dataset_v4.csv (pangolin v4). They are
#    merged on "taxon" in the first step.
#  * Four files listing the expected changes across different versions of
#    pangolin: expected.13.14.tsv, expected.14.15.tsv, expected.15.16.tsv and
#    expected.2021-11-09_v1.2.133.tsv (columns "source" and "target").

setwd("/summary_calls")

library(dplyr)
library(tidyverse)
library(networkD3)

### DATASET ###

# Load the assignments for all versions and append the v4 lineage as column 6.
myfile <- read.csv("dataset_v3.csv", header = TRUE, sep = ",")
nameofthefile <- "name_of_dataset"
joinfile <- read.csv("dataset_v4.csv", header = TRUE, sep = ",")
myfile <- merge(myfile, joinfile[, c("taxon", "lineage")], by = "taxon")
colnames(myfile)[6] <- c("V4")

# Columns 2..6 of the merged table hold the lineage call of each version, in
# order. One expected-changes file describes each consecutive pair of versions.
version_cols <- 2:6
expected_files <- c(
  "expected.13.14.tsv",
  "expected.14.15.tsv",
  "expected.15.16.tsv",
  "expected.2021-11-09_v1.2.133.tsv"
)

### NODES ###

# Every distinct lineage of every version becomes one Sankey node. Node ids are
# zero-based and run consecutively across versions (version 1 gets 0..n1-1,
# version 2 continues at n1, and so on), which is the row order of `nodes`.
node_tables <- list()
offset <- 0
for (k in seq_along(version_cols)) {
  lineages <- as.character(unique(myfile[, version_cols[k]]))
  node_tables[[k]] <- data.frame(
    name = lineages,
    index = offset + seq_along(lineages) - 1,
    stringsAsFactors = FALSE
  )
  offset <- offset + length(lineages)
}

### LINKS ###

# Build the Sankey links between two consecutive versions.
#
# from_calls, to_calls: lineage calls of the same sequences in the older and
#   the newer version (same length, same order).
# from_nodes, to_nodes: node tables (columns name, index) of the two versions.
# expected_file: tab-separated file with "source" and "target" columns listing
#   the lineage changes that are expected between the two versions.
#
# Returns a data frame with columns source, target (node ids), value (number
# of sequences) and group. group is "expected" when the call did not change or
# the change is listed in expected_file, otherwise "unexpected".
build_links <- function(from_calls, to_calls, from_nodes, to_nodes,
                        expected_file) {
  calls <- data.frame(
    from = as.character(from_calls),
    to = as.character(to_calls),
    stringsAsFactors = FALSE
  )
  pairs <- dplyr::count(calls, from, to)

  expected_pairs <- read.csv(expected_file, header = TRUE, sep = "\t")
  expected_key <- paste(expected_pairs$source, expected_pairs$target)
  pair_key <- paste(pairs$from, pairs$to)

  # NOTE(review): the original script also evaluated
  # ifelse(<from> == "None", "Expected", "Unexpected") here and discarded the
  # result, so it never influenced the labels. If "None -> anything" is meant
  # to count as expected, that rule still has to be added.
  is_expected <- pairs$from == pairs$to | pair_key %in% expected_key

  links <- data.frame(
    source = from_nodes$index[match(pairs$from, from_nodes$name)],
    target = to_nodes$index[match(pairs$to, to_nodes$name)],
    value = pairs$n,
    group = ifelse(is_expected, "expected", "unexpected"),
    stringsAsFactors = FALSE
  )

  # Keep the row order the original merge(by = "concat") produced.
  links <- links[order(pair_key), ]
  rownames(links) <- NULL
  links
}

link_tables <- lapply(seq_along(expected_files), function(k) {
  build_links(
    myfile[, version_cols[k]],
    myfile[, version_cols[k + 1]],
    node_tables[[k]],
    node_tables[[k + 1]],
    expected_files[k]
  )
})
links <- do.call(rbind, link_tables)

## Prep nodes ##

nodes <- data.frame(
  name = unlist(lapply(node_tables, function(tbl) tbl$name)),
  stringsAsFactors = FALSE
)

### Plot Sankey ###

nodes$group <- as.factor(c("my_unique_group"))

# Give a color for each group:
my_color <- 'd3.scaleOrdinal() .domain(["expected", "unexpected"]) .range(["grey", "red"])'

sankeyNetwork(Links = links, Nodes = nodes,
              Source = "source", Target = "target",
              Value = "value", NodeID = "name",
              fontSize = 10, nodeWidth = 50, nodePadding = 2,
              colourScale = my_color, LinkGroup = "group", NodeGroup = "group")

print(nameofthefile)