-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmotifs_test.R
132 lines (115 loc) · 4.96 KB
/
motifs_test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Counting Motifs and their association with breakpoints
# Tested to see if motifs identified in (Brown, 2014) are present and associated with the observed breakpoints
# Not enough sample size to be robust
docs <- "C:/Users/donju/OneDrive/Documents/"
HIPPOS_B <- readDNAStringSet(paste0(docs, "HIPPOSB.fasta"))
# To create DNA stringset of length 'ws' centered on coordinates (provided in data frame 'bkpts')
sliceWindow <- function(df=bkpts, ws=400){
bkpt.dna <- DNAStringSet()
for (x in 1:length(df$Coordinate)){
s <- df$Sequence[x]
b <- df$Coordinate[x]
n <- df$Track[x]
h <- ws/2
bkpt.dna <- append(bkpt.dna, DNAStringSet(HIPPOS_B[[s]][(b-h+1):(b+h)]))
names(bkpt.dna)[x] <- paste0(n, "_", b)
}
return(bkpt.dna)
}
# To
countMotifs <- function(bkpt.dna, mod="total"){
bkpt.rc <- reverseComplement(bkpt.dna)
tggtgg <- vcountPattern("tggtgg", bkpt.dna) + vcountPattern("tggtgg", bkpt.rc)
cctcccct <- vcountPattern("cctcccct", bkpt.dna) + vcountPattern("cctcccct", bkpt.rc)
cccag <- vcountPattern("cccag", bkpt.dna) + vcountPattern("cccag", bkpt.rc)
tggag <- vcountPattern("tggag", bkpt.dna) + vcountPattern("tggag", bkpt.rc)
gggct <- vcountPattern("gggct", bkpt.dna) + vcountPattern("gggct", bkpt.rc)
aggag <- vcountPattern("aggag", bkpt.dna) + vcountPattern("aggag", bkpt.rc)
df <- data.frame(tggtgg, cctcccct, cccag, tggag, gggct, aggag)
rownames(df) <- names(bkpt.dna)
if (mod=="at_least"){ #if counting only at least one motif present
df[] <- lapply(df, function(x) ifelse(x>=1, 1, 0))
}
return(df)
}
randomCountMotifs <- function(bkpt_df, dnaset=HIPPOS, ws=200, mod="total"){
bkpt.randseq <- DNAStringSet()
wh <- ws/2
for (x in 1:nrow(bkpt_df)){
s <- bkpt_df$Sequence[x]
b <- sample(wh:(length(dnaset[[s]])-wh), 1) # pick 1 random number
dna <- DNAStringSet(dnaset[[s]][(b-wh+1):(b+wh)])
bkpt.randseq <- append(bkpt.randseq, dna)
#names(bkpt.randseq)[x] <- paste0(bkpt_df$Sequence[x], "_", b)
}
rand.motifs <- countMotifs(bkpt.randseq, mod)
return(apply(rand.motifs, 2, sum))
}
permutedCountMotifs <- function(bd=bkpt1kb){
shuff.dna <- shuffle(bd)
shuff.motifs <- countMotifs(shuff.dna)
shuff.motifv <- apply(shuff.motifs, 2, sum)
return(shuff.motifv)
}
bkpt50bp <- sliceWindow(bkpts, 50)
bkpt1kb <- sliceWindow(bkpts, 1000)
bkpt.motifs200 <- countMotifs(sliceWindow(bkpts, 200), 200)
bkpt.motifs200.al <- countMotifs(sliceWindow(bkpts, 200), "at_least")
bkpt.motifs100.al <- countMotifs(sliceWindow(bkpts, 100), "at_least")
bkpt.motifs50 <- countMotifs(bkpt50bp, 50)
bkpt.motifv <- apply(bkpt.motifs100, 2, sum)
bkpt.motift <- sum(bkpt.motifv)
motifv.100full.al <- apply(bkpt.motifs100.al, 2, sum)
motifv.200full <- apply(bkpt.motifs200, 2, sum)
motifv.200full.al <- apply(bkpt.motifs200.al, 2, sum)
motifv.400full <- apply(bkpt.motifs400, 2, sum)
#Randomly sample & count motifs around however many breakpoints, 1000 times
rand.200full.array <- replicate(1000, randomCountMotifs(bkpts, HIPPOS_B, 200))#, "at_least"))
rand.array <- rand.200full.array
rand.100full.array.al
rand.200full.array
rand.200full.array.al
rand.400full.array
rand.400mr.array
rand.mean <- apply(rand.array, 1, mean)
rand.sd <- apply(rand.array, 1, sd)
rand200full.pval <- pnorm(motifv.200full, mean=rand.mean, sd=rand.sd, lower.tail=FALSE)
rand200full.pval.al
rand100full.pval
rand200full.pval
rand400full.pval
rand200mr.pval
rand100mr.pval
#Saving to table file
motif_df <- data.frame(motifv.200full, shuff.mean, shuff.sd, shuff.pval, rand.mean, rand.sd, rand200full.pval)
motif_df
write.csv(motif_df, file=paste0(docs, "breakpoint motifs pvalue.csv"))
write.csv(bkpt.motifs200.al, file=paste0(docs, "motif present within 100-bp.csv"))
bkpt200bp.shuff <- shuffle(bkpt200bp)
# Create permuted control sequences
# Function to shuffle or randomize the order a given DNA sequence
shuffle <- function(bd){
bd.shuff <- bd
for (s in 1:length(bd)){
bd.shuff[[s]] <- sample(bd[[s]], length(bd[[s]]))
}
return(bd.shuff)
}
# Shuffle only middle 1 kb of 5 kb windows sliced in plot_nB-DNA.R, then write out fasta files
bkpt5kb <- sliceWindow(bkpts, 5000)
bkpt5kb.500shuff <- bkpt5kb
for (s in 1:length(bkpt5kb)){
bkpt5kb.500shuff[[s]][2001:3000] <- bkpt1kb.shuff[[s]]
writeXStringSet(bkpt5kb.500shuff[s], paste0(docs, "windows_shuffled/", names(bkpt5kb.500shuff[s]), "shuffled.fasta"))
}
#Shuffle nucleotides of (28) observed breakpoints, 1000 times
perm.array <- replicate(1000, permutedCountMotifs(bkpt200bp))
shuff.mean <- apply(perm.array, 1, mean)
shuff.sd <- apply(perm.array, 1, sd)
shuff.pval <- pnorm(motifv.200full, mean=shuff.mean, sd=shuff.sd, lower.tail=FALSE)
shuff.pval
perm.sum <- replicate(1000, sum(permutedCountMotifs(bkpt200bp)))
shuff.total.mean <- mean(perm.sum)
shuff.total.sd <- sd(perm.sum)
shuff.total.pval <- pnorm(bkpt.motift, mean=shuff.total.mean, sd=shuff.total.sd, lower.tail=FALSE)
shuff.total.pval