.Rhistory

library(cluster)
library(CrossClustering)
library(vegan)
library(Rtsne)
library(factoextra)
#tree manipulation
library(ape)
library(hutan)
#install bioconductor 3.8 https://www.bioconductor.org/install/
#install Biostrings: BiocManager::install("Biostrings", version = "3.8") ; do not update mclust because it will have a warning and then biostrings module not installed.
#~~~~~ Beginning of 'story''s code.~~~~~
## read your fasta in as Biostrings object
fasta.s <- readDNAStringSet("./data/sample_data/longest_pep_sample.fasta")
## get the read names (in your case it has the isoform info)
names.fasta <- names(fasta.s)
## extract only the relevant gene and isoform (JM: aka pep) id (split name by the period symbol)
#JM: modify line to fit my name format
gene.iso <- sapply(names.fasta,function(j) cbind(unlist(strsplit(j,'\\:'))[1:8]))
## convert to good data.frame = transpose result from previous step and add relevant column names
gene.iso.df <- data.frame(t(gene.iso))
#JM: alter to put your own column names in:
colnames(gene.iso.df) <- c('p','pepID','s','scaffID','g','geneID','t','transID')
gene.iso.df<-select(gene.iso.df, geneID, pepID)
## and length of isoforms
gene.iso.df$width <- width(fasta.s)
#Did not use rest of code.
#~~~~~ End of 'story''s code.~~~~~
#JM:
gene.iso.df<-rownames_to_column(gene.iso.df)
#~~~~~beginning of code by nassimhddd (Jul 18 '16) Stackoverflow~~~~~
longest.pep.df <-gene.iso.df%>%group_by(geneID)%>%mutate(the_rank = rank(-width,ties.method="random")) %>%filter(the_rank==1)%>%select(-the_rank)
#~~~~~End of nassimhddd's code.~~~~~
list<-longest.pep.df[,1,drop=FALSE]
#wait why doesn't col.names=FALSE remove the column name?? Oh well.
write.csv(list,"longestpep.txt",quote=FALSE,row.names=FALSE)
library( tidyverse )
library(ggplot2)
library(seqinr)
library(ape)
library(Biostrings)
library(dplyr)
library(plyr)
#clustering
library(cluster)
library(CrossClustering)
library(vegan)
library(Rtsne)
library(factoextra)
#tree manipulation
library(ape)
library(hutan)
#install bioconductor 3.8 https://www.bioconductor.org/install/
#install Biostrings: BiocManager::install("Biostrings", version = "3.8") ; do not update mclust because it will have a warning and then biostrings module not installed.
#~~~~~ Beginning of 'story''s code.~~~~~
## read your fasta in as Biostrings object
fasta.s <- readDNAStringSet("./data/sample_data/longest_pep_sample.fasta")
## get the read names (in your case it has the isoform info)
names.fasta <- names(fasta.s)
## extract only the relevant gene and isoform (JM: aka pep) id (split name by the period symbol)
#JM: modify line to fit my name format
gene.iso <- sapply(names.fasta,function(j) cbind(unlist(strsplit(j,'\\:'))[1:8]))
## convert to good data.frame = transpose result from previous step and add relevant column names
gene.iso.df <- data.frame(t(gene.iso))
#JM: alter to put your own column names in:
colnames(gene.iso.df) <- c('p','pepID','s','scaffID','g','geneID','t','transID')
gene.iso.df<-select(gene.iso.df, geneID, pepID)
## and length of isoforms
gene.iso.df$width <- width(fasta.s)
#Did not use rest of code.
#~~~~~ End of 'story''s code.~~~~~
#JM:
gene.iso.df<-rownames_to_column(gene.iso.df)
#~~~~~beginning of code by nassimhddd (Jul 18 '16) Stackoverflow~~~~~
longest.pep.df <-gene.iso.df%>%dplyr::group_by(geneID)%>%mutate(the_rank = rank(-width,ties.method="random")) %>%filter(the_rank==1)%>%select(-the_rank)
#~~~~~End of nassimhddd's code.~~~~~
list<-longest.pep.df[,1,drop=FALSE]
#wait why doesn't col.names=FALSE remove the column name?? Oh well.
write.csv(list,"longestpep.txt",quote=FALSE,row.names=FALSE)
detach(plyr)
detach("plyr")
detach("package:plyr", unload=TRUE)
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
echo=FALSE,
dpi=300,
cache=TRUE
)
library( tidyverse )
library(ggplot2)
library(seqinr)
library(ape)
library(Biostrings)
library(dplyr)
#library(plyr)
#clustering
library(cluster)
library(CrossClustering)
library(vegan)
library(Rtsne)
library(factoextra)
#tree manipulation
library(ape)
library(hutan)
#RUN SAMPLE DATA:
fasta_dir<-"data/sample_data/"
fasta<-c("sample1.fasta")
#RUN REAL DATA:
#fasta_dir<-"~/Desktop/compgen/repo/ensembl/genome_fasta/"
#fasta<-c("Amphimedon_queenslandica.Aqu1.dna.toplevel.fa","Capitella_teleta.Capitella_teleta_v1.0.dna.toplevel.fa","Drosophila_chr.fa", "Helobdella_robusta.Helro1.dna.toplevel.fa","Lottia_gigantea.Lotgi1.dna.toplevel.fa","Mnemiopsis_leidyi.MneLei_Aug2011.dna.toplevel.fa","Nematostella_vectensis.ASM20922v1.dna.toplevel.fa","Strongylocentrotus_purpuratus.Spur_3.1.dna.toplevel.fa","Taeniopygia_guttata.taeGut3.2.4.dna.toplevel.fa","Trichoplax_adhaerens.ASM15027v1.dna.toplevel.fa")
for (blah in fasta){
fasta_path<-paste0(fasta_dir,blah)
#histogram of all contig lengths,
scaffold.length.plot<-read.fasta(fasta_path)%>%getLength(.)%>%as.data.frame()%>%ggplot()+geom_histogram(aes(x=.), fill="skyblue3", color="black",binwidth=100)+xlab("Scaffold Length (bp)")+ylab("Frequency")+labs(title=blah)
ggsave(scaffold.length.plot,file=paste0("scaffold.len_",blah,".pdf"))
#histogram of contig lengths, restricted to scaffolds < 10,000 bp
restricted.plot<-read.fasta(fasta_path)%>%getLength(.)%>%as.data.frame()%>%filter(.<10000)%>%ggplot()+geom_histogram(aes(x=.), fill="skyblue3", color="black",binwidth=100)+xlab("Scaffold Length (bp)")+ylab("Frequency")+labs(title=blah)
restricted.plot
ggsave(restricted.plot,file=paste0("scaffold.len_10000bp_",blah,".pdf"))
#stats
print(blah)
fasta.len=read.fasta(fasta_path)%>%getLength()
print(paste("Median scaffold/contig length (bp):",median(as.numeric(fasta.len))))
print(paste("Max scaffold/contig length (bp):",max(as.numeric(fasta.len))))
print(paste("Minimum scaffold/contig length (bp):",min(as.numeric(fasta.len))))
}
#install.packages("ape")
#RUN SAMPLE DATA:
gff3_dir<-"data/sample_data/"
gff3<-"gff3_sample.gff3"
#RUN REAL DATA:
#gff3_dir<-"data/genomes/gff3/"
#gff3<-c("Amphimedon_queenslandica.Aqu1.42.gff3","Capitella_teleta.Capitella_teleta_v1.0.42.gff3","Danio_rerio.GRCz11.96.chr.gff3","Drosophila_melanogaster.BDGP6.22.96.chr.gff3","Helobdella_robusta.Helro1.42.gff3","Homo_sapiens.GRCh38.96.chr.gff3","Lottia_gigantea.Lotgi1.42.gff3","Mnemiopsis_leidyi.MneLei_Aug2011.42.gff3","Nematostella_vectensis.ASM20922v1.42.gff3","Strongylocentrotus_purpuratus.Spur_3.1.42.gff3","Taeniopygia_guttata.taeGut3.2.4.96.chr.gff3","Trichoplax_adhaerens.ASM15027v1.42.gff3")
for (blah in gff3){
gff3_path<-paste0(gff3_dir,blah)
contig.gene.df <- ape::read.gff(file=gff3_path, GFF3=TRUE) %>% select(.,"seqid","type","start","end") %>% filter(.,type == "gene")
#for histogram, take just the contig names and gene counts
gene.count<- as.data.frame(table(contig.gene.df$seqid))
#plot frequency of frequencies
gff3.plot<-ggplot(data = gene.count, aes(x=gene.count$Freq)) + geom_histogram(binwidth=1) + geom_vline(xintercept = 3,color="red")+ggtitle(blah) +xlab("Number of Genes Per Contig")+ylab("Frequency")+labs(title=blah)
gff3.plot
ggsave(gff3.plot,file=paste0("gff3_",blah,".pdf"))
#use for custom axes
gff3.plot.custom.axes<-ggplot(data = gene.count, aes(x=gene.count$Freq)) + geom_histogram(binwidth=1) + geom_vline(xintercept = 3,color="red") +xlim(0,10)+ylim(0,3000)+xlab("Number of Genes Per Contig")+ylab("Frequency")+labs(title=blah)
gff3.plot.custom.axes
ggsave(gff3.plot.custom.axes,file=paste0("gff3_customaxes_",blah,".pdf"))
#stats
print(blah)
print(paste("proportion contigs with gene count >= 3:",nrow(subset(gene.count, gene.count$Freq >= 3))/nrow(gene.count)*100))
print(paste("Median no. genes/contig:",median(gene.count$Freq)))
print(paste("Max no. genes/contig:",max(gene.count$Freq)))
}
#install bioconductor 3.8 https://www.bioconductor.org/install/
#install Biostrings: BiocManager::install("Biostrings", version = "3.8") ; do not update mclust because it will have a warning and then biostrings module not installed.
#~~~~~ Beginning of 'story''s code.~~~~~
## read your fasta in as Biostrings object
fasta.s <- readDNAStringSet("./data/sample_data/longest_pep_sample.fasta")
## get the read names (in your case it has the isoform info)
names.fasta <- names(fasta.s)
## extract only the relevant gene and isoform (JM: aka pep) id (split name by the period symbol)
#JM: modify line to fit my name format
gene.iso <- sapply(names.fasta,function(j) cbind(unlist(strsplit(j,'\\:'))[1:8]))
## convert to good data.frame = transpose result from previous step and add relevant column names
gene.iso.df <- data.frame(t(gene.iso))
#JM: alter to put your own column names in:
colnames(gene.iso.df) <- c('p','pepID','s','scaffID','g','geneID','t','transID')
gene.iso.df<-select(gene.iso.df, geneID, pepID)
## and length of isoforms
gene.iso.df$width <- width(fasta.s)
#Did not use rest of code.
#~~~~~ End of 'story''s code.~~~~~
#JM:
gene.iso.df<-rownames_to_column(gene.iso.df)
#~~~~~beginning of code by nassimhddd (Jul 18 '16) Stackoverflow~~~~~
longest.pep.df <-gene.iso.df%>%dplyr::group_by(geneID)%>%mutate(the_rank = rank(-width,ties.method="random")) %>%filter(the_rank==1)%>%select(-the_rank)
#~~~~~End of nassimhddd's code.~~~~~
list<-longest.pep.df[,1,drop=FALSE]
write.csv(list,"longestpep.txt",quote=FALSE,row.names=FALSE)
#~~~Create GFF3 Table~~~
#set up empty df to rbind to
all.df<-data.frame(matrix(ncol=5,nrow=0))
names<-c("seqid","type","start","end","attributes")
colnames(all.df)<-names
#set up file names to loop through
gff3_dir="data/genomes/gff3/"
gff3=c("Amphimedon_queenslandica.Aqu1.42.gff3","Capitella_teleta.Capitella_teleta_v1.0.42.gff3","Danio_rerio.GRCz11.95.gff3","Drosophila_melanogaster.BDGP6.95.gff3","Helobdella_robusta.Helro1.42.gff3","Homo_sapiens.GRCh38.95.gff3","Lottia_gigantea.Lotgi1.42.gff3","Mnemiopsis_leidyi.MneLei_Aug2011.42.gff3","Nematostella_vectensis.ASM20922v1.42.gff3","Strongylocentrotus_purpuratus.Spur_3.1.42.gff3","Taeniopygia_guttata.taeGut3.2.4.95.gff3","Trichoplax_adhaerens.ASM15027v1.42.gff3")
#read in each gff3 file and extract seqid, type, start, end, attributes. Filter lines by type = gene.
for (blah in gff3)
{
gff3_path<-paste0(gff3_dir,blah)
contig.gene.df <- ape::read.gff(file=gff3_path, GFF3=TRUE) %>% dplyr::select(.,"seqid","type","start","end","attributes") %>% filter(.,type == "gene")
all.df<-bind_rows(all.df,contig.gene.df)
}
#Parse out the gene_id from the attributes
all.df$gene<-gsub(".*gene_id=(.*?);.*","\\1",all.df$attributes)
#Construct gff3 table
gff3.table<-select(all.df,"seqid","start","end","gene")
#add animal IDs to gff3 table
gff3.table$animal<-c(rep.int("Amphimedon_queenslandica",43615),rep.int("Capitella_telata",32175),rep.int("Danio_rerio",25606),rep.int("Drosophila_melanogaster",13931),rep.int("Helobdella_robusta",23432),rep.int("Homo_sapiens",21492),rep.int("Lottia_gigantea",23349),rep.int("Mnemiopsis_leidyi",16559),rep.int("Nematostella_vectensis",24773),rep.int("Strongylocentrotus_purpuratus",28987),rep.int("Taeniopygia_guttata",17488),rep.int("Trichoplax_adhaerens",11520))
#~~~ GFF3 Table Finished ~~~
#Read in agalma homologs csv
agalma.homologs<-read.csv("agalma/no_hydra_aka_ALL_ANIMALS/compgen_homologs_nohydra_geneNames.csv",header=TRUE,sep=",")
library(plyr)
#must run previous chunk prior to running this chunk
gff3.table.save<-gff3.table
#***SUBSAMPLE: choose from which animals you want to cluster scaffolds. Not doing so makes computation impossible on a local machine.
filter_target=c("Homo_sapiens","Danio_rerio","Strongylocentrotus_purpuratus")
gff3.table<-filter(gff3.table, animal==filter_target)
#~~~Create contingency.table~~~
#Make scaffold names unique- join seqid and catalog id
contingency.table<-inner_join(gff3.table,agalma.homologs)%>%dplyr::select(.,seqid,homology_id,animal)%>%mutate(species_scaffold=str_c(seqid,animal,sep="_"))%>%dplyr::select(.,species_scaffold,homology_id)
#~~~contingency.table compelte~~~
#Use contingency.table to make the contingency table
contingency<-table(contingency.table$species_scaffold,contingency.table$homology_id)%>%as.matrix()
contingency<-1*(contingency>0)
#***DOWNSAMPLE contingency table for computation on a local machine.
contingency<-contingency[sample(100),sample(100)]
#distance matrix
dist.matrix<-daisy(contingency,metric="gower",stand=FALSE)
#cluster
#cross-cluster
cross.clust<-cc_crossclustering(dist.matrix,out=FALSE)
clust<-cc_get_cluster(cross.clust)
#plot w/ tSNE
#adjust perplexity if get error "perplexity is too large for the number of samples"
tsne_obj<-Rtsne(dist.matrix,is_distance=TRUE,perplexity=1)
tsne_data<-tsne_obj$Y%>%data.frame()%>%setNames(c("X","Y"))%>%mutate(cluster=factor(clust))
tsne_plot<-ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color = cluster)) + labs(title="Cross Cluster tSNE")
tsne_plot
#agglomerative
ag<-agnes(dist.matrix,diss=TRUE)
plot(ag,labels=FALSE)
#divisive
di<-diana(dist.matrix)
plot(di,labels=FALSE)
#kmeans
dist.matrix.coerced<-as.matrix(dist.matrix)
fviz_nbclust(dist.matrix.coerced,kmeans,method="wss")
kmean<-kmeans(dist.matrix,2)
kclust<-kmean$cluster
tsne_obj<-Rtsne(dist.matrix,is_distance=TRUE,perplexity=1)
tsne_data<-tsne_obj$Y%>%data.frame()%>%setNames(c("X","Y"))%>%mutate(cluster=factor(kclust))
tsn_plot_k<-ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color = cluster))+labs(title="Kmeans tSNE")
tsn_plot_k
#~~~Clustering complete~~~
#~~~Create master.table.~~~
#Put together a scaffold:cluster df
clust.df<-as.data.frame(clust)
colnames(clust.df)<-"cluster_id"
clust.df<-mutate(clust.df,species_scaffold=row.names(contingency))
master.table<-left_join(gff3.table.save,agalma.homologs)%>%select(.,"seqid","start","end","gene","homology_id","catalog_id","animal")%>%mutate(species_scaffold=str_c(seqid,animal,sep="_"))%>%left_join(.,clust.df,by="species_scaffold")%>%select(.,species_scaffold,start,end,gene,homology_id,animal,cluster_id)
#initialize
mrca.df<-data.frame()
taxa.list<-NULL
#phylogeny
tree_text = "(Mnemiopsis_leidyi,(Amphimedon_queenslandica,(Trichoplax_adhaerens,(Nematostella_vectensis,((Drosophila_melanogaster,(Lottia_gigantea,(Capitella_telata,Helobdella_robusta))),(Strongylocentrotus_purpuratus,(Danio_rerio,(Homo_sapiens,Taeniopygia_guttata))))))));"
phy = read.tree(text=tree_text)
plot(phy)
#create a list of unique homology_ids; do not include NAs
homolog.list<-unique(master.table$homology_id)
homolog.list<-homolog.list[!is.na(homolog.list)]
for (h in homolog.list){
#1.Find all taxa that possess a particular homology_id.
homolog.animals<-master.table%>%filter(.,homology_id == h) %>% select(.,animal)%>%unique()
#2. Find the most recent common ancestor of all those taxa.
#If homology_id is possessed by only 1 taxon (eg. shared among paralogs), then skip. By default, there is no opportunity for detecting absence.
if (nrow(homolog.animals) == 1){
next
}
if (nrow(homolog.animals) > 1){
mrca <-getMRCA(phy,as.character(homolog.animals$animal))
bind.df<-cbind(homology_id=h,mrca)
mrca.df<-rbind(mrca.df,bind.df)
}
}
#Add MRCAs to master.table
new.master.table<-left_join(master.table,mrca.df, by="homology_id")
absent.df<-data.frame(h=integer(0),absent.in.taxa=integer(0))
for (h in mrca.df$homology_id){
#3. Find all the tips of the last common ancestor.
all.tips<-mrca.df$mrca[mrca.df$homology_id==h]%>%descendants(phy,.)
all.tips<-all.tips[all.tips<=length(phy$tip.label)]
taxa<-new.master.table%>%filter(homology_id==h)%>%select(.,animal)%>%unique()
#first find the node number of all animals that taxa with that homology_id
for (t in taxa$animal){
find.taxa<-which(phy$tip.label==t)
taxa.list<-append(taxa.list,find.taxa)
}
#4. Compare which tips do not possess the homology_id.
#Where there are no instances in gene loss absent.in.taxa will equal integer(0). Hence need rbind.fill (not rbind)
absent.in.taxa<-setdiff(all.tips,taxa.list)
taxa.list<-NULL
#append to table of homology_id + absent taxa
combine.df<-cbind(h,absent.in.taxa)%>%as.data.frame()
absent.df<-rbind.fill(absent.df,combine.df)
}
#prep absent.df dataframe
#remove NAs in 'absent' column - these are homologs not absent in any of the child taxa.
absent.df<-na.omit(absent.df)
names(absent.df)<-c("homology_id","absent")
#Create table of scaffold - gene - homology id - taxa homolog is absent in
absence.results<-left_join(master.table,absent.df,by="homology_id")%>%select(.,species_scaffold,gene,homology_id,absent)%>%na.omit(absent)%>%group_by(homology_id)
write.csv(absence.results,"absence.analysis.results.csv",quote=FALSE,row.names=FALSE)
#Add to master table.
new.master.table<-left_join(new.master.table,absent.df)
# The `Parse GFF3 and Agalma data` chunk and the `cluster` chunk must be run first before this chunk. *Whatever you have set as the contingency table there will be the input for this chunk.*
#number of scaffolds
nrow(contingency)
#number of homologs shared across scaffolds
ncol(contingency)
#sums of each column
colsum<-colSums(contingency)
#histogram of number of sacffolds occupied per homolog
hist<-ggplot() + aes(colsum)+geom_histogram(fill="skyblue3",color="black")+xlab("Number of Scaffolds Occupied") + ylab("Count")+ggtitle("Number of Scaffolds Occupied Per Homolog (5taxa)")
#median number of scaffolds occupied
median(colsum)
#average number of sacffolds occupied
mean(colsum)
#min number of scaffolds occupied
min(colsum)
#max number of scaffolds occupied.
max(colsum)
hist
#mixing up the columns or rows will not affect overall clustering pattern (i.e. all clustering)
#make a data frame with the same proportion of 1's and 0's in random order
dim(contingency)
n_entries<-ncol(contingency)*nrow(contingency)
proportion_1s<-sum(contingency)/n_entries
proportion_0s<-1-proportion_1s
random.matrix<-c(rep("1",proportion_1s*n_entries),rep("0",proportion_0s*n_entries))%>%sample(.,n_entries)%>%matrix(.,ncol=ncol(contingency),nrow=nrow(contingency))%>%as.data.frame()
rownames(random.matrix)<-row.names(contingency)
colnames(random.matrix)<-colnames(contingency)
#distance matrix
rand.dist.matrix<-daisy(random.matrix,metric="gower",stand=FALSE)
#cross-clustering
rand.cross.clust<-cc_crossclustering(rand.dist.matrix,out=FALSE)
rand.clust<-cc_get_cluster(rand.cross.clust)
rand.tsne_obj<-Rtsne(rand.dist.matrix,is_distance=TRUE,perplexity=1)
rand.tsne_data<-rand.tsne_obj$Y%>%data.frame()%>%setNames(c("X","Y"))%>%mutate(rand.cluster=factor(rand.clust))
rand.tsne_plot<-ggplot(aes(x = X, y = Y), data = rand.tsne_data) + geom_point(aes(color = rand.cluster))
rand.tsne_plot
#agglomerative
rand.ag<-agnes(rand.dist.matrix,diss=TRUE)
plot(rand.ag,labels=FALSE)
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
echo=FALSE,
dpi=300,
cache=TRUE
)
library( tidyverse )
library(ggplot2)
library(seqinr)
library(ape)
library(Biostrings)
library(dplyr)
library(plyr)
#clustering
library(cluster)
library(CrossClustering)
library(vegan)
library(Rtsne)
library(factoextra)
#tree manipulation
library(ape)
library(hutan)
#~~~Create GFF3 Table~~~
#set up empty df to rbind to
all.df<-data.frame(matrix(ncol=5,nrow=0))
names<-c("seqid","type","start","end","attributes")
colnames(all.df)<-names
#set up file names to loop through
gff3_dir="data/genomes/gff3/"
gff3=c("Amphimedon_queenslandica.Aqu1.42.gff3","Capitella_teleta.Capitella_teleta_v1.0.42.gff3","Danio_rerio.GRCz11.95.gff3","Drosophila_melanogaster.BDGP6.95.gff3","Helobdella_robusta.Helro1.42.gff3","Homo_sapiens.GRCh38.95.gff3","Lottia_gigantea.Lotgi1.42.gff3","Mnemiopsis_leidyi.MneLei_Aug2011.42.gff3","Nematostella_vectensis.ASM20922v1.42.gff3","Strongylocentrotus_purpuratus.Spur_3.1.42.gff3","Taeniopygia_guttata.taeGut3.2.4.95.gff3","Trichoplax_adhaerens.ASM15027v1.42.gff3")
#read in each gff3 file and extract seqid, type, start, end, attributes. Filter lines by type = gene.
for (blah in gff3)
{
gff3_path<-paste0(gff3_dir,blah)
contig.gene.df <- ape::read.gff(file=gff3_path, GFF3=TRUE) %>% dplyr::select(.,"seqid","type","start","end","attributes") %>% filter(.,type == "gene")
all.df<-bind_rows(all.df,contig.gene.df)
}
#Parse out the gene_id from the attributes
all.df$gene<-gsub(".*gene_id=(.*?);.*","\\1",all.df$attributes)
#Construct gff3 table
gff3.table<-select(all.df,"seqid","start","end","gene")
#add animal IDs to gff3 table
gff3.table$animal<-c(rep.int("Amphimedon_queenslandica",43615),rep.int("Capitella_telata",32175),rep.int("Danio_rerio",25606),rep.int("Drosophila_melanogaster",13931),rep.int("Helobdella_robusta",23432),rep.int("Homo_sapiens",21492),rep.int("Lottia_gigantea",23349),rep.int("Mnemiopsis_leidyi",16559),rep.int("Nematostella_vectensis",24773),rep.int("Strongylocentrotus_purpuratus",28987),rep.int("Taeniopygia_guttata",17488),rep.int("Trichoplax_adhaerens",11520))
#~~~ GFF3 Table Finished ~~~
#Read in agalma homologs csv
agalma.homologs<-read.csv("compgen_homologs_nohydra_geneNames.csv",header=TRUE,sep=",")
agalma.homologs<-read.csv("agalma/no_hydra_aka_ALL_ANIMALS/compgen_homologs_nohydra_geneNames.csv",header=TRUE,sep=",")
#must run previous chunk prior to running this chunk
gff3.table.save<-gff3.table
#***SUBSAMPLE: choose from which animals you want to cluster scaffolds. Not doing so makes computation impossible on a local machine.
filter_target=c("Homo_sapiens","Danio_rerio","Strongylocentrotus_purpuratus")
gff3.table<-filter(gff3.table, animal==filter_target)
#~~~Create contingency.table~~~
#Make scaffold names unique- join seqid and catalog id
contingency.table<-inner_join(gff3.table,agalma.homologs)%>%dplyr::select(.,seqid,homology_id,animal)%>%mutate(species_scaffold=str_c(seqid,animal,sep="_"))%>%dplyr::select(.,species_scaffold,homology_id)
#~~~contingency.table compelte~~~
#Use contingency.table to make the contingency table
contingency<-table(contingency.table$species_scaffold,contingency.table$homology_id)%>%as.matrix()
contingency<-1*(contingency>0)
#***DOWNSAMPLE contingency table for computation on a local machine.
contingency<-contingency[sample(100),sample(100)]
#distance matrix
dist.matrix<-daisy(contingency,metric="gower",stand=FALSE)
#cluster
#cross-cluster
cross.clust<-cc_crossclustering(dist.matrix,out=FALSE)
clust<-cc_get_cluster(cross.clust)
#plot w/ tSNE
#adjust perplexity if get error "perplexity is too large for the number of samples"
tsne_obj<-Rtsne(dist.matrix,is_distance=TRUE,perplexity=1)
tsne_data<-tsne_obj$Y%>%data.frame()%>%setNames(c("X","Y"))%>%mutate(cluster=factor(clust))
tsne_plot<-ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color = cluster)) + labs(title="Cross Cluster tSNE")
tsne_plot
#agglomerative
ag<-agnes(dist.matrix,diss=TRUE)
plot(ag,labels=FALSE)
#divisive
di<-diana(dist.matrix)
plot(di,labels=FALSE)
#kmeans
dist.matrix.coerced<-as.matrix(dist.matrix)
fviz_nbclust(dist.matrix.coerced,kmeans,method="wss")
kmean<-kmeans(dist.matrix,2)
kclust<-kmean$cluster
tsne_obj<-Rtsne(dist.matrix,is_distance=TRUE,perplexity=1)
tsne_data<-tsne_obj$Y%>%data.frame()%>%setNames(c("X","Y"))%>%mutate(cluster=factor(kclust))
tsn_plot_k<-ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color = cluster))+labs(title="Kmeans tSNE")
tsn_plot_k
#~~~Clustering complete~~~
#~~~Create master.table.~~~
#Put together a scaffold:cluster df
clust.df<-as.data.frame(clust)
colnames(clust.df)<-"cluster_id"
clust.df<-mutate(clust.df,species_scaffold=row.names(contingency))
master.table<-left_join(gff3.table.save,agalma.homologs)%>%select(.,"seqid","start","end","gene","homology_id","catalog_id","animal")%>%mutate(species_scaffold=str_c(seqid,animal,sep="_"))%>%left_join(.,clust.df,by="species_scaffold")%>%select(.,species_scaffold,start,end,gene,homology_id,animal,cluster_id)
clust
dist.matrix
View(dist.matrix)
View(contingency)
clust
row.names(contingency)
question<-data.frame(scaff=row.names(contingency),cluster=clust)
View(question)
ag
kclust
question<-mutate(Kmeans=kmean)
question<-mutate(as.data.frame(Kmeans=kmean))
question<-mutate(questions,as.data.frame(Kmeans=kmean))
question<-mutate(questions,Kmeans=kmean)
View(kmean)
kmean$cluster
View(kmean$cluster)
question<-mutate(question,kmean$cluster)
question
question[kmeans=kmean$cluster]
View(question<-mutate(question,kmeans=kmean$cluster))
blahq<-mutate(question,kmeans=kmean$cluster)
questionmark<-data.frame(scaff=row.names(contingency),cc=clust,Kmeans=kmean$cluster)
View(questionmark)
ag
cuttree(ag,h=0.05)
cuttree(ag,h=0.05)
cutree(ag,h=0.05)
cuttree(ag,h=0.05)
cutree(ag,k=0.05)
cutree(ag,k=30)
agnes$height
agnes[height]
agnes
ag
ag$height
ag$order
ag$ac
di
questionmark<-data.frame(scaff=row.names(contingency),cc=clust,Kmeans=kmean$cluster,agnes.height=ag$height,diana=di$height)
questionmark<-data.frame(scaff=row.names(contingency),cc=clust,Kmeans=kmean$cluster,agnes.height=ag$height)
nrow(ag$height)
nrow(ag$height)
ag$height
max(ag$height)
ag$order.lab
di$order.lab
print(ag)
plot(ag, labels=TRuE)
plot(ag, labels=TRUE)
plot(ag,labels=TRUE)
summary.agnes(ag)
summary(ag)
ag$order
cutree(ag,k=2)
groups<-cutree(ag,k=2)
rect.hclust(ag,k=2,border="red")
ag$height
ag$order.lab
ag.df<-data.frame(scaff=ag$order.lab,ag.height<-ag$height)
gff3.table.save<-gff3.table
#***SUBSAMPLE: choose from which animals you want to cluster scaffolds. Not doing so makes computation impossible on a local machine.
filter_target=c("Homo_sapiens","Danio_rerio","Strongylocentrotus_purpuratus")
gff3.table<-filter(gff3.table, animal==filter_target)
#~~~Create contingency.table~~~
#Make scaffold names unique- join seqid and catalog id
contingency.table<-inner_join(gff3.table,agalma.homologs)%>%dplyr::select(.,seqid,homology_id,animal)%>%mutate(species_scaffold=str_c(seqid,animal,sep="_"))%>%dplyr::select(.,species_scaffold,homology_id)
#~~~contingency.table compelte~~~
#Use contingency.table to make the contingency table
contingency<-table(contingency.table$species_scaffold,contingency.table$homology_id)%>%as.matrix()
contingency<-1*(contingency>0)
View(contingency)