forked from LarsNauheimer/HybPhaser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1a_count_snps.R
99 lines (71 loc) · 3.31 KB
/
1a_count_snps.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
##################
### SNPs count ###
##################
# load config
if (!(exists("config_file"))) {config_file <- "./config.txt"}
source(config_file)
# load packages
library(ape)
library(seqinr)
library(stringr)
# generate folder
output_Robjects <- file.path(path_to_output_folder,"00_R_objects", name_for_dataset_optimization_subset)
dir.create(output_Robjects, showWarnings = F, recursive = T)
#####################################################################################
### Counting polymorphic sites (masked as ambiguity codes in conseneus sequences) ###
#####################################################################################
# getting targets and sample names
if(targets_file_format == "AA"){
targets <- read.fasta(fasta_file_with_targets, seqtype = "AA", as.string = TRUE, set.attributes = FALSE)
} else if(targets_file_format == "DNA"){
targets <- read.fasta(fasta_file_with_targets, seqtype = "DNA", as.string = TRUE, set.attributes = FALSE)
} else {
print("Warning! Target file type not set properly. Should be 'DNA' or 'AA'!")
}
targets_name <- unique(gsub(".*-","",labels(targets)))
samples <- readLines(path_to_namelist)
if(intronerated_contig=="yes"){
intronerated_name <- "intronerated"
intronerated_underscore <- "_"
} else {
intronerated_name <- ""
intronerated_underscore <- ""
}
# function for counting ambiguities
seq_stats <- function(file){
fasta <- read.fasta(file, as.string=TRUE, set.attributes = FALSE )
seq <- gsub("N|[?]|-","",fasta[[1]])
c(round(str_length(seq),0),(str_count(seq,"Y|K|R|S|M|y|k|r|s|m|w") + str_count(seq,"W|D|H|B|V|w|d|h|b|v")*2) )
}
# generate empty tables for SNPS and sequence length
tab_snps <- data.frame(loci=targets_name)
tab_length <- data.frame(loci=targets_name)
# fill tables with information on snps and sequence length for each sample and locus
start_time <- Sys.time()
for(sample in samples){
#sample <- samples[1]
print(paste(sample," ", round(difftime(Sys.time(),start_time, units="secs"),0),"s", sep="") )
tab_sample <- data.frame(targets=targets_name, seq_length=NA, ambis=NA, ambi_prop=NA)
consensus_files <- list.files(file.path(path_to_output_folder,"01_data",sample, paste(intronerated_name,intronerated_underscore,"consensus", sep="")),full.names = T)
for(consensus_file in consensus_files) {
gene <- gsub("(_intronerated|).fasta","",gsub(".*/","",consensus_file))
file.path(path_to_output_folder,"01_data",sample, "consensus", paste(gene,intronerated_underscore,intronerated_name,".fasta",sep=""))
if(file.info(consensus_file)$size >1){
stats <- seq_stats(consensus_file)
} else {
stats <- c(NA,NA)
}
tab_sample$seq_length[match(gene,tab_sample$targets)] <- stats[1]
tab_sample$ambis[match(gene,tab_sample$targets)] <- stats[2]
}
tab_sample$ambi_prop <- tab_sample$ambis/tab_sample$seq_length
tab_snps[match(sample,samples)] <- tab_sample$ambi_prop
tab_length[match(sample,samples)] <- tab_sample$seq_length
}
colnames(tab_snps) <- samples
rownames(tab_snps) <- targets_name
colnames(tab_length) <- samples
rownames(tab_length) <- targets_name
### Generate output tables and save data in Robjects
saveRDS(tab_snps,file=file.path(output_Robjects,"Table_SNPs.Rds"))
saveRDS(tab_length,file=file.path(output_Robjects,"Table_consensus_length.Rds"))