From 9a144898b1c01ebe440cf46d7f1e13d89468023a Mon Sep 17 00:00:00 2001 From: "rupert@sorubim" Date: Fri, 12 Jan 2018 14:31:25 +0000 Subject: [PATCH] added basic cran code for v1.3-0 --- DESCRIPTION | 15 ++++++ MD5 | 92 ++++++++++++++++++++++++++++++++++++ NAMESPACE | 14 ++++++ NEWS | 66 ++++++++++++++++++++++++++ R/bestCloseMatch.R | 15 ++++++ R/cgraph.R | 8 ++++ R/chaoHaplo.R | 22 +++++++++ R/checkDNA.R | 8 ++++ R/dataStat.R | 9 ++++ R/haploAccum.R | 76 ++++++++++++++++++++++++++++++ R/heatmapSpp.R | 12 +++++ R/is.ambig.R | 8 ++++ R/localMinima.R | 10 ++++ R/maxInDist.R | 14 ++++++ R/monophyly.R | 26 +++++++++++ R/monophylyBoot.R | 39 ++++++++++++++++ R/nearNeighbour.R | 7 +++ R/nonConDist.R | 14 ++++++ R/nucDiag.R | 22 +++++++++ R/ordinDNA.R | 14 ++++++ R/paa.R | 10 ++++ R/plot.haploAccum.R | 28 +++++++++++ R/plot.ordinDNA.R | 49 +++++++++++++++++++ R/plot.slidWin.R | 61 ++++++++++++++++++++++++ R/polyBalance.R | 17 +++++++ R/rankSlidWin.R | 47 +++++++++++++++++++ R/read.BOLD.R | 35 ++++++++++++++ R/read.GB.R | 48 +++++++++++++++++++ R/rmSingletons.R | 4 ++ R/rosenberg.R | 13 ++++++ R/search.BOLD.R | 32 +++++++++++++ R/seeBarcode.R | 10 ++++ R/seqStat.R | 7 +++ R/slideAnalyses.R | 51 ++++++++++++++++++++ R/slideBoxplots.R | 38 +++++++++++++++ R/slideNucDiag.R | 11 +++++ R/slidingWindow.R | 16 +++++++ R/sppDist.R | 16 +++++++ R/sppDistMatrix.R | 15 ++++++ R/stats.BOLD.R | 11 +++++ R/tajima.K.R | 7 +++ R/tclust.R | 10 ++++ R/threshID.R | 33 +++++++++++++ R/threshOpt.R | 27 +++++++++++ R/tiporder.R | 6 +++ R/titv.R | 14 ++++++ R/tree.comp.R | 43 +++++++++++++++++ data/anoteropsis.rda | Bin 0 -> 1323 bytes data/dolomedes.rda | Bin 0 -> 1447 bytes data/sarkar.rda | Bin 0 -> 214 bytes inst/CITATION | 13 ++++++ man/anoteropsis.Rd | 17 +++++++ man/cgraph.Rd | 47 +++++++++++++++++++ man/chaoHaplo.Rd | 49 +++++++++++++++++++ man/checkDNA.Rd | 37 +++++++++++++++ man/dataStat.Rd | 56 ++++++++++++++++++++++ man/dolomedes.Rd | 15 ++++++ man/haploAccum.Rd | 55 ++++++++++++++++++++++ man/heatmapSpp.Rd | 49 +++++++++++++++++++ man/is.ambig.Rd | 42 +++++++++++++++++ man/localMinima.Rd | 52 +++++++++++++++++++++ man/monophyly.Rd | 104 +++++++++++++++++++++++++++++++++++++++++ man/nearNeighbour.Rd | 89 +++++++++++++++++++++++++++++++++++ man/nonConDist.Rd | 80 +++++++++++++++++++++++++++++++ man/nucDiag.Rd | 57 ++++++++++++++++++++++ man/ordinDNA.Rd | 56 ++++++++++++++++++++++ man/paa.Rd | 54 +++++++++++++++++++++ man/plot.haploAccum.Rd | 55 ++++++++++++++++++++++ man/plot.ordinDNA.Rd | 83 ++++++++++++++++++++++++++++++++ man/plot.slidWin.Rd | 67 ++++++++++++++++++++++++++ man/polyBalance.Rd | 43 +++++++++++++++++ man/rankSlidWin.Rd | 75 +++++++++++++++++++++++++++++ man/read.BOLD.Rd | 83 ++++++++++++++++++++++++++++++++ man/read.GB.Rd | 66 ++++++++++++++++++++++++++ man/rmSingletons.Rd | 45 ++++++++++++++++++ man/rosenberg.Rd | 61 ++++++++++++++++++++++++ man/sarkar.Rd | 17 +++++++ man/seeBarcode.Rd | 42 +++++++++++++++++ man/seqStat.Rd | 45 ++++++++++++++++++ man/slideAnalyses.Rd | 91 ++++++++++++++++++++++++++++++++++++ man/slideBoxplots.Rd | 77 ++++++++++++++++++++++++++++++ man/slideNucDiag.Rd | 66 ++++++++++++++++++++++++++ man/slidingWindow.Rd | 63 +++++++++++++++++++++++++ man/spider-package.Rd | 57 ++++++++++++++++++++++ man/sppDist.Rd | 57 ++++++++++++++++++++++ man/sppDistMatrix.Rd | 41 ++++++++++++++++ man/sppVector.Rd | 49 +++++++++++++++++++ man/tajima.K.Rd | 47 +++++++++++++++++++ man/tclust.Rd | 53 +++++++++++++++++++++ man/threshOpt.Rd | 73 +++++++++++++++++++++++++++++ man/tiporder.Rd | 47 +++++++++++++++++++ man/titv.Rd | 55 ++++++++++++++++++++++ man/tree.comp.Rd | 70 +++++++++++++++++++++++++++ 93 files changed, 3550 insertions(+) create mode 100644 DESCRIPTION create mode 100644 MD5 create mode 100644 NAMESPACE create mode 100644 NEWS create mode 100644 R/bestCloseMatch.R create mode 100644 R/cgraph.R create mode 100644 R/chaoHaplo.R create mode 100644 R/checkDNA.R create mode 100644 R/dataStat.R create mode 100644 R/haploAccum.R create mode 100644 R/heatmapSpp.R create mode 100644 R/is.ambig.R create mode 100644 R/localMinima.R create mode 100644 R/maxInDist.R create mode 100644 R/monophyly.R create mode 100644 R/monophylyBoot.R create mode 100644 R/nearNeighbour.R create mode 100644 R/nonConDist.R create mode 100644 R/nucDiag.R create mode 100644 R/ordinDNA.R create mode 100644 R/paa.R create mode 100644 R/plot.haploAccum.R create mode 100644 R/plot.ordinDNA.R create mode 100644 R/plot.slidWin.R create mode 100644 R/polyBalance.R create mode 100644 R/rankSlidWin.R create mode 100644 R/read.BOLD.R create mode 100644 R/read.GB.R create mode 100644 R/rmSingletons.R create mode 100644 R/rosenberg.R create mode 100644 R/search.BOLD.R create mode 100644 R/seeBarcode.R create mode 100644 R/seqStat.R create mode 100644 R/slideAnalyses.R create mode 100644 R/slideBoxplots.R create mode 100644 R/slideNucDiag.R create mode 100644 R/slidingWindow.R create mode 100644 R/sppDist.R create mode 100644 R/sppDistMatrix.R create mode 100644 R/stats.BOLD.R create mode 100644 R/tajima.K.R create mode 100644 R/tclust.R create mode 100644 R/threshID.R create mode 100644 R/threshOpt.R create mode 100644 R/tiporder.R create mode 100644 R/titv.R create mode 100644 R/tree.comp.R create mode 100644 data/anoteropsis.rda create mode 100644 data/dolomedes.rda create mode 100644 data/sarkar.rda create mode 100644 inst/CITATION create mode 100644 man/anoteropsis.Rd create mode 100644 man/cgraph.Rd create mode 100644 man/chaoHaplo.Rd create mode 100644 man/checkDNA.Rd create mode 100644 man/dataStat.Rd create mode 100644 man/dolomedes.Rd create mode 100644 man/haploAccum.Rd create mode 100644 man/heatmapSpp.Rd create mode 100644 man/is.ambig.Rd create mode 100644 man/localMinima.Rd create mode 100644 man/monophyly.Rd create mode 100644 man/nearNeighbour.Rd create mode 100644 man/nonConDist.Rd create mode 100644 man/nucDiag.Rd create mode 100644 man/ordinDNA.Rd create mode 100644 man/paa.Rd create mode 100644 man/plot.haploAccum.Rd create mode 100644 man/plot.ordinDNA.Rd create mode 100644 man/plot.slidWin.Rd create mode 100644 man/polyBalance.Rd create mode 100644 man/rankSlidWin.Rd create mode 100644 man/read.BOLD.Rd create mode 100644 man/read.GB.Rd create mode 100644 man/rmSingletons.Rd create mode 100644 man/rosenberg.Rd create mode 100644 man/sarkar.Rd create mode 100644 man/seeBarcode.Rd create mode 100644 man/seqStat.Rd create mode 100644 man/slideAnalyses.Rd create mode 100644 man/slideBoxplots.Rd create mode 100644 man/slideNucDiag.Rd create mode 100644 man/slidingWindow.Rd create mode 100644 man/spider-package.Rd create mode 100644 man/sppDist.Rd create mode 100644 man/sppDistMatrix.Rd create mode 100644 man/sppVector.Rd create mode 100644 man/tajima.K.Rd create mode 100644 man/tclust.Rd create mode 100644 man/threshOpt.Rd create mode 100644 man/tiporder.Rd create mode 100644 man/titv.Rd create mode 100644 man/tree.comp.Rd diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..9a88cea --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,15 @@ +Package: spider +Type: Package +Title: Species Identity and Evolution in R +Version: 1.3-0 +Date: 2013-12-25 +Author: Samuel Brown, Rupert Collins, Stephane Boyer, Marie-Caroline Lefort, Jagoba Malumbres-Olarte, Cor Vink, Rob Cruickshank +Maintainer: Samuel Brown +Description: A package for the analysis of species limits and DNA barcoding data +License: GPL +LazyLoad: yes +Depends: ape, pegas +Packaged: 2013-12-27 04:41:31 UTC; sam +NeedsCompilation: no +Repository: CRAN +Date/Publication: 2013-12-27 07:30:41 diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..cdebcad --- /dev/null +++ b/MD5 @@ -0,0 +1,92 @@ +6ef850e8e432b9a07bea47191083989c *DESCRIPTION +ccedd6712a186c4e228ed15a91f11bc1 *NAMESPACE +3ca298078ffa7ac200abef3880f87f64 *NEWS +e62af89c4a267ddca83f35e3fe45aee2 *R/bestCloseMatch.R +fd8a6057196e4b37eacaf2702ffae307 *R/cgraph.R +72989f994b8cbc46f268ae5685db274e *R/chaoHaplo.R +4eff78217d84219aaebc735c28541926 *R/checkDNA.R +5e3aa555d847d01f976df668706ce0ed *R/dataStat.R +595c83f688fd9a2c296d8eb245aafa73 *R/haploAccum.R +0279eec6c00e72eba4ceaf40df02e009 *R/heatmapSpp.R +7589015d4d0e23e0e375a20013be8eff *R/is.ambig.R +e1b514b3ba62439d0c880a4fc16da87d *R/localMinima.R +1fc67e9c3058d97ddc13df5d33716480 *R/maxInDist.R +216a150498025dd63a342afaaa2321e8 *R/monophyly.R +464fe43648d24bfc14bfa467e3ddb65a *R/monophylyBoot.R +c3cc009b71dbe65bbe6abd320b5a75d1 *R/nearNeighbour.R +ab6cd5493a4d65160a222f6f2b0f4c87 *R/nonConDist.R +5b6311a4810307002e91df28a1129b31 *R/nucDiag.R +ddfdef735e02ae9536ff0ad3e4d366cc *R/ordinDNA.R +db011b6dc2ec60650faedd89f2f4f3af *R/paa.R +3e4082cf030704b680d31a8124c0542e *R/plot.haploAccum.R +22603e5ef969a93b61cc28616cae1b37 *R/plot.ordinDNA.R +f67592797d2df601b3b5bbc8f9aabd61 *R/plot.slidWin.R +a2241669fe8df719b7c5236be8d14d73 *R/polyBalance.R +81468d47d63834ba47fe57655f67bf06 *R/rankSlidWin.R +06830956b67fa1d383f114c296bc8d5d *R/read.BOLD.R +ceb5f7e91c8f10fe002599df70565fc3 *R/read.GB.R +ffc7ad16cf592a257baa14cd85a77764 *R/rmSingletons.R +01dce3948a335e691f124a160208aad6 *R/rosenberg.R +12f6fa948324572903f7f9ba30676a39 *R/search.BOLD.R +62452208c0f3b1f49b6c33ea7023ab65 *R/seeBarcode.R +29ac3818d61baed21e8e78b5d57730f1 *R/seqStat.R +5781e96e6e54232f5b4bad3e37cbc9c1 *R/slideAnalyses.R +194ed594ea4c76d876523f2840ac582b *R/slideBoxplots.R +03a948a9a4fd88a8dba868b2426b7d7c *R/slideNucDiag.R +74cfd49a4b8abc658fcb8ceb63f7ba03 *R/slidingWindow.R +3511609b716c4584fc450e51f8822014 *R/sppDist.R +9d887177ee46fd110413143c6aaa0773 *R/sppDistMatrix.R +6adee9b3f4f7e03f565c8dc8ab0e806b *R/stats.BOLD.R +bc47e61e1f0c547cd7507e8d41475b2e *R/tajima.K.R +4d17fa0332ad6b57b1d72c10009e8ede *R/tclust.R +cfdccf7e0e59ae4440ceee670cb1b5cf *R/threshID.R +36917b934985d79bdcd18255a7ed07e7 *R/threshOpt.R +3b2a366119a12d521dd36e1089d668af *R/tiporder.R +88d340c437eb3fcbc80d5b27cec8fcb2 *R/titv.R +9c4ccbfa0a9c40518aa0fb9a55c0c680 *R/tree.comp.R +edbf5b74d01751270e51c2440f10a279 *data/anoteropsis.rda +a712abccd48075cbc548da442dfd926f *data/dolomedes.rda +c6688b7a4a356ce5c708bd30cc626f5a *data/sarkar.rda +4e86ccbe285fc2a1af25996cfc8b457f *inst/CITATION +151dc35bb3daf70a1b274c7bb064e9c7 *man/anoteropsis.Rd +9558e43c2f67120fd4c895a037fa1a43 *man/cgraph.Rd +d597bc9189acf85ec1e6392dba999108 *man/chaoHaplo.Rd +768ef7767fe1b49c6d84073a89655990 *man/checkDNA.Rd +daf7029dc23352d4080fe069ef41f1be *man/dataStat.Rd +e7c02820ae571e81c13adb7f068be728 *man/dolomedes.Rd +b35336167fe6b8df5adff3b35f433cbb *man/haploAccum.Rd +f23d99ed5511d13b7e8d9448d0da6112 *man/heatmapSpp.Rd +49de57ec34792547dd95134e3401d29f *man/is.ambig.Rd +060748e190f0e69b41ca5c7fe273cdb9 *man/localMinima.Rd +e48d6963b281b10c9e3475db4b06643b *man/monophyly.Rd +ac12c007eb3d9ffa710f6d30dec9108d *man/nearNeighbour.Rd +a3f99dc4b41a24bbc2ab15b14d5f4122 *man/nonConDist.Rd +7f6d54110e9c5ab22dba09a76250859f *man/nucDiag.Rd +67bc6ecb798844864b7a704b9b61b33d *man/ordinDNA.Rd +33d607edf25a5ffe7c249d7453b93a2e *man/paa.Rd +c51185d75775b4412904cc5612a3bab9 *man/plot.haploAccum.Rd +0f0c7676694d88e8a30c179cd7f9c415 *man/plot.ordinDNA.Rd +1464a0d035a146a7ffc06186f6d888e1 *man/plot.slidWin.Rd +596598619f59d979962f958adf55dd66 *man/polyBalance.Rd +91212b502f4232ab8de1b32b7c7e187b *man/rankSlidWin.Rd +197f3386d00be5bad7999ecf32189d0d *man/read.BOLD.Rd +b04e908ea0703656331dd88d5214012b *man/read.GB.Rd +6a877d3d9110047e4b643eba1dd84cb8 *man/rmSingletons.Rd +d5a18ce2ab6ba74a8298eb9bb909ab8e *man/rosenberg.Rd +e327eac7dd250d34ac50c1a6e8064cc0 *man/sarkar.Rd +6292e0149930ef0ed84547d77e9da426 *man/seeBarcode.Rd +0ee9121c8b61193e56943efd509977f8 *man/seqStat.Rd +a6767e83ec9592f2d9010242c27e0acc *man/slideAnalyses.Rd +0fd14c4652e09ef5618b371b9be57618 *man/slideBoxplots.Rd +ea4e754f069b34fe3fc3d9e63caa9852 *man/slideNucDiag.Rd +53579bb4209920f371ff0387f6785195 *man/slidingWindow.Rd +ca9ac3a2da2faaa5da664ffadbda1822 *man/spider-package.Rd +84d9cd7f8592d23479bd367b572ca17d *man/sppDist.Rd +ba0afcd2a953082fbf615c05e0c88969 *man/sppDistMatrix.Rd +7cb8795e205f98427cf6cce5c5489ea6 *man/sppVector.Rd +37652708d38147697e7bf28e0966d677 *man/tajima.K.Rd +d19607e2d857b9f66ecc031952c79900 *man/tclust.Rd +625a1e3dc27added71281f730f25fa34 *man/threshOpt.Rd +d64605bb7cd5b6a1dc477aca67ac7249 *man/tiporder.Rd +d4433bc014f57a1985367cd049adde85 *man/titv.Rd +e87a692944d243cf97d391354e99b50e *man/tree.comp.Rd diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..1bc18b8 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,14 @@ +# Remove the previous line if you edit this file + +# Export all names +exportPattern("^[^\\.]") + +# Import all packages listed as Imports or Depends +importFrom(ape, dist.dna, base.freq, seg.sites) +importFrom(pegas, haplotype) +importFrom(graphics, plot) + + +#S3 methods +S3method(plot, slidWin) +S3method(plot, haploAccum) diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..6263dc5 --- /dev/null +++ b/NEWS @@ -0,0 +1,66 @@ +############################################ + SPIDER VERSION 1.3-0 + Released 25 December 2013 + + +-- read.GB: Space removed from around the pipe in the default naming scheme. +-- chaoHaplo: 95% confidence interval calculated around the estimated number of haplotypes. +-- New functions: cgraph, ordinDNA and plot.ordinDNA +-- namespace issues resolved + +############################################ + SPIDER VERSION 1.2-0 + Released 17 November 2012 + + +-- Functions added: heatmapSpp +-- read.BOLD: Function completely rewritten to compensate for BOLD deprecating their eFetch system + +############################################ + SPIDER VERSION 1.1-5 + Released 11 October 2012 + + +-- monophyly: Code changed from calling the ape C routine "bipartition" directly, to using prop.part() +-- monophylyBoot: Code changed from calling the ape C routine "bipartition" directly, to using prop.part() +-- tree.comp: Code changed from calling the ape C routine "bipartition" directly, to using prop.part() + +############################################ + SPIDER VERSION 1.1-4 + Released 17 June 2012 + + +-- sarkar: The addition of a dataset containing the dummy sequences published in Sarkar et al to illustrate the different categories of diagnostic nucleotides. + +############################################ + SPIDER VERSION 1.1-3 + Released 29 March 2012 + + +-- rankSlidWin: bug fixed that caused an error when multiple names were given to "criteria" + +############################################ + SPIDER VERSION 1.1-2 + Released 10 March 2012 + + +-- read.GB: modified to work with eFetch version 2.0 (http://www.ncbi.nlm.nih.gov/books/NBK25499/) +-- read.BOLD: help file updated +-- search.BOLD: Modified to work with BOLDsystems v 3.0 +-- Functions added: stats.BOLD + +############################################ + SPIDER VERSION 1.1-1 + Released 27 November 2011 + + +-- tiporder: "labels" option added to toggle between returning the labels (when "labels"=TRUE), or returning the indices (when "labels"=FALSE). + +############################################ + SPIDER VERSION 1.1-0 + Released 6 November 2011 + +-- Initial release onto CRAN +-- No known issues + +############################################ \ No newline at end of file diff --git a/R/bestCloseMatch.R b/R/bestCloseMatch.R new file mode 100644 index 0000000..136ea77 --- /dev/null +++ b/R/bestCloseMatch.R @@ -0,0 +1,15 @@ +bestCloseMatch <- function(distobj, sppVector, threshold = 0.01){ + distobj <- as.matrix(distobj) + diag(distobj) <- NA + output <- rep(NA, length(sppVector)) + aa <- apply(distobj, MARGIN=2, FUN=function(x) which(x == min(x, na.rm = TRUE))) + bb <- lapply(aa, function(x) unique(sppVector[x])) + cc <- sppVector == bb + dd <- sapply(1:length(sppVector), function(x) sppVector[x] %in% bb[[x]]) + ee <- apply(distobj, MARGIN=2, FUN=function(x) min(x, na.rm = TRUE)) + output[which(cc & dd)] <- "correct" + output[which(!cc & !dd)] <- "incorrect" + output[which(!cc & dd)] <- "ambiguous" + output[which(ee > threshold)] <- "no id" + output +} \ No newline at end of file diff --git a/R/cgraph.R b/R/cgraph.R new file mode 100644 index 0000000..b726d39 --- /dev/null +++ b/R/cgraph.R @@ -0,0 +1,8 @@ +cgraph <- function(x, y = NULL, ...){ + if(!is.null(y)) mat <- cbind(x, y) else mat <- x + dd <- dim(mat)[1] + if(dd < 2) return() + if(is.null(dd)) return() + ddComb <- combn(1:dd, 2) + segments(mat[ddComb[1,], 1], mat[ddComb[1,], 2], mat[ddComb[2,], 1], mat[ddComb[2,], 2], ...) +} diff --git a/R/chaoHaplo.R b/R/chaoHaplo.R new file mode 100644 index 0000000..ef6f621 --- /dev/null +++ b/R/chaoHaplo.R @@ -0,0 +1,22 @@ +chaoHaplo <- function(DNAbin){ + haplo <- haplotype(DNAbin) + i <- if(length(grep("[-|?|r|y|m|k|w|s|b|d|h|v|n]", DNAbin)) > 0) message("There are missing or ambiguous data, which may cause an overestimation of the number of haplotypes") + nums <- sapply(attr(haplo, "index"), length) + n <- dim(DNAbin)[1] + h <- length(nums) + s <- length(which(nums == 1)) + d <- length(which(nums == 2)) + #Estimated number of haplotypes (From Vink et al 2011) + if(d > 0) est <- h + ((s^2)/(2 * d)) else est <- h + ((s * (s - 1))/2) + + #Confidence intervals (Modified from Chao 1989) + varest <- est/((h / (est - h)) - n/est) + C <- exp(1.96 * sqrt(log( 1 + (varest / ((est - h)^2))))) + + low <- h + (est - h)/C + high <- h + (est - h) * C + + c(est, low, high) +} + + diff --git a/R/checkDNA.R b/R/checkDNA.R new file mode 100644 index 0000000..0d181db --- /dev/null +++ b/R/checkDNA.R @@ -0,0 +1,8 @@ +checkDNA <- +function(DNAbin, gapsAsMissing = TRUE){ + if(gapsAsMissing) bases <- c(2, 240, 4) else bases <- c(2, 240) + if(is.list(DNAbin)) output <- sapply(DNAbin, function(x) length(which(as.numeric(x) %in% bases))) + if(is.matrix(DNAbin)) output <- apply(DNAbin, MARGIN = 1, FUN = function(x) length(which(as.numeric(x) %in% bases))) +output +} + diff --git a/R/dataStat.R b/R/dataStat.R new file mode 100644 index 0000000..65a4f38 --- /dev/null +++ b/R/dataStat.R @@ -0,0 +1,9 @@ +dataStat <- function(sppVector, genVector, thresh = 5){ +unSpp <- unique(sppVector) +unGen <- unique(genVector) +sppnum <- sapply(unSpp, function(x) length(which(sppVector %in% x))) +tab <- table(NULL) +tab[1:7] <- c(length(unGen), length(unSpp), min(sppnum), max(sppnum), median(sppnum), mean(sppnum), length(which(sppnum < thresh))) +names(tab) <- c("Genera", "Species", "Min", "Max", "Median", "Mean", "Thresh" ) +round(tab, digits=0) +} diff --git a/R/haploAccum.R b/R/haploAccum.R new file mode 100644 index 0000000..f724dcf --- /dev/null +++ b/R/haploAccum.R @@ -0,0 +1,76 @@ +haploAccum<- function (DNAbin, method = "random", permutations = 100, ...){ + + if (is.list(DNAbin)) DNAbin <- as.matrix(DNAbin) # If seq DNAbin is list, turn it matrix + i <- (length(grep("[-|?|r|y|m|k|w|s|b|d|h|v|n]", DNAbin))>0) + message("There are missing or ambiguous data, which may cause an overestimation of the number of haplotypes") + + seq_names<-as.vector(rownames(DNAbin)) # Create a vector of seq name + nms.dat <- deparse(substitute(DNAbin)) # Create a character object from seq DNAbina + rownames(DNAbin) <- NULL # Remove row names + y <- apply(DNAbin, 1, rawToChar) # Translate sequences + n <- length(y) # Number of sequences + keep <- nhaplo <- 1L # To remove? + no <- list(1L) # To remove? + for (i in 2:n) { + already.seen <- FALSE + j <- 1L + while (j <= nhaplo) { + if (y[i] == y[keep[j]]) { + no[[j]] <- c(no[[j]], i) + already.seen <- TRUE + break + } + j <- j + 1L + } + if (!already.seen) { + keep <- c(keep, i) + nhaplo <- nhaplo + 1L + no[[nhaplo]] <- i + } + } + obj <- DNAbin[keep, ] + rownames(obj) <- as.character(as.roman(1:length(keep))) + class(obj) <- c("haplotype", "DNAbin") + attr(obj, "index") <- no + attr(obj, "from") <- nms.dat + n_haplo<-length(no) + z<- matrix(nrow=length(seq_names),ncol=n_haplo,0) + colnames(z)<-as.vector(unlist(attributes(obj)$dimnames[1])) + rownames(z)<-seq_names + for (i in c(1:n_haplo)){ + for (j in c(1:length(as.vector(unlist(attributes(obj)$index[i]))))){ + z[unlist(attributes(obj)$index[i])[j],i]<- 1 + } + } + z <- z[, colSums(z) > 0, drop=FALSE] + n <- nrow(z) + h <- ncol(z) + sequences <- 1:n + if (h == 1) { + z <- t(z) + n <- nrow(z) + h <- ncol(z) + } + accumulator <- function(x, sequences) { + rowSums(apply(x[sequences, ], 2, cumsum) > 0) + } + METHODS <- c("collector", "random") + method <- match.arg(method, METHODS) + haploaccum <- sdaccum <- perm <- NULL + if (n == 1) + message("There is only 1 sequence. No accumulation was possible") + switch(method, collector = { + haploaccum <- accumulator(z, sequences) }, + random = { + perm <- array(dim = c(n, permutations)) + for (i in 1:permutations) { + perm[, i] <- accumulator(z, sample(n)) + } + haploaccum <- apply(perm, 1, mean) + sdaccum <- apply(perm, 1, sd) + }) + out <- list(call = match.call(), method = method, sequences = sequences, + n.haplotypes = haploaccum, sd = sdaccum, perm = perm) + class(out) <- "haploAccum" + out +} diff --git a/R/heatmapSpp.R b/R/heatmapSpp.R new file mode 100644 index 0000000..effd043 --- /dev/null +++ b/R/heatmapSpp.R @@ -0,0 +1,12 @@ +heatmapSpp <- function(distObj, sppVector, col = NULL, axisLabels = NULL){ + if (!is.matrix(distObj)) distObj <- as.matrix(distObj) + + if (is.null(col)) cols <- c("#D33F6A", "#D95260", "#DE6355", "#E27449", "#E6833D", "#E89331", "#E9A229", "#EAB12A", "#E9C037", "#E7CE4C", "#E4DC68", "#E2E6BD") else cols <- col + + if (is.null(axisLabels)) axisLabels <- sppVector[order(sppVector)] else axisLabels <- axisLabels[order(sppVector)] + + image(distObj[order(sppVector), order(sppVector)], col = cols, xaxt = "n", yaxt = "n") + axis(1, at = seq(0, 1, length.out = dim(distObj)[1]), labels = axisLabels, las = 2) + axis(2, at = seq(0, 1, length.out = dim(distObj)[1]), labels = axisLabels, las = 2) + +} diff --git a/R/is.ambig.R b/R/is.ambig.R new file mode 100644 index 0000000..c998d0c --- /dev/null +++ b/R/is.ambig.R @@ -0,0 +1,8 @@ +is.ambig <- +function(DNAbin){ + x <- as.matrix(DNAbin) + bases <- c(136, 72, 40, 24) + ambig <- apply(x, 2, FUN=function(x) sum(as.numeric(!as.numeric(x) %in% bases))) + ambig > 0 +} + diff --git a/R/localMinima.R b/R/localMinima.R new file mode 100644 index 0000000..fa8f380 --- /dev/null +++ b/R/localMinima.R @@ -0,0 +1,10 @@ +localMinima <- function(distobj){ + den <- density(distobj) + a <- rep(NA, length(den$y)-2) + for(i in 2:(length(den$y)-1)) a[i-1] <- den$y[i-1] > den$y[i] & den$y[i+1] > den$y[i] + den$localMinima <- den$x[which(a)] + den$data.name <- deparse(substitute(distobj)) + den$call <- paste("density.default(", den$data.name, ")", sep="") + print(den$localMinima) + invisible(den) +} \ No newline at end of file diff --git a/R/maxInDist.R b/R/maxInDist.R new file mode 100644 index 0000000..f06b9b1 --- /dev/null +++ b/R/maxInDist.R @@ -0,0 +1,14 @@ +maxInDist <- +function(distobj, sppVector = NULL, propZero = FALSE, rmNA = FALSE){ + dat <- as.matrix(distobj) + if(length(sppVector) > 0) dimnames(dat)[[1]] <- sppVector + conSpecDists <- list() + for (i in 1:length(dimnames(dat)[[1]])) { + conSpec <- dimnames(dat)[[1]] == dimnames(dat)[[1]][i] + conSpecDists[[i]] <- max(dat[conSpec, i], na.rm = rmNA) + } + if (propZero) + output <- length(which(unlist(conSpecDists) == 0))/length(unlist(conSpecDists)) + else output <- unlist(conSpecDists) + output +} diff --git a/R/monophyly.R b/R/monophyly.R new file mode 100644 index 0000000..6a20ce0 --- /dev/null +++ b/R/monophyly.R @@ -0,0 +1,26 @@ +monophyly <- +function (phy, sppVector, pp = NA, singletonsMono = TRUE) +{ + res <- list() + xxx <- lapply(unique(sppVector), function(y) which(sppVector == + y)) + sppTab <- sapply(xxx, length) + singletons <- which(sppTab == 1) + nonSingletons <- which(sppTab != 1) + ifelse(is.na(pp), yyy <- prop.part(phy), yyy <- pp) + zzz <- sapply(yyy, length) + defNon <- which(!sppTab %in% zzz) + poss <- which(sppTab %in% zzz) + for (i in poss) { + res[i] <- NA + for (j in 1:length(yyy[which(zzz == sppTab[i])])) res[[i]][j] <- sum(as.numeric(!xxx[[i]] %in% + yyy[which(zzz == sppTab[i])][[j]])) + } + out <- sapply(res, function(x) as.logical(sum(as.numeric(x < 1)))) + if(is.list(out)) out <- rep(singletonsMono, length(singletons)) + out[defNon] <- FALSE + out[singletons] <- singletonsMono + out +} + + diff --git a/R/monophylyBoot.R b/R/monophylyBoot.R new file mode 100644 index 0000000..62354fe --- /dev/null +++ b/R/monophylyBoot.R @@ -0,0 +1,39 @@ +monophylyBoot <- +function (phy, sppVector, DNAbin, thresh = 0.7, reroot = TRUE, pp = NA, singletonsMono = TRUE, reps = 1000, block = 3) +{ + res <- list() + xxx <- lapply(unique(sppVector), function(y) which(sppVector == + y)) + if(reroot){ + testTr <- nj(dist.dna(DNAbin)) + maxInt <- max(testTr$edge.length[testTr$edge[,2] > length(testTr$tip.label)]) + nodeRoot <- testTr$edge[which(testTr$edge.length == maxInt), 2] + boot <- boot.phylo(phy, DNAbin, function(x) root(nj(dist.dna(x)), node = nodeRoot, resolve.root=TRUE), B = reps, block = block)/reps + } else boot <- boot.phylo(phy, DNAbin, function(x) nj(dist.dna(x)), B = reps, block = block)/reps + sppTab <- sapply(xxx, length) + singletons <- which(sppTab == 1) + nonSingletons <- which(sppTab != 1) + ifelse(is.na(pp), yyy <- prop.part(phy), yyy <- pp) + zzz <- sapply(yyy, length) + defNon <- which(!sppTab %in% zzz) + poss <- which(sppTab %in% zzz) + bb <- lapply(sppTab, function(x) boot[which(zzz == x)]) + tt <- lapply(sppTab, function(x) which(zzz == x)) + for(i in poss){ + res[i] <- NA + for(j in 1:length(tt[[i]])){ + res[[i]][j] <- sum(as.numeric(!xxx[[i]] %in% yyy[[ tt[[i]][j] ]])) + } + } + bc <- sapply(res, function(x) which(x == 0)) + bootCheck <- sapply(1:length(bc), function(x) bb[[x]][bc[[x]][1]]) + out <- sapply(res, function(x) as.logical(sum(as.numeric(x < + 1)))) + if(is.list(out)) out <- rep(singletonsMono, length(singletons)) + out[defNon] <- FALSE + out[singletons] <- singletonsMono + out[bootCheck < thresh] <- FALSE + store <- list(results= out, BSvalues = boot) + print(out) + invisible(store) +} diff --git a/R/nearNeighbour.R b/R/nearNeighbour.R new file mode 100644 index 0000000..69de8d1 --- /dev/null +++ b/R/nearNeighbour.R @@ -0,0 +1,7 @@ +nearNeighbour <- function(distobj, sppVector, names = FALSE){ + distobj <- as.matrix(distobj) + diag(distobj) <- NA + aa <- apply(distobj, MARGIN=2, FUN=function(x) which(x == min(x, na.rm = TRUE))) + bb <- sapply(aa, function(x) names(sort(table(sppVector[x]), decreasing=TRUE)[1])) + if(names) as.vector(bb) else as.vector(bb == sppVector) +} diff --git a/R/nonConDist.R b/R/nonConDist.R new file mode 100644 index 0000000..4809781 --- /dev/null +++ b/R/nonConDist.R @@ -0,0 +1,14 @@ +nonConDist <- +function(distobj, sppVector = NULL, propZero = FALSE, rmNA = FALSE){ + distobj <- as.matrix(distobj) + if(length(sppVector) > 0) dimnames(distobj)[[1]] <- sppVector + nonSpecDists <- list() + for(i in 1:length(dimnames(distobj)[[1]])){ + nonSpec <- dimnames(distobj)[[1]] != dimnames(distobj)[[1]][i] + nonSpecDists[[i]] <- min(distobj[nonSpec,i] , na.rm = rmNA) + } +if(propZero) output <- length(which(unlist(nonSpecDists) == 0))/length(unlist(nonSpecDists)) else output <- unlist(nonSpecDists) + +output +} + diff --git a/R/nucDiag.R b/R/nucDiag.R new file mode 100644 index 0000000..ed82b4b --- /dev/null +++ b/R/nucDiag.R @@ -0,0 +1,22 @@ +nucDiag <- function(DNAbin, sppVector){ + DNAbin <- as.matrix(DNAbin) + inform <- seg.sites(DNAbin) + sppSeqs <- lapply(unique(sppVector), function(x) which(sppVector == x)) + + siteCheck <- function(spp, site){ + res <- as.character(DNAbin[spp, site]) %in% as.character(DNAbin[-spp, site]) + #A 'res' of TRUE means that the nucleotide in the sp. is also present in the rest of the spp. + res <- as.logical(sum(as.numeric(res))) + res + } + li <- list() + for(i in 1:length(sppSeqs)){ + li[[i]] <- NA + for(j in 1:length(inform)){ + li[[i]][j] <- siteCheck(sppSeqs[[i]], inform[j]) + } + } + out <- lapply(li, function(x) inform[which(!x)]) + names(out) <- unique(sppVector) + out +} diff --git a/R/ordinDNA.R b/R/ordinDNA.R new file mode 100644 index 0000000..784580f --- /dev/null +++ b/R/ordinDNA.R @@ -0,0 +1,14 @@ +ordinDNA <- function(distobj, sppVector, ...){ + #Conduct Principal Coordinates Analysis + pco <- cmdscale(distobj, length(sppVector) - 1, eig = TRUE) + + ordinDNA <- list(pco = pco, sppVector = sppVector) + + attr(ordinDNA, "class") <- "ordinDNA" + + #Plotting + plot.ordinDNA(ordinDNA, ...) + + #Return object + invisible(ordinDNA) +} diff --git a/R/paa.R b/R/paa.R new file mode 100644 index 0000000..71276c5 --- /dev/null +++ b/R/paa.R @@ -0,0 +1,10 @@ +paa <- +function(data, sppVector){ + singPoly <- function(vec){ + aa <- unique(vec) + bb <- length(aa) + cc <- ifelse(bb == 1, aa, "poly") + cc + } + apply(data, MARGIN = 2, FUN = function(x) tapply(x, sppVector, singPoly)) +} \ No newline at end of file diff --git a/R/plot.haploAccum.R b/R/plot.haploAccum.R new file mode 100644 index 0000000..a9833e2 --- /dev/null +++ b/R/plot.haploAccum.R @@ -0,0 +1,28 @@ +plot.haploAccum <- + function(x, add = FALSE, ci = 2, ci.type = c("bar","line","polygon"), + col = par("fg"), ci.col = col, ci.lty = 1, xlab, + ylab = "Haplotypes", ylim, main = paste(x$method, "method of haplotype accumulation", sep=" "), ...) +{ + xaxvar <- x[["sequences"]] + if (missing(xlab)) xlab <- "Sequences" + ci.type <- match.arg(ci.type) + if (!add) { + if (missing(ylim)) + ylim <- c(1, max(x$n.haplotypes, x$n.haplotypes + ci*x$sd)) + plot(xaxvar, x$n.haplotypes, xlab=xlab, ylab=ylab, ylim=ylim, + type="n", main = main, ...) + } + if (!is.null(x$sd) && ci) + switch(ci.type, + bar = segments(xaxvar, x$n.haplotypes - ci*x$sd, xaxvar, + x$n.haplotypes + ci*x$sd, col=ci.col, lty=ci.lty, ...), + line = matlines(xaxvar, x$n.haplotypes + t(rbind(-ci,ci) %*% x$sd), + col=ci.col, lty=ci.lty, ...), + polygon = polygon(c(xaxvar, rev(xaxvar)), + c(x$n.haplotypes - ci*x$sd, rev(x$n.haplotypes + ci*x$sd)), col=ci.col, + lty=ci.lty, main = main, ...) + ) + lines(xaxvar, x$n.haplotypes,col=col, ...) + invisible() +} + diff --git a/R/plot.ordinDNA.R b/R/plot.ordinDNA.R new file mode 100644 index 0000000..e9b61f9 --- /dev/null +++ b/R/plot.ordinDNA.R @@ -0,0 +1,49 @@ +plot.ordinDNA <- function(x, majorAxes = c(1,2), plotCol = "default", trans = "CC", textcex = 0.7, pchCentroid = FALSE, sppBounds = "net", sppNames = TRUE, namePos = "top", ptPch = 21, ptCex = 0.5, netWd = 1, ...){ + #Colours + if(plotCol[1] == "default") { + plotCol <- c("#D33F6A", "#D95260", "#DE6355", "#E27449", "#E6833D", "#E89331", + "#E9A229", "#EAB12A", "#E9C037", "#E7CE4C", "#E4DC68", "#E2E6BD", + "#8E063B", "#A0323E", "#B04D41", "#C06544", "#CD7B48", "#D8904D", + "#E0A455", "#E7B65E", "#EAC76A", "#EAD577", "#E8E188", "#E2E6BD", + "#023FA5", "#5868AC", "#848DBC", "#A9AECB", "#C8CAD8", "#DDDEE0", + "#E1DDDD", "#D9C6C9", "#CEA5AC", "#BE7E8A", "#A94F64", "#8E063B" + ) + } else plotCol <- rep(plotCol, length(x$sppVector)/length(plotCol)) + transCol <- paste(plotCol, trans, sep = "") + sppVector <- x$sppVector + sppVecFac <- as.factor(sppVector) + sppVecFacNum <- as.numeric(unique(sppVecFac)) + + + #Figure out the centroid positions + mat <- x$pco$points[, majorAxes] + centroids <- aggregate(mat, list(sppVector), mean) + names(centroids) <- c("spp", "x", "y") + #Determine width of circle + maxDist <- function(xx){ + aa <- matrix(xx, ncol = 2) + aa <- rbind(apply(aa, 2, mean),aa) + bb <- as.matrix(dist(aa)) + max(bb[,1]) + } + radius <- sapply(unique(sppVector), function(xx) maxDist(mat[sppVector == xx,])) + radius <- radius[match(sort(names(radius)), names(radius))] + + # net setup + sppPoints <- lapply(unique(sppVector), function(xx) mat[sppVector == xx, , drop = FALSE]) + topPoint <- sapply(sppPoints, function(xx) xx[which.max(xx[,2]), ]) + + #Proportion of variation in each axis + propVar <- round(x$pco$eig/max(cumsum(x$pco$eig)) * 100, 1) + + if(namePos == "top") labRadius <- radius else if(namePos == "bottom") labRadius <- -radius else labRadius <- 0 + + plot(mat[,1], mat[,2], type = "n", asp = 1, xlab = paste("Major axis ", majorAxes[1], " (", propVar[majorAxes[1]], "%)", sep = ""), ylab = paste("Major axis ", majorAxes[2], " (", propVar[majorAxes[2]], "%)", sep = ""), ...) + if(sppBounds == "circles") symbols(centroids[,2], centroids[,3], circles = radius, fg = transCol[as.numeric(sort(unique(sppVecFac)))], bg = transCol[as.numeric(sort(unique(sppVecFac)))], inches = FALSE, add = TRUE) + if(sppBounds == "net") lapply(1:length(sppPoints), function(xx) cgraph(sppPoints[[xx]], col = plotCol[sppVecFacNum[xx]], lwd = netWd)) + if(pchCentroid) points(centroids[,2], centroids[,3], pch = ptPch, bg = plotCol[as.numeric(sort(unique(sppVecFac)))]) + if(sppNames & namePos != "topPoint") text(centroids[,2], centroids[,3] + labRadius, labels = sort(unique(sppVector)), cex = textcex, pos = 3, offset = 0.06) else if(sppNames & namePos == "topPoint") text(topPoint[1, ], topPoint[2, ], labels = unique(sppVector), cex = textcex, pos = 3, offset = 0.06) + points(mat[,1], mat[,2], pch=22, bg = plotCol[as.numeric(sppVecFac)], cex = ptCex) + #~ list(radius, centroids, mat) + #sppPoints +} diff --git a/R/plot.slidWin.R b/R/plot.slidWin.R new file mode 100644 index 0000000..f53c867 --- /dev/null +++ b/R/plot.slidWin.R @@ -0,0 +1,61 @@ +plot.slidWin <- +function(x, outliers = FALSE, ...){ + slidWin <- x + distM <- function(){ + plot(slidWin$pos_out, slidWin$dist_mean_out, xlab = "Window position", ylab = "Distance", main="Mean K2P distance of each window") + ylimzero <- range(slidWin$zero_out, slidWin$dat_zero_out, na.rm=TRUE) + plot(slidWin$pos_out, slidWin$zero_out, ylim = ylimzero, xlab = "Window position", ylab = "Proportion", main="Proportion of zero cells in K2P distance matrix") + abline(h=slidWin$dat_zero_out) + #thres_main <- paste("Number of cells above ", slidWin$thresA, " (black) and below ", slidWin$thresB, " (blue)", sep="") + #ylimthres <- range(slidWin$thres_above_out, slidWin$thres_below_out, na.rm=TRUE) + #plot(slidWin$pos_out, slidWin$thres_above_out, ylim = ylimthres, xlab = "Position", ylab = "Number of cells", main = thres_main) + #points(slidWin$pos_out, slidWin$thres_below_out, col = "blue") + plot(slidWin$pos_out, slidWin$nd_out, xlab = "Window position", ylab = "Number", main = "Sum of diagnostic nucleotides", type="l") + plot(slidWin$pos_out, slidWin$noncon_out, ylim=c(0, max(slidWin$noncon_out)), xlab = "Window position", ylab = "Proportion", main = "Proportion of zero non-conspecific K2P distances") + } + + distT <- function(){ + ylimcomp <- range(slidWin$comp_out, slidWin$comp_depth_out, na.rm=TRUE) + plot(slidWin$pos_tr_out, slidWin$comp_out, ylim = ylimcomp, xlab = "Window position", ylab = "Proportion", main="Congruence of NJ trees") + # points(slidWin$pos_tr_out, slidWin$comp_depth_out, col="blue") + plot(slidWin$pos_tr_out, slidWin$win_mono_out, xlab = "Window position", ylab = "Proportion", main = "Proportion of species that are monophyletic") + } + + if(slidWin$boxplot_out){ + layout(1) + if(slidWin$method == "overall"){ + plot(1, type = "n", xlim = c(1,length(slidWin$bp_out)), ylim = slidWin$bp_range_out, xaxt = "n", main="Boxplot of distance matrix for each window", xlab="Window position", ylab="Distance") + axis(1, at = 1:length(slidWin$bp_out), labels = as.character(slidWin$pos_out)) + for(i in 1:length(slidWin$bp_out)) bxp(slidWin$bp_out[[i]], at=i, add=TRUE, axes=FALSE, whisklty=1, whiskcol="red", staplelty=0, outpch=NA) + }#x11() + if(slidWin$method %in% c("interAll", "nonCon")){ + labs <- c("interAll", "nonCon") + desc <- c("all inter-specific", "closest non-conspecific") + plotTitle <- paste("Boxplots of", desc[which(slidWin$method==labs)], "distances (orange whiskers) and \nintra-specific distances (blue whiskers) in each window", sep=" ") + plot(1, type = "n", xlim = c(1,length(slidWin$bp_InterSpp_out)), ylim = c(0, median(slidWin$bp_range_out)), xaxt = "n", main=plotTitle, xlab="Window position", ylab="Distance") + axis(1, at = 1:length(slidWin$bp_InterSpp_out), labels = as.character(slidWin$pos_out)) + if(outliers){ + for(i in 1:length(slidWin$bp_InterSpp_out)) bxp(slidWin$bp_InterSpp_out[[i]], at=i, add=TRUE, axes=FALSE, whisklty=1, whiskcol="orange", staplelty=0, outpch=20, outcol="orange") + for(i in 1:length(slidWin$bp_IntraSpp_out)) bxp(slidWin$bp_IntraSpp_out[[i]], at=i, add=TRUE, axes=FALSE, whisklty=1, whiskcol="blue", staplelty=0, outpch=20, outcol="blue", outcex=0.75) + } + else{ + for(i in 1:length(slidWin$bp_InterSpp_out)) bxp(slidWin$bp_InterSpp_out[[i]], at=i, add=TRUE, axes=FALSE, whisklty=1, whiskcol="orange", staplelty=0, outpch=NA) + for(i in 1:length(slidWin$bp_IntraSpp_out)) bxp(slidWin$bp_IntraSpp_out[[i]], at=i, add=TRUE, axes=FALSE, whisklty=1, whiskcol="blue", staplelty=0, outpch=NA) + } + } + } + + if(slidWin$distMeasures && slidWin$treeMeasures){ + layout(matrix(1:6, ncol=2)) + distM() + distT() + } else { + if(slidWin$distMeasures) { + layout(1:4) + distM() } + if(slidWin$treeMeasures){ + layout(1:2) + distT()} + } +layout(1) +} diff --git a/R/polyBalance.R b/R/polyBalance.R new file mode 100644 index 0000000..08f024f --- /dev/null +++ b/R/polyBalance.R @@ -0,0 +1,17 @@ +polyBalance <- function(phy){ + nd <- node.depth(phy) + tab <- table(phy$edge[,1]) + ind <- as.numeric(names(tab[which(tab == 2)])) + tips <- nd[phy$edge[phy$edge[, 1] %in% ind, 2]] + nodes <- phy$edge[phy$edge[, 1] %in% ind, 1] + mat <- cbind(nodes, tips) + NNodes <- unique(phy$edge[,1]) + matInd <- unique(mat[,1]) + outMat <- matrix(NA, ncol = 2, nrow = length(NNodes)) + dimnames(outMat)[[1]] <- NNodes + for(i in 1:length(matInd)){ + outMat[NNodes == matInd[i],] <- mat[mat[,1] == matInd[i],2] + } + + outMat +} diff --git a/R/rankSlidWin.R b/R/rankSlidWin.R new file mode 100644 index 0000000..88381c1 --- /dev/null +++ b/R/rankSlidWin.R @@ -0,0 +1,47 @@ +rankSlidWin <- +function(slidWin, criteria = "mean_distance", num = 10){ + revRank <- function(xx) (length(xx)+1) - rank(xx, ties.method="min") + list2df <- function(listObj){ + len <- sapply(listObj, length) + mlen <- min(len) + for(i in 1:length(listObj)) listObj[[i]] <- listObj[[i]][1:mlen] + as.data.frame(listObj) + } + distCriteria <- distLabel <- treeCriteria <- treeLabel <- treeEX <- NULL + if(!slidWin$distMeasures && !slidWin$treeMeasures) stop("Object of class`slidWin' must be created using slideAnalyses") + if(!slidWin$distMeasures && slidWin$treeMeasures) criteria <- "monophyly" + if(slidWin$distMeasures){ + distCriteria <- c("mean_distance", "zero_noncon", "zero_distances", "diag_nuc") + distLabel <- c("dist_mean_out", "noncon_out", "zero_out", "nd_out") + } + if(slidWin$treeMeasures){ + treeCriteria <- c("monophyly", "clade_comparison", "clade_comp_shallow") + treeLabel <- c("win_mono_out", "comp_out", "comp_depth_out") + treeEX <- c("pos_tr_out") + } + measures <- c("position", distCriteria, treeCriteria) + #Remove objects not of interest + excluded <- match(c("dat_zero_out", "boxplot_out", "distMeasures", "treeMeasures", treeEX), names(slidWin)) + dFrame <- list2df(slidWin[-excluded]) + #Reorder and rename dataframe columns + dfOrder <- match(c("pos_out", distLabel, treeLabel), names(dFrame)) + dFrame <- dFrame[ , dfOrder] + names(dFrame) <- measures + #Order rows + high <- match(c("monophyly", "clade_comparison", "clade_comp_shallow", "diag_nuc", "mean_distance"), measures) + highVal <- as.data.frame(lapply(dFrame, revRank)) + dfVals <- highVal + if(slidWin$distMeasures){ + low <- match(c("zero_noncon", "zero_distances"), measures) + lowVal <- as.data.frame(lapply(dFrame, function(x) rank(x, ties.method="min"))) + dfVals[ , low] <- lowVal[ , low] + } + if("all" %in% criteria) rowOrd <- order(apply(dfVals[ , -1], MARGIN=1, FUN=sum)) + else{ + ordNum <- which(measures %in% criteria) + if(length(criteria) > 1) rowOrd <- order(apply(dfVals[ , ordNum], MARGIN=1, FUN=sum)) + else rowOrd <- order(dfVals[ , ordNum]) + } + #Return top 10 + head(dFrame[ rowOrd , ], n = as.integer(num)) +} \ No newline at end of file diff --git a/R/read.BOLD.R b/R/read.BOLD.R new file mode 100644 index 0000000..db6606b --- /dev/null +++ b/R/read.BOLD.R @@ -0,0 +1,35 @@ +read.BOLD <- function(IDs){ + allSeqs <- list() + + for(i in 1:length(IDs)){ + URL <- paste("http://v3.boldsystems.org/index.php/Public_RecordView?processid=", IDs[i], sep = "") + res <- scan(file = URL, what = "", sep = "\n", quiet = TRUE) + #DNA sequence + dnaLineNo <- grep("Locus", res) + dnaBlock <- paste(res[dnaLineNo: (dnaLineNo + 19)], collapse="") + dnaSeq <- strsplit(dnaBlock, split = "
|
")[[1]][2] + gene <- strsplit(strsplit(dnaBlock, split = "td")[[1]][4], split = "<|>")[[1]][2] + + #Taxonomy---Subfamily, genus, species, BIN number + taxLineNo <- grep("TAXONOMY", res) + taxBlock <- paste(res[taxLineNo: (taxLineNo + 30)], collapse="") + taxFields <- strsplit(gsub("\\t", "", taxBlock), split = "|") + sciname <- strsplit(taxFields[[1]][8], split = ">|<")[[1]][29] + BIN <- strsplit(taxFields[[1]][10], split = ">|<")[[1]][31] + + seqs <- as.DNAbin(list(strsplit(tolower(dnaSeq), split="")[[1]])) + nam <- paste(IDs[i], sciname, BIN, sep = "|") + names(seqs) <- nam + attr(seqs, "species") <- sciname + attr(seqs, "BIN") <- BIN + attr(seqs, "accession_num") <- IDs[i] + attr(seqs, "gene") <- gene + allSeqs[[i]] <- seqs + } + collSeqs <- do.call(c, allSeqs) + attr(collSeqs, "species") <- unlist(lapply(allSeqs, function(x) attr(x, "species"))) + attr(collSeqs, "accession_num") <- unlist(lapply(allSeqs, function(x) attr(x, "accession_num"))) + attr(collSeqs, "BIN") <- unlist(lapply(allSeqs, function(x) attr(x, "BIN"))) + attr(collSeqs, "gene") <- unlist(lapply(allSeqs, function(x) attr(x, "gene"))) + collSeqs +} \ No newline at end of file diff --git a/R/read.GB.R b/R/read.GB.R new file mode 100644 index 0000000..2e45ff1 --- /dev/null +++ b/R/read.GB.R @@ -0,0 +1,48 @@ +read.GB <- +function(access.nb, seq.names = access.nb, species.names = TRUE, gene=TRUE, access=TRUE, as.character = FALSE) { + N <- length(access.nb) + nrequest <- N%/%400 + as.logical(N%%400) + X <- character(0) + for (i in 1:nrequest) { + a <- (i - 1) * 400 + 1 + b <- 400 * i + if (i == nrequest) + b <- N + URL <- paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=", + paste(access.nb[a:b], collapse = ","), "&rettype=gb&retmode=text", + sep = "") + X <- c(X, scan(file = URL, what = "", sep = "\n", quiet = TRUE)) + } + FI <- grep("^ {0,}ORIGIN", X) + 1 + LA <- which(X == "//") - 1 + obj <- list() + length(obj) <- N + for (i in 1:N) { + tmp <- gsub("[[:digit:] ]", "", X[FI[i]:LA[i]]) + obj[[i]] <- unlist(strsplit(tmp, NULL)) + } + names(obj) <- seq.names + if (!as.character) + obj <- as.DNAbin(obj) + if (species.names) { + tmp <- character(N) + sp <- grep("ORGANISM", X) + for (i in 1:N) tmp[i] <- unlist(strsplit(X[sp[i]], " +ORGANISM +"))[2] + attr(obj, "species") <- gsub(" ", "_", tmp) + if (gene) { + tmp2 <- character(N) + def <- grep("DEFINITION", X) + for (i in 1:N) tmp2[i] <- unlist(strsplit(X[def[i]], "DEFINITION +"))[2] + attr(obj, "gene") <- gsub(" ", "_", tmp2) + if (access) { + tmp3 <- character(N) + def <- grep("ACCESSION", X) + for (i in 1:N) tmp3[i] <- unlist(strsplit(X[def[i]], "ACCESSION +"))[2] + attr(obj, "accession_num") <- gsub(" ", "_", tmp3) + } + names(obj)<-paste(attr(obj,"accession_num"), "|", attr(obj,"species"), sep = "") + obj +} +} +} + diff --git a/R/rmSingletons.R b/R/rmSingletons.R new file mode 100644 index 0000000..07982c5 --- /dev/null +++ b/R/rmSingletons.R @@ -0,0 +1,4 @@ +rmSingletons <- function(sppVector, exclude = TRUE){ + singletons <- names(which(table(sppVector) == 1)) + if(exclude) which(!sppVector %in% singletons) else which(sppVector %in% singletons) +} \ No newline at end of file diff --git a/R/rosenberg.R b/R/rosenberg.R new file mode 100644 index 0000000..22119ca --- /dev/null +++ b/R/rosenberg.R @@ -0,0 +1,13 @@ +rosenberg <- function(phy){ + RosenbergP_AB <- function(a, b){ + d1 <- choose(a+b, a) + d2 <- a+b-1 + n1 <- 2 + n2 <- 1 + (n1/d1)*(n2/d2) + } + mat <- polyBalance(phy) + lab <- apply(mat, 1, function(x) RosenbergP_AB(x[1], x[2])) + lab +} + diff --git a/R/search.BOLD.R b/R/search.BOLD.R new file mode 100644 index 0000000..48bae5d --- /dev/null +++ b/R/search.BOLD.R @@ -0,0 +1,32 @@ +search.BOLD <- +function(taxon, exhaustive = FALSE){ + space <- grep(" ", taxon) + if(length(space) > 0) taxon[space] <- sapply(space, function(x) paste("\"", taxon[x], "\"", sep="")) + taxon <- paste(taxon, collapse="%20") + taxon <- gsub(" ", "%20", taxon) + if(exhaustive){ + num <- stats.BOLD(taxon) + if(num > 500){ + offsetVals <- 500 * 0:(as.integer(num/500)) + IDs <- list() + for(i in 1:length(offsetVals)){ + URL <- paste("http://v3.boldsystems.org/index.php/Public_Ajax_RecordList?query=", taxon, "&limit=500&offset=", offsetVals[i], sep="") + res <- scan(file = URL, what = "", sep = "\n", quiet = TRUE) + res <- res[grep("selected_processids", res)] + IDs[[i]] <- sapply(strsplit(res, "\""), function(x) x[6]) + } + IDs <- unlist(IDs) + } else exhaustive <- FALSE + } + if(!exhaustive){ + URL <- paste("http://v3.boldsystems.org/index.php/Public_Ajax_RecordList?query=", taxon, "&limit=500&offset=0", sep="") + res <- scan(file = URL, what = "", sep = "\n", quiet = TRUE) + res <- res[grep("selected_processids", res)] + IDs <- sapply(strsplit(res, "\""), function(x) x[6]) + } + if(length(IDs) == 0) stop("No taxa with that name found on BOLD") + else if(length(IDs) == 500){ + warning("Maximum number of records reached. Try using 'exhaustive = TRUE'") + IDs + } else IDs +} \ No newline at end of file diff --git a/R/seeBarcode.R b/R/seeBarcode.R new file mode 100644 index 0000000..025fb76 --- /dev/null +++ b/R/seeBarcode.R @@ -0,0 +1,10 @@ +seeBarcode <- +function(seq, col=c("green", "blue", "black", "red")){ + if(!is.null(dim(seq))) if(dim(seq)[1] > 1) stop("Single sequences only please!") + bases <- c(136, 40, 72, 24) + pos <- 1:length(seq) + ind <- match(as.numeric(seq), bases) + plot(1, 1, xlim=c(0,max(pos)), ylim=c(0,1), xaxt="n", yaxt="n", xlab=NA, ylab=NA, + bty="n", type="n", main=dimnames(seq)[[1]]) + abline(v=pos, col=col[ind]) +} \ No newline at end of file diff --git a/R/seqStat.R b/R/seqStat.R new file mode 100644 index 0000000..8ae6d8a --- /dev/null +++ b/R/seqStat.R @@ -0,0 +1,7 @@ +seqStat <- function(DNAbin, thresh = 500){ + rr <- sapply(DNAbin, FUN = function(x) length(x) - checkDNA(x, gapsAsMissing=TRUE)) + tab <- table(NULL) + tab[1:5] <- c(min(rr), max(rr), mean(rr), median(rr), length(which(rr < thresh))) + names(tab) <- c("Min", "Max", "Mean", "Median", "Thresh") + round(tab, digits=0) +} diff --git a/R/slideAnalyses.R b/R/slideAnalyses.R new file mode 100644 index 0000000..aa97c66 --- /dev/null +++ b/R/slideAnalyses.R @@ -0,0 +1,51 @@ +slideAnalyses <- +function(DNAbin, sppVector, width, interval = 1, distMeasures = TRUE, treeMeasures = FALSE){ + #Produce distance matrix, number of zero cells of full sequence + boxplot_out <-FALSE + dat <- as.matrix(DNAbin) + dimnames(dat)[[1]] <- sppVector + datd <- dist.dna(dat, pairwise.deletion = TRUE) + dat_zero_out <- sum(as.numeric(datd == 0))/length(datd) + #Create the windows + win <- slidingWindow(dat, width, interval = interval) + pos_out <- sapply(win, function(x) attr(x, "window")[1]) + win_dist <- lapply(win, function(x) dist.dna(x, pairwise.deletion = TRUE)) + #Distance metrics + if(distMeasures){ + #Mean of distance matrix + dist_mean_out <- sapply(win_dist, function(x) mean(x, na.rm=TRUE)) + #Number of zero cells + zero_out <- sapply(win_dist, function(y) sum(as.numeric(y == 0), na.rm=TRUE)/length(y)) + ################## + #Threshold measures REMOVED + #thres_above_out <- sapply(win_dist, function(x) sum( as.numeric(x >= thresA) ) ) + #thres_below_out <- sapply(win_dist, function(x) sum( as.numeric(x <= thresB) ) ) + ################## + #Diagnostic nucleotides + nd_out <- slideNucDiag(DNAbin, sppVector, width, interval) + nd_out <- colSums(nd_out) + #Nearest non-conspecific distance + noncon_out <- sapply(win_dist, function(x) nonConDist(x, propZero = TRUE, rmNA=TRUE)) + + } + if(treeMeasures){ + #Produce NJ tree of full sequence + dat_tr <- nj(datd) + depth <- which(node.depth(dat_tr)[node.depth(dat_tr) > 1] <= median(node.depth(dat_tr)[node.depth(dat_tr) > 1])) + #Remove windows with NA distances + dist_na <- sapply(win_dist, function(x) sum( as.numeric( is.na(x) ) ) ) + pos_tr_out <- pos_out[ dist_na < 1 ] + win_tr <- lapply(win_dist[ dist_na < 1 ], nj) + #Comparing clade composition with full alignment + comp_out <- sapply(win_tr, function(x) tree.comp(dat_tr, x)) + comp_depth_out <- sapply(win_tr, function(x) tree.comp(dat_tr, x, method="shallow")) + #Monophyly + win_mono <- lapply(win_tr, function(x) monophyly(x, sppVector)) + win_mono_out <- sapply(win_mono, function(x) length(which(x))/length(x)) + } +rm(list = ls()[!ls() %in% c(ls(pattern="_out"), ls(pattern="res"))]) +output <- as.list(environment()) +class(output) <- "slidWin" +output +} + diff --git a/R/slideBoxplots.R b/R/slideBoxplots.R new file mode 100644 index 0000000..7f270f6 --- /dev/null +++ b/R/slideBoxplots.R @@ -0,0 +1,38 @@ +slideBoxplots <- +function(DNAbin, sppVector, width, interval = 1, method="nonCon"){ + #Produce distance matrix, number of zero cells of full sequence + boxplot_out <- TRUE + dat <- as.matrix(DNAbin) + dimnames(dat)[[1]] <- sppVector + datd <- dist.dna(dat, pairwise.deletion = TRUE) + #Create the windows + win <- slidingWindow(dat, width-1, interval = interval) + pos_out <- sapply(win, function(x) attr(x, "window")[1]) + win_dist <- lapply(win, function(x) dist.dna(x, pairwise.deletion = TRUE)) + bp_range_out <- range(win_dist,na.rm=TRUE) + if(method == "overall"){ + #Boxplots of overall distance matrix + bp_out <- lapply(win_dist, function(x) boxplot(x, plot=FALSE)) + } + if(method == "interAll"){ + #Boxplots of intra- and inter-specific distances + spp_dist <- lapply(win_dist, function(x) sppDist(x, sppVector)) + bp_InterSpp_out <- lapply(spp_dist, function(x) boxplot(x$inter, plot=FALSE)) + bp_IntraSpp_out <- lapply(spp_dist, function(x) boxplot(x$intra, plot=FALSE)) + } + if(method == "nonCon"){ + #Boxplots of closest non-conspecific distance + spp_dist <- lapply(win_dist, function(x) sppDist(x, sppVector)) + bp_IntraSpp_out <- lapply(spp_dist, function(x) boxplot(x$intra, plot=FALSE)) + spp_nonCon <- lapply(win_dist, nonConDist) + bp_InterSpp_out <- lapply(spp_nonCon, function(x) boxplot(x, plot=FALSE)) + bp_range_out <- c(0, max(unlist(spp_nonCon), na.rm = TRUE)) + } + +rm(list = ls()[!ls() %in% c(ls(pattern="_out"), ls(pattern="thod"))]) +distMeasures <- FALSE +treeMeasures <- FALSE +output <- as.list(environment()) +class(output) <- "slidWin" +output +} diff --git a/R/slideNucDiag.R b/R/slideNucDiag.R new file mode 100644 index 0000000..35d8c3a --- /dev/null +++ b/R/slideNucDiag.R @@ -0,0 +1,11 @@ +slideNucDiag <- +function(DNAbin, sppVector, width, interval = 1){ + nd <- nucDiag(DNAbin, sppVector) + if(interval == "codons") interval <- 3 + win <- seq(1, dim(DNAbin)[2] - width, by = interval) + mat <- matrix(NA, nrow = length(nd), ncol = length(win)) + for(i in 1:length(win)) mat[ ,i] <- sapply(nd, function(x) length(which(x %in% win[i]:(win[i] + width)))) + dimnames(mat)[[1]] <- unique(sppVector) + mat +} + diff --git a/R/slidingWindow.R b/R/slidingWindow.R new file mode 100644 index 0000000..4ac9cc3 --- /dev/null +++ b/R/slidingWindow.R @@ -0,0 +1,16 @@ +slidingWindow <- +function(DNAbin, width, interval = 1){ + annote <- function(x){ + width <- width - 1 + y <- align[ , x:(x+width)] + attr(y, "window") <- c(x, x+width) + y + } + if(interval == "codons") interval <- 3 + align <- as.matrix(DNAbin) + len <- dim(align)[[2]] + iter <- seq(1, len-width, by = interval) + li <- lapply(iter, function(x) annote(x)) + li +} + diff --git a/R/sppDist.R b/R/sppDist.R new file mode 100644 index 0000000..6a02bb1 --- /dev/null +++ b/R/sppDist.R @@ -0,0 +1,16 @@ +sppDist <- +function(distobj, sppVector){ + distobj <- as.matrix(distobj) + attr(distobj, "dimnames")[[1]] <- sppVector + taxa <- unique(sppVector) + intra <- list() + inter <- list() + for(i in 1:length(taxa)){ + for(j in 1:length(taxa)){ + sppMat <- distobj[which(dimnames(distobj)[[1]] == taxa[i]), which(dimnames(distobj)[[1]] == taxa[j])] + if(taxa[i] == taxa[j]) intra[[length(intra)+1]] <- sppMat[lower.tri(sppMat)] else inter[[length(inter)+1]] <- sppMat + } + } +list(inter = unlist(inter), intra = unlist(intra)) +} + diff --git a/R/sppDistMatrix.R b/R/sppDistMatrix.R new file mode 100644 index 0000000..009615b --- /dev/null +++ b/R/sppDistMatrix.R @@ -0,0 +1,15 @@ +sppDistMatrix <- +function(distobj, sppVector){ + dat <- as.matrix(distobj) + attr(dat, "dimnames")[[1]] <- sppVector + taxa <- unique(sppVector) + pair.mat <- matrix(data = NA, nrow = length(taxa), ncol = length(taxa), dimnames = list(one = taxa,two = taxa)) + for(i in 1:length(taxa)){ + for(j in 1:length(taxa)){ + sppMat <- dat[which(dimnames(dat)[[1]] == taxa[i]), which(dimnames(dat)[[1]] == taxa[j])] + if(taxa[i] == taxa[j]) pair.mat[i,j] <- mean(sppMat[lower.tri(sppMat)]) else pair.mat[i,j] <- mean(sppMat) + } + } +pair.mat +} + diff --git a/R/stats.BOLD.R b/R/stats.BOLD.R new file mode 100644 index 0000000..5088f6b --- /dev/null +++ b/R/stats.BOLD.R @@ -0,0 +1,11 @@ +stats.BOLD <- +function(taxon){ + space <- grep(" ", taxon) + if(length(space) > 0) taxon[space] <- sapply(space, function(x) paste("\"", taxon[x], "\"", sep="")) + taxon <- paste(taxon, collapse="%20") + taxon <- gsub(" ", "%20", taxon) + URL <- paste("http://v3.boldsystems.org/index.php/Public_SearchTerms?query=", taxon, sep="") + res <- scan(file = URL, what = "", sep = "\n", quiet = TRUE) + res <- res[grep("totalRecords", res)] + as.numeric(gsub("[^[:digit:]]", "", res)) +} \ No newline at end of file diff --git a/R/tajima.K.R b/R/tajima.K.R new file mode 100644 index 0000000..866edf6 --- /dev/null +++ b/R/tajima.K.R @@ -0,0 +1,7 @@ +tajima.K <- +function(DNAbin, prop = TRUE){ + res <- mean(dist.dna(DNAbin, model="N")) + if(prop) res <- res/dim(DNAbin)[2] + res +} + diff --git a/R/tclust.R b/R/tclust.R new file mode 100644 index 0000000..6e43a20 --- /dev/null +++ b/R/tclust.R @@ -0,0 +1,10 @@ +tclust <- function(distobj, threshold = 0.01){ + distobj <- as.matrix(distobj) + out <- unique(apply(distobj, 2, function(x) which(x < threshold))) + for(i in 1:length(out)){ + for(j in 1:length(out)){ + if(length(intersect(out[[i]], out[[j]])) > 0) out[[i]] <- union(out[[i]], out[[j]]) else out[[i]] <- out[[i]] + } + } + unique(lapply(out, sort)) +} \ No newline at end of file diff --git a/R/threshID.R b/R/threshID.R new file mode 100644 index 0000000..eca4666 --- /dev/null +++ b/R/threshID.R @@ -0,0 +1,33 @@ +threshID <- function(distobj, sppVector, threshold = 0.01){ + distobj <- as.matrix(distobj) + diag(distobj) <- NA + output <- rep(NA, length(sppVector)) + aa <- apply(distobj, MARGIN=2, FUN=function(x) which(x < threshold)) + if(is.matrix(aa)) aa <- c(unname(as.data.frame(aa))) else aa + bb <- lapply(aa, function(x) unique(sppVector[x])) + cc <- sppVector == bb + dd <- sapply(1:length(sppVector), function(x) sppVector[x] %in% bb[[x]]) + ee <- apply(distobj, MARGIN=2, FUN=function(x) min(x, na.rm = TRUE)) + output[which(cc & dd)] <- "correct" + output[which(!cc & !dd)] <- "incorrect" + output[which(!cc & dd)] <- "ambiguous" + output[which(ee > threshold)] <- "no id" + output +} + +###################### +#Return the name of the result + +#~ idBOLD <- function(distobj, sppVector, threshold = 0.01, highThreshold = 0.05){ + #~ distobj <- as.matrix(distobj) + #~ diag(distobj) <- NA + #~ output <- rep(NA, length(sppVector)) + #~ for(i in 1:length(sppVector)){ + #~ high <- sppVector[which(distobj[,i] < highThreshold)] + #~ low <- sppVector[which(distobj[,i] < threshold)] + + #~ if(length(high) == 0) output[i] <- NA else { + #~ if(length(unique(low)) == 1) output[i] <- unique(low) else output[i] <- FALSE} + #~ } + #~ output +#~ } diff --git a/R/threshOpt.R b/R/threshOpt.R new file mode 100644 index 0000000..5b320fb --- /dev/null +++ b/R/threshOpt.R @@ -0,0 +1,27 @@ +threshOpt <- function(distobj, sppVector, threshold = 0.01){ + distobj <- as.matrix(distobj) + #individual (diag) values excluded + diag(distobj) <- NA + #indices of singletons + singletons <- rmSingletons(sppVector, exclude=FALSE) + #gets vector of column rows of all non-singletons + ZZ <- rmSingletons(sppVector, exclude=TRUE) + #run the sensitivity loop - singleton species (MU) excluded from iteration - only uses ZZ for query + #False positive - no matches within x% of query --- as the "NA" value + #Positive - only 1 sp. within x% of query --- as the "TRUE" value; + #False negative - more than 1 spp. within x% of query --- as the "FALSE" value + #remember for 0, we need to say ==0 rather than <0 + OUT <- NULL + for(i in 1:dim(distobj)[1]) { + inThresh <- sppVector[which(distobj[,i] < threshold)] + if(length(inThresh) == 0 && i %in% singletons) OUT[i] <- "True neg" else { + if(length(inThresh) == 0 && !i %in% singletons) OUT[i] <- "False pos" else{ + if(length(unique(inThresh)) == 1 && unique(inThresh) == sppVector[i]) OUT[i] <- "True pos" else{ + if(length(unique(inThresh)) == 1 && !unique(inThresh) == sppVector[i]) OUT[i] <- "False neg" else OUT[i] <- "False neg"} + }}} + OUT <- factor(OUT, levels=c("True neg", "True pos", "False neg", "False pos")) + tab <- table(OUT) + tab <- c(threshold, tab, sum(tab[3], tab[4])) + names(tab)[c(1,6)] <- c("Threshold", "Cumulative error") + tab +} diff --git a/R/tiporder.R b/R/tiporder.R new file mode 100644 index 0000000..abf0e7d --- /dev/null +++ b/R/tiporder.R @@ -0,0 +1,6 @@ +tiporder <- function(phy, labels = TRUE){ + nn <- length(phy$tip.label) #How many tips on the tree? + edge <- phy$edge + nums <- rev(edge[edge[,2] %in% 1:nn, 2]) + if(labels == TRUE) phy$tip.label[nums] else nums +} diff --git a/R/titv.R b/R/titv.R new file mode 100644 index 0000000..c0a45e5 --- /dev/null +++ b/R/titv.R @@ -0,0 +1,14 @@ +titv <- +function(DNAbin){ +mat<-as.matrix(DNAbin) +res<-matrix(NA, ncol=dim(mat)[1], nrow=dim(mat)[1], dimnames=list(x=names(DNAbin), y=names(DNAbin))) +for(i in 1:(dim(mat)[1] - 1)){ +for(j in (i+1):dim(mat)[1]){ +vec<-as.numeric(mat[i,])+as.numeric(mat[j,])-8 +res[j,i]<-sum(!is.na(match(vec,c(200,56))))#Transitions +res[i,j]<-sum(!is.na(match(vec,c(152,168,88,104))))#Transversions +} +} +res +} + diff --git a/R/tree.comp.R b/R/tree.comp.R new file mode 100644 index 0000000..1dd5c8b --- /dev/null +++ b/R/tree.comp.R @@ -0,0 +1,43 @@ +tree.comp <- +function (phy1, phy2, method = "prop") { + phy1 <- unroot(phy1) + phy2 <- unroot(phy2) + nphy1 <- length(phy1$tip.label) + bp1 <- prop.part(phy1) + bp1 <- lapply(bp1, function(xx) sort(phy1$tip.label[xx])) + nphy2 <- length(phy2$tip.label) + bp2.tmp <- prop.part(phy2) + bp2 <- lapply(bp2.tmp, function(xx) sort(phy2$tip.label[xx])) + bp2.comp <- lapply(bp2.tmp, function(xx) setdiff(1:nphy2, xx)) + bp2.comp <- lapply(bp2.comp, function(xx) sort(phy2$tip.label[xx])) + q1 <- length(bp1) + q2 <- length(bp2) + p <- 0 + for (i in 1:q1) { + for (j in 1:q2) { + if (identical(bp1[[i]], bp2[[j]]) | identical(bp1[[i]], + bp2.comp[[j]])) { + p <- p + 1 + break + } + } + } + if(method == "shallow"){ + shallow <- which(node.depth(phy1)[node.depth(phy1) > 1] <= median(node.depth(phy1)[node.depth(phy1) > 1])) + p2 <- 0 + for (i in shallow) { + for (j in 1:q2) { + if (identical(bp1[[i]], bp2[[j]]) | identical(bp1[[i]], + bp2.comp[[j]])) { + p2 <- p2 + 1 + break + } + } + } + dT <- p2/length(shallow) + } + if(method == "PH85") dT <- q1 + q2 - 2 * p + if(method =="prop") dT <- p/q1 +dT +} + diff --git a/data/anoteropsis.rda b/data/anoteropsis.rda new file mode 100644 index 0000000000000000000000000000000000000000..5326baea5d4ba11f611873b31e212ef2475992c7 GIT binary patch literal 1323 zcmV+`1=RWuJXu)cEhd9^Pv%p6Jr1_VfOi+VEN z9cvwDuoLzb{0cnEz8s?CpYG~*x2v7*nE5nu99O5ls;{a$jy-vQ@%hh(pC3k1bP}CB zx)+_C^4-M=-`wMmU-{z^D>mD-+}u`K6-B3U6#eyotpS=pgv1+l+#*S0GyEyy+Tz+7 z4iqB*0PhIm7#jVdce(cp(7A7({V8TZ(?X($Kx_oUP;?q0_aJ-MpMs8of`bPzzY}W5 za^H!0K{*H^6e_bxGoV0I#Es;jwvlXOnu1KTmV?Be!@Ce%g&OdRRLCsoG;tP|EXP4+ zA+xa<0%Xyp(Qiyyj)R;RBjtinSy1gmN=#GOJIWEHP?843rk#kjW79f!l!CoFY3(QwyTj@#2mFO1;ZH#}%z;yH3Y>#9l7Y)I=r}hMw4Exo zYnsA)qBDgNN|prb!6&!BP^J=0gJUgx;508l=x<;QSPSTDp=vhoF$D4tI_$)Ddc7(Tt?< zkKq^|K%MJkHK(po1GNHtpr{v2YxEL&9hwKU2^*HQsX~GN(sw0?{f!*tHI`2SPn+!- zsI-H90jC87d)S{x#iZLwS%~ZM_!||@hnAWGAonpo$sF+d0bPLv>AqzGL@D#@Z#@OM z@kdsZF82%JBrT2J38mCw)Q7UL6+Oz$ZAPEgt`kDo3d_7`cpWq zvU=>d08@y3wDE~y@1QI| zeoT=b=mi9kK&{<>u~SUB>@f0WwKb4I9yTxV(E!UJY9iweS9d||3|M%jmt_6@77$wr z+$km~S&NO5rGFMy0d*y<2v;mC=vsJFI0Xszl{SP9AbUpc;9l~l7~E%@e7^56F9@Gx z(~>ceX_7%Y#GroHYj?tZJ95cl2F#EJ>EYlt${mMO1Ry`M!zl>uL38N3d zN;F?K^cV(By3k+i(&`5WS?LFKk3$M_07aqww(0abTl1&inx8*K(F<<*rVV+(hZJm` zR($_yH|}vewy{4sE4Q06W6QsD_BC6xVpnyR$GxRWZ&}HGObo=Q7zxR8n=HW?4K@0M%bR;W`#NOGA**N>>swgJRi5eIX8K}%F}9_ zb(Uw?vSLLWlf}5*F)nLoe2$I#8Ypl5U9oP;ioeKA(cG{+XQ|2OO_p=IO4i3{J$p4~ z6$%)wA6}ah&5CN**iFWqt#;K`8WrX9jQ1VOF|Wmb-ZMYN_VuRNGEsDeck?fAMcxpp zNki`Q_QqJzinqdwEk9k17|s2;E5Gb+Sh>wCxk&rl*W-3kzdx}3Ki-5nE3SD2-1J?W h{P$NmtEz6s_XmHuxIDYcigtMP{RhNmp?*R+006>4fhqt1 literal 0 HcmV?d00001 diff --git a/data/dolomedes.rda b/data/dolomedes.rda new file mode 100644 index 0000000000000000000000000000000000000000..70b31b7221e4febd7b30e4b21b3f8e59db316669 GIT binary patch literal 1447 zcmV;Y1z7qYiwFP!000001Kpj?ZWBoqg_~qRGmIjE&`5rPk-YRzcLGZ!FIcf)#nyw7 zCK5?BlVQUX_!)T^M5VjSj>~QP>#oz+{9F}T_j|tj!uVOTP0+Hbc(sF1evu8ZEy;OCN-T7I!)1?nKXnQe^N z?-GiEvO9#~5ekeVbsF=YYj=R9`UKe?P1g>(C5D7|Z^uqR7NAJTC!LwbbUGl%QL7kd zr=!zZpo^sQ3DI@ZR8C@<^k6-cs4U1Ql$~)TotdkEh&NgjiiCVJp1V$w8Z%e>G#rjx zqG7H*Y&ry#U5d3kS?J8v0hI*_PqDyAlkp%RIs~Siz_>F{md?1xb#%HxXpeQ^<-68R zi)yFK4w<L zGIku%=}=ZLtmV+34yq&L>`)|KXF{J$=}s0Yj=8ROC_6-lEI@P!y6g~)v(wR`NGW!v zPPj3SE4;T9Hqt2vVYt&QZ?SMiBTc%d z)6prL1po;*j%19OlKxM!OL6XW!zpxgm1py}gK~|UW|!Iz&MKftI^n1@b&+(!P0dX6 zPhB7j9?e(eGcjGO7}FWg9il^6u3z7`iwh}&XcDk>h{^)9>!hh`Bm|V5t}~5+%xvv= zod1Z<*e9KEZf40^Q?GQiQO zGKR64J!jJ&Qp^r%I@z7s%iM(}TK`)|x7Hu~;-60D`jT!`7160$Whi|Xy|)tvO|)q-sn;kNw-yy1;#$1>~xxRX6k^lLxZ66>14;;7^V0_*#&QXeZIK9 zaUU+qcgN1MKi$cP;F4e6CAagNhd0O0Ue(vVsIINvJ^SYR_U_-=XfzEDrsLpXa#r{-4`CbD#AxNYx2t=?l`<^`o>+AhwP!gya*4j zf`cN=pzuLoF79p@b4Tq;G@d%fRjm38(>ypB`&bwL3ir@r6h7=KI4CCJ!K#?|pZ=bI zb$R)RysQrfKhe=I zCLV1i1RGz|yS~?VmtK0wuW52k03d*1LouXd6i}E!1WU>bCEJH=0YW79W)4WuOtXOt zS|faQK|o>sriQ>YqHr*xu)cwhpdHY3VWxAh_7z$no4hu>S2d-z83-Vzy_;BKvNW4gSzv7&$S6Nhb*&IAt>d}t>v@FYh`i->Iqxou2#JkBX^WrI$asv!MmY?~p QYOEc)56B#!j^hCU0NV0juK)l5 literal 0 HcmV?d00001 diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..d08ee13 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,13 @@ +citHeader("To cite spider in a publication, please use:") + +citEntry(entry="Article", + title = "{SPIDER}: an {R} package for the analysis of species identity and evolution, with particular reference to {DNA} barcoding", + author = personList(as.person("Samuel D. J. Brown"),as.person("Rupert A. Collins"), as.person("Stephane Boyer"), as.person("Marie-Caroline Lefort"), as.person("Jagoba Malumbres-Olarte"), as.person("Cor J. Vink"), as.person("Robert H. Cruickshank")), + journal = "Molecular Ecology Resources", + year = 2012, + volume = 12, + pages = "562--565", + + textVersion = "Brown S. D. J., Collins R. A., Boyer S., Lefort M.-C., Malumbres-Olarte J., Vink C. J., & Cruickshank R. H. 2012. SPIDER: an R package for the analysis of species identity and evolution, with particular reference to DNA barcoding. Molecular Ecology Resources 12:562-565") + +citFooter("As spider is developing rapidly, you may want to cite also its version number (found with 'library(help = spider)').") \ No newline at end of file diff --git a/man/anoteropsis.Rd b/man/anoteropsis.Rd new file mode 100644 index 0000000..50f77be --- /dev/null +++ b/man/anoteropsis.Rd @@ -0,0 +1,17 @@ +\name{anoteropsis} +\docType{data} +\alias{anoteropsis} +\title{Cytochrome oxidase I (COI) sequences of New Zealand _Anoteropsis_ species} + +\description{A set of 33 sequences of the mitochondrial protein-coding gene cytochrome oxidase I from 20 species of the New Zealand wolf spider genus \emph{Anoteropsis} (Lycosidae) and two species of \emph{Artoria} as outgroups. The sequences are available on GenBank as accession numbers AY059961 through AY059993.} + +\usage{anoteropsis} + +\format{A DNAbin object containing 33 sequences with a length of 409 base pairs stored as a matrix.} + +\source{ +Vink, C. J., and Paterson, A. M. (2003). Combined molecular and morphological phylogenetic analyses of the New Zealand wolf spider genus _Anoteropsis_ +(Araneae: Lycosidae). _Molecular Phylogenetics and Evolution_ *28* 576-587. +} + +\keyword{Datasets} \ No newline at end of file diff --git a/man/cgraph.Rd b/man/cgraph.Rd new file mode 100644 index 0000000..1715ee5 --- /dev/null +++ b/man/cgraph.Rd @@ -0,0 +1,47 @@ +\name{cgraph} +\alias{cgraph} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Complete graph} + +\description{ +Creates a complete graph for the given cloud of vertices. +} + +\usage{cgraph(x, y, ...)} +%- maybe also 'usage' for other objects documented here. + +\arguments{ + \item{x}{X values, or a matrix with two columns containing X and Y values.} + \item{y}{Y values. Can be left empty if \code{x} is a matrix.} + \item{...}{Other arguments to be passed to \code{\link{segments}}.} +} + +\details{ +If \code{y} is not given, \code{x} is required to be a matrix containing both x and y values. +} + +\value{Plots a complete graph between the given vertices.} + +\author{Samuel Brown } + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{plot.ordinDNA}}. +} + +\examples{ +x <- runif(15) +y <- runif(15) + +plot(x, y) +cgraph(x, y) + +M <- cbind(x, y) +cgraph(M[1:10,], col = "blue") + +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Visualisation} diff --git a/man/chaoHaplo.Rd b/man/chaoHaplo.Rd new file mode 100644 index 0000000..27080a0 --- /dev/null +++ b/man/chaoHaplo.Rd @@ -0,0 +1,49 @@ +\name{chaoHaplo} +\alias{chaoHaplo} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Chao estimator of haplotype number +} +\description{ +Calculates the Chao1 estimate of the number of haplotypes in a population based on the total number of haplotypes present, and the number of singletons and doubletons in the dataset. +} +\usage{ +chaoHaplo(DNAbin) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{ +An object of class `DNAbin'. +} +} +\details{ +The function assumes a large number of specimens have been sampled and that duplicate haplotypes have not been removed. Interpretation becomes difficult when more than one species is included in the dataset. +} +\value{ +An vector of length three, giving the estimated total number of haplotypes in the population, and lower and upper 95\% confidence limits. +} +\references{ +Vink, C. J., McNeill, M. R., Winder, L. M., Kean, J. M., and Phillips, C. B. (2011). PCR analyses of gut contents of pasture arthropods. In: Paddock to PCR: Demystifying Molecular Technologies for Practical Plant Protection (eds. Ridgway, H. J., Glare, T. R., Wakelin, S. A., O'Callaghan, M.), pp. 125-134. New Zealand Plant Protection Society, Lincoln. + +Chao, A. (1989). Estimating population size for sparse data in capture-recapture experimnets. _Biometrics_ *45* 427-438. +} +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{haploAccum}} +} +\examples{ +data(dolomedes) +#Create dataset with multiple copies of Dolomedes haplotypes +doloSamp <- dolomedes[sample(16, 100, replace=TRUE, prob=c(0.85, rep(0.01, 15))), ] + +chaoHaplo(doloSamp) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} diff --git a/man/checkDNA.Rd b/man/checkDNA.Rd new file mode 100644 index 0000000..9d63c33 --- /dev/null +++ b/man/checkDNA.Rd @@ -0,0 +1,37 @@ +\name{checkDNA} +\alias{checkDNA} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Check a DNA alignment for missing data} + +\description{This functions counts the number of bases in an alignment that are composed of missing data.} + +\usage{checkDNA(DNAbin, gapsAsMissing = TRUE)} + + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{A DNA alignment of class `DNAbin'.} + \item{gapsAsMissing}{Logical. Should gaps (coded as '-') be considered missing bases? Default of TRUE.} +} + +\details{ +This function considers bases coded as '?' and 'N' as missing data. By default, gaps (coded as '-') are also considered missing. +} + +\value{ +A numeric vector giving the number of missing bases in each sequence of the alignment. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\examples{ +data(anoteropsis) +checkDNA(anoteropsis) +checkDNA(anoteropsis, gapsAsMissing=FALSE) +} + +\keyword{Utilities} \ No newline at end of file diff --git a/man/dataStat.Rd b/man/dataStat.Rd new file mode 100644 index 0000000..3a7e20e --- /dev/null +++ b/man/dataStat.Rd @@ -0,0 +1,56 @@ +\name{dataStat} +\alias{dataStat} + +\title{ +Taxa statistics +} + +\description{ +Returns the numbers of species, genera and individuals in the dataset. +} + +\usage{dataStat(sppVector, genVector, thresh) +} + +\arguments{ + \item{sppVector}{ +Species vector (see \code{\link{sppVector}}). +} + +\item{genVector}{ +Genus vector that defines the genera of each individual, created in a similar manner to the species vector. +} + + \item{thresh}{ +Threshold for adequate individual/species number. Default of 5. +} +} + +\details{ +The value \code{NULL} can be passed to \code{gen} if genera are not of interest in the dataset. +} + +\value{ +A table giving the number of genera and species in the dataset; giving the minimum, maximum, mean and median number of individuals per species, and the number of species below the given threshold. +} + +\author{ +Rupert Collins +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\examples{ +data(anoteropsis) +#Species vector +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) +#Genus vector +anoGen <- sapply(strsplit(anoSpp, split="_"), function(x) x[1]) +dataStat(anoSpp, anoGen) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} +\keyword{Utilities} \ No newline at end of file diff --git a/man/dolomedes.Rd b/man/dolomedes.Rd new file mode 100644 index 0000000..3c5b0bd --- /dev/null +++ b/man/dolomedes.Rd @@ -0,0 +1,15 @@ +\name{dolomedes} +\docType{data} +\alias{dolomedes} +\title{Cytochrome oxidase I (COI) sequences of New Zealand _Dolomedes_ species} + +\description{A set of 37 sequences of the mitochondrial protein-coding gene cytochrome oxidase I from the 4 New Zealand species of the nursery-web spider genus \emph{Dolomedes} (Pisauridae). These sequences are available on GenBank as accession numbers GQ337328 through GQ337385.} + +\usage{dolomedes} + +\format{A DNAbin object containing 37 sequences with a length of 850 base pairs stored as a matrix.} + +\source{ +Vink, C. J., and Duperre, N. (2010). Pisauridae (Arachnida: Araneae). _Fauna of New Zealand_ *64* 1-54.} + +\keyword{Datasets} \ No newline at end of file diff --git a/man/haploAccum.Rd b/man/haploAccum.Rd new file mode 100644 index 0000000..44f6dfb --- /dev/null +++ b/man/haploAccum.Rd @@ -0,0 +1,55 @@ +\name{haploAccum} +\alias{haploAccum} + +\title{Haplotype accumulation curves} + +\description{ +\code{haploAccum} identifies the different haplotypes represented in a set of DNA sequences and performs the calculations for plotting haplotype accumulations curves (see \code{\link{plot.haploAccum}}). +} + +\usage{haploAccum(DNAbin, method = "random", permutations = 100, ...) +} + +\arguments{ + \item{DNAbin}{A set of DNA sequences in an object of class `DNAbin'.} + \item{method}{Method for haplotype accumulation. Method \code{"collector"} enters the sequences in the order that they appear in the sequence alignment and \code{"random"} adds the sequences in a random order.} + \item{permutations}{Number of permutations for method \code{"random"}.} + \item{...}{Other parameters to functions.} +} + +\details{ +Haplotype accumulation curves can be used to assess haplotype diversity in an area or compare different populations, or to evaluate sampling effort. \code{``random''} calculates the mean accumulated number of haplotypes and its standard deviation through random permutations (subsampling of sequences), similar to the method to produce rarefaction curves (Gotelli and Colwell 2001). +} + +\value{ +An object of class `haploAccum' with items: + \item{call}{Function call.} + \item{method}{Method for accumulation.} + \item{sequences}{Number of analysed sequences.} + \item{n.haplotypes}{Accumulated number of haplotypes corresponding to each number of sequences.} + \item{sd}{The standard deviation of the haplotype accumulation curve. Estimated through permutations for \code{method = "random"} and \code{NULL} for \code{method = "collector"}.} + \item{perm}{Results of the permutations for \code{method = "random"}.} +} + +\references{ + Gotellli, N.J. & Colwell, R.K. (2001). Quantifying biodiversity: procedures and pitfalls in measurement and comparison of species richness. _Ecology Letters_ *4*, 379--391. +} + +\author{ +Jagoba Malumbres-Olarte . +} + +\note{ +This function is based on the functions \code{haplotype} (E. Paradis) from the package 'pegas' and \code{specaccum} (R. Kindt) from the package'vegan'. Missing or ambiguous data will be detected and indicated by a warning, as they may cause an overestimation of the number of haplotypes. +} + +\examples{ +data(dolomedes) +#Generate multiple haplotypes +doloHaplo <- dolomedes[sample(37, size = 200, replace = TRUE), ] +dolocurv <- haploAccum(doloHaplo, method = "random", permutations = 100) +dolocurv +plot(dolocurv) +} + +\keyword{Sampling} \ No newline at end of file diff --git a/man/heatmapSpp.Rd b/man/heatmapSpp.Rd new file mode 100644 index 0000000..97856f9 --- /dev/null +++ b/man/heatmapSpp.Rd @@ -0,0 +1,49 @@ +\name{heatmapSpp} +\alias{heatmapSpp} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Visualise a distance matrix using a heatmap} + +\description{This function plots a heatmap of the distance matrix, with shorter distances indicated by darker colours.} + +\usage{heatmapSpp(distObj, sppVector, col = NULL, axisLabels = NULL)} + + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{distObj}{A matrix or object of class \code{dist}.} + \item{sppVector}{The species vector. See \code{\link{sppVector}}.} + \item{col}{A vector giving the colours for the heatmap.} + \item{axisLabels}{A character vector that provides the axis labels for the heatmap. By default the species vector is used.} +} + +\details{ + +The default palette has been taken from the \code{colorspace} package. +} + +\value{ +Plots a heatmap of the distance matrix. Darker colours indicate shorter distances, lighter colours indicate greater distances. +} + +\author{ +Samuel Brown +} + +\examples{ +data(dolomedes) +doloDist <- dist.dna(dolomedes, model = "raw") +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) +heatmapSpp(doloDist, doloSpp) +heatmapSpp(doloDist, doloSpp, axisLabels = dimnames(dolomedes)[[1]]) + +data(anoteropsis) +anoDist <- dist.dna(anoteropsis, model = "raw") +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) +heatmapSpp(anoDist, anoSpp) + +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Utilities} diff --git a/man/is.ambig.Rd b/man/is.ambig.Rd new file mode 100644 index 0000000..9529de0 --- /dev/null +++ b/man/is.ambig.Rd @@ -0,0 +1,42 @@ +\name{is.ambig} +\alias{is.ambig} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Missing bases in alignments +} + +\description{ +Checks what columns in an alignment have ambiguous bases or missing data. +} + +\usage{ +is.ambig(DNAbin) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{ +A DNA alignment of class `DNAbin'. +} +} +\details{ +Ambiguous bases are bases that have been coded with any of the Union of Pure and Applied Chemistry (IUPAC) DNA codes that are not A, C, G, or T. Missing data are bases that have been coded with "-", "?" or "N". +} +\value{ +A logical vector containing TRUE if ambiguous bases or missing data are present, FALSE if not. Does not differentiate between the two classes of data. +} + +\author{Samuel Brown } + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{checkDNA}} +} +\examples{ +data(woodmouse) +is.ambig(woodmouse) +#Columns with ambiguous bases +which(is.ambig(woodmouse)) +} + +\keyword{Utilities} \ No newline at end of file diff --git a/man/localMinima.Rd b/man/localMinima.Rd new file mode 100644 index 0000000..d28f7ce --- /dev/null +++ b/man/localMinima.Rd @@ -0,0 +1,52 @@ +\name{localMinima} +\alias{localMinima} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Determine thresholds from a density plot} + +\description{This function determines possible thresholds from the distance matrix for an alignment.} + +\usage{ +localMinima(distobj) +} + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{distobj}{A distance object (usually from \code{\link{dist.dna}}).} +} + +\details{ +This function is based on the concept of the barcoding gap, where a dip in the density of genetic distances indicates the transition between intra- and inter-specific distances. Understanding your data is vital to correctly interpreting the output of this function, but as a start, the first local minimum is often a good place to start. + +The value of this function is that it does not require prior knowledge of species identity to get an indication of potential threshold values. +} + +\value{ +An object of class `density', which is a list containing the values calculated by \code{\link{density}}. The element \code{localMinima} has been added, which contains the values of the local minima of the density plot. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{dist.dna}}, \code{\link{density}}. +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} + +\examples{ + +data(anoteropsis) +anoDist <- dist.dna(anoteropsis) + +anoThresh <- localMinima(anoDist) +plot(anoThresh) +anoThresh$localMinima +#Often the first value is the one to go for: +anoThresh$localMinima[1] +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} diff --git a/man/monophyly.Rd b/man/monophyly.Rd new file mode 100644 index 0000000..1b4813c --- /dev/null +++ b/man/monophyly.Rd @@ -0,0 +1,104 @@ +\name{monophyly} +\alias{monophyly} +\alias{monophylyBoot} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Species monophyly over a tree +} + +\description{ +Determines if the species given in \code{sppVector} form monophyletic groups on a given tree. +} + +\usage{ +monophyly(phy, sppVector, pp = NA, singletonsMono = TRUE) +monophylyBoot(phy, sppVector, DNAbin, thresh = 0.7, reroot=TRUE, + pp = NA, singletonsMono = TRUE, reps = 1000, block = 3) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{phy}{ +A tree of class `phylo'. +} + \item{sppVector}{ + Species vector. See \code{\link{sppVector}}}. + \item{pp}{ +Object of class `prop.part'. Assists in speeding up the function, if it has been called already. Default of NA, calling \code{\link{prop.part}} internally. +} + \item{singletonsMono}{ +Logical. Should singletons (i.e. only a single specimen representing that species) be treated as monophyletic? Default of TRUE. Possible values of FALSE and NA. +} + \item{DNAbin}{ +An object of class 'DNAbin'. Required for calculating bootstrap values. +} + \item{thresh}{Numeric between 0 and 1. Bootstrap threshold under which potentially monophyletic species are negated. Default of 0.7. +} + \item{reroot}{Logical. Should the bootstrap replicates be rerooted on the longest edge? Default of TRUE. +} + \item{reps}{Numeric. Number of bootstrap replications. Default of 1000. +} + \item{block}{The number of nucleotides that will be resampled together. Default of 3 to resample on the codon level. +} +} +\details{ +\code{monophyly} determines if each species is monophyletic. \code{monophylyBoot} incorporates a bootstrap test to determine the support for this monophyly. Species with a bootstrap support lower than \code{"thresh"} are recorded as FALSE. + +Rerooting is done on the longest internal edge in the tree returned by \code{nj(dist.dna(DNAbin))}. +} +\value{ +\code{monophyly} returns a logical vector, stating if each species is monophyletic. Values correspond to the species order given by \code{unique(sppVector)}. + +\code{monophylyBoot} returns a list with the following elements: + \item{results}{A logical vector, stating if each species is monophyletic with a bootstrap support higher than the given threshold.} + \item{BSvalues}{A numeric vector giving the bootstrap proportions for each node of \code{phy}.} +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{prop.part}}, \code{\link{root}}, \code{\link{boot.phylo}}. +} +\examples{ +#Random trees +set.seed(16) +tr <- rtree(15) +spp <- rep(LETTERS[1:5], rep(3,5)) +monophyly(tr, spp) + +tr2 <- tr +spp2 <- c(rep(LETTERS[1:4], rep(3,4)), LETTERS[5:7]) +monophyly(tr2, spp2) + +#Empirical data +\dontrun{ +data(anoteropsis) +anoTree <- nj(dist.dna(anoteropsis)) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) + +monophyly(anoTree, anoSpp) +monophyly(anoTree, anoSpp, singletonsMono=FALSE) +unique(anoSpp) + +#To get score for each individual +anoMono <- monophyly(anoTree, anoSpp) +anoMono[match(anoSpp, unique(anoSpp))] + +data(woodmouse) +woodTree <- nj(dist.dna(woodmouse)) +woodSpp <- c("D", "C", "C", "A", "A", "E", "A", "F", "C", "F", "E", "D", "A", "A", "E") +unique(woodSpp) +monophyly(woodTree, woodSpp) +woodMono <- monophylyBoot(woodTree, woodSpp, woodmouse) +woodMono$results +woodMono$BSvalues + +monophylyBoot(woodTree, woodSpp, woodmouse, reroot = FALSE) +monophylyBoot(woodTree, woodSpp, woodmouse, thresh = 0.9, reroot = FALSE) +} + +} diff --git a/man/nearNeighbour.Rd b/man/nearNeighbour.Rd new file mode 100644 index 0000000..cdeeeef --- /dev/null +++ b/man/nearNeighbour.Rd @@ -0,0 +1,89 @@ +\name{nearNeighbour} +\alias{bestCloseMatch} +\alias{nearNeighbour} +\alias{threshID} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Measures of identification accuracy} + +\description{Tests of barcoding efficacy using distance-based methods.} + +\usage{ +bestCloseMatch(distobj, sppVector, threshold = 0.01) +nearNeighbour(distobj, sppVector, names = FALSE) +threshID(distobj, sppVector, threshold = 0.01)} + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{distobj}{A distance object (usually from \code{\link{dist.dna}}).} + \item{sppVector}{Vector of species names. See \code{\link{sppVector}}.} + \item{threshold}{Distance cutoff for identifications. Default of 0.01 (1\%).} + \item{names}{Logical. Should the names of the nearest match be shown? Default of FALSE.} +} + +\details{ +These functions test barcoding efficacy and are not identification tools. All sequences must be identified prior to testing. Each sequence is considered an unknown while the remaining sequences in the dataset constitute the DNA barcoding database that is used for identification. If the identification from the test is the same as the pre-considered identification, a correct result is returned. + +\code{bestCloseMatch} conducts the "best close match" analysis of Meier et al. (2006), considering the closest individual unless it is further than the given threshold, which results in no identification. More than one species tied for closest match results in an assignment of "ambiguous". When the threshold is large, this analysis will return essentially the same result as \code{nearNeighbour}. + +\code{nearNeighbour} finds the closest individual and returns if their names are the same (TRUE) or different (FALSE). If \code{names = TRUE}, the name of the closest individual is returned. Ties are decided by majority rule. + +\code{threshID} conducts a threshold-based analysis, similar to that conducted by the "Identify Specimen" tool provided by the Barcode of Life Database (\url{http://www.boldsystems.org/views/idrequest.php}). It is more inclusive than \code{bestCloseMatch}, considering ALL sequences within the given threshold. +} + +\value{ +\code{bestCloseMatch} and \code{threshID} return a character vector giving the identification status of each individual. + \item{"correct"}{The name of the closest match is the same} + \item{"incorrect"}{The name of the closest match is different} + \item{"ambiguous"}{More than one species is the closest match (\code{bestCloseMatch}), or is within the given threshold (\code{threshID})} + \item{"no id"}{No species are within the threshold distance} + +\code{nearNeighbour} returns a logical vector or (if \code{names = TRUE}) the name for the nearest individual. +} + +\references{ +Meier, R., Shiyang, K., Vaidya, G., & Ng, P. (2006). DNA barcoding and taxonomy in Diptera: a tale of high intraspecific variability and low identification success. _Systematic Biology_ +*55* (5) 715-728. + +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{dist.dna}}, +\code{\link{sppVector}} +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} + +\examples{ + +data(anoteropsis) +anoDist <- dist.dna(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split = "_"), + function(x) paste(x[1], x[2], sep = "_")) + +bestCloseMatch(anoDist, anoSpp) +bestCloseMatch(anoDist, anoSpp, threshold = 0.005) +nearNeighbour(anoDist, anoSpp) +nearNeighbour(anoDist, anoSpp, names = TRUE) +threshID(anoDist, anoSpp) +threshID(anoDist, anoSpp, threshold = 0.003) + +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +bestCloseMatch(doloDist, doloSpp) +bestCloseMatch(doloDist, doloSpp, threshold = 0.005) +nearNeighbour(doloDist, doloSpp) +nearNeighbour(doloDist, doloSpp, names=TRUE) +threshID(doloDist, doloSpp) +threshID(doloDist, doloSpp, threshold = 0.003) +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} diff --git a/man/nonConDist.Rd b/man/nonConDist.Rd new file mode 100644 index 0000000..48f0da1 --- /dev/null +++ b/man/nonConDist.Rd @@ -0,0 +1,80 @@ +\name{nonConDist} +\alias{nonConDist} +\alias{maxInDist} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Nearest non-conspecific and maximum intra-specific distances +} +\description{ +These functions give the distances to the nearest non-conspecific and furthest conspecific representatives for each individual in the dataset. +} +\usage{ +nonConDist(distobj, sppVector, propZero = FALSE, rmNA = FALSE) +maxInDist(distobj, sppVector, propZero = FALSE, rmNA = FALSE) +} +%- maybe also 'usage' for other objects documented here. + +\arguments{ + \item{distobj}{ +Distance matrix. +} + \item{sppVector}{ +Species vector (see \code{\link{sppVector}}). Default of NULL. +} + \item{propZero}{ +Logical. TRUE gives the proportion of zero distances. +} + \item{rmNA}{ +Logical. TRUE ignores missing values in the distance matrix. Default of FALSE +} +} +\details{ +\code{nonConDist} returns the minimum inter-specific distance for each individual. + +\code{maxInDist} returns the maximum intra-specific distance for each individual. + +These two functions can be used to create a version of the barcoding gap. +} + +\value{ +If \code{propZero=FALSE}, a numeric vector giving the distance of the closest non-conspecific individual (\code{nonConDist}) or the most distant conspecific individual (\code{maxInDist}). + +If \code{propZero=TRUE}, a single number giving the proportion of zero distances. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\examples{ +data(anoteropsis) +anoDist <- dist.dna(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) + +nonConDist(anoDist, anoSpp) +nonConDist(anoDist, anoSpp, propZero=TRUE) + +maxInDist(anoDist, anoSpp) +maxInDist(anoDist, anoSpp, propZero=TRUE) + +#Barcoding gap +inter <- nonConDist(anoDist, anoSpp) +intra <- maxInDist(anoDist, anoSpp) +hist(inter-intra) + +#An alternative way of plotting the gap +bnd <- cbind(data.frame(inter, intra)) +ord <- bnd[order(bnd$inter),] +plot(ord$inter, type="n", ylab="Percent K2P distance", xlab="Individual") +segCol <- rep("gray50", length(ord$inter)) +segCol[ord$inter-ord$intra < 0] <- "red" +segments(x0=1:length(ord$inter), y0=ord$inter, y1=ord$intra, col=segCol, lwd=6) + + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} diff --git a/man/nucDiag.Rd b/man/nucDiag.Rd new file mode 100644 index 0000000..e0ced84 --- /dev/null +++ b/man/nucDiag.Rd @@ -0,0 +1,57 @@ +\name{nucDiag} +\alias{nucDiag} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Nucleotide diagnostics for species alignments +} +\description{ +Determines the diagnostic nucleotides for each species given in \code{sppVector}. +} + +\usage{ +nucDiag(DNAbin, sppVector) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{ +An object of class 'DNAbin'. +} + \item{sppVector}{ +The species vector (see \code{\link{sppVector}}). +} +} + + +\value{ +A list giving the pure, simple diagnostic nucleotides (i.e. those nucleotides that are fixed within species and different from all other species) for each species in the species vector. A result of \code{integer(0)} indicates there are no diagnostic nucleotides for those species. +} +\references{ +Sarkar, I., Planet, P., & DeSalle, R. (2008). CAOS software for use in character- +based DNA barcoding. _Molecular Ecology Resources_ *8* 1256-1259 +} +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ + \code{\link{slideNucDiag}} +} +\examples{ +data(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) + +nucDiag(anoteropsis, anoSpp) + +#To view the nucleotide values +anoNuc <- nucDiag(anoteropsis, anoSpp) +as.character(anoteropsis[ ,anoNuc[[1]][1] ]) + + + +data(sarkar) +sarkarSpp <- substr(dimnames(sarkar)[[1]], 1, 3) +nucDiag(sarkar, sarkarSpp) +} diff --git a/man/ordinDNA.Rd b/man/ordinDNA.Rd new file mode 100644 index 0000000..c18d9ca --- /dev/null +++ b/man/ordinDNA.Rd @@ -0,0 +1,56 @@ +\name{ordinDNA} +\alias{ordinDNA} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Calculates a Principal Components Ordination of genetic distances} + +\description{Calculates Principical Coonrdinates Analysis on a matrix of genetic distances and plots an ordination of the first two major axes.} + +\usage{ +ordinDNA(distobj, sppVector, ...) +} + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{distobj}{A distance matrix.} + \item{sppVector}{The species vector (see \code{\link{sppVector}}).} + \item{...}{Other arguments to be passed to \code{\link{plot.ordinDNA}}.} +} + +\details{ +This function is a wrapper for \code{\link{cmdscale}}, which performs a Principal Coordinates Analysis on the distance matrix given. In addition, it plots an ordination of the genetic distance matrix given, showing the relative distance between each of the species in the dataset. It is presented as an alternative to the neighbour-joining trees which are frequently used for the visualisation of DNA barcoding data. NJ trees show hypotheses of relationships, which are inappropriate for the questions usally asked in DNA barcoding studies. + +The distance between the centroids of the clusters are roughly proportional to the genetic distances between the species. NOTE: it is important to remember that the plot shows only one plane of a multi-dimensional space. Species with overlapping circles are not necessarily conspecific. Further exploration is required. +} + +\value{ +Plots an ordination of the first two major axes showing the positions of each individual (squares), the centroid of each species (circular bullet and name of species), and the variation in the species (large circle, the radius of which is the distance to the furthest individual from the centroid). + +Additionally returns a list of class \code{"ordinDNA"} with the following elements: +\item{pco}{Output of the Principal Coordinates Analysis.} +\item{sppVector}{Character vector giving the species vector.} + +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{cmdscale}}, \code{\link{plot.ordinDNA}} +} + +\examples{ + +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +doloOrd <- ordinDNA(doloDist, doloSpp) +doloOrd +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} diff --git a/man/paa.Rd b/man/paa.Rd new file mode 100644 index 0000000..04c987d --- /dev/null +++ b/man/paa.Rd @@ -0,0 +1,54 @@ +\name{paa} +\alias{paa} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Population Aggregate Analysis} + +\description{ +Conducts population aggregate analysis over a matrix of characters of interest. +} + +\usage{paa(data, sppVector)} +%- maybe also 'usage' for other objects documented here. + +\arguments{ + \item{data}{A data matrix with columns as characters and rows as individuals.} + \item{sppVector}{The species vector. See \code{\link{sppVector}}.} +} +\details{ +When used on DNA sequences, the function treats gaps as seperate characters. +} +\value{A matrix with species as rows and characters as columns. Cells give the character state of each species if fixed, or "poly" if the character is polymorphic.} + +\references{ +Sites, J. W. J., & Marshall, J. C. (2003). Delimiting species: a Renaissance +issue in systematic biology. _Trends in Ecology and Evolution_ *18* (9), 462-470. +} + +\author{Samuel Brown } + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\examples{ +#Create some exemplar data +u <- sample(c(0,1), 16, replace=TRUE) +v <- rep(c(0,1), rep(8,2)) +x <- rep(c(1,0), rep(8,2)) +y <- sample(c(0,1), 16, replace=TRUE) +z <- rep(c(1,0), rep(8,2)) + +dat <- cbind(u,v,x,y,z) +popn <- rep(c("A","B", "C", "D"), rep(4,4)) + +paa(dat, popn) + +#Use on DNA sequences +data(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) + +paa(as.character(anoteropsis), anoSpp) +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Morphology} diff --git a/man/plot.haploAccum.Rd b/man/plot.haploAccum.Rd new file mode 100644 index 0000000..31227e6 --- /dev/null +++ b/man/plot.haploAccum.Rd @@ -0,0 +1,55 @@ +\name{plot.haploAccum} +\alias{plot.haploAccum} + +\title{Plotting haplotype accumulation curves} + +\description{ +Plots the accumulation curves calculated by \code{\link{haploAccum}}. +} + + +\usage{ +\method{plot}{haploAccum}(x, add = FALSE, ci = 2, ci.type = c("bar","line","polygon"), + col = par("fg"), ci.col = col, ci.lty = 1, xlab, ylab = "Haplotypes", ylim, + main = paste(x$method, "method of haplotype accumulation", sep=" "), ...) +} + +\arguments{ + \item{x}{A `haploAccum' object obtained from \code{\link{haploAccum}}.} + \item{add}{Add graph to an existing graph.} + \item{ci}{Multiplier for the calculation of confidence intervals from standard deviation. \code{ci = 0} prevents the drawing of confidence intervals.} + \item{ci.type}{Type of confidence intervals: \code{"bar"} for vertical bars, \code{"line"} for lines, and \code{"polygon"} for a shaded area.} + \item{col}{Colour for curve line.} + \item{ci.col}{Colour for lines or shaded area when \code{"polygon"}.} + \item{ci.lty}{Line type for confidence interval lines or border of the \code{"polygon"}.} + \item{xlab}{Label for the X-axis.} + \item{ylab}{Label for the Y-axis.} + \item{ylim}{Y-axis limits.} + \item{main}{Title of the plot.} + \item{...}{Other parameters to pass to plot.} +} + +\value{ +Plots a haplotype accumulation curve and confidence intervals depending on the options given to \code{\link{haploAccum}}. +} + +\references{ + Gotellli, N.J. & Colwell, R.K. (2001). Quantifying biodiversity: procedures and pitfalls in measurement and comparison of species richness. _Ecology Letters_ *4* 379--391. +} + +\author{ +Jagoba Malumbres-Olarte . +} + +\examples{ +data(dolomedes) +#Generate multiple haplotypes +doloHaplo <- dolomedes[sample(37, size = 200, replace = TRUE), ] +dolocurv <- haploAccum(doloHaplo, method = "random", permutations = 100) + +plot(dolocurv) +plot(dolocurv, add = FALSE, ci = 2, ci.type = "polygon", col = "blue", ci.col = "red", + ci.lty = 1) +} + +\keyword{Sampling} \ No newline at end of file diff --git a/man/plot.ordinDNA.Rd b/man/plot.ordinDNA.Rd new file mode 100644 index 0000000..cde72a0 --- /dev/null +++ b/man/plot.ordinDNA.Rd @@ -0,0 +1,83 @@ +\name{plot.ordinDNA} +\alias{plot.ordinDNA} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Plot an 'ordinDNA' object +} +\description{ +Plots an ordination of the Principal Components Analysis conducted by \code{\link{ordinDNA}}. +} +\usage{ +\method{plot}{ordinDNA}(x, majorAxes = c(1,2), plotCol = "default", trans = "CC", + textcex = 0.7, pchCentroid = FALSE, sppBounds = "net", sppNames = TRUE, + namePos = "top", ptPch = 21, ptCex = 0.5, netWd = 1, ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{x}{An object of class `ordinDNA'.} + \item{majorAxes}{Numeric. Gives the numbers of the major axes that should be plotted. Default of the first two major axes (\code{majorAxes = c(1,2)})} + \item{plotCol}{A vector of RGB colours giving the colours of the points and circles. Must be in the form of a character vector with elements "#XXXXXX" where XXXXXX gives the hexadecimal value for the colours desired. Default of \code{"default"}. Colours are recycled if necessary.} + \item{trans}{A character vector giving the hexadecimal value for the transparency of the circles. Default of "CC".} + \item{textcex}{Numeric. Controls the size of the text giving the species value of the circles.} + \item{pchCentroid}{Numeric. Controls the shape of the point showing the centroid of the circle for each species. Default of FALSE, no plotting of centroid position.} + \item{sppBounds}{Option to determine the method of visualising conspecific points. Options of \code{"net"} (the default), \code{"none"} or \code{"circles"}.} + \item{sppNames}{Logical. Should species names be plotted? Default of TRUE.} + \item{namePos}{Character vector of length 1 giving the position where the species names should be plotted. Possible values are: "top" and "bottom", anything else plots the names at the centroid.} + \item{ptPch}{Numeric. Number of the symbol to be used for plotting. see \code{\link{points}}. Default of 21.} + \item{ptCex}{Numeric. Number governing the size of the points. Default of 0.5.} + \item{netWd}{Numeric. Number governing the width of the lines in the netowk. Default of 1.} + + \item{...}{Other arguments to be passed to \code{plot}.} +} + +\details{ +\code{plot.ordinDNA} calculates the centroid and radius of the most variable individual for each species in the multivariate space of the Principal Components Analysis object given. + +\code{majorAxes} plots the axes in the form \code{c(x, y)}. The maximum number of axes calculated is the number of specimens in the dataset minus one. + +\code{sppBounds} has the following options: \code{"net"} (the default) creates a complete graph between all individuals within a species. If \code{"circles"} is specified, a circle is drawn with a center fixed on the centroid, and a radius of the length to the maximally distant individual. Selecting the option of \code{"none"} means the individuals are not connected in any way. +} + +\value{ +Plots an ordination of the first two major axes showing the positions of each individual (squares), the centroid of each species (circular bullet and name of species), and the variation in the species (large circle, the radius of which is the distance to the furthest individual from the centroid). + +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{ordinDNA}}, \code{\link{cgraph}}. +} + +\examples{ +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +doloOrd <- ordinDNA(doloDist, doloSpp) + +plot(doloOrd) +plot(doloOrd, majorAxes = c(1,3)) +plot(doloOrd, textcex = 0.001) +plot(doloOrd, plotCol = c("#FF0000", "#00FF00", "#0000FF")) +plot(doloOrd, namesPos = "bottom") +plot(doloOrd, namesPos = "centre") + +data(anoteropsis) +anoDist <- dist.dna(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) + +anoOrd <- ordinDNA(anoDist, anoSpp) + +plot(anoOrd, sppBounds = "circles") + + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Visualisation} diff --git a/man/plot.slidWin.Rd b/man/plot.slidWin.Rd new file mode 100644 index 0000000..dcc98b5 --- /dev/null +++ b/man/plot.slidWin.Rd @@ -0,0 +1,67 @@ +\name{plot.slidWin} +\alias{plot.slidWin} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Plot a 'slidWin' object +} +\description{ +Graphical representation of the summary statistics derived from \code{\link{slideAnalyses}} and \code{\link{slideBoxplots}} +} +\usage{ +\method{plot}{slidWin}(x, outliers = FALSE, ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{x}{ +An object of class `slidWin'. +} + \item{outliers}{ +Logical. When the results of \code{\link{slideBoxplots}} are being called, should the outliers be plotted? Default of FALSE. +} +\item{...}{ +Other arguments to be passed to \code{plot}. +} +} +\details{ +When boxplots of methods \code{nonCon} and \code{interAll}, the y-axis limits are constrained to the midpoint of the range covered by the boxplots, so that the intra-specific variation can be seen. +} +\value{ +Plots graphs depending on the options given to \code{\link{slideAnalyses}} or \code{\link{slideBoxplots}}. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{slideAnalyses}}, +\code{\link{slideBoxplots}}. +} +\examples{ +data(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +doloSlide <- slideAnalyses(dolomedes, doloSpp, 200, interval=10, treeMeasures=TRUE) + +plot(doloSlide) + +doloBox <- slideBoxplots(dolomedes, doloSpp, 200, interval=10, method="overall") + +plot(doloBox) + + +data(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) + +anoBox <- slideBoxplots(anoteropsis, anoSpp, 200, interval=10, method="interAll") + +plot(anoBox) +plot(anoBox, outliers=TRUE) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Sliding window} diff --git a/man/polyBalance.Rd b/man/polyBalance.Rd new file mode 100644 index 0000000..57fff6d --- /dev/null +++ b/man/polyBalance.Rd @@ -0,0 +1,43 @@ +\name{polyBalance} +\alias{polyBalance} + +\title{Balance of a phylogenetic tree with polytomies} + +\description{This function computes the numbers of descendants for each dichotomous branch of a phylogenetic tree.} + +\usage{polyBalance(phy)} + +\arguments{ + \item{phy}{A tree of class `phylo'.} +} + +\details{ +The function extends \code{\link{balance}} to allow the balance of a tree with polytomies to be calculated. When the tree is fully dichotomous, the result is identical to \code{\link{balance}}. + +} + +\value{ +A numeric matrix with two columns and one row for each node of the tree. The columns give the numbers of descendants on each node. Non-dichotomous nodes are reported as 'NA'. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{balance}}. +} + +\examples{ + set.seed(55) + tr <- rtree(15) + tr2 <- di2multi(tr, tol=0.02) + polyBalance(tr) + polyBalance(tr2) +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Utilities} diff --git a/man/rankSlidWin.Rd b/man/rankSlidWin.Rd new file mode 100644 index 0000000..0b125be --- /dev/null +++ b/man/rankSlidWin.Rd @@ -0,0 +1,75 @@ +\name{rankSlidWin} +\alias{rankSlidWin} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Rank a 'slidWin' object. +} +\description{ +Display the highest ranking windows measured by \code{\link{slideAnalyses}}. +} +\usage{ +rankSlidWin(slidWin, criteria="mean_distance", num = 10) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{slidWin}{ +An object of class `slidWin', made using \code{\link{slideAnalyses}}. +} + \item{criteria}{ +Name of criteria to sort by. Can be any of the following: \code{"mean_distance", "monophyly", "clade_comparison", "clade_comp_shallow", "zero_noncon", "zero_distances", "diag_nuc"} or \code{"all"}. Default of \code{"mean_distance"} if distance measures have been calculated, otherwise \code{"monophyly"}. +} + \item{num}{ +Number of windows to return. Default of 10. +} +} +\details{ +The criteria for \code{rankSlidWin} correspond to the variables outputted by \code{\link{slideAnalyses}} and are sorted in the following manner: +\tabular{lll}{ +\code{rankSlidWin} criterion: \tab \code{\link{slideAnalyses}} output:\tab Sorting method:\cr +\code{"mean_distance"} \tab \code{"dist_mean_out"} \tab Ascending\cr +\code{"monophyly"} \tab \code{"win_mono_out"} \tab Ascending\cr +\code{"clade_comparison"} \tab \code{"comp_out"} \tab Ascending\cr +\code{"clade_comp_shallow"} \tab \code{"comp_depth_out"} \tab Ascending\cr +\code{"zero_noncon"} \tab \code{"noncon_out"} \tab Descending\cr +\code{"zero_distances"} \tab \code{"zero_out"} \tab Descending\cr +\code{"diag_nuc"} \tab \code{"nd_out"} \tab Ascending\cr +} + +Given a sequence of 1:10, the ascending method of sorting considers 10 as high. The descending method considers 1 as high. + +The \code{"all"} criterion returns the windows that have the highest cumulative total score over all criteria. +} + +\value{ +A data frame giving the values of the measures calculated by \code{\link{slideAnalyses}}, ranked to show the top 10 positions based on the criterion given. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{slideAnalyses}}. +} +\examples{ +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +doloSlide <- slideAnalyses(dolomedes, doloSpp, 200, interval = 10, treeMeasures = TRUE) + +rankSlidWin(doloSlide) +rankSlidWin(doloSlide, criteria = "zero_distances") + +doloSlide2 <- slideAnalyses(dolomedes, doloSpp, 200, interval = 10, treeMeasures = FALSE) +rankSlidWin(doloSlide2) + +doloSlide3 <- slideAnalyses(dolomedes, doloSpp, 200, interval = 10, distMeasures = FALSE, + treeMeasures = TRUE) +rankSlidWin(doloSlide3) +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Sliding window} diff --git a/man/read.BOLD.Rd b/man/read.BOLD.Rd new file mode 100644 index 0000000..b5865d3 --- /dev/null +++ b/man/read.BOLD.Rd @@ -0,0 +1,83 @@ +\name{read.BOLD} +\alias{read.BOLD} +\alias{search.BOLD} +\alias{stats.BOLD} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Downloads DNA sequences from the Barcode of Life Database (BOLD)} + +\description{These functions allow DNA sequences to be downloaded from the Barcode of Life Database (BOLD).} + +\usage{search.BOLD(taxon, exhaustive = FALSE) +stats.BOLD(taxon) +read.BOLD(IDs)} + + + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{taxon}{A character vector of the names of the taxa of interest.} + \item{exhaustive}{Logical. Should the function search for more than 500 process IDs? Default of FALSE.} + \item{IDs}{A character vector containing BOLD process ID numbers.} +} + +\details{\code{search.BOLD} retrieves BOLD process identification numbers for any given taxon using the API for BOLD version 3.0. By default, it only returns the first 500 process IDs for the given taxon. By selecting the option \code{exhaustive = TRUE}, the function can be made to search for more than 500 process IDs, but is much slower. + +\code{stats.BOLD} retrieves the total number of records for the given taxon. + +\code{read.BOLD} downloads the sequences associated with the process identification numbers using a brute force method of downloading the specimen record, then searching and splitting the HTML code to remove the relevant information. This process is likely to make the function fairly unstable if BOLD make any changes to their website. + +Previous versions of \code{read.BOLD} used the eFetch web service offered by BOLD to enable batch retrieval of records, however from October 2012 BOLD deprecated eFetch without providing a replacement service. + +} + +\value{ +\code{search.BOLD} returns a character vector giving the process identification numbers of the specimens found by the search. + +\code{read.BOLD} returns an object of class `DNAbin'. This object has the attributes "species", "accession_num", and "gene". +} + +\section{Warning}{ +On 26 Oct 2011, attempts to access records using the eFetch system through a web browser resulted in an error, saying that eFetch and eSearch are offline for maintainance. + +As of 7 March 2012, both functions have been modified to interface with the new BOLD architecture, and work as expected. + +29 Oct 2012: It appears that BOLD has taken eFetch offline permanently, rendering \code{read.BOLD} as it currently stands useless. While we may be able to work out something, this will require a complete rewrite of the function. \code{search.BOLD} continues to work as intended. + +17 Dec 2012: A new version of \code{read.BOLD} has been released that appears to work (for the time being). +} + +\references{ +BOLD web services: +\url{http://services.boldsystems.org/}. + +BOLD version 3.0 +\url{http://v3.boldsystems.org/}. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{read.GB}}. +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} + +\examples{ +\dontrun{ +stats.BOLD("Pisauridae") + +search.BOLD(c("Danio kyathit", "Dolomedes", "Sitona discoideus")) + +nn <- search.BOLD("Pisauridae") +pisaurid <- read.BOLD(nn) + +write.dna(pisaurid, "filename.fas", format="fasta")} +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} +\keyword{Datasets}% __ONLY ONE__ keyword per line diff --git a/man/read.GB.Rd b/man/read.GB.Rd new file mode 100644 index 0000000..ff4970d --- /dev/null +++ b/man/read.GB.Rd @@ -0,0 +1,66 @@ +\name{read.GB} +\alias{read.GB} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Download sequences from Genbank with metadata. +} +\description{ +Downloads sequences associated with the given accession numbers into a `DNAbin' class. +} +\usage{ +read.GB(access.nb, seq.names = access.nb, species.names = TRUE, gene = TRUE, + access = TRUE, as.character = FALSE) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{access.nb}{ +A character vector giving the GenBank accession numbers to download. +} + \item{seq.names}{ +A character vector giving the names to give to each sequence. Defaults to "accession number | species name". +} + \item{species.names}{ +Logical. Should species names be downloaded? Default of TRUE. +} + \item{gene}{ +Logical. Should the name of the gene region be downloaded? Default of TRUE. +} + \item{access}{ +Logical. Should the accession number be downloaded? Default of TRUE. +} + \item{as.character}{ +Logical. Should the sequences be returned as character vector? Default of FALSE, function returns sequences as a `DNAbin' object. +} +} +\details{ +This function is a modification of \code{\link[ape:ape-package]{read.GenBank}} to include metadata with each sequence. Additional data currently implemented are the species names and the gene region from which sequences were derived. +} +\value{ +A 'DNAbin' object with the following attributes: \code{"species"}, \code{"gene"}, and \code{"accession_num"}. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{read.GenBank}}. +} +\examples{ +\dontrun{ +read.GB("AY059961") + +#Download the sequences making data(anoteropsis) from Genbank +nums <- 59961:59993 +seqs <- paste("AY0", nums, sep="") +dat <- read.GB(seqs) + +attr(dat, "species") +attr(dat, "gene") +attr(dat, "accession_num")} +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Datasets} diff --git a/man/rmSingletons.Rd b/man/rmSingletons.Rd new file mode 100644 index 0000000..d657b5b --- /dev/null +++ b/man/rmSingletons.Rd @@ -0,0 +1,45 @@ +\name{rmSingletons} +\alias{rmSingletons} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Detect and remove singletons +} +\description{ +A utility to detect and remove species represented only by singletons. +} +\usage{ +rmSingletons(sppVector, exclude = TRUE) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{sppVector}{Vector of species names. (see \code{\link{sppVector}}).} + \item{exclude}{Logical. Should singletons be removed? Default of TRUE.} +} +\details{ +When \code{exclude = TRUE} (the default), singletons are excluded and the vector returns the index of all non-singletons in the dataset. When \code{exclude = FALSE}, the indices of the singletons are presented. +} +\value{ +Returns a numeric vector giving the indices of the selected individuals. +} + +\author{Samuel Brown } + +\examples{ +data(anoteropsis) +anoDist <- dist.dna(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) + +rmSingletons(anoSpp) +rmSingletons(anoSpp, exclude=FALSE) + +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +rmSingletons(doloSpp) +rmSingletons(doloSpp, exclude=FALSE) +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Utilities} diff --git a/man/rosenberg.Rd b/man/rosenberg.Rd new file mode 100644 index 0000000..4c4ec0c --- /dev/null +++ b/man/rosenberg.Rd @@ -0,0 +1,61 @@ +\name{rosenberg} +\alias{rosenberg} + +\title{Rosenberg's probability of reciprocal monophyly} + +\description{This function computes Rosenberg's probability of reciprocal monophyly for each dichotomous node of a phylogenetic tree.} + +\usage{rosenberg(phy)} + +\arguments{ + \item{phy}{A tree of class `phylo'.} +} + +\details{ +Because \code{ape} plots node labels in a different manner to the method in which they are stored, when plotting the node labels made by \code{rosenberg}, make sure the \code{node} argument is given as shown in the examples below. +} + +\value{ +A numeric vector with names giving the node numbers of \code{phy}. +} + +\references{ +Rosenberg, N. A. (2007). Statistical tests for taxonomic distinctiveness from observations of monophyly. _Evolution_ *61* (2), 317-323. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{nodelabels}}. +} + +\examples{ +data(anoteropsis) +anoTr <- nj(dist.dna(anoteropsis)) +anoLab <- rosenberg(anoTr) +plot(anoTr) +nodelabels(round(anoLab,3), node=as.numeric(names(anoLab))) + +data(dolomedes) +doloTr <- nj(dist.dna(dolomedes)) +doloRose <- rosenberg(doloTr) +plot(doloTr) +nodelabels(round(doloRose, 3)) + +#Colour circles for nodes with a probability < 0.005 +doloNodes <- doloRose < 0.005 +doloLabs <- doloRose +doloLabs[doloNodes] <- "blue" +doloLabs[!doloNodes] <- "red" +plot(doloTr, cex=0.7) +nodelabels(pch=21, bg=doloLabs, node=as.numeric(names(doloLabs)), cex=2) +legend(x=0.015, y=16.13, legend=c("significant", "not significant"), pch=21, + pt.bg=c("blue", "red"), bty="n", pt.cex=2) + +} + +\keyword{Sampling} \ No newline at end of file diff --git a/man/sarkar.Rd b/man/sarkar.Rd new file mode 100644 index 0000000..13c7550 --- /dev/null +++ b/man/sarkar.Rd @@ -0,0 +1,17 @@ +\name{sarkar} +\docType{data} +\alias{sarkar} +\title{Dummy sequences illustrating the categories of diagnostic nucleotides} + +\description{A set of 8 dummy sequences published in Sarkar et al 2008 to illustrate the different categories of diagnostic nucleotides.} + +\usage{sarkar} + +\format{A DNAbin object containing 8 sequences with a length of 18 base pairs stored as a matrix.} + +\source{ +Sarkar, I., Planet, P., & DeSalle, R. (2008). CAOS software for use in character- +based DNA barcoding. _Molecular Ecology Resources_ *8* 1256-1259 +} + +\keyword{Datasets} \ No newline at end of file diff --git a/man/seeBarcode.Rd b/man/seeBarcode.Rd new file mode 100644 index 0000000..d837507 --- /dev/null +++ b/man/seeBarcode.Rd @@ -0,0 +1,42 @@ +\name{seeBarcode} +\alias{seeBarcode} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Create illustrative barcodes} + +\description{This function plots an illustrative barcode consisting of vertical bands in four colours corresponding to the DNA bases adenine (A), cytosine (C), guanine (G) and thiamine (T).} + +\usage{seeBarcode(seq, col=c("green", "blue", "black", "red"))} + + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{seq}{A single sequence of class `DNAbin'.} + \item{col}{A character vector of length 4 giving colours to represent A, G, C and T respectively.} +} + +\details{ +Green, blue, black and red are the standard colours representing A, G, C and T respectively. +} + +\value{ +Plots an illustrative barcode. +} + +\author{ +Samuel Brown +} + +\examples{ +layout(matrix(1:6, ncol=1)) +par(mar=c(0.5, 0, 0.5, 0)) +data(woodmouse) +seeBarcode(woodmouse[1,]) +seeBarcode(woodmouse[1,], col=c("pink", "orange", "steelblue", "yellow")) +seeBarcode(woodmouse[1,], col=c("black", "white", "white", "black")) +apply(woodmouse[1:3,], MARGIN=1, FUN=seeBarcode) +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} +\keyword{Utilities} diff --git a/man/seqStat.Rd b/man/seqStat.Rd new file mode 100644 index 0000000..52c1af1 --- /dev/null +++ b/man/seqStat.Rd @@ -0,0 +1,45 @@ +\name{seqStat} +\alias{seqStat} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Sequence statistics +} +\description{ +Utility that produces a table giving summary statistics for a `DNAbin' object. +} +\usage{ +seqStat(DNAbin, thresh) +} + +\arguments{ + \item{DNAbin}{ +Alignment of class `DNAbin'. +} + \item{thresh}{ +Threshold sequence length. Default of 500 (minimum length for official DNA barcodes). +} +} + +\details{ +This function considers bases coded as '?', 'N' and '-' as missing data. +} + +\value{ +A table giving the minimum, maximum, mean and median sequence lengths, and the number of sequences with lengths below the given threshold. +} + +\author{ +Rupert Collins +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\examples{ +data(anoteropsis) +seqStat(anoteropsis) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} +\keyword{Utilities} \ No newline at end of file diff --git a/man/slideAnalyses.Rd b/man/slideAnalyses.Rd new file mode 100644 index 0000000..2640be1 --- /dev/null +++ b/man/slideAnalyses.Rd @@ -0,0 +1,91 @@ +\name{slideAnalyses} +\alias{slideAnalyses} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Sliding window analyses +} +\description{ +Wraps a number of measures used in sliding window analyses into one easy-to-use function. +} +\usage{ +slideAnalyses(DNAbin, sppVector, width, interval = 1, + distMeasures = TRUE, treeMeasures = FALSE) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{ +A DNA alignment of class `DNAbin'. +} + \item{sppVector}{ +Species vector (see \code{\link{sppVector}}). +} + \item{width}{ +Desired width of windows in number of nucleotides. +} + \item{interval}{ +Distance between each window in number of nucleotides. Default of 1. Giving the option of 'codons' sets the size to 3. +} + \item{distMeasures}{ +Logical. Should distance measures be calculated? Default of TRUE. +} + \item{treeMeasures}{ +Logical. Should tree-based measures be calculated? Default of FALSE. +} +} + +\details{ +Distance measures include the following: +proportion of zero non-conspecific distances, +number of diagnostic nucleotides, +number of zero-length distances, + and overall mean distance. + +Tree-based measures include the following: +proportion of species that are monophyletic, +proportion of clades that are identical between the neighbour joining tree calculated for the window and the tree calculated for the full dataset, + and the latter with \code{method="shallow"}. + +Tree-based measures are a lot more time-intensive than distance measures. When dealing with lots of taxa and short windows, this part of the function can take hours. + +Both distance and tree measures are calculated from a K2P distance matrix created from the data with the option \code{pairwise.deletion = TRUE}. When sequences with missing data are compared with other sequences, a \code{NA} distance results. These are ignored in the calculation of \code{slideAnalyses} distance metrics. However, the tree measures cannot cope with this missing data, and so no result is returned for windows where some sequences solely contain missing data. +} + +\value{ +An object of class 'slidWin' which is a list containing the following elements: +\item{win_mono_out}{Proportion of species that are monophyletic.} +\item{comp_out}{Proportion of clades that are identical between the NJ tree calculated for the window and the tree calculated for the full dataset.} +\item{comp_depth_out}{Proportion of shallow clades that are identical.} +\item{pos_tr_out}{Index of window position for tree-based analyses.} +\item{noncon_out}{Proportion of zero non-conspecific distances.} +\item{nd_out}{The sum of diagnostic nucleotides for each species.} +\item{zero_out}{The number of zero-length distances.} +\item{dist_mean_out}{Overall mean K2P distance of each window.} +\item{pos_out}{Index of window position.} +\item{dat_zero_out}{Number of zero inter-specific distances in the full dataset.} +\item{boxplot_out}{Always FALSE. Required for \code{\link{plot.slidWin}}.} +\item{distMeasures}{Value of argument. Required for \code{\link{plot.slidWin}}.} +\item{treeMeasures}{Value of argument. Required for \code{\link{plot.slidWin}}.} + +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{dist.dna}}, \code{\link{plot.slidWin}}, \code{\link{rankSlidWin}}, \code{\link{slideNucDiag}}. +} +\examples{ +\dontrun{ +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +slideAnalyses(dolomedes, doloSpp, 200, interval=10, treeMeasures=TRUE) +} +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Sliding window} diff --git a/man/slideBoxplots.Rd b/man/slideBoxplots.Rd new file mode 100644 index 0000000..469783e --- /dev/null +++ b/man/slideBoxplots.Rd @@ -0,0 +1,77 @@ +\name{slideBoxplots} +\alias{slideBoxplots} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Boxplots across windows +} +\description{ +Calculates boxplots of genetic distances using sliding windows. +} +\usage{ +slideBoxplots(DNAbin, sppVector, width, interval = 1, method = "nonCon") +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{ +A DNA alignment of class `DNAbin'. +} + \item{sppVector}{ +A species vector (see \code{\link{sppVector}}). +} + \item{width}{ +Width of windows. +} + \item{interval}{ +Distance between each window in number of base pairs. Default of 1. Giving the option of \code{"codons"} sets the size to 3. +} + \item{method}{ +Options of \code{"overall", "interAll"} or \code{"nonCon"} (the default). +} +} +\details{ +Giving \code{method="overall"} calculates the boxplot for the distance matrix of each window. + +Giving \code{method="interAll"} calculates boxplots for the inter- and intra-specific distances of each window, showing the result for ALL inter-specific distances. + +Giving \code{method="nonCon"} calculates boxplots for the inter- and intra-specific distances of each window, showing the result for only the nearest-conspecific distances for each individual. +} +\value{ +A list with + + \item{treeMeasures}{Logical. Tree measures calculated? Always FALSE.} + \item{distMeasures}{Logical. Distance measures calculated? Always FALSE.} + \item{bp_out}{If \code{method="overall"}, contains the boxplot objects of each window.} + \item{bp_InterSpp_out}{If \code{method!="overall"}, contains the boxplot objects of the interspecific distances of each window.} + \item{bp_IntraSpp_out}{If \code{method!="overall"}, contains the boxplot objects of the intraspecific distances of each window.} + \item{bp_range_out}{range of y-axis values.} + \item{pos_out}{x-axis values.} + \item{boxplot_out}{Logical. Boxplots calculated? Always TRUE.} + \item{method}{The method used for calculating boxplots. \code{"overall", "interAll"} or \code{"nonCon"}.} +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{boxplot}}, \code{\link{plot.slidWin}}, \code{\link{slideAnalyses}}, \code{\link{slidingWindow}}. +} +\examples{ +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +doloNonCon <- slideBoxplots(dolomedes, doloSpp, 200, interval=10) +plot(doloNonCon) + +doloOverall <- slideBoxplots(dolomedes, doloSpp, 200, interval=10, method="overall") +plot(doloOverall) + +doloInterall <- slideBoxplots(dolomedes, doloSpp, 200, interval=10, method="interAll") +plot(doloInterall) +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Sliding window} diff --git a/man/slideNucDiag.Rd b/man/slideNucDiag.Rd new file mode 100644 index 0000000..039a549 --- /dev/null +++ b/man/slideNucDiag.Rd @@ -0,0 +1,66 @@ +\name{slideNucDiag} +\alias{slideNucDiag} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Sliding nucleotide diagnostics +} +\description{ +Calculates the number of diagnostic nucleotides in sliding windows. +} +\usage{ +slideNucDiag(DNAbin, sppVector, width, interval = 1) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{ +A DNA alignment of class `DNAbin'. +} + \item{sppVector}{ +Species vector (see \code{\link{sppVector}}). +} + \item{width}{ +Desired width of windows in number of base pairs. +} + \item{interval}{ +Distance between each window in number of base pairs. Default of 1. Giving the option of \code{"codons"} sets the size to 3. +} +} + +\details{ +Determines the number of diagnostic nucleotides for each species in each window. +} + +\value{ +A matrix giving the number of diagnostic nucleotides for each species (rows) in each window (columns). +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{slideAnalyses}}, \code{\link{slideBoxplots}}, \code{\link{slidingWindow}}. +} +\examples{ +data(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +slideNucDiag(dolomedes, doloSpp, 200, interval = 3) + +slidND <- slideNucDiag(dolomedes, doloSpp, 200, interval = 3) + +#Number of basepairs for each species +matplot(t(slidND), type = "l") + +#Number of basepairs for a single species +plot(slidND[4, ], type = "l") + +#Total number of basepairs per window +plot(colSums(slidND), type = "l") + + +} + +\keyword{Sliding window} diff --git a/man/slidingWindow.Rd b/man/slidingWindow.Rd new file mode 100644 index 0000000..8cdef97 --- /dev/null +++ b/man/slidingWindow.Rd @@ -0,0 +1,63 @@ +\name{slidingWindow} +\alias{slidingWindow} +%- Also NEED an '\alias' for EACH other topic documented here. + +\title{Create windows along an alignment} + +\description{ +Creates windows of a specified width along a DNA alignment. +} + +\usage{ +slidingWindow(DNAbin, width, interval = 1) +} +%- maybe also 'usage' for other objects documented here. + +\arguments{ + \item{DNAbin}{A DNA alignment of class `DNAbin'.} + \item{width}{Width of each window.} + \item{interval}{Numeric or option of \code{"codons"}. This sets interval between windows. Default of 1. Setting the option to "codons" gives an interval of 3.} +} +\details{ +Sliding window analyses are often used to determine the variability along sequences. This can be useful for investigating whether there is evidence for recombination, developing shorter genetic markers, or for determining variation within a gene. + +Analyses can be conducted on each window using \code{\link{lapply}}. + +} + +\value{A list of `DNAbin' objects, with each alignment being \code{width} bases in length. The list has length of the DNA alignment minus the width. The positions covered by each window can be retreived with \code{attr(x, "window")}. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{lapply}}, +\code{\link{slideAnalyses}}, +\code{\link{slideBoxplots}}. +} + +\examples{ +data(woodmouse) +woodmouse <- woodmouse[,1:20] +win1 <- slidingWindow(woodmouse, width = 10) +length(win1) + +win2 <- slidingWindow(woodmouse, width = 10, interval = 2) +length(win2) + +win3 <- slidingWindow(woodmouse, width = 10, interval = "codons") +length(win3) + +win4 <- slidingWindow(woodmouse, width = 15) +length(win4) +attr(win4[[1]], "window") +attr(win4[[2]], "window") +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Sliding window} diff --git a/man/spider-package.Rd b/man/spider-package.Rd new file mode 100644 index 0000000..d9b6b99 --- /dev/null +++ b/man/spider-package.Rd @@ -0,0 +1,57 @@ +\name{spider-package} +\alias{spider-package} +\alias{spider} +\docType{package} +\title{ +Species Identity and Evolution in R +} +\description{ +Spider: SPecies IDentity and Evolution in R, is an R package implementing a number of useful analyses for DNA barcoding studies and associated research into species delimitation and speciation. Included are functions for generating summary statistics from DNA barcode data, assessing specimen identification efficacy, and for testing and optimising divergence threshold limits. In terms of investigating evolutionary and taxonomic questions, techniques for sliding window, population aggregate, and nucleotide diagnostic analyses are also provided. + +The complete list of functions can be displayed with \code{library(help=spider)}. + +More information, including a tutorial on the use of spider can be found at \code{http://spider.r-forge.r-project.org}. +} + +\details{ +\tabular{ll}{ +Package: \tab spider\cr +Type: \tab Package\cr +Version: \tab 1.3-0\cr +Date: \tab 2013-12-25\cr +License: \tab GPL\cr +LazyLoad: \tab yes\cr +} + +A few of the key functions provided by spider: + +DNA barcoding: +\code{\link{bestCloseMatch}}, \code{\link{nearNeighbour}}, \code{\link{threshID}}, \code{\link{threshOpt}}. + +Sliding window: +\code{\link{slidingWindow}}, \code{\link{slideAnalyses}}, \code{\link{slideBoxplots}}. + +Nucleotide diagnostics: +\code{\link{nucDiag}}. + +Morphological techniques: +\code{\link{paa}}. +} +\author{ +Samuel Brown, Rupert Collins, Stephane Boyer, Marie-Caroline Lefort, Jagoba Malumbres-Olarte, Cor Vink, Rob Cruickshank + +Maintainer: +Samuel Brown +} +\references{ +Brown S. D. J., Collins R. A., Boyer S., Lefort M.-C., Malumbres-Olarte J., Vink C. J., & Cruickshank R. H. In Press. SPIDER: an R package for the analysis of species identity and evolution, with particular reference to DNA barcoding. _Molecular Ecology Resources_ +} + +%~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~ +%~~ the R documentation directory ~~ +\keyword{ package } +\seealso{ + +\code{\link{ape-package}}, +\code{\link{pegas-package}}. +} diff --git a/man/sppDist.Rd b/man/sppDist.Rd new file mode 100644 index 0000000..17afc8f --- /dev/null +++ b/man/sppDist.Rd @@ -0,0 +1,57 @@ +\name{sppDist} +\alias{sppDist} + +\title{Intra and inter-specific distances} + +\description{ +Separates a distance matrix into its inter- and intra-specific components. +} +\usage{ +sppDist(distobj, sppVector) +} + +\arguments{ + \item{distobj}{ +A distance matrix. +} + \item{sppVector}{ +The species vector (see \code{\link{sppVector}}). +} +} +\details{ +This function can be used to produce histograms and other charts exploring the `barcode gap', such as in the examples below. +} +\value{ +A list with two elements: +\item{inter}{A numeric vector containing ALL inter-specific pairwise distances.} +\item{intra}{A numeric vector containing ALL intra-specific pairwise distances.} +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{sppDistMatrix}}. +} +\examples{ +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +doloSpDist <- sppDist(doloDist, doloSpp) + +doloSpDist + +#Histogram of the barcode gap +transGreen <- rgb(0, 1, 0, 0.5) #Make a slightly transparent colour to see some overlap +hist(doloSpDist$inter, col="grey") +hist(doloSpDist$intra, col=transGreen, add=TRUE) + +#Boxplot of the same +boxplot(doloSpDist) +} + +\keyword{Barcoding} diff --git a/man/sppDistMatrix.Rd b/man/sppDistMatrix.Rd new file mode 100644 index 0000000..e112c1b --- /dev/null +++ b/man/sppDistMatrix.Rd @@ -0,0 +1,41 @@ +\name{sppDistMatrix} +\alias{sppDistMatrix} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Mean intra- and inter-specific distance matrix +} +\description{ +Creates a matrix giving the mean distances within and between species. +} +\usage{ +sppDistMatrix(distobj, sppVector) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{distobj}{ +A distance matrix. +} + \item{sppVector}{ +The species vector (see \code{\link{sppVector}}). +} +} + +\value{ +A square matrix with dimensions \code{length(sppVector)}. It contains the mean intra specific distances down the diagonal, and the mean pairwise distance between the species in the triangles. The two triangles are identical. +} + +\author{ +Samuel Brown +} + +\examples{ +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +sppDistMatrix(doloDist, doloSpp) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} diff --git a/man/sppVector.Rd b/man/sppVector.Rd new file mode 100644 index 0000000..e0496ed --- /dev/null +++ b/man/sppVector.Rd @@ -0,0 +1,49 @@ +\name{sppVector} +\alias{sppVector} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Species Vectors +} +\description{ +A grouping variable that gives an identity to the individuals in various analyses. +} +\details{ +Species vectors are the key concept behind a lot of \code{spider}'s functionality. They are the method used to group data from individuals into species. It is important to note that "species" in this context can mean any cluster (real or otherwise) that is of interest. Populations, demes, subspecies and genera could be the taxa segregated by "species vectors". + +The two characteristics of a species vector are UNIQUENESS between species and CONSISTENCY within them. R recognises differences of a single character between elements, leading to \code{spider} considering these elements to represent different species. + +There is an easy way and a hard way to create species vectors. The hard way is to type out each element in the vector, making sure no typos or alignment errors are made. + +The easy way is to add species designations into your data matrix from the beginning in such a way that it is easy to use R's data manipulation tools to create a species vector from the names of your data. See the examples for a few ways to do this. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +Functions for creating species vectors: +\code{\link{strsplit}}, \code{\link{substr}}, \code{\link{sapply}}. + +Functions that use species vectors: +\code{\link{nearNeighbour}}, \code{\link{monophyly}}, \code{\link{nonConDist}}, \code{\link{nucDiag}}, \code{\link{rmSingletons}}, \code{\link{slideAnalyses}}, \code{\link{slideBoxplots}}, \code{\link{sppDist}}, \code{\link{sppDistMatrix}}, \code{\link{threshOpt}}. +} + +\examples{ +data(dolomedes) +#Dolomedes species vector +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) + +data(anoteropsis) +#Anoteropsis species vector +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) + + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Utilities} + diff --git a/man/tajima.K.Rd b/man/tajima.K.Rd new file mode 100644 index 0000000..c320d75 --- /dev/null +++ b/man/tajima.K.Rd @@ -0,0 +1,47 @@ +\name{tajima.K} +\alias{tajima.K} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Calculate Tajima's K index of divergence +} +\description{ +Calculates Tajima's K index of divergence. +} +\usage{ +tajima.K(DNAbin, prop = TRUE) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{ +An object of class `DNAbin'. +} + \item{prop}{ +Logical. Should the function report the number of substitutions per nucleotide? Default of TRUE. +} +} + +\value{ +A vector of length 1. If \code{prop = FALSE}, the mean number of substitutions between any two sequences is returned. If \code{prop = TRUE} (the default), this number is returned as the mean number of substitutions per nucleotide (i.e. the above divided by the length of the sequences). +} +\references{ +Tajima, F. (1983). Evolutionary relationship of DNA sequences in finite +populations. _Genetics_ *105*, 437-460. + +} +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{dist.dna}}. +} + +\examples{ +data(anoteropsis) +tajima.K(anoteropsis) +tajima.K(anoteropsis, prop = FALSE) +} + +\keyword{Utilities} diff --git a/man/tclust.Rd b/man/tclust.Rd new file mode 100644 index 0000000..a341ef6 --- /dev/null +++ b/man/tclust.Rd @@ -0,0 +1,53 @@ +\name{tclust} +\alias{tclust} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{Clustering by a threshold} + +\description{Identifies clusters, excluding individuals greater than the threshold from any member.} + +\usage{ +tclust(distobj, threshold = 0.01) +} + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{distobj}{A distance object (usually from \code{\link{dist.dna}}).} + \item{threshold}{Distance cutoff for clustering. Default of 0.01 (1\%).} +} + +\details{ +If two individuals are more distant than \code{threshold} from each other, but both within \code{threshold} of a third, all three are contained in a single cluster. +} + +\value{ +A list with each element giving the index of the individuals contained in each cluster. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{dist.dna}}, \code{\link{localMinima}}. +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} + +\examples{ + +data(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) +anoDist <- dist.dna(anoteropsis) + +tclust(anoDist) + +#Names of individuals +anoClust <- tclust(anoDist) +lapply(anoClust, function(x) anoSpp[x]) +} + +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} diff --git a/man/threshOpt.Rd b/man/threshOpt.Rd new file mode 100644 index 0000000..f80ba9e --- /dev/null +++ b/man/threshOpt.Rd @@ -0,0 +1,73 @@ +\name{threshOpt} +\alias{threshOpt} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Threshold optimisation +} +\description{ +Determines the positive, negative, false positive and false negative rates of identification accuracy for a given threshold. +} +\usage{ +threshOpt(distobj, sppVector, threshold) +} + +\arguments{ + \item{distobj}{ +Distance matrix. +} + +\item{sppVector}{ +Species vector (see \code{\link{sppVector}}). +} + + \item{threshold}{ +Threshold distance for delimiting intra- and inter-specific variation. Default of 0.01. +} +} + +\details{ +When run over a range of thresholds, this function allows the optimisation of threshold values based on minimising the identification error rates. See the example below for more details. +} + +\value{ +A table giving the threshold and number of negative and positive identifications, number of false negative and false positive identifications, and the cumulative error. +} +\references{ +Meyer, C. P., and Paulay, G. (2005). DNA barcoding: error rates based on +comprehensive sampling. _PLoS Biology_ *3* (12), 2229-2238. + +} +\author{ +Rupert Collins +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{localMinima}}. +} +\examples{ +data(anoteropsis) +anoDist <- dist.dna(anoteropsis) +anoSpp <- sapply(strsplit(dimnames(anoteropsis)[[1]], split="_"), + function(x) paste(x[1], x[2], sep="_")) +threshOpt(anoDist, anoSpp) + +data(dolomedes) +doloDist <- dist.dna(dolomedes) +doloSpp <- substr(dimnames(dolomedes)[[1]], 1, 5) +threshOpt(doloDist, doloSpp) + +#Conduct the analysis over a range of values to determine the optimum threshold +threshVal <- seq(0.001,0.02, by = 0.001) +opt <- lapply(threshVal, function(x) threshOpt(doloDist, doloSpp, thresh = x)) +optMat <- do.call(rbind, opt) +barplot(t(optMat)[4:5,], names.arg=optMat[,1], xlab="Threshold values", + ylab="Cumulative error") +legend(x = 2.5, y = 29, legend = c("False positives", "False negatives"), + fill = c("grey75", "grey25")) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Barcoding} diff --git a/man/tiporder.Rd b/man/tiporder.Rd new file mode 100644 index 0000000..b48d2a3 --- /dev/null +++ b/man/tiporder.Rd @@ -0,0 +1,47 @@ +\name{tiporder} +\alias{tiporder} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Orders tip labels by their position on the tree. +} + +\description{ +Provides an ordered vector of tip labels, corresponding to their position on the tree. +} + +\usage{ +tiporder(phy, labels = TRUE) +} + +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{phy}{A tree of class `phylo'.} + \item{labels}{Logical. Should labels be printed? If FALSE, the indices are given. Default of TRUE.} +} + +\value{ +A character or numeric vector giving the names of the tip in the order of their position on the tree. The order is that from top to bottom when the tree is plotted with \code{direction = "rightwards"}. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\examples{ +data(anoteropsis) +anoTree <- nj(dist.dna(anoteropsis)) +tiporder(anoTree) +tiporder(anoTree, labels = FALSE) + + +data(woodmouse) +woodTree <- nj(dist.dna(woodmouse)) +tiporder(woodTree) +tiporder(ladderize(woodTree)) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Utilities} diff --git a/man/titv.Rd b/man/titv.Rd new file mode 100644 index 0000000..acec108 --- /dev/null +++ b/man/titv.Rd @@ -0,0 +1,55 @@ +\name{titv} +\alias{titv} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Number of pairwise transitions and transversions in an alignment. +} +\description{ +Calculates the number of pairwise transitions and transversions between sequences. +} +\usage{ +titv(DNAbin) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{DNAbin}{ +A DNA alignment of class `DNAbin'. +} +} + +\value{ +A square matrix with dimensions of \code{length(dat)}. The upper triangle contains the number of transversions. The lower triangle contains the number of transitions. +} + +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\examples{ +data(dolomedes) + +subs <- titv(dolomedes) + +#Transversions +subs[upper.tri(subs)] +tv <- t(subs) +tv <- tv[lower.tri(tv)] + +#Transitions +ti <- subs[lower.tri(subs)] + + +#Saturation plot +doloDist <- dist.dna(dolomedes) +plot(doloDist, ti, type="p", pch=19, col="blue", + main="Saturation plot of number of transitions and transversions\n + against K2P distance. Red: transversions. Blue: transitions") +points(doloDist, tv, pch=19, col="red") + + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Utilities} diff --git a/man/tree.comp.Rd b/man/tree.comp.Rd new file mode 100644 index 0000000..5945885 --- /dev/null +++ b/man/tree.comp.Rd @@ -0,0 +1,70 @@ +\name{tree.comp} +\alias{tree.comp} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Tree comparisons +} +\description{ +Compares the clades between two trees. +} + +\usage{ +tree.comp(phy1, phy2, method = "prop") +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{phy1, phy2}{ +Trees of class `phylo' to compare. +} + \item{method}{ +One of the following options: + \itemize{ + \item \code{"prop"}---returns the proportion of clades that are the same between the two trees + \item \code{"shallow"}---returns the proportion of shallow clades (clades where \code{node.depth} < median \code{node.depth}) that are the same between the two trees default of \code{"prop"}. + \item \code{"PH85"}---returns the topological distance of Penny and Hendy (1985). + } +} +} +\details{ +This function is a modification of the \code{\link{dist.topo}} function in \code{ape} to give similarity between the two trees as a proportion, and to account for the unreliable resolution of deeper nodes that affect some methods of tree construction (such as NJ). + +It is important that the tip labels of the two trees are the same. If the tip labels are different between the two trees, the method will not recognise any similarity between them. + +This function does not take into account differences in branch length. The \code{"score"} method in \code{\link{dist.topo}} does this if desired. +} +\value{ +Numeric vector of length 1. + +If \code{method = "prop"}, the number returned is the proportion of nodes in the first tree for which there is a node in the second that contains the same tips. Higher number represents greater similarity. If it is 1, the trees are identical. If 0, the trees have no similarity whatsoever. + +When \code{method = "shallow"}, only those nodes tipwards of the median node depth are taken into account. This will not be useful for small trees, but may be helpful with larger datasets. + +\code{"PH85"} is the Penny and Hendy (1985) distance. This measure is the default of \code{\link{dist.topo}}. In this measure, the smaller the number, the closer the trees are. If the trees are identical, this results in 0. + +} +\references{ +Penny, D. and Hendy, M. D. (1985) The use of tree comparison metrics. _Systematic Zoology_ *34* 75-82. + +} +\author{ +Samuel Brown +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +\code{\link{node.depth}}, \code{\link{dist.topo}}. +} + +\examples{ +set.seed(15) +tr <- rtree(15) +set.seed(22) +tr2 <- rtree(15) +tree.comp(tr, tr2) +tree.comp(tr, tr2, method="PH85") +tree.comp(tr, tr2, method="shallow") +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{Utilities}