diff --git a/R/RcppExports.R b/R/RcppExports.R index 26b075b..ca2a353 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,6 +2,5 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 calculate_log_probability <- function(kmer_genus_count, word_specific_priors, genus_counts) { - .Call(`_phylotypr_calculate_log_probability`, kmer_genus_count, word_specific_priors, genus_counts) + .Call(`_phylotypr_calculate_log_probability`, kmer_genus_count, word_specific_priors, genus_counts) } - diff --git a/benchmarking/benchmark_str.R b/benchmarking/benchmark_str.R index 606caaa..cfcdcc2 100644 --- a/benchmarking/benchmark_str.R +++ b/benchmarking/benchmark_str.R @@ -28,7 +28,7 @@ n_kmers <- stringi::stri_length(sequence) - 8 + 1 b_substr <- function() { seq_kmers <- character(length = n_kmers) - for(i in seq_along(1:n_kmers)){ + for (i in seq_along(1:n_kmers)) { seq_kmers[i] <- substr(sequence, i, i + 8 - 1) } } @@ -36,17 +36,17 @@ b_substr <- function() { b_substring <- function() { substring(sequence, 1:n_kmers, 8:1500) } -i_substring <- function(){ +i_substring <- function() { stringi::stri_sub(sequence, 1:n_kmers, 8:1500) } -r_substring <- function(){ +r_substring <- function() { stringr::str_sub(sequence, 1:n_kmers, 8:1500) } microbenchmark(b_substr(), b_substring(), i_substring(), r_substrin) -#toupper and tolower -#seq_to_base4 +# toupper and tolower +# seq_to_base4 sequence_upper <- sequence sequence_lower <- tolower(sequence) @@ -69,44 +69,46 @@ r_toupper <- function(x) { stringr::str_to_upper(x) } -microbenchmark(b_toupper(sequence_upper), - b_toupper(sequence_lower), - b_tolower(sequence_upper), - b_tolower(sequence_lower), - - i_toupper(sequence_upper), - i_toupper(sequence_lower), - i_tolower(sequence_upper), - i_tolower(sequence_lower), - - r_toupper(sequence_upper), - r_toupper(sequence_lower), - r_tolower(sequence_upper), - r_tolower(sequence_lower)) +microbenchmark( + b_toupper(sequence_upper), + b_toupper(sequence_lower), + b_tolower(sequence_upper), + b_tolower(sequence_lower), + i_toupper(sequence_upper), + i_toupper(sequence_lower), + i_tolower(sequence_upper), + i_tolower(sequence_lower), + r_toupper(sequence_upper), + r_toupper(sequence_lower), + r_tolower(sequence_upper), + r_tolower(sequence_lower) +) # want to replace non-atgc with a n character sequence_good <- sequence sequence_bad <- paste0(sequence_good, "R") -b_gsub <- function(x){ +b_gsub <- function(x) { gsub(pattern = "[^ACGT]", replacement = "N", x = x) } -i_replace <- function(x){ +i_replace <- function(x) { stringi::stri_replace_all_charclass(x, "[^ATGC]", "N") } -r_replace <- function(x){ +r_replace <- function(x) { stringr::str_replace_all(x, "[^ATGC]", "N") } -microbenchmark(b_gsub(sequence_good), b_gsub(sequence_bad), - i_replace(sequence_good), i_replace(sequence_bad), - r_replace(sequence_good), r_replace(sequence_bad)) +microbenchmark( + b_gsub(sequence_good), b_gsub(sequence_bad), + i_replace(sequence_good), i_replace(sequence_bad), + r_replace(sequence_good), r_replace(sequence_bad) +) # need to convert A C G T to 0 1 2 3 -b_chartr <- function(){ +b_chartr <- function() { chartr("ACGT", "0123", sequence) } i_chartr <- function() { diff --git a/benchmarking/benchmark_tab.R b/benchmarking/benchmark_tab.R index 328d387..4e0dcfc 100644 --- a/benchmarking/benchmark_tab.R +++ b/benchmarking/benchmark_tab.R @@ -4,7 +4,7 @@ set.seed(19760620) fake_kmers <- sample(1:10, size = 50, replace = TRUE) + 20 n_kmers <- 64 -k_table <- function(){ +k_table <- function() { kmer_table <- table(fake_kmers) kmer_values <- names(kmer_table) |> as.numeric() kmer_counts <- as.numeric(kmer_table) @@ -13,7 +13,7 @@ k_table <- function(){ kmer_vector } -k_tabulate <- function(){ +k_tabulate <- function() { tabulate(fake_kmers, nbins = n_kmers) } diff --git a/benchmarking/benchmarking.R b/benchmarking/benchmarking.R index ffd0ae6..23fabf4 100644 --- a/benchmarking/benchmarking.R +++ b/benchmarking/benchmarking.R @@ -4,7 +4,7 @@ library(Rcpp) vector_grow_append <- function(x) { output <- numeric() - for(i in 1:x) { + for (i in 1:x) { output <- append(output, i^2) } output @@ -12,7 +12,7 @@ vector_grow_append <- function(x) { list_grow_append <- function(x) { output <- vector(mode = "list") - for(i in 1:x) { + for (i in 1:x) { output <- append(output, i^2) } output @@ -20,7 +20,7 @@ list_grow_append <- function(x) { vector_grow_c <- function(x) { output <- numeric() - for(i in 1:x) { + for (i in 1:x) { output <- c(output, i^2) } output @@ -28,7 +28,7 @@ vector_grow_c <- function(x) { list_grow_c <- function(x) { output <- vector(mode = "list") - for(i in 1:x) { + for (i in 1:x) { output <- c(output, i^2) } output @@ -36,7 +36,7 @@ list_grow_c <- function(x) { vector_grow_br <- function(x) { output <- numeric() - for(i in 1:x) { + for (i in 1:x) { output[i] <- i^2 } output @@ -44,7 +44,7 @@ vector_grow_br <- function(x) { list_grow_br <- function(x) { output <- vector(mode = "list") - for(i in 1:x) { + for (i in 1:x) { output[i] <- i^2 } output @@ -52,7 +52,7 @@ list_grow_br <- function(x) { vector_prealloc_sng <- function(x) { output <- numeric(x) - for(i in 1:x) { + for (i in 1:x) { output[i] <- i^2 } output @@ -60,7 +60,7 @@ vector_prealloc_sng <- function(x) { vector_prealloc_dbl <- function(x) { output <- numeric(x) - for(i in 1:x) { + for (i in 1:x) { output[[i]] <- i^2 } output @@ -68,7 +68,7 @@ vector_prealloc_dbl <- function(x) { list_prealloc_dbl <- function(x) { output <- vector(mode = "list", x) - for(i in 1:x) { + for (i in 1:x) { output[[i]] <- i^2 } output @@ -98,28 +98,32 @@ list_xapply <- function(x) { lapply(1:x, \(i) i^2) } -vector_map <- function(x){ +vector_map <- function(x) { map_dbl(1:x, \(i) i^2) } -list_map <- function(x){ +list_map <- function(x) { map(1:x, \(i) i^2) } -vector_magrittr <- function(x){ +vector_magrittr <- function(x) { 1:x %>% (\(i) i^2)() } -list_magrittr <- function(x){ - 1:x %>% (\(i) i^2)() %>% as.list() +list_magrittr <- function(x) { + 1:x %>% + (\(i) i^2)() %>% + as.list() } -vector_base <- function(x){ +vector_base <- function(x) { 1:x |> (\(i) i^2)() } -list_base <- function(x){ - 1:x |> (\(i) i^2)() |> as.list() +list_base <- function(x) { + 1:x |> + (\(i) i^2)() |> + as.list() } @@ -127,39 +131,36 @@ n <- 1e4 sourceCpp("benchmarking.cpp") mb_by_n <- function(n) { - - microbenchmark( - vector_grow_append(n), - vector_grow_c(n), - vector_grow_br(n), - # vector_prealloc_sng(n), - vector_prealloc_dbl(n), - vector_colon(n), - vector_seq(n), - vector_xapply(n), - vector_rcpp(n), - vector_base(n), - vector_magrittr(n), - vector_map(n), - - list_grow_append(n), - list_grow_c(n), - list_grow_br(n), - # list_prealloc_sng(n), - list_prealloc_dbl(n), - list_colon(n), - list_seq(n), - list_xapply(n), - list_rcpp(n), - list_base(n), - list_magrittr(n), - list_map(n) - ) %>% + microbenchmark( + vector_grow_append(n), + vector_grow_c(n), + vector_grow_br(n), + # vector_prealloc_sng(n), + vector_prealloc_dbl(n), + vector_colon(n), + vector_seq(n), + vector_xapply(n), + vector_rcpp(n), + vector_base(n), + vector_magrittr(n), + vector_map(n), + list_grow_append(n), + list_grow_c(n), + list_grow_br(n), + # list_prealloc_sng(n), + list_prealloc_dbl(n), + list_colon(n), + list_seq(n), + list_xapply(n), + list_rcpp(n), + list_base(n), + list_magrittr(n), + list_map(n) + ) %>% group_by(expr) %>% summarize(median_time = median(time)) %>% arrange(-median_time) %>% mutate(n = n) - } ns <- c(1, 10, 100, 1000, 2500, 5000, 7500, 10000, 12500, 15000) @@ -178,32 +179,31 @@ mb_data %>% mb_by_n(1e4) %>% - mutate(expr = str_replace(expr, "\\(n\\)", ""), - expr = str_replace(expr, "_", "-")) %>% + mutate( + expr = str_replace(expr, "\\(n\\)", ""), + expr = str_replace(expr, "_", "-") + ) %>% separate_wider_delim(expr, names = c("structure", "f"), delim = "-") %>% select(-n) %>% pivot_wider(names_from = structure, values_from = median_time) vector_get_one <- function(x) { - index <- c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50) x[5] - } vector_get_one_named <- function(x) { - - index <- c("ccx", "grs", "lmh", "bls", "gjy", - "eee", "arn", "kwp", "ffz", "kcn") + index <- c( + "ccx", "grs", "lmh", "bls", "gjy", + "eee", "arn", "kwp", "ffz", "kcn" + ) x["lmh"] - } vector_get_ten <- function(x) { - index <- c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50) x[5] @@ -216,13 +216,13 @@ vector_get_ten <- function(x) { x[40] x[45] x[50] - } vector_get_ten_named <- function(x) { - - index <- c("ccx", "grs", "lmh", "bls", "gjy", - "eee", "arn", "kwp", "ffz", "kcn") + index <- c( + "ccx", "grs", "lmh", "bls", "gjy", + "eee", "arn", "kwp", "ffz", "kcn" + ) x["ccx"] x["grs"] @@ -238,38 +238,39 @@ vector_get_ten_named <- function(x) { vector_get_c <- function(x) { - index <- c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50) - c(x[5], x[10], x[15], x[20], x[25], - x[30], x[35], x[40], x[45], x[50]) - + c( + x[5], x[10], x[15], x[20], x[25], + x[30], x[35], x[40], x[45], x[50] + ) } vector_get_c_named <- function(x) { - - index <- c("ccx", "grs", "lmh", "bls", "gjy", - "eee", "arn", "kwp", "ffz", "kcn") - - c(x["ccx"], x["grs"], x["lmh"], x["bls"], - x["gjy"], x["eee"], x["arn"], x["kwp"], - x["ffz"], x["kcn"]) + index <- c( + "ccx", "grs", "lmh", "bls", "gjy", + "eee", "arn", "kwp", "ffz", "kcn" + ) + + c( + x["ccx"], x["grs"], x["lmh"], x["bls"], + x["gjy"], x["eee"], x["arn"], x["kwp"], + x["ffz"], x["kcn"] + ) } vector_get_index <- function(x) { - index <- c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50) x[index] - } vector_get_index_named <- function(x) { - - index <- c("ccx", "grs", "lmh", "bls", "gjy", - "eee", "arn", "kwp", "ffz", "kcn") + index <- c( + "ccx", "grs", "lmh", "bls", "gjy", + "eee", "arn", "kwp", "ffz", "kcn" + ) x[index] - } long_vector <- (1:1e4)^2 @@ -280,25 +281,27 @@ n <- expand_grid(letters, letters, letters) |> long_namedlist <- long_list names(long_namedlist) <- n[1:length(long_list)] -microbenchmark(vector_get_one(long_vector), - vector_get_ten(long_vector), - vector_get_c(long_vector), - vector_get_index(long_vector), - vector_get_one(long_list), - vector_get_ten(long_list), - vector_get_c(long_list), - vector_get_index_named(long_list), - vector_get_one_named(long_namedlist), - vector_get_ten_named(long_namedlist), - vector_get_c_named(long_namedlist), - vector_get_index_named(long_namedlist) - ) %>% +microbenchmark( + vector_get_one(long_vector), + vector_get_ten(long_vector), + vector_get_c(long_vector), + vector_get_index(long_vector), + vector_get_one(long_list), + vector_get_ten(long_list), + vector_get_c(long_list), + vector_get_index_named(long_list), + vector_get_one_named(long_namedlist), + vector_get_ten_named(long_namedlist), + vector_get_c_named(long_namedlist), + vector_get_index_named(long_namedlist) +) %>% group_by(expr) %>% summarize(median_time = median(time)) %>% arrange(-median_time) %>% - mutate(expr = str_replace(expr, "\\((.*)\\)", " \\1"), - expr = str_replace(expr, "vector_", "")) %>% + mutate( + expr = str_replace(expr, "\\((.*)\\)", " \\1"), + expr = str_replace(expr, "vector_", "") + ) %>% separate_wider_delim(expr, names = c("f", "data"), delim = " ") %>% mutate(f = str_replace(f, "_named", "")) %>% pivot_wider(names_from = data, values_from = median_time) - diff --git a/benchmarking/benchmarking_df.R b/benchmarking/benchmarking_df.R index a951d84..3f30317 100644 --- a/benchmarking/benchmarking_df.R +++ b/benchmarking/benchmarking_df.R @@ -11,92 +11,110 @@ set.seed(19760620) n_kmers <- 4^8 n_genera <- 4000 -df_grow_rbind <- function(n){ +df_grow_rbind <- function(n) { df <- data.frame(kmer = NULL, genus = NULL, count = NULL) - for(i in 1:n) { - df <- rbind(df, list(kmer = sample(n_kmers, 1), - genus = sample(n_genera, 1), - count = sample(10, 1))) + for (i in 1:n) { + df <- rbind(df, list( + kmer = sample(n_kmers, 1), + genus = sample(n_genera, 1), + count = sample(10, 1) + )) } df } -tbl_grow_rbind <- function(n){ +tbl_grow_rbind <- function(n) { df <- tibble(kmer = NULL, genus = NULL, count = NULL) - for(i in 1:n) { - df <- bind_rows(df, tibble_row(kmer = sample(n_kmers, 1), - genus = sample(n_genera, 1), - count = sample(10, 1))) + for (i in 1:n) { + df <- bind_rows(df, tibble_row( + kmer = sample(n_kmers, 1), + genus = sample(n_genera, 1), + count = sample(10, 1) + )) } df } -dt_grow_rbind <- function(n){ +dt_grow_rbind <- function(n) { df <- data.table(kmer = NULL, genus = NULL, count = NULL) - for(i in 1:n) { - df <- rbindlist(list(df, list(kmer = sample(n_kmers, 1), - genus = sample(n_genera, 1), - count = sample(10, 1)))) + for (i in 1:n) { + df <- rbindlist(list(df, list( + kmer = sample(n_kmers, 1), + genus = sample(n_genera, 1), + count = sample(10, 1) + ))) } df } -df_grow_index <- function(n){ +df_grow_index <- function(n) { df <- data.frame(kmer = 0, genus = 0, count = 0) - for(i in 1:n) { - df[i, ] <- list(kmer = sample(n_kmers, 1), - genus = sample(n_genera, 1), - count = sample(10, 1)) + for (i in 1:n) { + df[i, ] <- list( + kmer = sample(n_kmers, 1), + genus = sample(n_genera, 1), + count = sample(10, 1) + ) } df } -tbl_grow_index <- function(n){ +tbl_grow_index <- function(n) { df <- tibble(kmer = 0, genus = 0, count = 0) - for(i in 1:n) { - df[i, ] <- tibble_row(kmer = sample(n_kmers, 1), - genus = sample(n_genera, 1), - count = sample(10, 1)) + for (i in 1:n) { + df[i, ] <- tibble_row( + kmer = sample(n_kmers, 1), + genus = sample(n_genera, 1), + count = sample(10, 1) + ) } df } -df_prealloc_index <- function(n){ +df_prealloc_index <- function(n) { df <- data.frame(kmer = rep(0, n), genus = rep(0, n), count = rep(0, n)) - for(i in 1:n) { - df[i, ] <- list(kmer = sample(n_kmers, 1), - genus = sample(n_genera, 1), - count = sample(10, 1)) + for (i in 1:n) { + df[i, ] <- list( + kmer = sample(n_kmers, 1), + genus = sample(n_genera, 1), + count = sample(10, 1) + ) } df } -tbl_prealloc_index <- function(n){ +tbl_prealloc_index <- function(n) { df <- tibble(kmer = rep(0, n), genus = rep(0, n), count = rep(0, n)) - for(i in 1:n) { - df[i, ] <- tibble_row(kmer = sample(n_kmers, 1), - genus = sample(n_genera, 1), - count = sample(10, 1)) + for (i in 1:n) { + df[i, ] <- tibble_row( + kmer = sample(n_kmers, 1), + genus = sample(n_genera, 1), + count = sample(10, 1) + ) } df } -dt_prealloc_index_orig <- function(n){ +dt_prealloc_index_orig <- function(n) { df <- data.table(kmer = rep(0, n), genus = rep(0, n), count = rep(0, n)) - for(i in 1:n) { - df[i, (colnames(df)) := list(sample(n_kmers, 1), - sample(n_genera, 1), - sample(10, 1))] + for (i in 1:n) { + df[i, (colnames(df)) := list( + sample(n_kmers, 1), + sample(n_genera, 1), + sample(10, 1) + )] } df } -dt_prealloc_index_set <- function(n){ +dt_prealloc_index_set <- function(n) { df <- data.table(kmer = rep(0, n), genus = rep(0, n), count = rep(0, n)) - for(i in 1:n) { - set(df, i, 1:3, list(sample(n_kmers, 1), - sample(n_genera, 1), - sample(10, 1))) + for (i in 1:n) { + set(df, i, 1:3, list( + sample(n_kmers, 1), + sample(n_genera, 1), + sample(10, 1) + )) } df } @@ -142,13 +160,13 @@ matrix_predefine <- function(n) { m <- matrix(0, nrow = n_kmers, ncol = n_genera) - for(i in 1:n) { + for (i in 1:n) { m[kmer[i], genus[i]] <- count[i] } m } -matrix_sparseC_old <- function(n){ +matrix_sparseC_old <- function(n) { kmer <- sample(n_kmers, n, replace = TRUE) genus <- sample(n_genera, n, replace = TRUE) count <- sample(10, n, replace = TRUE) @@ -156,21 +174,20 @@ matrix_sparseC_old <- function(n){ m <- Matrix(0, nrow = n_kmers, ncol = n_genera) m <- as(m, "CsparseMatrix") - for(i in 1:n) { + for (i in 1:n) { m[kmer[i], genus[i]] <- count[i] } m - } -matrix_sparseC <- function(n){ +matrix_sparseC <- function(n) { kmer <- sample(n_kmers, n, replace = TRUE) genus <- sample(n_genera, n, replace = TRUE) count <- sample(10, n, replace = TRUE) sparseMatrix(i = kmer, j = genus, x = count, repr = "C") } -matrix_sparseR_old <- function(n){ +matrix_sparseR_old <- function(n) { kmer <- sample(n_kmers, n, replace = TRUE) genus <- sample(n_genera, n, replace = TRUE) count <- sample(10, n, replace = TRUE) @@ -178,13 +195,13 @@ matrix_sparseR_old <- function(n){ m <- Matrix(0, nrow = n_kmers, ncol = n_genera) m <- as(m, "RsparseMatrix") - for(i in 1:n) { + for (i in 1:n) { m[kmer[i], genus[i]] <- count[i] } m } -matrix_sparseR <- function(n){ +matrix_sparseR <- function(n) { kmer <- sample(n_kmers, n, replace = TRUE) genus <- sample(n_genera, n, replace = TRUE) count <- sample(10, n, replace = TRUE) @@ -192,7 +209,7 @@ matrix_sparseR <- function(n){ } -matrix_sparseT_old <- function(n){ +matrix_sparseT_old <- function(n) { kmer <- sample(n_kmers, n, replace = TRUE) genus <- sample(n_genera, n, replace = TRUE) count <- sample(10, n, replace = TRUE) @@ -200,14 +217,13 @@ matrix_sparseT_old <- function(n){ m <- Matrix(0, nrow = n_kmers, ncol = n_genera) m <- as(m, "TsparseMatrix") - for(i in 1:n) { + for (i in 1:n) { m[kmer[i], genus[i]] <- count[i] } m - } -matrix_sparseT <- function(n){ +matrix_sparseT <- function(n) { kmer <- sample(n_kmers, n, replace = TRUE) genus <- sample(n_genera, n, replace = TRUE) count <- sample(10, n, replace = TRUE) @@ -219,30 +235,30 @@ Rcpp::sourceCpp("benchmarking_df.cpp") n <- 100 -microbenchmark(df_grow_rbind(n), - df_grow_index(n), - df_prealloc_index(n), - df_predefine(n), - tbl_grow_rbind(n), - tbl_grow_index(n), - tbl_prealloc_index(n), - tbl_predefine(n), - dt_grow_rbind(n), - dt_prealloc_index_orig(n), - dt_prealloc_index_set(n), - dt_predefine(n), - matrix_sparseC_old(n), - matrix_sparseR_old(n), - matrix_sparseT_old(n), - matrix_sparseC(n), - matrix_sparseR(n), - matrix_sparseT(n), - list_predefine(n), - vector_predefine(n), - matrix_predefine(n), - df_rcpp(n) - - ) %>% +microbenchmark( + df_grow_rbind(n), + df_grow_index(n), + df_prealloc_index(n), + df_predefine(n), + tbl_grow_rbind(n), + tbl_grow_index(n), + tbl_prealloc_index(n), + tbl_predefine(n), + dt_grow_rbind(n), + dt_prealloc_index_orig(n), + dt_prealloc_index_set(n), + dt_predefine(n), + matrix_sparseC_old(n), + matrix_sparseR_old(n), + matrix_sparseT_old(n), + matrix_sparseC(n), + matrix_sparseR(n), + matrix_sparseT(n), + list_predefine(n), + vector_predefine(n), + matrix_predefine(n), + df_rcpp(n) +) %>% group_by(expr) %>% summarize(median_time = median(time)) %>% arrange(-median_time) @@ -275,98 +291,98 @@ lst <- list_predefine(n) arr <- arrow_table(df) -get_df_single <- function(){ - df[which(df$kmer == 20),] +get_df_single <- function() { + df[which(df$kmer == 20), ] } -get_df_three <- function(){ - df[which(df$kmer == 20 | df$kmer == 30 | df$kmer == 50),] +get_df_three <- function() { + df[which(df$kmer == 20 | df$kmer == 30 | df$kmer == 50), ] } -get_dt_single <- function(){ - dt[which(kmer == 20),] +get_dt_single <- function() { + dt[which(kmer == 20), ] } -get_dt_three <- function(){ - dt[which(kmer == 20 | kmer == 30 | kmer == 50),] +get_dt_three <- function() { + dt[which(kmer == 20 | kmer == 30 | kmer == 50), ] } -get_dt_singlek <- function(){ +get_dt_singlek <- function() { dt2[.(20)] } -get_dt_threek <- function(){ - dt2[.(c(20, 50, 30)),] +get_dt_threek <- function() { + dt2[.(c(20, 50, 30)), ] } -get_tbl_single <- function(){ +get_tbl_single <- function() { filter(tbl, kmer == 20) } -get_tbl_three <- function(){ +get_tbl_three <- function() { filter(tbl, kmer == 20 | kmer == 30 | kmer == 50) } -get_duck_single <- function(){ +get_duck_single <- function() { filter(duck, kmer == 20) } -get_duck_three <- function(){ +get_duck_three <- function() { filter(duck, kmer == 20 | kmer == 30 | kmer == 50) } -get_arrow_single <- function(){ +get_arrow_single <- function() { filter(arr, kmer == 20) } -get_arrow_three <- function(){ +get_arrow_three <- function() { filter(arr, kmer == 20 | kmer == 30 | kmer == 50) } -get_dbi_single <- function(){ +get_dbi_single <- function() { dbGetQuery(con, "SELECT * FROM duck WHERE kmer == 20") } -get_dbi_three <- function(){ +get_dbi_three <- function() { dbGetQuery(con, "SELECT * FROM duck WHERE kmer == 20 OR kmer == 30 OR kmer == 50") } -get_tbl_threeJ <- function(){ +get_tbl_threeJ <- function() { filter_table <- tibble(kmer = c(20, 30, 50)) inner_join(tbl, filter_table, by = "kmer") } -get_mfull_single <- function(){ - mfull[20,] +get_mfull_single <- function() { + mfull[20, ] } -get_mfull_three <- function(){ - mfull[c(20, 30, 50),] +get_mfull_three <- function() { + mfull[c(20, 30, 50), ] } -get_msparseT_single <- function(){ - msparseT[20,] +get_msparseT_single <- function() { + msparseT[20, ] } -get_msparseT_three <- function(){ - msparseT[c(20, 30, 50),] +get_msparseT_three <- function() { + msparseT[c(20, 30, 50), ] } -get_msparseR_single <- function(){ - msparseR[20,] +get_msparseR_single <- function() { + msparseR[20, ] } -get_msparseR_three <- function(){ - msparseR[c(20, 30, 50),] +get_msparseR_three <- function() { + msparseR[c(20, 30, 50), ] } -get_msparseC_single <- function(){ - msparseC[20,] +get_msparseC_single <- function() { + msparseC[20, ] } -get_msparseC_three <- function(){ - msparseC[c(20, 30, 50),] +get_msparseC_three <- function() { + msparseC[c(20, 30, 50), ] } kmer_vec <- lst$kmer @@ -375,81 +391,90 @@ count_vec <- lst$count get_vector_single <- function() { kmer_present <- kmer_vec == 20 - list(kmer = kmer_vec[kmer_present], - genus = genus_vec[kmer_present], - count = count_vec[kmer_present]) + list( + kmer = kmer_vec[kmer_present], + genus = genus_vec[kmer_present], + count = count_vec[kmer_present] + ) } get_vector_threeOR <- function() { kmer_present <- kmer_vec == 20 | kmer_vec == 30 | kmer_vec == 50 - list(kmer = kmer_vec[kmer_present], - genus = genus_vec[kmer_present], - count = count_vec[kmer_present]) + list( + kmer = kmer_vec[kmer_present], + genus = genus_vec[kmer_present], + count = count_vec[kmer_present] + ) } get_vector_threeIN <- function() { kmer_present <- kmer_vec %in% c(20, 30, 50) - list(kmer = kmer_vec[kmer_present], - genus = genus_vec[kmer_present], - count = count_vec[kmer_present]) + list( + kmer = kmer_vec[kmer_present], + genus = genus_vec[kmer_present], + count = count_vec[kmer_present] + ) } get_vector_str2lang <- function() { kmer_present <- eval(str2lang(paste(paste("kmer_vec == ", c(20, 30, 50)), - collapse = " | "))) + collapse = " | " + ))) kmer_vec[kmer_present] } -get_which_single <- function(){ +get_which_single <- function() { kmer_present <- which(kmer_vec == 20) - list(kmer = kmer_vec[kmer_present], - genus = genus_vec[kmer_present], - count = count_vec[kmer_present]) + list( + kmer = kmer_vec[kmer_present], + genus = genus_vec[kmer_present], + count = count_vec[kmer_present] + ) } -get_which_three <- function(){ +get_which_three <- function() { kmer_present <- which(kmer_vec == 20 | kmer_vec == 30 | kmer_vec == 50) - list(kmer = kmer_vec[kmer_present], - genus = genus_vec[kmer_present], - count = count_vec[kmer_present]) + list( + kmer = kmer_vec[kmer_present], + genus = genus_vec[kmer_present], + count = count_vec[kmer_present] + ) } microbenchmark(get_df_single(), - get_duck_single(), - get_arrow_single(), - get_dt_single(), - get_dt_singlek(), - get_tbl_single(), - get_dbi_single(), - get_mfull_single(), - get_msparseT_single(), - get_msparseR_single(), - get_msparseC_single(), - # get_vector_single(), - get_which_single(), - - get_df_three(), - get_dt_three(), - get_dt_threek(), - get_dbi_three(), - get_tbl_three(), - get_duck_three(), - get_arrow_three(), - # get_tbl_threeJ(), - get_mfull_three(), - get_msparseT_three(), - get_msparseR_three(), - get_msparseC_three(), - #get_vector_threeOR(), - #get_vector_threeIN(), - get_which_three(), - #get_vector_str2lang(), - times = 10 - ) %>% + get_duck_single(), + get_arrow_single(), + get_dt_single(), + get_dt_singlek(), + get_tbl_single(), + get_dbi_single(), + get_mfull_single(), + get_msparseT_single(), + get_msparseR_single(), + get_msparseC_single(), + # get_vector_single(), + get_which_single(), + get_df_three(), + get_dt_three(), + get_dt_threek(), + get_dbi_three(), + get_tbl_three(), + get_duck_three(), + get_arrow_three(), + # get_tbl_threeJ(), + get_mfull_three(), + get_msparseT_three(), + get_msparseR_three(), + get_msparseC_three(), + # get_vector_threeOR(), + # get_vector_threeIN(), + get_which_three(), + # get_vector_str2lang(), + times = 10 +) %>% group_by(expr) %>% summarize(median_time = median(time)) %>% arrange(-median_time) %>% print(n = Inf) - diff --git a/benchmarking/benchmarking_joins.R b/benchmarking/benchmarking_joins.R index a22f777..eef58d0 100644 --- a/benchmarking/benchmarking_joins.R +++ b/benchmarking/benchmarking_joins.R @@ -1,24 +1,31 @@ +animal_legs <- data.frame( + animal = c("cow", "fish", "chicken", "dog", "sheep"), + n_legs = c(4, 0, 2, 4, 4) +) -animal_legs <- data.frame(animal = c("cow", "fish", "chicken", "dog", "sheep"), - n_legs = c(4, 0, 2, 4, 4)) - -animal_sounds <- data.frame(animal = c("cow", "chicken", "cat", "sheep", "dog"), - sounds = c("mooo", "cluck", "meow", "baaa", "bark")) +animal_sounds <- data.frame( + animal = c("cow", "chicken", "cat", "sheep", "dog"), + sounds = c("mooo", "cluck", "meow", "baaa", "bark") +) -animal_pats_farm <- data.frame(animals = c("cow", "fish", "chicken", "dog", "cat", "sheep"), - pat_has = c(T, F, T, T, T, T)) -#base R -merge(animal_legs, animal_sounds, all = FALSE) #inner join -merge(animal_legs, animal_sounds, all = FALSE, by = "animal") #inner join -merge(animal_legs, animal_sounds, all = TRUE) #full join -merge(animal_legs, animal_sounds, all.x = TRUE) #left join -merge(animal_legs, animal_sounds, all.y = TRUE) #right join +animal_pats_farm <- data.frame( + animals = c("cow", "fish", "chicken", "dog", "cat", "sheep"), + pat_has = c(T, F, T, T, T, T) +) +# base R +merge(animal_legs, animal_sounds, all = FALSE) # inner join +merge(animal_legs, animal_sounds, all = FALSE, by = "animal") # inner join +merge(animal_legs, animal_sounds, all = TRUE) # full join +merge(animal_legs, animal_sounds, all.x = TRUE) # left join +merge(animal_legs, animal_sounds, all.y = TRUE) # right join -merge(animal_legs, animal_pats_farm, all = FALSE, - by.x = "animal", by.y = "animals") #inner join +merge(animal_legs, animal_pats_farm, + all = FALSE, + by.x = "animal", by.y = "animals" +) # inner join -#dplyr +# dplyr inner_join(animal_legs, animal_sounds) inner_join(animal_legs, animal_sounds, by = "animal") inner_join(animal_legs, animal_sounds, by = join_by(animal)) @@ -27,31 +34,31 @@ full_join(animal_sounds, animal_legs, by = "animal") left_join(animal_legs, animal_sounds, by = "animal") right_join(animal_legs, animal_sounds, by = "animal") -#inner_join(animal_legs, animal_pats_farm) #Inner join errors +# inner_join(animal_legs, animal_pats_farm) #Inner join errors inner_join(animal_legs, animal_pats_farm, by = c("animal" = "animals")) inner_join(animal_legs, animal_pats_farm, by = join_by(animal == animals)) inner_join(animal_legs, animal_pats_farm, by = join_by(animal == animals)) |> inner_join(x = _, animal_sounds, by = "animal") -#%>% +# %>% inner_join(animal_legs, animal_pats_farm, by = join_by(animal == animals)) %>% inner_join(., animal_sounds, by = "animal") -#data.table +# data.table animal_legs_dt <- data.table::data.table(animal_legs, key = "animal") animal_sounds_dt <- data.table::data.table(animal_sounds, key = "animal") -#animal_legs[row, column, by, other arguments] -animal_legs_dt[animal_sounds_dt, on = .(animal)] #right join -animal_sounds_dt[animal_legs_dt, on = .(animal)] #left join -animal_sounds_dt[animal_legs_dt, nomatch = NULL, on = .(animal)] #inner join -animal_sounds_dt[animal_legs_dt, nomatch = NULL] #inner join +# animal_legs[row, column, by, other arguments] +animal_legs_dt[animal_sounds_dt, on = .(animal)] # right join +animal_sounds_dt[animal_legs_dt, on = .(animal)] # left join +animal_sounds_dt[animal_legs_dt, nomatch = NULL, on = .(animal)] # inner join +animal_sounds_dt[animal_legs_dt, nomatch = NULL] # inner join uniq_animals <- unique(c(animal_sounds_dt$animal, animal_legs_dt$animal)) -animal_legs_dt[animal_sounds_dt[uniq_animals], on = .(animal)] #full join +animal_legs_dt[animal_sounds_dt[uniq_animals], on = .(animal)] # full join animal_pats_farm_dt <- data.table::data.table(animal_pats_farm, key = "animals") @@ -73,8 +80,7 @@ microbenchmark::microbenchmark( dt = { fasta_dt <- data.table::data.table(fasta_df, key = "id") genera_dt <- data.table::data.table(genera, key = "id") - fasta_dt[genera_dt, nomatch = NULL, on = .(id)] #inner join + fasta_dt[genera_dt, nomatch = NULL, on = .(id)] # inner join }, dtA = fasta_dtA[genera_dtA, nomatch = NULL, on = .(id)] - ) diff --git a/benchmarking/benchmarking_readfile.R b/benchmarking/benchmarking_readfile.R index 4e5f861..0b8b5fb 100644 --- a/benchmarking/benchmarking_readfile.R +++ b/benchmarking/benchmarking_readfile.R @@ -1,12 +1,10 @@ - file <- "benchmarking/trainset19_072023.rdp/trainset19_072023.rdp.fasta" microbenchmark::microbenchmark( scan = scan(file, sep = "\n", what = character(), quiet = TRUE) |> stringi::stri_detect_regex("^>"), rl = readLines(file) |> stringi::stri_detect_regex("^>"), dt = data.table::fread(file, header = FALSE)$V1 |> stringi::stri_detect_regex("^>"), - r_l= readr::read_lines(file) |> stringi::stri_detect_regex("^>"), + r_l = readr::read_lines(file) |> stringi::stri_detect_regex("^>"), v_l = vroom::vroom_lines(file) |> stringi::stri_detect_regex("^>"), times = 10 - ) diff --git a/benchmarking/benchmarking_readtsv.R b/benchmarking/benchmarking_readtsv.R index 1767346..f52871c 100644 --- a/benchmarking/benchmarking_readtsv.R +++ b/benchmarking/benchmarking_readtsv.R @@ -1,29 +1,37 @@ taxonomy <- "benchmarking/trainset19_072023.rdp/trainset19_072023.rdp.tax" genera <- readr::read_tsv(taxonomy, - col_names = c("accession", "taxonomy")) |> + col_names = c("accession", "taxonomy") +) |> dplyr::mutate(taxonomy = stringi::stri_replace_all_regex(taxonomy, ";$", "")) microbenchmark::microbenchmark( - r.d = read.delim(taxonomy, header = FALSE, - col.names = c("accession", "taxonomy"), - colClasses = c("character", "character")) |> + r.d = read.delim(taxonomy, + header = FALSE, + col.names = c("accession", "taxonomy"), + colClasses = c("character", "character") + ) |> dplyr::mutate(taxonomy = stringi::stri_replace_last_regex(taxonomy, ";$", "")), r_t = readr::read_tsv(taxonomy, - col_names = c("accession", "taxonomy"), - col_types = readr::cols(.default = readr::col_character())) |> + col_names = c("accession", "taxonomy"), + col_types = readr::cols(.default = readr::col_character()) + ) |> dplyr::mutate(taxonomy = stringi::stri_replace_last_regex(taxonomy, ";$", "")), - v = vroom::vroom(taxonomy, delim = "\t", - col_names = c("accession", "taxonomy"), - col_types = vroom::cols(.default = vroom::col_character())) |> + v = vroom::vroom(taxonomy, + delim = "\t", + col_names = c("accession", "taxonomy"), + col_types = vroom::cols(.default = vroom::col_character()) + ) |> dplyr::mutate(taxonomy = stringi::stri_replace_last_regex(taxonomy, ";$", "")), - d.t = data.table::fread(taxonomy, sep = "\t", header = FALSE, - col.names = c("accession", "taxonomy")) |> + d.t = data.table::fread(taxonomy, + sep = "\t", header = FALSE, + col.names = c("accession", "taxonomy") + ) |> dplyr::mutate(taxonomy = stringi::stri_replace_last_regex(taxonomy, ";$", "")), arr = arrow::read_tsv_arrow(taxonomy, - col_names = c("accession", "taxonomy"), - col_types = "cc") |> + col_names = c("accession", "taxonomy"), + col_types = "cc" + ) |> dplyr::mutate(taxonomy = stringi::stri_replace_last_regex(taxonomy, ";$", "")) - ) diff --git a/inst/WORDLIST b/inst/WORDLIST index 97e390b..02a3b30 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,16 +1,109 @@ +AEM +al +Appl +archaeal +aut +autom Barnesiella +Benchmarking +BugReports +bz +CallEntries +CallMethodDef +cerr CMD Codecov +compileAttributes +Config +const +countSEXP +countsSEXP +cout +cre +DataFrame +desc +df +DL +dll +DllInfo +doi +edu +endif +Environ +et +eukaryotic +fasta +FASTA +FUNC +Garrity +github greengenes +gz +http +https +ifdef +init +JM +kmer kmers +knitr +LazyData +LazyDataCompression +LinkingTo +linter +MERCHANTABILITY +Microbiol microbiome mothur +nolint +NONINFRINGEMENT nt +NumericMatrix +NumericVector +ORCID +pds phylotypr +PMC +PMCID +PMID Porphyromonadaceae +priorsSEXP +pschloss +purrr QIIME +Rcerr +Rcout +rcpp +Rcpp +RCPP +RcppExport RDP +rdp RDP's +readr +registerRoutines +Rfast Ribosomal Riffomonas +riffomonas +rmarkdown +rngScope +RNGScope +RObject +Rostream +ROSTREAM +Roxygen +RoxygenNote rRNA +seqeunces +SEXP +sourceforge +stringi +sublicense +testthat +Tiedje +trainset +umich +useDynamicSymbols +VignetteBuilder +xz