Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dir_stat() #6

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Description: Intense parallel workloads can be difficult to monitor.
visualize the log file to reveal potential resource-related
reasons for the crash. The 'autometric' package borrows heavily from
the methods of packages 'ps' <doi:10.32614/CRAN.package.ps> and 'psutil'.
Version: 0.1.2.9000
Version: 0.1.2.9001
License: MIT + file LICENSE
URL:
https://wlandau.github.io/autometric/,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(dir_stat)
export(log_active)
export(log_phase_get)
export(log_phase_reset)
Expand Down
5 changes: 2 additions & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# autometric 0.1.2.9000

# autometric 0.1.2.9001

* Add `dir_stat()` to return file info metadata in a directory faster than `base::file.info()` (#5).

# autometric 0.1.2

Expand Down Expand Up @@ -36,4 +36,3 @@
# autometric 0.0.3

* First version

100 changes: 100 additions & 0 deletions R/dir_stat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#' @title Efficiently describe files in a directory.
#' @export
#' @family utilities
#' @description List the paths, sizes, and modification times
#'   of all the regular files at (or linked from)
#'   the top level in a directory.
#' @details In large computational pipelines, it is common to end up with
#'   tens of thousands of log files in a directory.
#'   At this level of scale, [base::file.info()]
#'   is slow on older file systems.
#'   [autometric::dir_stat()] can be up to 40 times faster where
#'   the C implementation is supported
#'   (POSIX.1-2008 machines and Mac OS).
#'
#'   [dir_stat()] is not recursive: it only queries regular files at the
#'   top level of a directory. In addition, it follows symbolic links:
#'   if a file is a link, then [dir_stat()] describes the file it points
#'   to, rather than the link itself.
#' @return A data frame with one row per file and columns for the file path,
#'   numeric size, and modification time stamp of each file.
#'   The units of these last two columns are controlled by the
#'   `units_size` and `units_mtime` arguments, respectively.
#'   If `recent` is supplied, only files modified within that time window
#'   are returned, and files whose modification time is unavailable
#'   are dropped.
#' @param path Character string, file path to the directory of files
#'   to describe.
#' @param units_size Character string with the units of the returned
#'   `size` column in the output: `"megabytes"`, `"bytes"`, `"kilobytes"`,
#'   or `"gigabytes"`.
#' @param units_mtime Character string with the units of the returned
#'   `mtime` column in the output with file modification time stamps.
#'   Choices are `"POSIXct"` for a `POSIXct` time object or `"numeric"`
#'   for an ordinary numeric vector.
#' @param recent Either `NULL` or an optional `"difftime"` object.
#'   If a `"difftime"` object is supplied, then [dir_stat()]
#'   only shows the most recently modified files in that time window.
#'   For example, `recent = as.difftime(1.5, units = "hours")` tells
#'   [dir_stat()] to only return information on files modified within
#'   the last 1.5 hours.
#' @param method Character string, type of implementation used.
#'   Set to `"c"` for an implementation that is up to 40 times faster than
#'   [base::file.info()] but may not be supported on certain platforms.
#'   Set to `"r"` to run [base::file.info()], which is slower.
#'   If `method` is `"c"` but the C implementation is not supported
#'   on your platform, [dir_stat()] automatically falls back on
#'   [base::file.info()].
#'   The C implementation is supported on POSIX.1-2008 machines and on Mac OS.
#' @examples
#'   file.create(tempfile())
#'   file.create(tempfile())
#'   if (tolower(Sys.info()["sysname"]) != "windows") {
#'     print(dir_stat(tempdir(), recent = as.difftime(1, units = "hours")))
#'   }
dir_stat <- function(
  path,
  units_size = c("megabytes", "bytes", "kilobytes", "gigabytes"),
  units_mtime = c("POSIXct", "numeric"),
  recent = NULL,
  method = c("c", "r")
) {
  stopifnot(is.character(path))
  stopifnot(!anyNA(path))
  stopifnot(all(nzchar(path)))
  stopifnot(dir.exists(path))
  units_size <- match.arg(units_size)
  units_mtime <- match.arg(units_mtime)
  method <- match.arg(method)
  # Validate `recent` up front so a bad argument fails before the
  # (potentially expensive) directory scan instead of after it.
  if (!is.null(recent)) {
    stopifnot(length(recent) == 1L)
    stopifnot(!anyNA(recent))
    stopifnot(inherits(recent, "difftime"))
  }
  out <- NULL
  if (method == "c") {
    out <- dir_stat_c(path, units_mtime)
  }
  # A NULL result from the C path is the signal to fall back on base R.
  if (is.null(out)) {
    out <- dir_stat_r(path, units_mtime)
  }
  out$size <- out$size * get_factor_size(units_size)
  if (!is.null(recent)) {
    cutoff <- Sys.time() - recent
    # which() drops NA comparisons: a file with an unavailable mtime
    # would otherwise be returned as a row filled entirely with NA.
    keep <- which(.POSIXct(out$mtime) > cutoff)
    out <- out[keep, ]
  }
  out
}

# Describe the files in `path` using the package's compiled routine.
#
# Arguments:
#   path: directory path passed straight to the native routine.
#   units_mtime: "POSIXct" to convert the `mtime` element to a POSIXct
#     object; any other value leaves `mtime` as returned.
#
# Returns the native result coerced to a data frame, or NULL when the
# native routine returns NULL. The caller, dir_stat(), tests for NULL to
# decide whether to fall back on the base R implementation (presumably the
# native routine returns NULL on unsupported platforms; confirm in the C
# source).
dir_stat_c <- function(path, units_mtime) {
  out <- .Call(r_dir_stat, path, PACKAGE = "autometric")
  # Propagate NULL untouched: .POSIXct(NULL) is an error and
  # as.data.frame(NULL) is a 0 x 0 data frame, either of which would
  # defeat the caller's is.null() fallback check.
  if (is.null(out)) {
    return(NULL)
  }
  if (identical(units_mtime, "POSIXct")) {
    out$mtime <- .POSIXct(out$mtime)
  }
  as.data.frame(out)
}

# Base R implementation of the directory listing, built on file.info().
#
# Arguments:
#   path: directory whose top-level entries are described.
#   units_mtime: "numeric" to return `mtime` as a plain numeric vector;
#     any other value keeps the POSIXct vector from file.info().
#
# Returns a data frame with columns `path`, `size` (numeric) and `mtime`,
# one row per top-level entry that is not a directory.
dir_stat_r <- function(path, units_mtime) {
  entries <- list.files(path, full.names = TRUE)
  subdirs <- list.dirs(path, full.names = TRUE, recursive = FALSE)
  # Only non-directory entries are described.
  stats <- file.info(setdiff(entries, subdirs), extra_cols = FALSE)
  mtime <- stats$mtime
  if (identical(units_mtime, "numeric")) {
    mtime <- as.numeric(mtime)
  }
  data.frame(
    path = rownames(stats),
    size = as.numeric(stats$size),
    mtime = mtime
  )
}
1 change: 1 addition & 0 deletions R/log_active.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Check the log thread.
#' @export
#' @family log
#' @description Check if the log is running.
#' @return `TRUE` if a background thread is actively writing to the log,
#' `FALSE` otherwise. The result is based on a static C variable,
Expand Down
1 change: 1 addition & 0 deletions R/log_phase_get.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Get log phase
#' @export
#' @family phase
#' @description Get the current log phase.
#' @return Character string with the name of the current log phase.
#' @examples
Expand Down
1 change: 1 addition & 0 deletions R/log_phase_reset.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Reset log phase
#' @export
#' @family phase
#' @description Reset the current log phase to the default value.
#' @return `NULL` (invisibly). Called for its side effects.
#' @examples
Expand Down
1 change: 1 addition & 0 deletions R/log_phase_set.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Set log phase
#' @export
#' @family phase
#' @description Set the current log phase.
#' @return `NULL` (invisibly). Called for its side effects.
#' @param phase Character string with the phase of the log.
Expand Down
1 change: 1 addition & 0 deletions R/log_plot.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Plot a metric of a process over time
#' @export
#' @family log
#' @description Visualize a metric of a log over time for a single process ID
#' in a single log file.
#' @return A base plot of a metric of a log over time.
Expand Down
1 change: 1 addition & 0 deletions R/log_print.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Print once to the log.
#' @export
#' @family log
#' @description Sample CPU load metrics and
#' print a single line to the log for each process in `pids`.
#' Used for debugging and testing only. Not for users.
Expand Down
33 changes: 2 additions & 31 deletions R/log_read.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Read a log.
#' @export
#' @family log
#' @description Read a log file into R.
#' @details [log_read()] is capable of reading a log file where both
#' `autometric` and other processes have printed. Whenever `autometric`
Expand Down Expand Up @@ -131,7 +132,7 @@ log_read <- function(
out$name[is.na(out$name)] <- ""
out$status <- as.integer(out$status)
factor_cpu <- get_factor_cpu(units_cpu)
factor_memory <- get_factor_memory(units_memory)
factor_memory <- get_factor_size(units_memory)
factor_time <- get_factor_time(units_time)
for (field in c("core", "cpu")) {
out[[field]] <- as.numeric(out[[field]] * factor_cpu)
Expand Down Expand Up @@ -164,33 +165,3 @@ list_files <- function(path, hidden) {
character(0L)
}
}

# Multiplier that converts a duration in seconds into `units`.
# `units` is one of "seconds", "minutes", "hours", or "days".
get_factor_time <- function(units) {
  per_minute <- 60
  per_hour <- 60 * 60
  per_day <- 60 * 60 * 24
  switch(
    units,
    seconds = 1,
    minutes = 1 / per_minute,
    hours = 1 / per_hour,
    days = 1 / per_day
  )
}

# Multiplier that converts a CPU percentage into `units`.
# `units` is either "percentage" or "fraction".
get_factor_cpu <- function(units) {
  percent_per_whole <- 100
  switch(
    units,
    percentage = 1,
    fraction = 1 / percent_per_whole
  )
}

# Multiplier that converts a size in bytes into `units`.
# `units` is one of "bytes", "kilobytes", "megabytes", or "gigabytes".
# Every branch returns a double so the result type does not depend on
# the units requested.
get_factor_memory <- function(units) {
  switch(
    units,
    bytes = 1,
    kilobytes = 1e-3,
    megabytes = 1e-6,
    gigabytes = 1e-9
  )
}
1 change: 1 addition & 0 deletions R/log_start.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Start the log thread.
#' @export
#' @family log
#' @description Start a background thread that periodically writes
#' system usage metrics of the current R process to a log file.
#' See [log_read()] for explanations of the specific metrics.
Expand Down
1 change: 1 addition & 0 deletions R/log_stop.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Stop the log thread.
#' @export
#' @family log
#' @description Stop the background thread that periodically writes
#' system usage metrics of the current R process to a log file.
#' @details The background thread is detached, so there is no way to
Expand Down
1 change: 1 addition & 0 deletions R/log_support.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' @title Log support
#' @export
#' @family log
#' @description Check if your system supports background logging.
#' @details The background logging functionality requires a Linux, Mac,
#' or Windows computer. It also requires POSIX thread support
Expand Down
27 changes: 27 additions & 0 deletions R/utils_units.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Multiplier that converts a duration in seconds into `units`.
# `units` is one of "seconds", "minutes", "hours", or "days".
get_factor_time <- function(units) {
  seconds_in_minute <- 60
  seconds_in_hour <- 60 * 60
  seconds_in_day <- 60 * 60 * 24
  switch(
    units,
    seconds = 1,
    minutes = 1 / seconds_in_minute,
    hours = 1 / seconds_in_hour,
    days = 1 / seconds_in_day
  )
}

# Multiplier that converts a CPU percentage into `units`.
# `units` is either "percentage" or "fraction".
get_factor_cpu <- function(units) {
  percent_scale <- 100
  switch(
    units,
    percentage = 1,
    fraction = 1 / percent_scale
  )
}

# Multiplier that converts a size in bytes into `units`.
# `units` is one of "bytes", "kilobytes", "megabytes", or "gigabytes".
# Every branch returns a double so the result type does not depend on
# the units requested.
get_factor_size <- function(units) {
  switch(
    units,
    bytes = 1,
    kilobytes = 1e-3,
    megabytes = 1e-6,
    gigabytes = 1e-9
  )
}
125 changes: 125 additions & 0 deletions inst/tinytest/test-dir_stat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# The C and R implementations of dir_stat() must agree on a directory
# holding two regular files and one subdirectory, for both mtime formats.
local({
  if (tolower(Sys.info()["sysname"]) != "windows") {
    for (units_mtime in c("POSIXct", "numeric")) {
      path <- tempfile()
      dir.create(path)
      writeLines("line", file.path(path, "a"))
      writeLines("line2", file.path(path, "b"))
      # The subdirectory and the file inside it must not be listed.
      dir.create(file.path(path, "dir"))
      file.create(file.path(path, "dir", "x"))
      out_c <- dir_stat(
        path,
        method = "c",
        units_size = "bytes",
        units_mtime = units_mtime
      )
      out_r <- dir_stat(
        path,
        method = "r",
        units_size = "bytes",
        units_mtime = units_mtime
      )
      # Sort each result by its own paths so the rows line up.
      out_c <- out_c[order(out_c$path), ]
      out_r <- out_r[order(out_r$path), ]
      # Reordering keeps the original row names, so reset them: the two
      # implementations may list the files in different orders.
      rownames(out_c) <- NULL
      rownames(out_r) <- NULL
      out_c$path <- basename(out_c$path)
      out_r$path <- basename(out_r$path)
      out_c$size <- as.integer(out_c$size)
      out_r$size <- as.integer(out_r$size)
      expect_equal(out_c, out_r)
      expect_equal(as.numeric(out_c$mtime) - as.numeric(out_r$mtime), c(0, 0))
      expect_equal(
        as.character(.POSIXct(out_c$mtime)),
        as.character(.POSIXct(out_r$mtime))
      )
      expect_equal(out_c$size[out_c$path == "a"], 5L)
      expect_equal(out_c$size[out_c$path == "b"], 6L)
      expect_equal(sort(colnames(out_c)), sort(c("path", "size", "mtime")))
      expect_equal(nrow(out_c), 2L)
      unlink(path, recursive = TRUE)
    }
  }
})

# With a one-hour `recent` window, freshly written files are all kept.
local({
  on_windows <- tolower(Sys.info()["sysname"]) == "windows"
  if (!on_windows) {
    root <- tempfile()
    dir.create(root)
    writeLines("line", file.path(root, "a"))
    writeLines("line2", file.path(root, "b"))
    window <- as.difftime(1, units = "hours")
    stats <- dir_stat(
      root,
      units_size = "bytes",
      units_mtime = "numeric",
      recent = window,
      method = "c"
    )
    stats$path <- basename(stats$path)
    expect_equal(nrow(stats), 2L)
    is_a <- stats$path == "a"
    is_b <- stats$path == "b"
    expect_equal(stats$size[is_a], 5L)
    expect_equal(stats$size[is_b], 6L)
    expect_true(is.numeric(stats$mtime))
    unlink(root, recursive = TRUE)
  }
})

# An empty directory yields zero rows but still has all three columns.
local({
  on_windows <- tolower(Sys.info()["sysname"]) == "windows"
  if (!on_windows) {
    empty_dir <- tempfile()
    dir.create(empty_dir)
    window <- as.difftime(1, units = "hours")
    stats <- dir_stat(
      empty_dir,
      units_size = "bytes",
      units_mtime = "numeric",
      recent = window,
      method = "c"
    )
    expected_columns <- sort(c("path", "size", "mtime"))
    expect_equal(nrow(stats), 0L)
    expect_equal(sort(colnames(stats)), expected_columns)
    unlink(empty_dir, recursive = TRUE)
  }
})

# Symbolic links are followed: the reported size is that of the target,
# and it tracks changes made to the target.
local({
  if (tolower(Sys.info()["sysname"]) != "windows") {
    path <- tempfile()
    dir.create(path)
    link <- file.path(path, "link")
    target <- tempfile()
    writeLines("a", target)
    file.symlink(from = target, to = link)
    # Same query for either implementation; only `method` varies.
    stat_with <- function(method) {
      dir_stat(
        path,
        method = method,
        units_size = "bytes",
        units_mtime = "numeric",
        recent = as.difftime(1, units = "hours")
      )
    }
    out_c <- stat_with("c")
    out_r <- stat_with("r")
    expect_equal(out_c$size, 2L)
    expect_equal(out_r$size, 2L)
    # Grow the target and confirm both implementations see the new size.
    writeLines("abc", target)
    out_c <- stat_with("c")
    out_r <- stat_with("r")
    expect_equal(out_c$size, 4L)
    expect_equal(out_r$size, 4L)
    unlink(c(path, target), recursive = TRUE)
  }
})
Loading
Loading