Skip to content

Commit

Permalink
improve assign_round_number so that it works on distinct dates
Browse files Browse the repository at this point in the history
  • Loading branch information
truenomad committed Oct 16, 2024
1 parent 9e45b2f commit 9bd07f3
Showing 1 changed file with 37 additions and 37 deletions.
74 changes: 37 additions & 37 deletions R/assign_round_number.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,60 +36,60 @@
#' print(result)
#'
#' @export
assign_round_number <- function(df,
date_col = "today",
additional_grouping = NULL,
# Define the function to assign round numbers based on proximity
assign_round_number <- function(df,
date_col = "today",
additional_grouping = NULL,
threshold = 7) {
# Ensure dates are of Date type
df[[date_col]] <- as.Date(df[[date_col]])

# Check if there are enough dates to perform clustering
if (nrow(df) < 2) {
return(rep(NA, nrow(df))) # Return NA if insufficient data
# Get distinct dates for clustering
distinct_dates <- df |>
dplyr::distinct(!!rlang::sym(date_col)) |>
dplyr::arrange(!!rlang::sym(date_col))

# Check if there are enough distinct dates to perform clustering
if (nrow(distinct_dates) < 2) {
df$round_group <- NA # Return NA if insufficient data
return(df)
}

# Create a distance matrix
distance_matrix <- as.matrix(dist(df[[date_col]]))
distance_matrix <- as.matrix(dist(distinct_dates[[date_col]]))

# Perform hierarchical clustering
hc <- hclust(as.dist(distance_matrix))

# Cut the dendrogram to form groups
clusters <- cutree(hc, h = threshold) # Cut tree at specified height
clusters <- cutree(hc, h = threshold) # Cut tree at specified height

# Add clusters to the data frame
df$ClusterID <- clusters
# Create a round group label for the distinct dates
distinct_dates$ClusterID <- clusters
round_labels <- unique(format(distinct_dates[[date_col]], "%B %Y"))

# Create a round group label for the entire dataset
round_labels <- unique(format(df[[date_col]], "%B %Y"))
# Create round group labels based on clusters
distinct_dates <- distinct_dates |>
dplyr::group_by(ClusterID) |>
dplyr::mutate(
round_group = format(dplyr::first(.data[[date_col]]), "%B %Y")
) |>
dplyr::ungroup()

# Group by ClusterID and any additional grouping columns
if (!is.null(additional_grouping)) {
df_grouped <- df |>
dplyr::group_by(ClusterID, !!!rlang::syms(additional_grouping)) |>
dplyr::mutate(
# Label each group with the month and year of the first date
# in the group
round_group = format(dplyr::first(.data[[date_col]]), "%B %Y")) |>
dplyr::ungroup()
} else {
df_grouped <- df |>
dplyr::group_by(ClusterID) |>
dplyr::mutate(
# Label each group with the month and year of the first date
# in the group
round_group = format(dplyr::first(.data[[date_col]]), "%B %Y")
) |> dplyr::ungroup()
}
df <- df |>
dplyr::left_join(
distinct_dates |>
dplyr::select(!!rlang::sym(date_col), ClusterID, round_group),
by = date_col
)

df_grouped <- df_grouped |>
df <- df |>
dplyr::mutate(
# Convert round_group to a factor with ordered levels
# based on unique dates
round_group = factor(round_group,
round_group = factor(round_group,
levels = round_labels,
ordered = TRUE)) |>
dplyr::select(-ClusterID)
ordered = TRUE
)
)

return(df_grouped)
return(df)
}

0 comments on commit 9bd07f3

Please sign in to comment.