diff --git a/NAMESPACE b/NAMESPACE index 7c44bb9..5a26e13 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,7 +31,10 @@ export(relatedness) export(repairSex) export(resample) export(simulatePedigree) -export(summarizePedigree) +export(summarizeFamilies) +export(summarizeMatrilines) +export(summarizePatrilines) +export(summarizePedigrees) export(vech) import(data.table) import(kinship2) diff --git a/R/summarizePedigree.R b/R/summarizePedigree.R index caf665a..92bd89b 100644 --- a/R/summarizePedigree.R +++ b/R/summarizePedigree.R @@ -2,45 +2,43 @@ #' #' This function summarizes pedigree data, including calculating summary statistics for all numeric variables, #' and finding the originating member for each family, maternal, and paternal line. -#' -#' @param pedigree_data A data frame containing the pedigree data. -#' @param personID A character string indicating the column name for the person ID variable. -#' @param momID A character string indicating the column name for the mother ID variable. -#' @param dadID A character string indicating the column name for the father ID variable. +#' @inheritParams ped2fam +#' @inheritParams ped2maternal +#' @inheritParams ped2paternal +#' @param nbiggest The number of biggest lines to return. +#' @param noldest The number of oldest lines to return. +#' @param byr The column name for birth year. +#' @param type The type of summary statistics to calculate. Options are "fathers", "mothers", and "families". #' @returns A list containing summary statistics for family, maternal, and paternal lines, as well as the 5 oldest and biggest lines. 
#' @import data.table #' @export -summarizePedigree <- function(ped, famID = "famID", personID = "ID", +summarizePedigrees <- function(ped, famID = "famID", personID = "ID", momID = "momID", dadID = "dadID", matID = "matID", patID = "patID", - yrb = NULL, + byr = NULL, type = c("fathers", "mothers", "families"), nbiggest = 5, noldest = 5) { - # checks - if(!requireNamespace("data.table", quietly = TRUE)) { - stop("The 'data.table' package is required for this function. Please install it and try again.") - } if(personID %in% c(famID, momID, dadID, matID)) { stop("personID cannot be the same as any of the other ID variables.") } if(!all(c(personID, momID, dadID) %in% names(ped))) { stop("personID, momID, and dadID must be columns in the pedigree data.") } - if(!is.null(yrb) && !yrb %in% names(ped)) { - stop("yrb must be a column in the pedigree data.") + if(!is.null(byr) && !byr %in% names(ped)) { + stop("byr must be a column in the pedigree data.") } # Convert to data.table ped_dt <- data.table::as.data.table(ped) # Build the pedigree using the provided functions - if(!famID %in% names(ped_dt)) { - ped_dt <- BGmisc::ped2fam(ped_dt, personID = personID, momID = momID, dadID = dadID, famID = famID) + if(!famID %in% names(ped_dt) & "families" %in% type ) { + ped_dt <- ped2fam(ped_dt, personID = personID, momID = momID, dadID = dadID, famID = famID) } - if(!matID %in% names(ped_dt)) { - ped_dt <- BGmisc::ped2maternal(ped_dt, personID = personID, momID = momID, dadID = dadID, matID = matID) + if(!matID %in% names(ped_dt) & "mothers" %in% type ) { + ped_dt <- ped2maternal(ped_dt, personID = personID, momID = momID, dadID = dadID, matID = matID) } - if(!patID %in% names(ped_dt)) { - ped_dt <- BGmisc::ped2paternal(ped_dt, personID = personID, momID = momID, dadID = dadID, patID = patID) + if(!patID %in% names(ped_dt) & "fathers" %in% type) { + ped_dt <- ped2paternal(ped_dt, personID = personID, momID = momID, dadID = dadID, patID = patID) } # Function to calculate summary 
statistics for all numeric variables @@ -49,11 +47,11 @@ summarizePedigree <- function(ped, famID = "famID", personID = "ID", summary_stats <- data[, lapply(.SD, function(x) { list( count = .N, - mean = mean(x, na.rm = TRUE), - median = median(x, na.rm = TRUE), - min = min(x, na.rm = TRUE), - max = max(x, na.rm = TRUE), - sd = sd(x, na.rm = TRUE) + mean = base::mean(x, na.rm = TRUE), + median = stats::median(x, na.rm = TRUE), + min = base::min(x, na.rm = TRUE), + max = base::max(x, na.rm = TRUE), + sd = stats::sd(x, na.rm = TRUE) ) }), by = group_var, .SDcols = numeric_cols] @@ -67,55 +65,125 @@ summarizePedigree <- function(ped, famID = "famID", personID = "ID", data[order(byr), .SD[1], by = group_var] } + # Initialize output list + output <- list() + # Calculate summary statistics for families, maternal lines, and paternal lines - family_summary_dt <- calculate_summary_dt(ped_dt, famID) - maternal_summary_dt <- calculate_summary_dt(ped_dt, matID) - paternal_summary_dt <- calculate_summary_dt(ped_dt, patID) - - # Find the originating member for each line - originating_member_family <- find_originating_member(ped_dt, famID) - originating_member_maternal <- find_originating_member(ped_dt, matID) - originating_member_paternal <- find_originating_member(ped_dt, patID) - - # Merge summary statistics with originating members for additional information - family_summary_dt <- merge(family_summary_dt, originating_member_family, by = famID, suffixes = c("", "_founder")) - maternal_summary_dt <- merge(maternal_summary_dt, originating_member_maternal, by = matID, suffixes = c("", "_founder")) - paternal_summary_dt <- merge(paternal_summary_dt, originating_member_paternal, by = patID, suffixes = c("", "_founder")) - - if(!is.null(byr)){ - # Identify the 5 oldest lines - oldest_families <- family_summary_dt[order(byr)][1:noldest] - oldest_maternal <- maternal_summary_dt[order(byr)][1:noldest] - oldest_paternal <- paternal_summary_dt[order(byr)][1:noldest] + + + + if("families" 
%in% type) { + family_summary_dt <- calculate_summary_dt(ped_dt, famID) + # Find the originating member for each line + originating_member_family <- find_originating_member(ped_dt, famID) + # Merge summary statistics with originating members for additional information + family_summary_dt <- merge(family_summary_dt, originating_member_family, + by = famID, suffixes = c("", "_founder")) + output$family_summary <- family_summary_dt } - # Identify the 5 biggest lines - biggest_families <- family_summary_dt[order(-count)][1:nbiggest] - biggest_maternal <- maternal_summary_dt[order(-count)][1:nbiggest] - biggest_paternal <- paternal_summary_dt[order(-count)][1:nbiggest] - - # Output the results as a list - if(!is.null(byr)){ - output <- list( - family_summary = family_summary_dt, - maternal_summary = maternal_summary_dt, - paternal_summary = paternal_summary_dt, - oldest_families = oldest_families, - oldest_maternal = oldest_maternal, - oldest_paternal = oldest_paternal, - biggest_families = biggest_families, - biggest_maternal = biggest_maternal, - biggest_paternal = biggest_paternal - ) - } else { - output <- list( - family_summary = family_summary_dt, - maternal_summary = maternal_summary_dt, - paternal_summary = paternal_summary_dt, - biggest_families = biggest_families, - biggest_maternal = biggest_maternal, - biggest_paternal = biggest_paternal - ) + if("mothers" %in% type) { + maternal_summary_dt <- calculate_summary_dt(ped_dt, matID) + originating_member_maternal <- find_originating_member(ped_dt, matID) + maternal_summary_dt <- merge(maternal_summary_dt, originating_member_maternal, by = matID, suffixes = c("", "_founder")) + output$maternal_summary <- maternal_summary_dt + } + if("fathers" %in% type) { + paternal_summary_dt <- calculate_summary_dt(ped_dt, patID) + originating_member_paternal <- find_originating_member(ped_dt, patID) + paternal_summary_dt <- merge(paternal_summary_dt, originating_member_paternal, by = patID, suffixes = c("", "_founder")) + 
output$paternal_summary <- paternal_summary_dt + } + + # Optionally find the superlative lines + + ## oldest + if(!is.null(byr) && noldest > 0 & noldest <= nrow(ped_dt)) { + if("families" %in% type) { + output$oldest_families <- family_summary_dt[order(get(byr))][1:noldest] + } + if("mothers" %in% type) { + output$oldest_maternal <- maternal_summary_dt[order(get(byr))][1:noldest] + } + if("fathers" %in% type) { + output$oldest_paternal <- paternal_summary_dt[order(get(byr))][1:noldest] + } + } + + # biggest lines + if(!is.null(nbiggest) & nbiggest > 0 & nbiggest <= nrow(ped_dt)) { + if("families" %in% type) { + output$biggest_families <- family_summary_dt[order(-get("count"))][1:nbiggest] + } + if("mothers" %in% type) { + output$biggest_maternal <- maternal_summary_dt[order(-get("count"))][1:nbiggest] + } + if("fathers" %in% type) { + output$biggest_paternal <- paternal_summary_dt[order(-get("count"))][1:nbiggest] + } } + return(output) } +#' Summarize the maternal lines in a pedigree +#' @inheritParams summarizePedigrees +#' @seealso [summarizePedigrees ()] +#' @export +#' +summarizeMatrilines <- function(ped, famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", + matID = "matID", patID = "patID", + byr = NULL, + nbiggest = 5, noldest = 5) { + # Call to wrapper function + summarizePedigrees(ped = ped, + personID = personID, + nbiggest = nbiggest, + noldest = noldest, + byr=byr, + momID = momID, dadID = dadID, + famID = famID, matID=matID, patID=patID, + type = "mothers") +} + +#' Summarize the paternal lines in a pedigree +#' @inheritParams summarizePedigrees +#' @seealso [summarizePedigrees ()] +#' @export +#' +summarizePatrilines <- function(ped, famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", + matID = "matID", patID = "patID", + byr = NULL, + nbiggest = 5, noldest = 5) { + # Call to wrapper function + summarizePedigrees(ped = ped, + personID = personID, + nbiggest = nbiggest, + noldest = noldest, + byr=byr, + momID = 
momID, dadID = dadID, + famID = famID, matID=matID, patID = patID, + type = "fathers") +} + +#' Summarize the families in a pedigree +#' @inheritParams summarizePedigrees +#' @seealso [summarizePedigrees ()] +#' @export + +summarizeFamilies <- function(ped, famID = "famID", personID = "ID", + momID = "momID", dadID = "dadID", + matID = "matID", patID = "patID", + byr = NULL, + nbiggest = 5, noldest = 5) { + # Call to wrapper function + summarizePedigrees(ped = ped, + personID = personID, + nbiggest = nbiggest, + noldest = noldest, + byr=byr, + momID = momID, dadID = dadID, + famID = famID, matID=matID, patID = patID, + type = "families") +} diff --git a/man/summarizeFamilies.Rd b/man/summarizeFamilies.Rd new file mode 100644 index 0000000..2fc239c --- /dev/null +++ b/man/summarizeFamilies.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{summarizeFamilies} +\alias{summarizeFamilies} +\title{Summarize the families in a pedigree} +\usage{ +summarizeFamilies( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + nbiggest = 5, + noldest = 5 +) +} +\arguments{ +\item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} + +\item{famID}{character. Name of the column to be created in ped for the family ID variable} + +\item{personID}{character. Name of the column in ped for the person ID variable} + +\item{momID}{character. Name of the column in ped for the mother ID variable} + +\item{dadID}{character. Name of the column in ped for the father ID variable} + +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} + +\item{patID}{Character. 
Paternal line ID variable to be created and added to the pedigree} + +\item{byr}{The column name for birth year.} + +\item{nbiggest}{The number of biggest lines to return.} + +\item{noldest}{The number of oldest lines to return.} +} +\description{ +Summarize the families in a pedigree +} +\seealso{ +\code{\link{summarizePedigrees}} +} diff --git a/man/summarizeMatrilines.Rd b/man/summarizeMatrilines.Rd new file mode 100644 index 0000000..5cb37d1 --- /dev/null +++ b/man/summarizeMatrilines.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{summarizeMatrilines} +\alias{summarizeMatrilines} +\title{Summarize the maternal lines in a pedigree} +\usage{ +summarizeMatrilines( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + nbiggest = 5, + noldest = 5 +) +} +\arguments{ +\item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} + +\item{famID}{character. Name of the column to be created in ped for the family ID variable} + +\item{personID}{character. Name of the column in ped for the person ID variable} + +\item{momID}{character. Name of the column in ped for the mother ID variable} + +\item{dadID}{character. Name of the column in ped for the father ID variable} + +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} + +\item{patID}{Character. 
Paternal line ID variable to be created and added to the pedigree} + +\item{byr}{The column name for birth year.} + +\item{nbiggest}{The number of biggest lines to return.} + +\item{noldest}{The number of oldest lines to return.} +} +\description{ +Summarize the maternal lines in a pedigree +} +\seealso{ +\code{\link{summarizePedigrees}} +} diff --git a/man/summarizePatrilines.Rd b/man/summarizePatrilines.Rd new file mode 100644 index 0000000..3bfc5f0 --- /dev/null +++ b/man/summarizePatrilines.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{summarizePatrilines} +\alias{summarizePatrilines} +\title{Summarize the paternal lines in a pedigree} +\usage{ +summarizePatrilines( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + nbiggest = 5, + noldest = 5 +) +} +\arguments{ +\item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} + +\item{famID}{character. Name of the column to be created in ped for the family ID variable} + +\item{personID}{character. Name of the column in ped for the person ID variable} + +\item{momID}{character. Name of the column in ped for the mother ID variable} + +\item{dadID}{character. Name of the column in ped for the father ID variable} + +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} + +\item{patID}{Character. 
Paternal line ID variable to be created and added to the pedigree} + +\item{byr}{The column name for birth year.} + +\item{nbiggest}{The number of biggest lines to return.} + +\item{noldest}{The number of oldest lines to return.} +} +\description{ +Summarize the paternal lines in a pedigree +} +\seealso{ +\code{\link{summarizePedigrees}} +} diff --git a/man/summarizePedigree.Rd b/man/summarizePedigree.Rd deleted file mode 100644 index f424e65..0000000 --- a/man/summarizePedigree.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarizePedigree.R -\name{summarizePedigree} -\alias{summarizePedigree} -\title{Summarize Pedigree Data} -\usage{ -summarizePedigree( - ped, - famID = "famID", - personID = "ID", - momID = "momID", - dadID = "dadID", - matID = "matID", - patID = "patID", - yrb = NULL, - nbiggest = 5, - noldest = 5 -) -} -\arguments{ -\item{personID}{A character string indicating the column name for the person ID variable.} - -\item{momID}{A character string indicating the column name for the mother ID variable.} - -\item{dadID}{A character string indicating the column name for the father ID variable.} - -\item{pedigree_data}{A data frame containing the pedigree data.} -} -\value{ -A list containing summary statistics for family, maternal, and paternal lines, as well as the 5 oldest and biggest lines. -} -\description{ -This function summarizes pedigree data, including calculating summary statistics for all numeric variables, -and finding the originating member for each family, maternal, and paternal line. 
-} diff --git a/man/summarizePedigrees.Rd b/man/summarizePedigrees.Rd new file mode 100644 index 0000000..b7c488f --- /dev/null +++ b/man/summarizePedigrees.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarizePedigree.R +\name{summarizePedigrees} +\alias{summarizePedigrees} +\title{Summarize Pedigree Data} +\usage{ +summarizePedigrees( + ped, + famID = "famID", + personID = "ID", + momID = "momID", + dadID = "dadID", + matID = "matID", + patID = "patID", + byr = NULL, + type = c("fathers", "mothers", "families"), + nbiggest = 5, + noldest = 5 +) +} +\arguments{ +\item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} + +\item{famID}{character. Name of the column to be created in ped for the family ID variable} + +\item{personID}{character. Name of the column in ped for the person ID variable} + +\item{momID}{character. Name of the column in ped for the mother ID variable} + +\item{dadID}{character. Name of the column in ped for the father ID variable} + +\item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} + +\item{patID}{Character. Paternal line ID variable to be created and added to the pedigree} + +\item{byr}{The column name for birth year.} + +\item{type}{The kinds of lines to summarize. Any combination of "fathers", "mothers", and "families"; all three are summarized by default.} + +\item{nbiggest}{The number of biggest lines to return.} + +\item{noldest}{The number of oldest lines to return.} +} +\value{ +A list containing summary statistics for each requested line type (families, maternal lines, paternal lines), together with the \code{nbiggest} biggest lines and, when \code{byr} is supplied, the \code{noldest} oldest lines. +} +\description{ +This function summarizes pedigree data, including calculating summary statistics for all numeric variables, +and finding the originating member for each family, maternal, and paternal line. +}