# memorize_missing
#' @name memorize_missing
#' @title Memorize missingness pattern and randomize attributes
#' @description Use this function to keep the pattern of missing data (0/1).
#' The pattern can be randomized based on dataset attributes/covariates.
#' This can be useful to generate missingness on simulated dataset with the
#' same number of individuals, populations and markers or to analyze the accuracy
#' of imputation algorithms. A vignette is under construction to leverage this
#' function.
#' @inheritParams radiator::tidy_genomic_data
#' @param strata (optional/required) Required for VCF and haplotypes files,
#' optional for the other formats supported.
#' See documentation of \code{\link[radiator]{tidy_genomic_data}} for more info.
#' Default: \code{strata = NULL}.
#' @param randomize (optional, string) To randomize the missingness of specific attributes.
#' Available options: \code{"markers", "populations", "individuals" and "overall"}.
#' All options can be selected in a string,
#' \code{randomize = c("markers", "populations", "individuals", "overall")}
#' Default: \code{randomize = NULL} will only keep the original missingness pattern.
#' @param filename (optional) The name of the file (extension not necessary)
#' written to the working directory and containing the missing info.
#' Default: \code{filename = NULL}, the missing info is in the global
#' environment only.
#'
#' \code{grur} takes advantage of the lightweight and speedy file reading/writing
#' package \code{fst} (\emph{Lightning Fast Serialization of Data Frames for R})
#' to write the dataframe to the working directory. This file can be used inside
#' \code{generate_missing} (coming soon) function.
# \code{\link[grur]{generate_missing}} function.
#' @return A tidy dataframe in the global environment with columns:
#' \code{POP_ID}, \code{INDIVIDUALS}, \code{MARKERS}, and in the subsequent
#' columns, the missingness info coded 0 for missing and 1 for genotyped.
#' Depending on the value chosen for the argument \code{randomize},
#' the columns are:
#' \itemize{
#' \item \code{MISSING_ORIGINAL}: for the original missing pattern (always present)
#' \item \code{MISSING_MARKERS_MIX}: for the missing pattern randomized by markers (optional)
#' \item \code{MISSING_POP_MIX}: for the missing pattern randomized by populations (optional)
#' \item \code{MISSING_INDIVIDUALS_MIX}: for the missing pattern randomized by individuals (optional)
#' \item \code{MISSING_OVERALL_MIX}: for the missing pattern randomized overall (optional)
#' }
#'
#' @examples
#' \dontrun{
#' missing.memory <- memorize_missing(
#' data = "batch_1.vcf",
#' strata = "population.map.strata.tsv",
#' randomize = "populations", filename = "missing.memory.panda"
#' )
#' }
#' @export
#' @rdname memorize_missing
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
memorize_missing <- function(
data,
strata = NULL,
randomize = NULL,
filename = NULL
) {
cat("#######################################################################\n")
cat("###################### grur: memorize_missing #########################\n")
cat("#######################################################################\n")
timing <- proc.time()
# manage missing arguments -----------------------------------------------------
if (missing(data)) stop("Input file missing")
# import data ----------------------------------------------------------------
missing.memories <- suppressMessages(
radiator::tidy_genomic_data(
data = data,
vcf.metadata = FALSE,
strata = strata,
monomorphic.out = FALSE,
common.markers = FALSE,
verbose = FALSE
))
if (!"MARKERS" %in% colnames(missing.memories) & "LOCUS" %in% colnames(missing.memories)) {
missing.memories <- dplyr::rename(.data = missing.memories, MARKERS = LOCUS)
}
# stats
message("Number of populations: ", dplyr::n_distinct(missing.memories$POP_ID))
message("Number of individuals: ", dplyr::n_distinct(missing.memories$INDIVIDUALS))
message("Number of markers: ", dplyr::n_distinct(missing.memories$MARKERS), "\n")
missing.memories <- dplyr::select(
.data = missing.memories, POP_ID, INDIVIDUALS, MARKERS, GT) %>%
dplyr::mutate(
MISSING_ORIGINAL = as.numeric(dplyr::if_else(GT == "000000", "0", "1"))
) %>%
dplyr::select(-GT) %>%
dplyr::arrange(POP_ID, INDIVIDUALS, MARKERS)
if (!is.null(randomize)) {
if ("populations" %in% randomize) {
message("Randomizing populations")
strata.df <- dplyr::distinct(missing.memories, POP_ID, INDIVIDUALS) %>%
dplyr::mutate(POP_ID_PERM = sample(x = .$POP_ID, size = nrow(.), replace = FALSE))
missing.memories$MISSING_POP_MIX <- missing.memories %>%
dplyr::left_join(strata.df, by = c("INDIVIDUALS", "POP_ID")) %>%
dplyr::arrange(POP_ID_PERM, INDIVIDUALS, MARKERS) %>%
dplyr::select(MISSING_ORIGINAL) %>% purrr::flatten_dbl(.)
strata.df <- NULL
}
if ("markers" %in% randomize) {
message("Randomizing markers")
markers.perm <- dplyr::distinct(missing.memories, MARKERS) %>%
dplyr::mutate(MARKERS_PERM = sample(x = .$MARKERS, size = nrow(.), replace = FALSE))
missing.memories$MISSING_MARKERS_MIX <- missing.memories %>%
dplyr::left_join(markers.perm, by = "MARKERS") %>%
dplyr::arrange(POP_ID, INDIVIDUALS, MARKERS_PERM) %>%
dplyr::select(MISSING_ORIGINAL) %>% purrr::flatten_dbl(.)
markers.perm <- NULL
}
if ("individuals" %in% randomize) {
message("Randomizing individuals")
individuals.perm <- dplyr::distinct(missing.memories, INDIVIDUALS) %>%
dplyr::mutate(INDIVIDUALS_PERM = sample(x = .$INDIVIDUALS, size = nrow(.), replace = FALSE))
missing.memories$MISSING_INDIVIDUALS_MIX <- missing.memories %>%
dplyr::left_join(individuals.perm, by = "INDIVIDUALS") %>%
dplyr::arrange(POP_ID, INDIVIDUALS_PERM, MARKERS) %>%
dplyr::select(MISSING_ORIGINAL) %>% purrr::flatten_dbl(.)
individuals.perm <- NULL
}
if ("overall" %in% randomize) {
message("Randomizing the overall pattern of missingness")
missing.memories$MISSING_OVERALL_MIX <- missing.memories %>%
dplyr::mutate(
MISSING_MIX = sample(x = .$MISSING_ORIGINAL,
size = nrow(.), replace = FALSE)
) %>%
dplyr::arrange(POP_ID, INDIVIDUALS, MARKERS) %>%
dplyr::select(MISSING_MIX) %>% purrr::flatten_dbl(.)
}
}
if (!is.null(filename)) {
message("\nWriting file to working directory")
message("use:\n??fst::read.st \nfor documentation on how to import dataframe
of missing patterns back in R")
#fst::write.fst(x = missing.memories, path = filename)
}
# Results --------------------------------------------------------------------
timing <- proc.time() - timing
message("\nComputation time: ", round(timing[[3]]), " sec")
cat("############################## completed ##############################\n")
return(missing.memories)
}#End memorize_missing
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.