#' @title Descriptive statistics on intergenic distance data
#'
#' @description Descriptive statistics on intergenic distance data
#'
#' @param GeneSetDistances A \code{tibble} with intergenic distances for the
#' different gene sets, as generated by the
#' \code{\link{dist2Neighbors}} function.
#' @param confLevel Confidence level between 0 and 1.
#' @param nboot Integer (Default to 1e4). Number of bootstrap samples.
#' @param CItype Type of confidence interval ("bca" or "perc").
#' @param ncores Number of cores to use for boostrap.
#'
#' @importFrom magrittr %>% %<>%
#' @importFrom rlang .data
#' @importFrom dplyr select group_by
#' @importFrom tidyr nest unnest
#' @importFrom purrr map
#' @importFrom stats quantile median sd
#' @importFrom boot boot boot.ci
#' @importFrom parallel detectCores
#'
#' @export
#'
#' @return A tibble with the following columns:
#' \itemize{
#' \item GeneSet. Name of the gene set.
#' \item Side. Upstream or Downstream.
#' \item Orientation. Orientation of the neighbor (SameStrand or OppositeStrand).
#' \item n. Number of observations.
#' \item Min. Minimum distance.
#' \item Q1. First quartile.
#' \item Median.
#' \item Median_LowerCI. Lower bound of the median bootstrap confidence interval.
#' \item Median_UpperCI. Upper bound of the median bootstrap confidence interval.
#' \item Mean.
#' \item Mean_LowerCI. Lower bound of the mean bootstrap confidence interval.
#' \item Mean_UpperCI. Upper bound of the mean bootstrap confidence interval.
#' \item Q3. Third quartile.
#' \item Max. Maximum distance.
#' \item SD. Standard deviation.
#' \item SEM. Standard Error of the Mean.
#' }
#'
#' @section NOTE:
#' When nboot is too small and CItype=="bca", the following error can occur:
#' "estimated adjustment 'a' is NA"
#' In this case, use CItype="perc" and/or increase nboot.
#' Using nboot >= 1e4 (the default) is always recommended.
#'
#' @seealso \code{\link[boot]{boot}},
#' \code{\link[boot]{boot.ci}}
#'
#' @examples
#' ## Obtain gene neighborhood information:
#' GeneNeighbors <- getGeneNeighborhood(Genegr)
#' ## Get a (random) set of (100) genes:
#' set.seed(123)
#' randGenes <- sample(names(Genegr), 100)
#' ## Extract distances for this set of genes and for all genes :
#' myGeneSets <- list("RandomGenes" = randGenes,
#' "AllGenes" = GeneNeighbors$GeneName)
#' distForGeneSets <- dist2Neighbors(GeneNeighbors,
#' myGeneSets)
#' ## Obtain descriptive statistics for these sets of genes:
#' distStats(distForGeneSets,
#' nboot = 1e3, CItype = "perc", ncores = 1)
#'
#' @author Pascal GP Martin
#'
distStats <- function(GeneSetDistances,
confLevel = 0.95,
nboot = 1e4,
CItype=c("bca", "perc"),
ncores = NULL) {
# Code for bootstrap with bca CI is adapted from:
# https://www.painblogr.org/2017-10-18-purrring-through-bootstraps.html
#-----------
# Check arguments
#-----------
## GeneSetDistances
if (!is.data.frame(GeneSetDistances)) {
stop("GeneSetDistances should be a data frame")
}
if (!all(c("GeneName", "Neighbor", "Orientation",
"Side", "Distance", "GeneSet") %in% colnames(GeneSetDistances))) {
stop("GeneSet should have colums 'GeneName', 'Neighbor', 'Orientation', ",
"'Side', 'Distance', and 'GeneSet'. See dist2Neighbors function.")
}
## SetCItype
CItype <- match.arg(CItype)
## Check confLevel
if (confLevel > 100 || confLevel <=0) {
stop("conflevel should not be >100 or <=0")
}
if (confLevel >1) {
confLevel = confLevel / 100
}
## Parameters for parallel computing of bootstrap samples
parallelType <- "multicore"
if (is.null(ncores)) {
availCores <- parallel::detectCores() -1
ncores <- ifelse(!is.na(availCores) && availCores > 0,
availCores,
1)
}
if (.Platform$OS.type=="windows") {
ncores <- 1
}
if (ncores == 1) {parallelType <- "no"}
#-----------
# Get Bootstrap samples and confidence intervals for the mean and median
#-----------
### When nboot is too small and CItype=="bca", we can get the error:
### "estimated adjustment 'a' is NA"
### In this case, either use CItype="perc" and/or increase nboot
## split/nest the data by GeneSet, Side and Orientation
distByGrp <- GeneSetDistances %>%
dplyr::select(-.data$Neighbor) %>%
dplyr::group_by(.data$GeneSet, .data$Side, .data$Orientation) %>%
tidyr::nest()
##Functions on which we want boostrap confidence intervals
mystatfun <- function(genodist, indices) {
c(mean(genodist[indices], na.rm = TRUE),
median(genodist[indices], na.rm = TRUE))
}
##Add the boostrap samples to the data structure:
distByGrp %<>%
dplyr::mutate(bootSamples =
purrr::map(.x = .data$data,
~boot::boot(data = .x$Distance,
statistic = mystatfun,
R = nboot,
stype = "i",
parallel = parallelType,
ncpus = ncores)))
## Get confidence interval for the mean
distByGrp %<>%
dplyr::mutate(bootCI_mean =
purrr::map(.x = .data$bootSamples,
~boot::boot.ci(.x,
conf = confLevel,
type = CItype,
index = 1)))
## Get confidence interval for the median
distByGrp %<>%
dplyr::mutate(bootCI_median =
purrr::map(.x = .data$bootSamples,
~boot::boot.ci(.x,
conf = confLevel,
type = CItype,
index = 2)))
#-----------
# Assemble all stats in a single table
#-----------
## Get all statistics in a table
citype <- ifelse(CItype=="perc", "percent", "bca")
distByGrp <- distByGrp %>%
dplyr::mutate(n = purrr::map(.x = .data$data,
~sum(!is.na(.x$Distance))),
Min = purrr::map(.x = .data$data,
~min(.x$Distance, na.rm = TRUE)),
Q1 = purrr::map(.x = .data$data,
~quantile(.x$Distance, 0.25, na.rm = TRUE)),
Median = purrr::map(.x = .data$bootCI_median,
~.x$t0),
Median_LowerCI = purrr::map(.x = .data$bootCI_median,
~.x[[citype]][[4]]),
Median_UpperCI = purrr::map(.x = .data$bootCI_median,
~.x[[citype]][[5]]),
Mean = purrr::map(.x = .data$bootCI_mean,
~.x$t0),
Mean_LowerCI = purrr::map(.x = .data$bootCI_mean,
~.x[[citype]][[4]]),
Mean_UpperCI = purrr::map(.x = .data$bootCI_mean,
~.x[[citype]][[5]]),
Q3 = purrr::map(.x = .data$data,
~quantile(.x$Distance, 0.75, na.rm = TRUE)),
Max = purrr::map(.x = .data$data,
~max(.x$Distance, na.rm = TRUE)),
SD = purrr::map(.x = .data$data,
~sd(.x$Distance, na.rm = TRUE)),
SEM = purrr::map(.x = .data$data,
~sd(.x$Distance, na.rm = TRUE)/
sqrt(sum(!is.na(.x$Distance))))
) %>%
dplyr::select(-.data$data, -.data$bootSamples,
-.data$bootCI_median, -.data$bootCI_mean) %>%
tidyr::unnest(cols = c(.data$n, .data$Min, .data$Q1,
.data$Median, .data$Median_LowerCI,
.data$Median_UpperCI, .data$Mean,
.data$Mean_LowerCI, .data$Mean_UpperCI,
.data$Q3, .data$Max, .data$SD, .data$SEM))
return(distByGrp)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.