R/distStats.R

Defines functions distStats

Documented in distStats

#' @title Descriptive statistics on intergenic distance data
#'
#' @description Descriptive statistics on intergenic distance data
#'
#' @param GeneSetDistances A \code{tibble} with intergenic distances for the
#'                         different gene sets, as generated by the
#'                         \code{\link{dist2Neighbors}} function.
#' @param confLevel Confidence level between 0 and 1.
#' @param nboot Integer (Default to 1e4). Number of bootstrap samples.
#' @param CItype Type of confidence interval ("bca" or "perc").
#' @param ncores Number of cores to use for boostrap.
#'
#' @importFrom magrittr %>% %<>%
#' @importFrom rlang .data
#' @importFrom dplyr select group_by
#' @importFrom tidyr nest unnest
#' @importFrom purrr map
#' @importFrom stats quantile median sd
#' @importFrom boot boot boot.ci
#' @importFrom parallel detectCores
#'
#' @export
#'
#' @return A tibble with the following columns:
#' \itemize{
#'   \item GeneSet. Name of the gene set.
#'   \item Side. Upstream or Downstream.
#'   \item Orientation. Orientation of the neighbor (SameStrand or OppositeStrand).
#'   \item n. Number of observations.
#'   \item Min. Minimum distance.
#'   \item Q1. First quartile.
#'   \item Median.
#'   \item Median_LowerCI. Lower bound of the median bootstrap confidence interval.
#'   \item Median_UpperCI. Upper bound of the median bootstrap confidence interval.
#'   \item Mean.
#'   \item Mean_LowerCI. Lower bound of the mean bootstrap confidence interval.
#'   \item Mean_UpperCI. Upper bound of the mean bootstrap confidence interval.
#'   \item Q3. Third quartile.
#'   \item Max. Maximum distance.
#'   \item SD. Standard deviation.
#'   \item SEM. Standard Error of the Mean.
#' }
#'
#' @section NOTE:
#' When nboot is too small and CItype=="bca", the following error can occur:
#' "estimated adjustment 'a' is NA"
#' In this case, use CItype="perc" and/or increase nboot.
#' Using nboot >= 1e4 (the default) is always recommended.
#'
#' @seealso \code{\link[boot]{boot}},
#'          \code{\link[boot]{boot.ci}}
#'
#' @examples
#' ## Obtain gene neighborhood information:
#'   GeneNeighbors <- getGeneNeighborhood(Genegr)
#' ## Get a (random) set of (100) genes:
#'   set.seed(123)
#'   randGenes <- sample(names(Genegr), 100)
#' ## Extract distances for this set of genes and for all genes :
#'  myGeneSets <- list("RandomGenes" = randGenes,
#'                     "AllGenes" = GeneNeighbors$GeneName)
#'  distForGeneSets <- dist2Neighbors(GeneNeighbors,
#'                                    myGeneSets)
#' ## Obtain descriptive statistics for these sets of genes:
#' distStats(distForGeneSets,
#'           nboot = 1e3, CItype = "perc", ncores = 1)
#'
#' @author Pascal GP Martin
#'


distStats <- function(GeneSetDistances,
                      confLevel = 0.95,
                      nboot = 1e4,
                      CItype=c("bca", "perc"),
                      ncores = NULL) {

# Code for bootstrap with bca CI is adapted from:
# https://www.painblogr.org/2017-10-18-purrring-through-bootstraps.html

#-----------
# Check arguments
#-----------

## GeneSetDistances
if (!is.data.frame(GeneSetDistances)) {
    stop("GeneSetDistances should be a data frame")
}

if (!all(c("GeneName", "Neighbor", "Orientation",
           "Side", "Distance", "GeneSet") %in% colnames(GeneSetDistances))) {
    stop("GeneSet should have colums 'GeneName', 'Neighbor', 'Orientation', ",
         "'Side', 'Distance', and 'GeneSet'. See dist2Neighbors function.")
}

## SetCItype
CItype <- match.arg(CItype)

## Check confLevel
if (confLevel > 100 || confLevel <=0) {
    stop("conflevel should not be >100 or <=0")
}

if (confLevel >1) {
    confLevel = confLevel / 100
}

## Parameters for parallel computing of bootstrap samples
parallelType <- "multicore"
if (is.null(ncores)) {
    availCores <- parallel::detectCores() -1
    ncores <- ifelse(!is.na(availCores) && availCores > 0,
                     availCores,
                     1)
}

if (.Platform$OS.type=="windows") {
    ncores <- 1
}

if (ncores == 1) {parallelType <- "no"}

#-----------
# Get Bootstrap samples and confidence intervals for the mean and median
#-----------
### When nboot is too small and CItype=="bca", we can get the error:
###  "estimated adjustment 'a' is NA"
### In this case, either use CItype="perc" and/or increase nboot

## split/nest the data by GeneSet, Side and Orientation
distByGrp <- GeneSetDistances %>%
    dplyr::select(-.data$Neighbor) %>%
    dplyr::group_by(.data$GeneSet, .data$Side, .data$Orientation) %>%
    tidyr::nest()

##Functions on which we want boostrap confidence intervals
mystatfun <- function(genodist, indices) {
    c(mean(genodist[indices], na.rm = TRUE),
      median(genodist[indices], na.rm = TRUE))
}

##Add the boostrap samples to the data structure:
distByGrp %<>%
    dplyr::mutate(bootSamples =
                      purrr::map(.x = .data$data,
                                 ~boot::boot(data = .x$Distance,
                                             statistic = mystatfun,
                                             R = nboot,
                                             stype = "i",
                                             parallel = parallelType,
                                             ncpus = ncores)))

## Get confidence interval for the mean
distByGrp %<>%
    dplyr::mutate(bootCI_mean =
                      purrr::map(.x = .data$bootSamples,
                                 ~boot::boot.ci(.x,
                                                conf = confLevel,
                                                type = CItype,
                                                index = 1)))

## Get confidence interval for the median
distByGrp %<>%
    dplyr::mutate(bootCI_median =
                      purrr::map(.x = .data$bootSamples,
                                 ~boot::boot.ci(.x,
                                                conf = confLevel,
                                                type = CItype,
                                                index = 2)))

#-----------
# Assemble all stats in a single table
#-----------
## Get all statistics in a table
citype <- ifelse(CItype=="perc", "percent", "bca")

distByGrp <- distByGrp %>%
    dplyr::mutate(n = purrr::map(.x = .data$data,
                                 ~sum(!is.na(.x$Distance))),
                  Min = purrr::map(.x = .data$data,
                                   ~min(.x$Distance, na.rm = TRUE)),
                  Q1 = purrr::map(.x = .data$data,
                                  ~quantile(.x$Distance, 0.25, na.rm = TRUE)),
                  Median = purrr::map(.x = .data$bootCI_median,
                                      ~.x$t0),
                  Median_LowerCI = purrr::map(.x = .data$bootCI_median,
                                              ~.x[[citype]][[4]]),
                  Median_UpperCI = purrr::map(.x = .data$bootCI_median,
                                              ~.x[[citype]][[5]]),
                  Mean = purrr::map(.x = .data$bootCI_mean,
                                    ~.x$t0),
                  Mean_LowerCI = purrr::map(.x = .data$bootCI_mean,
                                            ~.x[[citype]][[4]]),
                  Mean_UpperCI = purrr::map(.x = .data$bootCI_mean,
                                            ~.x[[citype]][[5]]),
                  Q3 = purrr::map(.x = .data$data,
                                  ~quantile(.x$Distance, 0.75, na.rm = TRUE)),
                  Max = purrr::map(.x = .data$data,
                                   ~max(.x$Distance, na.rm = TRUE)),
                  SD = purrr::map(.x = .data$data,
                                  ~sd(.x$Distance, na.rm = TRUE)),
                  SEM = purrr::map(.x = .data$data,
                                   ~sd(.x$Distance, na.rm = TRUE)/
                                       sqrt(sum(!is.na(.x$Distance))))
    ) %>%
    dplyr::select(-.data$data, -.data$bootSamples,
                  -.data$bootCI_median, -.data$bootCI_mean) %>%
    tidyr::unnest(cols = c(.data$n, .data$Min, .data$Q1,
                           .data$Median, .data$Median_LowerCI,
                           .data$Median_UpperCI, .data$Mean,
                           .data$Mean_LowerCI, .data$Mean_UpperCI,
                           .data$Q3, .data$Max, .data$SD, .data$SEM))

return(distByGrp)

}
pgpmartin/GeneNeighborhood documentation built on Sept. 2, 2021, 6:37 a.m.