R/dredge_correlation.r

#' @title dredge_correlation
#'
#' @description data dredge a \code{\link[base]{data.frame}} using \code{\link[stats]{cor.test}}
#'
#' @export
#'
#' @param data a \code{\link[base]{data.frame}} to operate over
#' @param grid a search grid in the form of \code{\link[mndredge]{make_correlation_grid}} to use in guiding the dredging
#' @param min_pvalue the maximum the pvalue from \code{\link[stats]{lm}} can be before the result is filtered out. this generaly is an issue only in overfitted models or synthetic data
#' @param max_pvalue the maximum the pvalue from \code{\link[stats]{lm}} can be before the result is filtered out
#' @param ... addtional parameters to pass to \code{\link[stats]{cor.test}}
#' @return a tidy \code{\link[base]{data.frame}} containing the following components:
#' \describe{
#'   \item{column_a}{the first column to compare}
#'   \item{column_b}{the second column to compare}
#'   \item{estimate}{the estimate from \code{\link[stats]{cor.test}}}
#'   \item{p.value}{the p.value from \code{\link[stats]{cor.test}}}
#'   \item{lag}{the amount of lag for the current compare}
#' }
#'
#' @author Mark Newman, \email{mark@trinetteandmark.com}
#' @keywords utilities
#' @family dredging
#'
#' @examples
#'   \dontshow{
#'     library(magrittr)
#'     library(doParallel)
#'     library(mndredge) }
#'   set.seed(0)
#'   y1 <- seq(-10, 10, .5)
#'   y2 <-
#'     y1 + y1 %>%
#'     length() %>%
#'     rnorm()
#'   registerDoParallel(cores=2)
#'   data <- data.frame(y1, y2)
#'   grid <- data %>% make_correlation_grid()
#'   data %>% dredge_correlation(grid)
#'
dredge_correlation <-
  function(
    data,
    grid,
    min_pvalue = getOption("mndredge_min_pvalue", 0),
    max_pvalue = getOption("mndredge_max_pvalue", 0.05),
    ...) {
    
    # quite incorrect `devtools::check()` note
    i <- NULL
    
    stopifnot(
      data %>% is.data.frame(),
      data %>% nrow() >= 3,
      grid %>% is.data.frame(),
      min_pvalue %>% is.numeric(),
      min_pvalue >= 0,
      min_pvalue < 1,
      max_pvalue %>% is.numeric(),
      max_pvalue >= 0,
      max_pvalue < 1,
      min_pvalue < max_pvalue)
	  
	tal <- apply_lag

    grid_res <-
      foreach(
        i = iter(grid, by = "row"),
        .packages = c("magrittr")) %dopar% {
      
          t1 <-
            data[, c(i$pair_1, i$pair_2)] %>%
            tal(
              hold = i$pair_1,
              move = i$pair_2,
              lag = i$lag)
          ct <- suppressWarnings(cor.test(t1[,i$pair_1], t1[,i$pair_2], ...))
      
          exclude <-
            ct$p.value %>% is.na() ||
            ct$p.value < min_pvalue ||
            max_pvalue < ct$p.value
          if(exclude) {
            NULL }
          else {
            names(ct$estimate) <- NULL
            c(i$pair_1, i$pair_2, ct$estimate, ct$p.value, i$lag)
          }}
  
    res_df <-
      do.call(rbind, grid_res) %>%
      data.frame(stringsAsFactors = F)
  
    if((res_df %>% nrow()) == 0) {
      data.frame(
        column_a = character(),
        column_b = character(),
        estimate = numeric(),
        p.value = numeric(),
        lag = numeric())
      }
    else {
      res_df %>%
        make_column_numeric(3) %>%
        make_column_numeric(4) %>%
        make_column_numeric(5) %>%
        set_colnames(c("column_a", "column_b", "estimate", "p.value", "lag")) %>%
        set_rownames(NULL)
    }}
markanewman/mndredge documentation built on May 9, 2019, 5:52 a.m.