R/EmpiricalFns.R

Defines functions resampleScoreModelPair estimateDifferenceZeroCrossing render.sigr_emptest listToDataFrame mkResampleDiffWorker resampleScoreModel mkResampleWorker permutationScoreModel render.sigr_permtest mkPermWorker

Documented in estimateDifferenceZeroCrossing permutationScoreModel render.sigr_emptest render.sigr_permtest resampleScoreModel resampleScoreModelPair

mkPermWorker <- function(modelValues,yValues,scoreFn) {
  force(yValues)
  force(modelValues)
  force(scoreFn)
  n <- length(modelValues)
  if(is.null(yValues)) {
    yValues <- logical(n)
  }
  function(x) {
    perm <- sample.int(n,n,replace=FALSE)
    score <- scoreFn(modelValues,yValues[perm])
    score
  }
}


#' Format an empirical test (quality of categorical prediction)
#'
#' @param statistic wrapped T-test
#' @param ... not used, force use of named binding for later arguments
#' @param format if set the format to return ("html", "latex", "markdown", "ascii")
#' @param statDigits integer number of digits to show in summary.
#' @param sigDigits integer number of digits to show in significances.
#' @param pLargeCutoff value to declare non-significance at or above.
#' @param pSmallCutoff smallest value to print
#' @return formatted string
#'
#'
#' @export
render.sigr_permtest <- function(statistic,
                                 ...,
                                 format,
                                 statDigits=4,
                                 sigDigits=4,
                                 pLargeCutoff=0.05,
                                 pSmallCutoff=1.0e-5) {
  wrapr::stop_if_dot_args(substitute(list(...)), "sigr::render.sigr_permtest")
  if (missing(format) || is.null(format)) {
    format <- getRenderingFormat()
  }
  if(!isTRUE(format %in% formats)) {
    format <- "ascii"
  }
  fsyms <- syms[format,]
  pString <- render(wrapSignificance(statistic$pValue,
                                     symbol='p'),
                    format=format,
                    sigDigits=sigDigits,
                    pLargeCutoff=pLargeCutoff,
                    pSmallCutoff=pSmallCutoff)
  formatStr <- paste0(fsyms['startB'],'Studentized permutation test',fsyms['endB'],
                      ': ',statistic$test,', ',
                      ' summary: ',pString)
  formatStr
}


#' Empirical permutation test of significance of scoreFn(modelValues,yValues) >= scoreFn(modelValues,perm(yValues)).
#'
#' Treat permutation re-samples as similar to bootstrap replications.
#'
#' @param modelValues numeric array of predictions.
#' @param yValues numeric/logical array of outcomes, dependent, or truth values
#' @param scoreFn function with signature scoreFn(modelValues,yValues) returning scalar numeric score.
#' @param ... not used, forces later arguments to be bound by name
#' @param na.rm logical, if TRUE remove NA values
#' @param returnScores logical if TRUE return detailed permutedScores
#' @param nRep integer number of repititions to perform
#' @param parallelCluster optional snow-style parallel cluster.
#' @return summaries
#'
#' @examples
#'
#' set.seed(25325)
#' y <- 1:5
#' m <- c(1,1,2,2,2)
#' cor.test(m,y,alternative='greater')
#' f <- function(modelValues,yValues) cor(modelValues,yValues)
#' permutationScoreModel(m,y,f)
#'
#' @export
#'
permutationScoreModel <- function(modelValues, yValues,
                                  scoreFn,
                                  ...,
                                  na.rm= FALSE,
                                  returnScores= FALSE,
                                  nRep= 100,
                                  parallelCluster= NULL) {
  wrapr::stop_if_dot_args(substitute(list(...)), "sigr::permutationScoreModel")
  if(!is.numeric(modelValues)) {
    stop("sigr::permutationScoreModel modelValues must be numeric")
  }
  if((!is.logical(yValues)) &&(!is.numeric(yValues))) {
    stop("sigr::permutationScoreModel yValues must be logical or numeric")
  }
  if(length(modelValues)!=length(yValues)) {
    stop("sigr::permutationScoreModel must have length(modelValues)==length(yValues)")
  }
  nNA <- sum(is.na(modelValues) | is.na(yValues))
  if(na.rm) {
    goodPosns <- (!is.na(modelValues)) & (!is.na(yValues))
    modelValues <- modelValues[goodPosns]
    yValues <- yValues[goodPosns]
  }
  dim <- length(yValues)
  observedScore <- scoreFn(modelValues,yValues)
  permWorker <- mkPermWorker(modelValues=modelValues,
                             yValues=yValues,
                             scoreFn=scoreFn)
  permutedScores <- plapply(1:nRep,permWorker,parallelCluster)
  permutedScores <- as.numeric(permutedScores)
  meanv <- mean(permutedScores)
  sdv <- sqrt(sum((permutedScores-meanv)^2)/(length(permutedScores)-1))
  z <- (observedScore-meanv)/sdv
  df = length(permutedScores)-1
  pValue <- stats::pt(z,df=df,lower.tail=FALSE)
  pFreq <- sum(permutedScores>=observedScore)/length(permutedScores)
  ret <- list(fnName='permutationScoreModel',
              test='is observed score greater than permuted score',
              observedScore=observedScore,
              df=df,
              z=z,
              mean=meanv,
              sd=sdv,
              pValue=pValue,
              pFreq=pFreq,
              nNA=nNA,
              n=dim)
  if(returnScores) {
    ret$permutedScores <- permutedScores
  }
  class(ret) <- c('sigr_permtest', 'sigr_statistic')
  ret
}




mkResampleWorker <- function(modelValues,
                              yValues,
                              scoreFn) {
  force(yValues)
  force(modelValues)
  force(scoreFn)
  n <- length(modelValues)
  if(is.null(yValues)) {
    yValues <- logical(n)
  }
  function(x) {
    samp <- sample.int(n,n,replace=TRUE)
    score <- scoreFn(modelValues[samp],yValues[samp])
    score
  }
}

#' Studentized bootstrap variance estimate for scoreFn(yValues,modelValues).
#'
#' @param modelValues numeric array of predictions (model to test).
#' @param yValues numeric/logical array of outcomes, dependent, or truth values
#' @param scoreFn function with signature scoreFn(modelValues,yValues) returning scalar numeric score.
#' @param ... not used, forces later arguments to be bound by name
#' @param na.rm logical, if TRUE remove NA values
#' @param returnScores logical if TRUE return detailed resampledScores
#' @param nRep integer number of repititions to perform
#' @param parallelCluster optional snow-style parallel cluster.
#' @return summaries
#'
#' @examples
#'
#' set.seed(25325)
#' y <- 1:5
#' m1 <- c(1,1,2,2,2)
#' cor.test(m1,y,alternative='greater')
#' f <- function(modelValues,yValues) {
#'  if((sd(modelValues)<=0)||(sd(yValues)<=0)) {
#'    return(0)
#'  }
#'  cor(modelValues,yValues)
#' }
#' s <- sigr::resampleScoreModel(m1,y,f)
#' print(s)
#' z <- (s$observedScore-0)/s$sd # should check size of z relative to bias!
#' pValue <- pt(z,df=length(y)-2,lower.tail=FALSE)
#' pValue
#'
#'
#' @export
#'
#
resampleScoreModel <- function(modelValues,
                               yValues,
                               scoreFn,
                               ...,
                               na.rm= FALSE,
                               returnScores= FALSE,
                               nRep= 100,
                               parallelCluster= NULL) {
  wrapr::stop_if_dot_args(substitute(list(...)), "sigr::resampleScoreModel")
  if(!is.numeric(modelValues)) {
    stop("sigr::resampleScoreModel modelValues must be numeric")
  }
  if((!is.logical(yValues)) &&(!is.numeric(yValues))) {
    stop("sigr::resampleScoreModel yValues must be logical or numeric")
  }
  if(length(modelValues)!=length(yValues)) {
    stop("sigr::resampleScoreModel must have length(modelValues)==length(yValues)")
  }
  nNA <- sum(is.na(modelValues) | is.na(yValues))
  if(na.rm) {
    goodPosns <- (!is.na(modelValues)) & (!is.na(yValues))
    modelValues <- modelValues[goodPosns]
    yValues <- yValues[goodPosns]
  }
  dim <- length(yValues)
  observedScore <- scoreFn(modelValues, yValues)
  resampleWorker <- mkResampleWorker(modelValues=modelValues,
                                          yValues=yValues,
                                          scoreFn=scoreFn)
  resampledScores <- plapply(1:nRep,resampleWorker,parallelCluster)
  resampledScores <- as.numeric(resampledScores)
  meanv <- mean(resampledScores)
  sdv <- sqrt(sum((resampledScores-meanv)^2)/(length(resampledScores)-1))
  ret <- list(fnName='resampleScoreModel',
              observedScore=observedScore,
              bias=meanv-observedScore,
              sd=sdv,
              nNA=nNA,
              n=dim)
  if(returnScores) {
    ret$resampledScores <- resampledScores
  }
  ret
}






mkResampleDiffWorker <- function(model1Values,
                                 model2Values,
                                 yValues,
                                 scoreFn,
                                 sameSample) {
  force(yValues)
  force(model1Values)
  force(model2Values)
  force(scoreFn)
  force(sameSample)
  n <- length(model1Values)
  if(is.null(yValues)) {
    yValues <- logical(n)
  }
  if(is.null(model2Values)) {
    model2Values <- numeric(n)
  }
  function(x) {
    samp1 <- sample.int(n,n,replace=TRUE)
    if(sameSample) {
      samp2 <- samp1
    } else {
      samp2 <- sample.int(n,n,replace=TRUE)
    }
    score1 <- scoreFn(model1Values[samp1],yValues[samp1])
    score2 <- scoreFn(model2Values[samp2],yValues[samp2])
    list(score1=score1,score2=score2,diff=score1-score2)
  }
}

# convert a non-empty list of single row data frames with the identical non-empty
# numeric column structure into a data frame (without bringing in dplyr/tidyr
# dependency, and faster than do.call(rbind,rows)).
listToDataFrame <- function(rows) {
  n <- length(rows)
  cols <- names(rows[[1]])
  d <- data.frame(x=numeric(n))
  colnames(d) <- cols[[1]]
  for(ci in cols) {
    d[[ci]] <- vapply(rows,function(ri) { ri[[ci]] },numeric(1))
  }
  d
}


#' Format an empirical test (quality of categorical prediction)
#'
#' @param statistic wrapped T-test
#' @param ... not used, force use of named binding for later arguments
#' @param format if set the format to return ("html", "latex", "markdown", "ascii")
#' @param statDigits integer number of digits to show in summaries.
#' @param sigDigits integer number of digits to show in significances.
#' @param pLargeCutoff value to declare non-significance at or above.
#' @param pSmallCutoff smallest value to print
#' @return formatted string
#'
#'
#' @export
render.sigr_emptest <- function(statistic,
                                ...,
                                format,
                                statDigits=4,
                                sigDigits=4,
                                pLargeCutoff=0.05,
                                pSmallCutoff=1.0e-5) {
  wrapr::stop_if_dot_args(substitute(list(...)), "sigr::render.sigr_emptest")
  if (missing(format) || is.null(format)) {
    format <- getRenderingFormat()
  }
  if(!isTRUE(format %in% formats)) {
    format <- "ascii"
  }
  fsyms <- syms[format,]
  pString <- render(wrapSignificance(statistic$eValue,
                                     symbol='e'),
                    format=format,
                    sigDigits=sigDigits,
                    pLargeCutoff=pLargeCutoff,
                    pSmallCutoff=pSmallCutoff)
  formatStr <- paste0(fsyms['startB'],'Studentized empirical test',fsyms['endB'],
                      ': ',statistic$test,', ',
                      ' summary: ',pString)
  formatStr
}


#' Studentized estimate of how often a difference is below zero.
#'
#' @param resampledDiffs numeric vector resampled observations
#' @param na.rm logical, if TRUE remove NA values
#' @return estimated probability of seeing a re-sampled difference below zero.
#'
#' @examples
#'
#' set.seed(2352)
#' resampledDiffs <- rnorm(10)+1
#' estimateDifferenceZeroCrossing(resampledDiffs)
#'
#' @export
#'
estimateDifferenceZeroCrossing <- function(resampledDiffs,
                                           na.rm= FALSE) {
  nNA <- sum(is.na(resampledDiffs))
  if(na.rm) {
    goodPosns <- !is.na(resampledDiffs)
    resampledDiffs <- resampledDiffs[goodPosns]
  }
  dim <- length(resampledDiffs)
  meanv <- mean(resampledDiffs)
  sdv <- sqrt(sum((resampledDiffs-meanv)^2)/(length(resampledDiffs)-1))
  z <- meanv/sdv
  df = length(resampledDiffs)-1
  eFreq <- sum(resampledDiffs<=0)/length(resampledDiffs)
  eValue <- stats::pt(z,df=df,lower.tail=FALSE)
  ret <- list(fnName='estimateDifferenceZeroCrossing',
              test="is difference greater than zero on re-samples",
              z=z,
              meanv=meanv,
              sd=sdv,
              eValue=eValue,
              eFreq=eFreq,
              nNA=nNA,
              n=dim)
  class(ret) <- c('sigr_emptest', 'sigr_statistic')
  ret
}

#' Studentized bootstrap test of strength of scoreFn(yValues,model1Values) > scoreFn(yValues,model1Values).
#'
#' Studentized bootstrap test of strength of scoreFn(yValues,model1Values) > scoreFn(yValues,model1Values)
#' sampled with replacement.
#'
#' True confidence intervals are harder to get right (see
#' "An Introduction to the Bootstrap", Bradely Efron,
#' and Robert J. Tibshirani, Chapman & Hall/CRC, 1993.),
#' but we will settle for simple p-value estimates.
#'
#' @param model1Values numeric array of predictions (model to test).
#' @param model2Values numeric array of predictions (reference model).
#' @param yValues numeric/logical array of outcomes, dependent, or truth values
#' @param scoreFn function with signature scoreFn(modelValues,yValues) returning scalar numeric score.
#' @param ... not used, forces later arguments to be bound by name.
#' @param na.rm logical, if TRUE remove NA values
#' @param returnScores logical if TRUE return detailed resampledScores.
#' @param nRep integer number of repititions to perform.
#' @param parallelCluster optional snow-style parallel cluster.
#' @param sameSample logical if TRUE use the same sample in computing both scores during bootstrap replication (else use independent samples).
#' @return summaries
#'
#' @examples
#'
#' set.seed(25325)
#' y <- 1:5
#' m1 <- c(1,1,2,2,2)
#' m2 <- c(1,1,1,1,2)
#' cor(m1,y)
#' cor(m2,y)
#' f <- function(modelValues,yValues) {
#'   if((sd(modelValues)<=0)||(sd(yValues)<=0)) {
#'     return(0)
#'   }
#'   cor(modelValues,yValues)
#' }
#' resampleScoreModelPair(m1,m2,y,f)
#'
#' @export
#'
#
resampleScoreModelPair <- function(model1Values,
                                   model2Values,
                                   yValues,
                                   scoreFn,
                                   ...,
                                   na.rm= FALSE,
                                   returnScores= FALSE,
                                   nRep= 100,
                                   parallelCluster= NULL,
                                   sameSample= FALSE) {
  wrapr::stop_if_dot_args(substitute(list(...)), "sigr::resampleScoreModelPair")
  if(!is.numeric(model1Values)) {
    stop("sigr::resampleScoreModelPair model1Values must be numeric")
  }
  if(!is.numeric(model2Values)) {
    stop("sigr::resampleScoreModelPair model2Values must be numeric")
  }
  if((!is.logical(yValues)) &&(!is.numeric(yValues))) {
    stop("sigr::resampleScoreModelPair yValues must be logical or numeric")
  }
  if(length(model1Values)!=length(yValues)) {
    stop("sigr::resampleScoreModelPair must have length(model1Values)==length(yValues)")
  }
  if(length(model2Values)!=length(yValues)) {
    stop("sigr::resampleScoreModelPair must have length(model2Values)==length(yValues)")
  }
  nNA <- sum(is.na(model1Values) | is.na(model2Values) | is.na(yValues))
  if(na.rm) {
    goodPosns <- (!is.na(model1Values)) & (!is.na(model2Values)) & (!is.na(yValues))
    model1Values <- model1Values[goodPosns]
    model2Values <- model2Values[goodPosns]
    yValues <- yValues[goodPosns]
  }
  dim <- length(yValues)
  observedScore1 <- scoreFn(model1Values,yValues)
  observedScore2 <- scoreFn(model2Values,yValues)
  resampleWorker <- mkResampleDiffWorker(model1Values=model1Values,
                                         model2Values=model2Values,
                                         yValues=yValues,
                                         scoreFn=scoreFn,
                                         sameSample=sameSample)
  detailedResampledScores <- plapply(1:nRep,resampleWorker,parallelCluster)
  observedDiff <- observedScore1-observedScore2
  detailedResampledScores <- listToDataFrame(detailedResampledScores)
  resampledDiffs <- detailedResampledScores$diff
  ret <- estimateDifferenceZeroCrossing(resampledDiffs, na.rm=FALSE)
  ret$nNA <- nNA
  ret$n <- dim
  ret$observedScore1 <- observedScore1
  ret$observedScore2 <- observedScore2
  ret$observedDiff <- observedDiff
  ret$sameSample <- sameSample
  if(returnScores) {
    ret$detailedResampledScores <- detailedResampledScores
  }
  ret
}

Try the sigr package in your browser

Any scripts or data that you put into this service are public.

sigr documentation built on Aug. 20, 2023, 9:06 a.m.