R/santaR_pvalue_fit.R
In santaR: Short Asynchronous Time-Series Analysis

Documented in santaR_pvalue_fit santaR_pvalue_fit_within

#' Evaluate difference in group trajectories based on the comparison of model fit (F-test) between one and two groups
#'
#' Evaluate the difference in group trajectories by executing a t-test based on the comparison of improvement in model fit \emph{(F-test)} between fitting one group mean curve to all individuals and fitting two group mean curves. This between-class differential evolution test, evaluates whether fitting 2 group curves decreases the residuals compared to a single group mean curve. The statistic employed is defined as a quantification of evidence for differential evolution, with the larger the statistic the more differentially evolving the variable appears to be. Individual group membership is repeatedly randomly permuted to generate new random groups and group mean curves, then employed to compute a \emph{Null} distribution of the statistic (improvement in model fit from one to two groups). The improvement in model fit for the real group membership is then compared to this \emph{Null} distribution \emph{(of no group difference)} and a \emph{p}-value is computed. Adapted from \cite{Storey and al. 'Significance analysis of time course microarray experiments', PNAS, 2005 [1]}.
#' \itemize{
#'   \item The \emph{p}-value is calculated as \code{(b+1)/(nPerm+1)} as to not report a \emph{p}-value=0 (which would give problem with FDR correction) and reduce type I error.
#'   \item The \emph{p}-value will vary depending on the random sampling. Therefore a confidence interval can be constructed using Wilson's interval which presents good properties for small number of trials and probabilities close to 0 or 1.
#' }
#'
#' @param SANTAObj A fitted \emph{SANTAObj} as generated by \code{\link{santaR_fit}}.
#' @param nPerm (int) Number of permutations. Default 1000.
#' @param alpha (float) Confidence Interval on the permuted \emph{p}-value \emph{(0.05 for 95\% Confidence Interval)}. Default 0.05.
#'
#' @return A \emph{SANTAObj} with added \emph{p}-value fit and confidence interval on the \emph{p}-value.
#'
#' @examples
#' ## 56 measurements, 8 subjects, 7 unique time-points
#' ## Default parameter values decreased to ensure an execution < 2 seconds
#' Yi          <- acuteInflammation$data$var_3
#' ind         <- acuteInflammation$meta$ind
#' time        <- acuteInflammation$meta$time
#' group       <- acuteInflammation$meta$group
#' grouping    <- get_grouping(ind, group)
#' inputMatrix <- get_ind_time_matrix(Yi, ind, time)
#' SANTAObj    <- santaR_fit(inputMatrix, df=5, grouping=grouping, verbose=TRUE)
#' SANTAObj    <- santaR_pvalue_fit(SANTAObj, nPerm=100)
#'
#' @references [1] Storey, J. D., Xiao, W., Leek, J. T., Tompkins, R. G. & Davis, R. W. Significance analysis of time course microarray experiments. \emph{Proceedings of the National Academy of Sciences of the United States of America} \strong{102}, 12837-42 (2005).
#'
#' @family Analysis
#' @seealso Comparison with constant model with \code{\link{santaR_pvalue_fit_within}}
#'
#' @export
santaR_pvalue_fit         <- function(SANTAObj,nPerm=1000,alpha=0.05) {
  ## COMMENT
  # Ind .pred have the smoothness penalty, groupMeanCurve is therefore fitted at df=nbTP. 
  # as we permutate the Ind (.in and .pred), no need to refit @ original df
	
  ## FUNCTION
  SSquare               <- function(observed,fitted) {
    ## Sum of Square of the residuals across all individuals and timepoints
    #   observed  = matrix IND x TIME
    #   fitted    = predicted values for TIME (row vector)
    SSq.ind <- apply( observed, 1, function(x) sum((x-fitted)^2,na.rm=TRUE) )
    return(sum(SSq.ind))
  }
  stat.fit              <- function(models) {
    ## Compares goodness of fit of the model under the null hypothesis to that under the alternative hypothesis
    # models = list(mod.null, mod.alt)
    
    SSq.null  <- SSquare(models$mod.null$obs, models$mod.null$fit)
    SSq.alt1  <- SSquare(models$mod.alt1$obs, models$mod.alt1$fit)
    SSq.alt2  <- SSquare(models$mod.alt2$obs, models$mod.alt2$fit)
    SSq.alt   <- SSq.alt1 + SSq.alt2
    if(SSq.alt==0) {                                        # stop from dividing by zero
      SSq.alt <- 10*.Machine$double.eps 
      if(SSq.null==0) { SSq.null <- 10*.Machine$double.eps }  # stop from having a -1 when it should be 0
    }   
    Ftest     <- (SSq.null - SSq.alt)/SSq.alt
    return(Ftest)
  }            # F-test statistic (goodness of fit)
  between.null.sim.ind  <- function(group1,group2) {
    ## Simulate NULL population by resampling the individuals
    # init
    n1    <- dim(group1$inp)[1]
    n2    <- dim(group2$inp)[1]
    n     <- n1+n2
    inp   <- rbind(group1$inp, group2$inp)
    pred  <- rbind(group1$pred,group2$pred)
    # random draw
    sampler1  <- sample(1:n, size=n1, replace=TRUE)
    sampler2  <- sample(1:n, size=n2, replace=TRUE)
    # make groups
    pop1 <- list( inp=inp[sampler1,], pred=pred[sampler1,] )
    pop2 <- list( inp=inp[sampler2,], pred=pred[sampler2,] )
    return(list(pop1=pop1,pop2=pop2))
  }     # Simulate NULL population by resampling individuals
  fit.mean.curve        <- function(group,df){
    ## Works on .pred values for each individual, that already has the smoothness penalty, so fit groupMeanCurve with df=nbTP
    # The if condition shouldn't be necessary as no NA in .pred values
    time      <- as.numeric(colnames(group))
    meanVal   <- colMeans(group, na.rm=TRUE)
    notNA     <- !is.na(meanVal)
    if( sum(notNA) >= df ) {
      return(stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) ))
    } else {
      return(NA)
    }
  }           # return the fitted smooth.spline object for that group
  fit.models            <- function(population,df) {
    ## Fit both the NULL meanCurve, and the 2 ALTERNATE groupMeanCurve on a population IND x TIME
    #
    ## Returns a list of models, observations and corresponding fitted values
    #
    ## fit on .pred to save the step of ind prediction at each run of the function, obs on inp points for residuals calculation
    
    group1 <- population$pop1
    group2 <- population$pop2
    
    # Fit NULL model
    group.null  <- rbind(group1$inp,group2$inp)
    fit.null    <- fit.mean.curve( rbind(group1$pred,group2$pred),df)  
    if(! inherits(fit.null, 'smooth.spline')) { return(NA) }    # detect failure in fitting mean curve
    model.null  <- list(obs=group.null, fit=stats::predict(fit.null,y=as.numeric(colnames(group.null)))$y)
    
    # Fit ALTERNATE model
    fit.alt1   <- fit.mean.curve(group1$pred,df)
    fit.alt2   <- fit.mean.curve(group2$pred,df)
    if( (! inherits(fit.alt1, 'smooth.spline')) & (! inherits(fit.alt2, 'smooth.spline')) ) { return(NA) }
    model.alt1 <- list(obs=group1$inp, fit=stats::predict(fit.alt1,y=as.numeric(colnames(group1$pred)))$y)
    model.alt2 <- list(obs=group2$inp, fit=stats::predict(fit.alt2,y=as.numeric(colnames(group2$pred)))$y)
    
    return(list(mod.null=model.null, mod.alt1=model.alt1, mod.alt2=model.alt2))
  }     # fit NULL and ALTERNATE models
  
  ## Init
  if (length(SANTAObj$groups) != 2) {
    message("Error: Check input, p-values can only be calculated on 2 groups")
    stop("Error: Check input, p-values can only be calculated on 2 groups")
  }
  z           <- stats::qnorm(1-0.5*alpha)
  df          <- SANTAObj$properties$df
  groupData1  <- list( inp=SANTAObj$groups[[1]]$groupData.in, pred=SANTAObj$groups[[1]]$groupData.pred )
  groupData2  <- list( inp=SANTAObj$groups[[2]]$groupData.in, pred=SANTAObj$groups[[2]]$groupData.pred )
  
  if( is.data.frame(groupData1$inp) & is.data.frame(groupData2$inp)){
    ## Fstat
    models  <- fit.models(list(pop1=groupData1, pop2=groupData2), df) # mod.null, mod.alt1, mod.alt2
    if( !is.list(models) ) { return(NA) }                             # fitting of one of the mean curve must have failed
    Fstat   <- stat.fit( models )                                     # NULL real vs alternative
    
    ## Bootstrap, NULL real vs NULL generated
    Fstat0  <- replicate( nPerm, stat.fit( fit.models(between.null.sim.ind(groupData1,groupData2), df) ) )
    p       <- (sum( Fstat0 >= Fstat)+1)/(nPerm+1)
    
    SANTAObj$general$pval.fit   <- p
    SANTAObj$general$pval.fit.l <- (1/(1+((z^2)/nPerm))) * ( p+((z^2)/(2*nPerm)) - z*sqrt( (1/nPerm)*p*(1-p) + (z^2)/(4*(nPerm^2)) ) )
    SANTAObj$general$pval.fit.u <- (1/(1+((z^2)/nPerm))) * ( p+((z^2)/(2*nPerm)) + z*sqrt( (1/nPerm)*p*(1-p) + (z^2)/(4*(nPerm^2)) ) )
  }
  
  SANTAObj$properties$pval.fit$status  <- TRUE
  SANTAObj$properties$pval.fit$nPerm   <- nPerm
  SANTAObj$properties$pval.fit$alpha   <- alpha
  return(SANTAObj)
}


#' Evaluate difference between a group mean curve and a constant model using the comparison of model fit (F-test)
#'
#' Execute a t-test based on the comparison of improvement of model fit from a single group mean curve to the fit of both a group mean curve and a constant linear model. This statistic identifies within-class differential evolution, and test whether the population average time curve is flat or not. \emph{n} constant linear model are generated to match the \emph{n} individual trajetories. The \emph{Null} distribution is generated by permuting the \emph{n} group individuals and the \emph{n} constant trajectories. The real improvement in model fit for the real group membership versus flat trajectories is then compared to the \emph{Null} distribution of model fit improvement, similarly to \code{\link{santaR_pvalue_fit}}. Adapted from \cite{Storey and al. 'Significance analysis of time course microarray experiments', PNAS, 2005 [1]}.
#'
#' @param SANTAGroup A fitted group extracted from a \emph{SANTAObj} generated by \code{\link{santaR_fit}}.
#' @param nPerm (int) Number of permutations. Default 1000.
#'
#' @return A \emph{p-value}
#'
#' @examples
#' ## 56 measurements, 8 subjects, 7 unique time-points
#' ## Default parameter values decreased to ensure an execution < 2 seconds
#' Yi          <- acuteInflammation$data$var_3
#' ind         <- acuteInflammation$meta$ind
#' time        <- acuteInflammation$meta$time
#' group       <- acuteInflammation$meta$group
#' grouping    <- get_grouping(ind, group)
#' inputMatrix <- get_ind_time_matrix(Yi, ind, time)
#' SANTAObj    <- santaR_fit(inputMatrix, df=5, grouping=grouping, verbose=TRUE)
#' SANTAGroup  <- SANTAObj$groups[[1]]
#' #SANTAGroup <- SANTAObj$groups$Group1
#' santaR_pvalue_fit_within(SANTAGroup, nPerm=500)
#' # ~0.6726747
#'
#' @references [1] Storey, J. D., Xiao, W., Leek, J. T., Tompkins, R. G. & Davis, R. W. Significance analysis of time course microarray experiments. \emph{Proceedings of the National Academy of Sciences of the United States of America} \strong{102}, 12837-42 (2005).
#'
#' @seealso Inter-group comparison with \code{\link{santaR_pvalue_fit}}
#'
#' @export
santaR_pvalue_fit_within  <- function(SANTAGroup,nPerm=1000) {
  ## COMMENT
  # Ind .pred have the smoothness penalty, groupMeanCurve is therefore fitted at df=nbTP. 
  # as we bootstrap the Ind (.in and .pred), no need to refit @ original df
  
  
  ## FUNCTION
  SSquare             <- function(observed,fitted) {
    ## Sum of Square of the residuals across all individuals and timepoints
    #   observed  = matrix IND x TIME
    #   fitted    = predicted values for TIME (row vector)
    SSq.ind <- apply( observed, 1, function(x) sum((x-fitted)^2,na.rm=TRUE) )
    return(sum(SSq.ind))
  }
  stat.fit            <- function(models) {
    ## Compares goodness of fit of the model under the null hypothesis to that under the alternative hypothesis
    # models = list(mod.null, mod.alt)
    
    if( is.data.frame(models$mod.alt$obs) ){        # check if the spline fitting did work
      SSq.alt   <- SSquare(models$mod.alt$obs, models$mod.alt$fit)
      SSq.null  <- SSquare(models$mod.null$obs,models$mod.null$fit)
      Ftest     <- (SSq.null - SSq.alt)/SSq.alt
      return(Ftest)
    } else {
      return(NA)
    }
  }               # F-test statistic (goodness of fit)
  within.null.sim.ind <- function(group) {
    ## Simulate NULL population by resampling the individuals
    
    time        <- as.numeric(colnames(group$inp))
    n.ind       <- dim(group$inp)[1]
    sampler     <- sample(1:n.ind, size=n.ind, replace=TRUE)
    population  <- list( inp=group$inp[sampler,], pred=group$pred[sampler,] )
    
    return(population)
  }          # Simulate NULL population by resampling individuals
  fit.models          <- function(groupData) {
    ## Fit both the NULL model lm( y ~ 1 ), and the ALTERNATIVE groupMeanCurve on a population IND x TIME
    # groupData   = IND x TIME matrix to fit
    #
    ## Returns a list of models, observations and corresponding fitted values
    #
    # NULL model on inp data, ALT model employs predicted data for model, inp for residuals
    # groupMean on .pred values for each individual, that already has the smoothness penalty, so fit groupMeanCurve with df=nbTP
    
    # init
    time <- as.numeric(colnames(groupData$inp))
    
    # Fit NULL model
    suppressMessages( y <- reshape2::melt(groupData$inp, na.rm=TRUE)$value )
    model.null          <- list(obs=groupData$inp, fit=stats::predict(object=stats::lm( y ~ 1),newdata=data.frame(y=time)))
    
    # Fit ALTERNATIVE model
    meanVal             <- colMeans(groupData$pred, na.rm=TRUE) # using pred saves time on the step of predicting the individual before the average curve
    notNA               <- !is.na(meanVal)
    model.alternative   <- list(obs=groupData$inp, fit=stats::predict(stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) ), time)$y)
    
    return(list(mod.null=model.null, mod.alt=model.alternative))
  }         # fit NULL and ALTERNATIVE models
  
  if( is.data.frame(SANTAGroup$groupData.in) ){
    ## INIT
    groupData <- list( inp=SANTAGroup$groupData.in, pred=SANTAGroup$groupData.pred )
    models    <- fit.models(groupData)             # return mod.null, mod.alt
    Fstat     <- stat.fit( models )                   # NULL real vs alternative
    
    if(is.na(Fstat)){
      return(NA)
    } else {
      ## Bootstrap, NULL real vs NULL generated
      Fstat0 <- replicate( nPerm, stat.fit( fit.models(within.null.sim.ind(groupData)) ) )            # only need the individuals
      
      p <- (sum( Fstat0 >= Fstat)+1)/(nPerm+1)
      
      return(p)
    }
  } else {
    return(NA)
  }
}