R/santaR_pvalue_dist.R
In santaR: Short Asynchronous Time-Series Analysis

Documented in santaR_pvalue_dist santaR_pvalue_dist_within

#' Evaluate difference in group trajectories based on the comparison of distance between group mean curves
#'
#' Evaluate the difference in group trajectories by executing a t-test based on the comparison of distance between group mean curves. Individual group membership is repeatedly randomly permuted to generate new random groups and group mean curves, then employed to compute a \emph{Null} distribution of distance between goup mean curves. The distance between two group mean curves is defined as the area between both curves. The distance between the real group mean curves is then compared to this \emph{Null} distribution and a \emph{p}-value is computed.
#' \itemize{
#'   \item The Pearson correlation coefficient between the two group mean curves is calculated to detect highly correlated group shapes if required.
#'   \item The \emph{p}-value is calculated as \code{(b+1)/(nPerm+1)} as to not report a \emph{p}-value=0 (which would give problem with FDR correction) and reduce type I error.
#'   \item The \emph{p}-value will vary depending on the random sampling. Therefore a confidence interval can be constructed using Wilson's interval which presents good properties for small number of trials and probabilities close to 0 or 1.
#' }
#'
#' @param SANTAObj A fitted \emph{SANTAObj} as generated by \code{\link{santaR_fit}}.
#' @param nPerm (int) Number of permutations. Default 1000.
#' @param nStep (int) Number of steps employed for the calculation of the area between group mean curves. Default is 5000.
#' @param alpha (float) Confidence Interval on the permuted \emph{p}-value \emph{(0.05 for 95\% Confidence Interval)}. Default 0.05.
#'
#' @return A \emph{SANTAObj} with added \emph{p}-value dist and confidence interval on the \emph{p}-value.
#'
#' @examples
#' ## 56 measurements, 8 subjects, 7 unique time-points
#' ## Default parameter values decreased to ensure an execution < 2 seconds
#' Yi          <- acuteInflammation$data$var_3
#' ind         <- acuteInflammation$meta$ind
#' time        <- acuteInflammation$meta$time
#' group       <- acuteInflammation$meta$group
#' grouping    <- get_grouping(ind, group)
#' inputMatrix <- get_ind_time_matrix(Yi, ind, time)
#' SANTAObj    <- santaR_fit(inputMatrix, df=5, grouping=grouping, verbose=TRUE)
#' SANTAObj    <- santaR_pvalue_dist(SANTAObj, nPerm=100)
#'
#' @family Analysis
#' @seealso Comparison with constant model with \code{\link{santaR_pvalue_dist_within}}
#'
#' @export
santaR_pvalue_dist        <- function(SANTAObj,nPerm=1000,nStep=5000,alpha=0.05) {
  ## COMMENT
  # Ind .pred have the smoothness penalty, groupMeanCurve is therefore fitted at df=nbTP. 
  # as we permutate the Ind (.in and .pred), no need to refit @ original df
  
  ## FUNCTION
  dist.fit              <- function(models,nStep){
    #         __a__
    #       /|     \
    #     /  h      \ 
    #   /____|__b____\
    #
    #   Area = (a+b)/2 * h
    #
    # models$mod.grp1$obs is ~ equivalent to SANTAObj$group[[x]]$groupData.in, which is a DataFrame with all the times across all groups as colnames (and NA if not measured for a given sample at a given time)
    #
    rng   <- range(as.numeric(colnames(models$mod.grp1$obs)))
    h     <- (rng[2]-rng[1])/nStep                            # binwidth
    grid  <- seq(from=rng[1],to=rng[2],by=h)
    
    proj    <- matrix( c(stats::predict(models$mod.grp1$fit,x=grid)$y, stats::predict(models$mod.grp2$fit,x=grid)$y) , nrow=2, byrow=TRUE)
    dist.sp <- abs(proj[1,]-proj[2,])
    dist.sp <- sum( c(dist.sp,dist.sp[2:(length(dist.sp)-1)]) )/2
    
    return(h*dist.sp)
  } 
  between.null.sim.ind  <- function(group1,group2) {
    ## Simulate NULL population by resampling the individuals
    # init
    n1    <- dim(group1$inp)[1]
    n2    <- dim(group2$inp)[1]
    n     <- n1+n2
    inp   <- rbind(group1$inp, group2$inp)
    pred  <- rbind(group1$pred,group2$pred)
    # random draw
    sampler1  <- sample(1:n, size=n1, replace=TRUE)
    sampler2  <- sample(1:n, size=n2, replace=TRUE)
    # make groups
    return(list(pop1=list( inp=inp[sampler1,], pred=pred[sampler1,] ) ,pop2=list( inp=inp[sampler2,], pred=pred[sampler2,] ) ))
  }     # Simulate 2 NULL populations by resampling individuals
  fit.mean.curve        <- function(group,df){
    ## Works on .pred values for each individual, that already have the smoothness penalty, so fit groupMeanCurve with df=nbTP
    # The if condition shouldn't be necessary as no NA in .pred values (all group are projected onto the same shared time axis in santaR_fit)
    time      <- as.numeric(colnames(group))
    meanVal   <- colMeans(group, na.rm=TRUE)
    notNA     <- !is.na(meanVal)
    if( sum(notNA) >= df ) {
      return(stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) ))
    } else {
      return(NA)
    }
  }           # return the fitted smooth.spline for that group
  fit.models            <- function(population,df) {
    ## Fit 2 groupMeanCurve on a population IND x TIME
    #
    ## Returns a list of models: observations and fitted spline
    #
    ## fit on .pred to save the step of ind prediction at each run of the function
    
    group1 <- population$pop1
    group2 <- population$pop2
    
    fit.grp1   <- fit.mean.curve(group1$pred,df)
    fit.grp2   <- fit.mean.curve(group2$pred,df)
    if( (! inherits(fit.grp1, 'smooth.spline')) & (! inherits(fit.grp2, 'smooth.spline')) ) { return(NA) }
    model.grp1 <- list(obs=group1$inp, fit=fit.grp1)                  # obs only needed in dist.fit to get time range (colnames in SANTAObj$groups[[x]]$groupData.in shared across all groups)
    model.grp2 <- list(obs=group2$inp, fit=fit.grp2)
    
    return(list(mod.grp1=model.grp1, mod.grp2=model.grp2))
  }     # fit 2 group models
  cor.models            <- function(models,nStep){
    # Project the two GroupMeanCurves (same as dist.fit) and return the Pearson correlation between them
    rng   <- range(as.numeric(colnames(models$mod.grp1$obs)))
    h     <- (rng[2]-rng[1])/nStep                            # binwidth
    grid  <- seq(from=rng[1],to=rng[2],by=h)
    
    proj    <- matrix( c(stats::predict(models$mod.grp1$fit,x=grid)$y, stats::predict(models$mod.grp2$fit,x=grid)$y) , nrow=2, byrow=TRUE)
    
    return( stats::cor(proj[1,], proj[2,], method='pearson') )
  }
  
  ## Init
  if (length(SANTAObj$groups) != 2) {
    message("Error: Check input, p-values can only be calculated on 2 groups")
    stop("Error: Check input, p-values can only be calculated on 2 groups")
  }
  z           <- stats::qnorm(1-0.5*alpha)
  df          <- SANTAObj$properties$df
  groupData1  <- list( inp=SANTAObj$groups[[1]]$groupData.in, pred=SANTAObj$groups[[1]]$groupData.pred )
  groupData2  <- list( inp=SANTAObj$groups[[2]]$groupData.in, pred=SANTAObj$groups[[2]]$groupData.pred )
  
  if( is.data.frame(groupData1$inp) & is.data.frame(groupData2$inp)){
    
    ## distance
    models  <- fit.models(list(pop1=groupData1, pop2=groupData2), df) # mod.grp1, mod.grp2
    if( !is.list(models) ) { return(NA) }                             # fitting of one of the mean curve must have failed
    dist    <- dist.fit( models, nStep )                              # NULL real (vs alternate)
    
    ## Bootstrap
    dist0  <- replicate( nPerm, dist.fit( fit.models(between.null.sim.ind(groupData1,groupData2), df), nStep) )
    p      <- (sum( dist0 >= dist)+1)/(nPerm+1)
    
    SANTAObj$general$pval.dist   <- p
    SANTAObj$general$pval.dist.l <- (1/(1+((z^2)/nPerm))) * ( p+((z^2)/(2*nPerm)) - z*sqrt( (1/nPerm)*p*(1-p) + (z^2)/(4*(nPerm^2)) ) )
    SANTAObj$general$pval.dist.u <- (1/(1+((z^2)/nPerm))) * ( p+((z^2)/(2*nPerm)) + z*sqrt( (1/nPerm)*p*(1-p) + (z^2)/(4*(nPerm^2)) ) )
    
    ## Group curves correlation coefficients
    SANTAObj$general$pval.curveCorr = cor.models( models, nStep )
  } 
  
  SANTAObj$properties$pval.dist$status  <- TRUE
  SANTAObj$properties$pval.dist$nPerm   <- nPerm
  SANTAObj$properties$pval.dist$alpha   <- alpha
  return(SANTAObj)
}


#' Evaluate difference between a group mean curve and a constant model
#'
#' Execute a t-test based on the comparison of distance between a group mean curve and a constant linear model. Generate \emph{n} constant linear model. The \emph{Null} distribution is generated by permuting the \emph{n} group individuals and the \emph{n} constant trajectories. The real distance (area) between the group trajectory and the flat trajectory is compared to the \emph{Null} distribution of distances, similarly to \code{\link{santaR_pvalue_dist}}.
#'
#' @param SANTAGroup A fitted group extracted from a \emph{SANTAObj} generated by \code{\link{santaR_fit}}.
#' @param nPerm (int) Number of permutations. Default 1000.
#' @param nStep (int) Number of steps employed for the calculation of the area between group mean curves. Default is 5000.
#'
#' @return A \emph{p-value}
#'
#' @examples
#' ## 56 measurements, 8 subjects, 7 unique time-points
#' ## Default parameter values decreased to ensure an execution < 2 seconds
#' Yi          <- acuteInflammation$data$var_3
#' ind         <- acuteInflammation$meta$ind
#' time        <- acuteInflammation$meta$time
#' group       <- acuteInflammation$meta$group
#' grouping    <- get_grouping(ind, group)
#' inputMatrix <- get_ind_time_matrix(Yi, ind, time)
#' SANTAObj    <- santaR_fit(inputMatrix, df=5, grouping=grouping, verbose=TRUE)
#' SANTAGroup  <- SANTAObj$groups[[2]]
#' #SANTAGroup <- SANTAObj$groups$Group2
#' santaR_pvalue_dist_within(SANTAGroup, nPerm=500)
#' # ~0.00990099
#'
#' @seealso Inter-group comparison with \code{\link{santaR_pvalue_dist}}
#'
#' @export
santaR_pvalue_dist_within <- function(SANTAGroup,nPerm=1000,nStep=5000) {
  ## COMMENT
  # Ind .pred have the smoothness penalty, groupMeanCurve is therefore fitted at df=nbTP. 
  # as we bootstrap the Ind (.in and .pred), no need to refit @ original df
  
  ## FUNCTION
  dist.fit              <- function(models,nStep){
    #         __a__
    #       /|     \
    #     /  h      \ 
    #   /____|__b____\
    #
    #   Area = (a+b)/2 * h
    #
    rng   <- range(as.numeric(colnames(models$mod.grp1$obs)))
    h     <- (rng[2]-rng[1])/nStep                            # binwidth
    grid  <- seq(from=rng[1],to=rng[2],by=h)
    
    proj    <- matrix( c(stats::predict(models$mod.grp1$fit,x=grid)$y, stats::predict(models$mod.grp2$fit,x=grid)$y) , nrow=2, byrow=TRUE)
    dist.sp <- abs(proj[1,]-proj[2,])
    dist.sp <- sum( c(dist.sp,dist.sp[2:(length(dist.sp)-1)]) )/2
    
    return(h*dist.sp)
  } 
  within.null.sim.ind   <- function(group1,group2) {
    ## Simulate NULL population by resampling the individuals
    # init
    n1    <- dim(group1$inp)[1]
    n2    <- dim(group2$inp)[1]
    n     <- n1+n2
    inp   <- rbind(group1$inp, group2$inp)
    pred  <- rbind(group1$pred,group2$pred)
    # random draw
    sampler1  <- sample(1:n, size=n1, replace=TRUE)
    sampler2  <- sample(1:n, size=n2, replace=TRUE)
    # make groups
    return(list(pop1=list( inp=inp[sampler1,], pred=pred[sampler1,] ) ,pop2=list( inp=inp[sampler2,], pred=pred[sampler2,] ) ))
  }     # Simulate 2 NULL populations by resampling individuals
  fit.mean.curve        <- function(group){
    ## Works on .pred values for each individual, that already have the smoothness penalty, so fit groupMeanCurve with df=nbTP
    time      <- as.numeric(colnames(group))
    meanVal   <- colMeans(group, na.rm=TRUE)
    notNA     <- !is.na(meanVal)
    return(stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) ))
  }           # return the fitted smooth.spline for that group
  fit.models            <- function(population) {
    ## Fit 2 groupMeanCurve on a population IND x TIME
    #
    ## Returns a list of models: observations and fitted spline
    #
    ## fit on .pred to save the step of ind prediction at each run of the function
    
    group1 <- population$pop1
    group2 <- population$pop2
    
    fit.grp1   <- fit.mean.curve(group1$pred)
    fit.grp2   <- fit.mean.curve(group2$pred)
    if( (! inherits(fit.grp1, 'smooth.spline')) & (! inherits(fit.grp2, 'smooth.spline')) ) { return(NA) }
    model.grp1 <- list(obs=group1$inp, fit=fit.grp1)                  # obs only needed in dist.fit to get time range
    model.grp2 <- list(obs=group2$inp, fit=fit.grp2)
    
    return(list(mod.grp1=model.grp1, mod.grp2=model.grp2))
  }     # fit 2 group models
  gen.grp.flat          <- function(group) {
    n           <- dim(group)[1]
    time        <- as.numeric(colnames(group))
    suppressMessages( y <- reshape2::melt(group, na.rm=TRUE)$value )
    group.flat  <- data.frame( t(stats::predict(object=stats::lm( y ~ 1),newdata=data.frame(y=time))) )
    group.flat  <- group.flat[rep(1,n),]
    colnames(group.flat) <- colnames(group)
    return(list( inp=group.flat, pred=group.flat ))
  }             # Return a group of flat individuals lm(y ~ 1) representing no time trend 
  
  ## Init
  groupData1  <- list( inp=SANTAGroup$groupData.in, pred=SANTAGroup$groupData.pred )
  groupData2  <- gen.grp.flat(groupData1$inp)                         # generate flat curve on input data
  
  if( is.data.frame(groupData1$inp) & is.data.frame(groupData2$inp)){
    
    ## distance
    models  <- fit.models(list(pop1=groupData1, pop2=groupData2)) # mod.grp1, mod.grp2
    if( !is.list(models) ) { return(NA) }                             # fitting of one of the mean curve must have failed
    dist    <- dist.fit( models, nStep )                              # NULL real vs alternative
    
    ## Bootstrap
    dist0  <- replicate( nPerm, dist.fit( fit.models(within.null.sim.ind(groupData1,groupData2)), nStep) )
    
    p <- (sum( dist0 >= dist)+1)/(nPerm+1)
    
    return(p)
  } else {
    return(NA)
  }
}