R/santaR_fit.R

Defines functions santaR_fit

Documented in santaR_fit

#' Generate a SANTAObj for a variable
#'
#' Generate a \emph{SANTAObj} containing all the splines model for individual and group time evolutions. Once all the splines representing individual and group evolutions are fitted, all time-points are back-projected (projected) and employed in subsequent analysis in place of the input measurements (functional approach). A grouping can be provided to separate individuals and compare trajectories: any number of groups can be provided, but comparision of group trajectories can only be executed between 2 groups.
#' \itemize{
#'   \item Individual trajectories with less than 4 time-points are rejected due to constraints on \code{\link[stats]{smooth.spline}} fitting \emph{(number of time-points < 4)}.
#'   \item Individual trajectories with less time-points than \emph{df} are rejected due to constraints on \code{\link[stats]{smooth.spline}} fitting \emph{(number of time-points < df)}.
#'   \item Rejected individual trajectories are not taken into account for mean curves calculations.
#' }
#'
#' @param inputMatrix \code{data.frame} of measurements for each IND x TIME as generated by \code{\link{get_ind_time_matrix}}. Rows are unique Individual IDs and columns unique measurement Time. Pairs of (IND,TIME) without a measurement are left as NA.
#' @param df (float) Degree of freedom to employ for fitting the \code{\link[stats]{smooth.spline}}
#' @param grouping NA or a \code{data.frame} with 2 columns (ind and group) listing as rows each unique Individual ID and the corresponding group membership, as generated by \code{\link{get_grouping}}. Default is NA for no groups.
#' @param verbose (bool) If TRUE output the progress of fitting. Default is TRUE.
#'
#' @return A \emph{SANTAObj} containing all the spline models with individual and group time evolutions, for further analysis.
#'
#' \subsection{Details:}{
#'   The returned \emph{SANTAObj} is structured as follow:
#'   \tabular{ll}{
#'     SANTAObj \tab santaR object for futher analysis\cr
#'      \tab \cr
#'     SANTAObj$properties$df \tab input degree of freedom\cr
#'     SANTAObj$properties$CBand$status \tab Confidence Bands for group mean curve calculated \emph{(TRUE or FALSE)}\cr
#'     SANTAObj$properties$CBand$nBoot \tab parameter, number or bootstrap rounds for calculation of the group mean curve confidence bands\cr
#'     SANTAObj$properties$CBand$alpha \tab parameter, confidence of the group mean curve band\cr
#'     SANTAObj$properties$pval.dist$status \tab \emph{p}-value distance calculated \emph{(TRUE or FALSE)}\cr
#'     SANTAObj$properties$pval.dist$nPerm \tab parameter, number of permutations for calculation of distance \emph{p}-value\cr
#'     SANTAObj$properties$pval.dist$alpha \tab parameter, confidence on the bootstrapped \emph{p}-value\cr
#'     SANTAObj$properties$pval.fit$status \tab \emph{p}-value fitting calculated \emph{(TRUE or FALSE)}\cr
#'     SANTAObj$properties$pval.fit$nPerm \tab parameter, number of permutations for calculation of fitting \emph{p}-value\cr
#'     SANTAObj$properties$pval.fit$alpha \tab parameter, confidence on the bootstrapped \emph{p}-value\cr
#'      \tab \cr
#'     SANTAObj$general$inputData \tab \emph{inputMatrix}\cr
#'     SANTAObj$general$cleanData.in \tab only kept individuals INPUT values \emph{(equivalent to inputMatrix - rejected)}\cr
#'     SANTAObj$general$cleanData.pred \tab only kept individuals PREDICTED values on Ind splines\cr
#'     SANTAObj$general$grouping \tab grouping vector given as input\cr
#'     SANTAObj$general$meanCurve \tab spline fit over all kept datapoint \emph{(cleanData.pred)} | \code{\link[stats]{smooth.spline}} object\cr
#'     SANTAObj$general$pval.curveCorr \tab Pearson correlation coefficient between the two group curves, to detect highly correlated group shapes if required.\cr
#'     SANTAObj$general$pval.dist \tab \emph{p}-value between groups based on distance between groupMeanCurves\cr
#'     SANTAObj$general$pval.dist.l \tab lower bound confidence interval on \emph{p}-value\cr
#'     SANTAObj$general$pval.dist.u \tab upper bound confidence interval on \emph{p}-value\cr
#'     SANTAObj$general$pval.fit \tab \emph{p}-value between groups based on groupMeanCurves fitting\cr
#'     SANTAObj$general$pval.fit.l \tab lower bound confidence interval on \emph{p}-value\cr
#'     SANTAObj$general$pval.fit.u \tab upper bound confidence interval on \emph{p}-value\cr
#'      \tab \cr
#'     SANTAObj$groups \tab list of group information\cr
#'     SANTAObj$groups$rejectedInd \tab list of rejected individual \emph{(#tp < 4 or df)} | data\cr
#'     SANTAObj$groups$curveInd \tab list of spline fit | \code{\link[stats]{smooth.spline}} object\cr
#'     SANTAObj$groups$groupMeanCurve \tab spline fit over groupData.pred | \code{\link[stats]{smooth.spline}} object\cr
#'     SANTAObj$groups$point.in \tab all group points INPUT values (x,y) [kept individuals]\cr
#'     SANTAObj$groups$point.pred \tab all group points PREDICTED values on Ind splines (x,y)\cr
#'     SANTAObj$groups$groupData.in \tab only individuals from this group INPUT value (IND x TIME)\cr
#'     SANTAObj$groups$groupData.pred \tab only individuals from this group PREDICTED values on Ind splines (x,y)
#'   }
#' }
#'
#' @examples
#' ## 56 measurements, 8 subjects, 7 unique time-points
#' Yi             <- acuteInflammation$data$var_1
#' ind            <- acuteInflammation$meta$ind
#' time           <- acuteInflammation$meta$time
#' group          <- acuteInflammation$meta$group
#' grouping       <- get_grouping(ind, group)
#' inputMatrix    <- get_ind_time_matrix(Yi, ind, time)
#' resultSANTAObj <- santaR_fit(inputMatrix, df=5, grouping=grouping, verbose=TRUE)
#'
#' @family Analysis
#'
#' @export
santaR_fit                <- function(inputMatrix,df,grouping=NA,verbose=TRUE) {
  
  ## keep non-empty rows/cols
  inputMatrix  <- inputMatrix[ apply(!is.na(inputMatrix),1,sum) > 0, apply(!is.na(inputMatrix),2,sum) > 0]
  
  ## Input check (Broad conditions, some individuals might still fail later)
	# DF must be numeric
	if( !is.numeric(df) ){
    message("Error: Check input, 'df' should be numeric")
    stop("Check input, 'df' should be numeric")
  }
	# Need a minimum of 4 tp
	if (ncol(inputMatrix) < 4){
		message("Error: Check input, a minimum of 4 unique time-points are required to fit a smooth.splines")
		stop("Check input, a minimum of 4 unique time-points are required to fit a smooth.splines")		
	}
	# Number of tp < df
	if (ncol(inputMatrix) < df){
		message("Error: Check input, the number of unique time-points (", ncol(inputMatrix), ") cannot be less than the requested df (", df, ")")
		stop("Check input, the number of unique time-points (", ncol(inputMatrix), ") cannot be less than the requested df (", df, ")")
	}
	
  ## initialisation
  time                                 <- as.numeric(colnames(inputMatrix))
  SANTAObj                             <- list() # will be a list of list
  SANTAObj$properties                  <- list()
  SANTAObj$properties$df               <- df
  SANTAObj$properties$CBand$status     <- FALSE
	SANTAObj$properties$CBand$nBoot      <- NA
  SANTAObj$properties$CBand$alpha	     <- NA
  SANTAObj$properties$pval.dist$status <- FALSE
  SANTAObj$properties$pval.dist$nPerm  <- NA
  SANTAObj$properties$pval.dist$alpha  <- NA
  SANTAObj$properties$pval.fit$status  <- FALSE
  SANTAObj$properties$pval.fit$nPerm   <- NA
  SANTAObj$properties$pval.fit$alpha   <- NA
  SANTAObj$groups                      <- list()
  SANTAObj$general                     <- list()
  SANTAObj$general$inputData           <- inputMatrix
  SANTAObj$general$grouping            <- grouping
  SANTAObj$general$cleanData.in        <- matrix(ncol=length(time), nrow=0)
  SANTAObj$general$cleanData.pred      <- matrix(ncol=length(time), nrow=0)
  SANTAObj$general$pval.curveCorr      <- NA
  SANTAObj$general$pval.dist           <- NA
  SANTAObj$general$pval.dist.l         <- NA
  SANTAObj$general$pval.dist.u         <- NA
  SANTAObj$general$pval.fit            <- NA
  SANTAObj$general$pval.fit.l          <- NA
  SANTAObj$general$pval.fit.u          <- NA
  
  ## separate in groups
  dataMatrix  <- list()
  if (any(!is.na(grouping)))  {
    SANTAObj$general$grouping <- grouping
    groups  <- levels(as.factor(grouping[,2]))
    for (i in 1 : length(groups) ) {
      dataMatrix[[i]]         <- inputMatrix[ rownames(inputMatrix) %in% grouping[ grouping[,2]==groups[i], 1], ] 
      names(dataMatrix)[[i]]  <- groups[i]
    }    
  } else {
    dataMatrix[[1]]   <- inputMatrix
  } 

  ## for each group
  for (j in 1:length(dataMatrix)) {
    
    ## reject ind (store it)
    ## calculate ind spline
    ## store kept ind points
    ## calculate groupMeanCurve
    tmpGrp              <- list()
    tmpGrp$rejectedInd  <- list()
    tmpGrp$curveInd     <- list()
    ptX.in              <- numeric()
    ptY.in              <- numeric()
    ptX.pred            <- numeric()
    ptY.pred            <- numeric()
    groupData.in        <- matrix(ncol=length(time), nrow=0)
    groupData.pred      <- matrix(ncol=length(time), nrow=0)

    ## for each ind
    # no ind
    if (dim(dataMatrix[[j]])[1] == 0) {
      # special case of no individual matching 
      if(verbose){ message("No individuals left in group #", j, " once data is imported (remove NA rows and columns)") }
      tmpGrp$curveInd                   <- NA             ## Signal that no IND exist in this subgroup
      groupData.in                      <- NA
      groupData.pred                    <- NA
      
    # iterate over individuals
    } else {
      for (ind in 1 : length(rownames(dataMatrix[[j]])) ) {
        notNA             <- !is.na(dataMatrix[[j]][ind,])
        xInd              <- time[ notNA ]
        yInd              <- as.numeric(dataMatrix[[j]][ ind, notNA ])
        
        if (length( yInd ) < 4) {                       # no spline under 4 tp - n = #{unique x} = 4
          if(verbose){ message(rownames(dataMatrix[[j]][ind,])," - #tp < 4") }
          tmpGrp$rejectedInd[[ind]]         <- dataMatrix[[j]][ind,]
          names(tmpGrp$rejectedInd)[[ind]]  <- rownames(dataMatrix[[j]][ind,])
        } else if (length( yInd ) < df) {               # no spline if less tp than df - 1 < df <= n
          if(verbose){ message(rownames(dataMatrix[[j]][ind,]), " - #tp(", length( yInd ), ") < df(", df, ")") }
          tmpGrp$rejectedInd[[ind]]         <- dataMatrix[[j]][ind,]
          names(tmpGrp$rejectedInd)[[ind]]  <- rownames(dataMatrix[[j]][ind,])
        } else {
          tmpGrp$curveInd[[ind]]            <- stats::smooth.spline(xInd, yInd, df=df)
          names(tmpGrp$curveInd)[[ind]]     <- rownames(dataMatrix[[j]][ind,])
          # input values
          ptX.in                            <- c(ptX.in, xInd)
          ptY.in                            <- c(ptY.in, yInd)
          groupData.in                      <- rbind(groupData.in, dataMatrix[[j]][ind,])
          # predicted values from Ind spline
          ind.pred                          <- stats::predict( object=tmpGrp$curveInd[[ind]], x=time)
          ptX.pred                          <- c(ptX.pred, ind.pred$x)
          ptY.pred                          <- c(ptY.pred, ind.pred$y)
          groupData.pred                    <- rbind(groupData.pred, ind.pred$y)
        }
      }
    }
    
    if(is.logical(groupData.in)) {                                    # catch signal of no individual in the group signal (NA or empty initialised matrix)
      if(verbose){ message("No individuals left in group #",j) }
      tmpGrp$curveInd                   <- NA                         ## Signal that no IND exist in this subgroup
      groupData.in                      <- NA
      groupData.pred                    <- NA
    } else {
      if( is.data.frame(groupData.in) ) {                             # if all individuals failed (shouldn't happen as a df is only created if one ind is appended to the init matrix)
        if( dim(groupData.in)[1]==0 ) {                               # case of no individuals matching
          if(verbose){ message("No individuals left in group #",j) }
          tmpGrp$curveInd               <- NA                         ## Signal that no IND exist in this subgroup
          groupData.in                  <- NA
          groupData.pred                <- NA
        }
      }
    }
    ## groupMeanCurve
    if( is.data.frame(groupData.in) ) {                               ## is a data.frame only if not empty (checked before)
      meanVal                           <- colMeans(groupData.pred, na.rm=TRUE)
      notNA                             <- !is.na(meanVal)
      if( sum(notNA) >= df ) {
        tmpGrp$groupMeanCurve           <- stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) )
      } else {
        tmpGrp$groupMeanCurve           <- NA
      }      
      tmpGrp$point.in                   <- data.frame(x=ptX.in,   y=ptY.in)
      tmpGrp$point.pred                 <- data.frame(x=ptX.pred, y=ptY.pred)
      tmpGrp$groupData.in               <- data.frame(groupData.in)
      colnames(tmpGrp$groupData.in)     <- time
      rownames(tmpGrp$groupData.in)     <- rownames(groupData.in)
      tmpGrp$groupData.pred             <- data.frame(groupData.pred)
      colnames(tmpGrp$groupData.pred)   <- time
      rownames(tmpGrp$groupData.pred)   <- rownames(groupData.in)
      SANTAObj$general$cleanData.in     <- rbind(SANTAObj$general$cleanData.in,  tmpGrp$groupData.in)
      SANTAObj$general$cleanData.pred   <- rbind(SANTAObj$general$cleanData.pred,tmpGrp$groupData.pred)
    } else {
      tmpGrp$groupMeanCurve             <- NA
      tmpGrp$point.in                   <- NA
      tmpGrp$point.pred                 <- NA
      tmpGrp$groupData.in               <- NA
      tmpGrp$groupData.pred             <- NA
    }
    SANTAObj$groups[[j]]                <- tmpGrp
  }
  names(SANTAObj$groups) <- names(dataMatrix)
  
  ## general mean curve
  meanVal                 <- colMeans(SANTAObj$general$cleanData.pred, na.rm=TRUE)
  notNA                   <- !is.na(meanVal)
  if( sum(notNA) >= df ) {
    SANTAObj$general$meanCurve  <- stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) )
  } else {
    SANTAObj$general$meanCurve  <- NA
  }
  
  return(SANTAObj)
}
adwolfer/santaR documentation built on March 10, 2024, 5:28 p.m.