R/santaR_fit.R

Defines functions santaR_fit

Documented in santaR_fit

#' Generate a SANTAObj for a variable
#'
#' Generate a \emph{SANTAObj} containing all the splines model for individual and group time evolutions. Once all the splines representing individual and group evolutions are fitted, all time-points are back-projected (projected) and employed in subsequent analysis in place of the input measurements (functional approach). A grouping can be provided to separate individuals and compare trajectories: any number of groups can be provided, but comparision of group trajectories can only be executed between 2 groups.
#' \itemize{
#'   \item Individual trajectories with less than 4 time-points are rejected due to constraints on \code{\link[stats]{smooth.spline}} fitting \emph{(number of time-points < 4)}.
#'   \item Individual trajectories with less time-points than \emph{df} are rejected due to constraints on \code{\link[stats]{smooth.spline}} fitting \emph{(number of time-points < df)}.
#'   \item Rejected individual trajectories are not taken into account for mean curves calculations.
#' }
#'
#' @param inputMatrix \code{data.frame} of measurements for each IND x TIME as generated by \code{\link{get_ind_time_matrix}}. Rows are unique Individual IDs and columns unique measurement Time. Pairs of (IND,TIME) without a measurement are left as NA.
#' @param df (float) Degree of freedom to employ for fitting the \code{\link[stats]{smooth.spline}}
#' @param grouping NA or a \code{data.frame} with 2 columns (ind and group) listing as rows each unique Individual ID and the corresponding group membership, as generated by \code{\link{get_grouping}}. Default is NA for no groups.
#' @param verbose (bool) If TRUE output the progress of fitting. Default is TRUE.
#'
#' @return A \emph{SANTAObj} containing all the spline models with individual and group time evolutions, for further analysis.
#'
#' \subsection{Details:}{
#'   The returned \emph{SANTAObj} is structured as follow:
#'   \tabular{ll}{
#'     SANTAObj \tab santaR object for futher analysis\cr
#'      \tab \cr
#'     SANTAObj$properties$df \tab input degree of freedom\cr
#'     SANTAObj$properties$CBand$status \tab Confidence Bands for group mean curve calculated \emph{(TRUE or FALSE)}\cr
#'     SANTAObj$properties$CBand$nBoot \tab parameter, number or bootstrap rounds for calculation of the group mean curve confidence bands\cr
#'     SANTAObj$properties$CBand$alpha \tab parameter, confidence of the group mean curve band\cr
#'     SANTAObj$properties$pval.dist$status \tab \emph{p}-value distance calculated \emph{(TRUE or FALSE)}\cr
#'     SANTAObj$properties$pval.dist$nPerm \tab parameter, number of permutations for calculation of distance \emph{p}-value\cr
#'     SANTAObj$properties$pval.dist$alpha \tab parameter, confidence on the bootstrapped \emph{p}-value\cr
#'     SANTAObj$properties$pval.fit$status \tab \emph{p}-value fitting calculated \emph{(TRUE or FALSE)}\cr
#'     SANTAObj$properties$pval.fit$nPerm \tab parameter, number of permutations for calculation of fitting \emph{p}-value\cr
#'     SANTAObj$properties$pval.fit$alpha \tab parameter, confidence on the bootstrapped \emph{p}-value\cr
#'      \tab \cr
#'     SANTAObj$general$inputData \tab \emph{inputMatrix}\cr
#'     SANTAObj$general$cleanData.in \tab only kept individuals INPUT values \emph{(equivalent to inputMatrix - rejected)}\cr
#'     SANTAObj$general$cleanData.pred \tab only kept individuals PREDICTED values on Ind splines\cr
#'     SANTAObj$general$grouping \tab grouping vector given as input\cr
#'     SANTAObj$general$meanCurve \tab spline fit over all kept datapoint \emph{(cleanData.pred)} | \code{\link[stats]{smooth.spline}} object\cr
#'     SANTAObj$general$pval.curveCorr \tab Pearson correlation coefficient between the two group curves, to detect highly correlated group shapes if required.\cr
#'     SANTAObj$general$pval.dist \tab \emph{p}-value between groups based on distance between groupMeanCurves\cr
#'     SANTAObj$general$pval.dist.l \tab lower bound confidence interval on \emph{p}-value\cr
#'     SANTAObj$general$pval.dist.u \tab upper bound confidence interval on \emph{p}-value\cr
#'     SANTAObj$general$pval.fit \tab \emph{p}-value between groups based on groupMeanCurves fitting\cr
#'     SANTAObj$general$pval.fit.l \tab lower bound confidence interval on \emph{p}-value\cr
#'     SANTAObj$general$pval.fit.u \tab upper bound confidence interval on \emph{p}-value\cr
#'      \tab \cr
#'     SANTAObj$groups \tab list of group information\cr
#'     SANTAObj$groups$rejectedInd \tab list of rejected individual \emph{(#tp < 4 or df)} | data\cr
#'     SANTAObj$groups$curveInd \tab list of spline fit | \code{\link[stats]{smooth.spline}} object\cr
#'     SANTAObj$groups$groupMeanCurve \tab spline fit over groupData.pred | \code{\link[stats]{smooth.spline}} object\cr
#'     SANTAObj$groups$point.in \tab all group points INPUT values (x,y) [kept individuals]\cr
#'     SANTAObj$groups$point.pred \tab all group points PREDICTED values on Ind splines (x,y)\cr
#'     SANTAObj$groups$groupData.in \tab only individuals from this group INPUT value (IND x TIME)\cr
#'     SANTAObj$groups$groupData.pred \tab only individuals from this group PREDICTED values on Ind splines (x,y)
#'   }
#' }
#'
#' @examples
#' ## 56 measurements, 8 subjects, 7 unique time-points
#' Yi             <- acuteInflammation$data$var_1
#' ind            <- acuteInflammation$meta$ind
#' time           <- acuteInflammation$meta$time
#' group          <- acuteInflammation$meta$group
#' grouping       <- get_grouping(ind, group)
#' inputMatrix    <- get_ind_time_matrix(Yi, ind, time)
#' resultSANTAObj <- santaR_fit(inputMatrix, df=5, grouping=grouping, verbose=TRUE)
#'
#' @family Analysis
#'
#' @export
santaR_fit                <- function(inputMatrix,df,grouping=NA,verbose=TRUE) {
  
  ## keep non-empty rows/cols
  inputMatrix  <- inputMatrix[ apply(!is.na(inputMatrix),1,sum) > 0, apply(!is.na(inputMatrix),2,sum) > 0]
  
	## Input check (Broad conditions, some individuals might still fail later)
	# DF must be numeric
	if( !is.numeric(df) ){
    message("Error: Check input, 'df' should be numeric")
    stop("Check input, 'df' should be numeric")
  }
	# Need a minimum of 4 tp
	if (ncol(inputMatrix) < 4){
		message("Error: Check input, a minimum of 4 unique time-points are required to fit a smooth.splines")
		stop("Check input, a minimum of 4 unique time-points are required to fit a smooth.splines")		
	}
	# Number of tp < df
	if (ncol(inputMatrix) < df){
		message("Error: Check input, the number of unique time-points (", ncol(inputMatrix), ") cannot be less than the requested df (", df, ")")
		stop("Check input, the number of unique time-points (", ncol(inputMatrix), ") cannot be less than the requested df (", df, ")")
	}
	
  ## initialisation
  time                                 <- as.numeric(colnames(inputMatrix))
  SANTAObj                             <- list() # will be a list of list
  SANTAObj$properties                  <- list()
  SANTAObj$properties$df               <- df
  SANTAObj$properties$CBand$status     <- FALSE
	SANTAObj$properties$CBand$nBoot      <- NA
  SANTAObj$properties$CBand$alpha	     <- NA
  SANTAObj$properties$pval.dist$status <- FALSE
  SANTAObj$properties$pval.dist$nPerm  <- NA
  SANTAObj$properties$pval.dist$alpha  <- NA
  SANTAObj$properties$pval.fit$status  <- FALSE
  SANTAObj$properties$pval.fit$nPerm   <- NA
  SANTAObj$properties$pval.fit$alpha   <- NA
  SANTAObj$groups                      <- list()
  SANTAObj$general                     <- list()
  SANTAObj$general$inputData           <- inputMatrix
  SANTAObj$general$grouping            <- grouping
  SANTAObj$general$cleanData.in        <- matrix(ncol=length(time), nrow=0)
  SANTAObj$general$cleanData.pred      <- matrix(ncol=length(time), nrow=0)
  SANTAObj$general$pval.curveCorr      <- NA
  SANTAObj$general$pval.dist           <- NA
  SANTAObj$general$pval.dist.l         <- NA
  SANTAObj$general$pval.dist.u         <- NA
  SANTAObj$general$pval.fit            <- NA
  SANTAObj$general$pval.fit.l          <- NA
  SANTAObj$general$pval.fit.u          <- NA
  
  ## separate in groups
  dataMatrix  <- list()
  if (any(!is.na(grouping)))  {
    SANTAObj$general$grouping <- grouping
    groups  <- levels(as.factor(grouping[,2]))
    for (i in 1 : length(groups) ) {
      dataMatrix[[i]]         <- inputMatrix[ rownames(inputMatrix) %in% grouping[ grouping[,2]==groups[i], 1], ] 
      names(dataMatrix)[[i]]  <- groups[i]
    }    
  } else {
    dataMatrix[[1]]   <- inputMatrix
  } 
  
  ## for each group
  for (j in 1:length(dataMatrix)) {
    
    ## reject ind (store it)
    ## calculate ind spline
    ## store kept ind points
    ## calculate groupMeanCurve
    tmpGrp              <- list()
    tmpGrp$rejectedInd  <- list()
    tmpGrp$curveInd     <- list()
    ptX.in              <- numeric()
    ptY.in              <- numeric()
    ptX.pred            <- numeric()
    ptY.pred            <- numeric()
    groupData.in        <- matrix(ncol=length(time), nrow=0)
    groupData.pred      <- matrix(ncol=length(time), nrow=0)
    
    ## for each ind
    for (ind in 1 : length(rownames(dataMatrix[[j]])) ) {
      
      notNA             <- !is.na(dataMatrix[[j]][ind,])
      xInd              <- time[ notNA ]
      yInd              <- as.numeric(dataMatrix[[j]][ ind, notNA ])
      
      if        ( length( yInd ) ==0 ) {                                                          # special case of no individual matching 
        if(verbose){ message("No individual matching") }
        tmpGrp$curveInd                   <- NA                                                   ## Signal that no IND exist in this subgroup
        groupData.in                      <- NA
        groupData.pred                    <- NA
      } else if ( length( yInd ) < 4 ) {                                                          # no spline under 4 tp         -  n = #{unique x} = 4
        if(verbose){ message(rownames(dataMatrix[[j]][ind,])," - #tp < 4") }
        tmpGrp$rejectedInd[[ind]]         <- dataMatrix[[j]][ind,]
        names(tmpGrp$rejectedInd)[[ind]]  <- rownames(dataMatrix[[j]][ind,])
      } else if ( length( yInd ) < df ) {                                                         # no spline if less tp than df  -  1 < df <= n
        if(verbose){ message(rownames(dataMatrix[[j]][ind,]), " - #tp(", length( yInd ), ") < df(", df, ")") }
        tmpGrp$rejectedInd[[ind]]         <- dataMatrix[[j]][ind,]
        names(tmpGrp$rejectedInd)[[ind]]  <- rownames(dataMatrix[[j]][ind,])
      } else {
        tmpGrp$curveInd[[ind]]            <- stats::smooth.spline(xInd, yInd, df=df)
        names(tmpGrp$curveInd)[[ind]]     <- rownames(dataMatrix[[j]][ind,])
        # input values
        ptX.in                            <- c(ptX.in, xInd)
        ptY.in                            <- c(ptY.in, yInd)
        groupData.in                      <- rbind(groupData.in, dataMatrix[[j]][ind,])
        # predicted values from Ind spline
        ind.pred                          <- stats::predict( object=tmpGrp$curveInd[[ind]], x=time)
        ptX.pred                          <- c(ptX.pred, ind.pred$x)
        ptY.pred                          <- c(ptY.pred, ind.pred$y)
        groupData.pred                    <- rbind(groupData.pred, ind.pred$y)
      }
    }
    if(is.logical(groupData.in)) {                                                                  # if the group is empty
      if(verbose){ message("No individual left matching 1") }
      tmpGrp$curveInd                     <- NA                                                     ## Signal that no IND exist in this subgroup
      groupData.in                        <- NA
      groupData.pred                      <- NA
    } else {
      if( is.data.frame(groupData.in) ) {                                                           # if it wasn't already detected/taken care of
        if( dim(groupData.in)[1]==0 ) {                                                             # case of no individuals matching
          if(verbose){ message("No individual left matching 2") }
          tmpGrp$curveInd                 <- NA                                                     ## Signal that no IND exist in this subgroup
          groupData.in                    <- NA
          groupData.pred                  <- NA
        }
      }
    }
    ## groupMeanCurve
    if( is.data.frame(groupData.in) ) {                                                           ## is a data.frame only if not empty (checked before)
      meanVal                           <- colMeans(groupData.pred, na.rm=TRUE)
      notNA                             <- !is.na(meanVal)
      if( sum(notNA) >= df ) {
        tmpGrp$groupMeanCurve           <- stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) )
      } else {
        tmpGrp$groupMeanCurve           <- NA
      }      
      tmpGrp$point.in                   <- data.frame(x=ptX.in,   y=ptY.in)
      tmpGrp$point.pred                 <- data.frame(x=ptX.pred, y=ptY.pred)
      tmpGrp$groupData.in               <- data.frame(groupData.in)
      colnames(tmpGrp$groupData.in)     <- time
      rownames(tmpGrp$groupData.in)     <- rownames(groupData.in)
      tmpGrp$groupData.pred             <- data.frame(groupData.pred)
      colnames(tmpGrp$groupData.pred)   <- time
      rownames(tmpGrp$groupData.pred)   <- rownames(groupData.in)
      SANTAObj$general$cleanData.in    <- rbind(SANTAObj$general$cleanData.in,  tmpGrp$groupData.in)
      SANTAObj$general$cleanData.pred  <- rbind(SANTAObj$general$cleanData.pred,tmpGrp$groupData.pred)
    } else {
      tmpGrp$groupMeanCurve             <- NA
      tmpGrp$point.in                   <- NA
      tmpGrp$point.pred                 <- NA
      tmpGrp$groupData.in               <- NA
      tmpGrp$groupData.pred             <- NA
    }
    SANTAObj$groups[[j]]               <- tmpGrp
  }
  names(SANTAObj$groups) <- names(dataMatrix)
  
  ## general mean curve
  meanVal                 <- colMeans(SANTAObj$general$cleanData.pred, na.rm=TRUE)
  notNA                   <- !is.na(meanVal)
  if( sum(notNA) >= df ) {
    SANTAObj$general$meanCurve  <- stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) )
  } else {
    SANTAObj$general$meanCurve  <- NA
  }
  
  return(SANTAObj)
}

Try the santaR package in your browser

Any scripts or data that you put into this service are public.

santaR documentation built on May 2, 2019, 3:26 a.m.