R/TcGSA.LR.R
In TcGSA: Time-Course Gene Set Analysis

Documented in print.TcGSA TcGSA.LR

#'Computing the Likelihood Ratios for the Gene Sets under Scrutiny
#'
#'This function computes the Likelihood Ratios for the gene sets under
#'scrutiny, as well as estimations of genes dynamics inside those gene sets
#'through mixed models.
#'
#'This Time-course Gene Set Analysis aims at identifying gene sets that are not
#'stable over time, either homogeneously or heterogeneously (see \emph{Hejblum
#'et al, 2012})in terms of their probes.  And when the argument
#'\code{separateSubjects} is \code{TRUE}, instead of identifying gene sets that
#'have a significant trend over time, \emph{TcGSA} identifies gene sets that
#'have significantly different trends over time depending on \code{Subjects}.
#'
#'@aliases TcGSA.LR print.TcGSA.LR
#'
#'@param expr 
#'a matrix or dataframe of gene expression.  Its dimension are
#'\eqn{n}x\eqn{p}, with the \eqn{p} samples in column and the \eqn{n} genes in
#'row.
#'
#'@param gmt 
#'a \bold{gmt} object containing the gene sets definition.  See
#'\code{\link[GSA:GSA.read.gmt]{GSA.read.gmt}} and definition on 
#'\href{https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}{www.software.broadinstitute.org}.
#'
#'@param design
#'a matrix or dataframe containing the experimental variables that used in the model,
#'namely \code{subject_name}, \code{time_name}, and \code{covariates_fixed} 
#'and \code{time_covariates} if applicable.  Its dimension are \eqn{p}x\eqn{m} 
#'and its row are is in the same order as the columns of \code{expr}.
#'
#'@param subject_name
#'the name of the factor variable from \code{design} that contains the information on 
#'the repetition units used in the mixed model, such as the patient identifiers for instance.  
#'Default is \code{'Patient_ID'}.  See Details.
#'
#'@param time_name
#'the name of a numeric variable from \code{design} that contains 
#'the information on the time replicates (the time points at which gene 
#'expression was measured).  Default is \code{'TimePoint'}.  See Details.
#'
#'@param crossedRandom
#'logical flag indicating whether the random effects of the subjects and of the time points
#'should be modeled as one crossed random effect or as two separated random effects.  
#'Default is \code{FALSE}. See details.
#'
#'@param covariates_fixed
#'a character vector with the names of numeric or factor variables from the \code{design} 
#'matrix that should appear as fixed effects in the model.  See details.
#'Default is \code{""}, which corresponds to no covariates in the model.
#'
#'@param time_covariates
#'a character vector with the names of numeric or factor variables from the \code{design} 
#'matrix that should appear as fixed effects interaction with the \code{time_name} 
#'variable in the model.  See details.  Default is \code{""}, which corresponds 
#'to no covariates in the model.
#'
#'
#'@param time_func 
#'the form of the time trend. Can be either one of \code{"linear"},
#'\code{"cubic"}, \code{"splines"} or specified by the user, or the column name of 
#'a factor variable from \code{design}. If specified by the user, 
#'it must be as an expression using only names of variables from the \code{design} matrix 
#'with only the three following operators: \code{+}, \code{*}, \code{/} . 
#'The \code{"splines"} form corresponds to the natural cubic B-splines 
#'(see also \code{\link[splines:ns]{ns}}).  If there are only a few timepoints, 
#'a \code{"linear"} form should be sufficient. Otherwise, the \code{"cubic"} form is 
#'more parsimonious than the \code{"splines"} form, and should be sufficiently flexible.
#'If the column name of a factor variable from \code{design} is supplied, 
#'then time is considered as discrete in the analysis.
#'If the user specify a formula using column names from design, both factor and numeric
#'variables can be used.
#'
#'@param minGSsize 
#'the minimum number of genes in a gene set.  If there are
#'less genes than this number in one of the gene sets under scrutiny, the
#'Likelihood Ratio of this gene set is not computed (the mixed model are not
#'fitted). Default is \code{10} genes as the minimum.
#'
#'@param maxGSsize 
#'the maximum number of genes in a gene set.  If there are
#'more genes than this number in one of the gene sets under scrutiny, the
#'Likelihood Ratio of this gene set is not computed (the mixed model are not
#'fitted).  This is to avoid very long computation times.  Default is
#'\code{500} genes as the maximum.
#'
#'@param group_name 
#'in the case of several treatment groups, the name of a factor variable 
#'from the \code{design} matrix.  It indicates to which treatment group each sample
#' belongs to.  Default is \code{""}, which means that there is only one 
#' treatment group.  See Details.
#'
#'@param separateSubjects
#'logical flag indicating that the analysis identifies
#'gene sets that discriminates patients rather than gene sets than have a
#'significant trend over time.  Default is \code{FALSE}.  See Details.
#'
#'@return \code{TcGSA.LR} returns a \code{tcgsa} object, which is a list with
#'the 5 following elements:
#'\itemize{
	#'\item fit a data frame that contains the 3 following variables:
	#'\itemize{ 
	#'\item \code{LR}: the likelihood ratio between the model under the
	#'null hypothesis and the model under the alternative hypothesis.  
	#'\item
	#'\code{CVG_H0}: convergence status of the model under the null hypothesis.
	#'\item \code{CVG_H1}: convergence status of the model under the alternative
	#'hypothesis.
#'}
#'\item \code{time_func}: a character string passing along the value of the
#'\code{time_func} argument used in the call.
#'\item \code{GeneSets_gmt}: a \code{gmt} object passing along the value of the
#'\code{gmt} argument used in the call.
#'\item \code{group.var}: a factor passing along the \code{group_name} variable
#'from the \code{design} matrix.
#'\item \code{separateSubjects}: a logical flag passing along the value of the
#'\code{separateSubjects} argument used in the call.
#'\item \code{Estimations}: a list of 3 dimensions arrays.  Each element of the
#'list (i.e. each array) corresponds to the estimations of gene expression
#'dynamics for each of the gene sets under scrutiny (obtained from mixed
#'models).  The first dimension of those arrays is the genes included in the
#'concerned gene set, the second dimension is the \code{Patient_ID}, and the
#'third dimension is the \code{TimePoint}.  The values inside those arrays are
#'estimated gene expressions.
#'\item \code{time_DF}: the degree of freedom of the natural splines functions
#'}
#'
#'@author Boris P. Hejblum
#'
#'@seealso \code{\link{summary.TcGSA}}, \code{\link{plot.TcGSA}}, 
#'and \code{\link{TcGSA.LR.parallel}} for an implementation using 
#'parallel computing
#'
#'@references Hejblum BP, Skinner J, Thiebaut R, (2015) 
#'Time-Course Gene Set Analysis for Longitudinal Gene Expression Data. 
#'\emph{PLOS Comput. Biol.} 11(6):e1004310.
#'doi: 10.1371/journal.pcbi.1004310
#'
#'@importFrom GSA GSA.read.gmt
#'
#'@importFrom lme4 lmer
#'
#'@importFrom stats as.formula deviance fitted
#'
#'@export TcGSA.LR
#'
#'@examples
#'
#'if(interactive()){
#'data(data_simu_TcGSA)
#'
#'tcgsa_sim_1grp <- TcGSA.LR(expr=expr_1grp, gmt=gmt_sim, design=design, 
#'                           subject_name="Patient_ID", time_name="TimePoint",
#'                           time_func="linear", crossedRandom=FALSE)
#'tcgsa_sim_1grp
#'summary(tcgsa_sim_1grp)
#'
#'plot(x=tcgsa_sim_1grp, expr=expr_1grp, 
#'     Subject_ID=design$Patient_ID, TimePoint=design$TimePoint,
#'     baseline=1, 
#'     B=100,
#'     time_unit="H"
#'     )
#'        
#'tcgsa_sim_2grp <- TcGSA.LR(expr=expr_2grp, gmt=gmt_sim, design=design,
#'                           subject_name="Patient_ID", time_name="TimePoint",
#'                           time_func="linear", crossedRandom=FALSE, 
#'                           group_name="group.var")
#'tcgsa_sim_2grp
#'
#'}

TcGSA.LR <-
function(expr, gmt, design, subject_name="Patient_ID", time_name="TimePoint", crossedRandom=FALSE,
				 covariates_fixed="", time_covariates="",
				 time_func = "linear", group_name="", separateSubjects=FALSE,
				 minGSsize=10, maxGSsize=500){
  
  #DEBUG: data(data_simu_TcGSA);
  #expr=expr_1grp; Patient_ID=design$Patient_ID; TimePoint=design$TimePoint; gmt=gmt_sim; time_func="linear"; gs=2; maxGSsize=500; group_name="";
  #for (i in which(tcGSA_cubic_modules$CVG_H1>6)){
  #  print(paste(i, ":", length(intersect(gmt$genesets[[i]], rownames(expr)))))
  #}
	
  if(group_name!="" && separateSubjects){
    stop("'separateSubjects' is TRUE while 'group_name' is not \"\".\n This is an attempt to separate subjects in a multiple group setting.\n This is not handled by the TcGSA.LR function.\n\n")
  }
	
	if(is.matrix(expr)){
		if(mode(expr) !="numeric"){
		stop("'expr' is not numeric. Don't know how to deal with non-numerical expressions.")
		}
	}else if(is.data.frame(expr)){
		if(any(lapply(expr, mode) !="numeric")){
			stop("'expr' is not numeric. Don't know how to deal with non-numerical expressions.")
		}
	}else{
		stop("'expr' is neither a matrix nor a data.frame.")
	}

   
  LR <- numeric(length(gmt$genesets))
  CVG_H0 <- numeric(length(gmt$genesets))
  CVG_H1 <- numeric(length(gmt$genesets))
  estim_expr <- list()
  
  my_formul <- TcGSA.formula(design=design, subject_name=subject_name, time_name=time_name,  
  													 covariates_fixed=covariates_fixed, time_covariates=time_covariates, group_name=group_name,
  													 separateSubjects=separateSubjects, crossedRandom=crossedRandom,
  													 time_func=time_func)
  time_DF <- my_formul[["time_DF"]]
  for (gs in 1:length(gmt$geneset.names)){
    probes <- intersect(gmt$genesets[[gs]], rownames(expr))
    if(length(probes)>0 && length(probes)<=maxGSsize && length(probes)>=minGSsize){                                                       
    	expr_temp <- t(expr[probes, ])
    	rownames(expr_temp) <- NULL
    	data_lme  <- TcGSA.dataLME(expr=expr_temp, design=design, subject_name=subject_name, time_name=time_name, 
    														 covariates_fixed=covariates_fixed, time_covariates=time_covariates,
    														 group_name=group_name, time_func=time_func)
      if(length(levels(data_lme$probe))>1){
          lmm_H0 <- tryCatch(lmer(formula =my_formul[["H0"]]["reg"], REML=FALSE, data=data_lme),
                   error=function(e){NULL})
          lmm_H1 <- tryCatch(lmer(formula =my_formul[["H1"]]["reg"], REML=FALSE, data=data_lme),
                   error=function(e){NULL})
      }else{
    		lmm_H0 <- tryCatch(lmer(formula =my_formul[["H0"]]["1probe"], REML=FALSE, data=data_lme),
    											 error=function(e){NULL})
    		lmm_H1 <- tryCatch(lmer(formula =my_formul[["H1"]]["1probe"], REML=FALSE, data=data_lme),
    											 error=function(e){NULL})
      }

      if (!is.null(lmm_H0) & !is.null(lmm_H1)) {
        LR[gs] <- stats::deviance(lmm_H0, REML=FALSE) - stats::deviance(lmm_H1, REML=FALSE)
  	    CVG_H0[gs] <- lmm_H0@optinfo[["conv"]]$opt
  	    CVG_H1[gs] <- lmm_H1@optinfo[["conv"]]$opt
  	    #handling NAs in prediction
  	    samples_containingNA <- which(is.na(data_lme), arr.ind = TRUE)[, "row"]
  	    if(length(samples_containingNA)>0){
  	    	fit_vec <-  stats::fitted(lmm_H1)
  	    	fit_temp <- data.frame("id" = names(fit_vec), "fitted" = fit_vec)
  	    	estims <- merge(cbind.data.frame("id" = rownames(data_lme), data_lme), 
  	    					fit_temp,
  	    					by="id", all.x=TRUE)
  	    }else{
  	    	estims <- cbind.data.frame(data_lme, "fitted"=stats::fitted(lmm_H1))
  	    }
        estims_tab <- acast(data=estims, formula = stats::as.formula(paste("probe", subject_name, "t1", sep="~")), value.var="fitted")
        # drop = FALSE by default, which means that missing combination will be kept in the estims_tab and filled with NA
        dimnames(estims_tab)[[3]] <- as.numeric(dimnames(estims_tab)[[3]])*10
        estim_expr[[gs]] <- estims_tab
      }else {
        LR[gs] <- NA
        CVG_H0[gs] <- NA
        CVG_H1[gs] <- NA
        
        
        # CONVERGENCE DIAGNOSTICS IN lme4 v1.1-8 (from Nelder Mead optimizer)
        # -3: "nm_forced"
        # -2: "cannot generate a feasible simplex"
        # -1: "initial x is not feasible"
        #  0: "converged"
        
        estims <- cbind.data.frame(data_lme, "fitted"=NA)
        estims_tab <- acast(data=estims, formula = stats::as.formula(paste("probe", subject_name, "t1", sep="~")), value.var="fitted")
        if(is.numeric(design[, time_name])){
        	dimnames(estims_tab)[[3]] <- as.numeric(dimnames(estims_tab)[[3]])*10
        }
        estim_expr[[gs]] <- estims_tab
        warning("Unable to fit the mixed models for this gene set\n")
      }
    	
    }else{
	    LR[gs] <- NA
	    CVG_H0[gs] <- NA
	    CVG_H1[gs] <- NA
	    
	    estim_expr[[gs]] <- NA
	    warning("The size of the gene set ",  gmt$geneset.names[[gs]], "is problematic (too many or too few genes)\n")
	}
    message(paste(gs,"/", length(gmt$geneset.names)," gene sets analyzed\n", sep=""))
  }
  
  if(group_name==""){
  	gv <- NULL
  }else{
  	gv <- design[,group_name]
  }
  
  tcgsa <- list("fit"=as.data.frame(cbind(LR, CVG_H0, CVG_H1)), "time_func"=time_func, "GeneSets_gmt"=gmt, 
  							"group.var"=gv, "separateSubjects"=separateSubjects, "Estimations"=estim_expr, 
  							"time_DF"=time_DF
  							)
  class(tcgsa) <- "TcGSA"
  return(tcgsa)
}


#'@rdname TcGSA.LR
#'
#'@param x an object of class '\code{TcGSA}'.
#'@param ...  further arguments passed to or from other methods.
#'
#'@method print TcGSA
#'
#'@export

print.TcGSA <- function(x, ...){
	cat("\t\tA TcGSA object")
	cat("\n")
	cat("Form of the time trend:")
	cat("\n\t")
	cat(x[["time_func"]])
	cat("\n")
	cat("Number of treatment groups:")
	cat("\n\t")
	cat(ifelse(is.null(x[["group.var"]]),1,length(levels(x[["group.var"]]))))
	cat("\n")
	if(x[["separateSubjects"]]){
		cat("Number of gene sets tested for discriminating time trends among patients:") 
	}else{
		cat("Number of gene sets tested for significant time trend:")
	}
	cat("\n\t")
	cat(length(x[["GeneSets_gmt"]]$geneset.name))
}