R/Impulse_DE_fin.R

Defines functions impulse_DE DE_analysis generate_background plot_impulse impulse_fit cluster_genes_for_impulse two_impulses calc_impulse annotation_preparation

Documented in calc_impulse impulse_DE plot_impulse

################################################################################
########################     Impulse DE package     ############################
################################################################################

### Version:  0.99.0
### Date:     2016-06-02
### Author:   Jil Sander

################################################################################


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#                             Functions to call                                #
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#



################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++     Annotation preparation    +++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

#Prepares the annotation table for internal use

annotation_preparation <- function(data_annotation, data_table = NULL,
  colname_time = NULL, colname_condition = NULL, control_timecourse = FALSE,
  control_name = NULL, case_name = NULL){

   if (is.null(data_table)) {
      stop("ERROR: table of data values must be specified")
   }
   if ((nrow(data_annotation) != ncol(data_table)) ||
      (FALSE %in% (sort(rownames(data_annotation)) ==
            sort(colnames(data_table))))) {
                 stop("ERROR: column names of data table must be the same as
                    row names of annotation table!")
   }
   if (is.null(colname_time)) {
      stop("ERROR: name of the column containing the timepoints must be
           specified as character!")
   }
   if (is.null(colname_condition)) {
      stop("ERROR: name of the column containing the condition(s) must be
           specified as character!")
   }
   if (is.numeric(data_annotation[,colname_time]) == FALSE) {
      stop("ERROR: time variable must be numeric and is not allowed to
           contain characters!")
   }
   if (control_timecourse == FALSE &
      length(summary(as.factor(data_annotation[,colname_condition]))) >= 2) {
        stop("ERROR: if you have only one time course do not provide more than
             one condition!")
   }
   if (control_timecourse == TRUE & (length(summary(as.factor(data_annotation[,
        colname_condition]))) > 2) & is.null(case_name)) {
            stop("ERROR: please specify case and control names of interest
                 since you provide more than three conditions!")
   }
   annot <- data_annotation[,c(colname_time,colname_condition)]
   annot <- annot[colnames(data_table),]
   colnames(annot) <- c("Time","Condition")
   annot$Condition <- as.character(annot$Condition)

   if (length(summary(as.factor(data_annotation[,colname_condition]))) > 2) {
      print(paste("Case condition: ", case_name, sep  = ""))
   } else {
      print(paste("Case condition: ",  data_annotation[!(data_annotation[,
        colname_condition] %in% control_name),colname_condition][1], sep = ""))
   }
   if (control_timecourse == TRUE) {
      print(paste("Control condition: ", control_name, sep  = ""))
   }
   if (control_timecourse == TRUE & (length(summary(as.factor(data_annotation[,
        colname_condition]))) > 2)) {
             annot <- annot[annot$Condition %in% c(control_name, case_name),]
   }
   return(annot)
}

################################################################################

#' Impulse model value prediction
#'
#' Calculates impulse model values for given timepoints and predicted
#' impulse parameters.
#' @aliases calc_impulse
#' @param theta numerical vector of impulse parameters with the order
#' beta, h0, h1, h2, t1, t2.
#' @param timepoints numercial vector of time point(s).
#' @return The predicted impulse model values for the given time point(s).
#' @seealso \code{\link{impulse_DE}}, \code{\link{plot_impulse}}.
#' @author Jil Sander
#' @references Chechik, G. and Koller, D. (2009) Timing of Gene Expression
#' Responses to Envi-ronmental Changes. J. Comput. Biol., 16, 279-290.
#' @examples
#' #' theta vector in the order beta, h0, h1, h2, t1, t2
#' theta <- c(9.9, 14.7, 17.0, 16.9, -0.1, 37.0)
#' #' time points
#' timepoints <- c(0, 2, 4, 6, 8, 18, 24, 32, 48, 72)
#' #' calculate impulse values
#' impulse_values <- calc_impulse(theta, timepoints)
#' @export
calc_impulse <- function(theta,timepoints){
  beta1 = theta[1]
  h0 = theta[2]
  h1 = theta[3]
  h2 = theta[4]
  t1 = theta[5]
  t2 = theta[6]
  res = NULL
   for (i in 1:length(timepoints)) {
    res[i] = (1/h1) * (h0 + (h1 - h0) * (1/(1 + exp(-beta1*(timepoints[i] - t1))))) *
        (h2 + (h1 - h2) * (1/(1 + exp(beta1*(timepoints[i] - t2)))))
  }
  res = unlist(res)
  return(res)
}

################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++     Calculation of input for Optimization    ++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### specifies the function of squared errors which is minimized to fit
### parameters
two_impulses <- function(theta,x_vec,y_vec){
  beta1 = theta[1]
  h0 = theta[2]
  h1 = theta[3]
  h2 = theta[4]
  t1 = theta[5]
  t2 = theta[6]
   f = sum((unlist(lapply(x_vec, function(x) {(1/h1) * (h0 + (h1 - h0) *
        (1/(1 + exp(-beta1*(x - t1))))) * (h2 + (h1 - h2) *
        (1/(1 + exp(beta1*(x - t2)))))})) - y_vec)^2)
   return(f)
}

################################################################################

## compile simple functions to make them quicker
calc_impulse_comp <- cmpfun(calc_impulse)
two_impulses_comp <- cmpfun(two_impulses)


################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++++++     Clustering    +++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### performs 2-step clustering on expression data or background data
### 1. Step: clustering genes based on correlation to compare the shape
### 2. Step: split the clusters from step 1 into finer clusters based on the
###          eucledian distance to compare the magnitude
cluster_genes_for_impulse <- function(data_table, data_annotation,
    control_timecourse = FALSE, control_name = NULL, plot_clusters = FALSE,
    no_of_clusters = NULL, n_genes = NULL, n_device = TRUE){

  #' @import amap

  corr_thres = 0.8
  eucl_thres = 1.5

  ### assume only one timecourse as default and perform only 1 run
  results = list()
  runs = 1
  label = "case"

  ### perform 3 runs if control timecourse data is present
  if (control_timecourse == TRUE & is.null(control_name) == FALSE) {
     runs = 3
     label = c("combined","case","control")
  }
  ### clustering for different runs
  for (c_runs in 1:runs) {
    if (c_runs == 1) {                # combined or only case if no control
      print(paste("Clustering of ",label[c_runs]," data set", sep = ""))
      dat_pre = data_table
      dat_pre = dat_pre[which(apply(dat_pre,1,function(x){100*
              sd(x)/(mean(x))}) >= 2.5),]
      dat = t(apply(dat_pre,1,function(y){unlist(lapply(as.list(unique(
          as.numeric(as.character(data_annotation$Time)))),function(x){
          mean(y[data_annotation$Time == x])}))}))
      colnames(dat) <- unique(as.numeric(as.character(data_annotation$Time)))
    } else if(c_runs == 2) {         # case
      print(paste("Clustering of ",label[c_runs]," data set", sep = ""))
      dat_pre = data_table[,!(data_annotation$Condition %in% control_name)]
      dat_pre = dat_pre[which(apply(dat_pre,1,function(x){100*
              sd(x)/(mean(x))}) >= 2.5),]
      dat = t(apply(dat_pre,1,function(y){unlist(lapply(as.list(unique(
          as.numeric(as.character(data_annotation[!(data_annotation$Condition
          %in% control_name),"Time"])))),function(x){mean(
          y[data_annotation[!(data_annotation$Condition %in% control_name),
          "Time"] == x])}))}))
      colnames(dat) <- unique(as.numeric(as.character(data_annotation[
          !(data_annotation$Condition %in% control_name),"Time"])))
    } else {                        # control
      print(paste("Clustering of ",label[c_runs]," data set", sep = ""))
      dat_pre = data_table[,data_annotation$Condition %in% control_name]
      dat_pre = dat_pre[which(apply(dat_pre,1,function(x){100*
              sd(x)/(mean(x))}) >= 2.5),]
      dat = t(apply(dat_pre,1,function(y){unlist(lapply(as.list(unique(
          as.numeric(as.character(data_annotation[data_annotation$Condition
          %in% control_name,"Time"])))),function(x){mean(
          y[data_annotation[data_annotation$Condition %in% control_name,
         "Time"] == x])}))}))
      colnames(dat) <- unique(as.numeric(as.character(data_annotation[
          (data_annotation$Condition %in% control_name),"Time"])))
    }
    print(paste("- ", (nrow(data_table) - nrow(dat_pre)),
            " genes were excluded due to very low variation", sep = ""))

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~   Step 1: clustering based on correlation   ~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    max_n_pre_clus = 25   # restrict maximum number of pre-clusters to 25
    ### z-transformation of data for clustering
    dat_ztrans <- t(scale(t(dat)))

    ### ---> clustering of expression data
    if (is.null(no_of_clusters)) {
      file_part = "genes"
      n_clus = 0
      for (i in 1:max_n_pre_clus) {

        kmeans_clus <- Kmeans(dat_ztrans,i,200,method = "correlation")

    ### accept clustering if mean correlation between genes of the cluster and
    ### centroid of the cluster is higher than 'corr_thres' (default: 0.75)
        for (j in 1:i) {
          tmp <- dat_ztrans[kmeans_clus$cluster %in% j,]
          indt1 <- NULL
          if (i > 1 & sort(summary(as.factor(kmeans_clus$cluster)))[1]  < 10) {
               indt1 <- "I"
               n_clus = i - 1
               kmeans_clus <- kmeans_clus_pre
               break
          }
          if (is.null(nrow(tmp))) {    # if cluster contains only 1 gene
             average_pearson <- cor(tmp, kmeans_clus$centers[j,])
          } else {     # if cluster contains > 1 genes
             average_pearson <- mean(apply(tmp,1,function(x){cor(x,
                    kmeans_clus$centers[j,])}))
          }
        ### if one cluster does not fufill the correlation condition leave loop,
        ### increase numbers of potential clusters and try again
          if (average_pearson < corr_thres) {
            break
          }
          ### accept clustering and leave loop if all clusters fufill condition
          if (j == i) {
            n_clus = i
            break
          }
        }
        if (is.null(indt1) == FALSE) {
           break
        }
        if (n_clus != 0) {
          break
        ### if maximum number of allowed clusters is reached accept this number
        } else if ((n_clus == 0) & (i == max_n_pre_clus)) {
          n_clus = i
          break
        }
        kmeans_clus_pre = kmeans_clus
      }
      print(paste("--- Number of correlation-based pre-clusters: ",
            n_clus,sep = ""))

    ### clustering of background data based on clusters from expression data
    } else {
      file_part = "bg"
      n_clus <- max(round(no_of_clusters[[c_runs*2 - 1]]*nrow(data_table)/
            n_genes),1)
      kmeans_clus <- Kmeans(dat_ztrans,n_clus,200,method = "correlation")
      n_fine_clust_rand <- round(sample( no_of_clusters[[c_runs*2]] ,n_clus,
            replace = TRUE)*nrow(data_table)/n_genes)
      n_fine_clust_rand[n_fine_clust_rand < 1] = 1
    }
    kmeans_pre_clus_corr <- kmeans_clus
    n_pre_clus <- n_clus
    kmeans_clus_final <- kmeans_pre_clus_corr$cluster
    print(paste("------ Number of genes in pre-clusters: ",
      paste(paste("C",names(summary(as.factor(kmeans_clus_final))),": ",
      summary(as.factor(kmeans_clus_final)), sep = ""),collapse = ", "),sep = ""))


   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~   Step 2: clustering based on eucledian distance   ~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#


    n_fine_clusters <- NULL
    cluster_means <-  NULL
    # restrict maximum number of fine clusters to 5 per pre-cluster
    max_n_fine_clus <- 10
    kmeans_clus <- NULL

     tempo2 <- do.call(c,as.data.frame(dat))
     tempo3 <- (tempo2 - median(tempo2))/mad(tempo2)
     dat_median_ztrans <- matrix(tempo3, nrow(dat), ncol(dat))
     rownames(dat_median_ztrans) = rownames(dat)
     colnames(dat_median_ztrans) = colnames(dat)

    ### cluster each pre-cluster into fine clusters
    for (cl in 1:n_pre_clus) {


     ### ---> clustering of expression data
       if (is.null(no_of_clusters)) {
        if (length(kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster %in%
                cl]) >  max_n_fine_clus) {
            n_clus = 0

            for (i in 1:max_n_fine_clus) {

              kmeans_clus <- kmeans(dat_median_ztrans[names(
                  kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster
                  %in% cl]),],i,200)
            ### accept clustering if mean eucledian distance between genes of
            ### the cluster and centroid of the cluster is lower than
            ### 'eucl_thres' (default: 5)
              for (j in 1:i) {
              indt2 <- NULL
              tmp <- dat_median_ztrans[names(kmeans_pre_clus_corr$cluster[
                  kmeans_pre_clus_corr$cluster %in% cl]),][kmeans_clus$cluster
                  %in% j,]
               # if smallest cluster would have less than 5 genes take previous
               # number of clusters
              if (i > 1 & sort(summary(as.factor(kmeans_clus$cluster)))[1] < 5) {
                  indt2 = "I"
                  n_clus = i - 1
                  kmeans_clus <- kmeans_clus_pre
                  break
              }

                if (is.null(nrow(tmp))) {    # if cluster contains only 1 gene
                   average_eucledian <- 0
                } else {  # if cluster contains > 1 genes
                   average_eucledian <- sd(apply(tmp,1,function(x){dist(rbind(x,
                        kmeans_clus$centers[j,]))}))
                }

                ### if one cluster does not fufill the eucledian condition leave
                ### loop,increase numbers of potential clusters and try again
                if (average_eucledian > eucl_thres) {
                  break
                }
                ### accept clustering and leave loop if all clusters fulfill
                ### condition
                if (j == i) {
                  n_clus = i
                  break
                }
              }
              if (is.null(indt2) == FALSE) {
                 break
              }
              if (n_clus != 0) {
                break
              ### if maximum number of allowed clusters is reached accept this
              ### number
              } else if ((n_clus == 0) & (i == max_n_fine_clus)) {
                n_clus = i
                break
              }
              kmeans_clus_pre = kmeans_clus
            }
            n_fine_clusters <- c(n_fine_clusters,n_clus)
          } else {
            n_clus = 1
            n_fine_clusters <- c(n_fine_clusters,n_clus)
            kmeans_clus$cluster <- rep(1,length(kmeans_pre_clus_corr$cluster[
                kmeans_pre_clus_corr$cluster %in% cl]))
            names(kmeans_clus$cluster) <- names(kmeans_pre_clus_corr$cluster[
                kmeans_pre_clus_corr$cluster %in% cl])
          }
        ### clustering of background data based on clusters from expression data
        } else {
          if (floor(length(kmeans_pre_clus_corr$cluster[
                kmeans_pre_clus_corr$cluster %in% cl]) / n_fine_clust_rand[cl])
                >= 2) {
            n_clus <- n_fine_clust_rand[cl]
            kmeans_clus <- kmeans(dat_median_ztrans[names(
                  kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster
                  %in% cl]),], n_clus,200)
          } else if (n_fine_clust_rand[cl] > 1 & floor(length(
                kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster %in%
                cl]) / (n_fine_clust_rand[cl] - 1)) >= 2) {
            n_clus <- n_fine_clust_rand[cl] - 1
            kmeans_clus <- kmeans(dat_median_ztrans[names(
                kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster
                %in% cl]),], n_clus,200)
          } else {
            n_clus = 1
            kmeans_clus$cluster <- rep(1,length(
                kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster
                %in% cl]))
            names(kmeans_clus$cluster) <- names(kmeans_pre_clus_corr$cluster[
                kmeans_pre_clus_corr$cluster %in% cl])
          }
          n_fine_clusters <- c(n_fine_clusters,n_clus)
        }

#      ### calculate mean values for each final cluster
      for (i in 1:n_clus) {
         tmp <- dat_pre[names(kmeans_clus$cluster)[kmeans_clus$cluster %in% i],]
      if (is.null(dim(tmp))) {       # if cluster contains only 1 gene
          cluster_means <- rbind(cluster_means,tmp)
        } else { # if cluster contains > 1 genes
          cluster_means <- rbind(cluster_means,colMeans(tmp))
        }
      }
      ### overwrite pre-clusters by fine clusters to have cluster information
      ### for each gene
      if (cl == 1) {
        kmeans_clus_final[names(kmeans_pre_clus_corr$cluster[
            kmeans_pre_clus_corr$cluster %in% cl])] <-  kmeans_clus$cluster
      } else {
        kmeans_clus_final[names(kmeans_pre_clus_corr$cluster[
            kmeans_pre_clus_corr$cluster %in% cl])] <-  kmeans_clus$cluster +
            sum(n_fine_clusters[1:(cl - 1)])
      }
    }
    rownames(cluster_means)  <- 1:sum(n_fine_clusters)
    n_clus = sum(n_fine_clusters)

    print(paste("--- Final number of clusters: ",n_clus,sep = ""))
    print(paste("------ Number of genes in final clusters: ",
      paste(paste("C",names(summary(as.factor(kmeans_clus_final))),": ",
      summary(as.factor(kmeans_clus_final)), sep = ""),collapse = ", "),sep = ""))


   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~~~~   Plot clusters if plot_clusters == TRUE ~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    if (plot_clusters == TRUE & file_part != "bg") {
      for (i in 1:n_clus) {

        if (i == 1){
            if(n_device == TRUE){
                dev.new()
            }
            if (n_clus == 1) {
                par(mfrow = c(1,1))
            }
            else if (n_clus == 2) {
                par(mfrow = c(1,2))
            } else if (n_clus <= 4) {
                par(mfrow = c(2,2))
            } else if (n_clus <= 12) {
                par(mfrow = c(3,4))
            } else if (n_clus > 12) {
                par(mfrow = c(4,5))
            }
        } else if ((i - 1) %% 20 == 0) {
            if (n_device == TRUE) {
                dev.new()
            }
            par(mfrow = c(4,5))
        }
        ### expression values of genes in cluster
        tmp <- dat_ztrans[kmeans_clus_final %in% i,]
        # if cluster contains only 1 gene
        if (length(kmeans_clus_final[kmeans_clus_final %in% i]) == 1) {
          n_g = 1
        } else {   # if cluster contains > 1 genes
          n_g = nrow(tmp)
        }

        ### order gene expression data according to timepoints
        for (j in 1:n_g) {
          # if cluster contains only 1 gene
          if (length(kmeans_clus_final[kmeans_clus_final %in% i]) == 1) {
            tmp_plot = tmp
          } else {       # if cluster contains > 1 genes
            tmp_plot = tmp[j,]
          }
          ### plot clusters
          if (j == 1) {                     # if cluster contains only 1 gene
            plot(tmp_plot,type = "l",xlab = "Time points", ylab = "Z-score",
                    main = paste("Cluster ",i,", ",label[c_runs],sep = ""),
                    ylim = c(min(tmp),max(tmp)),xaxt = "n")
          axis(1, at = 1:length(colnames(tmp)), labels = colnames(tmp), las = 3)
          } else {   # if cluster contains > 1 genes
            points(colnames(tmp),tmp_plot,type = "l")
          }
        }
      }
    }
    results[[c_runs*4 - 3]] <- kmeans_clus_final
    results[[c_runs*4 - 2]] <- cluster_means
    results[[c_runs*4 - 1]] <- n_pre_clus
    results[[c_runs*4]] <- n_fine_clusters
    names(results)[c((c_runs*4 - 3):(c_runs*4))] <- paste(c("kmeans_clus",
         "cluster_means", "pre_clus","fine_clus"),label[c_runs],sep = "_")
  }

  return(results)
}


################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++     Impulse model fit    ++++++++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### fits impulse model to a timecourse dataset
### genes are suppposed to be in rows, samples in columns
### timepoints need to be specified as numbers (integers or float)
### function is splitted into 3 parts
###### 1. fitting impulse model to a single gene
###### 2. fitting impulse model to matrix of genes, which calls 1.
###### 3. prepare data and fit model by calling 2.
### This separation was chosen to be able to use 'apply' to replace for-loops

impulse_fit <- function(data_input, data_annotation, n_iter = 100,
    control_timecourse = FALSE, control_name = NULL, cluster_results = NULL,
    start_values = NULL, fit_backg = FALSE, n_proc = 4, ...){

  ### check dataset for consistency
  if (ncol(data_annotation) != 2 || FALSE %in% (colnames(data_annotation) ==
      c("Time","Condition"))) {
            stop("Please use function 'annotation_preparation' to prepare
                 the annotation table")
  }
  if (control_timecourse == TRUE & is.null(control_name)) {
     stop("Please specify the name of the control in the column 'Condition'")
  }
  if (control_timecourse == TRUE & (is.null(start_values) == FALSE)) {
    start_values <- start_values[c(1,3,5)]
  }
  if (control_timecourse == FALSE & (is.null(control_name) == FALSE)) {
     start_values <- start_values[c(1,3)]
  }

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~  Fit impulse model to gene ~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

  ### fits an impulse model to expression values of a single gene

  impulse_fit_gene_wise <- function(vec, timepoints, NPARAM, n_iters = 100,
      fit_to_clusters = FALSE, start_val = NULL, fit_bg = FALSE, ...){

      mat1 = c(0,abs(vec[2:length(vec)] - vec[1:(length(vec) - 1)]) *
            vec[2:length(vec)])
      mn_beta = 0
      mx_beta = 10
      middle_ind = which(timepoints > min(timepoints) & timepoints <
            max(timepoints))
      min_ind = which(timepoints == min(timepoints))[1]
      max_ind = which(timepoints == max(timepoints))[1]
      mx_tm = max(timepoints)
      mn_tm = min(timepoints)
      tmp_ind = which(mat1 == max(mat1))[1]
      peaktime = timepoints[tmp_ind]
      peak = min(which(timepoints == peaktime))
      beta1 = abs(log(abs((vec[peak] - vec[min_ind])/(peaktime -
            timepoints[min_ind]))));

      ### set beta to 1 if calculcated value is infinite
      if (is.finite(beta1) == FALSE || is.na(beta1)) { beta1 = 1 }

      ### define start value theta based on the gene's data
      orig_theta = c(beta1,
         vec[min_ind],                    # h0
         vec[peak],                       # h1
         vec[max_ind],                    # h2
         (peaktime - timepoints[min_ind])/2,            # t1
         peaktime + (timepoints[max_ind] - peaktime)/2) # t2
      names(orig_theta) = c("beta","h0","h1","h2","t1","t2")
      theta = orig_theta

      ### replace theta estimates of 0 by small value, because optimization
      ### function has problems with 0 as input
      rand_num <- runif(length(which(theta == 0)),0,0.0001)
      theta[names(which(theta == 0))] <- rand_num
      names(theta) = c("beta","h0","h1","h2","t1","t2")

      ### ---> fit model to clusters
      if (fit_to_clusters == TRUE & is.null(start_val)) {

          if (fit_bg == FALSE) {
            thetas <- cbind(as.numeric(theta),apply(matrix(rep(runif((n_iters -
                    1)*NPARAM)),NPARAM,(n_iters - 1)),2,
              function(x){c(mn_beta + (mx_beta - mn_beta)*x[1],
                            min(vec) + (max(vec) - min(vec))*x[2:4],
                            mn_tm + (mx_tm - mn_tm)*x[5],
                            (mn_tm + (mx_tm - mn_tm)*x[5]) + (mx_tm - (mn_tm +
                            (mx_tm - mn_tm)*x[5]))*x[6])}))
           } else {
              thetas <- apply(matrix(rep(runif((n_iters)*NPARAM)),NPARAM,
                    (n_iters)),2,
              function(x){c(mn_beta + (mx_beta - mn_beta)*x[1],
                            min(vec) + (max(vec) - min(vec))*x[2:4],
                            mn_tm + (mx_tm - mn_tm)*x[5],
                            (mn_tm + (mx_tm - mn_tm)*x[5]) + (mx_tm - (mn_tm +
                            (mx_tm - mn_tm)*x[5]))*x[6])})
           }
        tmm1 <- system.time({
        fmin_outs <- cbind(
        # fitting via quasi-Newton method (optim, "BFGS")
           apply(thetas[,1:(floor(n_iters/2))],2,function(x){unlist(optim(x,
             two_impulses, x_vec = timepoints,
             y_vec = vec,method = "BFGS")[c("par","value")])}),
        # fitting via PORT routines (nlminb)
           apply(thetas[, c(1,(floor(n_iters/2) + 1):n_iters)],2,function(x){
             unlist(nlminb(x, two_impulses, x_vec = timepoints,
             y_vec = vec)[c("par","objective")])})
           )
        })

        ### return results of 3 best fits, which will serve as start values for
        ### the fits to the genes
        pvec_and_SSE = as.vector(fmin_outs[,order(fmin_outs["value",])][,1:3])

      ### ---> fit model to genes
      } else if (fit_to_clusters == FALSE & is.null(start_val) == FALSE) {
        tmm2 <- system.time({

      ### use first 3 best hits from fit to clusters plus initial guess 'theta'
      ### as start values
        if (fit_bg == TRUE) {
          toll <- matrix(start_val,7,3)[1:NPARAM,]
        } else {
          toll <- cbind(theta,matrix(start_val,7,3)[1:NPARAM,][,1:3])
        }

        fmin_outs <- cbind(
        # fitting via quasi-Newton method (optim, "BFGS")
        apply(toll,2,
          function(x){unlist(optim(x, two_impulses, x_vec = timepoints,
          y_vec = as.numeric(vec),method = "BFGS" )[c("par","value")])}),
        # again fitting via PORT routines (nlminb)
        apply(toll,2,
          function(x){unlist(nlminb(x, two_impulses, x_vec = timepoints,
          y_vec = as.numeric(vec) )[c("par","objective")])})
        )
        })
        if (is.null(dim(fmin_outs[ ,fmin_outs["value",] == min(fmin_outs["value",
            ])])) == TRUE) {
          pvec_and_SSE = fmin_outs[ ,fmin_outs["value",] ==
                min(fmin_outs["value",])]
       } else {
          ### if two or more randomization have the same minimum Sum of
          ### Squared Errors (SSE), choose the first one
          pvec_and_SSE = fmin_outs[ ,fmin_outs["value",] ==
                min(fmin_outs["value",])][,1]
       }
      }
     return(pvec_and_SSE)
  }

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~  Fit impulse model to matrix ~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

  ### fits an impulse model to all genes of a dataset
  impulse_fit_matrix <- function(data_tab, timepoints, n_it = 100,
        ctrl_tc = FALSE, ctrl = NULL, fit_to_clus = FALSE, start_val = NULL,
        fit_bg = FALSE, n_process = 4, ...){

    NPARAM = 6
    mat1 = NULL
    res <- list()
    for (j in 2:ncol(data_tab)) {
        mat1 = cbind(mat1, abs(data_tab[,j] - data_tab[,j - 1]) * data_tab[,j])
    }

        mc <- min(detectCores() - 1, n_process)
     if (fit_bg == FALSE && nrow(data_tab) > max(2*mc,10)) {

    ## do not split if less than 10 genes
        ind_list = list()
        bord = floor(nrow(data_tab)/mc)
        for (i in 1:mc) {
           ind_list[[i]] <- ((i - 1)*bord + 1):(i*bord)
        }
        if (mc*bord < nrow(data_tab)) {
             ind_list[[mc]] <-  c(ind_list[[mc]],(mc*bord + 1):nrow(data_tab))
        }

        cl <- makeCluster(mc)
        my.env <- new.env()
        assign("data_tab", data_tab, envir = my.env)
        assign("calc_impulse_comp", calc_impulse_comp, envir = my.env)
        assign("n_it", n_it, envir = my.env)
        assign("cluster_genes_for_impulse", cluster_genes_for_impulse,
             envir = my.env)
        assign("impulse_fit", impulse_fit, envir = my.env)
        assign("two_impulses", two_impulses, envir = my.env)
        assign("impulse_fit_gene_wise",impulse_fit_gene_wise, envir = my.env)
        assign("impulse_fit_matrix",impulse_fit_matrix, envir = my.env)
        assign("timepoints", timepoints, envir = my.env)
        assign("fit_to_clus", fit_to_clus, envir = my.env)
        assign("start_val", start_val, envir = my.env)
        assign("fit_bg", fit_bg, envir = my.env)
        assign("NPARAM", NPARAM, envir = my.env)

        clusterExport(cl = cl, varlist = c("data_tab",
         "calc_impulse_comp","n_it","cluster_genes_for_impulse","impulse_fit",
         "two_impulses", "impulse_fit_gene_wise",
         "impulse_fit_matrix", "timepoints","fit_to_clus","start_val",
         "fit_bg","NPARAM"), envir = my.env)

        ### fit impulse model to each gene of matrix and get impulse parameters
        resi <- clusterApply(cl, ind_list, function(z){t(apply(data_tab[z,],1,
            function(x){impulse_fit_gene_wise(x,timepoints,
            NPARAM,n_it,fit_to_clus, start_val, fit_bg)}))})
        resmat = do.call(rbind,resi)
        resmat <- resmat[rownames(data_tab),]
        stopCluster(cl)
      } else {
      ### fit impulse model to each gene of matrix and get impulse parameters
        resmat <- t(apply(data_tab,1,function(x){impulse_fit_gene_wise(x,
            timepoints, NPARAM,n_it,fit_to_clus, start_val, fit_bg)}))
      }


    ### use obtained impulse parameters to calculate impulse fit values
    ### ---> if fit to genes
    if (fit_to_clus == FALSE) {

     colnames(resmat) <- c("beta","h0","h1","h2","t1","t2","SSE")
     if (nrow(resmat) == 1) {      # if matrix contains only one gene
       resmat2 <- as.data.frame(t(calc_impulse_comp(resmat[,1:NPARAM],
          unique(sort(timepoints)))),
          row.names = rownames(resmat))
       colnames(resmat2) = unique(sort(timepoints))
     } else {   # if matrix contains > 1 genes
        resmat2 <- t(apply(resmat[,1:NPARAM],1,function(x){calc_impulse_comp(x,
                unique(sort(timepoints)))}))
        colnames(resmat2) = unique(sort(timepoints))
     }

    ### ---> if fit to clusters
    ### more complex because results from 3 best fits need to be saved and later
    ### used for the fit to the genes
    } else {
     colnames(resmat) <- c(paste(c("beta","h0","h1","h2","t1","t2","SSE"),1,
          sep = "_"), paste(c("beta","h0","h1","h2","t1","t2","SSE"),2,sep = "_"),
          paste(c("beta","h0","h1","h2","t1","t2","SSE"),3,sep = "_"))
     resmat2 <- t(apply(resmat,1,function(x){apply(matrix(x,7,3)[1:NPARAM,],2,
          function(y){calc_impulse_comp(y,unique(sort(timepoints)))})}))
     colnames(resmat2) <- c(paste(unique(sort(timepoints)),1, sep = "_"),
          paste(unique(sort(timepoints)),2, sep = "_"),
          paste(unique(sort(timepoints)),3, sep = "_"))
    }
    res[[1]] <- resmat
    res[[2]] <- resmat2
    names(res) <- c("impulse_parameters","impulse_fits")
    return(res)
  }

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~  Prepare data for impulse model fit ~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

  ### assume only one timecourse as default and perform only 1 run
  g_names = rownames(data_input)
  runs = 1
  label = "case"
  results = list()
  if (control_timecourse == TRUE) {
    runs = 3
    label = c("combined","case","control")
  }

  ### ---> fit to genes
  if (is.null(cluster_results) == FALSE & is.null(start_values) == FALSE) {
    file_ext = "genes"
    c1 = "genes"

    ### perform 3 runs if control timecourse data is present
    if (control_timecourse == TRUE) {
      ### generate a data list containing the combined, case and control data if
      ### control timecourse is present
      dat1 = as.data.frame(data_input)[names(cluster_results[[1]]),]
      dat1_n = as.data.frame(data_input)[!(rownames(data_input) %in%
            rownames(dat1)),]
      dat1_n = dat1_n[,colnames(dat1)]
      dat2 = as.data.frame(data_input)[,!(data_annotation$Condition %in%
            control_name)]
      dat2 = dat2[names(cluster_results[[5]]),]
      dat2_n = as.data.frame(data_input)[!(rownames(data_input) %in%
            rownames(dat2)),]
      dat2_n = dat2_n[,colnames(dat2)]
      dat3 = as.data.frame(data_input)[,(data_annotation$Condition %in%
            control_name)]
      dat3 = dat3[names(cluster_results[[9]]),]
      dat3_n = as.data.frame(data_input)[!(rownames(data_input) %in%
            rownames(dat3)),]
      dat3_n = dat3_n[,colnames(dat3)]
      dat_ind <- c("dat1_n","dat2_n", "dat3_n")
      data_input <- list(dat1, dat2, dat3)

    } else if (control_timecourse == FALSE) {
      dat1 = as.data.frame(data_input)[names(cluster_results[[1]]),]
      dat1_n = as.data.frame(data_input)[!(rownames(data_input) %in%
            rownames(dat1)),]
      dat_ind <- c("dat1_n")
      data_input <- list(dat1)
    }


  ### ---> fit to clusters
  }  else if (is.null(cluster_results) & is.null(start_values)) {
    file_ext = "clusters"
    c1 = "cluster"
    ### input are the means of the gene expression values of the clusters
    ### 2 is combined, 6 is case and 10 is control
    ### if runs == 1 or runs == 2, then 6 and 10 or just 10 are empty
    data_input <- data_input[c(2,6,10)[1:runs]]
  }

  ### fitting for different runs
  for (c_runs in 1:runs) {
    imp_res <- NULL

    ### ---> fit to genes
    if (is.null(cluster_results) == FALSE & is.null(start_values) == FALSE) {

      ### split genes into the clusters and fit impulse model to the
      ### genes of a cluster
      cluster_res_list <- split(data_input[[c_runs]],
            cluster_results[[(c_runs - 1)*4 + 1]])
      ind <- split(1:max(cluster_results[[(c_runs - 1)*4 + 1]]),
            1:max(cluster_results[[(c_runs - 1)*4 + 1]]))

      imp_res_list <- lapply(ind,
          function(x){impulse_fit_matrix(cluster_res_list[[x]],
          as.numeric(as.character(data_annotation[colnames(data_input[[c_runs]])
            ,"Time"])), n_it = n_iter, ctrl_tc = control_timecourse,
            ctrl = control_name, fit_to_clus = FALSE,
            start_val = start_values[[c_runs]][x,],
            fit_bg = fit_backg, n_process = n_proc)})

       if (nrow(get(dat_ind[c_runs])) != 0) {
        tump1  <- do.call(rbind,lapply(1:(length(imp_res_list)),
            function(x) imp_res_list[[x]][[1]]))
        tump2  <- do.call(rbind,lapply(1:(length(imp_res_list)),
            function(x) imp_res_list[[x]][[2]]))

        tmpp1 <- t(apply(get(dat_ind[c_runs]),1,function(x){rep(mean(x),
            ncol(tump2))}))
        colnames(tmpp1) <- colnames(tump2)
        tump2 <- rbind(tump2, tmpp1)
        tmpp2 <-  cbind( matrix(NA,nrow(get(dat_ind[c_runs])),ncol(tump1) - 1),
            apply(cbind(get(dat_ind[c_runs]), tmpp1[,1]),1,
            function(x){sum((x[1:(length(x) - 1)] - x[length(x)])^2)}))
        rownames(tmpp2) <- rownames(get(dat_ind[c_runs]))
        tump1 <- rbind(tump1, tmpp2)

        tempi1 <- tump1
        tempi2 <- tump2
      } else {
         tempi1  <- do.call(rbind,lapply(1:(length(imp_res_list)),
             function(x) imp_res_list[[x]][[1]]))
         tempi2  <- do.call(rbind,lapply(1:(length(imp_res_list)),
             function(x) imp_res_list[[x]][[2]]))
      }
      imp_res$impulse_parameters <- tempi1[g_names,]
      imp_res$impulse_fits <- tempi2[g_names,]

    ### ---> fit to clusters
    } else if (is.null(cluster_results) & is.null(start_values)) {
      ### fit impulse model to the mean of the cluster
      imp_res <-  impulse_fit_matrix(data_input[[c_runs]],
        as.numeric(as.character(data_annotation[colnames(data_input[[c_runs]]),
        "Time"])),
          n_it = n_iter, ctrl_tc = control_timecourse, ctrl = control_name,
                fit_to_clus = TRUE, fit_bg = fit_backg, n_process = n_proc)
    }
    names(imp_res) <- paste(c("impulse_parameters","impulse_fits"),
        label[c_runs], sep = "_")
    results[[names(imp_res)[1]]] <- imp_res[[1]]
    results[[names(imp_res)[2]]] <- imp_res[[2]]
  }

  ### for naming of the output files
  if (fit_backg == TRUE) { c2 = "bg" } else {c2 = "data"}
  return(results)
}

################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++     Plot impulse fits    ++++++++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### plots impulse fits to timecourse data and also the control data if present

#' Plot impulse model fits
#'
#' Plots impulse model fits for the specified gene IDs. In the case of two
#' time courses, the fits for the combined, case and control data are plotted.
#' @aliases plot_impulse
#' @param gene_IDs character vector of gene names to be plotted; must be part
#' of the \code{rownames} of \code{data_table}
#' @param data_table numeric matrix of expression values; genes should be in
#' rows, samples in columns. Data should be properly normalized and
#' log2-transformed as well as filtered for present or variable genes.
#' @param data_annotation table providing co-variables for the samples including
#' condition and time points. Time points must be numeric numbers.
#' @param imp_fit_genes list of  fitted impulse model values and parameters as
#' produced by \code{impulse_DE} (list element \code{impulse_fit_results}).
#' @param control_timecourse logical indicating whether a control time
#' timecourse is part of the data set (\code{TRUE}) or not (\code{FALSE}).
#' Default is \code{FALSE}.
#' @param control_name character string specifying the name of the control
#' condition in \code{annotation_table}.
#' @param case_name character string specifying the name of the case
#' condition in \code{annotation_table}. Should be set if more than two
#' conditions are present in \code{data_annotation}.
#' @param file_name_part character string to be used as file extention.
#' @param title_line character string to be used as title for each plot.
#' @param sub_line character string to be used as subtitle for each plot.
#' @param new_device logical indicating whether each plot should be plotted
#' into a new device (\code{TRUE}) or not (\code{FALSE}). Default is \code{TRUE}.
#' @return Plots of the impulse model fits for the specified gene IDs.
#' @seealso \code{\link{impulse_DE}}, \code{\link{calc_impulse}}.
#' @author Jil Sander
#' @references Chechik, G. and Koller, D. (2009) Timing of Gene Expression
#' Responses to Envi-ronmental Changes. J. Comput. Biol., 16, 279-290.
#' @examples
#' #' Install package longitudinal and load it
#' library(longitudinal)
#' #' Attach datasets
#' data(tcell)
#' #' check dimension of data matrix of interest
#' dim(tcell.10)
#' #' generate a proper annotation table
#' annot <- as.data.frame(cbind("Time" =
#'      sort(rep(get.time.repeats(tcell.10)$time,10)),
#'      "Condition" = "activated"), stringsAsFactors = FALSE)
#' #' Time columns must be numeric
#' annot$Time <- as.numeric(annot$Time)
#' #' rownames of annotation table must appear in data table
#' rownames(annot) = rownames(tcell.10)
#' #' since genes must be in rows, transpose data matrix using t()
#' #' consider 6 genes for now only
#' genes <- c("SIVA","CD69","ZNFN1A1","IL4R","MAP2K4","JUND")
#' tcell.10.filtered <- t(tcell.10[,genes])
#' #' generate a list object having the form of the output of impulse_DE
#' #' first the parameter fits and SSEs
#' impulse_parameters_case <- matrix(
#'     c(0.6,	18.6,	17.2,	17.4,	5.1,	40.2,	3.5,
#'       0.3,	-464.9,	18.3,	17.3,	-17.2,	35.3,	17.5,
#'       23.2,	18,	18.8,	18.5,	3,	37,	13.2,
#'       NA,	NA,	NA,	NA,	NA,	NA,	3.1,
#'       NA,	NA,	NA,	NA,	NA,	NA,	9.6,
#'       9.5,	17.5,	18.7,	17.5,	8,	48,	46.7),length(genes),7, byrow = TRUE)
#' rownames(impulse_parameters_case) <- genes
#' colnames(impulse_parameters_case) <- c("beta", "h0", "h1", "h2", "t1", "t2", "SSE")
#' #' then the fitted values for the time points
#' impulse_fits_case <- matrix(c(
#'     18.55,	18.43,	18.15,	17.73,	17.43,	17.24,	17.24,	17.24,	17.38,	17.38,
#'     16.22,	17.18,	17.7,	17.97,	18.12,	18.27,	18.26,	18.03,	17.3,	17.28,
#'     18,	18,	18.82,	18.82,	18.82,	18.82,	18.82,	18.82,	18.48,	18.48,
#'     15.93,	15.93,	15.93,	15.93,	15.93,	15.93,	15.93,	15.93,	15.93,	15.93,
#'     17.62,	17.62,	17.62,	17.62,	17.62,	17.62,	17.62,	17.62,	17.62,	17.62,
#'     17.5,	17.5,	17.5,	17.5,	18.18,	18.67,	18.67,	18.67,	17.98,	17.53)
#'     ,length(genes),length(unique(annot$Time)), byrow = TRUE)
#' rownames(impulse_fits_case) <- genes
#' colnames(impulse_fits_case) <- unique(annot$Time)
#' #' finalize list object
#' impulse_fit_genes <- list("impulse_parameters_case" = impulse_parameters_case,
#'                           "impulse_fits_case" = impulse_fits_case)
#' #' Plot expression values
#' plot_impulse(genes, tcell.10.filtered, annot, impulse_fit_genes)
#' @export
plot_impulse <- function(gene_IDs, data_table, data_annotation,imp_fit_genes,
    control_timecourse = FALSE, control_name = NULL, case_name = NULL,
    file_name_part = "", title_line = "", sub_line = "", new_device = TRUE){

  print("---Plotting genes")

  if (length(grep("[a-zA-Z]",rownames(data_table))) == 0) {
       rownames(data_table) <- paste(rownames(data_table),"G", sep = "_")
       gene_IDs <- paste(gene_IDs, "G", sep = "_")
  }
  print(gene_IDs)

  ### if control timecourse is present split data into case and control data
  if (control_timecourse == TRUE) {
      fmat_case <- data_table[,!(data_annotation$Condition %in% control_name)]
      fmat_ctrl <- data_table[,data_annotation$Condition %in% control_name]
      timep_case <- as.numeric(as.character(data_annotation[colnames(fmat_case),
            "Time"]))
      timep_ctrl <- as.numeric(as.character(data_annotation[colnames(fmat_ctrl),
            "Time"]))
  }
  timep <- as.numeric(as.character(data_annotation[colnames(data_table),"Time"]))
  x_vec <- seq(0,max(timep),0.1)

   for (i in 1:length(gene_IDs)) {
       if (i == 1) {
           if(new_device == TRUE){
               dev.new()
           }
           if (length(gene_IDs) == 1) {
               par(mfrow = c(1,1))
           } else if (length(gene_IDs) == 2) {
               par(mfrow = c(1,2))
           } else if (length(gene_IDs) <= 4) {
               par(mfrow = c(2,2))
           } else if (length(gene_IDs) <= 6) {
               par(mfrow = c(2,3))
           } else if (length(gene_IDs) > 6) {
               par(mfrow = c(3,3))
           }
        } else if ((i - 1) %% 9 == 0) {
            if(new_device == TRUE){
                dev.new()
            }
            par(mfrow = c(3,3))
        }

     ### if there is no control data plot only case time course data and fit
     if (control_timecourse == FALSE) {
         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_case[gene_IDs[i],])) {
             calc_case = imp_fit_genes$impulse_fits_case[gene_IDs[i],1]
         } else {
             calc_case = calc_impulse_comp(imp_fit_genes$impulse_parameters_case[gene_IDs[i],1:6],x_vec)
         }
         plot(timep,data_table[gene_IDs[i],],col = "blue",pch = 3,xlim = c(0,max(timep)),
            ylim = c(min(c(as.numeric(data_table[gene_IDs[i],]),as.numeric(calc_case))) - 0.5,
               max(c(as.numeric(data_table[gene_IDs[i],]),as.numeric(calc_case))) + 0.5),
            xlab = "Time", ylab = "Impulse fit or log2 expression value",
            main = paste(gene_IDs[i]," ",title_line, sep = ""),sub = sub_line)
         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_case[gene_IDs[i],])) {
            abline(h = calc_case , col = "blue")
         } else {
           points(x_vec, calc_case, col = "blue", type = "l")
         }
         legend(x = "bottomright",as.character(data_annotation[1,"Condition"]),
                fill = c("blue"), cex = 0.6)

     ### if there is control data
     } else if (control_timecourse == TRUE) {

         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_case[gene_IDs[i],])) {
            calc_case = imp_fit_genes$impulse_fits_case[gene_IDs[i],1]
            status_case = FALSE
         } else {
            calc_case = calc_impulse_comp(imp_fit_genes$impulse_parameters_case[gene_IDs[i],1:6],x_vec)
            status_case = TRUE
         }
         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_control[gene_IDs[i],])) {
            calc_ctrl = imp_fit_genes$impulse_fits_control[gene_IDs[i],1]
            status_ctrl = FALSE
         } else {
            calc_ctrl = calc_impulse_comp(imp_fit_genes$impulse_parameters_control[gene_IDs[i],1:6],x_vec)
            status_ctrl = TRUE
         }
         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_combined[gene_IDs[i],])) {
            calc_comb = imp_fit_genes$impulse_fits_combined[gene_IDs[i],1]
            status_comb = FALSE
         } else {
            calc_comb = calc_impulse_comp(imp_fit_genes$impulse_parameters_combined[gene_IDs[i],1:6],x_vec)
            status_comb = TRUE
         }

         plot(timep_case,fmat_case[gene_IDs[i],],col = "blue",pch = 3,
              xlim = c(0,max(timep)),
              ylim = c(min(c(as.numeric(data_table[gene_IDs[i],]),
               as.numeric(calc_case), as.numeric(calc_ctrl),
               as.numeric(calc_comb))) - 0.5,
               max(c(as.numeric(data_table[gene_IDs[i],]),as.numeric(calc_case),
               as.numeric(calc_ctrl), as.numeric(calc_comb))) + 0.5),
            xlab = "Time", ylab = "Impulse fit or log2 expression value",
            main = paste(gene_IDs[i]," ",title_line,sep = ""),sub = sub_line)

         points(timep_ctrl,fmat_ctrl[gene_IDs[i],],col = "red",pch = 4)

         if (status_case == FALSE) {
           abline(h = calc_case , col = "blue")
         } else {
           points(x_vec,calc_case, col = "blue", type = "l")
         }
         if (status_ctrl == FALSE) {
            abline(h = calc_ctrl , col = "red")
         } else {
           points(x_vec,calc_ctrl, col = "red", type = "l")
         }
         if (status_comb == FALSE) {
            abline(h = calc_comb , col = "grey")
         } else {
           points(x_vec,calc_comb, col = "grey", type = "l")
         }
         legend(x = "bottomright",
            c(as.character(data_annotation$Condition[data_annotation$Condition !=
            control_name][1]),control_name,"combined"),
            fill = c("blue","red","grey"), cex = 0.6)
     }
   }
}


#################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++     Background generation   +++++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### generates the background for the DE analysis
### can be parallelized onto 'n_proc' nodes

generate_background <- function(data_table, data_annotation, n_iter = 100,
    imp_fit_genes, control_timecourse = FALSE, control_name = NULL,
    no_of_clus = NULL, n_rands = 50000, n_proc = 4){

  ### get number of pre- and fine clusters as input from the genes
  ### also exclude genes for which the model was not fitted due to almost
  ### no variation
  if (control_timecourse == TRUE) {
    no_of_clus = no_of_clus[c(3,4,7,8,11,12)]
    data_table <- data_table[!(is.na(imp_fit_genes[[1]][,"beta"]) |
        is.na(imp_fit_genes[[3]][,"beta"]) |
        is.na(imp_fit_genes[[5]][,"beta"])),]
  } else if (control_timecourse == FALSE & is.null(control_name)) {
    no_of_clus = no_of_clus[c(3,4)]
    data_table <- data_table[!(is.na(imp_fit_genes[[1]][,"beta"])),]
  }
    imp_fit_genes <- lapply(imp_fit_genes,
        function(x){x <- x[rownames(data_table),]})

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~  Fit background ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

  impulse_bg <- function(data_tab, data_annot,  n_it = 100, imp_fit,
        ctrl_tk = FALSE, ctrl_name = NULL,
        n_clust = NULL, n_randoms_tot = 50000, no_proc = 4){

    ### divide randomizations by no. of nodes
    n_rand <- round(n_randoms_tot/no_proc)

    background = rep(0,n_rand)
    timep <- as.numeric(as.character(data_annot[colnames(data_tab),"Time"]))
    NPARAM = 6

    ### take 'n_rand' random entries from data table
    i_s <- sample(1:nrow(data_tab),n_rand, replace = TRUE)
    data_tab_rand <- data_tab[i_s,]
    rownames(data_tab_rand) <- paste("bg_",c(1:length(i_s)))

    ### ---> if control data is present
    if (ctrl_tk == TRUE) {
      data_list <- list(data_tab_rand, data_tab_rand[,
          !(data_annot$Condition %in% ctrl_name)],
          data_tab_rand[,data_annot$Condition %in% ctrl_name])

      ### null model: fits to the combined dataset
      y_i_null_s = (imp_fit[[2]][rownames(data_tab),])[i_s,
            as.character(data_annot[colnames(data_list[[1]]),"Time"])]
      calc_impulse_comp_data_tab_rand_case <- (imp_fit[[4]][rownames(data_tab),
            ])[i_s,as.character(data_annot[colnames(data_list[[2]]),"Time"])]

      ### if real control timecourse is present
        calc_impulse_comp_data_tab_rand_ctrl <- (imp_fit[[6]][rownames(data_tab),
            ])[i_s, as.character(data_annot[colnames(data_list[[3]]),"Time"])]
        RESD_1_s = cbind((data_list[[2]] - calc_impulse_comp_data_tab_rand_case),
          (data_list[[3]] - calc_impulse_comp_data_tab_rand_ctrl))
        RESD_1_s = RESD_1_s[,colnames(data_list[[1]])]

      ### draw random resdiuals and add them to the null model
      Y_star_s = y_i_null_s + t(apply(RESD_1_s,1,
          function(x){x[sample(1:ncol(RESD_1_s),
          ncol(data_tab_rand),replace = TRUE)]}))
      colnames(Y_star_s) <- colnames(data_tab_rand)
      rownames(Y_star_s) <- paste("bg_",c(1:length(i_s)))

      ### cluster Y_star_s data by using the clusters from the genes
      tm_c <- system.time({
        clustering_results_background <- cluster_genes_for_impulse(Y_star_s,
            data_annot, ctrl_tk, ctrl_name, plot_clusters = TRUE,
            no_of_clusters = n_clust, n_genes = nrow(data_tab))
      })
      print(paste("Consumed time bg clustering: ",
            round(tm_c["elapsed"]/60,2)," min", sep = ""))

      ### fit impulse model to the clusters of Y_star_s
      tm_f <- system.time({
        impulse_fit_clusters_bg <- impulse_fit(clustering_results_background,
            data_annot, n_it, control_timecourse = ctrl_tk,
            ctrl_name, fit_backg = TRUE)
      })
      print(paste("Consumed time bg clus fit: ",
            round(tm_f["elapsed"]/60,2)," min", sep = ""))

      ### fit impulse model to each Y_star
      tm_fg <- system.time({
        impulse_fit_genes_bg <- impulse_fit(Y_star_s, data_annot, 1, ctrl_tk,
            ctrl_name, clustering_results_background, impulse_fit_clusters_bg,
            fit_backg = TRUE )
      })

      print(paste("Consumed time bg gene fit: ",
            round(tm_fg["elapsed"]/60,2)," min", sep = ""))

      ### calculate background
        RESD_1_s_b = cbind((Y_star_s[rownames(data_list[[1]]),
            colnames(data_list[[2]])] -
            impulse_fit_genes_bg[[4]][rownames(data_list[[1]]),
            as.character(data_annot[colnames(data_list[[2]]),"Time"])]),
            (Y_star_s[rownames(data_list[[1]]),colnames(data_list[[3]])] -
            impulse_fit_genes_bg[[6]][rownames(data_list[[1]]),
            as.character(data_annot[colnames(data_list[[3]]),"Time"])]))

        colnames(RESD_1_s_b) <- c(colnames(data_list[[2]]),colnames(data_list[[3]]))
        RESD_1_s_b = RESD_1_s_b[,colnames(data_list[[1]])]
        RESD_0_s_b = Y_star_s[rownames(data_list[[1]]),
            colnames(data_list[[1]])] -
            impulse_fit_genes_bg[[2]][rownames(data_list[[1]]),
            as.character(data_annot[colnames(data_list[[1]]),"Time"])]

       SS_1_s_b = apply(RESD_1_s_b,1,function(x){sum(x^2)})
       SS_0_s_b = apply(RESD_0_s_b,1,function(x){sum(x^2)})

    ### ---> if no control data is present
    } else {
      y_i_null_s = rowMeans(data_tab_rand)
      calc_impulse_comp_data_tab_rand <- (imp_fit[[2]][rownames(data_tab),])[i_s,
            as.character(data_annot[,"Time"])]
      RESD_1_s = data_tab_rand - calc_impulse_comp_data_tab_rand

      ### draw random resdiuals and add them to the null model
      Y_star_s = t(y_i_null_s + apply(RESD_1_s,1,function(x){x[sample(1:ncol(RESD_1_s),
          ncol(data_tab_rand),replace = TRUE)]}))
      colnames(Y_star_s) <- colnames(data_tab_rand)

      ### cluster Y_star_s data by using the clusters from the genes
      tm_c <- system.time({
         clustering_results_background <- cluster_genes_for_impulse(Y_star_s,
            data_annot, ctrl_tk, ctrl_name, plot_clusters = TRUE,
            no_of_clusters = n_clust, n_genes = nrow(data_tab))
      })
      print(paste("Consumed time bg clustering: ",
                  round(tm_c["elapsed"]/60,2)," min",sep = ""))

      ### fit impulse model to the clusters of Y_star_s
      tm_f <- system.time({
       impulse_fit_clusters_bg <- impulse_fit(clustering_results_background,
            data_annot, n_it, control_timecourse = ctrl_tk, ctrl_name,
            fit_backg = TRUE)
      })
      print(paste("Consumed time bg clust fit: ",
                  round(tm_f["elapsed"]/60,2)," min",sep = ""))

      ### fit impulse model to each Y_star
      tm_fg <- system.time({
        impulse_fit_genes_bg <- impulse_fit(Y_star_s, data_annot, 1, ctrl_tk, ctrl_name,
            clustering_results_background, impulse_fit_clusters_bg, fit_backg = TRUE )
      })
      print(paste("Consumed time bg gene fit: ",
        round(tm_fg["elapsed"]/60,2)," min", sep = ""))

      ### calculate background
       RESD_1_s_b = Y_star_s - impulse_fit_genes_bg[[2]][rownames(Y_star_s),
            as.character(data_annot[colnames(Y_star_s),"Time"])]
       SS_1_s_b = apply(RESD_1_s_b,1,function(x){sum(x^2)})
       RESD_0_s_b = Y_star_s - rowMeans(Y_star_s)
       SS_0_s_b = apply(RESD_0_s_b,1,function(x){sum(x^2)})
    }
    background = (SS_0_s_b - SS_1_s_b) / SS_1_s_b
    return(background)
  }

#   define a new environment, which is used for parallelization
  my.env <- new.env()
  assign("n_rands", n_rands, envir = my.env)
  assign("n_proc", n_proc, envir = my.env)
  assign("data_table", data_table, envir = my.env)
  assign("data_annotation", data_annotation, envir = my.env)
  assign("n_iter",n_iter,envir = my.env)
  assign("control_timecourse", control_timecourse, envir = my.env)
  assign("control_name", control_name, envir = my.env)
  assign("imp_fit_genes", imp_fit_genes, envir = my.env)
  assign("no_of_clus", no_of_clus, envir = my.env)
  assign("calc_impulse_comp", calc_impulse_comp, envir = my.env)
  assign("cluster_genes_for_impulse", cluster_genes_for_impulse, envir = my.env)
  assign("impulse_fit", impulse_fit, envir = my.env)
  assign("two_impulses", two_impulses, envir = my.env)
  environment(impulse_bg) <- my.env

#' @import parallel
#' @import boot

#  ### set number of nodes to maximum - 1
  mc <- min(detectCores() - 1, n_proc)
  assign("mc", mc, envir = my.env)
  cl <- makeCluster(mc)
  clusterExport(cl = cl, varlist = c("impulse_bg","n_rands","n_proc","data_table",
    "data_annotation","n_iter","control_timecourse","control_name",
    "imp_fit_genes","mc", "no_of_clus","calc_impulse_comp",
    "cluster_genes_for_impulse","impulse_fit", "two_impulses"), envir = my.env)
  junk <- clusterEvalQ(cl,library(boot))
  clusterSetRNGStream(cl,123)
  res <- clusterEvalQ(cl,impulse_bg(data_table, data_annotation, n_iter,
     imp_fit_genes, control_timecourse, control_name, no_of_clus, n_rands, mc))
  environment(res) <- my.env
  res2 <- do.call(c,res)
  stopCluster(cl)
  return(res2)
}

################################################################################


DE_analysis <- function(data_table,data_annotation,impulse_fit_results,
    background, control_timecourse = FALSE, control_name = NULL,
    e_type = "Array", Q = 0.01){

  ### if control timecourse is present split data into case and control data
  if (control_timecourse == TRUE) {
      fmat_case <- as.matrix(data_table[,!(data_annotation$Condition %in%
            control_name)])
      fmat_ctrl <- as.matrix(data_table[,data_annotation$Condition %in%
            control_name])
     timep_case <-  as.numeric(as.character(data_annotation[colnames(fmat_case),
            "Time"]))
      timep_ctrl <- as.numeric(as.character(data_annotation[colnames(fmat_ctrl),
            "Time"]))
  }
  timep <- as.numeric(as.character(data_annotation[colnames(data_table),
        "Time"]))

  ### correct background values
  background[background < 0] = 0
  background = background[background < 100]

  if (control_timecourse == TRUE) {
    y_i_null = impulse_fit_results[[2]][rownames(fmat_case),as.character(timep)]
     RESD_1_s = cbind(fmat_case - impulse_fit_results[[4]][rownames(fmat_case),
        as.character(timep_case)], (fmat_ctrl[rownames(fmat_case),] -
        impulse_fit_results[[6]][rownames(fmat_case),as.character(timep_ctrl)]))
     RESD_1_s = RESD_1_s[,colnames(data_table)]

    ### if there is no control data
  } else if (control_timecourse == FALSE) {
      y_i_null = rowMeans(data_table)
      RESD_1_s = data_table - impulse_fit_results[[2]][rownames(data_table),
            as.character(timep)]
  }
  RESD_0_s = data_table - y_i_null
  SS_0_s = apply(RESD_0_s, 1, function(x){sum(x^2)})
  SS_1_s = apply(RESD_1_s, 1, function(x){sum(x^2)})

  F_stats =  (SS_0_s - SS_1_s) / SS_1_s
  ### avoid cases where SS_1 = 0  & SS_0 = 0--> Quotient will be NaN
  F_stats[ which(SS_0_s == 0 & SS_1_s == 0)] = 0

  ### bootstrap the p-values
  p  =  unlist(lapply(F_stats,function(x){length(which(background >=
        x))/length(background)}))
  p_scaled = p.adjust(p, method = "BH")
  p_scaled_orig = p_scaled


  # calculate FCs

  # FCs timepoint vs earlier
  if (control_timecourse == FALSE) {
       means <- t(apply(data_table, 1,function(x){
       tmp = NULL
       for (i in 1:length(unique(timep))) {
       tmp = c(tmp,mean(2^x[which(data_annotation[colnames(data_table),
            "Time"] == sort(unique(timep))[i])]))
       }
       return(tmp)
       }))
       colnames(means) <- sort(unique(timep))
       Ratios_TPs <- matrix(rep(0,nrow(data_table)*(length(unique(timep)) - 1)),
            nrow(data_table),(length(unique(timep)) - 1))
       for (tt in 1:(length(unique(timep)) - 1)) {
          Ratios_TPs[,tt] <- means[,tt + 1]/means[,tt]
       }

       FC_TP_DEindex <- apply(Ratios_TPs,1,function(x){
          if (length(which(x > 2 || x < 0.5)) >= 2) { "DE" } else {"not_DE"}
       })
  }


  if (control_timecourse == TRUE) {

      # FCs case vs control
      means_case <- t(apply(fmat_case, 1,function(x){
      tmp = NULL
      for (i in 1:length(unique(timep_case))) {
      tmp = c(tmp, mean(2^x[which(data_annotation[colnames(fmat_case),
        "Time"] == sort(unique(timep_case))[i])]))
      }
      return(tmp)
      }))
      colnames(means_case) <- sort(unique(timep_case))

      means_control <- t(apply(fmat_ctrl, 1,function(x){
      tmp = NULL
      for (i in 1:length(unique(timep_ctrl))) {
      tmp = c(tmp, mean(2^x[which(data_annotation[colnames(fmat_ctrl),
            "Time"] == sort(unique(timep_ctrl))[i])]))
      }
      return(tmp)
      }))
      colnames(means_control) <- sort(unique(timep_ctrl))

      common_timepoints = sort(unique(timep_case))[sort(unique(timep_case)) %in%
            unique(timep_ctrl)]
      if (length(common_timepoints) == 0) {
         FC_DEindex = rep("not_detectable",nrow(means_case))
      } else {
       meanRatios <- t(apply(cbind(means_case[,as.character(common_timepoints)],
          means_control[,as.character(common_timepoints)]),1,function(x){
          tmp <- x[1:length(unique(common_timepoints))]/
              (x[(length(unique(common_timepoints)) + 1):(2*
              length(unique(common_timepoints)))])
          return(tmp)
        }))
        FCs <- t(apply(meanRatios,1,function(x){
          y = x
          y[x < 1] = -1/x[x < 1]
          return(y)
        }))
        FC_DEindex <- apply(FCs,1,function(x){
          if (length(which(abs(x) > 2)) >= 2) { "DE" } else {"not_DE"}
        })
      }

      # ANOVA
      anovas <- t(apply(data_table,1,function(x){summary(aov(as.numeric(x) ~
            data_annotation$Time + data_annotation$Condition +
            data_annotation$Time * data_annotation$Condition))[[1]][2:3,
            "Pr(>F)"]}))
      anovas_FDR =  cbind(p.adjust(anovas[,1], method = "BH"),
            p.adjust(anovas[,2], method = "BH"))
      anovas_pre = anovas_FDR[,1] < Q | anovas_FDR[,2] < Q
      anovas_index <- unlist(lapply(anovas_pre, function(x){if (x == TRUE) {"DE"
          } else {"not_DE"}}))
  }

  # SS1 Flag
  SS1_flag = SS_1_s < 20
  error_index = unlist(lapply(SS1_flag,function(x){if (x == FALSE) {"fit might be
      unstable due to large variation"} else {""}}))

  ### exit the function without error if no DE genes are detected
  if (!(TRUE %in% (p_scaled <= Q))) {
    warning("No DE genes were detected. Maybe amount of background
            randomizations is too low.")
    return(NULL)

  ### if DE genes are detected, finish FDR correction by using the cutoff
  } else {

    ### if control data is present but not as a timecourse, add t-test
    ### p-values to the results
    result =   as.data.frame(cbind("Gene" = row.names(data_table),
            "adj.p" = as.numeric(p_scaled_orig), stringsAsFactors = FALSE))
    result$adj.p <- as.numeric(as.character(result$adj.p))
    result = result[order(result$adj.p),]
    print(paste("Found ",nrow(result[result$adj.p <= Q,])," DE genes",sep = ""))

    pvals_and_flags <- NULL
    if (control_timecourse == TRUE) {
       pvals_and_flags <- as.data.frame(cbind("Gene" = row.names(data_table),
            "adj.p" = p_scaled_orig, "at least 2 TPs for case vs. control" =
            FC_DEindex, "ANOVA condition or time*condition" = anovas_index,
            "prediction error" = error_index))
    } else {
       if (e_type == "Array") {
           pvals_and_flags <- as.data.frame(cbind("Gene" = row.names(data_table),
            "adj.p" = p_scaled_orig, "at least 2 consecutive TP Ratios" =
            FC_TP_DEindex, "prediction error" = error_index))
       } else {
           pvals_and_flags <- as.data.frame(cbind("Gene" = row.names(data_table),
            "adj.p" = p_scaled_orig,"prediction error" = error_index))
       }
    }
    return(list("DE_genes" = result[as.numeric(result$adj.p) <= Q,],
            "pvals_and_flags" = pvals_and_flags))
    }
}


################################################################################
################################################################################
################################################################################


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#                     Final function calling all others
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

#' Differential expression analysis using impulse models
#'
#' Fits an impulse model to time course data and uses this model as a basis
#' to detect differentially expressed (DE) genes. If a single time course
#' data set is given, DE genes are detected over time, whereas if an
#' additional control time course data set is present, DE genes are
#' detected between both datasets.
#' @aliases impulse_DE impulseDE
#' @param expression_table numeric matrix of expression values; genes should
#' be in rows, samples in columns. Data should be properly normalized and
#' log2-transformed as well as filtered for present or variable genes.
#' @param annotation_table table providing co-variables for the samples
#' including condition and time points. Time points must be numeric numbers.
#' @param colname_time character string specifying the column name of the
#' co-variable "\code{Time}" in \code{annotation_table}
#' @param colname_condition character string specifying the column name of
#' the co-variable "\code{Condition}" in \code{annotation_table}
#' @param control_timecourse logical indicating whether a control time
#' timecourse is part of the data set (\code{TRUE}) or not (\code{FALSE}).
#' Default is \code{FALSE}.
#' @param control_name character string specifying the name of the control
#' condition in \code{annotation_table}.
#' @param case_name character string specifying the name of the case
#' condition in \code{annotation_table}. Should be set if more than two
#' conditions are present in \code{annotation_table}.
#' @param expr_type character string with allowed values "\code{Array}" or
#' "\code{Seq}". Default is "\code{Array}".
#' @param plot_clusters logical indicating whether to plot the clusters
#' (\code{TRUE}) or not (\code{FALSE}). Default is \code{TRUE}.
#' @param n_iter numeric value specifying the number of iterations, which are
#' performed to fit the impulse model to the clusters. Default is \code{100}.
#' @param n_randoms numeric value specifying the number of generated randomized
#' background iterations, which are used for differential expression analysis.
#' Default is \code{50000} and this value should not be decreased.
#' @param n_process numeric value indicating the number of processes, which can
#' be used on the machine to run calculations in parallel. Default
#' is \code{4}. The specified value is internally changed to
#' \code{min(detectCores() - 1, n_process)} using the \code{detectCores}
#' function from the package \code{parallel} to avoid overload.
#' @param Q_value numeric value specifying the cutoff to call genes
#' significantly differentially expressed after FDR correction (adjusted
#' p-value). Default is \code{0.01}.
#' @param new_device logical indicating whether each plot should be plotted
#' into a new device (\code{TRUE}) or not (\code{FALSE}). Default is \code{TRUE}.
#' @return List containing the following elements:
#' \itemize{
#' \item \code{impulse_fit_results} List containing fitted values and model
#' parameters:
#' \itemize{
#' \item \code{impulse_parameters_case} Matrix of fitted impulse model
#' parameters and sum of squared fitting errors for the case dataset. If a
#' control time course is present, corresponding list entries will exist for the
#' control and the combined dataset as well (named
#' \code{impulse_parameters_control} and \code{impulse_parameters_combined},
#' respectively).
#' \item \code{impulse_fits_case} Matrix of impulse values calculated based
#' on the analyzed time points and the fitted model parameters for the combined
#' dataset. If a control time course is present, corresponding list entries will
#' exist for the control and the combined dataset as well (named
#' \code{impulse_fits_control} and \code{impulse_fits_combined},
#' respectively).
#' }
#' \item \code{DE_results} List containg the results from the differential
#' expression analysis:
#' \itemize{
#' \item \code{DE_genes} data.frame containing the names of genes being called
#' as differentially expressed according to the specified cutoff \code{Q_value}
#' together with the adjusted p-values.
#' \item \code{pvals_and_flags} data.frame containing all gene names
#' together with the adjusted p-values and flags for differential expression
#' according to additional tests.
#' }
#' \item \code{clustering_results} List containing the clustering results:
#' \itemize{
#' \item \code{kmeans_clus_case} Numeric vector of clusters IDs, to which the
#' genes were finally assigned.
#' \item \code{cluster_means_case} Matrix containing the mean expression values
#' for each cluster (taken over all genes assigned to a cluster).
#' \item \code{pre_clus_case} Numeric number of clusters determined after the
#' first (preliminary) clustering step.
#' \item \code{fine_clus_case} Numeric number of final clusters determined after
#'  the second clustering step.
#' }
#' If a control time course is present, those four list entries will
#' exist correspondingly for the control and the combined dataset as well
#' (ending with \code{_control} and \code{_combined} instead of \code{_case},
#' respectively).
#' }
#' @details \code{ImpulseDE} is based on the impulse model proposed by
#' Chechik and Koller, which reflects a two-step behavior of genes within a cell
#' responding to environmental changes (Chechik and Koller, 2009). To detect
#' differentially expressed genes, a five-step workflow is followed:
#' \enumerate{
#' \item The genes are clustered into a limited number of groups using k-means
#' clustering. If \code{plot_clusters} = \code{TRUE}, the clusters are plotted.
#' \item The impulse model is fitted to the mean expression profiles of the
#' clusters. The best parameter sets are then used for the next step.
#' \item The impulse model is fitted to each gene separately using the parameter
#' sets from step 2 as optimal start point guesses.
#' \item The impulse model is fitted to a randomized dataset (bootstrap), which
#' is essential to detect significantly differentially expressed genes
#' (Storey et al., 2005).
#' \item Detection of differentially expressed genes utilizing the fits to the
#' real and randomized data sets. FDR-correction is performed to obtain adjusted
#' p-values (Benjamini and Hochberg, 1995).
#' }
#' @examples
#' #' Install package longitudinal and load it
#' library(longitudinal)
#' #' Attach datasets
#' data(tcell)
#' #' check dimension of data matrix of interest
#' dim(tcell.10)
#' #' generate a proper annotation table
#' annot <- as.data.frame(cbind("Time" =
#'    sort(rep(get.time.repeats(tcell.10)$time,10)),
#'    "Condition" = "activated"), stringsAsFactors = FALSE)
#' #' Time columns must be numeric
#' annot$Time <- as.numeric(annot$Time)
#' #' rownames of annotation table must appear in data table
#' rownames(annot) = rownames(tcell.10)
#' #' apply ImpulseDE in single time course mode
#' #' since genes must be in rows, transpose data matrix using t()
#' #' For the example, reduce iterations to 10, randomizations to 50, number of
#' #' genes to 20 and number of used processors to 1:
#' impulse_results <- impulse_DE(t(tcell.10)[1:20,], annot, "Time", "Condition",
#'    n_iter = 10, n_randoms = 50, n_process = 1)
#' @seealso \code{\link{plot_impulse}}, \code{\link{calc_impulse}}.
#' @author Jil Sander
#' @references Benjamini, Y. and Hochberg, Y. (1995) Controlling the false
#' discovery rate: a practical and powerful approach to multiple testing.
#' J. R. Stat. Soc. Series B Stat. Methodol., 57, 289-300.
#' @references Storey, J.D. et al. (2005) Significance analysis of time course
#' microarray experiments. Proc. Natl. Acad. Sci. USA, 102, 12837-12841.
#' @references Rangel, C., Angus, J., Ghahramani, Z., Lioumi, M., Sotheran, E.,
#' Gaiba, A., Wild, D.L., Falciani, F. (2004) Modeling T-cell activation using
#' gene expression profiling and state-space models. Bioinformatics, 20(9),
#' 1361-72.
#' @references Chechik, G. and Koller, D. (2009) Timing of Gene Expression
#' Responses to Envi-ronmental Changes. J. Comput. Biol., 16, 279-290.
#' @references Yosef, N. et al. (2013) Dynamic regulatory network controlling
#' TH17 cell differentiation. Nature, 496, 461-468.
#' @export
impulse_DE <- function(expression_table = NULL, annotation_table = NULL,
    colname_time = NULL, colname_condition = NULL, control_timecourse = FALSE,
    control_name = NULL, case_name = NULL, expr_type = "Array",
    plot_clusters = TRUE, n_iter = 100, n_randoms = 50000, n_process = 4,
    Q_value = 0.01, new_device = TRUE){

  tm_tot <- system.time({

    #' @import compiler
    #' @importFrom grDevices dev.new
    #' @importFrom graphics abline axis legend par plot points frame
    #' @importFrom stats aov cor dist kmeans mad median nlminb optim p.adjust
    #' @importFrom stats runif sd
    #' @importFrom utils head write.table

    ## prepare annotation table for internal usage
    print("START: Prepare annotation table for internal usage")
    print("-------------------------------------------------------------------")

    prepared_annotation <- annotation_preparation(annotation_table,
          expression_table,
          colname_time ,colname_condition, control_timecourse, control_name,
          case_name)
    prepared_annotation <- prepared_annotation[order(
            prepared_annotation$Condition),]
    prepared_annotation <- prepared_annotation[order(prepared_annotation$Time),]
    print("DONE")
    print("###################################################################")

    expression_table <- as.matrix(expression_table)    # to have numeric values
    expression_table <- expression_table[,rownames(prepared_annotation)]

    # exclude genes with missing values(NAs)
    indx <- apply(expression_table,1,function(x){TRUE %in% is.na(x)})
    expression_table <- expression_table[!(indx),]

    # if rownames are just 1,2,3 or if there are no rownames
    if (is.null(rownames(expression_table))) {
       rownames(expression_table) <- paste("G", 1:nrow(expression_table),
            sep = "_")
    } else if (length(grep("[a-zA-Z]",rownames(expression_table))) == 0) {
       rownames(expression_table) <- paste(rownames(expression_table),"G",
            sep = "_")
    }

    ### cluster genes to reduce efforts for fitting the impulse model
    print("START: Clustering genes for Impulse model fit")
    print("-------------------------------------------------------------------")
    tm_clust <- system.time({
      clustering_results <- cluster_genes_for_impulse(expression_table,
          prepared_annotation, control_timecourse, control_name, plot_clusters,
          n_device = new_device)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_clust["elapsed"]/60,2)," min",sep = ""))
    print("##################################################################")

    # fit Impulse model to the clusters
    print("START: Fitting Impulse model to the clusters")
    print("-------------------------------------------------------------------")
    tm_imp_fit_clus <- system.time({
      impulse_fit_clusters <- impulse_fit(clustering_results,prepared_annotation,
          n_iter, control_timecourse, control_name, n_proc = n_process)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_imp_fit_clus["elapsed"]/60,2)," min",
        sep = ""))
    print("###################################################################")

    ###  fit Impule model to each gene by using the cluster fits as start values
    print("START: Fitting Impulse model to the genes")
    print("-------------------------------------------------------------------")
    tm_imp_fit_gen <- system.time({
      impulse_fit_genes <- impulse_fit(expression_table, prepared_annotation,
        n_iter, control_timecourse, control_name, clustering_results,
        impulse_fit_clusters, n_proc = n_process)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_imp_fit_gen["elapsed"]/60,2),
        " min",sep = ""))
    print("###################################################################")

    ### generate background for the DE analysis
    print("START: Generate background")
    print("-------------------------------------------------------------------")
    tm_bg <- system.time({
      background_results <-  generate_background(expression_table,
        prepared_annotation, n_iter, impulse_fit_genes, control_timecourse,
        control_name, clustering_results, n_randoms, n_process)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_bg["elapsed"]/60,2)," min",sep = ""))
    print("###################################################################")

    ## detect differentially expressed genes
    print("START: DE analysis")
    print("-------------------------------------------------------------------")
    tm_DE <- system.time({
      impulse_DE_genes <- DE_analysis(expression_table,prepared_annotation,
            impulse_fit_genes, background_results, control_timecourse,
            control_name, expr_type, Q_value)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_DE["elapsed"]/60,2)," min",sep = ""))
    print("##################################################################")
  })
  print(paste("TOTAL consumed time: ",round(tm_tot["elapsed"]/60,2),
        " min",sep = ""))
  return(list("impulse_fit_results" = impulse_fit_genes,
        "DE_results" = impulse_DE_genes, "clustering_results" = clustering_results))
}

Try the ImpulseDE package in your browser

Any scripts or data that you put into this service are public.

ImpulseDE documentation built on April 28, 2020, 9:05 p.m.