R/Metalonda.R
In MetaLonDA: Metagenomics Longitudinal Differential Abundance Method

Documented in metalonda metalondaAll

#' Metagenomic Longitudinal Differential Abundance Analysis for one feature
#'
#' Find significant time intervals of the one feature
#' 
#' @param Count matrix has the number of reads that mapped to each feature in each sample.
#' @param Time vector of the time label of each sample.
#' @param Group vector of the group label of each sample.
#' @param ID vector of the subject ID label of each sample.
#' @param n.perm number of permutations.
#' @param fit.method fitting method (nbinomial, lowess).
#' @param points points at which the prediction should happen.
#' @param text Feature's name.
#' @param parall boolean to indicate whether to use multicore.
#' @param pvalue.threshold p-value threshold cutoff for identifing significant time intervals.
#' @param adjust.method multiple testing correction method.
#' @param time.unit time unit used in the Time vector (hours, days, weeks, months, etc.)
#' @param col two color to be used for the two groups (eg., c("red", "blue")).
#' @param ylabel text to be shown on the y-axis of all generated figures (default: "Normalized Count")
#' @param prefix prefix to be used to create directory for the analysis results
#' @return returns a list of the significant time intervals for the tested feature.
#' @import parallel
#' @import doParallel
#' @import stats
#' @import zoo 
#' @references
#' Ahmed Metwally (ametwall@stanford.edu)
#' @examples 
#' data(metalonda_test_data)
#' n.sample = 5
#' n.timepoints = 10
#' n.group = 2
#' Group = factor(c(rep(0, n.sample*n.timepoints), rep(1,n.sample*n.timepoints)))
#' Time = rep(rep(1:n.timepoints, times = n.sample), 2)
#' ID = factor(rep(1:(2*n.sample), each = n.timepoints))
#' points = seq(1, 10, length.out = 10)
#' \dontrun{
#' output.nbinomial = metalonda(Count = metalonda_test_data[1,], Time = Time, Group = Group,
#' ID = ID, fit.method =  "nbinomial", n.perm = 10, points = points,
#' text = rownames(metalonda_test_data)[1], parall = FALSE, pvalue.threshold = 0.05, 
#' adjust.method = "BH", time.unit = "hours", ylabel = "Normalized Count", col = c("black", "green"))
#' }
#' @export
metalonda = function(Count, Time, Group, ID, n.perm = 500, fit.method = "nbinomial", 
                      points, text = 0, parall = FALSE, pvalue.threshold = 0.05, 
                      adjust.method = "BH", time.unit = "days", ylabel = "Normalized Count", col = c("blue", "firebrick"),
                     prefix = "Test")
{
  cat("Start MetaLonDA \n")

  if (!dir.exists(prefix)){
    dir.create(file.path(prefix))
  }
  
  # Extract groups
  Group = as.character(Group)
  group.levels = sort(unique(Group))

  
  
  if(length(group.levels) > 2){
    stop("You have more than two phenotypes.")
  } else if(length(group.levels) < 2){
    stop("You have less than two phenotypes.")
  }

  
  gr.1 = as.character(group.levels[1])
  gr.2 = as.character(group.levels[2])

  Group[which(Group == gr.1)] = 0
  Group[which(Group == gr.2)] = 1

  
  
  ## Preprocessing: add pseudo counts for zero abudnance features
  Count = Count + 1e-8

  
  ## Form MetaLonDA dataframe
  aggregate.df = data.frame(Count = Count, Time = Time, Group = Group, ID = ID)


  ## Visualize feature's abundance accross different time points  
  visualizeFeature(aggregate.df, text, group.levels, unit = time.unit, ylabel = ylabel, col = col, prefix = prefix)

  
  group.0 = aggregate.df[aggregate.df$Group == 0, ]
  group.1 = aggregate.df[aggregate.df$Group == 1, ]
  points.min = max(sort(group.0$Time)[1], sort(group.1$Time)[1])
  points.max = min(sort(group.0$Time)[length(group.0$Time)], sort(group.1$Time)[length(group.1$Time)])
  points = points[which(points >= points.min & points <= points.max)]
  
  cat("Start Curve Fitting \n") 
  if (fit.method == "lowess")
  {
    cat("Fitting: LOWESS \n")
    model = curveFitting(df = aggregate.df, method= "lowess", points)
  } else if (fit.method == "nbinomial")
  {
    cat("Fitting: NB SS \n")
    model = tryCatch({
      curveFitting(df = aggregate.df, method= "nbinomial", points)
      },  error = function(err) {
        print(paste("ERROR in gss = ", err, sep="")); 
        return("ERROR")
        })
  } else {
    cat("You have entered unsupported fitting method\n")
    quit()
    
  }
  
  ## Visualize feature's trajectories spline
  visualizeFeatureSpline(aggregate.df, model, fit.method, text, group.levels, unit = time.unit, ylabel = ylabel, 
                         col = col, prefix = prefix)
 
   
  ## Calculate area under the fitted curve for each time interval
  cat("Calculate Area Under the Fitted Curves \n")
  area = intervalArea(model)
  
  
  ## TODO: remove this warning
  # 1: <anonymous>: ... may be used in an incorrect context: ‘.fun(piece, ...)’
  
  ## Permutation 
  perm  = permutation(aggregate.df, n.perm, fit.method, points, lev = group.levels, parall = parall)
  
  
  ## Area p-value per unit interval
  area.perm = areaPermutation(perm)
   
  
  a1 = do.call(rbind, area.perm)
  a2 = do.call(rbind, a1[,2])

  ##  Visualize AR empirical distribution
  #visualizeARHistogram(a2, text, fit.method, prefix = prefix)
  
  ## Calculate AR p-value 
  pvalue.area = sapply(1:(length(points)-1), function(i){
    sum(a2[,i] >= area$ar.abs[i])/length(a2[,i])
  } )



  ## Identify significant time inetrval based on the adjusted p-value 
  cat("p-value Adjustment Method = ", adjust.method, "\n")
  adjusted.pvalue = p.adjust(pvalue.area, method = adjust.method)
  interval = findSigInterval(adjusted.pvalue, threshold = pvalue.threshold, sign = area$ar.sign)
  st = points[interval$start]
  en = points[interval$end + 1]
  
  
  if(length(st) > 0)
  {
    ## Visualize sigificant area
    visualizeArea(aggregate.df, model, fit.method, st, en, text, group.levels, unit = time.unit, ylabel = ylabel,
                  col = col, prefix = prefix)
  }
   
  
  ## Calculate start, end, dominant for each interval
  interval.start = points[-length(points)]
  interval.end = points[-1]
  dominant = area$ar.sign
  dominant[which(dominant == 1)] = gr.1
  dominant[which(dominant == -1)] = gr.2
  
  
  ## Prepare Log2FoldChange
  avg.mod0.count = rollapply(model$dd.0$Count, 2, mean)
  avg.mod1.count = rollapply(model$dd.1$Count, 2, mean)
  foldChange = avg.mod0.count/avg.mod1.count
  log2FoldChange = log2(foldChange)

  
  output.details = list(feature = rep(text, length(interval.start)), 
                        interval.start = interval.start, interval.end = interval.end,
                        avg.mod0.count = avg.mod0.count, avg.mod1.count = avg.mod1.count, 
                        foldChange = foldChange, log2FoldChange = log2FoldChange, 
                        areaRatio = area$ar, areaRatio.abs = area$ar.abs, areaRatio.sign = area$ar.sign, dominant = dominant,
                        intervals.pvalue = pvalue.area, adjusted.pvalue = adjusted.pvalue, points = points)

  output.summary = data.frame(feature = rep(text, length(interval$start)), start = st, end = en,
                        dominant = interval$dominant, pvalue = interval$pvalue)

  
  ## Output table summarizing time intervals statistics
  output.details$points = output.details$points[-length(output.details$points)]
  feature.summary = as.data.frame(do.call(cbind, output.details), stringsAsFactors = FALSE)
  write.csv(feature.summary, file = sprintf("%s/Feature_%s_Summary.csv", prefix, text), row.names = FALSE)
  
  
  aggregateData = list(feature.detail = output.details, feature.summary = feature.summary, feature.model = model)
  save(aggregateData, 
       file = sprintf("%s/Feature_%s_Summary_%s.RData",
                      prefix, text, fit.method))
  cat("\n\n")
  
  
  return(list(detailed = output.details, summary = output.summary, model = model))
}


#' Metagenomic Longitudinal Differential Abundance Analysis for all Features
#'
#' Identify significant features and their significant time interval
#' 
#' @param Count Count matrix of all features
#' @param Time Time label of all samples
#' @param Group Group label of all samples
#' @param ID individual ID label for samples
#' @param n.perm number of permutations
#' @param fit.method The fitting method (nbinomial, lowess)
#' @param num.intervals The number of time intervals at which metalonda test differential abundance 
#' @param parall logic to indicate whether to use multicore
#' @param pvalue.threshold p-value threshold cutoff
#' @param adjust.method Multiple testing correction methods
#' @param time.unit time unit used in the Time vector (hours, days, weeks, months, etc.)
#' @param norm.method normalization method to be used to normalize count matrix (css, tmm, ra, log10, median_ratio) 
#' @param prefix prefix for the output figure
#' @param col two color to be used for the two groups (eg., c("red", "blue")).
#' @param ylabel text to be shown on the y-axis of all generated figures (default: "Normalized Count")
#' @return Returns a list of the significant features a long with their significant time intervals
#' @references
#' Ahmed Metwally (ametwall@stanford.edu)
#' @examples 
#' \dontrun{
#' data(metalonda_test_data)
#' n.sample = 5
#' n.timepoints = 10
#' n.group = 2
#' Group = factor(c(rep(0, n.sample*n.timepoints), rep(1,n.sample*n.timepoints)))
#' Time = rep(rep(1:n.timepoints, times = n.sample), 2)
#' ID = factor(rep(1:(2*n.sample), each = n.timepoints))
#' points = seq(1, 10, length.out = 10)
#' output.nbinomial = metalondaAll(Count = metalonda_test_data, Time = Time, Group = Group,
#' ID = ID, n.perm = 10, fit.method =  "nbinomial", num.intervals = 100, 
#' parall = FALSE, pvalue.threshold = 0.05, adjust.method = "BH", 
#' time.unit = "hours", norm.method = "none", prefix = "Test",  time.unit = "hours", 
#' ylabel = "Normalized Count", col = c("black", "green"))
#' }
#' @export
metalondaAll = function(Count, Time, Group, ID, n.perm = 500,
                         fit.method = "nbinomial", num.intervals = 100, parall = FALSE, 
                         pvalue.threshold = 0.05, adjust.method = "BH", time.unit = "days", 
                         norm.method = "none", prefix = "Output", ylabel = "Normalized Count", col = c("blue", "firebrick"))
{
  ## Check the dimentions of the annotation vectors and count matrix
  if(length(Time) == length(ID))
  {
    if(ncol(Count) == length(Group))
    {
      if(length(Time) == length(Group))
      {
        cat("Dimensionality check passed\n")
      } else
      {
        stop("The length of the annotation vectors don't match or not consistent 
             with the number of columns of the count matrix.")
      }
    } else
    {
      stop("The length of the annotation vectors don't match or not consistent 
           with the number of columns of the count matrix.")
    }
  } else
  {
    stop("The length of the annotation vectors don't match or not consistent 
         with the number of columns of the count matrix.")
  }
  
  
  #### Create Prefix folder
  dir.create(file.path(prefix), showWarnings = FALSE)
  
  
  ## Normalization
  if(norm.method != "none")
  {
    if(norm.method == "css" | norm.method == "tmm" | norm.method == "ra" | norm.method == "log10" | norm.method == "median_ratio")
    {
      cat("Normalizaton method = ", norm.method, "\n")
      Count = normalize(Count, method = norm.method)
    } else{
      stop("You have entered a wrong normalization method")
    }
  }
  
  ## Specify the test/prediction timepoints for metalonda
  if(num.intervals == "none")
    points = seq(min(Time), max(Time))
  else
    points = seq(min(Time), max(Time), length.out = num.intervals + 1)
  
  cat("Prediction Points = ")
  print(points)
  cat("\n")
  
  ## Filter out the taxa that always have zero of one/both group
  Group = as.character(Group)
  group.levels = sort(unique(Group))
  if(length(group.levels) > 2){
    stop("You have more than two phenotypes.")
  }
  gr.1 = group.levels[1]
  gr.2 = group.levels[2]
  
  data.count.filt = as.matrix(Count)
  
  ## Apply metalonda for each feature
  n.features = nrow(data.count.filt)
  detailed = list()
  summary = list()
  model = list()
  for (i in 1:n.features)
  {
    cat ("Feature  = ", rownames(data.count.filt)[i], "\n")
    out = metalonda(Count = data.count.filt[i,], Time = Time, Group = Group, ID = ID,
                    fit.method = fit.method, n.perm = n.perm, points = points, 
                    text=rownames(data.count.filt)[i], parall = parall, pvalue.threshold, adjust.method, time.unit, 
                    ylabel = ylabel, col = col, prefix = prefix)

    
    detailed[[i]] = out$detailed
    summary[[i]] = out$summary
    model[[i]] = out$model
  }
  
  summary.tmp = do.call(rbind, summary)
  summary.tmp$dominant[which(summary.tmp$dominant == 1)] = gr.1
  summary.tmp$dominant[which(summary.tmp$dominant == -1)] = gr.2
  
  ## Output table and figure that summarize the significant time intervals
  write.csv(summary.tmp, file = sprintf("%s/%s_MetaLonDA_TimeIntervals.csv", prefix, prefix), row.names = FALSE)
  visualizeTimeIntervals(interval.details = summary.tmp, prefix, unit = time.unit, col = col)
  
  aggregateData = list(output.detail = detailed, output.summary = summary.tmp, output.model = model)
  save(aggregateData, file = sprintf("%s/%s_MetaLonDA.RData", prefix, prefix))
  return(aggregateData)
}