R/pmf-anova.R

Defines functions pmfanova

Documented in pmfanova

#' pmfanova
#'
#' perform principle compponent analysis on ramclustR object dataset, export plots
#' @details This function performs standard or mixed model anova using a ramclusR object as input 
#' 
#' @param ramclustObj ramclustR object to perform ANOVA on
#' @param anova.name character; optionally assign a name to the anova. valuable if you wish to perform more than one ANOVA on a sample set. If NULL, name will be autogenerated from the anova.call and subset information. 
#' @param which.data character; which dataset (SpecAbund or SpecAbundAve) to perform PCA on. 
#' @param subset character or integer vector; if character, must be even length.  If you wish to perform ANOVA only when your factor called 'treatment' is a 'trt' sample and when 'time' is '3', then you would use  i.e. c("treatment", "trt", "time", "3"). vector length must always be even, and with 'factor' followed by 'level'.  If an integer vector is provided, only row numbers matching those integers are retained. 
#' @param anova.call character;  the model you wish to run.  i.e. "Trt * Time" or "Trt * Time + (1|Block)" 
#' @param posthoc character; if you wish to perform post hoc testing, define the posthoc test to use.  i.e. if anova.call is "Trt * Time" you can use "Trt|Time" to specific 'treatment within time' comparisons.
#' @param effectsplots logical; if TRUE, plot effect plots in pdf format.  If plotting fails, can set to 'FALSE' to get tabular results only.
#' @param label.by  how should metabolites columns be labelled? one of 'ann' or 'cmpd', typically. 
#' @param delim character; what is the character delimiting factors in the sample names.  generally '-'
#' @param plots logical; if plotting is any issue set this to FALSE. 
#' @param pcut numeric; what is the pvalue cutoff for determining significance?  Used only for plotting. 
#' @param p.adj character; what p.adjust method should be used.  generally 'fdr' or 'bh' (equivalent). see ?p.adjust
#' @param which.quan; chracter vector.  which factors should be interpreted as quantitative?  i.e. c("time", "dose")
#' @param filter logical, TRUE by default. when $cmpd.use slot is present (from rc.cmpd.filter.cv function), only cmpds that passed cv filtering are used. If you wish to change that behavior, rerun the rc.cmpd.filter.cv function with a really high CV threshold. 
#' @param summary.statistics logical, TRUE by default.  If true, calculate mean and standard deviation for each level as defined by the anova.call model.
#' @param output.summary logical, TRUE by default.  If true, saves full model results for all ANOVA as .Rdata file, and exports all summaries to a .txt file.  
#' @return returns a ramclustR object.  new R object in $pca slot. Optionally, new R object in $AuerGervini slot if npc = "auto".
#' @concept RAMClustR
#' @author Corey Broeckling

#' @export 


pmfanova<-function(ramclustObj=RC,
                   anova.name = NULL,
                   which.data="SpecAbund",
                   label.by = "cmpd", 
                   subset = c(""),
                   anova.call=NULL,
                   posthoc=NULL,
                   effectsplots=TRUE,
                   delim="-",
                   plots=TRUE,
                   pcut=0.05,
                   p.adj="BH",
                   which.quan=NULL,
                   filter = TRUE,
                   summary.statistics = TRUE, 
                   output.summary = TRUE,
                   remove.na.factors = FALSE
) {
  
  # # consider moving away from lsmeans and effects, keeping only emmeans dependency.
  # # lme4, lmerTest, pbkrtest needed? 
  # # replace effects plots with emmeans plots
  # d <- RAMClustR::getData(ramclustObj = ramclustObj)
  # dat <- cbind(d[[1]], d[[2]])
  # dat <- dat[subset,]
  # # m <- lm(as.formula(paste("C0001", "~", "treatment+dose+time+treatment:dose+treatment:time+dose:time")), data = dat)
  # # e <- emmeans(m, specs = pairwise ~ treatment:dose:time)
  # m <- lm(as.formula(paste("C0002", "~", "treatment*dose")), data = dat)
  # a <- aov(m)
  # e <- emmeans(m, specs = pairwise ~ dose|treatment)
  # e.grid <- ref_grid(m)
  # plot(e.grid, by = c('time', 'treatment'))
  # emmip(m, C0001 ~ dose)
  # with(pigs, interaction.plot(percent, source, conc))
  # dat$dose <- as.numeric(as.character(dat$dose))
  # with(dat, interaction.plot(dose, paste(treatment, time), C0001))
  # # emmip(noise.lm, type ~ size | side)
  # emmip(m, label ~ time, data = dat)
  
  require(effects)
  require(lme4)
  require(lmerTest)
  require(pbkrtest)
  require(emmeans)
  
  if(!dir.exists("stats")) {
    dir.create('stats')
  }
  if(!dir.exists("stats/anova")) {
    dir.create('stats/anova')
  }
  
  
  d <- RAMClustR::getData(
    ramclustObj = ramclustObj,
    which.data = which.data,
    filter = filter,
    cmpdlabel = label.by
  )
  
  cat(head(dimnames(d[[2]][[2]])))
  
  if(filter){
    if(!is.null(ramclustObj$cmpd.use)) {
      if(length(ramclustObj$cmpd.use == ncol(d[[2]]))) {
        cmpd.use <- which(ramclustObj$cmpd.use)
      } else {
        cmpd.use <- 1:ncol(d[[2]])
      }
    } else {
      cmpd.use <- 1:ncol(d[[2]])
    }
  } else {
    cmpd.use <- 1:ncol(d[[2]])
  }
  
  
  ## remove samples that have NA values in any factor from anova.call
  if(remove.na.factors) {
    keep <- rep(TRUE, nrow(d[[1]]))
    
    for(i in 1:ncol(d[[1]])) {
      if(!grepl(names(d[[1]])[i], anova.call)) next
      na.vals <- which(is.na(d[[1]][,i]))
      na.vals <- c(na.vals, which(d[[1]][,i] == "NA"))
      if(length(na.vals)==0) next
      keep[na.vals] <- FALSE
    }
    
    if(any(!keep)) {
      cat("excluding", length(which(!keep)), "samples with 'NA' values from ANOVA.", '\n')
      d[[1]] <- d[[1]][keep,]
      d[[2]] <- d[[2]][keep,]
      d[[3]] <- d[[3]][keep,]
    }
  }
  
  
  if(!is.null(which.quan)) {
    for(i in 1:length(which.quan)) {
      d[[1]][,i] <- as.numeric(d[[1]][,i])
    }
  }
  
  for(i in 1:ncol(d[[1]])) {
    d[[1]][,i] <- as.factor(d[[1]][,i])
  }
  
  
  
  ramclustObj$history$anova <- paste(
    "Analysis of variance was performed in R.", 
    paste0("The ",  which.data, " dataset was used as input."), 
    if(!is.null(which.quan)) {paste0("The factor(s) [", paste(which.quan, collapse = " "), "] are treated as numeric." )}
  )
  
  
  if(is.null(anova.name)) {
    if(length(subset)>1) {
      anova.name <- paste(paste(subset, collapse = "."), paste(strsplit(gsub("[^[:alnum:] ]", "", anova.call), " +")[[1]], collapse = "."), sep = "_")
    } else {
      anova.name <- paste(strsplit(gsub("[^[:alnum:] ]", "", anova.call), " +")[[1]], collapse = ".")
    }
  }
  
  out.dir <- paste0("stats/anova/", anova.name)
  while(dir.exists(out.dir)) {
    out.dir <- paste0(out.dir, "_", format(Sys.time(), "%H%M%S"))
  }
  
  dir.create(out.dir)
  anova.name <- basename(out.dir)
  
  ramclustObj$history$anova[[2]] <- paste(
    paste0("For the analysis of variance titled '", anova.name, "',"),
    "the", which.data, "dataset was used as described below."
  )
  
  cat("subset:", subset, '\n')
  # cat(dim(d[[2]]), '\n')
  
  if(length(subset) > 1) {
    if(is.integer(subset))  {
      d[[1]] <- d[[1]][subset,]
      d[[2]] <- d[[2]][subset,]
      d[[3]] <- d[[3]][subset,]
    } else {
      if(length(subset)/2 != round(length(subset)/2)) stop("'subset' length must be even", '\n')
      subset <- matrix(subset, nrow = 2)
      keep <- 1:nrow(d[[1]])
      for(i in 1:ncol(subset)) {
        if(!any(names(d[[1]]) == subset[1,i])) {
          stop(paste("factor", subset[1,i], "was not found", '\n'))
        }
        f <- as.character(d[[1]][,subset[1,i]])
        k <- which(f == subset[2,i])
        if(length(k) == 0) {
          stop(paste("level", subset[2,], "in factor", subset[1,i], "was not found", '\n'))
        }
        keep <- sort(intersect(keep, k))
      }
      d[[1]] <- d[[1]][keep,]
      d[[2]] <- d[[2]][keep,]
      d[[3]] <- d[[3]][keep,]
      # ramclustObj$history$anova3 <- paste(
      #   paste0("The dataset was subsetted to include only samples for which [") 
      # )
      # for(i in 1:ncol(subset)) {
      #   ramclustObj$history <- paste(ramclustObj$history, paste0(subset[1,i], "=", subset[2,i]))
      # }
      # ramclustObj$history <- paste0(ramclustObj$history, ".")
    }
    
  }
  
  dat<-d[[3]]
  cmpd <- dimnames(d[[2]])[[2]]
  
  if(grepl("1|", anova.call, fixed = TRUE)) {use<-2} else {use<-1}
  
  
  if(use == 1) {
    cat("using fixed linear model analysis", '\n')
    test <- lm(as.formula(paste(cmpd[1], "~", anova.call)), data = dat)
    res<-lapply(1:length(cmpd), FUN = function(x){
      lm(as.formula(paste(cmpd[x], "~", anova.call)), data = dat)
    })
    pnames<-attr(attr(res[[1]]$model, which="terms"), which="term.labels")
    mp<-data.frame(lapply(1:length(res), FUN=function(x) {anova(res[[x]])$"Pr(>F)"[1:length(pnames)]}))
    dimnames(mp)[[1]] <- pnames
    dimnames(mp)[[2]] <- cmpd
    
    if(effectsplots) {
      testplot<-try(
        effects::allEffects(test), silent = TRUE
      )
      if(class(testplot) != 'try-error') {
        pdf(file=paste0(out.dir, "/effectsplots.pdf"), width=16, height=8)
        for(i in 1:length(res)) {
          plot(effects::allEffects(res[[i]]), main=ramclustObj[[label.by]][i], ylab="effect size (signal intensity)")
        }
        dev.off()
      } else {
        cat("effects plots failed: setting effectsplots = FALSE", '\n')
        cat(testplot)
        stop("effects plots failed")
        effectsplots = FALSE
      }
    }
    
    ramclustObj$history$anova4 <- paste(
      "Fixed-factor linear model ANOVA was performed using the lm function.", 
      paste0("The model used was '", anova.call, ".'")
    )
    
  } 
  
  if(use == 2) {
    cat("using mixed linear model analysis",'\n')
    assign(".TeMpVaR", dat, envir=globalenv())
    test <- lmerTest::lmer(as.formula(paste(cmpd[1], "~", anova.call)), data = .TeMpVaR)
    if(effectsplots) {
      testplot<-try(
        effects::allEffects(test), silent = TRUE
      )
      if(class(testplot) != 'try-error') {
        pdf(file=paste0(out.dir, "/effectsplots.pdf"), width=16, height=8)
      } else {
        cat("effects plots failed: setting effectsplots = FALSE", '\n')
        cat(testplot)
        stop("effects plots failed")
        effectsplots = FALSE
      }
    }
    
    res <- list(rep(NA, length(cmpd)))
    for(x in 1:length(cmpd)) {
      res.x <- lmerTest::lmer(as.formula(paste(cmpd[x], "~", anova.call)), data = .TeMpVaR)
      if(effectsplots) {
        plot(effects::allEffects(res.x), main=ramclustObj[[label.by]][x], ylab="effect size (signal intensity)")
      }
      res[[x]] <- res.x
    }
    
    if(effectsplots) { dev.off() }
    
    pnames<-dimnames((anova(test, ddf="Kenward-Roger"))[1])[[1]]
    mp<-data.frame(lapply(1:length(res), FUN=function(x) {anova(res[[x]], ddf="Kenward-Roger")$"Pr(>F)"}))
    if(length(pnames) == dim(mp)[[2]]) {mp <- as.data.frame(t(mp))}
    dimnames(mp)[[1]] <- pnames
    dimnames(mp)[[2]] <- cmpd
    ramclustObj$history$anova5 <- paste(
      "Mixed model ANOVA was performed using the lmer and lmerTest functions.", 
      paste0("The model used was '", anova.call, ".'"),
      "P-values were assigned using the 'anova' function with ddf set to 'Kenward-Roger.'"
    )
    on.exit(rm(.TeMpVaR, envir=globalenv()))
  }
  
  ## pvalue correction: 
  ## apply pvalue correction
  for(i in 1:nrow(mp)) {
    mp[i,] <- p.adjust(mp[i,], method=p.adj, n = (dim(mp)[1]*dim(mp)[2]))
    ramclustObj$history$anova7 <- paste(
      "P-value correction was performed using the p.adjust function with method set to", 
      paste0(p.adj, ".")
    )
  }
  
  
  cat(out.dir, '\n')
  
  if(output.summary) {
    out <- c("ANOVA MODEL DETAILS:  ", '\n')
    out <- c(out, capture.output(sessionInfo()), '\n', '\n', '\n')
    for(i in 1:length(res)) {
      out <- c(out, capture.output(res[[i]]),  '\n')
      out <- c(out, capture.output(anova(res[[i]])),  '\n', '\n', '\n')
    }
    sink(paste0('stats/anova/', anova.name, "/model_details.txt"))
    cat(out)
    sink()
    
    
    save(res, file = paste0(out.dir, "/models.Rdata"))
  }
  
  
  
  ##optionally return posthoc results using tukey HSD
  if(!is.null(posthoc)) {
    ## pvalue corrections happens within for all constrasts, no need to correct again.
    
    for(j in 1:length(posthoc)) {
      # test <- lsmeans(res[[1]], as.formula(paste("pairwise ~", posthoc[j])), data=dat)
      test <- emmeans::emmeans(res[[1]], as.formula(paste("pairwise ~", posthoc[j])), data=dat)
      phres<-lapply(1:length(res), 
                    FUN=function (x) {
                      # lsmeans(res[[x]], as.formula(paste("pairwise ~", posthoc[j])), data=dat, method = "tukey")
                      suppressWarnings(emmeans::emmeans(res[[x]], as.formula(paste("pairwise ~", posthoc[j])), data=dat))
                    })
      
      # test$contrasts@grid
      tmp <- as.matrix(test$contrasts@grid, stringsAsFactors = FALSE)
      pn <- vector(length = 0)
      for(i in 1:nrow(tmp)) {
        pn <- c(pn, paste(as.vector(tmp[i,]), collapse = " _ "))
      }
      pnames <- pn
      pdata <- data.frame(lapply(1:length(phres), FUN = function(x) {summary(phres[[x]]$contrasts)$p.value }))
      dimnames(pdata)[[1]] <- pnames
      dimnames(pdata)[[2]] <- dimnames(mp)[[2]]
      mp<-rbind(mp, pdata)
    }
    ramclustObj$history$anova6 <- paste(
      paste0("Post-hoc testing was performed for [", paste(posthoc, sep = " "), "] using the 'Tukey' method in the emmeans package.")
    )
    
  }
  
  
  # out.dir
  ramclustObj[[paste0("anova.pval_", anova.name)]] <- mp
  
  if(!is.null(ramclustObj$clrt)){
    
    pldata <- t(rbind("rt" = round(ramclustObj$clrt, digits = 1), mp))
    for(j in ncol(pldata):2) {
      if(all(is.na(pldata[,j]))) {
        pldata <- pldata[,-j]
      }
    }
    plcols <- pldata
    plcols[,2:ncol(plcols)] <- "gray"
    plcols[which(pldata <= 0.05, arr.ind = TRUE)] <- rgb(red=0, green=80, blue=0, max=255)
    if(ncol(pldata) < 2) {break} 
    
    # pwr <- 1
    # cexs <- ramclustObj$msint^(1/pwr)/median(ramclustObj$msint^(1/pwr))
    # while(max(cexs) > 6) {
    #   pwr <- pwr + 1
    #   cexs <- ramclustObj$msint^(1/pwr)/median(ramclustObj$msint^(1/pwr))
    # }
    # cex.rg <- seq(min(cexs), max(cexs), length.out = 5)
    # int.rg <- seq(from = min(ramclustObj$msint), to = max(ramclustObj$msint), by = )
    
    if(is.null(ramclustObj$msint)){    
      msint<-rep(0, length(ramclustObj$cmpd))
      for(i in 1:length(msint)){
        msint[i]<-weighted.mean(ramclustObj$SpecAbund[,i], ramclustObj$SpecAbund[,i])
      }
      ramclustObj$msint<-msint
    }
    
    ints <- log10(ramclustObj$msint)^2
    cexs <- 3*(ints)/max(ints)
    ## backcalculation: range(10^(range(ints)^0.5))
    cex.rg <- seq(min(cexs), max(cexs), length.out = 5)
    int.rg <- seq(from = min(ints), to = max(ints), length.out = 5)
    int.rg <- 10^(int.rg^0.5)
    
    if(plots) {
      pdf(file=paste0(out.dir, "/anova_summary.pdf"), width=10, height=8)
      for(i in 2:ncol(pldata)) {
        par(xpd=FALSE)
        if(any(pldata[,i] == 0)) {
          fix <- as.vector(which(pldata[,i] == 0))
          pldata[fix,i] <- min(pldata[-fix,i])
        }
        plot(pldata[,1], -log10(pldata[,i]), 
             xlab="retention time (seconds)", 
             ylab="-log(pvalue)", ylim = c(0, -log10(min(0.04, min(pldata[,i], na.rm = TRUE)))),
             cex.axis=1, cex.lab=1, main=paste("ANOVA p-values:", dimnames(pldata)[[2]][i]), bty='L',
             pch=21, cex=cexs, bg=plcols[,i], 
             sub = "circle size reflects median feature intensity", cex.sub = 0.5)
        abline(h=-log10(pcut), col=gray(0.4), lty=2)
        par(xpd = TRUE)
        legend(legend=formatC(round(int.rg)), x.intersp = 2, y.intersp = max(cex.rg/1.5),
               x= "topright", inset=c(-0.05,0),
               pt.cex=cex.rg, bty="n", pch=21, bg=rgb(red=0, green=80, blue=0, max=255) )
        par(xpd=FALSE)
      }
      dev.off()
    }
  }
  
  if(!is.null(ramclustObj$clrt) & !is.null(ramclustObj$annconf)) {
    write.csv(
      file=paste0(out.dir, "/anova_pvalues.csv"), 
      data.frame(
        "cmpd"=ramclustObj$cmpd[cmpd.use], "annotation"=ramclustObj$ann[cmpd.use], 
        "annconf"=ramclustObj$annconf[cmpd.use], "rt"=ramclustObj$clrt[cmpd.use],
        t(mp), check.names=FALSE), row.names=FALSE)
  } else {
    write.csv(file="stats/anova/anova_pvalues.csv", mp, row.names=TRUE)
  }
  
  if(summary.statistics) {
    anova.call <- gsub("1|", "", anova.call, fixed = TRUE)
    f <- paste(ramclustObj$cmpd[1], "~", anova.call)
    # if(grepl("|", f, fixed = TRUE)) {
    #   f <- unlist(strsplit(f, "+", fixed = TRUE))
    #   for(i in length(f):1) {
    #     if(grepl("|", f[i], fixed = TRUE)) {
    #       f <- f[-i]
    #     }
    #   }
    # }
    test <- aggregate(as.formula(f), data = dat, FUN = "mean")
    fact.names <- names(test)[1:(ncol(test)-1)]
    n <- c("cmpd", fact.names, "statistic", "value")
    out <- data.frame(matrix(nrow = 0, ncol = length(n))); names(out) <- n
    template <- out
    for(i in 1:length(ramclustObj$cmpd)) {
      f <- paste(ramclustObj$cmpd[i], "~", anova.call)
      if(grepl("|", f, fixed = TRUE)) {
        f <- unlist(strsplit(f, "+", fixed = TRUE))
        for(i in length(f):1) {
          if(grepl("|", f[i], fixed = TRUE)) {
            f <- f[-i]
          }
        }
      }
      tmp <- template
      m <- aggregate(as.formula(f), data = dat, FUN = "mean")
      md <- aggregate(as.formula(f), data = dat, FUN = "median")
      s <- aggregate(as.formula(f), data = dat, FUN = "sd")
      se <- aggregate(as.formula(f), data = dat, FUN = function(x) sd(x)/sqrt(length(x)))
      for(j in fact.names) {
        tmp[1:nrow(m),j] <- m[1:nrow(m),j]
      }
      tmp[,"cmpd"] <- ramclustObj$cmpd[i]
      tmpmd <- tmp
      tmps <- tmp
      tmpse <- tmp
      tmp[, "statistic"] <- "mean"
      tmpmd[, "statistic"] <- "median"
      tmps[,"statistic"] <- "sd"
      tmpse[,"statistic"] <- "se"
      tmp[,"value"] <- m[,ncol(m)]
      tmpmd[,"value"] <- md[,ncol(md)]
      tmps[,"value"] <- s[,ncol(s)]
      tmpse[,"value"] <- se[,ncol(se)]
      tmp <- rbind(tmp, tmpmd, tmps, tmpse)
      out <- rbind(out, tmp)
    }
    write.csv(out, file = paste0("stats/anova/", anova.name, "/summary.statistics.csv"), row.names = FALSE)
  }
  
  return(ramclustObj)
}
cbroeckl/csu.pmf.tools documentation built on Jan. 26, 2024, 6:27 p.m.