R/ms_IDIT_QC.R

Defines functions ms_IDIT_QC

Documented in ms_IDIT_QC

#' ms_IDIT_QC
#'
#' Function to generate a QC report for the ITDR or ITTR dataset
#'
#' @param data ITDR or ITTR dataset to perform QC analysis
#' @param foldername name for the QC result folder
#' @param reportname name for the QC report
#' @param nread number of reading channels or sample treatements, default value 10
#' @param simpleAUC whether to perform a simple calculation of AUC, default set to TRUE;
#' note it is a bit "tricky" to interpret the AUC value in logarithmic space for
#' most of ITDR data, suggest to just use simpleAUC by default
#' @param isdatafitted whether the provided data is associated with
#' fitting parameters, i.e., after ms_ITDR_fitting or ms_ITTR_fitting
#' @param PSMcutoff whether to apply PSM threshold cutoff on hit selection,
#' default set to TRUE
#' @param PSMthreshold the threshold of averaged PSM numbers to consider protein
#' quantification as reliable, default value is 3
#'
#' @import Nozzle.R1
#' @import VennDiagram
#' @importFrom grid grid.draw
#' @import tidyr
#' @importFrom plyr . dlply ddply
#' @import dplyr
#' @import RColorBrewer
#' @import ggplot2
#'
#' @export
#'
#' @return NULL
#' @examples \dontrun{
#' ms_IDIT_QC(ITDRdata_fitted)
#' ms_IDIT_QC(ITDRdata_scaled, isdatafitted=FALSE, foldername="QC1")
#' }
#'
#'


ms_IDIT_QC <- function(data, foldername=NULL, reportname=NULL, nread=10, simpleAUC=TRUE,
                       PSMcutoff=TRUE, PSMthreshold=3, isdatafitted=TRUE) {
  # provide the fitted data
  cat("Make sure you provide the scaled data with fitting parameters as input ")
  cat("for maximal utilization of this QC function. \n")

  dataname <- deparse(substitute(data))
  outdir <- ms_directory(data, dataname)$outdir
  data <- ms_directory(data, dataname)$data

  if (!length(foldername)) {
    foldername <- paste0(dataname, "_QCreports")
  }
  if (!length(reportname)) {
    reportname <- foldername
  }
  dir.create( paste0(outdir,"/",foldername), showWarnings=FALSE )

  #to separate condition into condition and replicates
  nset <- length(unique(data$condition))
  data <- tidyr::separate(data, condition, into=c("condition", "replicate"), sep="\\.")
  # list <- strsplit(as.character(data$condition), "\\.")
  # df <- ldply(list)
  # colnames(df) <- c("condition", "replicate")
  # data <- cbind(data[,c(1,2)], df, data[,-c(1:3)])
  ncondition <- length(unique(data$condition))
  nreplicate <- length(unique(data$replicate))
  #nset <- ncondition * nreplicate
  if (nset > 5) {cat("To plot venn diagram, maximum five unique data sets.\n")}

  data_N1 <- plyr::ddply(data, .(condition, replicate), summarise, Number_run=length(unique(id)))
  data_N2 <- plyr::ddply(data, .(condition), summarise, Number_sample=length(unique(id)))
  data_N <- merge(data_N1, data_N2)

  drawvennplot <- function (data, numberofgroup, foldername, plotname) {
    vennplot <- venn.diagram(
      x = data,
      euler.d = TRUE,
      scaled = TRUE,
      height = 8,
      width = 8,
      units = "in",
      resolution = 500,
      filename = NULL,
      lwd = 1,
      cex = 0.8,
      cat.cex = 1,
      cat.col = brewer.pal(8,"Dark2")[c(1:numberofgroup)],
      fill = brewer.pal(8,"Set2")[c(1:numberofgroup)],
      margin = 0.1
    )
    grDevices::png(paste0(outdir,"/",foldername,"/",plotname), width=8, height=8, units="in", res=200)
    grid::grid.draw(vennplot)
    dev.off()
  }

  if (nset>1 & nset<=5) {
    data_id1 <- plyr::dlply(data, .(condition, replicate), function(x) n=unique(x$id))
    drawvennplot(data_id1, nset, foldername, "Quantified_IDs_run.png")
  }
  if (ncondition>1 & ncondition<=5) {
    data_id2 <- plyr::dlply(data, .(condition), function(x) condition=unique(x$id))
    drawvennplot(data_id2, ncondition, foldername, "Quantified_IDs_sample.png")
  }

  d1 <- tidyr::gather(data[,c(1,3,5:(4+nread))], treatment, reading, -id, -condition)
  d1$treatment <- factor(d1$treatment, levels=sort(as.numeric(unique(d1$treatment)), decreasing=FALSE))
  q <- ggplot(d1, aes(x = treatment, y = reading)) +
    coord_cartesian(ylim = c(0,2)) +
    scale_y_continuous(breaks=c(0,0.8,1,1.25,2))
  q <- q + labs(x="Treatment", y="Normalized Ratio") +
    geom_boxplot(aes(fill=condition)) +
    geom_hline(yintercept=c(0.8,1.25), linetype = "longdash") +
    theme_bw() + theme(text = element_text(size=12),
                       axis.text.x = element_text(angle = 45,hjust = 1),
                       aspect.ratio=1)
  ggsave(filename = paste0(outdir,"/",foldername,"/","wholeset_trend.png"), q, height=8, width=8)


  data$STD <- apply(data[ ,c(5:(4+nread))], 1, sd)
  data <- mutate(data, CV=STD/rowMeans(data[,c(5:(4+nread))]))
  limit <- quantile(data$CV, 0.999)
  m <- ggplot(data, aes(x=CV, fill=condition)) + coord_cartesian(xlim = c(0,limit))
  m <- m + geom_histogram(binwidth = limit/100, alpha=0.5, position="identity") +
    labs(x="Reading coefficient of variation (CV)") + theme_bw() +
    theme(text = element_text(size=12), aspect.ratio=1)
  ggsave(filename = paste0(outdir,"/",foldername,"/","CV_distribution.png"), m, height=8, width=8)

  numdosevector <- as.numeric(names(data)[c(5:(nread+4))])

  if (simpleAUC==TRUE) {
    data$AUC <- rowMeans(data[ ,c(5:(nread+4))],na.rm=T)
    # data <- dplyr::mutate(data, AUC=rowSums(data[, c(5:(nread+4))],na.rm=T))
    # data <- dplyr::mutate(data, AUC=AUC/nread)
  } else {
    data$AUC <- apply(data, 1, function(x) auc(numdosevector, x[c(5:(nread+4))], type="linear"))
  }
  limit <- quantile(data$AUC, 0.999)
  m <- ggplot(data, aes(x=AUC, fill=condition)) + coord_cartesian(xlim = c(0,limit))
  m <- m + geom_histogram(binwidth = limit/100, alpha=0.5, position="identity") +
    labs(x="Area under the curve (AUC)") + theme_bw() +
    theme(text = element_text(size=12), aspect.ratio=1)
  ggsave(filename = paste0(outdir,"/",foldername,"/","AUC_distribution.png"), m, height=8, width=8)

  limity <- quantile(data$CV, 0.999)
  limitx <- quantile(data$AUC, 0.999)
  n1 <- ggplot(data, aes(x=AUC, y=CV)) + geom_point(aes(colour=condition),size=0.5,alpha=0.3) +
    coord_cartesian(xlim = c(0,limitx), ylim = c(0,limity))
  n1 <- n1 + facet_grid(.~condition+replicate) +
    labs(x="Reading Area Under the Curve",y="Reading coefficient of variation") +
    theme_bw() + theme(
      text = element_text(size=12),
      axis.text=element_text(size=6),
      legend.position="bottom", aspect.ratio=1)
  ggsave(filename = paste0(outdir,"/",foldername,"/","AUC_CV_distribution1.png"), n1, height=4, width=8)

  if (isdatafitted) {
    n2 <- ggplot(data, aes(x=R2, y=CV)) + geom_point(aes(colour=condition),size=0.5,alpha=0.3) +
      geom_vline(xintercept=0.8, colour="blue", linetype = "longdash") +
      coord_cartesian(ylim = c(0,limity))
    n2 <- n2 + labs(x="Reading R-squared",y="Reading coefficient of variation") +
      facet_grid( .~condition+replicate) + theme_bw() +
      theme(text = element_text(size=12), axis.text=element_text(size=6),
            legend.position="bottom", aspect.ratio=1)
    ggsave(filename = paste0(outdir,"/",foldername,"/","R2_CV_distribution1.png"), n2, height=4, width=8)

    n3 <- ggplot(data, aes(x=R2, y=AUC)) + geom_point(aes(colour=condition),size=0.5,alpha=0.3) +
      geom_vline(xintercept=0.8, colour="blue", linetype = "longdash") +
      coord_cartesian(ylim = c(0,20))
    n3 <- n3 + labs(x="Reading R-squared",y="Reading Area Under the Curve") +
      facet_grid( .~condition+replicate) + theme_bw() +
      theme(text = element_text(size=12), axis.text=element_text(size=6),
            legend.position="bottom", aspect.ratio=1)
    ggsave(filename = paste0(outdir,"/",foldername,"/","R2_AUC_distribution1.png"), n3, height=4, width=8)

    a<-data.frame(t(as.matrix(summary(data$CV))), stringsAsFactors=F, check.names=F)
    a$parameter <- "CV"
    b<-data.frame(t(as.matrix(summary(data$AUC))), stringsAsFactors=F, check.names=F)
    b$parameter <- "AUC"
    c<-data.frame(t(as.matrix(summary(data$R2))), stringsAsFactors=F, check.names=F)
    c$parameter <- "R2"
    R2CV<-plyr::rbind.fill(a,b)
    R2CV<-plyr::rbind.fill(R2CV,c)
    R2CV<-R2CV[ ,c((ncol(R2CV)-1),1:(ncol(R2CV)-2),ncol(R2CV))]
  } else {
    a<-data.frame(t(as.matrix(summary(data$CV))), stringsAsFactors=F, check.names=F)
    a$parameter <- "CV"
    b<-data.frame(t(as.matrix(summary(data$AUC))), stringsAsFactors=F, check.names=F)
    b$parameter <- "AUC"
    R2CV<-plyr::rbind.fill(a,b)
    R2CV<-R2CV[ ,c((ncol(R2CV)-1),1:(ncol(R2CV)-2),ncol(R2CV))]
  }
  #print(R2CV)

  if (PSMcutoff) {# To select the proteins with more than 3 PSM (average)
    PSMcol <- grep("PSM", names(data), value=FALSE)
    PSMname <- names(data)[PSMcol]
    names(data)[PSMcol] <- "PSM"
    PSMtable <- plyr::ddply(data, .(id), summarize, PSMmean=mean(PSM))

    PSMplot <- ggplot(PSMtable, aes(x = PSMmean)) + geom_histogram(fill="darkgray") + geom_vline(xintercept=3, colour="blue", linetype = "longdash")
    PSMplot <- PSMplot + labs(x="Average PSM number for each protein") + scale_x_log10() +
      theme_bw() + theme(text = element_text(size=12), axis.text.x = element_text(angle = 45,hjust = 1), aspect.ratio=1);
    ggsave(filename = paste0(outdir,"/",foldername,"/","PSM_distribution.png"), PSMplot, height=8, width=8);

    PSMtable <- subset(PSMtable, PSMmean>=PSMthreshold)
    nkeep <- PSMtable$id
    fkeep <- which(data$id %in% nkeep)
    names(data)[PSMcol] <- PSMname
    data_PSMsmall <- data[-fkeep, ]
    data_PSMbig <- data[fkeep, ]
  }

  data_N2 <- plyr::ddply(data_PSMbig, .(condition), summarise, Number_sample=length(unique(id)))
  data_N_PSMbig <- merge(data_N1, data_N2)
  limity <- quantile(data_PSMbig$CV, 0.999)
  limitx <- quantile(data_PSMbig$AUC, 0.999)
  n1 <- ggplot(data_PSMbig, aes(x=AUC, y=CV)) +
    geom_point(aes(colour=condition),size=0.5,alpha=0.3) +
    coord_cartesian(xlim = c(0,limitx), ylim = c(0,limity))
  n1 <- n1 + labs(x="Reading Area Under the Curve",y="Reading coefficient of variation") +
    facet_grid( .~condition+replicate) + theme_bw() +
    theme(text = element_text(size=12), axis.text=element_text(size=6),
          legend.position="bottom", aspect.ratio=1)
  ggsave(filename = paste0(outdir,"/",foldername,"/","AUC_CV_distribution2.png"), n1, height=4, width=8)

  if (isdatafitted) {
    n2 <- ggplot(data_PSMbig, aes(x=R2, y=CV)) +
      geom_point(aes(colour=condition),size=0.5,alpha=0.3) +
      geom_vline(xintercept=0.8, colour="blue", linetype = "longdash") +
      coord_cartesian(ylim = c(0,limity))
    n2 <- n2 + labs(x="Reading R-squared",y="Reading coefficient of variation") +
      facet_grid(.~condition+replicate) + theme_bw() +
      theme(text = element_text(size=12), axis.text=element_text(size=6),
            legend.position="bottom", aspect.ratio=1)
    ggsave(filename = paste0(outdir,"/",foldername,"/","R2_CV_distribution2.png"), n2, height=4, width=8)

    n3 <- ggplot(data_PSMbig, aes(x=R2, y=AUC)) + geom_point(aes(colour=condition),size=0.5,alpha=0.3) +
      geom_vline(xintercept=0.8, colour="blue", linetype = "longdash") +
      coord_cartesian(ylim = c(0,20))
    n3 <- n3 + labs(x="Reading R-squared",y="Reading Area Under the Curve") +
      facet_grid( .~condition+replicate) + theme_bw() +
      theme(text = element_text(size=12), axis.text=element_text(size=6),
            legend.position="bottom", aspect.ratio=1)
    ggsave(filename = paste0(outdir,"/",foldername,"/","R2_AUC_distribution2.png"), n3, height=4, width=8)

    a<-data.frame(t(as.matrix(summary(data_PSMbig$CV))), stringsAsFactors=F, check.names=F)
    a$parameter <- "CV"
    b<-data.frame(t(as.matrix(summary(data_PSMbig$AUC))), stringsAsFactors=F, check.names=F)
    b$parameter <- "AUC"
    c<-data.frame(t(as.matrix(summary(data_PSMbig$R2))), stringsAsFactors=F, check.names=F)
    c$parameter <- "R2"
    R2CV_PSMbig<-plyr::rbind.fill(a,b)
    R2CV_PSMbig<-plyr::rbind.fill(R2CV_PSMbig,c)
    R2CV_PSMbig<-R2CV_PSMbig[ ,c((ncol(R2CV_PSMbig)-1),1:(ncol(R2CV_PSMbig)-2),ncol(R2CV_PSMbig))]
  } else {
    a<-data.frame(t(as.matrix(summary(data_PSMbig$CV))), stringsAsFactors=F, check.names=F)
    a$parameter <- "CV"
    b<-data.frame(t(as.matrix(summary(data_PSMbig$AUC))), stringsAsFactors=F, check.names=F)
    b$parameter <- "AUC"
    R2CV_PSMbig<-plyr::rbind.fill(a,b)
    R2CV_PSMbig<-R2CV_PSMbig[ ,c((ncol(R2CV_PSMbig)-1),1:(ncol(R2CV_PSMbig)-2),ncol(R2CV_PSMbig))]
  }
  #print(R2CV_PSMbig)

  r <- newCustomReport( reportname )
  ss1 <- newSection( "Quantified Protein Numbers" )
  ss2 <- newSection( "Quantified Protein Overlapping" )
  ss3 <- newSection( "The distributions of readings (grouped by 10plex-curve)" )
  ss4 <- newSection( "The distributions of readings (grouped by 10plex-curve) in PSM>3 group" )

  if (nset>1 & nset<=5) {
    fig1 <- newFigure( "Quantified_IDs_run.png", "The overlapping of proteins from each run.")
  } else {
    fig1 <- NULL
  }
  if (ncondition>1 & ncondition<=5) {
    fig2 <- newFigure( "Quantified_IDs_sample.png", "The overlapping of proteins from each sample.")
  } else {
    fig2 <- NULL
  }
  fig3 <- newFigure( "wholeset_trend.png", "The distribution of readings in each channel.")
  fig4 <- newFigure( "AUC_distribution.png", "The distribution of AUCs of readings per curve in each condition.")
  fig5 <- newFigure( "CV_distribution.png", "The distribution of CVs of readings per curve in each condition.")

  fig6 <- newFigure( "AUC_CV_distribution1.png", "The distribution of AUC vs CV of readings per curve in each condition.")
  fig7 <- newFigure( "AUC_CV_distribution2.png", "The distribution of AUC vs CV of readings per curve in each condition from PSM>3 group.")
  if (isdatafitted) {
    fig8 <- newFigure( "R2_CV_distribution1.png", "The distribution of R2 vs CV of readings per curve in each condition.")
    fig9 <- newFigure( "R2_CV_distribution2.png", "The distribution of R2 vs CV of readings per curve in each condition from PSM>3 group.")
    fig10 <- newFigure( "R2_AUC_distribution1.png", "The distribution of R2 vs AUC of readings per curve in each condition.")
    fig11 <- newFigure( "R2_AUC_distribution2.png", "The distribution of R2 vs AUC of readings per curve in each condition from PSM>3 group.")
  } else {
    fig8 <- NULL
    fig9 <- NULL
    fig10 <- NULL
    fig11 <- NULL
  }
  fig12 <- newFigure( "PSM_distribution.png", "The distribution of PSMs for each quantified protein")

  t1 <- newTable( data_N, paste0("The number of quantified proteins for in total ", length(unique(data$id))), " unique proteins." )
  t2 <- newTable( R2CV, "The distribution of CV and R2 of readings (grouped by 10plex-curve)." )
  t3 <- newTable( data_N_PSMbig, paste0("The number of quantified proteins in PSM>3 group for in total ", length(unique(data_PSMbig$id))), " unique proteins." )
  t4 <- newTable( R2CV_PSMbig, "The distribution of CV and R2 of readings (grouped by 10plex-curve) in PSM>3 group." )
  p1 <- newParagraph( "The coefficient of variation (CV) is defined as the ratio of standard deviation to mean of ten readings in one TMT10 set defined curve.")
  p2 <- newParagraph( "Note that to prevent the extreme values compressing the plot, only up to 99.9% percentile of CV distribution in the data was plotted." )
  p3 <- newParagraph( "PSM>3 group is the subset of quantified proteins with on average more than 3 Peptide Spectrum Match (PSM) support." );

  ss1 <- addTo( ss1, t1)
  if (ncondition>1) {
    ss2 <- addTo( ss2, fig1, fig2 ) # parent, child_1, ..., child_n
  } else {
    ss2 <- addTo( ss2, fig1)
  }
  if (isdatafitted) {
    ss3 <- addTo( ss3, t2, p1, fig3, fig4, fig5, fig6, fig8, fig10, p2 )
    ss4 <- addTo( ss4, p3, fig12, t3, t4, fig7, fig9, fig11, p2 )
  } else {
    ss3 <- addTo( ss3, t2, p1, fig3, fig4, fig5, fig6, p2 )
    ss4 <- addTo( ss4, p3, fig12, t3, t4, fig7, p2 )
  }
  r <- addTo( r, ss1, ss2, ss3, ss4 )
  writeReport( r, filename=paste0(outdir,"/",foldername, "/", reportname )) # w/o extension
  rm(p1, p2, p3, r, ss1, ss2, ss3, ss4, t1, t2, t3, t4, fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8, fig9, fig10, fig11, fig12)

}
nkdailingyun/mineCETSA documentation built on Feb. 27, 2021, 8:26 p.m.