In ningzhibin/rmdocpu: Metalab report upon provided templates

  htmltools::img(src = "https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/rmd/iMetaReport.png", 
               alt = 'logo', 
               style = 'position:absolute; top:0; right:0; padding:10px;width:100px;height:100px;')

# enviroment setup
knitr::opts_chunk$set(echo = FALSE,warning = FALSE, message = FALSE, cache = FALSE)

library(tidyverse)
library(ggplot2)
library(plotly)

# other reqruied package:
# DT # requred, but not need to library


# Version update!
# 20200206 add fully support for pdf and word format, see how to use down below.
#           fix the problem for displaying ggplotly on html in a row by adding a break line. Otherwise it will only show the last one
#           This is because of the tag thing in html
#           add a dynamic control of the figure height, when samples too many, increase the figure height, otherwise it is not recoganizable on static report, like pdf and  word
#           add a parameterized control of rendering to html or static format, for html one, use ggplotly, for static one, use ggplot2
#           fix some bugs, imcompatible with ggplotly
#           fix some bugs, to make it the run more efficiently
#           add a function to order the rawfiles according to the user input order, if grouping information provided, order according to the groups first
#           adda function to report the group information, provided or not, grouping information if provided, error message if not provided but not matched with the raw files, or with missing values, which can help users to double check the meta table setting
#           learned A lot of new stuff for rmarkdown. like how to use r variables/values in r section settings(like the figure_height in this document), or directly used out of r blocks, like the meta inform in this documents. 
#

# local usage test and debug simple file
df_summary.txt  <- read.delim("summary.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
rmarkdown::render("MQ_report_summary_indev.Rmd", params = list(summary_file_tbl =  df_summary.txt))
rmarkdown::render("MQ_report_summary_indev.Rmd",output_format = "pdf_document", params = list(summary_file_tbl =  df_summary.txt,if_html =FALSE))
rmarkdown::render("MQ_report_summary_indev.Rmd",output_format = "word_document", params = list(summary_file_tbl =  df_summary.txt,if_html =FALSE))

# local usage test and debug: big file
df_summary.txt  <- read.delim("22122017_Colon_Aging_summary.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
rmarkdown::render("MQ_report_summary_indev.Rmd", params = list(summary_file_tbl =  df_summary.txt))
rmarkdown::render("MQ_report_summary_indev.Rmd",output_format = "pdf_document", params = list(summary_file_tbl =  df_summary.txt,if_html =FALSE))
rmarkdown::render("MQ_report_summary_indev.Rmd",output_format = "word_document", params = list(summary_file_tbl =  df_summary.txt,if_html =FALSE))


# local usage test and debug, big file without meta
df_summary.txt  <- read.delim("22122017_Colon_Aging_summary.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
rmarkdown::render("MQ_report_summary_indev.Rmd", params = list(summary_file_tbl =  df_summary.txt))
rmarkdown::render("MQ_report_summary_indev.Rmd",output_format = "pdf_document", params = list(summary_file_tbl =  df_summary.txt,if_html =FALSE))
rmarkdown::render("MQ_report_summary_indev.Rmd",output_format = "word_document", params = list(summary_file_tbl =  df_summary.txt,if_html =FALSE))

# local usage test and debug, big file with meta
df_summary.txt  <- read.delim("22122017_Colon_Aging_summary.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
meta_table <- read.delim("meta.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)

rmarkdown::render("MQ_report_summary_indev.Rmd", params = list(summary_file_tbl =  df_summary.txt, meta_table = meta_table))
rmarkdown::render("MQ_report_summary_indev.Rmd",output_format = "pdf_document", params = list(summary_file_tbl =  df_summary.txt, meta_table = meta_table,if_html =FALSE))
rmarkdown::render("MQ_report_summary_indev.Rmd",output_format = "word_document", params = list(summary_file_tbl =  df_summary.txt, meta_table = meta_table,if_html =FALSE))

tidy_MQ_summary <- function(df_summary.txt){
  # the format of summary.txt mainly fall into three categories

  # df_summary.txt has to be a tidyverse tbl format, from read_tsv
  # take out the last line to organize into a data.frame

  last_line <- t(df_summary.txt[nrow(df_summary.txt),])
  last_line[last_line[,1] == ""] <- NA
  last_line[last_line[,1] == "0"] <- NA
  last_line <- last_line[-1,, drop = FALSE]
  last_line <- last_line[which(!is.na(last_line[,1])),, drop =  FALSE]
  colnames(last_line) <- ("values")

  summary <- df_summary.txt[-nrow(df_summary.txt),] # remove the last line

  # take out rows about raw files summary 

  # if there are experiment design column, 
  if(length(grep("Experiment", colnames(summary)))>0){

    # and if there are separate rows for experimental desgin,
    # otherwise, expereiment desgin is set with one raw file one experiment 
    if(length(which(nchar(summary$Experiment) ==0)) >0){ # this 
     df_rawfiles <-summary[which(nchar(summary$Experiment) >0),]

      # take out rows about experiment summary
      df_experiment <- summary[which(nchar(summary$Experiment) ==0),]

      return(list("summary_all" = last_line,
                  "summary_rawfiles" = df_rawfiles,
                  "summary_experiment" = df_experiment,
                  "set_experiment" = TRUE

      ))
    }else{

     df_rawfiles <-  summary
      return(list("summary_all" = last_line,
                  "summary_rawfiles" = df_rawfiles,
                  "set_experiment" = FALSE # even with experiment setup, still no need to do

      ))

    }

    # otherwise no need to do separate experiment display

  }else{
      df_rawfiles <-  summary
      return(list("summary_all" = last_line,
                  "summary_rawfiles" = df_rawfiles,
                  "set_experiment" = FALSE

      ))

  }

}



MQ_QC_plot<- function(data.frame, 
                      plot_type = c("scatter", "bar", "density", "histogram", "freqpoly", "box", "violin") ,
                      group = NULL, # needs to be column name
                      cutoff = 20, 
                      maintitle = "", 
                      xlabel = "",
                      vertical =  FALSE,
                      ...
                ){

  # in case some column names are not valid names (containing special symbol, like space, % etc)

  names(data.frame)[1] <- "names"
  names(data.frame)[2] <- "value"

  if(length(plot_type) == 0){ # if no plot type defined, do not plot and exit
    stop
  }else{
    plot_out <- list()
  }


  if(is.null(group)){ # for none grouped 
    # this will enrsure the default plottinng order is followin the input order of the summary.txt,
    data.frame$names <- factor(data.frame$names, levels = data.frame$names)

    #for violinplot and boxplot, a fake group is needed
    group <- "All"
    data.frame$All = "All"

    #plotting

    if("scatter" %in% plot_type){
      scatter_plot <- ggplot(data.frame) + 
        geom_point(aes_string(x = "names", y = "value")) +
        geom_hline(yintercept = cutoff, linetype="dashed", color = "blue", size=1) +
        labs(title = maintitle, x = "", y = xlabel) + 
        theme_bw() +
        theme(plot.title = element_text(hjust = 0.5),
              axis.text.x = element_text(angle = 90, hjust = 1),
              panel.grid = element_blank()) + 
        coord_flip()



      plot_out <- c(plot_out, list("scatter_plot" = scatter_plot))
    }



  if("bar" %in% plot_type){
    bar_plot <- ggplot(data.frame) +
      geom_hline(yintercept = cutoff, linetype="dashed", color = "blue", size=1) +
      geom_col(aes_string(x = "names", y = "value"))+

      labs(title = maintitle, x = "",y = xlabel) + 
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5),
            axis.text.x = element_text(angle = 90, hjust = 1),
            panel.grid = element_blank())+ 
      coord_flip()

    plot_out <- c(plot_out, list("bar_plot" = bar_plot))

  }


    if("freqpoly" %in% plot_type){
        # distritibution
        freqpoly_plot <- ggplot(data.frame) +
          annotate("rect", xmin=-Inf, xmax= cutoff, ymin=0, ymax=Inf, alpha=0.1, fill="red") +
          annotate("rect", xmin=cutoff, xmax=Inf, ymin=0, ymax=Inf, alpha=0.5, fill="lightblue") +
          geom_freqpoly(aes_string("value") )+
          geom_vline(xintercept = cutoff, linetype="dashed", color = "blue", size=1)+
          labs(title = maintitle,  x = "",y = xlabel) + 
          theme_bw() +
          theme(plot.title = element_text(hjust = 0.5),
                axis.text.x = element_text(angle = 90, hjust = 1),
                panel.grid = element_blank())
        if(vertical){
          freqpoly_plot <- freqpoly_plot + coord_flip()
        }

        plot_out <- c(plot_out, list("freqpoly_plot" = freqpoly_plot))
      }

      if("histogram" %in% plot_type){
        histogram_plot <- ggplot(data.frame) +
          annotate("rect", xmin=-Inf, xmax= cutoff, ymin=0, ymax=Inf, alpha=0.1, fill="red") +
          annotate("rect", xmin=cutoff, xmax=Inf, ymin=0, ymax=Inf, alpha=0.5, fill="lightblue") +
          geom_histogram(aes_string("value"),position = "identity",alpha = 0.5)+
          geom_vline(xintercept = cutoff, linetype="dashed", color = "blue", size=1)+
          labs(title = maintitle,  x = "",y = xlabel) + 
          theme_bw() +
          theme(plot.title = element_text(hjust = 0.5),
                axis.text.x = element_text(angle = 90, hjust = 1),
                panel.grid = element_blank())
        if(vertical){
          histogram_plot <- histogram_plot + coord_flip()
        }
        plot_out <- c(plot_out, list("histogram_plot" = histogram_plot))
      }



      if("density" %in% plot_type){
        density_plot <-ggplot(data.frame) +
          annotate("rect", xmin=-Inf, xmax= cutoff, ymin=0, ymax=Inf, alpha=0.1, fill="red") +
          annotate("rect", xmin=cutoff, xmax=Inf, ymin=0, ymax=Inf, alpha=0.5, fill="lightblue") +
          geom_density(aes_string("value"),position = "identity",alpha = 0.5) +
          geom_vline(xintercept = cutoff, linetype="dashed", color = "blue", size=1)+
          labs(title = maintitle,  x = "",y = xlabel) + 
          theme_bw() +
          theme(plot.title = element_text(hjust = 0.5),
                axis.text.x = element_text(angle = 90, hjust = 1),
                panel.grid = element_blank())
        if(vertical){
          density_plot <- density_plot + coord_flip()
        }
        plot_out <- c(plot_out, list("density_plot" = density_plot))
      }


  }else{
    # if grouping provided, order by group
    data.frame$names <- factor(data.frame$names, levels = data.frame$names[order(data.frame[group])])

    if("scatter" %in% plot_type){
    scatter_plot <- ggplot(data.frame) + 
      geom_point(aes_string(x = "names", y = "value", colour = group)) +
      geom_hline(yintercept = cutoff, linetype="dashed", color = "blue", size=1) +
      labs(title = maintitle, x = "", y = xlabel) + 
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5),
            axis.text.x = element_text(angle = 90, hjust = 1),
            panel.grid = element_blank())+ 
      coord_flip()



    plot_out <- c(plot_out, list("scatter_plot" = scatter_plot))
  }
  if("bar" %in% plot_type){
    bar_plot <- ggplot(data.frame) +
      geom_hline(yintercept = cutoff, linetype="dashed", color = "blue", size=1) +
      geom_col(aes_string(x = "names", y = "value", fill = group))+
      labs(title = maintitle, x = "",y = xlabel) + 
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5),
            axis.text.x = element_text(angle = 90, hjust = 1),
            panel.grid = element_blank())+ 
      coord_flip()


    plot_out <- c(plot_out, list("bar_plot" = bar_plot))

  }
  if("freqpoly" %in% plot_type){
    # distritibution
    freqpoly_plot <- ggplot(data.frame) +
      annotate("rect", xmin=-Inf, xmax= cutoff, ymin=0, ymax=Inf, alpha=0.1, fill="red") +
      annotate("rect", xmin=cutoff, xmax=Inf, ymin=0, ymax=Inf, alpha=0.5, fill="lightblue") +
      geom_freqpoly(aes_string("value",colour = group) )+
      geom_vline(xintercept = cutoff, linetype="dashed", color = "blue", size=1)+
      labs(title = maintitle,  x = "",y = xlabel) + 
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5),
            axis.text.x = element_text(angle = 90, hjust = 1),
            panel.grid = element_blank())
    if(vertical){
      freqpoly_plot <- freqpoly_plot + coord_flip()
    }

    plot_out <- c(plot_out, list("freqpoly_plot" = freqpoly_plot))
  }

  if("histogram" %in% plot_type){
    histogram_plot <- ggplot(data.frame) +
      annotate("rect", xmin=-Inf, xmax= cutoff, ymin=0, ymax=Inf, alpha=0.1, fill="red") +
      annotate("rect", xmin=cutoff, xmax=Inf, ymin=0, ymax=Inf, alpha=0.5, fill="lightblue") +
      geom_histogram(aes_string("value", colour = group, fill = group),position = "identity",alpha = 0.5)+
      geom_vline(xintercept = cutoff, linetype="dashed", color = "blue", size=1)+
      labs(title = maintitle,  x = "",y = xlabel) + 
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5),
            axis.text.x = element_text(angle = 90, hjust = 1),
            panel.grid = element_blank())
    if(vertical){
      histogram_plot <- histogram_plot + coord_flip()
    }
    plot_out <- c(plot_out, list("histogram_plot" = histogram_plot))
  }



  if("density" %in% plot_type){
    density_plot <-ggplot(data.frame) +
      annotate("rect", xmin=-Inf, xmax= cutoff, ymin=0, ymax=Inf, alpha=0.1, fill="red") +
      annotate("rect", xmin=cutoff, xmax=Inf, ymin=0, ymax=Inf, alpha=0.5, fill="lightblue") +
      geom_density(aes_string("value", colour = group, fill = group),position = "identity",alpha = 0.5) +
      geom_vline(xintercept = cutoff, linetype="dashed", color = "blue", size=1)+
      labs(title = maintitle,  x = "",y = xlabel) + 
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5),
            axis.text.x = element_text(angle = 90, hjust = 1),
            panel.grid = element_blank())
    if(vertical){
      density_plot <- density_plot + coord_flip()
    }
    plot_out <- c(plot_out, list("density_plot" = density_plot))
  }

}

  # the following 2 plots have to use grouping information for plotting, even with only 1 group   
  if("violin" %in% plot_type){
    violin_plot <- ggplot(data.frame) +
      geom_violin(aes_string(x =group,  y = "value", colour = group, fill = group))+
      geom_jitter(aes_string(x =group,  y = "value",colour = group, fill = group),shape=21)  +
      geom_hline(yintercept = cutoff, linetype="dashed", color = "blue", size=1) +
      labs(title = maintitle,  x = "",y = xlabel) + 
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5),
            axis.text.x = element_text(angle = 90, hjust = 1),
            panel.grid = element_blank())
    if(vertical){
      violin_plot <- violin_plot + coord_flip()
    }
    plot_out <- c(plot_out, list("violin_plot" = violin_plot))

  }


  if("box" %in% plot_type){
    box_plot <- ggplot(data.frame) +
      geom_boxplot(aes_string(x =group,  y = "value", colour = group, fill = group))+
      geom_jitter(aes_string(x =group,  y = "value",colour = group, fill = group),shape=21)  +
      geom_hline(yintercept = cutoff, linetype="dashed", color = "blue", size=1) +
      labs(title = maintitle,  x = "",y = xlabel) + 
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5),
            axis.text.x = element_text(angle = 90, hjust = 1),
            panel.grid = element_blank())
    if(vertical){
      box_plot <- box_plot + coord_flip()
    }
    plot_out <- c(plot_out, list("box_plot" = box_plot))

  }




  return(plot_out) 

  # Example
  # my_test <- data.frame("samplename" = paste0("sample_", 1:20),
  #                       "msms_id" = c(abs(rnorm(10))*10+20, abs(rnorm(10))*10+30),
  #                       "treat_group" = c(paste0("group_", rep("A",10)), paste0("group_", rep("B",10)))
  # )
  # 
  # tt <-  MQ_QC_plot(my_test, plot_type = c("scatter","bar","density", "histogram", "freqpoly", "box", "violin"), cutoff = 35, group = "treat_group", maintitle = "MSMS ID Rate", xlabel = "MS/MS ID %")
  # now tt has all the required plot


}

# this function is borrowed from: http://michaeljw.com/blog/post/subchunkify/
# subchunkify <- function(g, fig_height=7, fig_width=5) { # g is a ggplot object
#   g_deparsed <- paste0(deparse(
#     function() {g}
#   ), collapse = '')
# 
#   sub_chunk <- paste0("
#   `","``{r sub_chunk_", floor(runif(1) * 10000), ", fig.height=",
#    fig_height, ", fig.width=", fig_width, ", echo=FALSE}",
#   "\n(", 
#     g_deparsed
#     , ")()",
#   "\n`","``
#   ")
# 
#   cat(knitr::knit(text = knitr::knit_expand(text = sub_chunk), quiet = TRUE))
# }

Intro

This report provides some overall description of the database search. Users can use this to quickly check the overal quality of the experiment

# input
if(is.null(params$summary_file_tbl)){
  # test with local files
  df_summary.txt  <- read.delim("summary.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)

}else{
  # opencpu render from data table by parametrized input
  df_summary.txt <- params$summary_file_tbl
}

# Note: some following analysis with meta infor assumes that
# 1st columns as sample name, 2nd column as experiment name, 3rd column and after as grouping

meta_table <- params$meta_table

# process the file

summary_file_summary <- tidy_MQ_summary(df_summary.txt)

# get the raw stat line for output
df_all <- as.data.frame(t(summary_file_summary$summary_all))

# get the raw file information dataframe
raw <-summary_file_summary$summary_rawfiles

data.frame <- data.frame(rawfile= raw[["Raw file"]], msms_id_rate = raw[["MS/MS Identified [%]"]])



if( is.null(meta_table)){ 

  meta_info <- "* **No meta information provided**"

}else if (any(is.na(meta_table))){ 

  meta_info <-  "* **The meta information provided has missing values, please check again**"

}else if(!(all(as.vector(data.frame$rawfile) %in% meta_table[,1]) && all( meta_table[,1] %in% as.vector(data.frame$rawfile)))){
  meta_info <-  "* **The raw files in meta information provided do not match the raw file names in the summary.txt, please check again**"

}else{
 meta_info <-   c("* **Groups: **",unique(meta_table[,3]))  
}

Take-home figures

Peptide Sequences Identified in total: r df_all["Peptide Sequences Identified"]
Avearge ms/ms identification rate(%): r df_all["MS/MS Identified [%]"]

r meta_info

MSMS id rate

Why you should pay attention to MSMS Id rate?

MS ID rate is a good repretation of the MS run quality. Raw files from Q-Exactive series should have roughly around 50% ms ID rate (Percentage of MSMS spectra identified as peptided, at a 1% FDR) for humane cell culture digest, and at least 20% for metaproteomics samples according to experience.
MS ID rate should also be well-reproduced across samples and groupings.
Check the raw files if they have obnormally low ID rate, usually with abnormal LC/basepeak profile or low MS intensity.
A decreasing MS ID treand along sample running order indicates a performance drop of the MS: your MS might need to be cleaned. If the performance drops a lot, more than 20% within running time for the whole project, without scramble of the sample run-order, the data might not be usable, unless very careflly caliburated.

# determin the figure height. if some files have too many samples, it would be a good idea to increase the height to make it easier to 

figurue_height <- 0.1*nrow(data.frame)+2


if( is.null(meta_table)){ 
  # plot without meta
    plots_msms <-  MQ_QC_plot(data.frame, plot_type = c("scatter","density", "box"), cutoff = 20,maintitle = "MSMS ID Rate", xlabel = "MS/MS ID %")

}else{

  # put the grouping information into the data.frame
    data.frame_merged_msms <- merge(data.frame, meta_table, by.x= colnames(data.frame)[1], by.y = colnames(meta_table)[1])

  # the plot, with meta, by the first column of grouping information, which is column 4 (column 3 as experiment)
  # here, can add suport for more grouping 

    plots_msms <-  MQ_QC_plot(data.frame_merged_msms, plot_type = c("scatter","density","box"), group = colnames(data.frame_merged_msms)[4],cutoff = 20,maintitle = "MSMS ID Rate", xlabel = "MS/MS ID %")


}

if(params$if_html){
 plotly::ggplotly(plots_msms$scatter_plot)
}else{
 plots_msms$scatter_plot
}

if(params$if_html){
  plotly::ggplotly(plots_msms$density_plot)
}else{
  plots_msms$density_plot
}

if(params$if_html){
  plotly::ggplotly(plots_msms$box_plot)
}else{
  plots_msms$box_plot
}

Peptide Sequence

data.frame_peptide <- data.frame(rawfile= raw[["Raw file"]], Peptide_sequence_identified = raw[["Peptide Sequences Identified"]])

if(is.null(meta_table)){ 
 # plot without meta
    plots_peptide <-  MQ_QC_plot(data.frame_peptide, plot_type = c("scatter","density", "box"), cutoff = 3000,maintitle = "Peptide Sequences Identified", xlabel = "Peptide Sequences Identified")

}else{
# put the grouping information into the data.frame
    data.frame_merged_peptide <- merge(data.frame_peptide, meta_table, by.x= colnames(data.frame_peptide)[1], by.y = colnames(meta_table)[1])

    # the plot, with meta, by the first column of grouping information, which is column 4 (column 3 as experiment)
    # here, can add suport for 

    plots_peptide <-  MQ_QC_plot(data.frame_merged_peptide, plot_type = c("scatter","density", "box"), group = colnames(data.frame_merged_peptide)[4],cutoff = 0,maintitle = "Peptide Sequences Identified", xlabel = "Peptide Sequences Identified")

}

if(params$if_html){
  plotly::ggplotly(plots_peptide$scatter_plot)
}else{
  plots_peptide$scatter_plot
}

if(params$if_html){
  plotly::ggplotly(plots_peptide$density_plot)
}else{
  plots_peptide$density_plot
}

if(params$if_html){

  plotly::ggplotly(plots_peptide$box_plot)
}else{
  plots_peptide$box_plot
}

Overall Performance

check the overall performance for all raw files:

if(params$if_html){
  nrows <- nrow(summary_file_summary$summary_all)
  DT::datatable(summary_file_summary$summary_all,options = list(pageLength = nrows)) # datatable is only good for html 
}else{
  knitr::kable(summary_file_summary$summary_all)
}