R/qc.R

Defines functions massPattern

#!/usr/bin/env Rscript

###############################################################################
# massPattern

# Copyright (c) 2018 Broad Institute

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
###############################################################################

# load in the required libraries, report an error if they are not installed
for( lib in c('tibble','logging','data.table','dplyr','ggrepel', 'ggplot2', 'scales', 'gridExtra', 'future', 'plyr', 'cowplot')) {
  if(! suppressPackageStartupMessages(require(lib, character.only=TRUE)) ) stop(paste("Please install the R package: ",lib))
}
###############################################################
# If running on the command line, load other qc modules #
###############################################################

# this evaluates to true if script is being called directly as an executable
if(identical(environment(), globalenv()) &&
   !length( grep( "^source\\(", sys.calls()))) {

  # source all R in qc package, relative to this folder
  script_options <-commandArgs(trailingOnly = FALSE)
  script_path <- sub("--file=","",script_options[grep("--file=", script_options)])
  script_dir <- dirname(script_path)
  script_name <- basename(script_path)

  for(R_file in dir(script_dir, pattern = "*.R"))
  {
    if(! ( R_file == script_name )) source( file.path(script_dir, R_file) )
  }
}



massPattern <- function(input, r2_threshold = 0.90, q_threshold = 0.0001, coef_threshold = 0.1,
                        min_prevalence = 2, output_path = './', outputname = 'results.txt'){
  dir.create(file.path(output_path), showWarnings = FALSE)
  #print (c(r2_threshold , q_threshold, coef_threshold, min_prevalence))
  loaded_data <- load_data(input=input, type='all', name = 'Compound')
  data <- loaded_data$data
  data <- numeric_dataframe(data)
  sample_metadata <- loaded_data$sample_metadata
  feature_metadata <- loaded_data$feature_metadata


  data<- as.data.frame(t(data))
  metadata<- as.data.frame(t(sample_metadata))
  metadata <- as.data.frame(metadata["injection order"], drop = F)
  colnames(metadata) <- "injection_order"

  metadata <- numeric_dataframe(metadata)
  #metadata$injection_order <- as.numeric(metadata$injection_order) * 1.0

  data <- filter_by_prevelance(data, prev= min_prevalence, axis = 2)

  # fit linear model
  fit_data <- fit.data(features = data2, metadata = metadata, model = 'SLM')
  r2_distribution <- as.data.frame(fit_data$results$r2, drop = F)
  r2_distribution$rank <- NA
  colnames(r2_distribution)[1]<- 'R2'
  r2_distribution<- r2_distribution[order(r2_distribution$R2),]
  r2_distribution$rank <- seq.int(nrow(r2_distribution))
  colnames(r2_distribution) <- c("y", "x")
  pdf(paste(output_path,'/r2_distribution.pdf', sep=''), width = 2.75, height = 2.25, onefile=TRUE)
  r2_distribution_plot <- ggplot2::ggplot(data=r2_distribution,
                  ggplot2::aes(x=as.numeric(as.character(x)), y=as.numeric(as.character(y)))) +
    ggplot2::geom_point( color = 'black', alpha = .1, shape = 21, size = 2, stroke = 0.1) +
    ggplot2::scale_x_continuous(limits=c(min(r2_distribution['x']), max(r2_distribution['x'])))+
    ggplot2::scale_y_continuous(limits=c(min(r2_distribution['y']), max(r2_distribution['y'])))+
    ggplot2::guides(alpha='none')+ggplot2::labs("")+
    ggplot2::labs(title='Distribution of coefficient of determination', x='rank of R2', y='R2' )+ theme_nature()

  stdout <- capture.output(print(r2_distribution_plot),type="message")
  if (length(stdout)>0) logging::logdebug(stdout)
  dev.off()
  print("r2_distribution_plot done!")
  # filter for high correlated features with injection order
  to_be_removed <- fit_data$results[which(fit_data$results$r2 >= r2_threshold &
                                          fit_data$results$qval <= q_threshold &
                                          abs(fit_data$results$coef) >= coef_threshold)  , 'feature']

  filtered_data <- data[!rownames(data) %in% to_be_removed,]
  removed_data <- data[rownames(data) %in% to_be_removed,]

  df <- readxl::read_excel(input, sheet = 1, col_names = FALSE)
  df2 <- tibble::add_column(as.data.frame(df), to_be_removed = NA, .after = 1)

  df2$to_be_removed <- ifelse(df2[,1] %in% to_be_removed, "Yes", "No")
  write.table( df2, paste(output_path,'/',outputname, sep=''),
               sep = "\t", eol = "\n", quote = F, col.names = F, row.names = F)
  write.table( fit_data$results, paste(output_path,'/stats.txt', sep=''),
               sep = "\t", eol = "\n", quote = F, col.names = NA, row.names = T)
  # plot one example
  pool <- ifelse(startsWith(rownames(data2), "PREF"), 2, 1)#'Refrence pool', 'Sample')
  if (length(to_be_removed) > 0){
    pdf(paste(output_path,'/features_associated_w_injection_order_r2.pdf', sep=''), width = 2.75, height = 2.25, onefile=TRUE)
    for (i in 1:length(to_be_removed)){
      input_df<- as.data.frame(cbind(metadata$injection_order, data2[to_be_removed[i]], pool))
      x_label<- "Injection order"
      y_label <- "Intensity"
      title <- to_be_removed[i]
      colnames(input_df)<- c('x', 'y', 'pool')
      temp_plot <- ggplot2::ggplot(data=input_df,
                      ggplot2::aes(as.numeric(as.character(x)), as.numeric(as.character(y)))) +
        ggplot2::geom_point( fill = pool, color = 'black', alpha = .75, shape = 21, size = 2, stroke = 0.1) +
        ggplot2::scale_x_continuous(limits=c(min(input_df['x']), max(input_df['x'])))+
        ggplot2::scale_fill_manual(values=c("darkolivegreen4", "orange"))+
        ggplot2::scale_y_continuous(limits=c(min(input_df['y']), max(input_df['y'])))+
        ggplot2::stat_smooth(method = "glm", color ='blue', na.rm = T)+
        ggplot2::guides(alpha='none')+ggplot2::labs("")+
        ggplot2::labs(title=title, x=x_label, y=y_label)+ theme_nature()
      stdout <- capture.output(print(temp_plot),type="message")
      if (length(stdout)>0) logging::logdebug(stdout)
    }
    dev.off()
    print("diagnostics plots done!")
  }
}
broadinstitute/massPattern documentation built on Oct. 9, 2019, 10:56 p.m.