#!/usr/bin/env Rscript
###############################################################################
# massPattern
# Copyright (c) 2018 Broad Institute
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
###############################################################################
# load in the required libraries, report an error if they are not installed
for( lib in c('tibble','logging','data.table','dplyr','ggrepel', 'ggplot2', 'scales', 'gridExtra', 'future', 'plyr', 'cowplot')) {
if(! suppressPackageStartupMessages(require(lib, character.only=TRUE)) ) stop(paste("Please install the R package: ",lib))
}
###############################################################
# If running on the command line, load other qc modules #
###############################################################
# this evaluates to true if script is being called directly as an executable
if(identical(environment(), globalenv()) &&
!length( grep( "^source\\(", sys.calls()))) {
# source all R in qc package, relative to this folder
script_options <-commandArgs(trailingOnly = FALSE)
script_path <- sub("--file=","",script_options[grep("--file=", script_options)])
script_dir <- dirname(script_path)
script_name <- basename(script_path)
for(R_file in dir(script_dir, pattern = "*.R"))
{
if(! ( R_file == script_name )) source( file.path(script_dir, R_file) )
}
}
massPattern <- function(input, r2_threshold = 0.90, q_threshold = 0.0001, coef_threshold = 0.1,
min_prevalence = 2, output_path = './', outputname = 'results.txt'){
dir.create(file.path(output_path), showWarnings = FALSE)
#print (c(r2_threshold , q_threshold, coef_threshold, min_prevalence))
loaded_data <- load_data(input=input, type='all', name = 'Compound')
data <- loaded_data$data
data <- numeric_dataframe(data)
sample_metadata <- loaded_data$sample_metadata
feature_metadata <- loaded_data$feature_metadata
data<- as.data.frame(t(data))
metadata<- as.data.frame(t(sample_metadata))
metadata <- as.data.frame(metadata["injection order"], drop = F)
colnames(metadata) <- "injection_order"
metadata <- numeric_dataframe(metadata)
#metadata$injection_order <- as.numeric(metadata$injection_order) * 1.0
data <- filter_by_prevelance(data, prev= min_prevalence, axis = 2)
# fit linear model
fit_data <- fit.data(features = data2, metadata = metadata, model = 'SLM')
r2_distribution <- as.data.frame(fit_data$results$r2, drop = F)
r2_distribution$rank <- NA
colnames(r2_distribution)[1]<- 'R2'
r2_distribution<- r2_distribution[order(r2_distribution$R2),]
r2_distribution$rank <- seq.int(nrow(r2_distribution))
colnames(r2_distribution) <- c("y", "x")
pdf(paste(output_path,'/r2_distribution.pdf', sep=''), width = 2.75, height = 2.25, onefile=TRUE)
r2_distribution_plot <- ggplot2::ggplot(data=r2_distribution,
ggplot2::aes(x=as.numeric(as.character(x)), y=as.numeric(as.character(y)))) +
ggplot2::geom_point( color = 'black', alpha = .1, shape = 21, size = 2, stroke = 0.1) +
ggplot2::scale_x_continuous(limits=c(min(r2_distribution['x']), max(r2_distribution['x'])))+
ggplot2::scale_y_continuous(limits=c(min(r2_distribution['y']), max(r2_distribution['y'])))+
ggplot2::guides(alpha='none')+ggplot2::labs("")+
ggplot2::labs(title='Distribution of coefficient of determination', x='rank of R2', y='R2' )+ theme_nature()
stdout <- capture.output(print(r2_distribution_plot),type="message")
if (length(stdout)>0) logging::logdebug(stdout)
dev.off()
print("r2_distribution_plot done!")
# filter for high correlated features with injection order
to_be_removed <- fit_data$results[which(fit_data$results$r2 >= r2_threshold &
fit_data$results$qval <= q_threshold &
abs(fit_data$results$coef) >= coef_threshold) , 'feature']
filtered_data <- data[!rownames(data) %in% to_be_removed,]
removed_data <- data[rownames(data) %in% to_be_removed,]
df <- readxl::read_excel(input, sheet = 1, col_names = FALSE)
df2 <- tibble::add_column(as.data.frame(df), to_be_removed = NA, .after = 1)
df2$to_be_removed <- ifelse(df2[,1] %in% to_be_removed, "Yes", "No")
write.table( df2, paste(output_path,'/',outputname, sep=''),
sep = "\t", eol = "\n", quote = F, col.names = F, row.names = F)
write.table( fit_data$results, paste(output_path,'/stats.txt', sep=''),
sep = "\t", eol = "\n", quote = F, col.names = NA, row.names = T)
# plot one example
pool <- ifelse(startsWith(rownames(data2), "PREF"), 2, 1)#'Refrence pool', 'Sample')
if (length(to_be_removed) > 0){
pdf(paste(output_path,'/features_associated_w_injection_order_r2.pdf', sep=''), width = 2.75, height = 2.25, onefile=TRUE)
for (i in 1:length(to_be_removed)){
input_df<- as.data.frame(cbind(metadata$injection_order, data2[to_be_removed[i]], pool))
x_label<- "Injection order"
y_label <- "Intensity"
title <- to_be_removed[i]
colnames(input_df)<- c('x', 'y', 'pool')
temp_plot <- ggplot2::ggplot(data=input_df,
ggplot2::aes(as.numeric(as.character(x)), as.numeric(as.character(y)))) +
ggplot2::geom_point( fill = pool, color = 'black', alpha = .75, shape = 21, size = 2, stroke = 0.1) +
ggplot2::scale_x_continuous(limits=c(min(input_df['x']), max(input_df['x'])))+
ggplot2::scale_fill_manual(values=c("darkolivegreen4", "orange"))+
ggplot2::scale_y_continuous(limits=c(min(input_df['y']), max(input_df['y'])))+
ggplot2::stat_smooth(method = "glm", color ='blue', na.rm = T)+
ggplot2::guides(alpha='none')+ggplot2::labs("")+
ggplot2::labs(title=title, x=x_label, y=y_label)+ theme_nature()
stdout <- capture.output(print(temp_plot),type="message")
if (length(stdout)>0) logging::logdebug(stdout)
}
dev.off()
print("diagnostics plots done!")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.