Nothing
## ----required packages, echo = FALSE, warning=FALSE, results="hide"-----------
suppressPackageStartupMessages({
library("BiocStyle")
library("DEP")
library("dplyr")
library("tidyr")
library("purrr")
library("ggplot2")
library("SummarizedExperiment")
})
## ----variables----------------------------------------------------------------
# Variables
replicates = 3
bg_proteins = 3000
DE_proteins = 300
log2_mean_bg = 27
log2_sd_bg = 2
log2_mean_DE_control = 25
log2_mean_DE_treatment = 30
log2_sd_DE = 2
## ----simulate_data------------------------------------------------------------
# Loading DEP and packages required for data handling
library("DEP")
library("dplyr")
library("tidyr")
library("purrr")
library("ggplot2")
library("SummarizedExperiment")
# Background data (sampled from null distribution)
sim_null <- data_frame(
name = paste0("bg_", rep(1:bg_proteins, rep((2*replicates), bg_proteins))),
ID = rep(1:bg_proteins, rep((2*replicates), bg_proteins)),
var = rep(c("control_1", "control_2", "control_3",
"treatment_1", "treatment_2", "treatment_3"), bg_proteins),
val = 2^rnorm(2*replicates*bg_proteins, mean = log2_mean_bg, sd = log2_sd_bg))
# Data for DE proteins (sampled from alternative distribution)
sim_diff <- rbind(
data_frame(
name = paste0("DE_", rep(1:DE_proteins, rep(replicates, DE_proteins))),
ID = rep((bg_proteins+1):(bg_proteins+DE_proteins),
rep(replicates, DE_proteins)),
var = rep(c("control_1", "control_2", "control_3"), DE_proteins),
val = 2^rnorm(replicates*DE_proteins,
mean = log2_mean_DE_control, sd = log2_sd_DE)),
data_frame(
name = paste0("DE_", rep(1:DE_proteins, rep(replicates, DE_proteins))),
ID = rep((bg_proteins+1):(bg_proteins+DE_proteins),
rep(replicates, DE_proteins)),
var = rep(c("treatment_1", "treatment_2", "treatment_3"), DE_proteins),
val = 2^rnorm(replicates*DE_proteins,
mean = log2_mean_DE_treatment, sd = log2_sd_DE)))
# Combine null and DE data
sim <- rbind(sim_null, sim_diff) %>%
spread(var, val) %>%
arrange(ID)
# Generate experimental design
experimental_design <- data_frame(
label = colnames(sim)[!colnames(sim) %in% c("name", "ID")],
condition = c(rep("control", replicates),
rep("treatment", replicates)),
replicate = rep(1:replicates, 2))
## ----missing_values_variables-------------------------------------------------
# Variables
MAR_fraction = 0.05
MNAR_proteins = 100
## ----add_missing_values-------------------------------------------------------
# Generate a MAR matrix
MAR_matrix <- matrix(
data = sample(c(TRUE, FALSE),
size = 2*replicates*(bg_proteins+DE_proteins),
replace = TRUE,
prob = c(MAR_fraction, 1-MAR_fraction)),
nrow = bg_proteins+DE_proteins,
ncol = 2*replicates)
# Introduce missing values at random (MAR)
controls <- grep("control", colnames(sim))
treatments <- grep("treatment", colnames(sim))
sim[, c(controls, treatments)][MAR_matrix] <- 0
sim$MAR <- apply(MAR_matrix, 1, any)
# Introduce missing values not at random (MNAR)
DE_protein_IDs <- grep("DE", sim$name)
sim[DE_protein_IDs[1:MNAR_proteins], controls] <- 0
sim$MNAR <- FALSE
sim$MNAR[DE_protein_IDs[1:MNAR_proteins]] <- TRUE
## ----generate_SE--------------------------------------------------------------
# Generate a SummarizedExperiment object
sim_unique_names <- make_unique(sim, "name", "ID", delim = ";")
se <- make_se(sim_unique_names, c(controls, treatments), experimental_design)
## ----plot_data_noFilt, fig.width = 4, fig.height = 4--------------------------
# Plot a barplot of the protein quantification overlap between samples
plot_frequency(se)
## ----filter_proteins----------------------------------------------------------
# No filtering
no_filter <- se
# Filter for proteins that are quantified in all replicates of at least one condition
condition_filter <- filter_proteins(se, "condition", thr = 0)
# Filter for proteins that have no missing values
complete_cases <- filter_proteins(se, "complete")
# Filter for proteins that are quantified in at least 2/3 of the samples.
frac_filtered <- filter_proteins(se, "fraction", min = 0.66)
## ----filter_consequences------------------------------------------------------
# Function to extract number of proteins
number_prots <- function(se) {
names <- rownames(get(se))
data_frame(Dataset = se,
bg_proteins = sum(grepl("bg", names)),
DE_proteins = sum(grepl("DE", names)))
}
# Number of bg and DE proteins still included
objects <- c("no_filter",
"condition_filter",
"complete_cases",
"frac_filtered")
map_df(objects, number_prots)
## ----scale_vst, message = FALSE-----------------------------------------------
# Scale and variance stabilize
no_filter <- normalize_vsn(se)
condition_filter <- normalize_vsn(condition_filter)
complete_cases <- normalize_vsn(complete_cases)
frac_filtered <- normalize_vsn(frac_filtered)
## ----plot_norm, fig.width = 4, fig.height = 5---------------------------------
# Mean versus Sd plot
meanSdPlot(no_filter)
## ----plot_missval, fig.height = 4, fig.width = 3------------------------------
# Plot a heatmap of proteins with missing values
plot_missval(no_filter)
## ----plot_detect, fig.height = 4, fig.width = 4-------------------------------
# Plot intensity distributions and cumulative fraction of proteins
# with and without missing values
plot_detect(no_filter)
## ----imputation_methods, error = TRUE-----------------------------------------
# All possible imputation methods are printed in an error, if an invalid function name is given.
impute(no_filter, fun = "")
## ----impute, results = "hide", message = FALSE, warning = FALSE---------------
# No imputation
no_imputation <- no_filter
# Impute missing data using random draws from a
# Gaussian distribution centered around a minimal value (for MNAR)
MinProb_imputation <- impute(no_filter, fun = "MinProb", q = 0.01)
# Impute missing data using random draws from a
# manually defined left-shifted Gaussian distribution (for MNAR)
manual_imputation <- impute(no_filter, fun = "man", shift = 1.8, scale = 0.3)
# Impute missing data using the k-nearest neighbour approach (for MAR)
knn_imputation <- impute(no_filter, fun = "knn", rowmax = 0.9)
## ----plot_imp, fig.width = 5, fig.height = 7----------------------------------
# Plot intensity distributions before and after imputation
plot_imputation(no_filter, MinProb_imputation,
manual_imputation, knn_imputation)
## ----mixed_impuation, results = "hide", message = FALSE, warning = FALSE------
# Extract protein names with missing values
# in all replicates of at least one condition
proteins_MNAR <- get_df_long(no_filter) %>%
group_by(name, condition) %>%
summarize(NAs = all(is.na(intensity))) %>%
filter(NAs) %>%
pull(name) %>%
unique()
# Get a logical vector
MNAR <- names(no_filter) %in% proteins_MNAR
# Perform a mixed imputation
mixed_imputation <- impute(
no_filter,
fun = "mixed",
randna = !MNAR, # we have to define MAR which is the opposite of MNAR
mar = "knn", # imputation function for MAR
mnar = "zero") # imputation function for MNAR
## ----sample_specific_impuation, results = "hide", message = FALSE, warning = FALSE----
# SummarizedExperiment to MSnSet object conversion
sample_specific_imputation <- no_filter
MSnSet <- as(sample_specific_imputation, "MSnSet")
# Impute differently for two sets of samples
MSnSet_imputed1 <- MSnbase::impute(MSnSet[, 1:3], method = "MinProb")
MSnSet_imputed2 <- MSnbase::impute(MSnSet[, 4:6], method = "knn")
# Combine into the SummarizedExperiment object
assay(sample_specific_imputation) <- cbind(
MSnbase::exprs(MSnSet_imputed1),
MSnbase::exprs(MSnSet_imputed2))
## ----plot_imp2, fig.width = 5, fig.height = 5---------------------------------
# Plot intensity distributions before and after imputation
plot_imputation(no_filter, mixed_imputation, sample_specific_imputation)
## ----DE_analysis, message = FALSE, warning = FALSE----------------------------
# Function that wraps around test_diff, add_rejections and get_results functions
DE_analysis <- function(se) {
se %>%
test_diff(., type = "control", control = "control") %>%
add_rejections(., alpha = 0.1, lfc = 0) %>%
get_results()
}
# DE analysis on no, knn, MinProb and mixed imputation
no_imputation_results <- DE_analysis(no_imputation)
knn_imputation_results <- DE_analysis(knn_imputation)
MinProb_imputation_results <- DE_analysis(MinProb_imputation)
mixed_imputation_results <- DE_analysis(mixed_imputation)
## ----rejections---------------------------------------------------------------
# Function to extract number of DE proteins
DE_prots <- function(results) {
data_frame(Dataset = gsub("_results", "", results),
significant_proteins = get(results) %>%
filter(significant) %>%
nrow())
}
# Number of significant proteins
objects <- c("no_imputation_results",
"knn_imputation_results",
"MinProb_imputation_results",
"mixed_imputation_results")
map_df(objects, DE_prots)
## ----plot_performance---------------------------------------------------------
# Function to obtain ROC data
get_ROC_df <- function(results) {
get(results) %>%
select(name, treatment_vs_control_p.val, significant) %>%
mutate(
DE = grepl("DE", name),
BG = grepl("bg", name)) %>%
arrange(treatment_vs_control_p.val) %>%
mutate(
TPR = cumsum(as.numeric(DE)) / 300,
FPR = cumsum(as.numeric(BG)) / 3000,
method = results)
}
# Get ROC data for no, knn, MinProb and mixed imputation
ROC_df <- map_df(objects, get_ROC_df)
# Plot ROC curves
ggplot(ROC_df, aes(FPR, TPR, col = method)) +
geom_line() +
theme_DEP1() +
ggtitle("ROC-curve")
# Plot ROC curves zoom
ggplot(ROC_df, aes(FPR, TPR, col = method)) +
geom_line() +
theme_DEP1() +
xlim(0, 0.1) +
ggtitle("ROC-curve zoom")
## ----differences_results------------------------------------------------------
# Function to obtain summary data
get_rejected_proteins <- function(results) {
get(results) %>%
filter(significant) %>%
left_join(., select(sim, name, MAR, MNAR), by = "name") %>%
mutate(
DE = grepl("DE", name),
BG = grepl("bg", name),
method = results)
}
# Get summary data for no, knn, MinProb and mixed imputation
objects <- c("no_imputation_results",
"knn_imputation_results",
"MinProb_imputation_results",
"mixed_imputation_results")
summary_df <- map_df(objects, get_rejected_proteins)
# Plot number of DE proteins (True and False)
summary_df %>%
group_by(method) %>%
summarize(TP = sum(DE), FP = sum(BG)) %>%
gather(category, number, -method) %>%
mutate(method = gsub("_results", "", method)) %>%
ggplot(aes(method, number, fill = category)) +
geom_col(position = position_dodge()) +
theme_DEP2() +
labs(title = "True and False Hits",
x = "",
y = "Number of DE proteins",
fill = "False or True")
## ----plot_missval_histogram---------------------------------------------------
# Plot number of DE proteins with missing values
summary_df %>%
group_by(method) %>%
summarize(MNAR = sum(MNAR), MAR = sum(MAR)) %>%
gather(category, number, -method) %>%
mutate(method = gsub("_results", "", method)) %>%
ggplot(aes(method, number, fill = category)) +
geom_col(position = position_dodge()) +
theme_DEP2() +
labs(title = "Category of Missing Values",
x = "",
y = "Number of DE proteins",
fill = "")
## ----session_info, echo = FALSE-----------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.