# artMS PLOT FUNCTIONS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Individual Normalized abundance dot plots for every protein
#'
#' @description Protein abundance dot plots for each unique uniprot id. It can
#' take a long time
#' @param input_file (char) File path and name to the `-normalized.txt` output
#' file from MSstats
#' @param output_file (char) Output file (path) name (add the `.pdf` extension)
#' @param verbose (logical) `TRUE` (default) shows function messages
#' @return (pdf) file with each individual protein abundance plot for each
#' conditions
#' @keywords abundance, dotplots, plot
#' @examples \dontrun{
#' artmsDataPlots(input_file = "results/ab-results-mss-normalized.txt",
#' output_file = "results/ab-results-mss-normalized.pdf")
#' }
#' @export
artmsDataPlots <- function(input_file,
output_file,
verbose = TRUE) {
if(any(missing(input_file) |
missing(output_file)))
stop("Missed (one or many) required argument(s)
Please, check the help of this function to find out more")
data_mss = fread(input_file, integer64 = 'double')
unique_subjects <- unique(data_mss$PROTEIN)
condition_length <- length(unique(data_mss$GROUP))
min_abu <- min(data_mss$ABUNDANCE, na.rm = TRUE)
max_abu <- max(data_mss$ABUNDANCE, na.rm = TRUE)
pdf(output_file, width = condition_length * 1.5, height = 3)
if(verbose) message('>> PRINTING CONDITION PLOTS for every protein ')
for (subject in unique_subjects) {
subject_data <- data_mss[PROTEIN == subject, ]
if(verbose) message(sprintf('%s ', subject))
p <-
ggplot(data = subject_data,
aes(x = SUBJECT, y = ABUNDANCE, colour = FEATURE))
p <- p + geom_point(size = 2, na.rm = TRUE) +
facet_wrap(
facets = ~ GROUP,
drop = TRUE,
scales = 'free_x',
ncol = condition_length
) +
ylim(min_abu, max_abu) +
theme(axis.text.x = element_text(angle = -90, hjust = 1)) +
guides(colour = FALSE) +
xlab(NULL) +
ggtitle(subject)
print(p)
}
if(verbose) message("--- Done! ")
garbarge <- dev.off()
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Heatmap of significant values
#
# @description heatmap plot to represent proteins with significant changes
# @param mss_F (data.frame) with the significant values (log2fc, pvalues)
# @param out_file (char) Name for the output
# @param labelOrder (vector) Vector with the particular order for the IDs
# (default, `NULL` no order)
# @param names (char) Type of ID used. Default is `Protein` (uniprot entry id).
# Soon will be possible to use 'Gene' name ids.
# @param cluster_cols (logical) Select whether to cluster the columns.
# Options: `T` or `F`. Default `T`.
# @param display (char) Value used to genarate the heatmaps. Options:
# - `log2FC` (default)
# - `adj.pvalue`
# - `pvalue`
# @param verbose (logical) `TRUE` (default) shows function messages
# @return A heatmap of significant values
# @keywords significant, heatmap
.artms_plotHeat <- function(mss_F,
out_file,
labelOrder = NULL,
names = 'Protein',
cluster_cols = FALSE,
display = c('log2FC', 'adj.pvalue', 'pvalue'),
verbose = TRUE) {
heat_data <- data.frame(mss_F, names = names)
## create matrix from log2FC or p-value as user defined
display <- match.arg(display)
if (display == 'log2FC') {
# Issues with extreme_val later if we have Inf/-Inf values.
if (sum(is.infinite(heat_data$log2FC)) > 0) {
idx <- is.infinite(heat_data$log2FC)
heat_data$log2FC[idx] <- NA
}
##LEGACY
# heat_data_w = data.table::dcast(names ~ Label, data = heat_data, value.var = 'log2FC')
heat_data_w <- heat_data %>%
tidyr::pivot_wider(id_cols = names,
names_from = Label,
values_from = log2FC,
values_fn = list(log2FC = length))
} else if (display == 'adj.pvalue') {
heat_data$adj.pvalue = -log10(heat_data$adj.pvalue + 10 ^ -16)
##LEGACY
# heat_data_w = data.table::dcast(names ~ Label,
# data = heat_data, value.var = 'adj.pvalue')
heat_data_w <- heat_data %>%
tidyr::pivot_wider(id_cols = names,
names_from = Label,
values_from = adj.pvalue,
values_fn = list(adj.pvalue = length))
} else if (display == 'pvalue') {
heat_data$pvalue = -log10(heat_data$pvalue + 10 ^ -16)
##LEGACY
# heat_data_w = data.table::dcast(names ~ Label, data = heat_data, value.var = 'pvalue')
heat_data_w <- heat_data %>%
tidyr::pivot_wider(id_cols = names,
names_from = Label,
values_from = pvalue,
values_fn = list(pvalue = length))
}
heat_data_w <- as.data.frame(heat_data_w)
## try
#gene_names = uniprot_to_gene_replace(uniprot_ac=heat_data_w$Protein)
rownames(heat_data_w) = heat_data_w$names
heat_data_w = heat_data_w[, -1]
heat_data_w[is.na(heat_data_w)] = 0
max_val = ceiling(max(heat_data_w))
min_val = floor(min(heat_data_w))
extreme_val = max(max_val, abs(min_val))
if (extreme_val %% 2 != 0)
extreme_val = extreme_val + 1
bin_size = 2
signed_bins = (extreme_val / bin_size)
colors_neg = rev(colorRampPalette(brewer.pal("Blues", n = extreme_val /
bin_size))(signed_bins))
colors_pos = colorRampPalette(
brewer.pal("Reds", n = extreme_val / bin_size))(signed_bins)
colors_tot = c(colors_neg, colors_pos)
if (is.null(labelOrder)) {
pheatmap(
heat_data_w,
scale = "none",
cellheight = 10,
cellwidth = 10,
filename = out_file,
color = colors_tot,
breaks = seq(
from = -extreme_val,
to = extreme_val,
by = bin_size
),
cluster_cols = cluster_cols,
fontfamily = "mono"
)
} else{
heat_data_w <- heat_data_w[, labelOrder]
pheatmap(
heat_data_w,
scale = "none",
cellheight = 10,
cellwidth = 10,
filename = out_file,
color = colors_tot,
breaks = seq(
from = -extreme_val,
to = extreme_val,
by = bin_size
),
cluster_cols = cluster_cols,
fontfamily = "mono"
)
if(verbose) message("--- Heatmap is out ")
}
return(heat_data_w)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Outputs a heatmap of the MSStats results created using the log2fold
#' changes
#'
#' @description Heatmap of the Relative Quantifications (MSStats results)
#' @param input_file (char) MSstats `results.txt` file and location (or
#' data.frame of resuts)
#' @param output_file (char) Output file name (pdf format) and location.
#' Default:"quantifications_heatmap.pdf"
#' @param species (char). Specie name to be able to add the Gene name. To find
#' out more about the supported species check `?artmsMapUniprot2Entrez`
#' @param labels (vector) of uniprot ids if only specific labes would like to
#' be plotted. Default: all labels
#' @param cluster_cols (boolean) `True` or `False` to cluster columns.
#' Default: FALSE
#' @param lfc_lower (int) Lower limit for the log2fc. Default: -2
#' @param lfc_upper (int) Upper limit for the log2fc. Default: +2
#' @param whatPvalue (char) `pvalue` or `adj.pvalue` (default)
#' @param FDR (int) Upper limit false discovery rate (or pvalue). Default: 0.05
#' @param display Metric to be displayed. Options:
#' - `log2fc` (default)
#' - `adj.pvalue`
#' - `pvalue`
#' @param verbose (logical) `TRUE` (default) shows function messages
#' @return (pdf or ggplot2 object) heatmap of the MSStats results using the
#' selected metric
#' @keywords heatmap, log2fc
#' @examples
#' # Unfortunately, the example does not contain any significant hits
#' # Use for illustration purposes
#' artmsPlotHeatmapQuant(input_file = artms_data_ph_msstats_results,
#' species = "human",
#' output_file = NULL,
#' whatPvalue = "pvalue",
#' lfc_lower = -1,
#' lfc_upper = 1)
#' @export
artmsPlotHeatmapQuant <- function(input_file,
output_file = "quantifications_heatmap.pdf",
species,
labels = '*',
cluster_cols = FALSE,
display = 'log2FC',
lfc_lower = -2,
lfc_upper = 2,
whatPvalue = "adj.pvalue",
FDR = 0.05,
verbose = TRUE) {
if(any(missing(input_file) |
missing(species)))
stop("Missed (one or many) required argument(s)
Please, check the help of this function to find out more")
input <- .artms_checkIfFile(input_file)
## select data points by LFC & FDR criterium in single condition and
## adding corresponding data points from the other conditions
sign_hits <- .artms_significantHits(
input,
labels = labels,
LFC = c(lfc_lower, lfc_upper),
whatPvalue = whatPvalue,
FDR = FDR
)
sign_hits <- sign_hits[complete.cases(sign_hits$log2FC), ]
sign_hits <- sign_hits[is.finite(sign_hits$log2FC), ]
if (dim(sign_hits)[1] == 0) {
return(message("--- not enough significant hits!"))
}
sign_labels <- unique(sign_hits$Label)
if(verbose) message(
sprintf(
">> TOTAL NUMBER OF SELECTED HITS FOR PLOTS WITH LFC BETWEEN %s AND %s AT %s FDR:%s\n",
lfc_lower,
lfc_upper,
FDR,
nrow(sign_hits) / length(sign_labels)
)
)
suppressMessages(
sign_hits <- artmsAnnotationUniprot(
x = sign_hits,
columnid = "Protein",
species = species
)
)
## REPRESENTING RESULTS AS HEATMAP
## plot heat map for all contrasts
if (any(grepl('Gene', colnames(sign_hits)))) {
heat_labels <- paste(sign_hits$Protein, sign_hits$Gene, sep = '_')
} else{
heat_labels <- sign_hits$Protein
}
heat_labels <- gsub('\\sNA$', '', heat_labels)
# Old PlotHeat function:
heat_data <- data.frame(sign_hits, heat_labels = heat_labels)
## create matrix from log2FC or p-value as user defined
if (display == 'log2FC') {
# Issues with extreme_val later if we have Inf/-Inf values.
if (sum(is.infinite(heat_data$log2FC)) > 0) {
idx <- is.infinite(heat_data$log2FC)
heat_data$log2FC[idx] <- NA
}
##LEGACY
# heat_data_w <- data.table::dcast(heat_labels ~ Label,
# data = heat_data,
# value.var = 'log2FC')
heat_data_w <- heat_data %>%
tidyr::pivot_wider(id_cols = heat_labels,
names_from = Label,
values_from = log2FC)
} else if (display == 'adj.pvalue') {
heat_data$adj.pvalue <- -log10(heat_data$adj.pvalue + 10 ^ -16)
##LEGACY
# heat_data_w <- data.table::dcast(heat_labels ~ Label,
# data = heat_data,
# value.var = 'adj.pvalue')
heat_data_w <- heat_data %>%
tidyr::pivot_wider(id_cols = heat_labels,
names_from = Label,
values_from = adj.pvalue)
} else if (display == 'pvalue') {
heat_data$pvalue <- -log10(heat_data$pvalue + 10 ^ -16)
##LEGACY
# heat_data_w <- data.table::dcast(heat_labels ~ Label,
# data = heat_data,
# value.var = 'pvalue')
heat_data_w <- heat_data %>%
tidyr::pivot_wider(id_cols = heat_labels,
names_from = Label,
values_from = pvalue)
}
#gene_names <- uniprot_to_gene_replace(uniprot_ac=heat_data_w$Protein)
heat_data_w <- as.data.frame(heat_data_w)
rownames(heat_data_w) <- heat_data_w$heat_labels
heat_data_w <- heat_data_w[, -1]
heat_data_w[is.na(heat_data_w)] = 0
max_val <- ceiling(max(heat_data_w))
min_val <- floor(min(heat_data_w))
extreme_val <- max(max_val, abs(min_val))
if (extreme_val %% 2 != 0)
extreme_val = extreme_val + 1
bin_size = 2
signed_bins <- (extreme_val / bin_size)
colors_neg <- rev(colorRampPalette(RColorBrewer::brewer.pal("Blues", n = extreme_val /
bin_size))(signed_bins))
colors_pos <- colorRampPalette(RColorBrewer::brewer.pal("Reds",
n = extreme_val / bin_size))(signed_bins)
colors_tot <- c(colors_neg, colors_pos)
if (!is.null(output_file)) {
pheatmap(
heat_data_w,
scale = "none",
cellheight = 10,
cellwidth = 10,
filename = output_file,
color = colors_tot,
breaks = seq(
from = -extreme_val,
to = extreme_val,
by = bin_size
),
cluster_cols = cluster_cols,
fontfamily = "mono"
)
if(verbose) message("--- Heatmap done ")
} else{
pheatmap(
heat_data_w,
scale = "none",
cellheight = 10,
cellwidth = 10,
color = colors_tot,
breaks = seq(
from = -extreme_val,
to = extreme_val,
by = bin_size
),
cluster_cols = cluster_cols,
fontfamily = "mono"
)
}
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Generate reproducibility plots based on raw intentities
# (from the evidence file)
#
# @description Generate reproducibility plots based on raw intentities
# (log tranformed) from the evidence file
# @param data (data.frame) clean processed evidence
# @param verbose (logical) `TRUE` (default) shows function messages
# @return (pdf) A reproducibility plot based on evidence values
# @keywords internal, plot, qc, quality, control
.artms_plotReproducibilityEvidence <- function(data,
verbose = TRUE) {
Feature = NULL
data <- data[c('Feature',
'Proteins',
'Intensity',
'Condition',
'BioReplicate',
'Run')]
condi <- unique(data$Condition)
# Progress bar
if(verbose) pb <- txtProgressBar(min = 0,
max = length(condi),
style = 3)
for ( i in seq_len(length(condi)) ) {
eCondition <- condi[i]
# Progress bar
if(verbose) setTxtProgressBar(pb, i)
conditionOne <- data[which(data$Condition == eCondition), ]
# FIRST CHECK FOR TECHNICAL REPLICAS
bioreplicasAll <- unique(conditionOne$BioReplicate)
for (eBioreplica in bioreplicasAll) {
# message('\tChecking for technical replicas in ',eBioreplica, " ")
biorepli <- conditionOne[conditionOne$BioReplicate == eBioreplica, ]
here <- unique(biorepli$Run)
if (length(here) > 1) {
#Limit to 2 technical replicas: (length(here) > 1 & length(here) == 2)
# We are expecting no more than 2 technical replicas.
# If there is more... it is worthy to double check
# message('\t\t>>Reproducibility for technical replicas of',eBioreplica,":")
#Need to change the RUN number to letters (TR: TECHNICAL REPLICA)
biorepli$TR <- biorepli$Run
biorepli$TR[biorepli$TR == here[1]] <- 'tr1'
biorepli$TR[biorepli$TR == here[2]] <- 'tr2'
# Let's select unique features per TECHNICAL REPLICAS,
# and sum them up (the same feature might have been many differnt times)
biorepliaggregated <- aggregate(Intensity ~ Feature+Proteins+Condition+BioReplicate+Run+TR,
data = biorepli,
FUN = sum)
biorepliaggregated$Intensity <- log2(biorepliaggregated$Intensity)
##LEGACY
# bdc <- data.table::dcast(data = biorepliaggregated,
# Feature + Proteins ~ TR,
# value.var = 'Intensity')
bdc <- biorepliaggregated %>%
tidyr::pivot_wider(id_cols = c(Feature, Proteins),
names_from = TR,
values_from = Intensity)
bdc <- as.data.frame(bdc)
# Get the number of proteins
np <- dim(bdc)[1]
corr_coef <- round(cor(bdc$tr1, bdc$tr2, use = "pairwise.complete.obs"), digits = 2)
# message("r:\t",corr_coef," ")
p1 <- ggplot(bdc, aes(x = tr1, y = tr2))
p1 <- p1 + geom_point(na.rm = TRUE) + geom_rug() +
geom_density_2d(colour = 'lightgreen')
p1 <- p1 + geom_smooth(colour = "green",
fill = "lightblue",
method = 'lm',
formula = y ~ x,
na.rm = TRUE)
p1 <- p1 + theme_light()
p1 <- p1 + labs(title = paste(
"Reproducibility between Technical Replicas\nBioReplica:\n",
eBioreplica,
"\n(n = ",
np,
", r = ",
corr_coef,
")"
)
)
print(p1)
}
# else if (length(here) == 1) {
# # message("\t\tOnly one technical replica ")
# } else{
# stop("More than 2 technical replicates found. This is very unusual.
# Please, revise it and if correct contact artms developers")
# }
} # Checking the reproducibility between Technical Replicas
# NOW BETWEEN BIOREPLICAS
# Before comparing the different biological replicas,
# aggregate the technical replicas
# First choose the maximum for the technical replicas as before,
# but first check whether there are more than one
if (length(here) > 1) {
conditionOne <- aggregate(Intensity ~ Feature + Proteins + Condition + BioReplicate + Run,
data = conditionOne,
FUN = sum)
b <- aggregate(Intensity ~ Feature + Proteins + Condition + BioReplicate,
data = conditionOne,
FUN = mean)
} else{
b <- aggregate(Intensity ~ Feature + Proteins + Condition + BioReplicate,
data = conditionOne,
FUN = sum)
}
b$Intensity <- log2(b$Intensity)
blist <- unique(b$BioReplicate)
if (length(blist) > 1) {
# We need at least TWO BIOLOGICAL REPLICAS
to <- length(blist) - 1
for (i in seq_len(to)) {
j <- i + 1
for ( k in j:length(blist) ) {
br1 <- blist[i]
br2 <- blist[k]
# message("\tChecking reproducibility between ",br1, "and ",br2 ,"\t")
##LEGACY
# bcfinal <- data.table::dcast(data = b,
# Feature + Proteins ~ BioReplicate,
# value.var = 'Intensity')
bcfinal <- b %>% tidyr::pivot_wider(id_cols = c(Feature, Proteins),
names_from = BioReplicate,
values_from = Intensity)
bcfinal <- as.data.frame(bcfinal)
bcfinal <- bcfinal[complete.cases(bcfinal),]
# Let's check the total number of peptides here...
checkTotalNumber <- subset(bcfinal, select = c(br1, br2))
npt <- dim(checkTotalNumber)[1]
corr_coef <- round(cor(bcfinal[[br1]], bcfinal[[br2]], use = "pairwise.complete.obs"), digits = 2)
# message("r:\t",corr_coef," ")
colnames(bcfinal) <- c("Feature", "Proteins", "br1", "br2")
p2 <- ggplot(bcfinal, aes(x = br1, y = br2 ))
p2 <- p2 + geom_point(na.rm = TRUE) + geom_rug() + geom_density_2d(colour = 'red')
p2 <- p2 + geom_smooth(colour = "red",
fill = "lightgreen",
method = 'lm',
formula = y ~ x,
na.rm = TRUE)
p2 <- p2 + theme_light()
p2 <- p2 + labs(title = paste("Peptide Reproducibility between Bioreplicas\n (condition:",
eCondition,
")",
br1,
"vs",
br2,
"\n(n =",
npt,
" r = ",
corr_coef,
")"))
p2 <- p2 + labs(x = "br1")
p2 <- p2 + labs(y = "br2")
print(p2)
} #end for
} #end for
} else{
message("\tONLY ONE BIOLOGICAL REPLICA AVAILABLE (plots are not possible) ")
}
} # all the conditions
# Close Progress bar
if(verbose) close(pb)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Plot abundance boxplots
#
# @description Plot abundance boxplots
# @param data (data.frame) processed modelqc
# @return Abundacen boxplot
# @keywords internal, plot, abundance
.artms_plotAbundanceBoxplots <- function(df) {
p1 <- ggplot2::ggplot(df,
aes(x = SUBJECT,
y = ABUNDANCE,
fill = ABUNDANCE))
p1 <- p1 + geom_boxplot(aes(fill = SUBJECT),
na.rm = TRUE)
p1 <- p1 + theme_linedraw()
p1 <-
p1 + theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 0.5
),
legend.position = "none"
)
p1 <- p1 + labs(x = "BIOREPLICATES")
p1 <- p1 + ggtitle("Relative Abundance BioReplicates")
print(p1)
p2 <-
ggplot2::ggplot(df, aes(
x = as.factor(GROUP),
y = ABUNDANCE,
fill = ABUNDANCE
))
p2 <- p2 + geom_boxplot(aes(fill = GROUP),
na.rm = TRUE)
p2 <- p2 + theme_linedraw()
p2 <-
p2 + theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 0.5
),
legend.position = "none"
)
p2 <- p2 + labs(x = "CONDITIONS")
p2 <- p2 + ggtitle("Relative Abundance Conditions")
print(p2)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Total Number of unique proteins based on abundance data
#
# @description Total Number of unique proteins per biological replicate and
# conditions
# @param df (data.frame) modelqc
# @return (pdf) Barplots with the number of proteins per br / condition
# @keywords internal, plots, abundance, counts
.artms_plotNumberProteinsAbundance <- function(df) {
x <- df[,c('PROTEIN', 'SUBJECT')]
y <- unique(x)
names(y)[grep('SUBJECT', names(y))] <- 'BioReplicate'
z <-
ggplot2::ggplot(y, aes(x = BioReplicate, fill = BioReplicate))
z <- z + geom_bar(stat = "count", na.rm = TRUE)
z <- z + theme_linedraw()
z <- z + theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5))
z <- z + geom_text(stat = 'count',
aes(label = ..count..),
vjust = -0.5,
size = 2.7)
z <- z + ggtitle("Unique Proteins in BioReplicates")
print(z)
a <- df[,c('PROTEIN', 'GROUP')]
b <- unique(a)
names(b)[grep('GROUP', names(b))] <- 'Condition'
c <- ggplot2::ggplot(b, aes(x = Condition, fill = Condition))
c <- c + geom_bar(stat = "count", na.rm = TRUE)
c <- c + theme_linedraw()
c <- c + theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5))
c <- c + geom_text(stat = 'count',
aes(label = ..count..),
vjust = -0.5,
size = 2.7)
c <- c + ggtitle("Unique Proteins in Conditions")
print(c)
}
#
# @title Plot the total number of quantified proteins in each condition
#
# @description
# @keys internal, plot, counts
# @param x (data.frame) Data frame of imputed log2fc
.artms_plotNumberProteinsImputedLog2fc <- function(x) {
x <- x[c('Protein', 'Comparison')]
y <- unique(x)
z <- ggplot(y, aes(x = Comparison, fill = Comparison))
z <- z + geom_bar(stat = "count",
na.rm = TRUE)
z <- z + theme_linedraw()
z <- z + theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5))
z <- z + geom_text(stat = 'count',
aes(label = ..count..),
vjust = -0.5,
size = 2.7)
z <- z + ggtitle("Unique Proteins in Comparisons")
print(z)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Generate reproducibility plots based on abundance data
# (normalized intensities from MSstats modelqc)
#
# @description Generate reproducibility plots based on abundance data
# (normalized intensities from MSstats modelqc)
# @param data (data.frame) Protein abundance (modelqc)
# @return Reproducibility plots based on abundance data
# (normalized intensities)
# @keywords plot, reproducibility, abundance
.artms_plotReproducibilityAbundance <- function(x,
verbose = verbose) {
condi <- unique(x$GROUP)
# Progress bar
if(verbose) pb <- txtProgressBar(min = 0,
max = length(condi),
style = 3)
for ( i in seq_len(length(condi)) ) {
eCondition <- condi[i]
# Progress bar
if(verbose) setTxtProgressBar(pb, i)
# message(" ################## CONDITION: ",eCondition," ############### ")
# message("TECHNICAL REPLICAS --------------------------- ")
# message("- ", eCondition," ")
conditionOne <- x[which(x$GROUP == eCondition), ]
# FIRST CHECK FOR TECHNICAL REPLICAS
bioreplicasAll <- unique(conditionOne$SUBJECT)
# plot_tr = list()
for (eBioreplica in bioreplicasAll) {
# message('\tChecking for technical replicas in ',eBioreplica, " ")
biorepli <- conditionOne[conditionOne$SUBJECT == eBioreplica, ]
here <- unique(biorepli$RUN)
if (length(here) > 1) {
# Check whether we have more than 2 technical replicas and let
# the user know:
if (length(here) > 2) {
message(
"(-)----- WARNING: More than 2 technical replicas!
make sure that this is right "
)
}
# We are expecting no more than 2 technical replicas.
# If there is more... it is worthy to double check
# message('\t\t>>Plotting Reproducibility between technical replicas ',
# eBioreplica," ")
#Need to change the RUN number to letters (TR: TECHNICAL REPLICA)
biorepli$TR <- biorepli$RUN
biorepli$TR[biorepli$TR == here[1]] <- 'tr1'
biorepli$TR[biorepli$TR == here[2]] <- 'tr2'
##LEGACY
# bdc <- data.table::dcast(data = biorepli, PROTEIN ~ TR,
# value.var = 'ABUNDANCE')
bdc <- biorepli %>%
tidyr::pivot_wider(id_cols = PROTEIN,
names_from = TR,
values_from = ABUNDANCE)
bdc <- as.data.frame(bdc)
bdc <- bdc[complete.cases(bdc), ]
# Get the number of proteins
np <- length(unique(bdc$PROTEIN))
corr_coef <- round(cor(bdc$tr1, bdc$tr2), digits = 2)
p1 <- ggplot2::ggplot(bdc, aes(x = tr1, y = tr2))
p1 <- p1 + geom_point(na.rm = TRUE)
p1 <- p1 + geom_smooth(colour = "green",
fill = "lightblue",
method = 'lm',
formula = y ~ x,
na.rm = TRUE)
p1 <- p1 + theme_light()
p1 <- p1 + labs(
title = paste(
"Reproducibility between Technical Replicas\nBioReplica:\n",
eBioreplica,
"\n(n = ",
np,
", r = ",
corr_coef,
")"
)
)
print(p1)
# plot_tr[[eBioreplica]] <- p1
} else if (length(here) < 1) {
stop("More than 2 technical replicates found. This is very unusual.
Please, revise it and contact artMS developers if right")
}
} # Checking the reproducibility between Technical Replicas
# NOW BETWEEN BIOREPLICAS
# Before comparing the different biological replicas,
# aggregate on the technical replicas
# message(" BIOLOGICAL REPLICAS --------------------------- ")
b <- aggregate(
ABUNDANCE ~ PROTEIN + GROUP + SUBJECT,
data = conditionOne,
FUN = mean
)
# Remove the dash
b$SUBJECT <- gsub("-", "_", b$SUBJECT)
blist <- unique(b$SUBJECT)
if ( length(blist) > 1 ) {
# We need at least TWO BIOLOGICAL REPLICAS
to <- length(blist) - 1
for (i in seq_len(to) ) {
j <- i + 1
for ( k in j:length(blist) ) {
br1 <- blist[i]
br2 <- blist[k]
# message("\tChecking reproducibility between ",br1, "and ",br2 ," ")
##LEGACY
# bc <- data.table::dcast(data = b,
# PROTEIN ~ SUBJECT,
# value.var = 'ABUNDANCE')
bc <- b %>%
tidyr::pivot_wider(id_cols = PROTEIN,
names_from = SUBJECT,
values_from = ABUNDANCE)
bc <- bc[complete.cases(bc), ]
npt <- length(unique(bc$PROTEIN))
corr_coef <- round(cor(bc[[br1]], bc[[br2]]), digits = 2)
p2 <- ggplot2::ggplot(bc, aes_string(x = paste(br1) , y = paste(br2) ))
p2 <- p2 + geom_point(na.rm = TRUE)
p2 <- p2 + geom_smooth(colour = "red",
fill = "lightgreen",
method = 'lm',
formula = y ~ x,
na.rm = TRUE)
p2 <- p2 + theme_light()
p2 <- p2 + labs(
title = paste(
"Reproducibility between Bioreplicas\n(condition:",
eCondition, ") ", br1, "vs", br2, "\n(n =", npt, " r = ", corr_coef, ")"
)
)
p2 <- p2 + labs(x = br1)
p2 <- p2 + labs(y = br2)
print(p2)
}
}
if(verbose) message(" ")
} else{
if(verbose) message("\tONLY ONE BIOLOGICAL REPLICA AVAILABLE (plots are not possible) ")
}
} # all the conditions
if(verbose) close(pb)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Plot correlation between conditions
#
# @description Plot correlation between conditions
# @param x (data.frame) of Protein Abundance (MSstats modelqc)
# @param numberBiologicalReplicas (int) Number of biological replicates
# @return (ggplot.object) A correlation plot between conditions
# @keywords internal, plot, correlation
.artms_plotCorrelationConditions <- function(x, numberBiologicalReplicas) {
# Before jumping to merging biological replicas:
# Technical replicas: aggregate on the technical replicas
b <- aggregate(ABUNDANCE ~ PROTEIN + GROUP + SUBJECT,
data = x,
FUN = mean)
# Aggregate now the CONDITIONS on the biological replicas:
# One way to do this would be to be very stringent,
# requiring to find data in all biological replicas:
# allBiologicalReplicas <- function(x){
# ifelse(sum(!is.na(x)) == numberBiologicalReplicas,
# mean(x, na.rm = TRUE), NA)}
# datadc <- data.table::dcast(data=b,
# PROTEIN~GROUP, value.var = 'ABUNDANCE',
# fun.aggregate = allBiologicalReplicas, fill = 0)
# Or a most relaxed way:
##LEGACY
# datadc <- data.table::dcast(data = b,
# PROTEIN ~ GROUP,
# value.var = 'ABUNDANCE',
# fun.aggregate = mean)
datadc <- b %>%
tidyr::pivot_wider(id_cols = PROTEIN,
names_from = GROUP,
values_from = ABUNDANCE,
values_fn = list(ABUNDANCE = mean))
# before <- dim(datadc)[1]
# l <- dim(datadc)[2]
# datadc <- datadc[apply(datadc[c(2:l)],1,function(z) !any(z==0)),]
# evenafter <- dim(datadc)[1]
# datadc <- datadc[complete.cases(datadc),]
# after <- dim(datadc)[1]
# message("Total proteins before: ",
# before, " Removing the 0s: ",evenafter,
# " Total proteins (only complete cases): ", after, " ")
blist <- unique(b$GROUP)
if (length(blist) > 1) {
# We need at least TWO CONDITIONS
to <- length(blist) - 1
for (i in seq_len(to) ) {
j <- i + 1
for (k in j:length(blist)) {
br1 <- blist[i]
br2 <- blist[k]
# message("\t",br1,"-",br2,":")
npt <- length(unique(datadc$PROTEIN))
corr_coef <- round(cor(datadc[[br1]],
datadc[[br2]],
use = "complete.obs"), digits = 2)
# cat ("r:",corr_coef,"\n")
p2 <- ggplot2::ggplot(datadc, aes_string(x = paste(br1), y = paste(br2)))
p2 <- p2 + geom_point(na.rm = TRUE)
p2 <- p2 + geom_smooth(colour = "blue",
fill = "lightblue",
method = 'lm',
formula = y ~ x,
na.rm = TRUE)
p2 <- p2 + theme_light()
p2 <- p2 + labs(title = paste("CORRELATION between CONDITIONS:\n",
br1,
"and",
br2,
"\n(n = ",
npt,
" r = ",
corr_coef,
")"))
p2 <- p2 + labs(x = br1)
p2 <- p2 + labs(y = br2)
print(p2)
} # FOR loop
} # For loop
# message(" ")
} else{
message("\tONLY ONE BIOLOGICAL REPLICA AVAILABLE (plots are not possible) ")
}
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Plot correlation between quantifications different quantified
# comparisons
#
# @description Plot correlation between all quantifications, i.e., different
# quantified comparisons
# @param datai (data.frame) Processed MSstats results
# @param verbose (logical) `TRUE` (default) shows function messages
# @return (ggplot.object) Plot correlation between quantifications and r values
# @keywords internal, plot, correlation, log2fc
.artms_plotRatioLog2fc <- function(datai,
verbose = TRUE) {
##LEGACY
# datadc <- data.table::dcast(data = datai, Protein ~ Label, value.var = 'log2FC')
datadc <- datai %>%
tidyr::pivot_wider(id_cols = Protein,
names_from = Label,
values_from = log2FC)
datadc <- as.data.frame(datadc)
before <- dim(datadc)[1]
l <- dim(datadc)[2]
datadc <- do.call(data.frame, lapply(datadc, function(x) replace(x, is.infinite(x), NA)))
datadc <- datadc[complete.cases(datadc), ]
after <- dim(datadc)[1]
if(verbose) message("---Total unique identifiers before: ",
before,
"\n---Total unique identifiers (only complete cases): ",
after)
blist <- unique(datai$Label)
blist <- gsub("-", ".", blist)
if (length(blist) > 1) {
# We need at least TWO CONDITIONS
to <- length(blist) - 1
# Progress bar
if(verbose) pb <- txtProgressBar(min = 0,
max = length(blist) - 1,
style = 3)
for (i in seq_len(to)) {
if(verbose) setTxtProgressBar(pb, i)
j <- i + 1
for (k in j:length(blist)) {
br1 <- blist[i]
br2 <- blist[k]
# message("\tChecking relation between conditions ",br1, "and ",br2 ,":")
npt <- length(unique(datadc$Protein))
corr_coef <- round(cor(datadc[[br1]], datadc[[br2]]), digits = 2)
# cat ("r: ",corr_coef," ")
p3 <- ggplot2::ggplot(datadc, aes_string(x = paste(br1), y = paste(br2)))
p3 <- p3 + geom_point(na.rm = TRUE) + geom_rug() + geom_density_2d()
p3 <- p3 + geom_smooth(colour = "red",
fill = "lightblue",
method = 'lm',
formula = y ~ x,
na.rm = TRUE)
p3 <- p3 + theme_light()
p3 <- p3 + labs(title = paste0("log2fc(",
br1,
") vs log2fc(",
br2,
") \n(n = ",
npt,
" r = ",
corr_coef,
")"))
p3 <- p3 + labs(x = br1)
p3 <- p3 + labs(y = br2)
print(p3)
} # FOR loop
} # For loop
if(verbose) close(pb)
} else{
message("\tONLY ONE BIOLOGICAL REPLICA AVAILABLE (plots are not possible) ")
}
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Pretty Labels for Heatmaps
#
# @description Generates pretty labels for the heatmaps.
# @param uniprot_acs (char) Uniprot accession id
# @param uniprot_ids (char) Uniprot entry id
# @param gene_names (car) Gene symbol
# @return Pretty labels for a heatmap
# @keywords internal, plots, pretty
.artms_prettyPrintHeatmapLabels <- function(uniprot_acs, uniprot_ids, gene_names) {
result = paste(uniprot_acs, uniprot_ids, gene_names, sep = ' ')
return(result)
}
# @title Correlation heatmaps of all the individual features
# @description Correlation heatmap using intensity values across all the
# conditions
# @param data_w (data.frame) resulting from the `.artms_castMaxQToWidePTM`
# function
# @param keys (data.frame) of the keys
# @param config (yaml.object) Configuration object (yaml loaded)
# @return (pdf) A correlation heatmap (suffix `-heatmap.pdf`)
# @keywords internal, heatmap, intensity, comparisons
.artms_sampleCorrelationHeatmap <- function (data_w, keys, config) {
# data_w <- data.table::as.data.table(data_w2)
# mat = log2(data_w[, 4:ncol(data_w), with = TRUE])
# mat[is.na(mat)] = 0
# mat_cor = cor(mat, method = 'pearson', use = 'everything')
data_w <- log2(data_w[,4:ncol(data_w)])
data_w[is.na(data_w)] <- 0
mat_cor <- cor(data_w, method = 'pearson', use = 'everything')
## we want to make informarive row names so order by
## RawFile because that's how data_w is ordered
keys <- as.data.frame(keys)
ordered_keys <- keys[with(keys, order(RawFile)), ]
mat_names <- paste(ordered_keys$Condition,
ordered_keys$BioReplicate,
ordered_keys$Run)
colnames(mat_cor) <- mat_names
rownames(mat_cor) <- mat_names
colors_pos <- colorRampPalette(RColorBrewer::brewer.pal("Blues", n = 5))(10)
colors_neg <- rev(colorRampPalette(RColorBrewer::brewer.pal("Reds", n = 5))(10))
colors_tot <- c(colors_neg, colors_pos)
pheatmap(
mat = mat_cor,
cellwidth = 10,
cellheight = 10,
scale = 'none',
filename = gsub('.txt', '-heatmap.pdf', config$files$output),
color = colors_tot,
breaks = seq(from = -1, to = 1, by = .1),
fontfamily = "mono"
)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Barplot of peptide counts per biological replicate
#
# @description Total number of unique peptide identified per biological
# replicate
# @param data_f (char) Evidence file (same structure as the original)
# @param config (yaml.object) Configuration object
# @return (pdf) Barplot of peptide counts
# @keywords barplot, counts, peptides
.artms_samplePeptideBarplot <- function(data_f, config) {
# set up data into ggplot compatible format
data_f <- data.table(data_f,
labels = paste(data_f$RawFile,
data_f$Condition,
data_f$BioReplicate))
data_f <- data_f[with(data_f, order(labels, decreasing = TRUE)), ]
# plot the peptide counts for all the samples TOGETHER
p <- ggplot(data = data_f, aes(x = labels))
p <- p + geom_bar(na.rm = TRUE) + theme(axis.text.x = element_text(angle = 90,
hjust = 1,
family = 'mono'
)) + ggtitle('Unique peptides per run\n after filtering') + coord_flip()
ggsave(filename = gsub('.txt', '-peptidecounts.pdf', config$files$output),
plot = p,
width = 8,
height = 10)
w <- 10
h <- ceiling((7 / 5 + 2) * ceiling(length(unique(data_f$Condition)) / 5))
# plot the peptide counts for all the samples PER BAIT
p <- ggplot(data = data_f, aes(x = as.factor(BioReplicate)))
p <- p + geom_bar(na.rm = TRUE) + theme(axis.text.x = element_text(angle = 90,
hjust = 1,
family = 'mono' )) +
ggtitle('Unique peptides per run\n after filtering') +
facet_wrap( ~ Condition, scales = 'free', ncol = 5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(filename = gsub('.txt', '-peptidecounts-perBait.pdf', config$files$output),
plot = p,
width = w,
height = h)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Select significant hits
#
# @description Filtered data.frame with significant values (log2fc > 2 |
# log2fc < -2; adj.pvalue < 0.05) from the MSstats results
# @param mss_results (data.frame) of MSstats results
# @param labels (vector) of selected labels. Default: all (`*`)
# @param LFC (vector, int) with the negative and positive threshold. Default:
# c(-2, 2)
# @param whatPvalue (char) `pvalue` or `adj.pvalue` (default)?
# @param FDR (int) false discovery rate (adj.pvalue) threshold. Default: 0.05
# @return (data.frame) only with significant hits
# @keywords internal, significant, selections
.artms_significantHits <- function(mss_results,
labels = '*',
LFC = c(-2, 2),
whatPvalue = 'adj.pvalue',
FDR = 0.05) {
selected_results <- mss_results[grep(labels, mss_results$Label),]
if (whatPvalue == "adj.pvalue") {
significant_proteins <-
selected_results[(
!is.na(selected_results$log2FC) &
selected_results$adj.pvalue <= FDR &
(
selected_results$log2FC >= LFC[2] |
selected_results$log2FC <= LFC[1]
)
), 'Protein']
} else if (whatPvalue == "pvalue") {
significant_proteins <-
selected_results[(
!is.na(selected_results$log2FC) &
selected_results$pvalue <= FDR &
(
selected_results$log2FC >= LFC[2] |
selected_results$log2FC <= LFC[1]
)
), 'Protein']
} else{
stop("The whatPvalue argument is wrong. Valid options: pvalue or adj.pvalue")
}
significant_results <-
selected_results[selected_results$Protein %in% significant_proteins,]
return(significant_results)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Summary stats for plots
#
# @description Gives count, mean, standard deviation, standard error of the mean,
# and confidence interval (default 95%).
# Borrowed from Cookbook for R
# (http://www.cookbook-r.com/Manipulating_data/Summarizing_data/)
#
# @param data: a data frame.
# @param measurevar: the name of a column that contains the variable to be summariezed
# @param groupvars: a vector containing names of columns that contain grouping variables
# @param na.rm: a boolean that indicates whether to ignore NA's
# @param conf.interval: the percent range of the confidence interval (default is 95%)
# @return data.frame with summary statistics
# @keywords internal, plot, stats
.artms_summarySE <- function(data=NULL,
measurevar,
groupvars=NULL,
na.rm=FALSE,
conf.interval=.95,
.drop=TRUE) {
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- plyr::ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = mean (xx[[col]], na.rm=na.rm),
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @title Generate PCA plots based on abundance data
#
# @description Generate PCA plots based on abundance data
# @param x Data.frame output from `artms_loadModelQCstrict`
# @param filename Prefix to generate output names (WITH NO EXTENSION)
# @param allConditions Conditions selected to generate the plots
# @return PCA plots based on abundance data (pdf format)
# @keywords internal, plot, pca
.artms_getPCAplots <- function(x, filename, allConditions) {
# PRINCIPAL COMPONENT ANALYSIS
# Using the following packages:
# FactoMineR, factoextra, corrplot, PerformanceAnalytics
# Remove NA
x <- as.data.frame(x)
nogenename2 <- x[duplicated(x$Gene), ]
if (dim(nogenename2)[1] > 0) {
# message("\t---Removing proteins without a gene name: ")
x <- x[complete.cases(x), ]
}
x <- x[!duplicated(x[, c("Gene")]),]
rownames(x) <- x$Gene
df <- x[, allConditions]
# Correlation matrix
df.cor.matrix <- round(cor(df, use = "complete.obs"), 2)
# Correlation plots:
out.correlation <- paste0(filename, "-correlations.pdf")
pdf(out.correlation)
corrplot::corrplot(
df.cor.matrix,
type = "upper",
tl.col = "black",
tl.srt = 45,
diag = FALSE,
addCoef.col = TRUE
)
PerformanceAnalytics::chart.Correlation(df,
histogram = TRUE,
pch = 19,
main = "Correlation between Conditions")
garbage <- dev.off()
# PCA 1----
res.pca <- FactoMineR::PCA(df,
scale.unit = TRUE,
ncp = 4,
graph = FALSE)
eigenvalues <- res.pca$eig
out.pca01 <- paste0(filename, "-pca01.pdf")
pdf(out.pca01)
# par(mfrow = c(1, 1))
here1 <- plot(res.pca, choix = "var", new.plot = FALSE)
print(here1)
# This is equivalent to
# PCA(df, scale.unit = TRUE, ncp = 4, graph = TRUE)
garbage <- dev.off()
# PCA 2----
out.pca02 <- paste0(filename, "-pca02.pdf")
pdf(out.pca02)
barplot(
eigenvalues[, 2],
names.arg = seq_len(nrow(eigenvalues)),
main = "Variances",
xlab = "Principal Components",
ylab = "Percentage of variances",
col = "steelblue"
)
lines(
x = seq_len(nrow(eigenvalues)),
eigenvalues[, 2],
type = "b",
pch = 19,
col = "red"
)
garbage <- dev.off()
# PCA 3----
h <- factoextra::fviz_pca_var(res.pca,
col.var = "contrib") + theme_minimal()
i <- factoextra::fviz_pca_biplot(res.pca,
labelsize = 3,
pointsize = 0.8) + theme_minimal()
j <- factoextra::fviz_contrib(res.pca, choice = "var", axes = 1)
l <- factoextra::fviz_contrib(res.pca, choice = "var", axes = 2)
out.pca03 <- paste0(filename, "-pca03.pdf")
pdf(out.pca03)
print(h)
print(i)
print(j)
print(l)
garbage <- dev.off()
# PCA 4
df <- df[complete.cases(df),]
prot.pca <- prcomp(t(df))
prot.pcax <- as.data.frame(prot.pca$x)
prot.pcax$Condition <- row.names(prot.pcax)
p.pca.group <- ggplot(prot.pcax,
aes(x = PC1,
y = PC2,
color = Condition)) + #, color=condition, shape=condition
geom_point(alpha = .8, size = 3) +
theme_light() +
labs(title = "PCA")
out.pca04 <- paste0(filename, "-pca04.pdf")
pdf(out.pca04, width = 9, height = 7)
plot(p.pca.group)
garbage <- dev.off()
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Volcano plot (log2fc / pvalues)
#'
#' @description It generates a scatter-plot used to quickly identify changes
#' @param mss_results (data.frame or file) Selected MSstats results
#' @param lfc_upper (numeric) log2fc upper threshold (positive value)
#' @param lfc_lower (numeric) log2fc lower threshold (negative value)
#' @param whatPvalue (char) `pvalue` or `adj.pvalue` (default)
#' @param FDR (numeric) False Discovery Rate threshold
#' @param output_name (char) Name for the output file (don't forget the `.pdf`
#' extension)
#' @param PDF (logical) Option to generate pdf format. Default: `T`
#' @param decimal_threshold (numeric) Decimal threshold for the pvalue.
#' Default: 16 (10^-16)
#' @param verbose (logical) `TRUE` (default) shows function messages
#' @keywords plot, volcano
#' @return (pdf) of a volcano plot
#' @examples
#' artmsVolcanoPlot(mss_results = artms_data_ph_msstats_results,
#' whatPvalue = "pvalue",
#' PDF = FALSE)
#' @export
artmsVolcanoPlot <- function(mss_results,
output_name = "volcano_plot.pdf",
lfc_upper = 1,
lfc_lower = -1,
whatPvalue = "adj.pvalue",
FDR = 0.05,
PDF = TRUE,
decimal_threshold = 16,
verbose = TRUE) {
if(any(missing(mss_results)))
stop("One (or many) of the required arguments missed.
Please, check the help for this function to find out more")
if (PDF) {
if (!grepl("\\.pdf", output_name)) {
stop("File extension '.pdf' is missed for < output_name >")
}
}
mss_results <- .artms_checkIfFile(mss_results)
# handle cases where log2FC is Inf. There are no pvalues or other
# information for these cases :(
# Issues with extreme_val later if we have Inf/-Inf values.
if (sum(is.infinite(mss_results$log2FC)) > 0) {
idx <- is.infinite(mss_results$log2FC)
mss_results$log2FC[idx] <- NA
}
min_x <- -ceiling(max(abs(mss_results$log2FC), na.rm = TRUE))
max_x <- ceiling(max(abs(mss_results$log2FC), na.rm = TRUE))
# Deal with special cases in the data where we have pvalues = Inf,NA,0
if (whatPvalue == "adj.pvalue") {
if (sum(is.na(mss_results$adj.pvalue)) > 0)
mss_results <- mss_results[!is.na(mss_results$adj.pvalue), ]
if (nrow(mss_results[mss_results$adj.pvalue == 0 |
mss_results$adj.pvalue == -Inf, ]) > 0)
mss_results[!is.na(mss_results$adj.pvalue) &
(mss_results$adj.pvalue == 0 |
mss_results$adj.pvalue == -Inf), ]$adj.pvalue = 10 ^ -decimal_threshold
max_y = ceiling(-log10(min(mss_results[mss_results$adj.pvalue > 0, ]$adj.pvalue, na.rm = TRUE))) + 1
} else if (whatPvalue == "pvalue") {
if (sum(is.na(mss_results$pvalue)) > 0)
mss_results <- mss_results[!is.na(mss_results$pvalue), ]
if (nrow(mss_results[mss_results$pvalue == 0 |
mss_results$pvalue == -Inf, ]) > 0)
mss_results[!is.na(mss_results$pvalue) &
(mss_results$pvalue == 0 |
mss_results$pvalue == -Inf), ]$pvalue = 10 ^ -decimal_threshold
max_y = ceiling(-log10(min(mss_results[mss_results$pvalue > 0, ]$pvalue, na.rm = TRUE))) + 1
} else{
stop("The whatPvalue argument is wrong. Valid options: < pvalue > or < adj.pvalue >")
}
l <- length(unique(mss_results$Label))
w_base <- 7
h_base <- 7
if (l <= 2) {
w <- w_base * l
} else{
w <- w_base * 2
}
h <- h_base * ceiling(l / 2)
if (whatPvalue == "adj.pvalue") {
p <- ggplot(mss_results, aes(x = log2FC, y = -log10(adj.pvalue)))
p <- p + geom_point(colour = 'grey', na.rm = TRUE) +
geom_point(
data = mss_results[mss_results$adj.pvalue <= FDR &
mss_results$log2FC >= lfc_upper, ],
aes(x = log2FC, y = -log10(adj.pvalue)),
colour = 'red',
size = 1,
na.rm = TRUE
) +
geom_point(
data = mss_results[mss_results$adj.pvalue <= FDR &
mss_results$log2FC <= lfc_lower, ],
aes(x = log2FC, y = -log10(adj.pvalue)),
colour = 'blue',
size = 1,
na.rm = TRUE
) +
geom_vline(xintercept = c(lfc_lower, lfc_upper),
lty = 'dashed') +
geom_hline(yintercept = -log10(FDR), lty = 'dashed') +
xlim(min_x, max_x) +
ylim(0, max_y) +
facet_wrap(facets = ~ Label,
ncol = 2,
scales = 'fixed')
} else{
p <- ggplot(mss_results, aes(x = log2FC, y = -log10(pvalue)))
p <- p + geom_point(colour = 'grey', na.rm = TRUE) +
geom_point(
data = mss_results[mss_results$pvalue <= FDR &
mss_results$log2FC >= lfc_upper, ],
aes(x = log2FC, y = -log10(pvalue)),
colour = 'red',
size = 1,
na.rm = TRUE
) +
geom_point(
data = mss_results[mss_results$pvalue <= FDR &
mss_results$log2FC <= lfc_lower, ],
aes(x = log2FC, y = -log10(pvalue)),
colour = 'blue',
size = 1,
na.rm = TRUE
) +
geom_vline(xintercept = c(lfc_lower, lfc_upper),
lty = 'dashed') +
geom_hline(yintercept = -log10(FDR), lty = 'dashed') +
xlim(min_x, max_x) +
ylim(0, max_y) +
facet_wrap(facets = ~ Label,
ncol = 2,
scales = 'fixed')
}
if (PDF) {
pdf(output_name, width = w, height = h)
print(p)
garbage <- dev.off()
if(verbose) message(output_name, " is ready")
} else{
print(p)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.