#' Visualization for explanatory data analysis - TMT experiment
#'
#' To illustrate the quantitative data and quality control of MS runs,
#' dataProcessPlotsTMT takes the quantitative data from MSstatsTMT converter
#' functions as input
#' and generate two types of figures in pdf files as output :
#' (1) profile plot (specify "ProfilePlot" in option type), to identify the
#' potential sources of variation for each protein;
#' (2) quality control plot (specify "QCPlot" in option type), to evaluate the
#' systematic bias between MS runs.
#'
#' @export
#' @import ggplot2
#' @importFrom graphics axis image legend mtext par plot.new title plot
#' @importFrom grDevices dev.off hcl pdf
#' @importFrom dplyr mutate n
#' @importFrom reshape2 dcast
#' @importFrom MSstatsTMT proteinSummarization
#' @importFrom gridExtra grid.arrange
#' @param data.ptm name of the data with PTM sites in protein name, which can be
#' the output of MSstatsTMT converter functions.
#' @param data.protein name of the data with peptide level, which can be the
#' output of MSstatsTMT converter functions.
#' @param data.ptm.summarization name of the data with ptm sites in
#' protein-level name, which can be the output of
#' the MSstatsTMT \code{\link[MSstatsTMT]{proteinSummarization}} function.
#' @param data.protein.summarization name of the data with protein-level, which
#' can be the output of the
#' MSstatsTMT \code{\link[MSstatsTMT]{proteinSummarization}} function.
#' @param type choice of visualization. "ProfilePlot" represents profile plot of
#' log intensities across MS runs.
#' "QCPlot" represents box plots of log intensities across channels and MS runs.
#' @param ylimUp upper limit for y-axis in the log scale.
#' FALSE(Default) for Profile Plot and QC Plot uses the upper limit as rounded
#' off maximum of log2(intensities) after normalization + 3..
#' @param ylimDown lower limit for y-axis in the log scale. FALSE(Default) for
#' Profile Plot and QC Plot uses 0..
#' @param x.axis.size size of x-axis labeling for "Run" and "channel in Profile
#' Plot and QC Plot.
#' @param y.axis.size size of y-axis labels. Default is 10.
#' @param text.size size of labels represented each condition at the top of
#' Profile plot and QC plot. Default is 4.
#' @param text.angle angle of labels represented each condition at the top of
#' Profile plot and QC plot. Default is 0.
#' @param legend.size size of legend above Profile plot. Default is 7.
#' @param dot.size.profile size of dots in Profile plot. Default is 2.
#' @param ncol.guide number of columns for legends at the top of plot. Default
#' is 5.
#' @param width width of the saved pdf file. Default is 10.
#' @param height height of the saved pdf file. Default is 10.
#' @param which.Protein Protein list to draw plots. List can be names of
#' Proteins or order numbers of Proteins.
#' Default is "all", which generates all plots for each protein. For QC plot,
#' "allonly" will generate one QC plot with all proteins.
#' @param originalPlot TRUE(default) draws original profile plots, without
#' normalization.
#' @param summaryPlot TRUE(default) draws profile plots with protein
#' summarization for each channel and MS run.
#' @param address the name of folder that will store the results. Default folder
#' is the current working directory.
#' The other assigned folder has to be existed under the current working
#' directory.
#' An output pdf file is automatically created with the default name of
#' "ProfilePlot.pdf" or "QCplot.pdf".
#' The command address can help to specify where to store the file as well as
#' how to modify the beginning of the file name.
#' If address=FALSE, plot will be not saved as pdf file but showed in window.
#' @return plot or pdf
#' @examples
#' data(raw.ptm)
#' data(raw.protein)
#' data(quant.msstats.ptm)
#' data(quant.msstats.protein)
#'
#' ## Profile plot
#' dataProcessPlotsTMTPTM(data.ptm=raw.ptm,
#' data.protein=raw.protein,
#' data.ptm.summarization=quant.msstats.ptm,
#' data.protein.summarization=quant.msstats.protein,
#' which.Protein = 1,
#' type='ProfilePlot',
#' address=FALSE)
dataProcessPlotsTMTPTM <- function(data.ptm,
data.protein,
data.ptm.summarization,
data.protein.summarization,
type,
ylimUp = FALSE,
ylimDown = FALSE,
x.axis.size = 10,
y.axis.size = 10,
text.size = 4,
text.angle = 90,
legend.size = 7,
dot.size.profile = 2,
ncol.guide = 5,
width = 10,
height = 12,
which.Protein = "all",
originalPlot = TRUE,
summaryPlot = TRUE,
address = "") {
## save process output in each step
allfiles <- list.files()
filenaming <- "mstatstmtptm"
if (length(grep(filenaming,allfiles)) == 0) {
finalfile <- "mstatstmtptm.log"
processout <- NULL
} else {
num <- 0
finalfile <- "mstatstmtptm.log"
while (is.element(finalfile, allfiles)) {
num <- num + 1
lastfilename <- finalfile ## in order to rea
finalfile <- paste0(paste(filenaming, num, sep="-"), ".log")
}
finalfile <- lastfilename
processout <- as.matrix(read.table(finalfile, header=TRUE, sep="\t"))
}
processout <- rbind(processout,
as.matrix(c(" ", " ", "MSstatsTMTPTM -
dataProcessPlotsTMTPTM function", " "),
ncol=1))
## Checking for input variables
type <- toupper(type)
if (length(setdiff(type, c("PROFILEPLOT", "QCPLOT"))) != 0) {
processout <- rbind(processout,
c(paste0("Input for type=", type,
". However,'type' should be one of ProfilePlot,
QCPlot.")))
write.table(processout, file=finalfile, row.names=FALSE)
stop("Input for type=", type,
". However,'type' should be one of ProfilePlot, QCPlot.")
}
raw.required.columns <- c('ProteinName', 'PeptideSequence', 'Charge', 'PSM',
'Mixture', 'TechRepMixture', 'Run', 'Channel',
'Condition', 'BioReplicate', 'Intensity')
if (!all(raw.required.columns %in% names(data.ptm))) {
stop("Please include in the raw PTM data all the following elements: ",
paste0(sQuote(raw.required.columns), collapse = ", "))
}
if (!all(raw.required.columns %in% names(data.protein))) {
stop("Please include in the raw Protein data all the following elements: ",
paste0(sQuote(raw.required.columns), collapse = ", "))
}
summarized.required.columns <- c('Run', 'Protein', 'Abundance', 'Channel',
'BioReplicate', 'Condition', 'TechRepMixture',
'Mixture')
if (!all(summarized.required.columns %in% names(data.ptm.summarization))) {
stop(
"Please include in the summarized PTM data all the following elements: ",
paste0(sQuote(summarized.required.columns), collapse = ", "))
}
if (!all(summarized.required.columns %in% names(data.protein.summarization))
) {
stop(
"Please include in the summarized
Protein data all the following elements: ",
paste0(sQuote(summarized.required.columns), collapse = ", "))
}
Condition <- Run <- xorder <- Channel <- NULL
PeptideSequence <- PSM <- ProteinName <- NULL
GlobalProtein <- Protein <- NULL
groupAxis <- cumGroupAxis <- abundance <- analysis <- NULL
datafeature.protein <- data.protein
datafeature.ptm <- data.ptm
datarun.protein <- data.protein.summarization
datarun.ptm <- data.ptm.summarization
# conditions in feature data
fea.conds.protein <- as.character(unique(datafeature.protein$Condition))
fea.conds.ptm <- as.character(unique(datafeature.ptm$Condition))
# conditions in protein data
run.conds.protein <- as.character(unique(datarun.protein$Condition))
run.conds.ptm <- as.character(unique(datarun.ptm$Condition))
# only keep the overlapped conditions between feature data and protein data
shared.conds <- Reduce(intersect, list(fea.conds.protein, fea.conds.ptm,
run.conds.protein, run.conds.ptm))
datafeature.protein <- datafeature.protein[datafeature.protein$Condition
%in% shared.conds,]
datafeature.ptm <- datafeature.ptm[datafeature.ptm$Condition %in% shared.conds
,]
datarun.protein <- datarun.protein[datarun.protein$Condition %in% shared.conds
,]
datarun.ptm <- datarun.ptm[datarun.ptm$Condition %in% shared.conds,]
# make sure condition is factor
datafeature.protein$Condition <- factor(datafeature.protein$Condition)
datafeature.ptm$Condition <- factor(datafeature.ptm$Condition)
datarun.protein$Condition <- factor(datarun.protein$Condition)
datarun.ptm$Condition <- factor(datarun.ptm$Condition)
## Remove Site from protein name
regex_protein <- '([^-]+)(?:_[^-]+){1}$'
datafeature.ptm <- datafeature.ptm %>% mutate(GlobalProtein = str_match(
ProteinName, regex_protein)[,2])
colnames(datafeature.protein)[colnames(datafeature.protein) == 'ProteinName'
] <- 'Protein'
colnames(datafeature.ptm)[colnames(datafeature.ptm) == 'ProteinName'
] <- 'Protein'
datafeature.protein$Protein <- factor(datafeature.protein$Protein)
datafeature.ptm$Protein <- factor(datafeature.ptm$Protein)
datafeature.ptm$GlobalProtein <- factor(datafeature.ptm$GlobalProtein)
datarun.protein$Protein <- factor(datarun.protein$Protein)
datarun.ptm$Protein <- factor(datarun.ptm$Protein)
## feature level data : log2 transform
datafeature.protein$abundance <- log2(datafeature.protein$Intensity)
datafeature.ptm$abundance <- log2(datafeature.ptm$Intensity)
datafeature.protein[!is.na(datafeature.protein$Intensity) &
datafeature.protein$Intensity < 1, 'abundance'] <- 0
datarun.ptm[!is.na(datarun.ptm$Intensity) &
datarun.ptm$Intensity < 1, 'abundance'] <- 0
if (length(setdiff(toupper(type), c(toupper("ProfilePlot"), toupper("QCPlot")
))) != 0) {
stop("Input for type=", type,
". However,'type' should be one of \"ProfilePlot\", \"QCPlot\"."
)
}
if (address == FALSE){
## here I used == FALSE, instead of !address. Because address can be logical
## or characters.
if (which.Protein == 'all') {
stop('** Cannnot generate all plots in a screen. Please set one protein
at a time.')
} else if (length(which.Protein) > 1) {
stop('** Cannnot generate multiple plots in a screen. Please set one
protein at a time.')
}
}
## Profile plot ##
## ---------------
if (toupper(type) == "PROFILEPLOT") {
processout <- rbind(processout,
c("ProfilePlot plotting started."))
## choose Proteins or not
if (which.Protein != "all") {
## check which.Protein is name of Protein
if (is.character(which.Protein)) {
temp.name <- which.Protein
## message if name of Protein is wrong.
if (length(setdiff(temp.name,unique(datafeature.ptm$Protein))) > 0) {
stop("Please check protein name. Data set does not
have this protein. - ",
toString(temp.name))
}
}
## check which.Protein is order number of Protein
if (is.numeric(which.Protein)) {
temp.name <- levels(datafeature.ptm$Protein)[which.Protein]
## message if name of Protein is wrong.
if (length(levels(datafeature.ptm$Protein)) < max(which.Protein)) {
stop("Please check your ion of proteins. There are ",
length(levels(datafeature.ptm$Protein))," proteins in this
dataset.")
}
}
## use only assigned proteins
datafeature.ptm <- datafeature.ptm[which(datafeature.ptm$Protein %in%
temp.name), ]
temp_proteins <- as.character((datafeature.ptm %>% distinct(GlobalProtein)
)[[1]])
datafeature.ptm$Protein <- factor(datafeature.ptm$Protein)
datafeature.protein <- datafeature.protein[which(
datafeature.protein$Protein %in% temp_proteins), ]
datafeature.protein$Protein <- factor(datafeature.protein$Protein)
datarun.protein <- datarun.protein[which(datarun.protein$Protein %in%
temp_proteins), ]
datarun.ptm <- datarun.ptm[which(datarun.ptm$Protein %in% temp.name), ]
datarun.protein$Protein <- factor(datarun.protein$Protein)
datarun.ptm$Protein <- factor(datarun.ptm$Protein)
}
## assign upper or lower limit
y.limup <- ceiling(max(datafeature.protein$abundance,
datafeature.ptm$abundance, na.rm = TRUE) + 5)
if (is.numeric(ylimUp)) {
y.limup <- ylimUp
}
y.limdown <- 0
if (is.numeric(ylimDown)) {
y.limdown <- ylimDown
}
datafeature.protein <- datafeature.protein[with(datafeature.protein, order(
Run, Condition, Channel)), ]
datafeature.ptm <- datafeature.ptm[with(datafeature.ptm, order(
Run, Condition, Channel)), ]
datafeature.protein$Run <- factor(datafeature.protein$Run)
datafeature.ptm$Run <- factor(datafeature.ptm$Run)
datarun.protein$Run <- factor(datarun.protein$Run)
datarun.ptm$Run <- factor(datarun.ptm$Run)
## !! important: order of x-axis
## can be reorder by group and then channel, WITHIN Run
## first make new column for x-axis
datafeature.protein$group.channel <- paste(datafeature.protein$Condition,
datafeature.protein$Channel,
sep = "_")
datafeature.ptm$group.channel <- paste(datafeature.ptm$Condition,
datafeature.ptm$Channel, sep = "_")
## not sure better way for coding
## potentially change it.
datafeature.protein$xorder <- NA
datafeature.ptm$xorder <- NA
for (k in seq_along(unique(datafeature.protein$Run))) {
runid <- unique(datafeature.protein$Run)[k]
datafeature.protein[datafeature.protein$Run == runid, ]$xorder <- factor(
datafeature.protein[datafeature.protein$Run == runid, ]$group.channel,
levels <- unique(datafeature.protein[datafeature.protein$Run == runid,
]$group.channel),
labels <- seq(1, length(unique(datafeature.protein[
datafeature.protein$Run == runid, ]$group.channel))))
}
for (k in seq_along(unique(datafeature.ptm$Run))) {
runid <- unique(datafeature.ptm$Run)[k]
datafeature.ptm[datafeature.ptm$Run == runid, ]$xorder <- factor(
datafeature.ptm[datafeature.ptm$Run == runid, ]$group.channel,
levels <- unique(datafeature.ptm[datafeature.ptm$Run == runid,
]$group.channel),
labels <- seq(1, length(unique(datafeature.ptm[
datafeature.ptm$Run == runid, ]$group.channel))))
}
## check
## unique(datafeature[datafeature$Run == '5', c('Channel', 'Condition',
## 'Run', 'xorder','group.channel')])
## need to make data.frame with same variables for condition name
datafeature.protein$xorder <- as.numeric(datafeature.protein$xorder)
datafeature.ptm$xorder <- as.numeric(datafeature.ptm$xorder)
## keep unique information for x-axis labeling. will be used in plotting
tempGroupName.protein <- unique(datafeature.protein[, c("Condition",
"xorder", "Run",
"Channel")])
tempGroupName.ptm <- unique(datafeature.ptm[, c("Condition", "xorder",
"Run", "Channel")])
groupline.protein <- tempGroupName.protein %>% dplyr::group_by(
Condition, Run) %>% dplyr::mutate(groupAxis = n())
groupline.protein <- groupline.protein %>% dplyr::select(-xorder, -Channel)
groupline.protein <- groupline.protein[!duplicated(groupline.protein), ]
groupline.ptm <- tempGroupName.ptm %>% dplyr::group_by(
Condition, Run) %>% dplyr::mutate(groupAxis = n())
groupline.ptm <- groupline.ptm %>% dplyr::select(-xorder, -Channel)
groupline.ptm <- groupline.ptm[!duplicated(groupline.ptm), ]
groupline.protein <- groupline.protein %>% dplyr::group_by(
Run) %>% dplyr::mutate(cumGroupAxis = cumsum(groupAxis))
groupline.ptm <- groupline.ptm %>% dplyr::group_by(
Run) %>% dplyr::mutate(cumGroupAxis = cumsum(groupAxis))
groupline.protein$cumGroupAxis <- groupline.protein$cumGroupAxis + 0.5
groupline.ptm$cumGroupAxis <- groupline.ptm$cumGroupAxis + 0.5
## add coordinate for group id
groupline.protein$xorder <- groupline.protein$cumGroupAxis -
groupline.protein$groupAxis / 2
groupline.protein$abundance <- y.limup - 0.5
groupline.ptm$xorder <- groupline.ptm$cumGroupAxis -
groupline.ptm$groupAxis / 2
groupline.ptm$abundance <- y.limup - 0.5
groupline.all.protein <- groupline.protein
groupline.all.ptm <- groupline.ptm
## remove last condition for vertical line between groups
groupline.protein <- groupline.protein[-which(
groupline.protein$Condition %in% levels(
groupline.protein$Condition)[nlevels(groupline.protein$Condition)]), ]
groupline.ptm <- groupline.ptm[-which(
groupline.ptm$Condition %in% levels(groupline.ptm$Condition)[
nlevels(groupline.ptm$Condition)]), ]
## need to fill in incomplete rows for Runlevel data
haverun <- FALSE
if (sum(is.element(colnames(datarun.protein), "Run")) != 0) {
datamat <- reshape2::dcast(Protein + Channel ~ Run,
data = datarun.protein,
value.var = 'Abundance', keep = TRUE)
datarun.protein <- reshape2::melt(datamat,
id.vars=c('Protein', 'Channel'))
colnames(datarun.protein)[colnames(
datarun.protein) %in% c("variable", "value")] <- c('Run', 'Abundance')
## match x axis order
datarun.protein <- merge(datarun.protein,
tempGroupName.protein, by = c('Run', 'Channel'))
haverun <- TRUE
}
if (sum(is.element(colnames(datarun.ptm), "Run")) != 0) {
datamat <- reshape2::dcast(Protein + Channel ~ Run, data = datarun.ptm,
value.var = 'Abundance', keep = TRUE)
datarun.ptm <- reshape2::melt(datamat, id.vars=c('Protein', 'Channel'))
colnames(datarun.ptm)[colnames(datarun.ptm) %in% c("variable", "value")
] <- c('Run', 'Abundance')
## match x axis order
datarun.ptm <- merge(datarun.ptm, tempGroupName.ptm,
by = c('Run', 'Channel'))
haverun <- TRUE
}
## save the plots as pdf or not
## If there are the file with the same name,
## add next numbering at the end of file name
## y-axis labeling
yaxis.name <- 'Log2-intensities'
## Only plot proteins that occur in both datasets
global_proteins <- (datafeature.protein %>% distinct(Protein))[[1]]
ptm_proteins <- (datafeature.ptm %>% distinct(GlobalProtein))[[1]]
plot_proteins <- intersect(ptm_proteins, global_proteins)
datafeature.ptm <- datafeature.ptm %>% filter(
GlobalProtein %in% plot_proteins)
plot_proteins <- (datafeature.ptm %>% distinct(Protein))[[1]]
if (originalPlot) {
if (address != FALSE) {
allfiles <- list.files()
num <- 0
filenaming <- paste0(address, "ProfilePlot")
finalfile <- paste0(address, "ProfilePlot.pdf")
while (is.element(finalfile, allfiles)) {
num <- num + 1
finalfile <- paste0(paste(filenaming, num, sep = "-"), ".pdf")
}
pdf(finalfile, width = width, height = height)
}
## factoring for run, channel, condition should be done before loop
for (i in seq_len(length(plot_proteins))) {
sub.ptm <- datafeature.ptm[
datafeature.ptm$Protein == as.character(plot_proteins[i]), ]
sub.protein <- datafeature.protein[
datafeature.protein$Protein == as.character(
(sub.ptm %>% distinct(GlobalProtein))[[1]]), ]
sub.protein$PeptideSequence <- factor(as.character(
sub.protein$PeptideSequence))
sub.protein$Charge <- factor(as.character(sub.protein$Charge))
sub.protein$PSM <- factor(as.character(sub.protein$PSM))
sub.ptm$PeptideSequence <- factor(as.character(sub.ptm$PeptideSequence))
sub.ptm$Charge <- factor(as.character(sub.ptm$Charge))
sub.ptm$PSM <- factor(as.character(sub.ptm$PSM))
# if all measurements are NA,
if (nrow(sub.protein) == sum(is.na(sub.protein$abundance))|
nrow(sub.protein) == sum(!is.na(
sub.protein$abundance) & sub.protein$abundance == 0)|
nrow(sub.ptm) == sum(is.na(sub.ptm$abundance))|
nrow(sub.ptm) == sum(
!is.na(sub.ptm$abundance) & sub.ptm$abundance == 0)) {
message(paste0("Can't the Profile plot for ", unique(
sub.protein$Protein), "(", i, " of ", length(plot_proteins),
") because all measurements are NAs or zero."))
next()
}
## seq for peptide and charge
## for seting up color and linetype
b.protein <- unique(sub.protein[, c("PeptideSequence", "PSM")])
## add because if there are missing value, orders are different.
b.protein <- b.protein[with(b.protein, order(PeptideSequence, PSM)), ]
b.ptm <- unique(sub.ptm[, c("PeptideSequence", "PSM")])
## add because if there are missing value, orders are different.
b.ptm <- b.ptm[with(b.ptm, order(PeptideSequence, PSM)), ]
temp1.protein <- xtabs(~PeptideSequence, b.protein)
temp1.ptm <- xtabs(~PeptideSequence, b.ptm)
## unique charge id within peptide sequence, for line type
ss.protein <- NULL
ss.ptm <- NULL
## unique peptide sequence id, for color
s.protein <- NULL
s.ptm <- NULL
for (j in seq_along(temp1.protein)) {
temp3 <- rep(j, temp1.protein[j])
s.protein <- c(s.protein, temp3)
temp2 <- seq(1, temp1.protein[j])
ss.protein <- c(ss.protein, temp2)
}
for (j in seq_along(temp1.ptm)) {
temp3 <- rep(j, temp1.ptm[j])
s.ptm <- c(s.ptm, temp3)
temp2 <- seq(1, temp1.ptm[j])
ss.ptm <- c(ss.ptm, temp2)
}
## for annotation of condition
groupline.tmp.protein <- data.frame(groupline.protein,
"PSM" = unique(sub.protein$PSM)[1],
"PeptideSequence" = unique(
sub.protein$PeptideSequence)[1])
groupline.all.tmp.protein <- data.frame(groupline.all.protein,
"PSM" = unique(
sub.protein$PSM)[1],
"PeptideSequence" = unique(
sub.protein$PeptideSequence
)[1])
groupline.tmp.ptm <- data.frame(groupline.ptm,
"PSM" = unique(sub.ptm$PSM)[1],
"PeptideSequence" = unique(
sub.ptm$PeptideSequence)[1])
groupline.all.tmp.ptm <- data.frame(groupline.all.ptm,
"PSM" = unique(sub.ptm$PSM)[1],
"PeptideSequence" = unique(
sub.ptm$PeptideSequence)[1])
## 2019. 12. 17, MC : for profile plot, define color for dot
cbp <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
"#D55E00", "#CC79A7")
check.length.protein <- length(unique(s.protein)) %/% length(cbp)
if ( check.length.protein > 0 ){
cbp.protein <- rep(cbp, times=check.length.protein + 1)
} else {
cbp.protein <- cbp
}
check.length.ptm <- length(unique(s.ptm)) %/% length(cbp)
if ( check.length.ptm > 0 ){
cbp.ptm <- rep(cbp, times=check.length.ptm + 1)
} else {
cbp.ptm <- cbp
}
##
## 1st plot for Protein plot
protein_temp <- ggplot(aes_string(x = 'xorder', y = 'abundance',
color = 'PSM', linetype = 'PSM'),
data = sub.protein) +
facet_grid(~Run) +
geom_point(size=dot.size.profile) +
geom_line(size = 0.5) +
scale_colour_manual(values=cbp[s.protein]) +
scale_linetype_manual(values = ss.protein) +
scale_shape_manual(values = c(16)) +
labs(title = paste0('Protein - ', unique(sub.protein$Protein)),
x = 'MS runs') +
scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
scale_x_continuous('MS runs') +
geom_vline(data = groupline.tmp.protein,
aes(xintercept = cumGroupAxis),
colour = "grey", linetype = "longdash") +
geom_text(data = groupline.all.tmp.protein,
aes(x = xorder, y = abundance, label = Condition),
size = text.size,
angle = text.angle, hjust = .9,
color = "black") +
theme(
panel.background = element_rect(fill = 'white', colour = "black"),
legend.key = element_rect(fill = 'white', colour = 'white'),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray95'),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = y.axis.size, colour = "black"),
axis.ticks = element_line(colour = "black"),
axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
title = element_text(size = x.axis.size + 8, vjust = 1.5),
legend.position = "top",
legend.text = element_text(size = legend.size)) +
guides(color = guide_legend(title = paste("# peptide:", nlevels(
sub.protein$PeptideSequence)),
title.theme = element_text(size = 13, angle = 0),
keywidth = 0.4,
keyheight = 0.1,
default.unit = 'inch',
ncol = ncol.guide),
linetype = guide_legend(
title = paste("# peptide:",
nlevels(sub.protein$PeptideSequence)),
title.theme = element_text(size = 13, angle = 0),
keywidth = 0.4,
keyheight = 0.1,
default.unit = 'inch',
ncol = ncol.guide))
## 1st plot for PTM plot
ptm_temp <- ggplot(aes_string(x = 'xorder', y = 'abundance',
color = 'PSM', linetype = 'PSM'),
data = sub.ptm) +
facet_grid(~Run) +
geom_point(size=dot.size.profile) +
geom_line(size = 0.5) +
scale_colour_manual(values=cbp[s.ptm]) +
scale_linetype_manual(values = ss.ptm) +
scale_shape_manual(values = c(16)) +
labs(title = paste0('PTM - ', unique(sub.ptm$Protein)),
x = 'MS runs') +
scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
scale_x_continuous('MS runs') +
geom_vline(data = groupline.tmp.ptm,
aes(xintercept = cumGroupAxis),
colour = "grey", linetype = "longdash") +
geom_text(data = groupline.all.tmp.ptm,
aes(x = xorder, y = abundance, label = Condition),
size = text.size,
angle = text.angle, hjust = .9,
color = "black") +
theme(
panel.background = element_rect(fill = 'white', colour = "black"),
legend.key = element_rect(fill = 'white', colour = 'white'),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray95'),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = y.axis.size, colour = "black"),
axis.ticks = element_line(colour = "black"),
axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
title = element_text(size = x.axis.size + 8, vjust = 1.5),
legend.position = "top",
legend.text = element_text(size = legend.size)) +
guides(color = guide_legend(title = paste(
"# peptide:", nlevels(sub.ptm$PeptideSequence)),
title.theme = element_text(size = 13, angle = 0),
keywidth = 0.4,
keyheight = 0.1,
default.unit = 'inch',
ncol = ncol.guide),
linetype = guide_legend(
title = paste("# peptide:", nlevels(sub.ptm$PeptideSequence)),
title.theme = element_text(size = 13, angle = 0),
keywidth = 0.4,
keyheight = 0.1,
default.unit = 'inch',
ncol = ncol.guide))
gridExtra::grid.arrange(ptm_temp, protein_temp, ncol=1)
message(paste("Drew the Profile plot for ", unique(sub.ptm$Protein),
"(", i, " of ", length(plot_proteins), ")"))
}
# end-loop for each protein
if (address != FALSE) {
dev.off()
}
} # end original plot
############################################
## 2st plot for original plot : summary
############################################
if (summaryPlot) {
if (address != FALSE) {
allfiles <- list.files()
num <- 0
filenaming <- paste0(address, "ProfilePlot_wSummarization")
finalfile <- paste0(address, "ProfilePlot_wSummarization.pdf")
while (is.element(finalfile, allfiles)) {
num <- num + 1
finalfile <- paste0(paste(filenaming, num, sep = "-"), ".pdf")
}
pdf(finalfile, width = width, height = height)
}
for (i in seq_len(length(plot_proteins))) {
sub.ptm <- datafeature.ptm[datafeature.ptm$Protein == as.character(
plot_proteins[i]), ]
sub.protein <- datafeature.protein[
datafeature.protein$Protein == as.character(
(sub.ptm %>% distinct(GlobalProtein))[[1]]), ]
sub.protein$PeptideSequence <- factor(as.character(
sub.protein$PeptideSequence))
sub.protein$Charge <- factor(as.character(sub.protein$Charge))
sub.protein$PSM <- factor(as.character(sub.protein$PSM))
sub.ptm$PeptideSequence <- factor(as.character(sub.ptm$PeptideSequence))
sub.ptm$Charge <- factor(as.character(sub.ptm$Charge))
sub.ptm$PSM <- factor(as.character(sub.ptm$PSM))
# if all measurements are NA,
if (nrow(sub.protein) == sum(is.na(sub.protein$abundance))|
nrow(sub.protein) == sum(
!is.na(sub.protein$abundance) & sub.protein$abundance == 0)|
nrow(sub.ptm) == sum(is.na(sub.ptm$abundance))|
nrow(sub.ptm) == sum(
!is.na(sub.ptm$abundance) & sub.ptm$abundance == 0)) {
message(paste0("Can't the Profile plot for ", unique(
sub.protein$Protein),
"(", i, " of ", length(plot_proteins),
") because all measurements are NAs or zero."))
next()
}
## for annotation of condition
groupline.tmp.protein <- data.frame(groupline.protein,
"PSM" = unique(sub.protein$PSM)[1],
"PeptideSequence" = unique(
sub.protein$PeptideSequence)[1])
groupline.all.tmp.protein <- data.frame(groupline.all.protein,
"PSM" = unique(
sub.protein$PSM)[1],
"PeptideSequence" = unique(
sub.protein$PeptideSequence
)[1])
groupline.tmp.ptm <- data.frame(groupline.ptm,
"PSM" = unique(sub.ptm$PSM)[1],
"PeptideSequence" = unique(
sub.ptm$PeptideSequence)[1])
groupline.all.tmp.ptm <- data.frame(groupline.all.ptm,
"PSM" = unique(sub.ptm$PSM)[1],
"PeptideSequence" = unique(
sub.ptm$PeptideSequence)[1])
if (haverun) {
subrun.ptm <- datarun.ptm[
datarun.ptm$Protein == as.character(plot_proteins[i]), ]
subrun.protein <- datarun.protein[
datarun.protein$Protein == as.character(
(sub.ptm %>% distinct(GlobalProtein))[[1]]), ]
if (nrow(subrun.protein) != 0) {
quantrun.ptm <- sub.ptm[1, ]
quantrun.ptm[, 2:ncol(quantrun.ptm)] <- NA
quantrun.ptm <- quantrun.ptm[rep(seq_len(nrow(subrun.ptm))), ]
quantrun.ptm$Protein <- subrun.ptm$Protein
quantrun.ptm$PeptideSequence <- "Run summary"
quantrun.ptm$Charge <- "Run summary"
quantrun.ptm$PSM <- "Run summary"
quantrun.ptm$Channel <- subrun.ptm$Channel
quantrun.ptm$Run <- subrun.ptm$Run
quantrun.ptm$abundance <- subrun.ptm$Abundance
quantrun.ptm$xorder <- subrun.ptm$xorder
} else {
## if there is only one Run measured across all runs
## no Run information for linear with censored
quantrun.ptm <- datafeature.ptm[1, ]
quantrun.ptm[, 2:ncol(quantrun.ptm)] <- NA
quantrun.ptm$Protein <- levels(datafeature.ptm$Protein)[i]
quantrun.ptm$PeptideSequence <- "Run summary"
quantrun.ptm$Charge <- "Run summary"
quantrun.ptm$PSM <- "Run summary"
quantrun.ptm$abundance <- NA
quantrun.ptm$Intensity <- NA
}
if (nrow(subrun.protein) != 0) {
quantrun.protein <- sub.protein[1, ]
quantrun.protein[, 2:ncol(quantrun.protein)] <- NA
quantrun.protein <- quantrun.protein[rep(seq_len(
nrow(subrun.protein))), ]
quantrun.protein$Protein <- subrun.protein$Protein
quantrun.protein$PeptideSequence <- "Run summary"
quantrun.protein$Charge <- "Run summary"
quantrun.protein$PSM <- "Run summary"
quantrun.protein$Channel <- subrun.protein$Channel
quantrun.protein$Run <- subrun.protein$Run
quantrun.protein$abundance <- subrun.protein$Abundance
quantrun.protein$xorder <- subrun.protein$xorder
} else {
## if there is only one Run measured across all runs
## no Run information for linear with censored
quantrun.protein <- datafeature.protein[1, ]
quantrun.protein[, 2:ncol(quantrun.protein)] <- NA
quantrun.protein$Protein <- levels(datafeature.protein$Protein)[i]
quantrun.protein$PeptideSequence <- "Run summary"
quantrun.protein$Charge <- "Run summary"
quantrun.protein$PSM <- "Run summary"
quantrun.protein$abundance <- NA
quantrun.protein$Intensity <- NA
}
quantrun.ptm$analysis <- "Run summary"
quantrun.protein$analysis <- "Run summary"
sub.ptm$analysis <- "Processed feature-level data"
sub.protein$analysis <- "Processed feature-level data"
final.ptm <- rbind(sub.ptm, quantrun.ptm)
final.ptm$analysis <- factor(final.ptm$analysis)
final.ptm$PSM <- factor(final.ptm$PSM)
final.protein <- rbind(sub.protein, quantrun.protein)
final.protein$analysis <- factor(final.protein$analysis)
final.protein$PSM <- factor(final.protein$PSM)
## Draw summarized ptm plot
ptempall.ptm <- ggplot(aes_string(x = 'xorder', y = 'abundance',
color = 'analysis',
linetype = 'PSM', size = 'analysis')
, data = final.ptm) +
facet_grid(~Run) +
geom_point(size = dot.size.profile) +
geom_line(size = 0.5) +
scale_colour_manual(values = c("lightgray", "darkred")) +
scale_shape_manual(values = c(16)) +
scale_size_manual(values = c(1.7, 2), guide = "none") +
scale_linetype_manual(values = c(rep(1, times = length(
unique(final.ptm$PSM))-1), 2), guide = "none") +
labs(title = paste0('PTM - ', unique(sub.ptm$Protein)),
x = 'MS runs') +
scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
geom_vline(data = groupline.tmp.ptm,
aes(xintercept = cumGroupAxis),
colour = "grey", linetype = "longdash") +
geom_text(data = groupline.all.tmp.ptm,
aes(x = xorder, y = abundance, label = Condition),
size = text.size,
angle = text.angle, hjust = .9,
color = "black") +
theme(
panel.background = element_rect(fill = 'white', colour = "black"),
legend.key = element_rect(fill = 'white', colour = 'white'),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray95'),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = y.axis.size, colour = "black"),
axis.ticks = element_line(colour = "black"),
axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
title = element_text(size = x.axis.size + 8, vjust = 1.5),
legend.position = "top",
legend.text = element_text(size = legend.size),
legend.title = element_blank()) +
guides(color = guide_legend(order = 1,
title = NULL,
label.theme = element_text(
size = 10, angle = 0)))
## draw point again because some red summary dots could be hiden
ptempall.ptm <- ptempall.ptm + geom_point(data = final.ptm, aes(
x = xorder, y = abundance, size = analysis, color = analysis))
## Draw summarized protein plot
ptempall.protein <- ggplot(aes_string(x = 'xorder', y = 'abundance',
color = 'analysis',
linetype = 'PSM',
size = 'analysis'),
data = final.protein) +
facet_grid(~Run) +
geom_point(size = dot.size.profile) +
geom_line(size = 0.5) +
scale_colour_manual(values = c("lightgray", "darkred")) +
scale_shape_manual(values = c(16)) +
scale_size_manual(values = c(1.7, 2), guide = "none") +
scale_linetype_manual(values = c(rep(1, times = length(
unique(final.protein$PSM))-1), 2), guide = "none") +
labs(title = paste0('Protein - ', unique(sub.protein$Protein)),
x = 'MS runs') +
scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
geom_vline(data = groupline.tmp.protein,
aes(xintercept = cumGroupAxis),
colour = "grey", linetype = "longdash") +
geom_text(data = groupline.all.tmp.protein,
aes(x = xorder, y = abundance, label = Condition),
size = text.size,
angle = text.angle, hjust = .9,
color = "black") +
theme(
panel.background = element_rect(fill = 'white', colour = "black"),
legend.key = element_rect(fill = 'white', colour = 'white'),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray95'),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = y.axis.size, colour = "black"),
axis.ticks = element_line(colour = "black"),
axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
title = element_text(size = x.axis.size + 8, vjust = 1.5),
legend.position = "top",
legend.text = element_text(size = legend.size),
legend.title = element_blank()) +
guides(color = guide_legend(order = 1,
title = NULL,
label.theme = element_text(size = 10,
angle = 0)))
## draw point again because some red summary dots could be hiden
ptempall.protein <- ptempall.protein + geom_point(
data = final.protein, aes(x = xorder,
y = abundance,
size = analysis, color = analysis))
gridExtra::grid.arrange(ptempall.ptm, ptempall.protein, ncol=1)
message(paste("Drew the Profile plot with summarization for ",
unique(sub.ptm$Protein),
"(", i, " of ", length(
unique(datafeature.ptm$Protein)), ")"))
}
} # end-loop for each protein
if (address!=FALSE) {
dev.off()
}
} # end summarization plot
} # end Profile plot
## QC plot (Quality control plot) ##
## ---------------------------------
if (toupper(type) == "QCPLOT") {
## y-axis labeling
yaxis.name <- 'Log2-intensities'
## save the plots as pdf or not
## If there are the file with the same name
## add next numbering at the end of file name
if (address != FALSE) {
allfiles <- list.files()
num <- 0
filenaming <- paste0(address,"QCPlot")
finalfile <- paste0(address,"QCPlot.pdf")
while (is.element(finalfile, allfiles)) {
num <- num + 1
finalfile <- paste0(paste(filenaming, num, sep = "-"), ".pdf")
}
pdf(finalfile, width = width, height = height)
}
## assign upper or lower limit
y.limup <- ceiling(max(datafeature.protein$abundance,
datafeature.ptm$abundance, na.rm = TRUE) + 3)
if (is.numeric(ylimUp)) {
y.limup <- ylimUp
}
y.limdown <- 0
if (is.numeric(ylimDown)) {
y.limdown <- ylimDown
}
datafeature.protein <- datafeature.protein[with(datafeature.protein,
order(Run, Condition,
Channel)), ]
datafeature.ptm <- datafeature.ptm[
with(datafeature.ptm, order(Run, Condition, Channel)), ]
datafeature.protein$Run <- factor(datafeature.protein$Run)
datafeature.ptm$Run <- factor(datafeature.ptm$Run)
datarun.protein$Run <- factor(datarun.protein$Run)
datarun.ptm$Run <- factor(datarun.ptm$Run)
## !! important: order of x-axis
## can be reorder by group and then channel, WITHIN Run
## first make new column for x-axis
datafeature.protein$group.channel <- paste(
datafeature.protein$Condition, datafeature.protein$Channel, sep = "_")
datafeature.ptm$group.channel <- paste(
datafeature.ptm$Condition, datafeature.ptm$Channel, sep = "_")
## not sure better way for coding
## potentially change it.
datafeature.protein$xorder <- NA
datafeature.ptm$xorder <- NA
for (k in seq_along(unique(datafeature.protein$Run))) {
runid <- unique(datafeature.protein$Run)[k]
datafeature.protein[datafeature.protein$Run == runid,
]$xorder <- factor(datafeature.protein[
datafeature.protein$Run == runid, ]$group.channel,
levels <- unique(
datafeature.protein[
datafeature.protein$Run == runid,
]$group.channel),
labels <- seq(1, length(
unique(datafeature.protein[
datafeature.protein$Run == runid,
]$group.channel))))
}
for (k in seq_along(unique(datafeature.ptm$Run))) {
runid <- unique(datafeature.ptm$Run)[k]
datafeature.ptm[datafeature.ptm$Run == runid,
]$xorder <- factor(datafeature.ptm[
datafeature.ptm$Run == runid, ]$group.channel,
levels <- unique(datafeature.ptm[
datafeature.ptm$Run == runid, ]$group.channel),
labels <- seq(1, length(unique(datafeature.ptm[
datafeature.ptm$Run == runid, ]$group.channel))))
}
## need to make data.frame with same variables for condition name
datafeature.protein$xorder <- as.numeric(datafeature.protein$xorder)
datafeature.ptm$xorder <- as.numeric(datafeature.ptm$xorder)
## keep unique information for x-axis labeling. will be used in plotting
tempGroupName.protein <- unique(datafeature.protein[, c("Condition",
"xorder",
"Run", "Channel")])
tempGroupName.ptm <- unique(datafeature.ptm[, c("Condition",
"xorder",
"Run", "Channel")])
## count # per condition per Run
#groupline <- unique(datafeature[, c('Condition', 'Run')])
#groupline$groupAxis <- as.numeric(xtabs(~Condition+Run, tempGroupName))
groupline.protein <- tempGroupName.protein %>% dplyr::group_by(
Condition, Run) %>% dplyr::mutate(groupAxis = n())
groupline.protein <- groupline.protein %>% dplyr::select(-xorder, -Channel)
groupline.protein <- groupline.protein[!duplicated(groupline.protein), ]
groupline.ptm <- tempGroupName.ptm %>% dplyr::group_by(
Condition, Run) %>% dplyr::mutate(groupAxis = n())
groupline.ptm <- groupline.ptm %>% dplyr::select(-xorder, -Channel)
groupline.ptm <- groupline.ptm[!duplicated(groupline.ptm), ]
## make accumurated # as condition increase
groupline.protein <- groupline.protein %>% dplyr::group_by(
Run) %>% dplyr::mutate(cumGroupAxis = cumsum(groupAxis))
groupline.ptm <- groupline.ptm %>% dplyr::group_by(
Run) %>% dplyr::mutate(cumGroupAxis = cumsum(groupAxis))
groupline.protein$cumGroupAxis <- groupline.protein$cumGroupAxis + 0.5
groupline.ptm$cumGroupAxis <- groupline.ptm$cumGroupAxis + 0.5
## add coordinate for group id
groupline.protein$xorder <- groupline.protein$cumGroupAxis -
groupline.protein$groupAxis / 2
groupline.protein$abundance <- y.limup - 0.5
groupline.ptm$xorder <- groupline.ptm$cumGroupAxis -
groupline.ptm$groupAxis / 2
groupline.ptm$abundance <- y.limup - 0.5
## save all information, for labeling group in plot
groupline.all.protein <- groupline.protein
groupline.all.ptm <- groupline.ptm
## remove last condition for vertical line between groups
groupline.protein <- groupline.protein[-which(
groupline.protein$Condition %in% levels(
groupline.protein$Condition)[nlevels(groupline.protein$Condition)]), ]
groupline.ptm <- groupline.ptm[-which(
groupline.ptm$Condition %in% levels(
groupline.ptm$Condition)[nlevels(groupline.ptm$Condition)]), ]
## all protein
if (which.Protein == 'all' | which.Protein == 'allonly') {
## for annotation of condition
groupline.tmp.protein <- data.frame(groupline.protein,
"PSM" = unique(datafeature.protein$PSM)[1],
"PeptideSequence" = unique(
datafeature.protein$PeptideSequence)[1])
groupline.all.tmp.protein <- data.frame(groupline.all.protein,
"PSM" = unique(
datafeature.protein$PSM)[1],
"PeptideSequence" = unique(
datafeature.protein$PeptideSequence)[1])
## for annotation of condition
groupline.tmp.ptm <- data.frame(groupline.ptm,
"PSM" = unique(
datafeature.ptm$PSM)[1],
"PeptideSequence" = unique(
datafeature.ptm$PeptideSequence)[1])
groupline.all.tmp.ptm <- data.frame(groupline.all.ptm,
"PSM" = unique(
datafeature.ptm$PSM)[1],
"PeptideSequence" = unique(
datafeature.ptm$PeptideSequence
)[1])
## 1st plot for original plot
## for boxplot, x-axis, xorder should be factor
datafeature.protein$xorder <- factor(datafeature.protein$xorder)
datafeature.ptm$xorder <- factor(datafeature.ptm$xorder)
ptemp.ptm <- ggplot(aes_string(x = 'xorder', y = 'abundance'),
data = datafeature.ptm) +
facet_grid(~Run) +
geom_boxplot(aes_string(fill = 'Condition'), outlier.shape = 1,
outlier.size = 1.5) +
labs(title = 'All PTMs',
x = 'MS runs') +
scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
geom_vline(data = groupline.tmp.ptm,
aes(xintercept = cumGroupAxis),
colour = "grey", linetype = "longdash") +
geom_text(data = groupline.all.tmp.ptm,
aes(x = xorder, y = abundance, label = Condition),
size = text.size,
angle = text.angle, hjust = .9,
color = "black") +
theme(
panel.background = element_rect(fill = 'white', colour = "black"),
legend.key = element_rect(fill = 'white', colour = 'white'),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray95'),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = y.axis.size, colour = "black"),
axis.ticks = element_line(colour = "black"),
axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
title = element_text(size = x.axis.size + 8, vjust = 1.5),
legend.position = "none")
ptemp.protein <- ggplot(aes_string(x = 'xorder', y = 'abundance'),
data = datafeature.protein) +
facet_grid(~Run) +
geom_boxplot(aes_string(fill = 'Condition'), outlier.shape = 1,
outlier.size = 1.5) +
labs(title = 'All proteins',
x = 'MS runs') +
scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
geom_vline(data = groupline.tmp.protein,
aes(xintercept = cumGroupAxis),
colour = "grey", linetype = "longdash") +
geom_text(data = groupline.all.tmp.protein,
aes(x = xorder, y = abundance, label = Condition),
size = text.size,
angle = text.angle, hjust = .9,
color = "black") +
theme(
panel.background = element_rect(fill = 'white', colour = "black"),
legend.key = element_rect(fill = 'white', colour = 'white'),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray95'),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = y.axis.size, colour = "black"),
axis.ticks = element_line(colour = "black"),
axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
title = element_text(size = x.axis.size + 8, vjust = 1.5),
legend.position = "none")
gridExtra::grid.arrange(ptemp.ptm, ptemp.protein, ncol=1)
message("Drew the Quality Contol plot(boxplot) for all ptms/proteins.")
}
## each protein
## choose Proteins or not
if (which.Protein != 'allonly') {
if (which.Protein != "all") {
## check which.Protein is name of Protein
if (is.character(which.Protein)) {
temp.name <- which.Protein
## message if name of Protein is wrong.
if (length(setdiff(temp.name,unique(datafeature.ptm$Protein))) > 0) {
stop("Please check protein name.
Data set does not have this protein. - ",
toString(temp.name))
}
}
## check which.Protein is order number of Protein
if (is.numeric(which.Protein)) {
temp.name <- levels(datafeature.ptm$Protein)[which.Protein]
## message if name of Protein is wrong.
if (length(levels(datafeature.ptm$Protein)) < max(which.Protein)) {
stop("Please check your ion of proteins. There are ",
length(levels(datafeature.ptm$Protein)),
" proteins in this dataset.")
}
}
## use only assigned proteins
datafeature.ptm <- datafeature.ptm[which(
datafeature.ptm$Protein %in% temp.name), ]
temp_proteins <- as.character(
(datafeature.ptm %>% distinct(GlobalProtein))[[1]])
datafeature.ptm$Protein <- factor(datafeature.ptm$Protein)
datafeature.protein <- datafeature.protein[
which(datafeature.protein$Protein %in% temp_proteins), ]
datafeature.protein$Protein <- factor(datafeature.protein$Protein)
datarun.protein <- datarun.protein[
which(datarun.protein$Protein %in% temp_proteins), ]
datarun.ptm <- datarun.ptm[which(datarun.ptm$Protein %in% temp.name), ]
datarun.protein$Protein <- factor(datarun.protein$Protein)
datarun.ptm$Protein <- factor(datarun.ptm$Protein)
}
## Only plot proteins that occur in both datasets
global_proteins <- (datafeature.protein %>% distinct(Protein))[[1]]
ptm_proteins <- (datafeature.ptm %>% distinct(GlobalProtein))[[1]]
plot_proteins <- intersect(ptm_proteins, global_proteins)
datafeature.ptm <- datafeature.ptm %>% filter(
GlobalProtein %in% plot_proteins)
plot_proteins <- (datafeature.ptm %>% distinct(Protein))[[1]]
## factoring for run, channel, condition should be done before loop
for (i in seq_len(length(plot_proteins))) {
sub.ptm <- datafeature.ptm[datafeature.ptm$Protein == as.character(
plot_proteins[i]), ]
sub.protein <- datafeature.protein[
datafeature.protein$Protein == as.character(
(sub.ptm %>% distinct(GlobalProtein))[[1]]), ]
sub.ptm <- sub.ptm[!is.na(sub.ptm$abundance), ]
sub.protein <- sub.protein[!is.na(sub.protein$abundance), ]
## if all protein measurements are NA,
if (nrow(sub.ptm) == sub.ptm[!is.na(sub.ptm$abundance), ] |
nrow(sub.protein) == sub.protein[!is.na(sub.protein$abundance), ]) {
message(paste("Can't the Quality Control plot for ", unique(
sub.ptm$Protein),
"(", i, " of ", length(plot_proteins),
") because all measurements are NAs."))
next()
}
## for annotation of condition
groupline.tmp.ptm <- data.frame(groupline.ptm,
"PSM" = unique(sub.ptm$PSM)[1],
"PeptideSequence" = unique(
sub.ptm$PeptideSequence)[1])
groupline.all.tmp.ptm <- data.frame(groupline.all.ptm,
"PSM" = unique(sub.ptm$PSM)[1],
"PeptideSequence" = unique(
sub.ptm$PeptideSequence)[1])
groupline.tmp.protein <- data.frame(groupline.protein,
"PSM" = unique(sub.protein$PSM)[1],
"PeptideSequence" = unique(
sub.protein$PeptideSequence)[1])
groupline.all.tmp.protein <- data.frame(groupline.all.protein,
"PSM" = unique(sub.protein$PSM)[1],
"PeptideSequence" = unique(
sub.protein$PeptideSequence)[1])
## 1st plot for original plot
## for boxplot, x-axis, xorder should be factor
sub.ptm$xorder <- factor(sub.ptm$xorder)
sub.protein$xorder <- factor(sub.protein$xorder)
ptemp.ptm <- ggplot(aes_string(x = 'xorder', y = 'abundance'),
data = sub.ptm) +
facet_grid(~Run) +
geom_boxplot(aes_string(fill = 'Condition'), outlier.shape = 1,
outlier.size = 1.5) +
labs(title = paste0('PTM - ', unique(sub.ptm$Protein)),
x = 'MS runs') +
scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
geom_vline(data = groupline.tmp.ptm,
aes(xintercept = cumGroupAxis),
colour = "grey", linetype = "longdash") +
geom_text(data = groupline.all.tmp.ptm,
aes(x = xorder, y = abundance, label = Condition),
size = text.size,
angle = text.angle, hjust = .9,
color = "black") +
theme(
panel.background = element_rect(fill = 'white', colour = "black"),
legend.key = element_rect(fill = 'white', colour = 'white'),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray95'),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = y.axis.size, colour = "black"),
axis.ticks = element_line(colour = "black"),
axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
title = element_text(size = x.axis.size + 8, vjust = 1.5),
legend.position = "none")
ptemp.protein <- ggplot(aes_string(x = 'xorder', y = 'abundance'),
data = sub.protein) +
facet_grid(~Run) +
geom_boxplot(aes_string(fill = 'Condition'), outlier.shape = 1,
outlier.size = 1.5) + labs(
title = paste0('Protein - ',
unique(sub.protein$Protein)),
x = 'MS runs') +
scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
geom_vline(data = groupline.tmp.protein,
aes(xintercept = cumGroupAxis),
colour = "grey", linetype = "longdash") +
geom_text(data = groupline.all.tmp.protein,
aes(x = xorder, y = abundance, label = Condition),
size = text.size,
angle = text.angle, hjust = .9,
color = "black") +
theme(
panel.background = element_rect(fill = 'white', colour = "black"),
legend.key = element_rect(fill = 'white', colour = 'white'),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray95'),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = y.axis.size, colour = "black"),
axis.ticks = element_line(colour = "black"),
axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
title = element_text(size = x.axis.size + 8, vjust = 1.5),
legend.position = "none")
gridExtra::grid.arrange(ptemp.ptm, ptemp.protein, ncol=1)
message(paste("Drew the Quality Contol plot(boxplot) for ",
unique(sub.ptm$Protein), "(", i, " of ",
length(unique(datafeature.ptm$Protein)), ")"))
} # end-loop
}
if (address != FALSE) {
dev.off()
}
} # end QC plot
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.