#' @import ggplot2
#' @import graphics
#' @import viridis
#' @import methods
#' @import stats
#' @import utils
#' @import parallel
# In this package we deliberately use aes_(x=~x, y=~y) format in ggplot. This
# gets rid of unwanted NOTEs "no visible binding for global variables" in R CMD
# check.
simple_theme <- ggplot2::theme_bw() +
ggplot2::theme(
panel.border = ggplot2::element_blank(),
panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
axis.line = ggplot2::element_line(colour = "black")
)
simple_theme_grid <- ggplot2::theme_bw() +
ggplot2::theme(
panel.border = ggplot2::element_blank(),
panel.grid.major = ggplot2::element_line(colour = "grey90"),
panel.grid.minor = ggplot2::element_line(colour = "grey95"),
axis.line = ggplot2::element_line(colour = "black")
)
cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
#' Evidence columns
#'
#' \code{evidenceColumns} contains default columns to be read from the evidence
#' file. The names of list elements are used internally to reference evidence
#' data.
#'
#' @examples
#' str(evidenceColumns)
#'
#' @export
evidenceColumns <- list(
sequence = 'Sequence',
modified_sequence = 'Modified sequence',
modifications = 'Modifications',
protein_group = 'Proteins',
protein = 'Leading razor protein',
experiment = 'Experiment',
charge = 'Charge',
reverse = 'Reverse',
contaminant = 'Potential contaminant'
)
#' Measure columns
#'
#' \code{measureColumns} contains measurement columns from evidence file. In
#' case of label-free data, there is only one column: Intensity.
#'
#' @examples
#' str(measureColumns)
#'
#' @export
measureColumns <- list(
intensity = 'Intensity'
)
#' Protein columns
#'
#' To be used as the default \code{data.cols} parameter in
#' \code{\link{readProteinGroups}}.
#'
#' @examples
#' str(proteinColumns)
#'
#' @export
proteinColumns <- list(
protein = "Majority protein IDs",
reverse = 'Reverse',
contaminant = 'Potential contaminant'
)
#' \code{proteusData} constructor
#'
#' @param tab A matrix with data, rows are peptides/proteins, columns are
#' samples
#' @param metadata A data frame with metadata, needs at least two columns:
#' "sample" and "condition"
#' @param content Either "peptides" or "proteins"
#' @param pep2prot A data frame with two columns: "sequence" and "protein". Can
#' use "other" for non-proteus data.
#' @param peptides A character vector with peptide sequences
#' @param proteins A character vector with protein names
#' @param measures A character vector with names of intensity and/or ratio
#' columns
#' @param type Type of experiment: "label-free" or "SILAC"
#' @param npep A data frame with number of peptides per protein
#' @param sequence.col Name of the sequence used, either "sequence" or "modified_sequence"
#' @param protein.col Name of the protein column used, either "protein" or "protein_group"
#' @param peptide.aggregate.fun Function used to aggregate peptides
#' @param peptide.aggregate.parameters Additional parameters passed to peptide
#' aggregate function
#' @param protein.aggregate.fun Function used to aggregate proteins
#' @param protein.aggregate.parameters Additional parameters passed to protein
#' aggregate function
#' @param min.peptides Integer, minimum number of peptides to combine into a
#' protein
#' @param norm.fun Normalizing function
#'
#' @return A \code{proteusData} object.
proteusData <- function(tab, metadata, content, pep2prot, peptides, proteins, measures,
npep=NULL, type="label-free", sequence.col="sequence", protein.col="protein",
peptide.aggregate.fun=NULL, peptide.aggregate.parameters=NULL,
protein.aggregate.fun=NULL, protein.aggregate.parameters=NULL,
min.peptides=NULL, norm.fun=identity) {
stopifnot(
ncol(tab) == nrow(metadata),
is(tab, "matrix"),
content %in% c("peptide", "protein", "other"),
type %in% c("label-free", "SILAC", "TMT"),
sequence.col %in% c("sequence", "modified_sequence"),
protein.col %in% c("protein", "protein_group")
)
# make sure to keep correct order of samples
metadata$sample <- factor(metadata$sample, levels=metadata$sample)
colnames(tab) <- metadata$sample
if(content == "peptide") {
stopifnot(
nrow(tab) == length(peptides),
!is.null(peptide.aggregate.fun)
)
rownames(tab) <- peptides
} else if(content == "protein") {
stopifnot(
nrow(tab) == length(proteins),
is.numeric(min.peptides),
!is.null(protein.aggregate.fun)
)
rownames(tab) <- proteins
}
if(!is.null(npep)) {
stopifnot(nrow(npep) == nrow(tab))
}
# number of replicates in each condition
cnd <- as.character(metadata$condition)
cnd.fac <- factor(cnd)
nrep <- tapply(cnd, cnd.fac, length) # nice trick
metadata$condition <- cnd.fac
pdat <- list(
tab = tab,
metadata = metadata,
measures = measures,
content = content,
conditions = levels(cnd.fac),
nrep = nrep,
type = type,
sequence.col = sequence.col,
protein.col = protein.col,
peptide.aggregate.fun = peptide.aggregate.fun,
peptide.aggregate.parameters = peptide.aggregate.parameters,
protein.aggregate.fun = protein.aggregate.fun,
protein.aggregate.parameters = protein.aggregate.parameters,
min.peptides = min.peptides,
norm.fun = deparse(substitute(norm.fun)),
pep2prot = pep2prot,
peptides = peptides,
proteins = proteins,
npep = npep
)
class(pdat) <- append(class(pdat), "proteusData")
pdat$stats <- intensityStats(pdat)
pdat$detect <- goodData(pdat)
return(pdat)
}
#' Summary of \code{proteusData} object
#'
#' @param object \code{proteusData} object.
#' @param ... additional arguments affecting the summary produced.
#'
#' @return Text output with object summary
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' summary(prodat)
#'
#' @export
summary.proteusData <- function(object, ...) {
cat("\n*** Basic statistics ***\n\n")
cat(paste0(" content = ", object$content, "\n"))
cat(paste0(" experiment type = ", object$type, "\n"))
cat(paste0(" number of samples = ", nrow(object$metadata), "\n"))
cat(paste0(" number of conditions = ", length(object$conditions), "\n"))
cat(paste0(" number of ", object$content, "s = ", nrow(object$tab), "\n"))
cat(paste0(" samples = ", paste0(object$metadata$sample, collapse = ", "), "\n"))
cat(paste0(" conditions = ", paste0(object$conditions, collapse = ", "), "\n"))
cat("\n*** Data processing ***\n\n")
cat(paste0(" evidence columns used = ", paste0(object$measures, collapse = ", "), "\n"))
cat(paste0(" sequence = '", ifelse(object$sequence.col == 'sequence', "Sequence", "Modified sequence"), "'\n"))
cat(paste0(" protein = '", ifelse(object$protein.col == 'protein', "Leading razor protein", "Proteins"), "'\n"))
cat(paste0(" normalization = ", object$norm.fun, "\n"))
if(object$content == "protein") {
cat("\n*** Protein data ***\n\n")
cat(paste0(" quantification method = ", object$aggregate.fun, "\n"))
if(!is.null(object$aggregate.parameters)) {
cat(paste0(" aggregate parameters : ", object$aggregate.parameters, "\n"))
}
cat(paste0(" minimum number of peptides per protein = ", object$min.peptides, "\n"))
}
}
#' Read column names from a tab-delimited text file
#'
#' @param file File name.
#'
#' @return A vector with column names.
#' @export
#'
#' @examples
#' evidenceFile <- system.file("extdata", "evidence.txt.gz", package="proteusLabelFree")
#' evidence.columns <- readColumnNames(evidenceFile)
#' evidence.columns
readColumnNames <- function(file) {
cols <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, nrows=1)
names(cols)
}
#' Read MaxQuant's evidence file
#'
#' \code{readEvidenceFile} reads MaxQuant's evidence file. Contaminants and
#' reverse sequences are filtered out.
#'
#' @details
#'
#' There are two parameters controlling which columns are read from the evidence
#' file. Parameter \code{measure.cols} selects columns with measurements: these
#' are intensities (label-free, TMT) or ratios (SILAC). In the simplest case of
#' label-free data, there is only one measure column: "Intensity". Parameter
#' \code{data.columns} selects all other columns read from the evidence file.
#' There are two default lists, supplied with the package, appropriate for an
#' label-free experiment, \code{measureColumns} and \code{evidenceColumns}.
#'
#'
#' @param file File name.
#' @param measure.cols Named list with measure columns to read.
#' @param data.cols Named list with other columns to read (in addition to
#' measure columns).
#' @param zeroes.are.missing Logical. If TRUE zeroes are interpreted as missing
#' data and replaced with NAs.
#' @return Data frame with selected columns from the evidence file.
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' evidenceFile <- system.file("extdata", "evidence.txt.gz", package="proteusLabelFree")
#' evi <- readEvidenceFile(evidenceFile)
#' }
#'
#' @export
readEvidenceFile <- function(file, measure.cols=measureColumns, data.cols=evidenceColumns, zeroes.are.missing=TRUE) {
columns <- c(data.cols, measure.cols)
if(anyDuplicated(names(columns))) stop("Column names must be unique.")
# check if all required columns are in the evidence file
evi.cols <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, as.is=TRUE, strip.white=TRUE, nrows = 1)
missing <- NULL
for(col in columns) {
if(!(col %in% colnames(evi.cols))) missing <- c(missing, paste0("'", col, "'"))
}
if(!is.null(missing))
stop(paste0("Column(s) ", paste0(missing, collapse=", "), " not found in file ", file))
# read and process evidence file
evi <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, as.is=TRUE, strip.white=TRUE)
evi <- evi[, as.character(columns)]
names(evi) <- names(columns)
# sometimes there are only NAs and the condition doesn't work
evi$reverse[is.na(evi$reverse)] <- ''
evi$contaminant[is.na(evi$contaminant)] <- ''
evi <- evi[which(evi$contaminant != '+' & evi$reverse != '+'),]
# replace NaNs and infinites with NAs in measure columns
# the same with zeroes if flag is on
for(col in names(measure.cols)) {
x <- evi[, col]
x[is.nan(x) | is.infinite(x)] <- NA
if(zeroes.are.missing) x[x == 0] <- NA
evi[, col] <- x
}
# remove rows that have only NAs in measure columns
not.empty <- which(rowSums(!is.na(evi[,names(measure.cols), drop=FALSE])) > 0)
evi <- evi[not.empty,]
}
#' Read protein groups file from MaxQuant
#'
#'
#' \code{readProteinGroups} reads a MaxQuant's protein groups file (usually
#' named \code{proteinGroups.txt}), extracts intensity data and creates a
#' \code{proteusData} object.
#'
#' @details
#'
#' This function allows to read MaxQuant's proteinGroups file directly, without
#' reading and processing the evidence file. Apart from the proteinGroups file
#' name, a metadata data frame needs to be provided.
#'
#' @param file Input file
#' @param meta Metadata data frame. As a minimum, it should contain two columns:
#' \code{sample} with unique sample names and \code{condition} with condition
#' names.
#' @param measure.cols A named list with measure columns to read. These are
#' columns in the proteinGroup files containing intensity measurements per
#' sample. Usually they are called \code{Intensity <sample>}. If not provided,
#' column names will be created from metadata using a template
#' \code{paste("Intensity", meta$sample)}.
#' @param data.cols A named list with columns to read in addition to measure
#' columns. It should contain three columns named \code{protein}, which will
#' be used as protein ID and \code{reverse} and \code{contaminant}, which will
#' be used to filter data. The values of this named list are the actual column
#' names in the file (including spaces). The default structure
#' \code{proteinColumns} is provided with the package, though it might need
#' modifying in case these columns are named differently in the protein groups
#' file.
#' @return A \code{proteusData} object with protein intensity data.
#'
#' @examples
#' library(proteusLabelFree)
#' proteinGroupsFile <- system.file("extdata", "proteinGroups.txt.gz", package="proteusLabelFree")
#' prot.MQ <- readProteinGroups(proteinGroupsFile, meta)
#'
#' @export
readProteinGroups <- function(file, meta, measure.cols=NULL, data.cols=proteinColumns) {
# attempt creating default measure columns "Intensity sample"
if(is.null(measure.cols)) {
measure.cols <- paste("Intensity", meta$sample)
names(measure.cols) <- meta$sample
}
dat <- readEvidenceFile(file, measure.cols=measure.cols, data.cols=data.cols)
tab <- as.matrix(dat[names(measure.cols)])
tab[tab==0] <- NA
tab[is.nan(tab)] <- NA
colnames(tab) <- meta$sample
rownames(tab) <- dat$protein
# create pdat object
pdat <- proteusData(tab, meta, "protein", NULL, NULL, rownames(tab), measure.cols,
min.peptides=0, protein.aggregate.fun="MaxQuant")
return(pdat)
}
#' Helper function, not exported
#'
#' @param ... Any parameters
#' @return A string with list of parameters and their values
parameterString <- function(...) {
x <- sapply(list(...), deparse)
if(length(x) > 0) {
ap <- paste0(names(x), "=", x, collapse=";")
} else {
ap <- NULL
}
ap
}
#' Create peptide table from evidence data
#'
#' \code{makePeptideTable} computes a peptide table and related data. Peptide
#' table is a matrix with columns corresponding to conditions and rows
#' corresponding to peptide sequences.
#'
#' @details
#'
#' The evidence file contains a column called "Experiment" and one or more
#' columns with measure values. In case of label-free experiment there is only
#' one measure column: "Intensity". In case of TMT experiment there are several
#' measure columns, usually called "Reporter intensity 0", "Reporter intensity
#' 1", and so on. \code{makePeptideTable} will combine "Experiment" and measure
#' columns in a way defined by the metadata (parameter \code{meta}). The name of
#' the combined column will be named using "sample" column in metadata.
#'
#' The result is a \code{proteusData} object containing a table with rows
#' corresponding to peptides and columns corresponding to samples (as defined in
#' metadata). Each cell of the table is an aggregated measure values over all
#' evidence entries corresponding to the given sequence and experiment. How
#' these measure values are aggregated is controlled by the parameter
#' \code{aggregate.fun}.
#'
#' There are two aggregation functions provided in this package:
#' \code{\link{aggregateMedian}} and \code{\link{aggregateSum}}. Depending on
#' the needs, the user can provide any arbitrary function to perform
#' aggregation. We recommend using \code{aggregateSum} for label-free and TMT
#' experiments and \code{aggregateMedian} for SILAC experiments.
#'
#' The aggregate function should be in form: \code{function(wp, ...)}. \code{wp}
#' is a matrix containing measurements from for a given peptide. Rows are
#' evidence file entries, columns are samples. \code{...} are additional
#' parameters for the function that are passed from \code{makePeptideTable}. The
#' aggregate function should return a vector of values for each sample. That is
#' the length of the vector should be the same as the number of columns in
#' \code{wp}. For example, the default aggregate function \code{aggregateSum}
#' calculates sums in each column of \code{wp}.
#'
#' This function, apart from aggregating evidence data into peptides, builds a
#' peptide-to-protein reference and stores is in the returned object ($pep2prot
#' field). It can be used at peptide level, to identify which protein peptides
#' belong to. It is then passed to \code{\link{makeProteinTable}} function and
#' used to aggregate peptides to proteins. Here we can decide how peptides are
#' aggregated to proteins. This is controlled by the \code{protein.col}
#' parameter, which indicates which evidence column should be used to build
#' peptide-to-protein relation. If "protein" is used (default) the
#' peptite-to-protein relation is build on leading razor proteins. If
#' "protein_group" is used, then peptide-to-protein relation is based on protein
#' groups.
#'
#' Only samples from metadata are used, regardless of the content of the
#' evidence data. This makes selection of samples for downstream processing
#' easy: select only required rows in the metadata data frame.
#'
#' @param evi Evidence table created with \code{\link{readEvidenceFile}}.
#' @param meta Data frame with metadata. As a minimum, it should contain
#' "sample" and "condition" columns.
#' @param sequence.col A column name to identify peptides. Can be either
#' "sequence" or "modified_sequence".
#' @param protein.col A column name to use for peptide-to-protein reference. Can
#' be either "protein" or "protein_group".
#' @param measure.cols A named list of measure columns; should be the same as
#' used in \code{\link{readEvidenceFile}}
#' @param aggregate.fun A function to aggregate pepetides with the same
#' sequence/sample.
#' @param ... Additional parameters passed to the aggregate function
#' @param experiment.type Type of the experiment, "label-free", "TMT" or
#' "SILAC".
#' @param ncores Number of cores for parallel processing
#' @return A \code{proteusData} object, containing peptide intensities and
#' metadata.
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' pepdat <- makePeptideTable(evi, meta, ncores=2)
#' }
#'
#' @export
makePeptideTable <- function(evi, meta, sequence.col=c("sequence", "modified_sequence"),
protein.col=c("protein", "protein_group"), measure.cols=measureColumns,
aggregate.fun=aggregateSum, ..., experiment.type=c("label-free", "TMT", "SILAC"),
ncores = 4) {
sequence.col <- match.arg(sequence.col)
protein.col <- match.arg(protein.col)
experiment.type <- match.arg(experiment.type)
# mclapply doesn't work on Windows, so force 1 core
if(Sys.info()[['sysname']] == "Windows") {
ncores <- 1
warning("Multicore processing not available in Windows. Using ncores=1")
}
# check if measure.cols, sequence.col and protein.col are in the evidence file
measures <- names(measure.cols)
for(col in c(measures, sequence.col, protein.col)) {
if(!(col %in% names(evi))) stop(paste0("Column '", col, "' not found in evidence data."))
}
# dummy experiment if experiment column not present
if(!("experiment" %in% names(evi)) && !("experiment" %in% names(meta))) {
evi$experiment = "dummy"
meta$experiment = "dummy"
}
# test metadata integrity
for(col in c("experiment", "measure", "sample", "condition")) {
if(!(col %in% names(meta))) stop(paste0("Column '", col, "' not found in metadata."))
}
if(anyDuplicated(paste0(meta$experiment, meta$measure)) > 0) stop("Experiment/measure combination in metadata must be unique.")
if(anyDuplicated(meta$sample) > 0) stop("Sample in metadata must be unique.")
experiments <- unique(evi$experiment)
for(ex in meta$experiment) {
if(!(ex %in% experiments)) stop(paste0("Experiment '", ex, "' from metadata not found in evidence."))
}
for(me in meta$measure) {
if(!(me %in% measure.cols)) stop(paste0("Measure '", me, "' from metadata not found in measure.cols."))
}
# make sure to keep correct order of samples
meta$sample <- factor(meta$sample, levels=meta$sample)
# melt and recast evidence data
#
# NOTE: due to a potential conflict between reshape and reshape2 we do not use
# "variable.name" parameter and default to the column name "variable". This is
# changed manually to "measure", because we want it this way. See
# https://github.com/hadley/reshape/issues/63 for more details.
eviMelted <- reshape2::melt(evi, id.vars = c(sequence.col, "experiment"), measure.vars=measures)
names(eviMelted)[1] <- "sequence" # for simplicity, call it sequence
names(eviMelted)[3] <- "measure" # see note above
eviMelted$value <- as.numeric(eviMelted$value) # integers do not work well in cast + median
eviMelted$measure <- measure.cols[eviMelted$measure] # recover original measurement columns
# Add sample column to melted evidence
# we tried using merge:
# eviMelted <- merge(eviMelted, meta[, c("experiment", "measure", "sample")], by=c("experiment", "measure"))
# but it turns out it is quite slow; translation of concatenated column is significantly faster
m2s <- setNames(meta$sample, paste0(meta$experiment, ".", meta$measure)) # named vector for translation
eviMelted$expmes <- paste0(eviMelted$experiment, ".", eviMelted$measure)
eviMelted <- eviMelted[which(eviMelted$expmes %in% names(m2s)),] # select only experiment.measure present in metadata
eviMelted$sample <- m2s[eviMelted$expmes]
eviMelted <- eviMelted[, c("sequence", "sample", "value")]
# create unique sequence names
eviMelted$seqsam <- paste0(eviMelted$sequence, ".", eviMelted$sample)
eviMelted <- eviMelted[order(eviMelted$seqsam),]
s2s <- setNames(eviMelted$sequence, eviMelted$seqsam)
r <- rle(eviMelted$seqsam)
eviMelted$uniseq <- paste0(rep(s2s[r$values], times=r$lengths), ".", unlist(lapply(r$lengths, seq_len)))
# cast into table: sample vs unique sequence
# in this table there are multiple entries per peptide
tab <- reshape2::dcast(eviMelted, uniseq ~ sample, sum, value.var="value")
# original sequence
u2s <- setNames(eviMelted$sequence, eviMelted$uniseq)
sequences <- u2s[tab$uniseq]
# extract numeric data as matrix
tab <- as.matrix(tab[,2:ncol(tab)])
tab[tab == 0] <- NA
# aggregate peptides
ptab <- parallel::mclapply(unique(sequences), function(s) {
wp <- tab[sequences == s,, drop=FALSE]
x <- aggregate.fun(wp, ...)
row <- as.data.frame(t(as.vector(x)))
rownames(row) <- s
return(row)
}, mc.cores=ncores)
ptab <- as.matrix(do.call(rbind, ptab))
colnames(ptab) <- colnames(tab)
peptides <- row.names(ptab)
# peptide to protein conversion
pep2prot <- data.frame(sequence=evi[[sequence.col]], protein=evi[[protein.col]])
pep2prot <- unique(pep2prot)
if(anyDuplicated(pep2prot$sequence) > 0) stop("Non-unique peptide-to-protein association found. Proteus requires that peptide sequence is uniquely associated with a protein or protein group.")
rownames(pep2prot) <- pep2prot$sequence
pep2prot <- pep2prot[peptides,]
proteins <- levels(as.factor(pep2prot$protein))
# create pdat object
pdat <- proteusData(ptab, meta, "peptide", pep2prot, peptides, proteins,
as.character(measure.cols), type = experiment.type,
sequence.col = sequence.col, protein.col = protein.col,
peptide.aggregate.fun = deparse(substitute(aggregate.fun)),
peptide.aggregate.parameters = parameterString(...)
)
return(pdat)
}
#' Make protein table
#'
#' \code{makeProteinTable} creates a protein table from the peptide table.
#' Protein intensities or ratios are aggregated from peptide data. An external
#' function is used to aggregate each protein.
#'
#' @details
#'
#' There are three protein aggregation functions provided in this package:
#' \code{\link{aggregateHifly}}, \code{\link{aggregateMedian}} and
#' \code{\link{aggregateSum}}. Depending on the needs, the user can provide any
#' arbitrary function to perform aggregation.
#'
#' The aggregate function should be in form: \code{function(wp, ...)}. \code{wp}
#' is a matrix containing measurements from all peptides for the particular
#' protein. Rows are peptides, columns are samples. \code{...} are additional
#' parameters for the function that are passed from \code{makeProteinTable}. The
#' aggregate function should return a vector of values for each sample. That is
#' the length of the vector should be the same as the number of columns in
#' \code{wp}.
#'
#' @param pepdat A \code{proteusData} object containing peptide data, normally
#' created by \code{\link{makePeptideTable}}
#' @param aggregate.fun Function to aggregate peptides into a protein. See
#' "Details" for more details.
#' @param ... Additional parameters to pass to \code{aggregate.fun}
#' @param min.peptides Minimum number of peptides per protein
#' @param ncores Number of cores for parallel processing
#'
#' @return A \code{proteusData} object containing protein intensities and
#' metadata.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat <- makeProteinTable(pepdat.clean, ncores=2)
#'
#' @export
makeProteinTable <- function(pepdat, aggregate.fun=aggregateHifly, ...,
min.peptides=2, ncores=4) {
if(!is(pepdat, "proteusData")) stop ("Input data must be of class proteusData.")
# mclapply doesn't work on Windows, so force 1 core
if(Sys.info()[['sysname']] == "Windows") {
ncores <- 1
warning("Multicore processing not available in Windows. Using ncores=1")
}
meta <- pepdat$metadata
tab <- pepdat$tab
# make sure to keep correct order of samples
# this should be OK in peptide table, but just in case
meta$sample <- factor(meta$sample, levels=meta$sample)
protlist <- list()
for(cond in pepdat$conditions) {
w <- tab[,which(meta$condition == cond), drop=FALSE]
samples <- colnames(w)
protcond <- parallel::mclapply(pepdat$proteins, function(prot) {
sel <- which(pepdat$pep2prot$protein == prot)
npep <- length(sel)
if(npep >= min.peptides)
{
wp <- w[sel,, drop=FALSE]
x <- aggregate.fun(wp, ...)
row <- as.data.frame(t(as.vector(x)))
} else {
row <- as.data.frame(t(rep(NA, length(samples))))
}
names(row) <- samples
row <- data.frame(protein=prot, row, check.names=FALSE)
return(row)
}, mc.cores=ncores)
protint <- do.call(rbind, protcond)
protlist[[cond]] <- protint
}
# dplyr join all tables
protab <- Reduce(function(df1, df2) dplyr::full_join(df1,df2, by="protein"), protlist)
# remove empty rows (happens when min.peptides > 1)
protab <- protab[which(rowSums(!is.na(protab)) > 0), ]
proteins <- protab$protein
protab <- as.matrix(protab[,as.character(meta$sample)]) # just numbers
# count peptides
n <- as.integer(tapply(pepdat$pep2prot$protein, pepdat$pep2prot$protein, length))
npep <- data.frame(npep=n[proteins])
rownames(npep) <- proteins # a bit redundant, but might be useful
prodat <- proteusData(protab, meta, "protein", pepdat$pep2prot, pepdat$peptides,
proteins, pepdat$measures,
type = pepdat$type,
sequence.col = pepdat$sequence.col,
min.peptides = min.peptides,
peptide.aggregate.fun = pepdat$peptide.aggregate.fun,
peptide.aggregate.parameters = pepdat$peptide.aggregate.parameters,
protein.aggregate.fun = deparse(substitute(aggregate.fun)),
protein.aggregate.parameters = parameterString(...),
npep = npep)
return(prodat)
}
#' Protein aggregate function using hifly method
#'
#' Function to aggregate peptides into proteins using a hi-flyer method of Silva
#' et al. 2006 (http://www.mcponline.org/content/5/1/144.full.pdf).
#'
#' @details
#'
#' This function should be used from within \code{\link{makeProteinTable}}. The
#' "hifly" method is a follows. \enumerate{ \item For a given protein find all
#' corresponding peptides. \item In each replicate, order intensities from the
#' highest to the lowest. This is done separately for each replicate. \item
#' Select n top rows of the ordered table. \item In each replicate, find the
#' mean of these three rows. This is the estimated protein intensity. }
#'
#' @param wp Intensity table with selection of peptides for this protein
#' @param hifly Number of high-fliers
#'
#' @return A numeric vector with aggregated protein intensities
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat <- makeProteinTable(pepdat.clean, aggregate.fun=aggregateHifly, hifly=3)
#' }
#' @export
aggregateHifly <- function(wp, hifly=3) {
npep <- nrow(wp)
stopifnot(npep > 0)
if(npep > 1) {
sp <- as.matrix(apply(wp, 2, function(x) sort(x, na.last=TRUE, decreasing=TRUE)))
ntop <- min(npep, hifly)
row <- t(colMeans(sp[1:ntop,,drop=FALSE], na.rm=TRUE))
row[is.nan(row)] <- NA # colMeans puts NaNs where the column contains only NAs
} else {
row <- wp
}
return(as.vector(row))
}
#' Protein/peptide aggregate function using median
#'
#' This function should be used from within \code{\link{makePeptideTable}} or
#' \code{\link{makeProteinTable}}. It aggregates values by calculating the
#' median for each column of \code{wp}.
#'
#' @param wp Matrix with columns corresponding to samples and rows corresponding
#' to peptide entries.
#'
#' @return A numeric vector with aggregated values
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat <- makeProteinTable(pepdat.clean, aggregate.fun=aggregateMedian)
#' }
#' @export
aggregateMedian <- function(wp) {
row <- apply(wp, 2, function(x) median(x, na.rm=TRUE))
return(as.vector(row))
}
#' Peptide/protein aggregate function using sum
#'
#' This function should be used from within \code{\link{makePeptideTable}} or
#' \code{\link{makeProteinTable}}. It aggregates values by calculating the sum
#' for each column of \code{wp}.
#'
#' @param wp Matrix with columns corresponding to samples and rows corresponding
#' to peptide entries.
#'
#' @return A numeric vector with aggregated values
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat <- makeProteinTable(pepdat.clean, aggregate.fun=aggregateSum)
#' }
#' @export
aggregateSum <- function(wp) {
row <- colSums(wp, na.rm=TRUE)
row[row==0] <- NA # colSums puts zeroes where the column contains only NAs (!!!)
return(as.vector(row))
}
#' Annotate proteins
#'
#' \code{annotateProteins} adds protein annotations to a \code{proteusData}
#' object.
#'
#' @details
#'
#' The only information about proteins proteus package extracts from the
#' evidence file are protein identifiers. These can be in various forms,
#' depending on how MaxQuant was ran. In order to annotate them, two steps are
#' required.
#'
#' First, the user needs to create a data frame linking protein identifiers (as
#' in \code{pdat$proteins} vector) to some metadata. This data frame has to
#' contain a column called \code{protein} with the identifiers and any
#' additional columns with annotations, e.g., UniProt IDs, protein names, gene
#' names, domains, GO-terms and so on. These data can be obtained from UniProt.
#' Any annotation columns included will be used by interactive functions
#' \code{plotFID_live} and \code{plotVolcano_live}.
#'
#' See \code{\link{fetchFromUniProt}} for obtaining annotation data from
#' UniProt.
#'
#' Once the annotation table is created, it can be merged into the
#' \code{proteusData} object using \code{annotateProteins} function. The order
#' of identifiers in the annotation table is not important. Also, not all
#' proteins have to be present. The merge will add only the matching proteins.
#' As a result, this function returns a \code{proteusData} object with
#' \code{annotation} field added. It contains an annotation data frame with rows
#' corresponding to the internal list of proteins. This means the rows of this
#' data frame correspond one-to-one to the rows in \code{pdat$tab} and
#' \code{pdat$detect}, if pdat contains proteins (not peptides).
#'
#' @param pdat A \code{proteusData} object containing protein data
#' @param annotation A data frame with a column \code{protein} containing
#' protein identifiers, as in \code{pdat$proteins}
#'
#' @return A \code{proteusData} with annotation field added.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#'
#' # annotations.id is a data frame with annotations
#' prodat <- annotateProteins(prodat, annotations.id)
#'
#' @export
annotateProteins <- function(pdat, annotation) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
if(!is.data.frame(annotation)) stop ("Annotation needs to be a data frame.")
if(is.null(annotation$protein)) stop ("Annotation table has to have a 'protein' column.")
if(anyDuplicated(annotation$protein) > 0) stop("protein column in annotations must contain only unique identifiers.")
nover <- length(intersect(pdat$proteins, annotation$protein))
if(nover == 0) stop("No overlap between data and annotation. Nothing to annotate.")
mrg <- data.frame(protein=pdat$proteins)
mrg <- merge(mrg, annotation, by="protein", all.x=TRUE)
pdat$annotation <- mrg
cat(paste("Annotated", nover, "proteins.\n"))
return(pdat)
}
#' Normalize columns of a matrix to medians
#'
#' \code{normalizeMedian} normalizes the columns of a matrix to have the same
#' medians. It should be used with \code{\link{normalizeData}} function.
#'
#' @param tab Data frame with raw intensities. Normally, this is a \code{tab}
#' field in the \code{proteusData} object (see examples below).
#' @return Normalized matrix.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' normtab <- normalizeMedian(prodat$tab)
#'
#' @export
normalizeMedian <- function(tab) {
norm.fac <- apply(tab, 2, function(x) {median(x, na.rm=TRUE)})
norm.fac <- norm.fac / mean(norm.fac, na.rm=TRUE)
tab <- t(t(tab) / norm.fac)
return(tab)
}
#' Normalize proteus data
#'
#' \code{normalizeData} normalizes the intensity table in a \code{proteusData}
#' object, using the provided normalizing function.
#'
#' @param pdat A \code{proteusData} object with peptide or protein intensities.
#' @param norm.fun A normalizing function.
#' @return A \code{proteusData} object with normalized intensities.
#'
#' @details The normalizing function, specified by \code{norm.fun} needs to
#' normalize columns of a numerical matrix. The input is a matrix and the
#' output is a normalized matrix. The default value points to
#' \code{\link{normalizeMedian}}, Proteus's function normalizing each column
#' to its median. Other functions can be used, for example
#' \code{normalizeQuantiles} from the \code{limma} package.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat, norm.fun=normalizeMedian)
#'
#' @export
normalizeData <- function(pdat, norm.fun=normalizeMedian) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
if(class(norm.fun) != "function") stop ("'norm.fun' has to be a function.")
pdat$tab <- norm.fun(pdat$tab)
pdat$norm.fun <- deparse(substitute(norm.fun))
pdat$stats <- intensityStats(pdat) # need to recalculate stats!
return(pdat)
}
#' RAS procedure for CONSTANd normalization (not exported)
#'
#' @param K Input matrix
#' @param max.iter Maximum number of iterations
#' @param eps Convergence limit
#'
#' @return Matrix normalized so row and column means equal 1/n (n - number of
#' columns)
RAS <- function(K, max.iter=50, eps=1e-5) {
n <- ncol(K)
m <- nrow(K)
# ignore rows with only NAs
good.rows <- which(rowSums(!is.na(K)) > 0)
K <- K[good.rows, ]
cnt <- 1
repeat {
row.mult <- 1 / (n * rowMeans(K, na.rm=TRUE))
K <- K * row.mult
err1 <- 0.5 * sum(abs(colMeans(K, na.rm=TRUE) - 1/n))
col.mult <- 1 / (n * colMeans(K, na.rm=TRUE))
K <- t(t(K) * col.mult)
err2 <- 0.5 * sum(abs(rowMeans(K, na.rm=TRUE) - 1/n))
cnt <- cnt + 1
if(cnt > max.iter || (err1 < eps && err2 < eps)) break
}
# reconstruct full table
KF <- matrix(NA, nrow=m, ncol=n)
KF[good.rows, ] <- K
return(KF)
}
#' Normalize protein/peptide data using CONSTANd normalization
#'
#' @details
#'
#' \code{normalizeTMT} implements CONSTANd algorithm from Maes et al. (2016)
#' \url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4974351/pdf/zjw2779.pdf}.
#' It normalizes TMT table, for each experiment separately. After normalization
#' each row shows the percentage of total row intensity and each column is
#' normalized to their mean value. Hence, both row and column means are equal
#' 1/n, where n is the number of columns (reporters).
#'
#' @param pdat A \code{proteusData} object with peptide or protein intensities.
#' @param max.iter Maximum number of iterations for the RAS procedure.
#' @param eps Convergence limit for the RAS procedure.
#'
#' @return A \code{proteusData} with normalized data.
#'
#' @examples
#' library(proteusTMT)
#' data(proteusTMT)
#' prodat.norm <- normalizeTMT(prodat)
#'
#' @export
normalizeTMT <- function(pdat, max.iter=50, eps=1e-5) {
for(ex in levels(as.factor(pdat$meta$experiment))) {
colsel <- which(pdat$metadata$experiment == ex)
pdat$tab[, colsel] <- RAS(pdat$tab[, colsel])
pdat$norm.fun <- "CONSTANd"
pdat$stats <- intensityStats(pdat) # need to recalculate stats!
}
return(pdat)
}
#' Plot distance matrix
#'
#' \code{plotDistanceMatrix} plots a distance matrix for peptide or protein
#' data.
#'
#' Computes a distance matrix and plots it as a shade-plot. The default distance
#' is Pearson's correlation coefficient. Other methods will be introduced later.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @param distance A method to calculate distance.
#' @param text.size Text size on axes
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotDistanceMatrix(pepdat)
#'
#' @export
plotDistanceMatrix <- function(pdat, distance=c("correlation"), text.size=10) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
distance <- match.arg(distance)
corr.mat <- cor(pdat$tab, use="complete.obs")
m <- reshape2::melt(corr.mat, varnames=c("Sample1", "Sample2"))
m$Sample1 <- factor(m$Sample1, levels=pdat$metadata$sample)
m$Sample2 <- factor(m$Sample2, levels=pdat$metadata$sample)
ggplot(m, aes_(x=~Sample1, y=~Sample2)) +
geom_tile(aes_(fill=~value)) +
viridis::scale_fill_viridis() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5, size=text.size),
axis.text.y = element_text(size=text.size)
) +
labs(x='Sample', y='Sample', fill="Correlation")
}
#' Plot clustering dendrogram
#'
#' \code{plotClustering} plots a dendrogram of intensity data, using
#' hierarchical clustering.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @param x.text.size Size of text on the x-axis (sample names).
#' @return Creates a plot
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotClustering(pepdat)
#'
#' @export
plotClustering <- function(pdat, x.text.size=10) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
corr.mat <- cor(pdat$tab, use="complete.obs")
dis <- as.dist(1 - corr.mat) # dissimilarity matrix
hc <- hclust(dis)
dendr <- ggdendro::dendro_data(hc)
dat <- ggdendro::segment(dendr)
theme.d <- ggplot2::theme(
panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
panel.background = ggplot2::element_blank(),
axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust=0.5, size=x.text.size),
axis.line.x = ggplot2::element_blank(),
axis.ticks.x = ggplot2::element_blank()
)
ggplot() +
theme.d +
geom_segment(data=dat, aes_(x=~x, y=~y, xend=~xend, yend=~yend)) +
scale_x_continuous(breaks = seq_along(dendr$labels$label), labels = dendr$labels$label) +
scale_y_continuous(expand = c(0,0), limits = c(0, max(dat$y) * 1.03)) +
labs(x="Sample", y="Distance")
}
#' Plot PCA
#'
#' \code{plotPCA} creates a principal analysis plot using first two components.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @param with.legend Logical, whether to plot a legend.
#' @param point.size Size of the point in the plot.
#' @param text.size Size of the axis text.
#' @param label.size Size of the text labels identifying sapmles.
#' @param legend.text.size Size of the legend text.
#' @param palette Palette of colours
#'
#' @return Creates a plot
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotPCA(pepdat)
#' @export
plotPCA <- function(pdat, with.legend=TRUE, point.size=1.2, text.size=10,
label.size=3, legend.text.size=7, palette=cbPalette) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
pca <- prcomp(t(na.omit(log10(pdat$tab))), scale.=TRUE, center=TRUE)
p <- data.frame(
x = pca$x[, 1],
y = pca$x[, 2],
condition = pdat$metadata$condition,
sample = pdat$metadata$sample
)
var.perc <- 100 * (pca$sdev)^2 / sum((pca$sdev)^2)
pca1 <- sprintf("PCA1 (%5.1f%%)", var.perc[1])
pca2 <- sprintf("PCA2 (%5.1f%%)", var.perc[2])
g <- ggplot(p, aes_(x=~x, y=~y, label=~sample)) +
simple_theme +
theme(
legend.title = element_text(size = legend.text.size),
legend.text = element_text(size = legend.text.size)
) +
coord_cartesian(expand=TRUE) +
geom_point(aes_(colour=~condition), size=point.size) +
ggrepel::geom_text_repel(size=label.size) +
scale_color_manual(values=palette) +
theme(text = element_text(size=text.size)) +
labs(x=pca1, y=pca2)
if(!with.legend) g <- g + theme(legend.position="none")
g
}
#' Plot peptide or protein count per sample
#'
#' \code{plotCount} makes a plot of peptide/protein count per sample.
#'
#' @param pdat A \code{proteusData} object.
#' @param x.text.size Size of text on the x-axis (sample names).
#' @param palette Palette of colours
#' @return A plot (\code{ggplot} object) of the number of peptides/proteins
#' detected in each sample.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotCount(pepdat)
#'
#' @export
plotCount <- function(pdat, x.text.size=10, palette=cbPalette){
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
meta <- pdat$metadata
entry.count <- apply(pdat$tab, 2, function(x) sum(!is.na(x)))
med.count <- median(entry.count)
df <- data.frame(x=meta$sample, y=entry.count, condition=meta$condition)
g <- ggplot(df, aes_(x=~x,y=~y,fill=~condition)) +
geom_col(colour='grey60') +
simple_theme +
scale_y_continuous(expand = c(0,0)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5, size=x.text.size)) +
labs(x="Sample", y="Count") +
labs(title = paste0("Median count = ", med.count)) +
theme(plot.title=element_text(hjust=0, size=12))
if(nlevels(as.factor(df$condition)) <= length(palette)) g <- g + scale_fill_manual(values=palette)
#g <- g + viridis::scale_fill_viridis(discrete=TRUE)
g
}
#' Jaccard similarity
#'
#' Computes Jaccard similarity between "detections" in two vectors of the same
#' length. A detection is a value, as opposed to NA.
#'
#' @param x A vector
#' @param y A vector
#'
#' @return Jaccard similarity between x and y
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' sim <- jaccardSimilarity(pepdat$tab[,1], pepdat$tab[,2])
#'
#' @export
#'
jaccardSimilarity <- function(x, y) {
stopifnot(length(x) == length(y))
intersection <- which(!is.na(x) & !is.na(y))
union <- which(!is.na(x) | !is.na(y))
jaccard <- ifelse(length(union) > 0, length(intersection) / length(union), 0)
return(jaccard)
}
#' Detection Jaccard similarity
#'
#' \code{plotDetectionSimilarit} plots a distribution of (peptide) detection
#' similarities between samples. Jaccard similarity between two sets is defined
#' as the size of the intersection divided by the size of the union. This plot
#' can be used to assess quality of data.
#'
#' @param pdat A \code{proteusData} object with peptides (or proteins).
#' @param text.size Text size.
#' @param plot.grid Logical to plot grid.
#' @param bin.size Bin size for the histogram.
#' @param hist.colour Colour of the histogram.
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotDetectionSimilarity(pepdat)
#'
#'
#' @export
#'
plotDetectionSimilarity <- function(pdat, bin.size=0.01, text.size=12, plot.grid=TRUE, hist.colour='grey30') {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
n <- ncol(pdat$tab)
# indices of upper triangular: all pair-wise combinations of columns in tab
pairs <- which(upper.tri(matrix(1, n, n)) == TRUE, arr.ind=TRUE)
# Jaccard simliarity for each pair of columns in tab
sim <- apply(pairs, 1, function(i) jaccardSimilarity(pdat$tab[,i[1]], pdat$tab[,i[2]]))
ggplot(data.frame(sim), aes_(~sim, ~..density..)) +
{if(plot.grid) simple_theme_grid else simple_theme} +
geom_histogram(breaks=seq(0, 1, bin.size), colour=hist.colour, fill=hist.colour) +
labs(x='Jaccard similarity', y='Density') +
theme(text = element_text(size=text.size))
}
#' Plot distribution of intensities/ratios for each sample
#'
#' \code{plotSampleDistributions} makes a boxplot with intensity/ratio
#' distribution for each sample.
#'
#' @param pdat A \code{proteusData} object with peptide/protein intensities.
#' @param title Title of the plot.
#' @param method "box", "violin" or "dist"
#' @param palette Palette of colours
#' @param x.text.size Text size in value axis
#' @param x.text.angle Text angle in value axis
#' @param vmin Lower bound on log value
#' @param vmax Upper bound on log value
#' @param n.grid.rows Number of rows in the grid of facets
#' @param hist.bins Number of bins in histograms
#' @param log.scale Logical, to plot in logarithmic scale
#' @param log.base Base of the logarithm which will be applied to data
#' @param fill A metadata column to use for the fill of boxes
#' @param colour A metadata column to use for the outline colour of boxes
#' @param hline Logical, if true a horizontal line at zero is added
#'
#' @return A \code{ggplot} object.
#'
#' @export
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotSampleDistributions(prodat)
#' plotSampleDistributions(normalizeData(prodat))
#'
plotSampleDistributions <-
function(pdat, title="", method=c("violin", "dist", "box"), x.text.size=7, n.grid.rows=3,
hist.bins=100, x.text.angle=90, vmin=as.numeric(NA), vmax=as.numeric(NA), log.scale=TRUE,
log.base=10, palette=cbPalette, fill=NULL, colour=NULL, hline=FALSE) {
method <- match.arg(method)
m <- reshape2::melt(pdat$tab, varnames=c("ID", "sample"))
mt <- data.frame(pdat$metadata, row.names = pdat$metadata$sample)
if(!is.null(fill)) m[['fill']] <- as.factor(mt[m$sample, fill])
if(!is.null(colour)) m[['colour']] <- as.factor(mt[m$sample, colour])
if(log.scale > 0) m$value <- log(m$value, base=log.base)
lg <- ifelse(log.scale, paste("log", log.base), "")
if(method == "box" | method == "violin") {
g <- ggplot(m, aes_(x=~sample, y=~value)) +
simple_theme +
ylim(vmin, vmax) +
theme(axis.text.x = element_text(angle = x.text.angle, hjust = 1, vjust=0.5, size=x.text.size)) +
labs(title=title, x="sample", y=paste0(lg, " value"))
if(hline) g <- g + geom_hline(yintercept=0, colour='grey')
if(method=="box") g <- g + geom_boxplot(outlier.shape=NA, na.rm=TRUE)
if(method=="violin") g <- g + geom_violin(na.rm=TRUE, draw_quantiles=c(0.25, 0.5, 0.75), scale="width")
} else if (method == "dist") {
g <- ggplot(m, aes_(x=~value)) +
geom_histogram(bins=hist.bins) +
facet_wrap(~sample, nrow=n.grid.rows) +
xlim(vmin, vmax)
} else stop("Wrong method.")
if(!is.null(fill)) {
g <- g + aes_(fill=~fill)
if(nlevels(m$fill) <= length(palette)) g <- g + scale_fill_manual(name=fill, values=palette)
}
if(!is.null(colour)) {
g <- g + aes_(colour=~colour)
if(nlevels(m$colour) <= length(palette)) g <- g + scale_color_manual(name=colour, values=palette)
}
g
}
#' Statistics for an intensity table
#'
#' \code{intensityStats} calculates the mean, variance and number of good data
#' points for peptide or protein intensities.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @return A data frame with several statistics.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' stats <- intensityStats(prodat)
#'
#' @export
intensityStats <- function(pdat) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
meta <- pdat$metadata
stats <- NULL
for(cond in pdat$conditions) {
w <- pdat$tab[,which(meta$condition == cond), drop=FALSE]
m <- rowMeans(w, na.rm=TRUE)
m[which(is.nan(m))] <- NA
v <- apply(w, 1, function(v) sd(v, na.rm=TRUE)^2)
ngood <- apply(w, 1, function(v) sum(!is.na(v)))
stats <- rbind(stats, data.frame(id=rownames(w), condition=cond, mean=m, variance=v, ngood=ngood))
rownames(stats) <- NULL
}
return(stats)
}
#' Create a table indicating good data.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#'
#' @return A data frame with rows corresponding to peptides/proteins and columns
#' corresponding to conditions. Logical values indicate if at least one
#' replicate is available (TRUE) or if all replicates are missing (FALSE).
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' detect <- goodData(prodat)
#'
#' @export
goodData <- function(pdat) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
meta <- pdat$metadata
G <- lapply(pdat$conditions, function(cond){
w <- pdat$tab[,which(meta$condition == cond), drop=FALSE]
rowSums(!is.na(w)) > 0
})
good <- as.data.frame(do.call(cbind, G))
names(good) <- pdat$conditions
return(good)
}
#' Plot mean-variance relationship
#'
#' \code{plotMV} makes a plot with variance of log-intensity vs mean of
#' log-intensity.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @param with.loess Logical. If true, a loess line will be added.
#' @param bins Number of bins for binhex
#' @param xmin Lower limit on x-axis
#' @param xmax Upper limit on x-axis
#' @param ymin Lower limit on y-axis
#' @param ymax Upper limit on y-axis
#' @param with.n Logical to add a text with the number of proteins to the plot
#' @param mid.gradient A parameter to control the midpoint of colour gradient
#' (between 0 and 1)
#' @param text.size Text size on axes
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotMV(prodat, with.loess=TRUE)
#'
#' @export
plotMV <- function(pdat, with.loess=FALSE, bins=80, xmin=5, xmax=10, ymin=7, ymax=20, with.n=FALSE, mid.gradient=0.3, text.size=10) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
meta <- pdat$metadata
if(is.null(meta)) stop("No metadata found.")
stats <- pdat$stats
if(is.null(stats)) stats <- intensityStats(pdat)
stats <- stats[which(!is.na(stats$mean) & !is.na(stats$variance) & stats$mean > 0 & stats$variance > 0),]
stats$mean <- log10(stats$mean)
stats$variance <- log10(stats$variance)
protnum <- as.data.frame(table(stats$condition)) # number of proteins in each condition
colnames(protnum) <- c("condition", "n")
protnum$labn <- paste0("n = ", protnum$n)
# has to be calculated for each condition separately
if(with.loess) {
ldf <- NULL
for(cond in pdat$conditions)
{
st <- stats[which(stats$condition == cond),]
ls <- loess(variance ~ mean, data=st)
x <- seq(from=min(na.omit(st$mean)), to=max(na.omit(st$mean)), by=0.01)
pr <- predict(ls, x)
ldf <- rbind(ldf, data.frame(condition=cond, x=x, y=pr))
}
}
g <- ggplot(stats, aes_(x=~mean, y=~variance)) +
simple_theme +
theme(panel.border = element_rect(fill=NA, color='black')) +
xlim(xmin, xmax) +
ylim(ymin, ymax) +
facet_wrap(~condition) +
stat_binhex(bins=bins) +
theme(text = element_text(size=text.size)) +
viridis::scale_fill_viridis(values=c(0, mid.gradient, 1), name="count", na.value=NA) +
#scale_fill_gradientn(colours=c("seagreen","yellow", "red"), values=c(0, mid.gradient, 1), name="count", na.value=NA)
if(with.n) g <- g + geom_text(data=protnum, aes_(x=~xmin+0.5, y=~ymax, label=~labn))
if(with.loess) g <- g + geom_line(data=ldf, aes_(x=~x,y=~y), color='black')
return(g)
}
#' Plot protein/peptide intensities
#'
#' \code{plotIntensities} makes a plot with peptide/protein intensity as a
#' function of the condition and replicate. When multiple IDs are entered, the
#' mean and standard error is plotted.
#'
#' @param pdat A \code{proteusData} object.
#' @param id Protein name, peptide sequence or a vector with these.
#' @param log Logical. If set TRUE a logarithm of intensity is plotted.
#' @param ymin Lower bound for y-axis
#' @param ymax Upper bound for y-axis
#' @param text.size Text size
#' @param point.size Point size
#' @param title Title of the plot (defaults to protein name)
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' plotIntensities(prodat.med, id='sp|P26263|PDC6_YEAST', log=TRUE)
#'
#' @export
plotIntensities <- function(pdat, id=NULL, log=FALSE, ymin=as.numeric(NA), ymax=as.numeric(NA),
text.size=12, point.size=3, title=NULL) {
# without 'as.numeric' it returns logical NA (!!!)
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
if(is.null(id)) stop ("Need peptide/protein id.")
if(pdat$content == "peptide") {
selcol <- pdat$peptides
} else if (pdat$content == "protein") {
selcol <- pdat$proteins
} else {
stop("Unrecognized content in pdat object.")
}
sel <- which(selcol %in% id)
if(length(sel) > 0 && sel > 0) {
E <- if(log) log10(pdat$tab[sel,,drop=FALSE]) else pdat$tab[sel,,drop=FALSE]
e <- colMeans(E, na.rm=TRUE)
s <- sapply(E, function(x) sd(x, na.rm=TRUE)/sqrt(length(x)))
n <- length(sel)
if(is.null(title)) {
title <- ifelse(n == 1, id, paste0("selection of ", n, " ", pdat$content, "s."))
}
meta <- pdat$metadata
p <- data.frame(
expr = e,
lo = e - s,
up = e + s,
condition = factor(meta$condition, levels=unique(as.factor(meta$condition))),
replicates = factor(meta$replicate)
)
# define shapes
p$shape <- rep(21, length(p$expr))
p$shape[which(p$expr==0)] <- 24
pd <- position_dodge(width = 0.15)
#
if(is.na(ymin) && !log) ymin=0
ylab <- ifelse(pdat$type == "SILAC", "Ratio", "Intensity")
if(log) ylab <- paste("log10", ylab)
ggplot(p, aes_(x=~condition, y=~expr, ymin=~lo, ymax=~up, fill=~replicates, shape=~shape)) +
simple_theme_grid +
theme(text = element_text(size=text.size), legend.position = "none") +
ylim(ymin, ymax) +
{if(n > 1) geom_errorbar(position=pd, width = 0.1)} +
geom_point(position=pd, size=point.size) +
scale_shape_identity() + # necessary for shape mapping
viridis::scale_fill_viridis(discrete=TRUE) +
labs(x = 'Condition', y = ylab, title=title)
}
}
#' Simple differential expression with limma
#'
#' \code{limmaDE} is a wrapper around \code{\link{limma}} to perform a
#' differential expression between a pair of conditions.
#'
#' @details
#'
#' Before \code{limma} is called, intensity data are transformed using the
#' \code{transform.fun} function. The default for this transformation is
#' \code{log2}. Therefore, by default, the column "logFC" in the output data
#' frame contains log2 fold change. If you need log10-based fold change, you can
#' use \code{transform.fun=log10}.
#'
#' \code{limmaDE} is only a simple wrapper around \code{\link{limma}}, to
#' perform differential expression between two conditions. For more complicated
#' designs we recommend using \code{\link{limma}} functions directly.
#'
#'
#' @param pdat Protein \code{proteusData} object.
#' @param formula A string with a formula for building the linear model.
#' @param conditions A character vector with two conditions for differential
#' expression. Can be omitted if there are only two condition in \code{pdat}.
#' @param transform.fun A function to transform data before differential
#' expression.
#' @param sig.level Significance level for rejecting the null hypothesis.
#' @return A data frame with DE results. "logFC" column is a log-fold-change
#' (using the \code{transform.fun}). Two columns with mean log-intensity
#' (again, using \code{transform.fun}) and two columns with the number of good
#' replicates (per condition) are added. Attributes contain additional
#' information about the transformation function, significance level, formula
#' and conditions.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' res <- limmaDE(prodat.med)
#'
#' @export
limmaDE <- function(pdat, formula="~condition", conditions=NULL, transform.fun=log2, sig.level=0.05) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
meta <- pdat$metadata
tab <- transform.fun(pdat$tab)
# default conditions
if(!is.null(conditions)) {
for(cond in conditions ) {
if(!(cond %in% meta$condition)) stop(paste("Condition", cond, "not found in metadata."))
}
sel <- which(meta$condition %in% conditions)
meta <- droplevels(meta[sel,])
tab <- tab[,sel]
} else {
conditions <- levels(meta$condition)
}
if(length(conditions) != 2) stop("This function requires exactly two conditions. Use the parameter conditions.")
# limma analysis
design <- model.matrix(as.formula(formula), meta)
fit <- limma::lmFit(tab, design)
ebay <- limma::eBayes(fit)
coef <- colnames(ebay$design)[2]
res <- limma::topTable(ebay, coef=coef, adjust="BH", sort.by="none", number=1e9)
res <- cbind(rownames(res), res)
colnames(res)[1] <- pdat$content
res$significant <- res$adj.P.Val <= sig.level
rownames(res) <- c()
# add columns with mean intensity
for(cond in conditions) {
cname <- paste0("mean_", cond)
m <- rowMeans(tab[,which(meta$condition == cond), drop=FALSE], na.rm=TRUE)
m[which(is.nan(m))] <- NA
res[, cname] <- m
}
# add columns with number of good replicates
ngood <- reshape2::dcast(pdat$stats, id ~ condition, fun.aggregate=sum, value.var="ngood")
ngood <- ngood[, c("id", conditions)]
names(ngood)[2:ncol(ngood)] <- paste0("ngood_", names(ngood)[2:ncol(ngood)])
res <- merge(res, ngood, by.x=pdat$content, by.y="id")
# add annotations
if(!is.null(pdat$annotation)) {
res <- merge(res, pdat$annotation, by=pdat$content, all.x=TRUE)
}
attr(res, "transform.fun") <- deparse(substitute(transform.fun))
attr(res, "sig.level") <- sig.level
attr(res, "formula") <- formula
attr(res, "conditions") <- levels(meta$condition)
return(res)
}
#' Simple one-sample differential expression with limma
#'
#' \code{limmaDE} is a wrapper around \code{\link{limma}} to perform a
#' differential expression for one condition against the null hypothesis of one.
#' This is designed to be used with SILAC ratios.
#'
#' @details
#'
#' Performs differential expression for one condition, assuming the measured
#' values are ratios and the null hypothesis is the ratio of 1. The analysis is
#' done in logarithmic space, so the actual null hypothesis tested is log ratio
#' = 0. The log fold change returned is the log mean ratio of the condition.
#'
#' Before \code{limma} is called, intensity data are transformed using the
#' \code{transform.fun} function. The default for this transformation is
#' \code{log2}. Therefore, by default, the column "logFC" in the output data
#' frame contains log2 fold change. If you need log10-based fold change, you can
#' use \code{transform.fun=log10}.
#'
#' \code{limmaDE} is only a simple wrapper around \code{\link{limma}}, to
#' perform differential expression between two conditions. For more complicated
#' designs we recommend using \code{\link{limma}} functions directly.
#'
#'
#' @param pdat Protein \code{proteusData} object.
#' @param condition Condition name.
#' @param transform.fun A function to transform data before differential
#' expression.
#' @param sig.level Significance level for rejecting the null hypothesis.
#' @return A data frame with DE results. "logFC" column is a log-fold-change
#' (using the \code{transform.fun}). Attributes contain
#' additional information about the transformation function, significance
#' level and condition name.
#'
#' @examples
#' \dontrun{
#' library(proteusSILAC)
#' data(proteusSILAC)
#' prodat.norm <- normalizeData(prodat)
#' res <- limmaRatioDE(prodat.norm, condition="T48")
#' }
#' @export
limmaRatioDE <- function(pdat, condition=NULL, transform.fun=log2, sig.level=0.05) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
meta <- pdat$metadata
tab <- transform.fun(pdat$tab)
# check condition
if(is.null(condition)) stop("Need a condition.")
if(length(condition) != 1) stop("Need exactly one condition.")
# select condition
sel <- which(meta$condition == condition)
if(length(sel) < 2) stop("Need at least two replicates for one-sample differential expression.")
tab <- tab[, sel]
# limma analysis
design <- cbind(Intercept = rep(1, ncol(tab)))
fit <- limma::lmFit(tab, design)
ebay <- limma::eBayes(fit)
res <- limma::topTable(ebay, adjust="BH", sort.by="none", number=1e9)
res <- cbind(rownames(res), res)
colnames(res)[1] <- pdat$content
res$significant <- res$adj.P.Val <= sig.level
rownames(res) <- c()
# add column with number of good replicates
ngood <- pdat$stats[pdat$stats$condition == condition, c("id", "ngood")]
res <- merge(res, ngood, by.x=pdat$content, by.y="id")
# add annotations
if(!is.null(pdat$annotation)) {
res <- merge(res, pdat$annotation, by=pdat$content, all.x=TRUE)
}
attr(res, "transform.fun") <- deparse(substitute(transform.fun))
attr(res, "sig.level") <- sig.level
attr(res, "conditions") <- condition
return(res)
}
#' Fold-change intensity diagram
#'
#' \code{plotFID} makes a log10 fold change versus log10 sum intensity plot,
#' usually known as MA plot.
#'
#' @param pdat Protein \code{proteusData} object.
#' @param pair A two-element vector containing the pair of conditions to use.
#' Can be skipped if there are only two conditions.
#' @param bins Number of bins for binhex.
#' @param marginal.histograms A logical to add marginal histograms.
#' @param xmin Lower limit on x-axis.
#' @param xmax Upper limit on x-axis.
#' @param ymax Upper limit on y-axis. If used, the lower limit is -ymax.
#' @param text.size Text size.
#' @param point.size Size of points in the plot.
#' @param show.legend Logical to show legend (colour key).
#' @param plot.grid Logical to plot a grid.
#' @param binhex Logical. If TRUE, a hexagonal density plot is made, otherwise it
#' is a simple point plot.
#' @param transform.fun A function to transform data before plotting.
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' plotFID(prodat.med)
#'
#' @export
plotFID <- function(pdat, pair=NULL, bins=80, marginal.histograms=FALSE,
xmin=NULL, xmax=NULL, ymax=NULL, text.size=12, point.size=1,
show.legend=TRUE, plot.grid=TRUE, binhex=TRUE, transform.fun=log10) {
if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
if(binhex & marginal.histograms) {
warning("Cannot plot with both binhex=TRUE and marginal.histograms=TRUE. Ignoring binhex.")
binhex=FALSE
}
if(is.null(pair)) pair <- pdat$conditions
if(length(pair) != 2) stop("Need exactly two conditions. You might need to specify pair parameter.")
title <- paste(pair, collapse=":")
tab <- transform.fun(pdat$tab)
# helper function
condMeans <- function(cond) {
m <- rowMeans(tab[,which(pdat$metadata$condition == cond), drop=FALSE], na.rm=TRUE)
m[is.nan(m)] <- NA
m
}
# build data frame with x-y cooordinates
# including infinities
m1 <- condMeans(pair[1])
m2 <- condMeans(pair[2])
good <- !is.na(m1) & !is.na(m2)
df <- data.frame(
id = rownames(pdat$tab),
x = (m1 + m2) / 2,
y = m2 - m1,
good = good
)
mx <- 1.1 * max(abs(df$y), na.rm=TRUE)
m <- rbind(m1[!good], m2[!good])
df[!good, "x"] <- colSums(m, na.rm=TRUE)
df[!good, "y"] <- ifelse(is.na(m[1,]), mx, -mx)
g <- ggplot(df[good, ], aes_(x=~x, y=~y))
if(binhex) {
g <- g + stat_binhex(bins=bins, show.legend=show.legend, na.rm=TRUE) +
viridis::scale_fill_viridis(name="count", na.value=NA)
#scale_fill_gradientn(colours=c("seagreen","yellow", "red"), name = "count",na.value=NA)
} else {
g <- g + geom_point(size=point.size, na.rm=TRUE) +
geom_point(data=df[!good,], aes_(x=~x, y=~y), colour="orange", size=point.size, na.rm=TRUE)
}
if(plot.grid) {
g <- g + simple_theme_grid
} else {
g <- g + simple_theme
}
g <- g + geom_abline(colour='red', slope=0, intercept=0) +
labs(title=title, x=paste0(pair[1], '+', pair[2]), y=paste0(pair[2], '-', pair[1])) +
theme(text = element_text(size=text.size))
if(!is.null(xmin) && !is.null(xmax)) g <- g + scale_x_continuous(limits = c(xmin, xmax), expand = c(0, 0))
if(!is.null(ymax) ) g <- g + scale_y_continuous(limits = c(-ymax, ymax), expand = c(0, 0))
if(marginal.histograms) g <- ggExtra::ggMarginal(g, size=10, type = "histogram", xparams=list(bins=100), yparams=list(bins=50))
return(g)
}
#' Plot p-value distribution
#'
#' \code{plotPdist} makes a plot with distribution of raw p-values, obtained by
#' \code{\link{limmaDE}}.
#'
#' @param res Output table from \code{\link{limmaDE}}.
#' @param text.size Text size.
#' @param plot.grid Logical to plot grid.
#' @param bin.size Bin size for the histogram.
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' res <- limmaDE(prodat.med)
#' plotPdist(res)
#'
#' @export
plotPdist <- function(res, bin.size=0.02, text.size=12, plot.grid=TRUE) {
ggplot(res, aes_(~P.Value, ~..density..)) +
{if(plot.grid) simple_theme_grid else simple_theme} +
geom_histogram(breaks=seq(0, 1, bin.size), colour='blue', na.rm=TRUE) +
labs(x='P-value', y='Density') +
theme(text = element_text(size=text.size))
}
#'Volcano plot
#'
#'\code{plotVolcano} makes a volcano plot from limma results. Uses
#'\code{\link{stat_binhex}} function from ggplot2 to make a hexagonal heatmap.
#'
#'@param res Result table from \code{\link{limmaDE}}.
#'@param bins Number of bins for binhex.
#'@param xmax Upper limit on x-axis. If used, the lower limit is -xmax.
#'@param ymax Upper limit on y-axis. If used, the lower limit is -ymax.
#'@param marginal.histograms A logical to add marginal histograms.
#'@param text.size Text size.
#'@param show.legend Logical to show legend (colour key).
#'@param plot.grid Logical to plot grid.
#'@param binhex Logical. If TRUE, a hexagonal density plot is made, otherwise it
#' is a simple point plot.
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' res <- limmaDE(prodat.med)
#' plotVolcano(res)
#'
#'@export
plotVolcano <- function(res, bins=80, xmax=NULL, ymax=NULL, marginal.histograms=FALSE, text.size=12, show.legend=TRUE,
plot.grid=TRUE, binhex=TRUE) {
if(binhex & marginal.histograms) {
warning("Cannot plot with both binhex=TRUE and marginal.histograms=TRUE. Ignoring binhex.")
binhex=FALSE
}
tr <- attr(res, "transform.fun")
conds <- attr(res, "conditions")
xlab <- ifelse(is.null(tr), "FC", paste(tr, "FC"))
tit <- paste(conds, collapse=":")
id <- names(res)[1]
g <- ggplot(res, aes_(~logFC, ~-log10(P.Value)))
if(binhex) {
g <- g + stat_binhex(bins=bins, show.legend=show.legend, na.rm=TRUE) +
viridis::scale_fill_viridis(name="count", na.value=NA)
#scale_fill_gradientn(colours=c("seagreen","yellow", "red"), name = "count", na.value=NA)
} else {
g <- g + geom_point(na.rm=TRUE)
}
if(plot.grid) {
g <- g + simple_theme_grid
} else {
g <- g + simple_theme
}
g <- g + geom_vline(colour='red', xintercept=0) +
theme(text = element_text(size=text.size)) +
labs(x=xlab, y="-log10 P", title=tit)
if(!is.null(xmax)) g <- g + scale_x_continuous(limits = c(-xmax, xmax), expand = c(0, 0))
if(!is.null(ymax) ) g <- g + scale_y_continuous(limits = c(0, ymax), expand = c(0, 0))
if(marginal.histograms) g <- ggExtra::ggMarginal(g, size=10, type = "histogram", xparams=list(bins=100), yparams=list(bins=50))
return(g)
}
#' Protein-peptide plot
#'
#' \code{plotProtPeptides} creates a plot consisting of two panels. The top
#' panel shows peptide log intensity. Each box represents one peptide, peptide
#' numbering follows alphabetical sequence order. The bottom panel shows sample
#' intensity. Each box represents one sample. White boxes show derived protein
#' intensities (if \code{prodat} is provided).
#'
#' @param pepdat Peptide \code{proteusData} object.
#' @param protein Protein name.
#' @param prodat (optional) protein \code{proteusData} object.
#' @param palette Palette of colours
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' plotProtPeptides(pepdat.clean, 'sp|P26263|PDC6_YEAST', prodat.med)
#'
#' @export
plotProtPeptides <- function(pepdat, protein, prodat=NULL, palette=cbPalette) {
if(!is(pepdat, "proteusData")) stop ("Input data must be of class proteusData.")
tab <- normalizeMedian(pepdat$tab)
# all peptides for this protein
selprot <- which(pepdat$pep2prot$protein == protein)
if(length(selprot) == 0) stop(paste0("Protein '", protein, "' not found."))
# need as.character, or indexing by factor is wrong!
peps <- as.character(pepdat$pep2prot[selprot,'sequence'])
mat <- as.matrix(tab[peps,])
dat <- reshape2::melt(mat, varnames=c("peptide", "sample"))
dat$sample <- factor(dat$sample, levels=pepdat$metadata$sample)
dat$pepnum <- sprintf("%02d", as.numeric(dat$peptide)) # convert sequences into numbers
dat$intensity <- log10(dat$value)
# add condition column
s2c <- setNames(pepdat$metadata$condition, pepdat$metadata$sample)
dat$condition <- s2c[dat$sample]
# add protein intensity column
if(!is.null(prodat)) {
p2p <- setNames(as.numeric(prodat$tab[protein,]), colnames(prodat$tab))
dat$prot.intensity <- log10(p2p[dat$sample])
}
g1 <- ggplot(dat, aes_(x=~pepnum, y=~intensity, fill=~condition)) +
scale_fill_manual(values=palette) +
#viridis::scale_fill_viridis(discrete=TRUE) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width=0, size=0.5) +
facet_wrap(~condition) +
theme(legend.position="none") +
labs(x="Peptide", y="log intensity", title=protein)
g2 <- ggplot(dat, aes_(x=~sample, y=~intensity, fill=~condition)) +
scale_fill_manual(values=palette) +
#viridis::scale_fill_viridis(discrete=TRUE) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width=0, size=0.5) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5)) +
theme(legend.position="none") +
labs(x="Sample", y="log intensity")
if(!is.null(prodat)) g2 <- g2 + geom_point(aes_(x=~sample, y=~prot.intensity), shape=22, size=3, fill='white')
gridExtra::grid.arrange(g1, g2, ncol=1)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.