Proteus: Downstream analysis of the MaxQuant output

Documented in aggregateHifly aggregateMedian aggregateSum annotateProteins goodData intensityStats jaccardSimilarity limmaDE limmaRatioDE makePeptideTable makeProteinTable normalizeData normalizeMedian normalizeTMT parameterString plotClustering plotCount plotDetectionSimilarity plotDistanceMatrix plotFID plotIntensities plotMV plotPCA plotPdist plotProtPeptides plotSampleDistributions plotVolcano proteusData RAS readColumnNames readEvidenceFile readProteinGroups summary.proteusData

#' @import ggplot2
#' @import graphics
#' @import viridis
#' @import methods
#' @import stats
#' @import utils
#' @import parallel

# In this package we deliberately use aes_(x=~x, y=~y) format in ggplot. This
# gets rid of unwanted NOTEs "no visible binding for global variables" in R CMD
# check.

simple_theme <- ggplot2::theme_bw() +
  ggplot2::theme(
    panel.border = ggplot2::element_blank(),
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.line = ggplot2::element_line(colour = "black")
  )
simple_theme_grid <- ggplot2::theme_bw() +
  ggplot2::theme(
    panel.border = ggplot2::element_blank(),
    panel.grid.major = ggplot2::element_line(colour = "grey90"),
    panel.grid.minor = ggplot2::element_line(colour = "grey95"),
    axis.line = ggplot2::element_line(colour = "black")
  )
cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

#' Evidence columns
#'
#' \code{evidenceColumns} contains default columns to be read from the evidence
#' file. The names of list elements are used internally to reference evidence
#' data.
#'
#' @examples
#' str(evidenceColumns)
#'
#' @export
evidenceColumns <- list(
  sequence = 'Sequence',
  modified_sequence = 'Modified sequence',
  modifications = 'Modifications',
  protein_group = 'Proteins',
  protein = 'Leading razor protein',
  experiment = 'Experiment',
  charge = 'Charge',
  reverse = 'Reverse',
  contaminant = 'Potential contaminant'
)

#' Measure columns
#'
#' \code{measureColumns} contains measurement columns from evidence file. In
#' case of label-free data, there is only one column: Intensity.
#'
#' @examples
#' str(measureColumns)
#'
#' @export
measureColumns <- list(
  intensity = 'Intensity'
)

#' Protein columns
#'
#' To be used as the default \code{data.cols} parameter in
#' \code{\link{readProteinGroups}}.
#'
#' @examples
#' str(proteinColumns)
#'
#' @export
proteinColumns <- list(
  protein = "Majority protein IDs",
  reverse = 'Reverse',
  contaminant = 'Potential contaminant'
)


#' \code{proteusData} constructor
#'
#' @param tab A matrix with data, rows are peptides/proteins, columns are
#'   samples
#' @param metadata A data frame with metadata, needs at least two columns:
#'   "sample" and "condition"
#' @param content Either "peptides" or "proteins"
#' @param pep2prot A data frame with two columns: "sequence" and "protein". Can
#'   use "other" for non-proteus data.
#' @param peptides A character vector with peptide sequences
#' @param proteins A character vector with protein names
#' @param measures A character vector with names of intensity and/or ratio
#'   columns
#' @param type Type of experiment: "label-free" or "SILAC"
#' @param npep A data frame with number of peptides per protein
#' @param sequence.col Name of the sequence used, either "sequence" or "modified_sequence"
#' @param protein.col Name of the protein column used, either "protein" or "protein_group"
#' @param peptide.aggregate.fun Function used to aggregate peptides
#' @param peptide.aggregate.parameters Additional parameters passed to peptide
#'   aggregate function
#' @param protein.aggregate.fun Function used to aggregate proteins
#' @param protein.aggregate.parameters Additional parameters passed to protein
#'   aggregate function
#' @param min.peptides Integer, minimum number of peptides to combine into a
#'   protein
#' @param norm.fun Normalizing function
#'
#' @return A \code{proteusData} object.
proteusData <- function(tab, metadata, content, pep2prot, peptides, proteins, measures,
                        npep=NULL, type="label-free", sequence.col="sequence", protein.col="protein",
                        peptide.aggregate.fun=NULL, peptide.aggregate.parameters=NULL,
                        protein.aggregate.fun=NULL, protein.aggregate.parameters=NULL,
                        min.peptides=NULL, norm.fun=identity) {
  stopifnot(
    ncol(tab) == nrow(metadata),
    is(tab, "matrix"),
    content %in% c("peptide", "protein", "other"),
    type %in% c("label-free", "SILAC", "TMT"),
    sequence.col %in% c("sequence", "modified_sequence"),
    protein.col %in% c("protein", "protein_group")
  )

  # make sure to keep correct order of samples
  metadata$sample <- factor(metadata$sample, levels=metadata$sample)

  colnames(tab) <- metadata$sample
  if(content == "peptide") {
    stopifnot(
      nrow(tab) == length(peptides),
      !is.null(peptide.aggregate.fun)
    )
    rownames(tab) <- peptides
  } else if(content == "protein") {
    stopifnot(
      nrow(tab) == length(proteins),
      is.numeric(min.peptides),
      !is.null(protein.aggregate.fun)
    )
    rownames(tab) <- proteins
  }

  if(!is.null(npep)) {
    stopifnot(nrow(npep) == nrow(tab))
  }

  # number of replicates in each condition
  cnd <- as.character(metadata$condition)
  cnd.fac <- factor(cnd)
  nrep <- tapply(cnd, cnd.fac, length) # nice trick
  metadata$condition <- cnd.fac

  pdat <- list(
    tab = tab,
    metadata = metadata,
    measures = measures,
    content = content,
    conditions = levels(cnd.fac),
    nrep = nrep,
    type = type,
    sequence.col = sequence.col,
    protein.col = protein.col,
    peptide.aggregate.fun = peptide.aggregate.fun,
    peptide.aggregate.parameters = peptide.aggregate.parameters,
    protein.aggregate.fun = protein.aggregate.fun,
    protein.aggregate.parameters = protein.aggregate.parameters,
    min.peptides = min.peptides,
    norm.fun = deparse(substitute(norm.fun)),
    pep2prot = pep2prot,
    peptides = peptides,
    proteins = proteins,
    npep = npep
  )
  class(pdat) <- append(class(pdat), "proteusData")
  pdat$stats <- intensityStats(pdat)
  pdat$detect <- goodData(pdat)

  return(pdat)
}


#' Summary of \code{proteusData} object
#'
#' @param object \code{proteusData} object.
#' @param ... additional arguments affecting the summary produced.
#'
#' @return Text output with object summary
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' summary(prodat)
#'
#' @export
summary.proteusData <- function(object, ...) {
  cat("\n*** Basic statistics ***\n\n")
  cat(paste0("  content = ", object$content, "\n"))
  cat(paste0("  experiment type = ", object$type, "\n"))
  cat(paste0("  number of samples = ", nrow(object$metadata), "\n"))
  cat(paste0("  number of conditions = ", length(object$conditions), "\n"))
  cat(paste0("  number of ", object$content, "s = ", nrow(object$tab), "\n"))
  cat(paste0("  samples = ", paste0(object$metadata$sample, collapse = ", "), "\n"))
  cat(paste0("  conditions = ", paste0(object$conditions, collapse = ", "), "\n"))

  cat("\n*** Data processing ***\n\n")
  cat(paste0("  evidence columns used = ", paste0(object$measures, collapse = ", "), "\n"))
  cat(paste0("  sequence = '", ifelse(object$sequence.col == 'sequence', "Sequence", "Modified sequence"), "'\n"))
  cat(paste0("  protein = '", ifelse(object$protein.col == 'protein', "Leading razor protein", "Proteins"), "'\n"))
  cat(paste0("  normalization = ", object$norm.fun, "\n"))

  if(object$content == "protein") {
    cat("\n*** Protein data ***\n\n")
    cat(paste0("  quantification method = ", object$aggregate.fun, "\n"))
    if(!is.null(object$aggregate.parameters)) {
      cat(paste0("  aggregate parameters : ", object$aggregate.parameters, "\n"))
    }
    cat(paste0("  minimum number of peptides per protein = ", object$min.peptides, "\n"))
  }
}

#' Read column names from a tab-delimited text file
#'
#' @param file File name.
#'
#' @return A vector with column names.
#' @export
#'
#' @examples
#' evidenceFile <- system.file("extdata", "evidence.txt.gz", package="proteusLabelFree")
#' evidence.columns <- readColumnNames(evidenceFile)
#' evidence.columns
readColumnNames <- function(file) {
  cols <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, nrows=1)
  names(cols)
}


#' Read MaxQuant's evidence file
#'
#' \code{readEvidenceFile} reads MaxQuant's evidence file. Contaminants and
#' reverse sequences are filtered out.
#'
#' @details
#'
#' There are two parameters controlling which columns are read from the evidence
#' file. Parameter \code{measure.cols} selects columns with measurements: these
#' are intensities (label-free, TMT) or ratios (SILAC). In the simplest case of
#' label-free data, there is only one measure column: "Intensity". Parameter
#' \code{data.columns} selects all other columns read from the evidence file.
#' There are two default lists, supplied with the package, appropriate for an
#' label-free experiment, \code{measureColumns} and \code{evidenceColumns}.
#'
#'
#' @param file File name.
#' @param measure.cols Named list with measure columns to read.
#' @param data.cols Named list with other columns to read (in addition to
#'   measure columns).
#' @param zeroes.are.missing Logical. If TRUE zeroes are interpreted as missing
#'   data and replaced with NAs.
#' @return Data frame with selected columns from the evidence file.
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' evidenceFile <- system.file("extdata", "evidence.txt.gz", package="proteusLabelFree")
#' evi <- readEvidenceFile(evidenceFile)
#' }
#'
#' @export
readEvidenceFile <- function(file, measure.cols=measureColumns, data.cols=evidenceColumns, zeroes.are.missing=TRUE) {

  columns <- c(data.cols, measure.cols)
  if(anyDuplicated(names(columns))) stop("Column names must be unique.")

  # check if all required columns are in the evidence file
  evi.cols <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, as.is=TRUE, strip.white=TRUE, nrows = 1)
  missing <- NULL
  for(col in columns) {
    if(!(col %in% colnames(evi.cols))) missing <- c(missing, paste0("'", col, "'"))
  }
  if(!is.null(missing))
    stop(paste0("Column(s) ", paste0(missing, collapse=", "), " not found in file ", file))

  # read and process evidence file
  evi <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, as.is=TRUE, strip.white=TRUE)
  evi <- evi[, as.character(columns)]
  names(evi) <- names(columns)
  # sometimes there are only NAs and the condition doesn't work
  evi$reverse[is.na(evi$reverse)] <- ''
  evi$contaminant[is.na(evi$contaminant)] <- ''
  evi <- evi[which(evi$contaminant != '+' & evi$reverse != '+'),]

  # replace NaNs and infinites with NAs in measure columns
  # the same with zeroes if flag is on
  for(col in names(measure.cols)) {
    x <- evi[, col]
    x[is.nan(x) | is.infinite(x)] <- NA
    if(zeroes.are.missing) x[x == 0] <- NA
    evi[, col] <- x
  }

  # remove rows that have only NAs in measure columns
  not.empty <- which(rowSums(!is.na(evi[,names(measure.cols), drop=FALSE])) > 0)
  evi <- evi[not.empty,]
}


#' Read protein groups file from MaxQuant
#'
#'
#' \code{readProteinGroups} reads a MaxQuant's protein groups file (usually
#' named \code{proteinGroups.txt}), extracts intensity data and creates a
#' \code{proteusData} object.
#'
#' @details
#'
#' This function allows to read MaxQuant's proteinGroups file directly, without
#' reading and processing the evidence file. Apart from the proteinGroups file
#' name, a metadata data frame needs to be provided.
#'
#' @param file Input file
#' @param meta Metadata data frame. As a minimum, it should contain two columns:
#'   \code{sample} with unique sample names and \code{condition} with condition
#'   names.
#' @param measure.cols A named list with measure columns to read. These are
#'   columns in the proteinGroup files containing intensity measurements per
#'   sample. Usually they are called \code{Intensity <sample>}. If not provided,
#'   column names will be created from metadata using a template
#'   \code{paste("Intensity", meta$sample)}.
#' @param data.cols A named list with columns to read in addition to measure
#'   columns. It should contain three columns named \code{protein}, which will
#'   be used as protein ID and \code{reverse} and \code{contaminant}, which will
#'   be used to filter data. The values of this named list are the actual column
#'   names in the file (including spaces). The default structure
#'   \code{proteinColumns} is provided with the package, though it might need
#'   modifying in case these columns are named differently in the protein groups
#'   file.
#' @return A \code{proteusData} object with protein intensity data.
#'
#' @examples
#' library(proteusLabelFree)
#' proteinGroupsFile <- system.file("extdata", "proteinGroups.txt.gz", package="proteusLabelFree")
#' prot.MQ <- readProteinGroups(proteinGroupsFile, meta)
#'
#' @export
readProteinGroups <- function(file, meta, measure.cols=NULL, data.cols=proteinColumns) {
  # attempt creating default measure columns "Intensity sample"
  if(is.null(measure.cols)) {
    measure.cols <- paste("Intensity", meta$sample)
    names(measure.cols) <- meta$sample
  }

  dat <- readEvidenceFile(file, measure.cols=measure.cols, data.cols=data.cols)
  tab <- as.matrix(dat[names(measure.cols)])
  tab[tab==0] <- NA
  tab[is.nan(tab)] <- NA
  colnames(tab) <- meta$sample
  rownames(tab) <- dat$protein

  # create pdat object
  pdat <- proteusData(tab, meta, "protein", NULL, NULL, rownames(tab), measure.cols,
                      min.peptides=0, protein.aggregate.fun="MaxQuant")
  return(pdat)
}

#' Helper function, not exported
#'
#' @param ... Any parameters
#' @return A string with list of parameters and their values
parameterString <- function(...) {
  x <- sapply(list(...), deparse)
  if(length(x) > 0) {
    ap <- paste0(names(x), "=", x, collapse=";")
  } else {
    ap <- NULL
  }
  ap
}

#' Create peptide table from evidence data
#'
#' \code{makePeptideTable} computes a peptide table and related data. Peptide
#' table is a matrix with columns corresponding to conditions and rows
#' corresponding to peptide sequences.
#'
#' @details
#'
#' The evidence file contains a column called "Experiment" and one or more
#' columns with measure values. In case of label-free experiment there is only
#' one measure column: "Intensity". In case of TMT experiment there are several
#' measure columns, usually called "Reporter intensity 0", "Reporter intensity
#' 1", and so on. \code{makePeptideTable} will combine "Experiment" and measure
#' columns in a way defined by the metadata (parameter \code{meta}). The name of
#' the combined column will be named using "sample" column in metadata.
#'
#' The result is a \code{proteusData} object containing a table with rows
#' corresponding to peptides and columns corresponding to samples (as defined in
#' metadata). Each cell of the table is an aggregated measure values over all
#' evidence entries corresponding to the given sequence and experiment. How
#' these measure values are aggregated is controlled by the parameter
#' \code{aggregate.fun}.
#'
#' There are two aggregation functions provided in this package:
#' \code{\link{aggregateMedian}} and \code{\link{aggregateSum}}. Depending on
#' the needs, the user can provide any arbitrary function to perform
#' aggregation.  We recommend using \code{aggregateSum} for label-free and TMT
#' experiments and \code{aggregateMedian} for SILAC experiments.
#'
#' The aggregate function should be in form: \code{function(wp, ...)}. \code{wp}
#' is a matrix containing measurements from for a given peptide. Rows are
#' evidence file entries, columns are samples. \code{...} are additional
#' parameters for the function that are passed from \code{makePeptideTable}. The
#' aggregate function should return a vector of values for each sample. That is
#' the length of the vector should be the same as the number of columns in
#' \code{wp}. For example, the default aggregate function \code{aggregateSum}
#' calculates sums in each column of \code{wp}.
#'
#' This function, apart from aggregating evidence data into peptides, builds a
#' peptide-to-protein reference and stores is in the returned object ($pep2prot
#' field). It can be used at peptide level, to identify which protein peptides
#' belong to. It is then passed to \code{\link{makeProteinTable}} function and
#' used to aggregate peptides to proteins. Here we can decide how peptides are
#' aggregated to proteins. This is controlled by the \code{protein.col}
#' parameter, which indicates which evidence column should be used to build
#' peptide-to-protein relation. If "protein" is used (default) the
#' peptite-to-protein relation is build on leading razor proteins. If
#' "protein_group" is used, then peptide-to-protein relation is based on protein
#' groups.
#'
#' Only samples from metadata are used, regardless of the content of the
#' evidence data. This makes selection of samples for downstream processing
#' easy: select only required rows in the metadata data frame.
#'
#' @param evi Evidence table created with \code{\link{readEvidenceFile}}.
#' @param meta Data frame with metadata. As a minimum, it should contain
#'   "sample" and "condition" columns.
#' @param sequence.col A column name to identify peptides. Can be either
#'   "sequence" or "modified_sequence".
#' @param protein.col A column name to use for peptide-to-protein reference. Can
#'   be either "protein" or "protein_group".
#' @param measure.cols A named list of measure columns; should be the same as
#'   used in \code{\link{readEvidenceFile}}
#' @param aggregate.fun A function to aggregate pepetides with the same
#'   sequence/sample.
#' @param ... Additional parameters passed to the aggregate function
#' @param experiment.type Type of the experiment, "label-free", "TMT" or
#'   "SILAC".
#' @param ncores Number of cores for parallel processing
#' @return A \code{proteusData} object, containing peptide intensities and
#'   metadata.
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' pepdat <- makePeptideTable(evi, meta, ncores=2)
#' }
#'
#' @export
makePeptideTable <- function(evi, meta, sequence.col=c("sequence", "modified_sequence"),
                             protein.col=c("protein", "protein_group"), measure.cols=measureColumns,
                             aggregate.fun=aggregateSum, ..., experiment.type=c("label-free", "TMT", "SILAC"),
                             ncores = 4) {

  sequence.col <- match.arg(sequence.col)
  protein.col <- match.arg(protein.col)
  experiment.type <- match.arg(experiment.type)

  # mclapply doesn't work on Windows, so force 1 core
  if(Sys.info()[['sysname']] == "Windows") {
    ncores <- 1
    warning("Multicore processing not available in Windows. Using ncores=1")
  }

  # check if measure.cols, sequence.col and protein.col are in the evidence file
  measures <- names(measure.cols)
  for(col in c(measures, sequence.col, protein.col)) {
    if(!(col %in% names(evi))) stop(paste0("Column '", col, "' not found in evidence data."))
  }

  # dummy experiment if experiment column not present
  if(!("experiment" %in% names(evi)) && !("experiment" %in% names(meta))) {
    evi$experiment = "dummy"
    meta$experiment = "dummy"
  }

  # test metadata integrity
  for(col in c("experiment", "measure", "sample", "condition")) {
    if(!(col %in% names(meta))) stop(paste0("Column '", col, "' not found in metadata."))
  }
  if(anyDuplicated(paste0(meta$experiment, meta$measure)) > 0) stop("Experiment/measure combination in metadata must be unique.")
  if(anyDuplicated(meta$sample) > 0) stop("Sample in metadata must be unique.")
  experiments <- unique(evi$experiment)
  for(ex in meta$experiment) {
    if(!(ex %in% experiments)) stop(paste0("Experiment '", ex, "' from metadata not found in evidence."))
  }
  for(me in meta$measure) {
    if(!(me %in% measure.cols)) stop(paste0("Measure '", me, "' from metadata not found in measure.cols."))
  }

  # make sure to keep correct order of samples
  meta$sample <- factor(meta$sample, levels=meta$sample)

  # melt and recast evidence data
  #
  # NOTE: due to a potential conflict between reshape and reshape2 we do not use
  # "variable.name" parameter and default to the column name "variable". This is
  # changed manually to "measure", because we want it this way. See
  # https://github.com/hadley/reshape/issues/63 for more details.
  eviMelted <- reshape2::melt(evi, id.vars = c(sequence.col, "experiment"), measure.vars=measures)
  names(eviMelted)[1] <- "sequence"  # for simplicity, call it sequence
  names(eviMelted)[3] <- "measure" # see note above
  eviMelted$value <- as.numeric(eviMelted$value)   # integers do not work well in cast + median
  eviMelted$measure <- measure.cols[eviMelted$measure]  # recover original measurement columns

  # Add sample column to melted evidence
  # we tried using merge:
  #   eviMelted <- merge(eviMelted, meta[, c("experiment", "measure", "sample")], by=c("experiment", "measure"))
  # but it turns out it is quite slow; translation of concatenated column is significantly faster
  m2s <- setNames(meta$sample, paste0(meta$experiment, ".", meta$measure)) # named vector for translation
  eviMelted$expmes <- paste0(eviMelted$experiment, ".", eviMelted$measure)
  eviMelted <- eviMelted[which(eviMelted$expmes %in% names(m2s)),] # select only experiment.measure present in metadata
  eviMelted$sample <- m2s[eviMelted$expmes]
  eviMelted <- eviMelted[, c("sequence", "sample", "value")]

  # create unique sequence names
  eviMelted$seqsam <- paste0(eviMelted$sequence, ".", eviMelted$sample)
  eviMelted <- eviMelted[order(eviMelted$seqsam),]
  s2s <- setNames(eviMelted$sequence, eviMelted$seqsam)
  r <- rle(eviMelted$seqsam)
  eviMelted$uniseq <- paste0(rep(s2s[r$values], times=r$lengths), ".", unlist(lapply(r$lengths, seq_len)))

  # cast into table: sample vs unique sequence
  # in this table there are multiple entries per peptide
  tab <- reshape2::dcast(eviMelted, uniseq ~ sample, sum, value.var="value")

  # original sequence
  u2s <- setNames(eviMelted$sequence, eviMelted$uniseq)
  sequences <- u2s[tab$uniseq]

  # extract numeric data as matrix
  tab <- as.matrix(tab[,2:ncol(tab)])
  tab[tab == 0] <- NA

  # aggregate peptides
  ptab <- parallel::mclapply(unique(sequences), function(s) {
    wp <- tab[sequences == s,, drop=FALSE]
    x <- aggregate.fun(wp, ...)
    row <- as.data.frame(t(as.vector(x)))
    rownames(row) <- s
    return(row)
  }, mc.cores=ncores)
  ptab <- as.matrix(do.call(rbind, ptab))
  colnames(ptab) <- colnames(tab)
  peptides <- row.names(ptab)

  # peptide to protein conversion
  pep2prot <- data.frame(sequence=evi[[sequence.col]], protein=evi[[protein.col]])
  pep2prot <- unique(pep2prot)
  if(anyDuplicated(pep2prot$sequence) > 0) stop("Non-unique peptide-to-protein association found. Proteus requires that peptide sequence is uniquely associated with a protein or protein group.")
  rownames(pep2prot) <- pep2prot$sequence
  pep2prot <- pep2prot[peptides,]
  proteins <- levels(as.factor(pep2prot$protein))

  # create pdat object
  pdat <- proteusData(ptab, meta, "peptide", pep2prot, peptides, proteins,
                      as.character(measure.cols), type = experiment.type,
                      sequence.col = sequence.col, protein.col = protein.col,
                      peptide.aggregate.fun = deparse(substitute(aggregate.fun)),
                      peptide.aggregate.parameters = parameterString(...)
  )

  return(pdat)
}



#' Make protein table
#'
#' \code{makeProteinTable} creates a protein table from the peptide table.
#' Protein intensities or ratios are aggregated from peptide data. An external
#' function is used to aggregate each protein.
#'
#' @details
#'
#' There are three protein aggregation functions provided in this package:
#' \code{\link{aggregateHifly}}, \code{\link{aggregateMedian}} and
#' \code{\link{aggregateSum}}. Depending on the needs, the user can provide any
#' arbitrary function to perform aggregation.
#'
#' The aggregate function should be in form: \code{function(wp, ...)}. \code{wp}
#' is a matrix containing measurements from all peptides for the particular
#' protein. Rows are peptides, columns are samples. \code{...} are additional
#' parameters for the function that are passed from \code{makeProteinTable}. The
#' aggregate function should return a vector of values for each sample. That is
#' the length of the vector should be the same as the number of columns in
#' \code{wp}.
#'
#' @param pepdat A \code{proteusData} object containing peptide data, normally
#'   created by \code{\link{makePeptideTable}}
#' @param aggregate.fun Function to aggregate peptides into a protein. See
#'   "Details" for more details.
#' @param ... Additional parameters to pass to \code{aggregate.fun}
#' @param min.peptides Minimum number of peptides per protein
#' @param ncores Number of cores for parallel processing
#'
#' @return A \code{proteusData} object containing protein intensities and
#'   metadata.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat <- makeProteinTable(pepdat.clean, ncores=2)
#'
#' @export
makeProteinTable <- function(pepdat, aggregate.fun=aggregateHifly, ...,
                             min.peptides=2, ncores=4) {
  if(!is(pepdat, "proteusData")) stop ("Input data must be of class proteusData.")

  # mclapply doesn't work on Windows, so force 1 core
  if(Sys.info()[['sysname']] == "Windows") {
    ncores <- 1
    warning("Multicore processing not available in Windows. Using ncores=1")
  }

  meta <- pepdat$metadata
  tab <- pepdat$tab

  # make sure to keep correct order of samples
  # this should be OK in peptide table, but just in case
  meta$sample <- factor(meta$sample, levels=meta$sample)

  protlist <- list()
  for(cond in pepdat$conditions) {
    w <- tab[,which(meta$condition == cond), drop=FALSE]
    samples <- colnames(w)
    protcond <- parallel::mclapply(pepdat$proteins, function(prot) {
      sel <- which(pepdat$pep2prot$protein == prot)
      npep <- length(sel)
      if(npep >= min.peptides)
      {
        wp <- w[sel,, drop=FALSE]
        x <- aggregate.fun(wp, ...)
        row <- as.data.frame(t(as.vector(x)))
      } else {
        row <- as.data.frame(t(rep(NA, length(samples))))
      }
      names(row) <- samples
      row <- data.frame(protein=prot, row, check.names=FALSE)
      return(row)
    }, mc.cores=ncores)
    protint <- do.call(rbind, protcond)
    protlist[[cond]] <- protint
  }

  # dplyr join all tables
  protab <- Reduce(function(df1, df2) dplyr::full_join(df1,df2, by="protein"), protlist)

  # remove empty rows (happens when min.peptides > 1)
  protab <- protab[which(rowSums(!is.na(protab)) > 0), ]
  proteins <- protab$protein
  protab <- as.matrix(protab[,as.character(meta$sample)])  # just numbers

  # count peptides
  n <- as.integer(tapply(pepdat$pep2prot$protein, pepdat$pep2prot$protein, length))
  npep <- data.frame(npep=n[proteins])
  rownames(npep) <- proteins                 # a bit redundant, but might be useful

  prodat <- proteusData(protab, meta, "protein", pepdat$pep2prot, pepdat$peptides,
                        proteins, pepdat$measures,
                        type = pepdat$type,
                        sequence.col = pepdat$sequence.col,
                        min.peptides = min.peptides,
                        peptide.aggregate.fun = pepdat$peptide.aggregate.fun,
                        peptide.aggregate.parameters = pepdat$peptide.aggregate.parameters,
                        protein.aggregate.fun = deparse(substitute(aggregate.fun)),
                        protein.aggregate.parameters = parameterString(...),
                        npep = npep)
  return(prodat)
}


#' Protein aggregate function using hifly method
#'
#' Function to aggregate peptides into proteins using a hi-flyer method of Silva
#' et al. 2006 (http://www.mcponline.org/content/5/1/144.full.pdf).
#'
#' @details
#'
#' This function should be used from within \code{\link{makeProteinTable}}. The
#' "hifly" method is a follows. \enumerate{ \item For a given protein find all
#' corresponding peptides. \item In each replicate, order intensities from the
#' highest to the lowest. This is done separately for each replicate. \item
#' Select n top rows of the ordered table. \item In each replicate, find the
#' mean of these three rows. This is the estimated protein intensity. }
#'
#' @param wp Intensity table with selection of peptides for this protein
#' @param hifly Number of high-fliers
#'
#' @return A numeric vector with aggregated protein intensities
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat <- makeProteinTable(pepdat.clean, aggregate.fun=aggregateHifly, hifly=3)
#' }
#' @export
aggregateHifly <- function(wp, hifly=3) {
  npep <- nrow(wp)
  stopifnot(npep > 0)
  if(npep > 1) {
    sp <- as.matrix(apply(wp, 2, function(x) sort(x, na.last=TRUE, decreasing=TRUE)))
    ntop <- min(npep, hifly)
    row <- t(colMeans(sp[1:ntop,,drop=FALSE], na.rm=TRUE))
    row[is.nan(row)] <- NA    # colMeans puts NaNs where the column contains only NAs
  } else {
    row <- wp
  }
  return(as.vector(row))
}

#' Protein/peptide aggregate function using median
#'
#' This function should be used from within \code{\link{makePeptideTable}} or
#' \code{\link{makeProteinTable}}. It aggregates values by calculating the
#' median for each column of \code{wp}.
#'
#' @param wp Matrix with columns corresponding to samples and rows corresponding
#'   to peptide entries.
#'
#' @return A numeric vector with aggregated values
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat <- makeProteinTable(pepdat.clean, aggregate.fun=aggregateMedian)
#' }
#' @export
aggregateMedian <- function(wp) {
  row <- apply(wp, 2, function(x) median(x, na.rm=TRUE))
  return(as.vector(row))
}


#' Peptide/protein aggregate function using sum
#'
#' This function should be used from within \code{\link{makePeptideTable}} or
#' \code{\link{makeProteinTable}}. It aggregates values by calculating the sum
#' for each column of \code{wp}.
#'
#' @param wp Matrix with columns corresponding to samples and rows corresponding
#'   to peptide entries.
#'
#' @return A numeric vector with aggregated values
#'
#' @examples
#' \dontrun{
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat <- makeProteinTable(pepdat.clean, aggregate.fun=aggregateSum)
#' }
#' @export
aggregateSum <- function(wp) {
  row <- colSums(wp, na.rm=TRUE)
  row[row==0] <- NA    # colSums puts zeroes where the column contains only NAs (!!!)
  return(as.vector(row))
}



#' Annotate proteins
#'
#' \code{annotateProteins} adds protein annotations to a \code{proteusData}
#' object.
#'
#' @details
#'
#' The only information about proteins proteus package extracts from the
#' evidence file are protein identifiers. These can be in various forms,
#' depending on how MaxQuant was ran. In order to annotate them, two steps are
#' required.
#'
#' First, the user needs to create a data frame linking protein identifiers (as
#' in \code{pdat$proteins} vector) to some metadata. This data frame has to
#' contain a column called \code{protein} with the identifiers and any
#' additional columns with annotations, e.g., UniProt IDs, protein names, gene
#' names, domains, GO-terms and so on. These data can be obtained from UniProt.
#' Any annotation columns included will be used by interactive functions
#' \code{plotFID_live} and \code{plotVolcano_live}.
#'
#' See \code{\link{fetchFromUniProt}} for obtaining annotation data from
#' UniProt.
#'
#' Once the annotation table is created, it can be merged into the
#' \code{proteusData} object using \code{annotateProteins} function. The order
#' of identifiers in the annotation table is not important. Also, not all
#' proteins have to be present. The merge will add only the matching proteins.
#' As a result, this function returns a \code{proteusData} object with
#' \code{annotation} field added. It contains an annotation data frame with rows
#' corresponding to the internal list of proteins. This means the rows of this
#' data frame correspond one-to-one to the rows in \code{pdat$tab} and
#' \code{pdat$detect}, if pdat contains proteins (not peptides).
#'
#' @param pdat A \code{proteusData} object containing protein data
#' @param annotation A data frame with a column \code{protein} containing
#'   protein identifiers, as in \code{pdat$proteins}
#'
#' @return A \code{proteusData} with annotation field added.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#'
#' # annotations.id is a data frame with annotations
#' prodat <- annotateProteins(prodat, annotations.id)
#'
#' @export
annotateProteins <- function(pdat, annotation) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  if(!is.data.frame(annotation)) stop ("Annotation needs to be a data frame.")
  if(is.null(annotation$protein)) stop ("Annotation table has to have a 'protein' column.")
  if(anyDuplicated(annotation$protein) > 0) stop("protein column in annotations must contain only unique identifiers.")

  nover <- length(intersect(pdat$proteins, annotation$protein))
  if(nover == 0) stop("No overlap between data and annotation. Nothing to annotate.")

  mrg <- data.frame(protein=pdat$proteins)
  mrg <- merge(mrg, annotation, by="protein", all.x=TRUE)
  pdat$annotation <- mrg

  cat(paste("Annotated", nover, "proteins.\n"))

  return(pdat)
}


#' Normalize columns of a matrix to medians
#'
#' \code{normalizeMedian} normalizes the columns of a matrix to have the same
#' medians. It should be used with \code{\link{normalizeData}} function.
#'
#' @param tab Data frame with raw intensities. Normally, this is a \code{tab}
#'   field in the \code{proteusData} object (see examples below).
#' @return Normalized matrix.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' normtab <- normalizeMedian(prodat$tab)
#'
#' @export
normalizeMedian <- function(tab) {
  norm.fac <- apply(tab, 2, function(x) {median(x, na.rm=TRUE)})
  norm.fac <- norm.fac / mean(norm.fac, na.rm=TRUE)
  tab <- t(t(tab) / norm.fac)
  return(tab)
}


#' Normalize proteus data
#'
#' \code{normalizeData} normalizes the intensity table in a \code{proteusData}
#' object, using the provided normalizing function.
#'
#' @param pdat A \code{proteusData} object with peptide or protein intensities.
#' @param norm.fun A normalizing function.
#' @return A \code{proteusData} object with normalized intensities.
#'
#' @details The normalizing function, specified by \code{norm.fun} needs to
#'   normalize columns of a numerical matrix. The input is a matrix and the
#'   output is a normalized matrix. The default value points to
#'   \code{\link{normalizeMedian}}, Proteus's function normalizing each column
#'   to its median. Other functions can be used, for example
#'   \code{normalizeQuantiles} from the \code{limma} package.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat, norm.fun=normalizeMedian)
#'
#' @export
normalizeData <- function(pdat, norm.fun=normalizeMedian) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  if(class(norm.fun) != "function") stop ("'norm.fun' has to be a function.")

  pdat$tab <- norm.fun(pdat$tab)
  pdat$norm.fun <- deparse(substitute(norm.fun))
  pdat$stats <- intensityStats(pdat)  # need to recalculate stats!
  return(pdat)
}



#' RAS procedure for CONSTANd normalization (not exported)
#'
#' @param K Input matrix
#' @param max.iter Maximum number of iterations
#' @param eps Convergence limit
#'
#' @return Matrix normalized so row and column means equal 1/n (n - number of
#'   columns)
RAS <- function(K, max.iter=50, eps=1e-5) {
  n <- ncol(K)
  m <- nrow(K)

  # ignore rows with only NAs
  good.rows <- which(rowSums(!is.na(K)) > 0)
  K <- K[good.rows, ]

  cnt <- 1
  repeat {
    row.mult <- 1 / (n * rowMeans(K, na.rm=TRUE))
    K <- K * row.mult
    err1 <- 0.5 * sum(abs(colMeans(K, na.rm=TRUE) - 1/n))
    col.mult <- 1 / (n * colMeans(K, na.rm=TRUE))
    K <- t(t(K) * col.mult)
    err2 <- 0.5 * sum(abs(rowMeans(K, na.rm=TRUE) - 1/n))
    cnt <- cnt + 1
    if(cnt > max.iter || (err1 < eps && err2 < eps)) break
  }

  # reconstruct full table
  KF <- matrix(NA, nrow=m, ncol=n)
  KF[good.rows, ] <- K

  return(KF)
}

#' Normalize protein/peptide data using CONSTANd normalization
#'
#' @details
#'
#' \code{normalizeTMT} implements CONSTANd algorithm from Maes et al. (2016)
#' \url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4974351/pdf/zjw2779.pdf}.
#' It normalizes TMT table, for each experiment separately. After normalization
#' each row shows the percentage of total row intensity and each column is
#' normalized to their mean value. Hence, both row and column means are equal
#' 1/n, where n is the number of columns (reporters).
#'
#' @param pdat A \code{proteusData} object with peptide or protein intensities.
#' @param max.iter Maximum number of iterations for the RAS procedure.
#' @param eps Convergence limit for the RAS procedure.
#'
#' @return A \code{proteusData} with normalized data.
#'
#' @examples
#' library(proteusTMT)
#' data(proteusTMT)
#' prodat.norm <- normalizeTMT(prodat)
#'
#' @export
normalizeTMT <- function(pdat, max.iter=50, eps=1e-5) {
  for(ex in levels(as.factor(pdat$meta$experiment))) {
    colsel <- which(pdat$metadata$experiment == ex)
    pdat$tab[, colsel] <- RAS(pdat$tab[, colsel])
    pdat$norm.fun <- "CONSTANd"
    pdat$stats <- intensityStats(pdat)  # need to recalculate stats!

  }
  return(pdat)
}

#' Plot distance matrix
#'
#' \code{plotDistanceMatrix} plots a distance matrix for peptide or protein
#' data.
#'
#' Computes a distance matrix and plots it as a shade-plot. The default distance
#' is Pearson's correlation coefficient. Other methods will be introduced later.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @param distance A method to calculate distance.
#' @param text.size Text size on axes
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotDistanceMatrix(pepdat)
#'
#' @export
plotDistanceMatrix <- function(pdat, distance=c("correlation"), text.size=10) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  distance <- match.arg(distance)

  corr.mat <- cor(pdat$tab, use="complete.obs")
  m <- reshape2::melt(corr.mat, varnames=c("Sample1", "Sample2"))
  m$Sample1 <- factor(m$Sample1, levels=pdat$metadata$sample)
  m$Sample2 <- factor(m$Sample2, levels=pdat$metadata$sample)
  ggplot(m, aes_(x=~Sample1, y=~Sample2)) +
    geom_tile(aes_(fill=~value)) +
    viridis::scale_fill_viridis() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5, size=text.size),
      axis.text.y = element_text(size=text.size)
    ) +
    labs(x='Sample', y='Sample', fill="Correlation")
}


#' Plot clustering dendrogram
#'
#' \code{plotClustering} plots a dendrogram of intensity data, using
#' hierarchical clustering.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @param x.text.size Size of text on the x-axis (sample names).
#' @return Creates a plot
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotClustering(pepdat)
#'
#' @export
plotClustering <- function(pdat, x.text.size=10) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  corr.mat <- cor(pdat$tab, use="complete.obs")
  dis <- as.dist(1 - corr.mat)  # dissimilarity matrix
  hc <- hclust(dis)
  dendr <- ggdendro::dendro_data(hc)
  dat <- ggdendro::segment(dendr)
  theme.d <- ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    panel.background = ggplot2::element_blank(),
    axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust=0.5, size=x.text.size),
    axis.line.x = ggplot2::element_blank(),
    axis.ticks.x = ggplot2::element_blank()
  )
  ggplot() +
    theme.d +
    geom_segment(data=dat, aes_(x=~x, y=~y, xend=~xend, yend=~yend)) +
    scale_x_continuous(breaks = seq_along(dendr$labels$label), labels = dendr$labels$label) +
    scale_y_continuous(expand = c(0,0), limits = c(0, max(dat$y) * 1.03)) +
    labs(x="Sample", y="Distance")
}


#' Plot PCA
#'
#' \code{plotPCA} creates a principal analysis plot using first two components.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @param with.legend Logical, whether to plot a legend.
#' @param point.size Size of the point in the plot.
#' @param text.size Size of the axis text.
#' @param label.size Size of the text labels identifying sapmles.
#' @param legend.text.size Size of the legend text.
#' @param palette Palette of colours
#'
#' @return Creates a plot
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotPCA(pepdat)
#' @export
plotPCA <- function(pdat, with.legend=TRUE, point.size=1.2, text.size=10,
                    label.size=3, legend.text.size=7, palette=cbPalette) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  pca <- prcomp(t(na.omit(log10(pdat$tab))), scale.=TRUE, center=TRUE)

  p <- data.frame(
    x = pca$x[, 1],
    y = pca$x[, 2],
    condition = pdat$metadata$condition,
    sample = pdat$metadata$sample
  )
  var.perc <- 100 * (pca$sdev)^2 / sum((pca$sdev)^2)
  pca1 <- sprintf("PCA1 (%5.1f%%)", var.perc[1])
  pca2 <- sprintf("PCA2 (%5.1f%%)", var.perc[2])

  g <- ggplot(p, aes_(x=~x, y=~y, label=~sample)) +
    simple_theme +
    theme(
      legend.title = element_text(size = legend.text.size),
      legend.text = element_text(size = legend.text.size)
    ) +
    coord_cartesian(expand=TRUE) +
    geom_point(aes_(colour=~condition), size=point.size) +
    ggrepel::geom_text_repel(size=label.size) +
    scale_color_manual(values=palette) +
    theme(text = element_text(size=text.size)) +
    labs(x=pca1, y=pca2)
  if(!with.legend) g <- g + theme(legend.position="none")
  g
}



#' Plot peptide or protein count per sample
#'
#' \code{plotCount} makes a plot of peptide/protein count per sample.
#'
#' @param pdat A \code{proteusData} object.
#' @param x.text.size Size of text on the x-axis (sample names).
#' @param palette Palette of colours
#' @return A plot (\code{ggplot} object) of the number of peptides/proteins
#'   detected in each sample.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotCount(pepdat)
#'
#' @export
plotCount <- function(pdat, x.text.size=10, palette=cbPalette){
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  meta <- pdat$metadata
  entry.count <- apply(pdat$tab, 2, function(x) sum(!is.na(x)))
  med.count <- median(entry.count)
  df <- data.frame(x=meta$sample, y=entry.count, condition=meta$condition)
  g <- ggplot(df, aes_(x=~x,y=~y,fill=~condition)) +
    geom_col(colour='grey60') +
    simple_theme +
    scale_y_continuous(expand = c(0,0)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5, size=x.text.size)) +
    labs(x="Sample", y="Count") +
    labs(title = paste0("Median count = ", med.count)) +
    theme(plot.title=element_text(hjust=0, size=12))
  if(nlevels(as.factor(df$condition)) <= length(palette)) g <- g + scale_fill_manual(values=palette)
  #g <- g + viridis::scale_fill_viridis(discrete=TRUE)
  g
}

#' Jaccard similarity
#'
#' Computes Jaccard similarity between "detections" in two vectors of the same
#' length. A detection is a value, as opposed to NA.
#'
#' @param x A vector
#' @param y A vector
#'
#' @return Jaccard similarity between x and y
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' sim <- jaccardSimilarity(pepdat$tab[,1], pepdat$tab[,2])
#'
#' @export
#'
jaccardSimilarity <- function(x, y) {
  stopifnot(length(x) == length(y))

  intersection <- which(!is.na(x) & !is.na(y))
  union <- which(!is.na(x) | !is.na(y))
  jaccard <- ifelse(length(union) > 0, length(intersection) / length(union), 0)
  return(jaccard)
}

#' Detection Jaccard similarity
#'
#' \code{plotDetectionSimilarit} plots a distribution of (peptide) detection
#' similarities between samples. Jaccard similarity between two sets is defined
#' as the size of the intersection divided by the size of the union. This plot
#' can be used to assess quality of data.
#'
#' @param pdat A \code{proteusData} object with peptides (or proteins).
#' @param text.size Text size.
#' @param plot.grid Logical to plot grid.
#' @param bin.size Bin size for the histogram.
#' @param hist.colour Colour of the histogram.
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotDetectionSimilarity(pepdat)
#'
#'
#' @export
#'
plotDetectionSimilarity <- function(pdat, bin.size=0.01, text.size=12, plot.grid=TRUE, hist.colour='grey30') {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")

  n <- ncol(pdat$tab)
  # indices of upper triangular: all pair-wise combinations of columns in tab
  pairs <- which(upper.tri(matrix(1, n, n)) == TRUE, arr.ind=TRUE)
  # Jaccard simliarity for each pair of columns in tab
  sim <- apply(pairs, 1, function(i) jaccardSimilarity(pdat$tab[,i[1]], pdat$tab[,i[2]]))

  ggplot(data.frame(sim), aes_(~sim, ~..density..)) +
  {if(plot.grid) simple_theme_grid else simple_theme} +
    geom_histogram(breaks=seq(0, 1, bin.size), colour=hist.colour, fill=hist.colour) +
    labs(x='Jaccard similarity', y='Density') +
    theme(text = element_text(size=text.size))
}


#' Plot distribution of intensities/ratios for each sample
#'
#' \code{plotSampleDistributions} makes a boxplot with intensity/ratio
#' distribution for each sample.
#'
#' @param pdat A \code{proteusData} object with peptide/protein intensities.
#' @param title Title of the plot.
#' @param method "box", "violin" or "dist"
#' @param palette Palette of colours
#' @param x.text.size Text size in value axis
#' @param x.text.angle Text angle in value axis
#' @param vmin Lower bound on log value
#' @param vmax Upper bound on log value
#' @param n.grid.rows Number of rows in the grid of facets
#' @param hist.bins Number of bins in histograms
#' @param log.scale Logical, to plot in logarithmic scale
#' @param log.base Base of the logarithm which will be applied to data
#' @param fill A metadata column to use for the fill of boxes
#' @param colour A metadata column to use for the outline colour of boxes
#' @param hline Logical, if true a horizontal line at zero is added
#'
#' @return A \code{ggplot} object.
#'
#' @export
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotSampleDistributions(prodat)
#' plotSampleDistributions(normalizeData(prodat))
#'
plotSampleDistributions <-
function(pdat, title="", method=c("violin", "dist", "box"), x.text.size=7, n.grid.rows=3,
         hist.bins=100, x.text.angle=90, vmin=as.numeric(NA), vmax=as.numeric(NA), log.scale=TRUE,
         log.base=10, palette=cbPalette, fill=NULL, colour=NULL, hline=FALSE) {
  method <- match.arg(method)

  m <- reshape2::melt(pdat$tab, varnames=c("ID", "sample"))
  mt <- data.frame(pdat$metadata, row.names = pdat$metadata$sample)
  if(!is.null(fill)) m[['fill']] <- as.factor(mt[m$sample, fill])
  if(!is.null(colour)) m[['colour']] <- as.factor(mt[m$sample, colour])

  if(log.scale > 0) m$value <- log(m$value, base=log.base)
  lg <- ifelse(log.scale, paste("log", log.base), "")

  if(method == "box" | method == "violin") {
    g <- ggplot(m, aes_(x=~sample, y=~value)) +
      simple_theme +
      ylim(vmin, vmax) +
      theme(axis.text.x = element_text(angle = x.text.angle, hjust = 1, vjust=0.5, size=x.text.size)) +
      labs(title=title, x="sample", y=paste0(lg, " value"))
    if(hline) g <- g + geom_hline(yintercept=0, colour='grey')
    if(method=="box") g <- g + geom_boxplot(outlier.shape=NA, na.rm=TRUE)
    if(method=="violin") g <- g + geom_violin(na.rm=TRUE, draw_quantiles=c(0.25, 0.5, 0.75), scale="width")
  } else if (method == "dist") {
    g <- ggplot(m, aes_(x=~value)) +
      geom_histogram(bins=hist.bins) +
      facet_wrap(~sample, nrow=n.grid.rows) +
      xlim(vmin, vmax)
  } else stop("Wrong method.")

  if(!is.null(fill)) {
    g <- g + aes_(fill=~fill)
    if(nlevels(m$fill) <= length(palette)) g <- g + scale_fill_manual(name=fill, values=palette)
  }
  if(!is.null(colour)) {
    g <- g + aes_(colour=~colour)
    if(nlevels(m$colour) <= length(palette)) g <- g + scale_color_manual(name=colour, values=palette)
  }
  g
}


#' Statistics for an intensity table
#'
#' \code{intensityStats} calculates the mean, variance and number of good data
#' points for peptide or protein intensities.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @return A data frame with several statistics.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' stats <- intensityStats(prodat)
#'
#' @export
intensityStats <- function(pdat) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")

  meta <- pdat$metadata

  stats <- NULL
  for(cond in pdat$conditions) {
    w <- pdat$tab[,which(meta$condition == cond), drop=FALSE]
    m <- rowMeans(w, na.rm=TRUE)
    m[which(is.nan(m))] <- NA
    v <- apply(w, 1, function(v) sd(v, na.rm=TRUE)^2)
    ngood <- apply(w, 1, function(v) sum(!is.na(v)))
    stats <- rbind(stats, data.frame(id=rownames(w), condition=cond, mean=m, variance=v, ngood=ngood))
    rownames(stats) <- NULL
  }
  return(stats)
}


#' Create a table indicating good data.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#'
#' @return A data frame with rows corresponding to peptides/proteins and columns
#'   corresponding to conditions. Logical values indicate if at least one
#'   replicate is available (TRUE) or if all replicates are missing (FALSE).
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' detect <- goodData(prodat)
#'
#' @export
goodData <- function(pdat) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")

  meta <- pdat$metadata
  G <- lapply(pdat$conditions, function(cond){
    w <- pdat$tab[,which(meta$condition == cond), drop=FALSE]
    rowSums(!is.na(w)) > 0
  })
  good <- as.data.frame(do.call(cbind, G))
  names(good) <- pdat$conditions
  return(good)
}


#' Plot mean-variance relationship
#'
#' \code{plotMV} makes a plot with variance of log-intensity vs mean of
#' log-intensity.
#'
#' @param pdat Peptide or protein \code{proteusData} object.
#' @param with.loess Logical. If true, a loess line will be added.
#' @param bins Number of bins for binhex
#' @param xmin Lower limit on x-axis
#' @param xmax Upper limit on x-axis
#' @param ymin Lower limit on y-axis
#' @param ymax Upper limit on y-axis
#' @param with.n Logical to add a text with the number of proteins to the plot
#' @param mid.gradient A parameter to control the midpoint of colour gradient
#'   (between 0 and 1)
#' @param text.size Text size on axes
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' plotMV(prodat, with.loess=TRUE)
#'
#' @export
plotMV <- function(pdat, with.loess=FALSE, bins=80, xmin=5, xmax=10, ymin=7, ymax=20, with.n=FALSE, mid.gradient=0.3, text.size=10) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  meta <- pdat$metadata
  if(is.null(meta)) stop("No metadata found.")

  stats <- pdat$stats
  if(is.null(stats)) stats <- intensityStats(pdat)
  stats <- stats[which(!is.na(stats$mean) & !is.na(stats$variance) & stats$mean > 0 & stats$variance > 0),]
  stats$mean <- log10(stats$mean)
  stats$variance <- log10(stats$variance)
  protnum <- as.data.frame(table(stats$condition))  # number of proteins in each condition
  colnames(protnum) <- c("condition", "n")
  protnum$labn <- paste0("n = ", protnum$n)

  # has to be calculated for each condition separately
  if(with.loess) {
    ldf <- NULL
    for(cond in pdat$conditions)
    {
      st <- stats[which(stats$condition == cond),]
      ls <- loess(variance ~ mean, data=st)
      x <- seq(from=min(na.omit(st$mean)), to=max(na.omit(st$mean)), by=0.01)
      pr <- predict(ls, x)
      ldf <- rbind(ldf, data.frame(condition=cond, x=x, y=pr))
    }
  }

  g <- ggplot(stats, aes_(x=~mean, y=~variance)) +
    simple_theme +
    theme(panel.border = element_rect(fill=NA, color='black')) +
    xlim(xmin, xmax) +
    ylim(ymin, ymax) +
    facet_wrap(~condition) +
    stat_binhex(bins=bins) +
    theme(text = element_text(size=text.size)) +
    viridis::scale_fill_viridis(values=c(0, mid.gradient, 1), name="count", na.value=NA) +
    #scale_fill_gradientn(colours=c("seagreen","yellow", "red"), values=c(0, mid.gradient, 1), name="count", na.value=NA)
  if(with.n) g <- g + geom_text(data=protnum, aes_(x=~xmin+0.5, y=~ymax, label=~labn))
  if(with.loess) g <- g + geom_line(data=ldf, aes_(x=~x,y=~y), color='black')
  return(g)
}

#' Plot protein/peptide intensities
#'
#' \code{plotIntensities} makes a plot with peptide/protein intensity as a
#' function of the condition and replicate. When multiple IDs are entered, the
#' mean and standard error is plotted.
#'
#' @param pdat A \code{proteusData} object.
#' @param id Protein name, peptide sequence or a vector with these.
#' @param log Logical. If set TRUE a logarithm of intensity is plotted.
#' @param ymin Lower bound for y-axis
#' @param ymax Upper bound for y-axis
#' @param text.size Text size
#' @param point.size Point size
#' @param title Title of the plot (defaults to protein name)
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' plotIntensities(prodat.med, id='sp|P26263|PDC6_YEAST', log=TRUE)
#'
#' @export
plotIntensities <- function(pdat, id=NULL, log=FALSE, ymin=as.numeric(NA), ymax=as.numeric(NA),
                         text.size=12, point.size=3, title=NULL) {
  # without 'as.numeric' it returns logical NA (!!!)

  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  if(is.null(id)) stop ("Need peptide/protein id.")

  if(pdat$content == "peptide") {
    selcol <- pdat$peptides
  } else if (pdat$content == "protein") {
    selcol <- pdat$proteins
  } else {
    stop("Unrecognized content in pdat object.")
  }
  sel <- which(selcol %in% id)

  if(length(sel) > 0 && sel > 0) {
    E <- if(log) log10(pdat$tab[sel,,drop=FALSE]) else pdat$tab[sel,,drop=FALSE]
    e <- colMeans(E, na.rm=TRUE)
    s <- sapply(E, function(x) sd(x, na.rm=TRUE)/sqrt(length(x)))
    n <- length(sel)

    if(is.null(title)) {
      title <- ifelse(n == 1, id, paste0("selection of ", n, " ", pdat$content, "s."))
    }

    meta <- pdat$metadata
    p <- data.frame(
      expr = e,
      lo = e - s,
      up = e + s,
      condition = factor(meta$condition, levels=unique(as.factor(meta$condition))),
      replicates = factor(meta$replicate)
    )
    # define shapes
    p$shape <- rep(21, length(p$expr))
    p$shape[which(p$expr==0)] <- 24
    pd <- position_dodge(width = 0.15)
    #
    if(is.na(ymin) && !log) ymin=0
    ylab <- ifelse(pdat$type == "SILAC", "Ratio", "Intensity")
    if(log) ylab <- paste("log10", ylab)
    ggplot(p, aes_(x=~condition, y=~expr, ymin=~lo, ymax=~up, fill=~replicates, shape=~shape)) +
      simple_theme_grid +
      theme(text = element_text(size=text.size), legend.position = "none") +
      ylim(ymin, ymax) +
      {if(n > 1) geom_errorbar(position=pd, width = 0.1)} +
      geom_point(position=pd, size=point.size) +
      scale_shape_identity() +  # necessary for shape mapping
      viridis::scale_fill_viridis(discrete=TRUE) +
      labs(x = 'Condition', y = ylab, title=title)
  }
}

#' Simple differential expression with limma
#'
#' \code{limmaDE} is a wrapper around \code{\link{limma}} to perform a
#' differential expression between a pair of conditions.
#'
#' @details
#'
#' Before \code{limma} is called, intensity data are transformed using the
#' \code{transform.fun} function. The default for this transformation is
#' \code{log2}. Therefore, by default, the column "logFC" in the output data
#' frame contains log2 fold change. If you need log10-based fold change, you can
#' use \code{transform.fun=log10}.
#'
#' \code{limmaDE} is only a simple wrapper around \code{\link{limma}}, to
#' perform differential expression between two conditions. For more complicated
#' designs we recommend using \code{\link{limma}} functions directly.
#'
#'
#' @param pdat Protein \code{proteusData} object.
#' @param formula A string with a formula for building the linear model.
#' @param conditions A character vector with two conditions for differential
#'   expression. Can be omitted if there are only two condition in \code{pdat}.
#' @param transform.fun A function to transform data before differential
#'   expression.
#' @param sig.level Significance level for rejecting the null hypothesis.
#' @return A data frame with DE results. "logFC" column is a log-fold-change
#'   (using the \code{transform.fun}). Two columns with mean log-intensity
#'   (again, using \code{transform.fun}) and two columns with the number of good
#'   replicates (per condition) are added. Attributes contain additional
#'   information about the transformation function, significance level, formula
#'   and conditions.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' res <- limmaDE(prodat.med)
#'
#' @export
limmaDE <- function(pdat, formula="~condition", conditions=NULL, transform.fun=log2, sig.level=0.05) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")

  meta <- pdat$metadata
  tab <- transform.fun(pdat$tab)

  # default conditions
  if(!is.null(conditions)) {
    for(cond in conditions ) {
      if(!(cond %in% meta$condition)) stop(paste("Condition", cond, "not found in metadata."))
    }
    sel <- which(meta$condition %in% conditions)
    meta <- droplevels(meta[sel,])
    tab <- tab[,sel]
  } else {
    conditions <- levels(meta$condition)
  }

  if(length(conditions) != 2) stop("This function requires exactly two conditions. Use the parameter conditions.")

  # limma analysis
  design <- model.matrix(as.formula(formula), meta)
  fit <- limma::lmFit(tab, design)
  ebay <- limma::eBayes(fit)
  coef <- colnames(ebay$design)[2]
  res <- limma::topTable(ebay, coef=coef, adjust="BH", sort.by="none", number=1e9)
  res <- cbind(rownames(res), res)
  colnames(res)[1] <- pdat$content
  res$significant <- res$adj.P.Val <= sig.level
  rownames(res) <- c()

  # add columns with mean intensity
  for(cond in conditions) {
    cname <- paste0("mean_", cond)
    m <- rowMeans(tab[,which(meta$condition == cond), drop=FALSE], na.rm=TRUE)
    m[which(is.nan(m))] <- NA
    res[, cname] <- m
  }

  # add columns with number of good replicates
  ngood <- reshape2::dcast(pdat$stats, id ~ condition, fun.aggregate=sum, value.var="ngood")
  ngood <- ngood[, c("id", conditions)]
  names(ngood)[2:ncol(ngood)] <- paste0("ngood_", names(ngood)[2:ncol(ngood)])
  res <- merge(res, ngood, by.x=pdat$content, by.y="id")

  # add annotations
  if(!is.null(pdat$annotation)) {
    res <- merge(res, pdat$annotation, by=pdat$content, all.x=TRUE)
  }

  attr(res, "transform.fun") <- deparse(substitute(transform.fun))
  attr(res, "sig.level") <- sig.level
  attr(res, "formula") <- formula
  attr(res, "conditions") <- levels(meta$condition)

  return(res)
}


#' Simple one-sample differential expression with limma
#'
#' \code{limmaDE} is a wrapper around \code{\link{limma}} to perform a
#' differential expression for one condition against the null hypothesis of one.
#' This is designed to be used with SILAC ratios.
#'
#' @details
#'
#' Performs differential expression for one condition, assuming the measured
#' values are ratios and the null hypothesis is the ratio of 1. The analysis is
#' done in logarithmic space, so the actual null hypothesis tested is log ratio
#' = 0. The log fold change returned is the log mean ratio of the condition.
#'
#' Before \code{limma} is called, intensity data are transformed using the
#' \code{transform.fun} function. The default for this transformation is
#' \code{log2}. Therefore, by default, the column "logFC" in the output data
#' frame contains log2 fold change. If you need log10-based fold change, you can
#' use \code{transform.fun=log10}.
#'
#' \code{limmaDE} is only a simple wrapper around \code{\link{limma}}, to
#' perform differential expression between two conditions. For more complicated
#' designs we recommend using \code{\link{limma}} functions directly.
#'
#'
#' @param pdat Protein \code{proteusData} object.
#' @param condition Condition name.
#' @param transform.fun A function to transform data before differential
#'   expression.
#' @param sig.level Significance level for rejecting the null hypothesis.
#' @return A data frame with DE results. "logFC" column is a log-fold-change
#'   (using the \code{transform.fun}). Attributes contain
#'   additional information about the transformation function, significance
#'   level and condition name.
#'
#' @examples
#' \dontrun{
#' library(proteusSILAC)
#' data(proteusSILAC)
#' prodat.norm <- normalizeData(prodat)
#' res <- limmaRatioDE(prodat.norm, condition="T48")
#' }
#' @export
limmaRatioDE <- function(pdat, condition=NULL, transform.fun=log2, sig.level=0.05) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")

  meta <- pdat$metadata
  tab <- transform.fun(pdat$tab)

  # check condition
  if(is.null(condition)) stop("Need a condition.")
  if(length(condition) != 1) stop("Need exactly one condition.")

  # select condition
  sel <- which(meta$condition == condition)
  if(length(sel) < 2) stop("Need at least two replicates for one-sample differential expression.")
  tab <- tab[, sel]

  # limma analysis
  design <- cbind(Intercept = rep(1, ncol(tab)))
  fit <- limma::lmFit(tab, design)
  ebay <- limma::eBayes(fit)
  res <- limma::topTable(ebay, adjust="BH", sort.by="none", number=1e9)
  res <- cbind(rownames(res), res)
  colnames(res)[1] <- pdat$content
  res$significant <- res$adj.P.Val <= sig.level
  rownames(res) <- c()

  # add column with number of good replicates
  ngood <- pdat$stats[pdat$stats$condition == condition, c("id", "ngood")]
  res <- merge(res, ngood, by.x=pdat$content, by.y="id")

  # add annotations
  if(!is.null(pdat$annotation)) {
    res <- merge(res, pdat$annotation, by=pdat$content, all.x=TRUE)
  }

  attr(res, "transform.fun") <- deparse(substitute(transform.fun))
  attr(res, "sig.level") <- sig.level
  attr(res, "conditions") <- condition

  return(res)
}


#' Fold-change intensity diagram
#'
#' \code{plotFID} makes a log10 fold change versus log10 sum intensity plot,
#' usually known as MA plot.
#'
#' @param pdat Protein \code{proteusData} object.
#' @param pair A two-element vector containing the pair of conditions to use.
#'   Can be skipped if there are only two conditions.
#' @param bins Number of bins for binhex.
#' @param marginal.histograms A logical to add marginal histograms.
#' @param xmin Lower limit on x-axis.
#' @param xmax Upper limit on x-axis.
#' @param ymax Upper limit on y-axis. If used, the lower limit is -ymax.
#' @param text.size Text size.
#' @param point.size Size of points in the plot.
#' @param show.legend Logical to show legend (colour key).
#' @param plot.grid Logical to plot a grid.
#' @param binhex Logical. If TRUE, a hexagonal density plot is made, otherwise it
#'   is a simple point plot.
#' @param transform.fun A function to transform data before plotting.
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' plotFID(prodat.med)
#'
#' @export
plotFID <- function(pdat, pair=NULL, bins=80, marginal.histograms=FALSE,
                   xmin=NULL, xmax=NULL, ymax=NULL, text.size=12, point.size=1,
                   show.legend=TRUE, plot.grid=TRUE, binhex=TRUE, transform.fun=log10) {
  if(!is(pdat, "proteusData")) stop ("Input data must be of class proteusData.")
  if(binhex & marginal.histograms) {
    warning("Cannot plot with both binhex=TRUE and marginal.histograms=TRUE. Ignoring binhex.")
    binhex=FALSE
  }

  if(is.null(pair)) pair <- pdat$conditions
  if(length(pair) != 2) stop("Need exactly two conditions. You might need to specify pair parameter.")
  title <- paste(pair, collapse=":")

  tab <- transform.fun(pdat$tab)

  # helper function
  condMeans <- function(cond) {
    m <- rowMeans(tab[,which(pdat$metadata$condition == cond), drop=FALSE], na.rm=TRUE)
    m[is.nan(m)] <- NA
    m
  }

  # build data frame with x-y cooordinates
  # including infinities
  m1 <- condMeans(pair[1])
  m2 <- condMeans(pair[2])
  good <- !is.na(m1) & !is.na(m2)
  df <- data.frame(
    id = rownames(pdat$tab),
    x = (m1 + m2) / 2,
    y = m2 - m1,
    good = good
  )

  mx <- 1.1 * max(abs(df$y), na.rm=TRUE)
  m <- rbind(m1[!good], m2[!good])
  df[!good, "x"] <- colSums(m, na.rm=TRUE)
  df[!good, "y"] <- ifelse(is.na(m[1,]), mx, -mx)

  g <- ggplot(df[good, ], aes_(x=~x, y=~y))

  if(binhex) {
    g <- g + stat_binhex(bins=bins, show.legend=show.legend, na.rm=TRUE) +
      viridis::scale_fill_viridis(name="count", na.value=NA)
      #scale_fill_gradientn(colours=c("seagreen","yellow", "red"), name = "count",na.value=NA)
  } else {
    g <- g + geom_point(size=point.size, na.rm=TRUE) +
      geom_point(data=df[!good,], aes_(x=~x, y=~y), colour="orange", size=point.size, na.rm=TRUE)
  }

  if(plot.grid) {
    g <- g + simple_theme_grid
  } else {
    g <- g + simple_theme
  }

  g <- g + geom_abline(colour='red', slope=0, intercept=0) +
    labs(title=title, x=paste0(pair[1], '+', pair[2]), y=paste0(pair[2], '-', pair[1])) +
    theme(text = element_text(size=text.size))

  if(!is.null(xmin) && !is.null(xmax)) g <- g + scale_x_continuous(limits = c(xmin, xmax), expand = c(0, 0))
  if(!is.null(ymax) ) g <- g + scale_y_continuous(limits = c(-ymax, ymax), expand = c(0, 0))
  if(marginal.histograms) g <- ggExtra::ggMarginal(g, size=10, type = "histogram", xparams=list(bins=100), yparams=list(bins=50))
  return(g)
}

#' Plot p-value distribution
#'
#' \code{plotPdist} makes a plot with distribution of raw p-values, obtained by
#' \code{\link{limmaDE}}.
#'
#' @param res Output table from \code{\link{limmaDE}}.
#' @param text.size Text size.
#' @param plot.grid Logical to plot grid.
#' @param bin.size Bin size for the histogram.
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' res <- limmaDE(prodat.med)
#' plotPdist(res)
#'
#' @export
plotPdist <- function(res, bin.size=0.02, text.size=12, plot.grid=TRUE) {
  ggplot(res, aes_(~P.Value, ~..density..)) +
    {if(plot.grid) simple_theme_grid else simple_theme} +
    geom_histogram(breaks=seq(0, 1, bin.size), colour='blue', na.rm=TRUE) +
    labs(x='P-value', y='Density') +
    theme(text = element_text(size=text.size))
}

#'Volcano plot
#'
#'\code{plotVolcano} makes a volcano plot from limma results. Uses
#'\code{\link{stat_binhex}} function from ggplot2 to make a hexagonal heatmap.
#'
#'@param res Result table from  \code{\link{limmaDE}}.
#'@param bins Number of bins for binhex.
#'@param xmax Upper limit on x-axis. If used, the lower limit is -xmax.
#'@param ymax Upper limit on y-axis. If used, the lower limit is -ymax.
#'@param marginal.histograms A logical to add marginal histograms.
#'@param text.size Text size.
#'@param show.legend Logical to show legend (colour key).
#'@param plot.grid Logical to plot grid.
#'@param binhex Logical. If TRUE, a hexagonal density plot is made, otherwise it
#'  is a simple point plot.
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' res <- limmaDE(prodat.med)
#' plotVolcano(res)
#'
#'@export
plotVolcano <- function(res, bins=80, xmax=NULL, ymax=NULL, marginal.histograms=FALSE, text.size=12, show.legend=TRUE,
                        plot.grid=TRUE, binhex=TRUE) {
  if(binhex & marginal.histograms) {
    warning("Cannot plot with both binhex=TRUE and marginal.histograms=TRUE. Ignoring binhex.")
    binhex=FALSE
  }

  tr <- attr(res, "transform.fun")
  conds <- attr(res, "conditions")
  xlab <- ifelse(is.null(tr), "FC", paste(tr, "FC"))
  tit <- paste(conds, collapse=":")
  id <- names(res)[1]

  g <- ggplot(res, aes_(~logFC, ~-log10(P.Value)))

  if(binhex) {
    g <- g + stat_binhex(bins=bins, show.legend=show.legend, na.rm=TRUE) +
      viridis::scale_fill_viridis(name="count", na.value=NA)
      #scale_fill_gradientn(colours=c("seagreen","yellow", "red"), name = "count", na.value=NA)
  } else {
    g <- g + geom_point(na.rm=TRUE)
  }

  if(plot.grid) {
    g <- g + simple_theme_grid
  } else {
    g <- g + simple_theme
  }

  g <- g + geom_vline(colour='red', xintercept=0) +
    theme(text = element_text(size=text.size)) +
    labs(x=xlab, y="-log10 P", title=tit)


  if(!is.null(xmax)) g <- g + scale_x_continuous(limits = c(-xmax, xmax), expand = c(0, 0))
  if(!is.null(ymax) ) g <- g + scale_y_continuous(limits = c(0, ymax), expand = c(0, 0))

  if(marginal.histograms) g <- ggExtra::ggMarginal(g, size=10, type = "histogram", xparams=list(bins=100), yparams=list(bins=50))
  return(g)
}


#' Protein-peptide plot
#'
#' \code{plotProtPeptides} creates a plot consisting of two panels. The top
#' panel shows peptide log intensity. Each box represents one peptide, peptide
#' numbering follows alphabetical sequence order. The bottom panel shows sample
#' intensity. Each box represents one sample. White boxes show derived protein
#' intensities (if \code{prodat} is provided).
#'
#' @param pepdat Peptide \code{proteusData} object.
#' @param protein Protein name.
#' @param prodat (optional) protein \code{proteusData} object.
#' @param palette Palette of colours
#'
#' @return A \code{ggplot} object.
#'
#' @examples
#' library(proteusLabelFree)
#' data(proteusLabelFree)
#' prodat.med <- normalizeData(prodat)
#' plotProtPeptides(pepdat.clean, 'sp|P26263|PDC6_YEAST', prodat.med)
#'
#' @export
plotProtPeptides <- function(pepdat, protein, prodat=NULL, palette=cbPalette) {
  if(!is(pepdat, "proteusData")) stop ("Input data must be of class proteusData.")
  tab <- normalizeMedian(pepdat$tab)

  # all peptides for this protein
  selprot <- which(pepdat$pep2prot$protein == protein)
  if(length(selprot) == 0) stop(paste0("Protein '", protein, "' not found."))

  # need as.character, or indexing by factor is wrong!
  peps <- as.character(pepdat$pep2prot[selprot,'sequence'])
  mat <- as.matrix(tab[peps,])
  dat <- reshape2::melt(mat, varnames=c("peptide", "sample"))
  dat$sample <- factor(dat$sample, levels=pepdat$metadata$sample)
  dat$pepnum <- sprintf("%02d", as.numeric(dat$peptide))  # convert sequences into numbers
  dat$intensity <- log10(dat$value)

  # add condition column
  s2c <- setNames(pepdat$metadata$condition, pepdat$metadata$sample)
  dat$condition <- s2c[dat$sample]

  # add protein intensity column
  if(!is.null(prodat)) {
    p2p <- setNames(as.numeric(prodat$tab[protein,]), colnames(prodat$tab))
    dat$prot.intensity <- log10(p2p[dat$sample])
  }

  g1 <- ggplot(dat, aes_(x=~pepnum, y=~intensity, fill=~condition)) +
    scale_fill_manual(values=palette) +
    #viridis::scale_fill_viridis(discrete=TRUE) +
    geom_boxplot(outlier.shape = NA)  +
    geom_jitter(width=0, size=0.5) +
    facet_wrap(~condition) +
    theme(legend.position="none") +
    labs(x="Peptide", y="log intensity", title=protein)
  g2 <- ggplot(dat, aes_(x=~sample, y=~intensity, fill=~condition)) +
    scale_fill_manual(values=palette) +
    #viridis::scale_fill_viridis(discrete=TRUE) +
    geom_boxplot(outlier.shape = NA)  +
    geom_jitter(width=0, size=0.5) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5)) +
    theme(legend.position="none") +
    labs(x="Sample", y="log intensity")
  if(!is.null(prodat)) g2 <- g2 + geom_point(aes_(x=~sample, y=~prot.intensity), shape=22, size=3, fill='white')
  gridExtra::grid.arrange(g1, g2, ncol=1)
}