R/plot_functions_frequencies.R

Defines functions plot_cond plot_cond_overlap plot_cond_freq plot_coverage plot_frequency plot_numbers

Documented in plot_cond plot_cond_freq plot_cond_overlap plot_coverage plot_frequency plot_numbers

#' Plot protein numbers
#'
#' \code{plot_numbers} generates a barplot
#' of the number of identified proteins per sample.
#'
#' @param se SummarizedExperiment,
#' Data object for which to plot protein numbers
#' (output from \code{\link{make_se}()} or \code{\link{make_se_parse}()}).
#' @param plot Logical(1),
#' If \code{TRUE} (default) the barplot is produced.
#' Otherwise (if \code{FALSE}), the data which the
#' barplot is based on are returned.
#' @return Barplot of the number of identified proteins per sample
#' (generated by \code{\link[ggplot2]{ggplot}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter and plot numbers
#' filt <- filter_missval(se, thr = 0)
#' plot_numbers(filt)
#' @export
plot_numbers <- function(se, plot = TRUE) {
  # Show error if input is not the required classes
  assertthat::assert_that(inherits(se, "SummarizedExperiment"),
    is.logical(plot),
    length(plot) == 1)

  # Make a binary long data.frame (1 = valid value, 0 = missing value)
  df <- assay(se) %>%
    data.frame() %>%
    rownames_to_column() %>%
    gather(ID, bin, -rowname) %>%
    mutate(bin = ifelse(is.na(bin), 0, 1))
  # Summarize the number of proteins identified
  # per sample and generate a barplot
  stat <- df %>%
    group_by(ID) %>%
    summarize(n = n(), sum = sum(bin)) %>%
    left_join(., data.frame(colData(se)), by = "ID")
  p <- ggplot(stat, aes(x = ID, y = sum, fill = condition)) +
    geom_col() +
    geom_hline(yintercept = unique(stat$n)) +
    labs(title = "Proteins per sample", x = "",
         y = "Number of proteins") +
    theme_DEP2()
  if(plot) {
    return(p)
  } else {
    df <- as.data.frame(stat)
    colnames(df)[seq_len(3)] <- c("sample", "total_proteins", "proteins_in_sample")
    return(df)
  }
}

#' Plot protein overlap between samples
#'
#' \code{plot_frequency} generates a barplot
#' of the protein overlap between samples
#'
#' @param se SummarizedExperiment,
#' Data object for which to plot observation frequency.
#' @param plot Logical(1),
#' If \code{TRUE} (default) the barplot is produced.
#' Otherwise (if \code{FALSE}), the data which the
#' barplot is based on are returned.
#' @return Barplot of overlap of protein identifications
#' between samples (generated by \code{\link[ggplot2]{ggplot}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter and plot frequency
#' filt <- filter_missval(se, thr = 0)
#' plot_frequency(filt)
#' @export
plot_frequency <- function(se, plot = TRUE) {
  # Show error if input is not the required classes
  assertthat::assert_that(inherits(se, "SummarizedExperiment"),
    is.logical(plot),
    length(plot) == 1)

  # Make a binary long data.frame (1 = valid value, 0 = missing value)
  df <- assay(se) %>%
    data.frame() %>%
    rownames_to_column() %>%
    gather(ID, bin, -rowname) %>%
    mutate(bin = ifelse(is.na(bin), 0, 1))
  # Identify the number of experiments a protein was observed
  stat <- df %>%
    group_by(rowname) %>%
    summarize(sum = sum(bin))
  # Get the frequency of the number of experiments proteins
  # were observerd and plot these numbers
  table <- table(stat$sum) %>% data.frame()
  p <- ggplot(table, aes(x = Var1, y = Freq, fill = Var1)) +
    geom_col() +
    scale_fill_grey(start = 0.8, end = 0.2) +
    labs(title = "Protein identifications overlap",
         x = "Identified in number of samples",
         y = "Number of proteins") +
    theme_DEP2() +
    theme(legend.position="none")
  if(plot) {
    return(p)
  } else {
    df <- as.data.frame(table)
    colnames(df) <- c("samples", "proteins")
    return(df)
  }
}

#' Plot protein coverage
#'
#' \code{plot_coverage} generates a barplot
#' of the protein coverage in all samples.
#'
#' @param se SummarizedExperiment,
#' Data object for which to plot observation frequency.
#' @param plot Logical(1),
#' If \code{TRUE} (default) the barplot is produced.
#' Otherwise (if \code{FALSE}), the data which the
#' barplot is based on are returned.
#' @return Barplot of protein coverage in samples
#' (generated by \code{\link[ggplot2]{ggplot}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter and plot coverage
#' filt <- filter_missval(se, thr = 0)
#' plot_coverage(filt)
#' @export
plot_coverage <- function(se, plot = TRUE) {
  # Show error if input is not the required classes
  assertthat::assert_that(inherits(se, "SummarizedExperiment"),
    is.logical(plot),
    length(plot) == 1)

  # Make a binary long data.frame (1 = valid value, 0 = missing value)
  df <- assay(se) %>%
    data.frame() %>%
    rownames_to_column() %>%
    gather(ID, bin, -rowname) %>%
    mutate(bin = ifelse(is.na(bin), 0, 1))
  # Identify the number of experiments a protein was observed
  stat <- df %>%
    group_by(rowname) %>%
    summarize(sum = sum(bin))
  # Get the frequency of the number of experiments proteins were observerd
  # and plot the cumulative sum of these numbers
  table <- table(stat$sum) %>%
    data.frame()
  p <- ggplot(table, aes(x = "all", y = Freq, fill = Var1)) +
    geom_col(col = "white") +
    scale_fill_grey(start = 0.8, end = 0.2) +
    labs(title = "Protein coverage",
      x = "",
      y = "Number of proteins",
      fill = "Samples") +
    theme_DEP1()
  if(plot) {
    return(p)
  } else {
    df <- as.data.frame(table)
    colnames(df) <- c("samples", "proteins")
    return(df)
  }
}

#' Plot frequency of significant conditions per protein
#'
#' \code{plot_cond_freq} generates a histogram of the number of significant conditions per protein.
#'
#' @param dep SummarizedExperiment,
#' Data object for which differentially enriched proteins are annotated
#' (output from \code{\link{test_diff}()} and \code{\link{add_rejections}()}).
#' @param plot Logical(1),
#' If \code{TRUE} (default) the histogram is produced.
#' Otherwise (if \code{FALSE}), the data which the
#' histogram is based on are returned.
#' @return A histogram (generated by \code{\link[ggplot2]{ggplot}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter, normalize and impute missing values
#' filt <- filter_missval(se, thr = 0)
#' norm <- normalize_vsn(filt)
#' imputed <- impute(norm, fun = "MinProb", q = 0.01)
#'
#' # Test for differentially expressed proteins
#' diff <- test_diff(imputed, "control", "Ctrl")
#' dep <- add_rejections(diff, alpha = 0.05, lfc = 1)
#'
#' # Plot frequency of significant conditions
#' plot_cond_freq(dep)
#' @export
plot_cond_freq <- function(dep, plot = TRUE) {
  # Show error if inputs are not the required classes
  assertthat::assert_that(inherits(dep, "SummarizedExperiment"),
    is.logical(plot),
    length(plot) == 1)

  # Check for significance columns
  row_data <- rowData(dep, use.names = FALSE)
  if(length(grep("_significant", colnames(row_data))) < 1) {
    stop("'[contrast]_significant' columns are not present in '",
         deparse(substitute(dep)),
         "'\nRun add_rejections() to obtain the required columns",
         call. = FALSE)
  }

  # Check for significant column
  if(!"significant" %in% colnames(row_data)) {
    stop("'significant' column is not present in '",
         deparse(substitute(dep)),
         "'\nRun add_rejections() to obtain the required column",
         call. = FALSE)
  }

  # Filter for significant proteins
  significant <- dep[row_data$significant, ]

  # Get significant columns and count significant conditions per protein
  row_data <- rowData(significant, use.names = FALSE) %>% data.frame()
  cols <- grep("_significant", colnames(row_data))
  df <- row_data %>%
    select(name, ID, cols) %>%
    gather(condition, significant, -c(name, ID)) %>%
    mutate(val = ifelse(significant, 1, 0))
  stat <- df %>%
    group_by(name) %>%
    summarize(sum = sum(val), n = n())
  # Get a counts table
  table <- table(stat$sum) %>% data.frame()

  # Plot the count table as a bar graph
  p <- ggplot(table, aes(x = Var1, y = Freq, fill = Var1)) +
    geom_col() +
    labs(x = "Number of significant conditions",
         y = "Number of proteins",
         title = "Frequency of significant conditions per protein") +
    DEP::theme_DEP1() +
    theme(legend.position = "none")
  if(plot) {
    return(p)
  } else {
    df <- as.data.frame(table)
    colnames(df) <- c("conditions", "proteins")
    return(df)
  }
}

#' Plot conditions overlap
#'
#' \code{plot_cond_overlap} generates a histogram of
#' the number of proteins per condition or overlapping conditions.
#'
#' @param dep SummarizedExperiment,
#' Data object for which differentially enriched proteins are annotated
#' (output from \code{\link{test_diff}()} and \code{\link{add_rejections}()}).
#' @param plot Logical(1),
#' If \code{TRUE} (default) the barplot is produced.
#' Otherwise (if \code{FALSE}), the data which the
#' barplot is based on are returned.
#' @return A histogram (generated by \code{\link[ggplot2]{ggplot}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter, normalize and impute missing values
#' filt <- filter_missval(se, thr = 0)
#' norm <- normalize_vsn(filt)
#' imputed <- impute(norm, fun = "MinProb", q = 0.01)
#'
#' # Test for differentially expressed proteins
#' diff <- test_diff(imputed, "control", "Ctrl")
#' dep <- add_rejections(diff, alpha = 0.05, lfc = 1)
#'
#' # Plot condition overlap
#' plot_cond_overlap(dep)
#' @export
plot_cond_overlap <- function(dep, plot = TRUE) {
  # Show error if inputs are not the required classes
  assertthat::assert_that(inherits(dep, "SummarizedExperiment"),
    is.logical(plot),
    length(plot) == 1)

  # Check for significance columns
  row_data <- rowData(dep, use.names = FALSE)
  if(length(grep("_significant", colnames(row_data))) < 1) {
    stop("'[contrast]_significant' columns are not present in '",
         deparse(substitute(dep)),
         "'\nRun add_rejections() to obtain the required columns",
         call. = FALSE)
  }

  # Check for significant column
  if(!"significant" %in% colnames(row_data)) {
    stop("'significant' column is not present in '",
         deparse(substitute(dep)),
         "'\nRun add_rejections() to obtain the required column",
         call. = FALSE)
  }

  # Filter for significant proteins
  significant <- dep[row_data$significant, ]

  # Get significant columns
  row_data <- rowData(significant, use.names = FALSE) %>% data.frame()
  cols <- grep("_significant", colnames(row_data))
  colnames(row_data)[cols] <- gsub("_significant", "", colnames(row_data)[cols])

  # Rename column names
  row_data_renamed <- row_data
  colnames(row_data_renamed)[cols] <- LETTERS[seq(to = length(cols))]
  legend <- data.frame(symbol = colnames(row_data_renamed)[cols],
    contrast = colnames(row_data)[cols])

  # Get co-occurence matrix
  df <- select(row_data_renamed, name, ID, cols)
  counts <- table(df[,3:(length(cols)+2)]) %>%
    as.data.frame() %>%
    filter(Freq > 0)

  # Parse condition names
  mat <- counts[,seq_len(ncol(counts)-1)] == TRUE
  counts$conditions <-
    apply(mat, 1, function(x) {
      paste0(colnames(counts[,seq_len(ncol(counts)-1)])[x], collapse = " ")
    })

  # Sort on number of conditions
  counts$n_con <- apply(mat, 1, function(x) length(which(x)))
  counts <- counts %>%
    arrange(n_con, conditions)
  counts$conditions <- parse_factor(counts$conditions, levels = counts$conditions)

  if(nrow(counts) <= 40 ) {
    labelsize = 12
  } else {
    labelsize = 12 / (nrow(counts) / 40)
  }

  # Plot conditions overlap histogram
  p1 <- ggplot(counts, aes(x = conditions, y = Freq)) +
    geom_col(fill = "black") +
    labs(title = "Overlap between conditions",
         x = "Conditions",
         y = "Number of Proteins") +
    theme_DEP2() +
    theme(axis.text=element_text(size = labelsize))

  # Legend table
  ttheme <- gridExtra::ttheme_minimal(
    core=list(fg_params=list(hjust=0, x=0.1)))
  p2 <- gridExtra::tableGrob(legend, theme = ttheme, rows = NULL, cols = NULL)
  if(!plot) {
    df <- counts %>%
      select(conditions, Freq, n_con)
    colnames(df) <- c("conditions", "proteins", "number_of_conditions")
    list <- list(counts = df, legend = legend)
    return(list)
  } else {
    gridExtra::grid.arrange(p1, p2, ncol = 2, widths = c(0.8, 0.2))
  }
}

#' Plot frequency of significant conditions per protein
#' and the overlap in proteins between conditions
#'
#' \code{plot_cond} generates a histogram of
#' the number of proteins per condition and stacks for overlapping conditions.
#'
#' @param dep SummarizedExperiment,
#' Data object for which differentially enriched proteins are annotated
#' (output from \code{\link{test_diff}()} and \code{\link{add_rejections}()}).
#' @param plot Logical(1),
#' If \code{TRUE} (default) the barplot is produced.
#' Otherwise (if \code{FALSE}), the data which the
#' barplot is based on are returned.
#' @return A histogram (generated by \code{\link[ggplot2]{ggplot}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter, normalize and impute missing values
#' filt <- filter_missval(se, thr = 0)
#' norm <- normalize_vsn(filt)
#' imputed <- impute(norm, fun = "MinProb", q = 0.01)
#'
#' # Test for differentially expressed proteins
#' diff <- test_diff(imputed, "control", "Ctrl")
#' dep <- add_rejections(diff, alpha = 0.05, lfc = 1)
#'
#' # Plot histogram with overlaps
#' plot_cond(dep)
#' @export
plot_cond <- function(dep, plot = TRUE) {
  # Show error if inputs are not the required classes
  assertthat::assert_that(inherits(dep, "SummarizedExperiment"),
    is.logical(plot),
    length(plot) == 1)

  # Check for significance columns
  row_data <- rowData(dep, use.names = FALSE)
  if(length(grep("_significant", colnames(row_data))) < 1) {
    stop("'[contrast]_significant' columns are not present in '",
         deparse(substitute(dep)),
         "'\nRun add_rejections() to obtain the required columns",
         call. = FALSE)
  }

  # Check for significant column
  if(!"significant" %in% colnames(row_data)) {
    stop("'significant' column is not present in '",
         deparse(substitute(dep)),
         "'\nRun add_rejections() to obtain the required column",
         call. = FALSE)
  }

  # Filter for significant proteins
  significant <- dep[row_data$significant, ]

  # Get significant columns
  row_data <- rowData(significant, use.names = FALSE) %>% data.frame()
  cols <- grep("_significant", colnames(row_data))
  colnames(row_data)[cols] <- gsub("_significant", "", colnames(row_data)[cols])

  # Rename column names
  row_data_renamed <- row_data
  colnames(row_data_renamed)[cols] <- LETTERS[seq(to = length(cols))]
  legend <- data.frame(symbol = colnames(row_data_renamed)[cols], names = colnames(row_data)[cols])

  # Get co-occurence matrix
  df <- select(row_data_renamed, name, ID, cols)
  counts <- table(df[,3:(length(cols)+2)]) %>%
    as.data.frame() %>%
    filter(Freq > 0)

  # Parse condition names
  mat <- counts[,seq_len(ncol(counts)-1)] == TRUE
  counts$conditions <-
    apply(mat, 1, function(x) {
      paste0(colnames(counts[,seq_len(ncol(counts)-1)])[x], collapse = " ")
    })

  # Sort on number of conditions
  counts$n_con <- apply(mat, 1, function(x) length(which(x)))
  counts <- counts %>%
    arrange(n_con, conditions)
  counts$conditions <- parse_factor(counts$conditions, levels = counts$conditions)

  counts <- mutate(counts, ID = paste0(conditions, ": ", Freq))

  if(nrow(counts) <= 40 ) {
    labelsize = 12
  } else {
    labelsize = 12 / (nrow(counts) / 40)
  }

  # Plot conditions overlap histogram
  p1 <- ggplot(counts, aes(x = n_con, y = Freq, fill = conditions)) +
    geom_col() +
    geom_text(aes(label = ID), size = 3, hjust = 0.5,
              position = position_stack(vjust = 0.5)) +
    labs(x = "Number of significant conditions",
         y = "Number of proteins",
         title = "Overlap between conditions") +
    DEP::theme_DEP1()

  # Legend table
  ttheme <- gridExtra::ttheme_minimal(
    core=list(fg_params=list(hjust=0, x=0.1)))
  p2 <- gridExtra::tableGrob(legend, theme = ttheme, rows = NULL, cols = NULL)
  if(!plot) {
    df <- counts %>%
      select(conditions, Freq, n_con)
    colnames(df) <- c("conditions", "proteins", "number_of_conditions")
    list <- list(counts = df, legend = legend)
    return(list)
  } else{
    gridExtra::grid.arrange(p1, p2, ncol = 2, widths = c(0.8, 0.2))
  }
}

Try the DEP package in your browser

Any scripts or data that you put into this service are public.

DEP documentation built on Nov. 8, 2020, 7:49 p.m.