R/sct.R

Defines functions pivot_sct build_sct plot_sct plot_sct_by_sample

Documented in build_sct pivot_sct plot_sct plot_sct_by_sample

#
# sct
#
# sct stands for sample-condition table. Briefly, it collects information from an ExpressionSet
# into a set of samples with specific conditions, together with gene expression. This is really
# only useful for a subset of genes. Given the gene subset, you can then plot the genes by sample
# and by condition.





#' Generate sct plots by sample as a list.
#'
#' Given a sample-condition table, generate a list of plots by sample. That is,
#' every distinct value of `sample_field` yields one ggplot showing the gene
#' expression of a subset of genes, with points distinguished by condition
#' (as given by `condition_field`).
#'
#' sct stands for sample-condition table. Briefly, it collects information from an ExpressionSet
#' into a set of samples with specific conditions, together with gene expression. This is really
#' only useful for a subset of genes. Given the gene subset, you can then plot the genes by sample
#' and by condition.
#'
#' @param sct The sample-condition table (tibble/data.frame). It must contain
#'   the column named by `sample_field`; the remaining requirements are those
#'   of [plot_sct()], which draws each individual plot.
#' @param sample_field The string naming the sample column. The table is split
#'   on this column, one plot per group.
#' @param condition_field The string naming the condition column. Passed
#'   through unchanged to [plot_sct()].
#' @param main Optional title applied to every plot. If `NULL` (the default), a
#'   title naming the sample is generated for each plot.
#'
#' @return A list of ggplots, one per group of `sample_field`.
#' @export
#'
#'
plot_sct_by_sample <- function(sct, sample_field = NULL, condition_field = NULL,
                               main = NULL) {
  assertthat::assert_that(methods::is(sct, "data.frame"),
                          assertthat::not_empty(sample_field),
                          assertthat::has_name(sct, sample_field))

  # A plain data.frame is converted to a tibble. Its rownames are preserved in
  # a "Sample" column, unless such a column already exists (in which case
  # adding a second one would clash with it).
  if (!tibble::is_tibble(sct)) {
    if ("Sample" %in% colnames(sct)) {
      sct <- tibble::as_tibble(sct)
    } else {
      sct <- tibble::as_tibble(sct, rownames = "Sample")
    }
  }

  # One sub-table per sample.
  per_sample <- dplyr::group_split(sct, .data[[sample_field]])

  # Draw each sub-table with plot_sct().
  purrr::map(per_sample, function(tab) {
    title <- main
    if (is.null(title)) {
      # The group was formed on sample_field, so that column (not the
      # rowname-derived "Sample" column) identifies the group.
      sample_id <- as.character(unique(tab[[sample_field]]))
      title <- sprintf("Gene expression for %s across conditions.", sample_id)
    }
    plot_sct(tab, sample_field, condition_field, main = title)
  })
}




#' Plot a sample-condition table
#'
#' Plot the gene expression data within a sample-condition table, indicating both
#' the samples and conditions in the plot.
#'
#' sct stands for sample-condition table. Briefly, it collects information from an ExpressionSet
#' into a set of samples with specific conditions, together with gene expression. This is really
#' only useful for a subset of genes. Given the gene subset, you can then plot the genes by sample
#' and by condition.
#'
#' The samples are indicated by color and the conditions are indicated by shape. NOTE:
#' if the condition is not specified, then just the samples are plotted (i.e., no conditions).
#' When the condition column has more than 6 distinct values, shapes are not
#' used at all: every point is drawn with shape 21 and filled by sample.
#'
#' Also note that the samples and conditions have to be factors. This will be done by default
#' for you. However, if you want a particular ordering of the levels then convert the column
#' to a factor before calling the function.
#'
#' If every `Expression` value is one of the integers 1 to 10 (e.g. gene
#' ranks), the y axis gets one break per integer from 1 to 10.
#'
#' NOTE: If you want a list, use plot_sct_by_sample()
#'
#' @param sct A sample-condition table. Must be a data.frame with columns
#'   `Gene`, `Expression` and the column named by `sample_field` (and by
#'   `condition_field`, when given).
#' @param sample_field The string name of the sample field
#' @param condition_field The string name of the condition field. If `NULL`,
#'   the sample field is used for the condition as well.
#' @param main a title for the plot
#'
#' @return A ggplot showing the gene expression for a set of samples and conditions.
#' @export
#'
#' @importFrom rlang := !!
plot_sct <- function(sct, sample_field = NULL, condition_field = NULL, main = NULL) {
  assertthat::assert_that(methods::is(sct, "data.frame"),
                          assertthat::not_empty(sample_field),
                          assertthat::has_name(sct, "Gene"),
                          assertthat::has_name(sct, "Expression"),
                          assertthat::has_name(sct, sample_field))

  # This allows a plot without a condition, by replacing condition with sample.
  if (is.null(condition_field)) {
    condition_field <- sample_field
  }

  # Fail early, with a clear message, when the condition column is absent.
  assertthat::assert_that(assertthat::has_name(sct, condition_field))

  # Convert the two fields to factors, using the default factor ordering.
  # Columns that are already factors keep their level order.
  sct <- dplyr::mutate(sct,
                       !!sample_field := factor(.data[[sample_field]]),
                       !!condition_field := factor(.data[[condition_field]]))

  # Base plot: genes on x, expression on y.
  g <- ggplot2::ggplot(sct, ggplot2::aes(x = .data$Gene,
                                         y = .data$Expression))

  # Shape is mapped to the condition, so the limit applies to the number of
  # distinct conditions (not samples).
  n_conditions <- length(unique(sct[[condition_field]]))

  if (n_conditions > 6) {
    # Too many conditions for distinct shapes: skip shapes altogether and
    # distinguish samples by fill only.
    g <- g + ggplot2::geom_jitter(height = 0, width = 0.2, shape = 21,
                                  mapping = ggplot2::aes(fill = .data[[sample_field]]))
  } else {
    # Map shape to the condition and color to the sample.
    g <- g +
      ggplot2::geom_jitter(height = 0, width = 0.2,
                           mapping = ggplot2::aes(shape = .data[[condition_field]],
                                                  color = .data[[sample_field]]))
  }

  # If we are plotting gene ranks (all values in 1..10), put a break at every
  # integer. The ggplot2:: prefix is required: ggplot2 is only ever used by
  # qualified name in this file, so the bare function may not be on the
  # search path.
  if (all(sct$Expression %in% 1:10)) {
    g <- g + ggplot2::scale_y_continuous(breaks = 1:10)
  }

  # Finish the plot
  g +
    ggplot2::theme_bw() +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, size = 8, hjust = 1)) +
    ggplot2::ggtitle(main) +
    ggsci::scale_color_npg()

}


#' Build an SCT from an ExpressionSet
#'
#' This creates a sample-condition table from an ExpressionSet with pData.
#'
#' An SCT is a set of gene expressions indicating specific samples and associated with
#' a specific condition. This structure allows plotting of individual gene expression by
#' sample and by condition.
#'
#' sct stands for sample-condition table. Briefly, it collects information from an ExpressionSet
#' into a set of samples with specific conditions, together with gene expression. This is really
#' only useful for a subset of genes. Given the gene subset, you can then plot the genes by sample
#' and by condition.
#'
#' This function takes an ExpressionSet, extracts two fields from the pData (sample_field
#' and condition_field) and builds the necessary structure for plotting.
#'
#' @param x An ExpressionSet (possibly reduced gene list)
#' @param sample_field A pdata field representing the unique samples (repeated entries).
#' @param condition_field A pdata field representing the condition to plot. If
#'   `NULL`, the sample field is reused as the condition.
#'
#' @return A sct (sample-condition table), as produced by [pivot_sct()].
#' @export
#'
build_sct <- function(x, sample_field = NULL, condition_field = NULL) {
  assertthat::assert_that(methods::is(x, "ExpressionSet"),
                          assertthat::not_empty(sample_field),
                          assertthat::has_name(Biobase::pData(x), sample_field))

  # If condition field is empty, just reuse sample field
  if (is.null(condition_field)) {
    condition_field <- sample_field
  }

  # Fail early, with a clear message, when the condition column is absent.
  assertthat::assert_that(assertthat::has_name(Biobase::pData(x), condition_field))

  # unique() avoids selecting the same pData column twice when the condition
  # field is the sample field; drop = FALSE keeps a data.frame even when only
  # one column is selected.
  meta_fields <- unique(c(sample_field, condition_field))
  pheno <- Biobase::pData(x)[, meta_fields, drop = FALSE]

  # Combine gene expression (transposed so that samples are rows) and the
  # pData fields into a single entity.
  sct <- cbind(t(Biobase::exprs(x)), pheno)

  # Then pivot the data to usable form.
  pivot_sct(sct, sample_field, condition_field)
}


#' Internal function to pivot a SCT
#'
#' Takes a wide table with one row per sample (gene columns plus the sample
#' and condition columns) and returns a long table with one row per
#' gene/sample pair. Sample identifiers are taken from the rownames of `sct`.
#'
#' @param sct The un-pivoted sample-condition table. Every column other than
#'   `sample_field` and `condition_field` is treated as a gene.
#' @param sample_field Name of the sample field
#' @param condition_field Name of the condition field
#'
#' @return A pivoted sct suitable for graphing: a tibble with columns `Gene`,
#'   `Sample`, `Expression`, followed by the sample and condition fields.
#' @export
#'
#' @importFrom magrittr %>%
pivot_sct <- function(sct, sample_field, condition_field) {
  assertthat::assert_that(methods::is(sct, "data.frame"),
                          assertthat::not_empty(sample_field)
  )

  # NOTE: the @importFrom magrittr tag above is retained even though this
  # function does not pipe, so the package namespace keeps importing the pipe.

  # The metadata columns; unique() because the condition field may be the
  # same column as the sample field.
  meta_fields <- unique(c(sample_field, condition_field))
  assertthat::assert_that(
    all(meta_fields %in% colnames(sct)),
    msg = sprintf("sct is missing column(s): %s",
                  paste(setdiff(meta_fields, colnames(sct)), collapse = ", "))
  )

  # Create gene table first.
  # Exclude metadata columns. drop = FALSE is essential: with a plain
  # data.frame and a single gene, the default would collapse the result to a
  # bare vector and lose both the gene name and the sample rownames.
  is_gene <- !(colnames(sct) %in% meta_fields)
  gene_table <- sct[, is_gene, drop = FALSE]
  # transpose (genes become rows, samples become columns) and convert to tibble
  gene_table <- tibble::as_tibble(t(gene_table), rownames = "Gene")

  # Pivot the expression data to longer format: every column except Gene is
  # a sample.
  psct <- tidyr::pivot_longer(gene_table,
                              cols = -tidyselect::any_of("Gene"),
                              names_to = "Sample", values_to = "Expression")

  # Add in the sample/condition information by joining on original data
  # (rownames exposed as the Sample column) ...
  joined <- dplyr::left_join(psct, tibble::as_tibble(sct, rownames = "Sample"),
                             by = c("Sample" = "Sample"))

  # ... and selecting only the prespecified columns.
  keep <- unique(c("Gene", "Sample", "Expression", meta_fields))
  dplyr::select(joined, tidyselect::all_of(keep))

}
steveneschrich/CLIApet documentation built on March 2, 2021, 12:02 a.m.