rnaseqUtils: Utility Functions for RNASeq Data Analysis

Documented in examine_dataset get_normalized_expression_matrix

#' Generate several plots and objects for dataset exploration
#'
#' This function examines several facets of the dataset, including filtering
#' cells based on metadata, dimension reduction, cluster analysis, and
#' visualization
#'
#' @param cell_dataframe: cell_dataframe consists of sample names of each cell
#'   to be examined, as well as all other data that might be used for
#'   visualizing the resulting cells. This data.frame must have a character
#'   column named 'sample_name', which must match the row names of
#'   normalized_expression_matrix
#'
#' @param normalized_expression_matrix: this must be a CxG matrix, with C cells
#'   and G genes, The C cells are the rownamews of the matrix and must match the
#'   sample_name column from cell_dataframe. The column names must be ENSEMBL
#'   IDs. The entries of the matrix must be the normalized since they will be
#'   used in tsne
#'
#' @param .species: currently either 'mmusculus' or 'hsapiens'
#'
#' @param color_var: variable name to color cells by in overview ggplot images.
#'   This must correspond to a column name in cell_data_dataframe.
#'
#' @return This function returns a list with the following elements: \itemize{
#'   \item \strong{top_1000_mat}: a matrix of the top 1000 overdispersed genes
#'   \item \strong{tsne_results}: a list of Rtsne outputs from the filtered
#'   matrix, each from a differnet perplexity (15,30,45,60) \item
#'   \strong{n50_enriched_genes_by_cluster}: a dataframe showing the top 50
#'   enriched genes in each cluster. Enrichment is based on proportion of cells
#'   expressing a gene \item \strong{updated_metadata_df}: the original
#'   cell_dataframe annotated with cluster_id \item
#'   \strong{clusters_scatterplot}: a scatterplot coloring the cells by
#'   cluster_id\item \strong{important_genes_heatmap}: a heatmap plot showing
#'   the enriched genes in each cluster \item \strong{overview_plots}: a list of
#'   plots corresponding to the tsne runs that are coloredc by @param color_by }

#' @export
examine_dataset <- function(cell_data_dataframe = data.frame(), normalized_expression_matrix = matrix(), .species = 'mmusculus', color_var = character())  {
  results_list = list()
  normalized_expression_matrix <- normalized_expression_matrix[cell_data_dataframe$sample_name,]
  message('Removing non-expressed genes from subsetted data.')
  normalized_expression_matrix <- normalized_expression_matrix[,colSums(normalized_expression_matrix) > 0]

  top_1000_dispersed_mat <- get_dispersion(normalized_expression_matrix)

  #invalid_cols <- sum(colSums(top_1000_dispersed_mat) == 0)
  #if(length(invalid_cols) >= 0)  {
  #  warning("Although 10")
  #}
  valid_top_1000_dispersed_mat <- top_1000_dispersed_mat[,colSums(top_1000_dispersed_mat) > 0]


  results_list[["top_1000_mat"]] <- top_1000_dispersed_mat
  valid_top_1000_dispersed_prcomp <- prcomp(valid_top_1000_dispersed_mat,
                                            scale. = TRUE)
  valid_top_1000_dispersed_rtsne_list <- lapply(c(15, 30, 45, 60),
                                                function(.perplexity) {
                                                  Rtsne::Rtsne(valid_top_1000_dispersed_prcomp$x[, 1:50],
                                                               perplexity = .perplexity, pca = FALSE)
                                                })
  results_list[["tsne_results"]] <- valid_top_1000_dispersed_rtsne_list
  tree_splits_list <- get_treecuts(valid_top_1000_dispersed_mat)
  sample_name_cluster_id_df <- get_sample_name_to_cluster_id_df(tree_splits_list[[2]][[3]],
                                                                normalized_expression_matrix)
  n50_enriched_genes_by_cluster_df <- get_enriched_genes_in_each_cluster(sample_name_cluster_id_df,
                                                                         expression_mat = normalized_expression_matrix, top_n = 50,
                                                                         species = .species)
  results_list[["n50_enriched_genes_by_cluster"]] <- n50_enriched_genes_by_cluster_df
  results_list[["updated_metadata_df"]] <- dplyr::left_join(cell_data_dataframe,
                                                            sample_name_cluster_id_df)
  clusters_scatterplot <- get_clustered_scatterplot(valid_top_1000_dispersed_rtsne_list[[4]]$Y,
                                                    sample_name_cluster_id_df)
  results_list[["clusters_scatterplot"]] <- clusters_scatterplot
  prop_df <- get_prop_expressed_df(sample_to_cluster_df = sample_name_cluster_id_df,
                                   expression_mat = as.matrix(normalized_expression_matrix[,
                                                                                           unique(n50_enriched_genes_by_cluster_df$ENSEMBL)]),
                                   species = .species)

  ensembl_ids_without_genenames <- unique(prop_df$ENSEMBL[is.na(prop_df$GENENAME)])
  if(length(ensembl_ids_without_genenames) > 0)  {
    warning("Several gene IDs were identified without matching Gene names. These are: ", paste(ensembl_ids_without_genenames, collapse=" "), ". Removing them from consideration...")
    prop_df <- dplyr::filter(prop_df, ! ENSEMBL %in% ensembl_ids_without_genenames)
  }



  important_genes_heatmap_ggplot <- plot_ordered_heatmap(prop_df)
  results_list[["important_genes_heatmap"]] <- important_genes_heatmap_ggplot
  temp_visualization_list <- lapply(valid_top_1000_dispersed_rtsne_list,
                                    function(tsne_result) {
                                      overview_df <- data.frame(tsne_result$Y, sample_name = rownames(normalized_expression_matrix),
                                                                stringsAsFactors = FALSE) %>% dplyr::rename(D1 = X1,
                                                                                                            D2 = X2) %>% dplyr::left_join(., cell_data_dataframe)
                                      ggplot2::ggplot(overview_df, aes_string(x = "D1",
                                                                              y = "D2", color = color_var)) + geom_point()
                                    })
  results_list[["overview_plots"]] <- temp_visualization_list
  return(results_list)
}

#' Process cellranger output
#'
#' Read cellranger output directory, filter the data, and return normalized
#' expression matrix
#'
#' @param run_to_path_df: A dataframe with two columns (1) run_name (character): The name
#' of the cellranger run, such as patient_id, sample_date, etc...
#' (2) cellranger_path (character): The path to the toplevel cellranger output
#'
#' @param .genome: a character, typically 'mm10', or 'hg19'.
#'
#' @param umi_limits: vector of length 2, with lower and upper bounds for umi_counts.
#'
#' @return This function reads a series of cellranger output directories. It merges the
#' results together into a large matrix anbd then filters out cells with UMI
#' counts outside of the range specified in umi_limits. Mitochondrial and
#' ribosomal protein-coding genes are then removed from the matrix, as are
#' ENSEMBL IDs with no expression across the remaining cells. Finally, the data
#' is scaled using a global scaling factor (total UMIs per cell) and
#' log2-transformed.
#'
#' @export
get_normalized_expression_matrix  <- function(run_to_path_df = data.frame(), .genome = c('mm10', 'hg19'), umi_limits = c(3e3,Inf))  {
  output_list = list()
  mat_list <- lapply(1:nrow(run_to_path_df), function(.ind)  {
    #sample_name <- sub("PAN_", "", sub('_[^_]+', '', basename(.filename)))
    sample_obj <- cellrangerRkit::load_cellranger_matrix(run_to_path_df$cellranger_path[.ind], genome = .genome)
    sample_mat <- as.matrix(exprs(sample_obj))
    print(class(sample_mat))
    colnames(sample_mat) <- paste(run_to_path_df$run_name[.ind], colnames(sample_mat), sep = "_")
    return(sample_mat)
  })
  print(sapply(mat_list, class))
  giant_mat <- t(do.call(cbind, mat_list))
  giant_mat <- giant_mat[,apply(giant_mat,2,function(.col)  {sum(.col > 0) >= 1})]
  rm(mat_list)
  print(class(giant_mat))

  #Remove ribosomal and mt proteins
  if(.genome == 'mm10')  {
    giant_norpmt_mat <- remove_ribosomal_proteins_and_mitochondrial_genes_from_matrix(cell_by_ensembl_mat = giant_mat, species = 'mmusculus')
  }
  else if(.genome == 'hg19')  {
    giant_norpmt_mat <- remove_ribosomal_proteins_and_mitochondrial_genes_from_matrix(cell_by_ensembl_mat = giant_mat, species = 'hsapiens')
  }
  else  {
    stop(paste(.genome, 'not handled yet.'))
  }

  #Remove cells outside the umi count limits defined by umi_limits
  umi_counts <- rowSums(giant_norpmt_mat)
  filtered_mat <- giant_norpmt_mat[(umi_counts <= umi_limits[2]) & (umi_counts >= umi_limits[1]),]
  rm(umi_counts)

  #Remove non-expressed genes
  filtered_mat <- filtered_mat[,colSums(filtered_mat) > 0]
  filtered_mat <- log2((filtered_mat / rowSums(filtered_mat)) * median(rowSums(filtered_mat)) + 1)

  umi_counts <- rowSums(filtered_mat)
  names(umi_counts) <- rownames(filtered_mat)

  return(list(normalized_matrix = filtered_mat, umi_counts = umi_counts, normalization_scale_factor = median(umi_counts)))

}