R/seuratter.R

Defines functions determine_dimensionality compare_cluster_genes draw_cluster_gene_relations plot_cluster_relations run_seuratter

Documented in compare_cluster_genes determine_dimensionality draw_cluster_gene_relations plot_cluster_relations run_seuratter

#' Calculates The Optimum Dimensionality From An Elbow Plot.
#'
#' A function that calculates the optimum dimensionality from an elbow plot. Involves calculating
#' slopes between two consecutive points. The optimum dimensionality will be determined as the point at
#' which slopes no longer change or no visible change occurs to the slopes thereafter. In this function, a dimension
#' is determined optimal when a slope connecting that point to another is flatter than 10 times the flattest slope
#' in the plot and the stdev at that point is proximal to the lower limit.
#' The plot data must be plotted by dims (x-axis) and stdev (y-axis).
#'
#' @param plot_data An elbow plot data used for detemining dimensionality, generated by calling ElbowPlot() on a Seurat object.
#' @param cutoff The cutoff that indicates how much data/variance is covered.
#'
#' @return Returns an integer that represents the optimal dimensionality that was determined by calculation.
#'
#' @examples
#' library(Seurat)
#' determine_dimensionality(elbow_plot_data, 0.95)
#'
#' @import Seurat
#'
#' @export
determine_dimensionality <- function(plot_data, cutoff) {

  sum_so_far <- 0
  sum_stdev <- sum(plot_data$stdev)
  dimension <- 1

  # Count dimension until we reach the dimension that explains enough of the data
  # that satisfies the cutoff.
  if (cutoff == 1) {
    dimension <- max(plot_data$dims)
  } else {
    while (sum_so_far < sum_stdev * cutoff) {
      sum_so_far <- sum_so_far + plot_data$stdev[dimension]
      dimension <- dimension + 1
    }
  }
  plot_data$color <- "black"
  plot_data$color[plot_data$dims < dimension] <- "green"
  plot_data$color[dimension] <- "red"

  plot(plot_data$dims, plot_data$stdev, col = plot_data$color, pch = 19,
       main = "Elbow plot with determined dimensionality",
       sub = "Red dot with an arrow indicates the dimensionality and the green points are all data included.")
  arrows(dimension, plot_data$stdev[dimension] + 1, dimension, plot_data$stdev[dimension] + 0.5)
  return(dimension)
}


#' Compares Genes Expressed In Each Cluster.
#'
#' A function that compares and compiles all the genes identified by extract_genes_expressed() into one matrix. Does
#' this by determining the gene expressions that overlap between clusters and mapping that information into
#' the matrix.
#'
#' @param seurat_object A Seurat object that has undergone the entire workflow, involving UMAP.
#' @param min_percent A float representing the percentage of cells that have to express the gene to be
#' included in the result.
#'
#' @return Returns a matrix of number of clusters by number of clusters, with each matrix cell (x,y)
#' containing the vector of genes expressed by both clusters x and y.
#'
#' @examples
#' compare_cluster_genes(PBMC)
#'
#' @export
compare_cluster_genes <- function(seurat_object, min_percent) {

  # Preparing data for manipulation.
  clusters <- unique(seurat_object$seurat_clusters)
  num_clusters <- length(clusters)
  row_col_name <- as.character(sort(clusters)) # Used for row and column of the matrix and they have to be chars.

  # Prepare matrices to input the data.
  # First matri is to first compile all the gene expression data for each cluster, generated
  # by extract_genes_expressed.
  # Second matrix is to keep the genes that were determined to overlap between clusters.
  gene_expressions <- matrix(list(), nrow = num_clusters, ncol = 1, dimnames = list(row_col_name, "genes"))
  gene_comparison_matrix <- matrix(list(), nrow = num_clusters, ncol = num_clusters, dimnames = list(row_col_name, row_col_name))

  # Iterate through each cluster and extract genes expressed.
  for (cluster_id in row.names(gene_expressions)) {
    cluster_markers <- Seurat::FindMarkers(seurat_object, ident.1 = cluster_id, min.pct = min_percent)
    gene_expressions[cluster_id, "genes"][[1]] <- rownames(cluster_markers)
  }

  # For each pair of clusters, determine the overlapping gene expressions and store them in a matrix.
  for (cluster_id_x in row_col_name) {
    for (cluster_id_y in row_col_name) {
      if (cluster_id_x == cluster_id_y) {
        gene_comparison_matrix[cluster_id_x, cluster_id_y][[1]] <- NA
      }
      else if (is.null(gene_comparison_matrix[cluster_id_x, cluster_id_y][[1]])) {
        gene_overlaps <-
          intersect(gene_expressions[cluster_id_x, "genes"][[1]], gene_expressions[cluster_id_y, "genes"][[1]])
        gene_comparison_matrix[cluster_id_x, cluster_id_y][[1]] <- gene_overlaps
      } else {
        ;
      }
    }
  }

  return(gene_comparison_matrix)
}



#' Plot A Graph Representing Gene Expression Relationships Between Clusters
#'
#' Plot a graph representing the gene expression relationships between clusters. The edge thickness depends
#' on how many genes the clusters share between each other. The more of the same genes they express, the thicker
#' stronger their relationship is.
#'
#' @param gene_comparison_matrix A matrix object of number of cluster by number of clusters cells, with each
#' matrix cell (x,y) containing the vector of genes expressed by both clusters x and y.
#'
#' @return Returns a graph of nodes and edges, where node (number represents cluster id) represents a cluster
#' and an edge between nodes represent the genes shared by those clusters.
#'
#' @examples
#' library(igraph)
#' library(utils)
#' library(graphics)
#' gene_comparison_matrix <- compare_cluster_genes(PBMC)
#' draw_cluster_gene_relations(gene_comparison_matrix)
#'
#' @import igraph utils graphics
#'
#' @export
draw_cluster_gene_relations <- function(gene_comparison_matrix) {

  # Create a clone of the input matrix because we will replace each cell by counts of genes that overlap
  # (ie. the length of the vector that occupied that cell).
  another_matrix <- cbind(gene_comparison_matrix)
  first_cluster <- head(row.names(gene_comparison_matrix), 1)
  last_cluster <- tail(row.names(gene_comparison_matrix), 1)

  # For every pair of clusters, replace the gene expression vectors with its size/count, so that
  # we can use it for edge width.
  for (cluster_x in first_cluster:last_cluster) {
    for (cluster_y in first_cluster:last_cluster) {
      if (length(another_matrix[as.character(cluster_x), as.character(cluster_y)][[1]]) == 1 &&
          is.na(another_matrix[as.character(cluster_x), as.character(cluster_y)][[1]])) {
        another_matrix[as.character(cluster_x), as.character(cluster_y)][[1]] <- 0
      } else {
        another_matrix[as.character(cluster_x), as.character(cluster_y)][[1]] <-
          length(another_matrix[as.character(cluster_x), as.character(cluster_y)][[1]])
      }
    }
  }

  color_gradient <- colorRamp(c('blue', 'cyan', 'yellow', 'red'))

  graph <- igraph::graph_from_adjacency_matrix(another_matrix, mode="undirected", weighted = TRUE)

  # Convert weight such that it is weight/max(weight) so that we can use it to convert into rgb color.
  E(graph)$weight <- E(graph)$weight / max(E(graph)$weight)

  # Set the graph edge colors in a gradient to mimick a heatmap.
  E(graph)$color <- apply(color_gradient(E(graph)$weight), 1, function(x) rgb(x[1]/255,x[2]/255,x[3]/255))
  # Adapted from:
  # ahmohamed. (February 14 2015). How to scale edge colors in igraph?. *Stackoverflow*.
  # Retrieved from: https://stackoverflow.com/questions/28366329/how-to-scale-edge-colors-in-igraph

  # Set this layout to have a good balance between the graph and legend.
  layout(matrix(1:2,ncol=2), width = c(2,1),height = c(1,1))

  igraph::plot.igraph(graph, main = 'Graph representing gene expression
              relationship between clusters')


  # Set these palettes for plotting the legend, additionally. This is done separately from the
  # igraph plot.
  color_palette <- colorRampPalette(c('red', 'yellow', 'cyan', 'blue'))
  legend <- as.raster(matrix(color_palette(20), ncol = 1))
  plot(c(0,5),c(0,1),type = 'n', axes = F,xlab = '', ylab = '', main = 'Commonness of genes',)
  text(x=1.5, y = seq(0,1,l=5), labels = seq(0,1,l=5))
  rasterImage(legend, 0, 0, 1,1)
  # Above chunk is adapted from the following:
  # mnel. (November 13 2012). Gradient legend in base. Stackoverflow.
  # Retrieved from: https://stackoverflow.com/questions/13355176/gradient-legend-in-base

  # Reset layout because otherwise the subsequent plots are affected too.
  par(mfrow=c(1,1))

  return(graph)
}


#' Plot A Bar Graph Representing Gene Expression Relationships Between Clusters with Numerical Context.
#'
#' Plot a bar graph representing the gene expression relationships between clusters. Each bar represents
#' how much genes are shared in the pair with respect to other pairs. Because the relationships are made
#' relative to each other, the cluster pair with the most genes shared will be indicated as 1.
#'
#' @param gene_expression_graph Graph output from draw_cluster_relations to utilize the edge weights
#'
#' @return Returns a bar graph indicating which cluster pairs share the more genes with respect to
#' the pair with the most genes shared.
#'
#' @examples
#' library(igraph)
#' library(utils)
#' library(graphics)
#' graph <- draw_cluster_gene_relations(gene_comparison_matrix)
#' plot_cluster_relations(graph)
#' @import igraph utils graphics
#'
#' @export
plot_cluster_relations <- function(gene_expression_graph) {
  i <- 1
  j <- 2
  x_labels <- c()
  vertices <- V(gene_expression_graph)$name

  # This loop is working to extract all possible cluster pairs to use it for the x-asix label.
  while (i <= length(vertices)) {
    while (j <= length(vertices)) {
      x_labels <- c(x_labels, paste(vertices[i], vertices[j], sep = ','))
      j <- j + 1
    }
    i <- i + 1
    j <- i + 1
  }

  height <- E(gene_expression_graph)$weight
  names(height) <- x_labels

  return(barplot(height, las = 2, xlab = 'Pairs of cluster ids',
                 ylab = 'Relative gene expression similarity',
                 main = 'Relative comparison of gene expression similarity between clusters',
                 sub = '*It is relative to the most similar cluster pair. 1.0 = the most similar cluster, but not 100% similarity'))
}

#' Combine the functions in seuratter.R to run the workflow for generating the graph.
#'
#' Work as a controller integrating extract_genes_expressed, compare_cluster_genes, and
#' draw_cluster_gene_relations.
#'
#' @param seurat_object A Seurat object that has undergone the entire workflow, involving UMAP.
#'
#' @return Returns a vector containing all the graphical outputs generated
#' by seuratter (except determine_dimensionality)
#'
#' @examples
#' library(igraph)
#' library(utils)
#' library(graphics)
#' run_seuratter(PBMC)
#'
#' @import igraph utils graphics
#'
#' @export
run_seuratter <- function(seurat_object) {
  gene_comparison_matrix <- compare_cluster_genes(PBMC, 0.25)
  graph <- draw_cluster_gene_relations(gene_comparison_matrix)
  bar_graph <- plot_cluster_relations(graph)
  return(c(gene_comparison_matrix, graph, bar_graph))
}
sotaro0214/seuratter documentation built on Dec. 8, 2019, 4:24 p.m.