knitr::opts_chunk$set(echo = TRUE)

This workbook demonstrates the workflow and data structures associated with performing projection drivers with a GLM-based approach.

#These will be loaded with the package once it is installed
#Directories are relative to this .Rmd file in vignettes folder
require(glmpd)


require(monocle3)
require(purrr)
require(tidyr)
require(tibble)
require(dplyr)
require(MASS)
require(ggplot2)
require(furrr)
require(ComplexHeatmap)
require(RhpcBLASctl)
#dataset to project into patterns
#TODO: is there a "package" way to load this
cds <- readRDS("../inst/extdata/6month_LMMP.rds")

#p-weights resulting from projection with projectR
#rownames are cell barcodes, column names are pattern names. eg "cellPattern1"
projected_weights <- read.csv("../inst/extdata/6mo_LMMP_cell_size_log_in_TC_LMMP_9-18_patterns.csv", row.names = 1)

Prepare the cell data set and projection values for the model fittings

Important steps include removing genes with zero (low) counts that will not produce good fittings and will increase run time.

#TODO: All of this is unneccesary for a vignette. do the prep and downsampling before. 
#pattern # to project
select_patterns <- c(27,43)
projected_weights <- projected_weights[paste0("cellPattern", select_patterns)]

#remove lowly expressed genes that will not result in good fits
fData(cds)$num_transcripts <- Matrix::rowSums(exprs(cds))
min_n_transcripts <- 150
min_pct_cells_expressing <- .005

min_cells_expressing <- min_pct_cells_expressing*dim(cds)[2]
genes_to_use <- which(fData(cds)$num_transcripts > min_n_transcripts & fData(cds)$num_cells_expressed > min_cells_expressing)
#character vector of gene names
genes_of_interest <- fData(cds)[genes_to_use, "gene_id"]

cds_sub <- cds[fData(cds)$gene_id %in% genes_of_interest,]
#TODO: Make sure gene short names are all unique

#downsample for ease of running the vignette
# set.seed(208)
set.seed(118)
downsample <- 1
if(downsample){
  num_genes <- 5
  down_gene <- sample(dim(cds_sub)[1], num_genes)
  # num_cells <- 40
  # down_cell <- sample(dim(cds_sub)[2], num_cells)

  cds_sub <- cds_sub[down_gene,]

  #character vector of final gene names
  genes_of_interest <- fData(cds_sub)[,"gene_id"]
}

cds_sub

Declare the model formula and run model fittings

Using standard R notation for defining models, the flexible

#patternWeights is hard-coded in the fit_models_cds function to be the p-weights column provided
model_str <- paste0("~0 + patternWeights:cell_type + cell_type")

#specify parameters for fittings
exp_family <- "negbinomial"
#capable of parallelization over patterns (columns of provided projection matrix)
ncores = 1
#return the full glm output
return_option <- "full_model"

system.time(glm_models_list <- glmpd::fitGLMpd(object = cds_sub,
                                             model_formula_str = model_str,
                                             projected_patterns = projected_weights,

                                             cores = ncores, 
                                             count_assay = "counts"))
                                             # TODO: old parameters that need to be added back
                                             # exp_family = exp_family,
                                             # clean_model = F,
                                             # verbose = T,
                                             # result = return_option))

class(glm_models_list)
length(glm_models_list)
names(glm_models_list)

#model results for first pattern
purrr::pluck(glm_models_list, paste0("cellPattern",select_patterns[1]))

Extract parameter estimates, standard errors, p-value, etc from the model object

Model fittings are complete, now we can extract the relevant info from the models using extractCoefficients(). The first order list is always the patterns. Provided the genes used in the fittings above, it generates a data frame for each gene with each row representing one of the model parameters, and each column representing a measurement (eg. estimate, p-value...).

#genes vector is used to pull parameter estimates from the model info for each gene. Must be the same as provided in the cds_sub that is fitted
#organizes each pattern such that one list is present for each gene, with all coefficients together
extracted_coefficients_list <- extractCoefficients(glm_models = glm_models_list, genes = genes_of_interest)

names(extracted_coefficients_list)
head(names(extracted_coefficients_list[[1]]))

#for one pattern, for one gene
purrr::pluck(extracted_coefficients_list, paste0("cellPattern",select_patterns[1]), genes_of_interest[1])

Re-organize by creating a data frame for each "measurement" from the model

A measurement for each term can be the beta estimates (estimate), the standard error (std_error), or statistical significance (q_value)

#filter significance will filter genes to ones that are signficant (q-value) at the provided threshold (eg 0.05).
#string matching to the string provided. For categorical variables/dummy variables, genes are maintained if any of variables in that group are significant
gene_coefficients_list <- orderCoefficientsByGene(extracted_coefficients_list, filter_significance = NULL, string = NULL)

purrr::pluck(gene_coefficients_list, paste0("cellPattern", select_patterns[1]))

purrr::pluck(gene_coefficients_list, paste0("cellPattern", select_patterns[1])) %>%
  dplyr::filter(measure == "estimate") %>%
  dplyr::select(data) %>%
  unnest(cols = data) %>% 
  as.data.frame()

Generate matrices, and visualize results as a heatmap

#Generate matrices for heatmap plotting.
#terms_match are selected based on matching the beginning of the parameter strings. terms_exact are selected on exact matches.
#feature corresponds to which measure of gene_coefficients_list (above) will be plotted
#gene_name specifies name of column that contains gene_names. default is gene_id
#Transpose flips matrix for plotting genes as columns, parameters as rows
coefficient_matrix_list <- organizeEstimates(gene_coefficients_list, 
                                             terms_exact = NULL,  
                                             terms_match = c("patternWeights:cell_type", "cell_type"),
                                             feature = "estimate",
                                             transpose = T,
                                             gene_name = "gene_id")

names(coefficient_matrix_list)
names(purrr::pluck(coefficient_matrix_list, paste0("cellPattern", select_patterns[1])))
purrr::pluck(coefficient_matrix_list, paste0("cellPattern", select_patterns[1]), "patternWeight:cell_type")[,1:4]

#I left the actual heatmap plotting outside any functions because its so specific and customizable
heatmap_list <- purrr::map(names(coefficient_matrix_list), function(pattern_name){

  #FIRST HEATMAP, clustering of columns based on this one
  term <- "patternWeights:cell_type"

  #extract dataframe of estimates for this pattern, for this set of parameters. Rows are parameters, columns are genes
  match <- coefficient_matrix_list %>% purrr::pluck(pattern_name, term)

  ##order rows here, but they already alphabetical. 
  match <- match[order(rownames(match)),]

  #show gene_names if less than some number of genes
  do.show.col.names <- (dim(match)[2] < 200)

  hm1 <- ComplexHeatmap::Heatmap(match, name = paste0(term, ": ", pattern_name))

  #base cell_type factors on this ordering (which is after clustering in heatmap)
  ord <- row_order(hm1)

  #SECOND HEATMAP
  term <- "cell_type"

  #extract dataframe of estimates for this pattern, for this set of parameters. Rows are parameters, columns are genes
  match2<- coefficient_matrix_list %>% purrr::pluck(pattern_name, term)

  ##order rows here, but they already alphabetical. 
  match2 <- match2[order(rownames(match2)),]
  row_order <- rownames(match2)[ord]

  hm2 <- ComplexHeatmap::Heatmap(match2, name = paste0(term, ": ", pattern_name), cluster_rows = F,
                                 row_order = row_order,  row_names_gp = gpar(fontsize = c(8)), show_column_names = do.show.col.names)

  #vertical heatmap merging
  heatmap_merge <- hm1  %v% hm2

  return(heatmap_merge)

})

heatmap_list


gofflab/glmpd documentation built on April 11, 2021, 6:38 a.m.