knitr::opts_chunk$set(echo = TRUE)
This workbook demonstrates the workflow and data structures associated with performing projection drivers with a GLM-based approach.
#These will be loaded with the package once it is installed #Directories are relative to this .Rmd file in vignettes folder require(glmpd) require(monocle3) require(purrr) require(tidyr) require(tibble) require(dplyr) require(MASS) require(ggplot2) require(furrr) require(ComplexHeatmap) require(RhpcBLASctl)
#dataset to project into patterns #TODO: is there a "package" way to load this cds <- readRDS("../inst/extdata/6month_LMMP.rds") #p-weights resulting from projection with projectR #rownames are cell barcodes, column names are pattern names. eg "cellPattern1" projected_weights <- read.csv("../inst/extdata/6mo_LMMP_cell_size_log_in_TC_LMMP_9-18_patterns.csv", row.names = 1)
Important steps include removing genes with zero (low) counts that will not produce good fittings and will increase run time.
#TODO: All of this is unneccesary for a vignette. do the prep and downsampling before. #pattern # to project select_patterns <- c(27,43) projected_weights <- projected_weights[paste0("cellPattern", select_patterns)] #remove lowly expressed genes that will not result in good fits fData(cds)$num_transcripts <- Matrix::rowSums(exprs(cds)) min_n_transcripts <- 150 min_pct_cells_expressing <- .005 min_cells_expressing <- min_pct_cells_expressing*dim(cds)[2] genes_to_use <- which(fData(cds)$num_transcripts > min_n_transcripts & fData(cds)$num_cells_expressed > min_cells_expressing) #character vector of gene names genes_of_interest <- fData(cds)[genes_to_use, "gene_id"] cds_sub <- cds[fData(cds)$gene_id %in% genes_of_interest,] #TODO: Make sure gene short names are all unique #downsample for ease of running the vignette # set.seed(208) set.seed(118) downsample <- 1 if(downsample){ num_genes <- 5 down_gene <- sample(dim(cds_sub)[1], num_genes) # num_cells <- 40 # down_cell <- sample(dim(cds_sub)[2], num_cells) cds_sub <- cds_sub[down_gene,] #character vector of final gene names genes_of_interest <- fData(cds_sub)[,"gene_id"] } cds_sub
Using standard R notation for defining models, the flexible
#patternWeights is hard-coded in the fit_models_cds function to be the p-weights column provided model_str <- paste0("~0 + patternWeights:cell_type + cell_type") #specify parameters for fittings exp_family <- "negbinomial" #capable of parallelization over patterns (columns of provided projection matrix) ncores = 1 #return the full glm output return_option <- "full_model" system.time(glm_models_list <- glmpd::fitGLMpd(object = cds_sub, model_formula_str = model_str, projected_patterns = projected_weights, cores = ncores, count_assay = "counts")) # TODO: old parameters that need to be added back # exp_family = exp_family, # clean_model = F, # verbose = T, # result = return_option)) class(glm_models_list) length(glm_models_list) names(glm_models_list) #model results for first pattern purrr::pluck(glm_models_list, paste0("cellPattern",select_patterns[1]))
Model fittings are complete, now we can extract the relevant info from the models using extractCoefficients(). The first order list is always the patterns. Provided the genes used in the fittings above, it generates a data frame for each gene with each row representing one of the model parameters, and each column representing a measurement (eg. estimate, p-value...).
#genes vector is used to pull parameter estimates from the model info for each gene. Must be the same as provided in the cds_sub that is fitted #organizes each pattern such that one list is present for each gene, with all coefficients together extracted_coefficients_list <- extractCoefficients(glm_models = glm_models_list, genes = genes_of_interest) names(extracted_coefficients_list) head(names(extracted_coefficients_list[[1]])) #for one pattern, for one gene purrr::pluck(extracted_coefficients_list, paste0("cellPattern",select_patterns[1]), genes_of_interest[1])
A measurement for each term can be the beta estimates (estimate), the standard error (std_error), or statistical significance (q_value)
#filter significance will filter genes to ones that are signficant (q-value) at the provided threshold (eg 0.05). #string matching to the string provided. For categorical variables/dummy variables, genes are maintained if any of variables in that group are significant gene_coefficients_list <- orderCoefficientsByGene(extracted_coefficients_list, filter_significance = NULL, string = NULL) purrr::pluck(gene_coefficients_list, paste0("cellPattern", select_patterns[1])) purrr::pluck(gene_coefficients_list, paste0("cellPattern", select_patterns[1])) %>% dplyr::filter(measure == "estimate") %>% dplyr::select(data) %>% unnest(cols = data) %>% as.data.frame()
#Generate matrices for heatmap plotting. #terms_match are selected based on matching the beginning of the parameter strings. terms_exact are selected on exact matches. #feature corresponds to which measure of gene_coefficients_list (above) will be plotted #gene_name specifies name of column that contains gene_names. default is gene_id #Transpose flips matrix for plotting genes as columns, parameters as rows coefficient_matrix_list <- organizeEstimates(gene_coefficients_list, terms_exact = NULL, terms_match = c("patternWeights:cell_type", "cell_type"), feature = "estimate", transpose = T, gene_name = "gene_id") names(coefficient_matrix_list) names(purrr::pluck(coefficient_matrix_list, paste0("cellPattern", select_patterns[1]))) purrr::pluck(coefficient_matrix_list, paste0("cellPattern", select_patterns[1]), "patternWeight:cell_type")[,1:4] #I left the actual heatmap plotting outside any functions because its so specific and customizable heatmap_list <- purrr::map(names(coefficient_matrix_list), function(pattern_name){ #FIRST HEATMAP, clustering of columns based on this one term <- "patternWeights:cell_type" #extract dataframe of estimates for this pattern, for this set of parameters. Rows are parameters, columns are genes match <- coefficient_matrix_list %>% purrr::pluck(pattern_name, term) ##order rows here, but they already alphabetical. match <- match[order(rownames(match)),] #show gene_names if less than some number of genes do.show.col.names <- (dim(match)[2] < 200) hm1 <- ComplexHeatmap::Heatmap(match, name = paste0(term, ": ", pattern_name)) #base cell_type factors on this ordering (which is after clustering in heatmap) ord <- row_order(hm1) #SECOND HEATMAP term <- "cell_type" #extract dataframe of estimates for this pattern, for this set of parameters. Rows are parameters, columns are genes match2<- coefficient_matrix_list %>% purrr::pluck(pattern_name, term) ##order rows here, but they already alphabetical. match2 <- match2[order(rownames(match2)),] row_order <- rownames(match2)[ord] hm2 <- ComplexHeatmap::Heatmap(match2, name = paste0(term, ": ", pattern_name), cluster_rows = F, row_order = row_order, row_names_gp = gpar(fontsize = c(8)), show_column_names = do.show.col.names) #vertical heatmap merging heatmap_merge <- hm1 %v% hm2 return(heatmap_merge) }) heatmap_list
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.