####################################################
#
# Reproduce Gasperini et al p-values by re-running
# negative binomial GLM on expression/gRNA data.
#
####################################################
######################
# Set up workspace
######################
require(crisprscreen)
######################
# Read in various data
######################
#####################
# 1. The cell expression matrix. Cells are rows (names given by cell_names) and genes are columns (names given by gene_names). The matrix itself is expression_FBM.
#####################
my_path <- is_r_package$find_file()
# cell names
cell_names = read_tsv(paste0(my_path, "/raw/GSE120861_at_scale_screen.cells.txt"), col_names = FALSE, col_types = "c") %>% pull()
num_cells = length(cell_names)
cell_names2 = read_tsv(paste0(my_path, "/processed/cell_names.tsv"), col_names = FALSE, col_types = "c") %>% pull()
# gene names
gene_names = read_tsv(paste0(my_path, "/core_data/GSE120861_at_scale_screen.genes.txt"), col_names = FALSE, col_types = "c") %>% pull()
num_genes = length(gene_names)
# gene expressions (as a file-backed matrix) -- cell_names gives the row names, and gene_names gives the column names.
expression_FBM = FBM(num_cells, num_genes, type = "integer",
backingfile = paste0(my_path, "/processed/expression_matrix"), create_bk = FALSE)
###############################
# 2. Auxillary cell information. Rows (again) are cells. Columns (names given by pd_colnames) give the confounders for each cell (e.g., prep_batch and guide_count of each grna) as well as information on the other guide RNAs.
###############################
# cell "phenotype" names (phenotypes include including grna and confounders)
pd_colnames = read_tsv(paste0(my_path, "/processed/phenoData_selected_colnames.tsv"), col_names = FALSE, col_types = "c") %>% pull()
num_phenotypes = length(pd_colnames)
# "phenotypes" for each cell, including guide counts and confounders (as a file-backed matrix). Again cells are rows and "phenotypes" are columns.
phenodata_FBM = FBM(num_cells, num_phenotypes,
backingfile = paste0(my_path, "/processed/phenoData_selected"),
create_bk = FALSE)
###############################
# 3. Parameters for the statistical model and reference cells
###############################
# dispersion coefficients alpha_0 and alpha_1 from equation (6) in DESeq2 paper
disp_coeffs = as.numeric(read_tsv( paste0(my_path, "/processed/dispCoefficients.tsv"), col_types = "dd"))
## dispersion table (mean expressions and dispersion estimates for each gene)
disp_table = read_tsv(paste0(my_path, "/processed/dispTable.tsv"), col_types = "cdd")
# reference cells used by Gasperini et al for computational purposes
reference_cells = readRDS(paste0(my_path ,"/raw/GSE120861_50k_reference_cells.rds"))
############################
# 4. Finally, results from the original authors
############################
# results of Gasperini et al (command result in a warning that can be ignored)
all_deg_results = suppressWarnings(read_tsv(paste0(my_path, "/raw/GSE120861_all_deg_results.at_scale.txt"), col_types = "cddddddccccciiciiccc", na = "not_applicable"))
###########################################
# Set up for negative binomial GLM analysis
###########################################
# randomly choose a gRNA/gene pair to analyze
random_row = sample(1:nrow(all_deg_results), 1)
grna_group = all_deg_results$gRNA_group[random_row] # select a guide RNA
gene_name = all_deg_results$ENSG[random_row] # select a gene
cat(sprintf("Going to analyze gRNA %s and gene %s.\n", grna_group, gene_name))
# extract confounders
confounder_names = c("guide_count", "prep_batch", "percent.mito") # what are the confounder names?
confounders_full = phenodata_FBM[,match(confounder_names, pd_colnames)] # locate and extract the confounder columns in the auxillary information matrix.
colnames(confounders_full) = confounder_names
confounders_full = as.data.frame(confounders_full) # the confounder data frame; 3 confounders per cell (note -- we need only create this once)
# extract size factors
Size_Factors_full = phenodata_FBM[,match("Size_Factor", pd_colnames)] # the size factors for each cell -- used for dispersion estimation
# extra gRNA indicators for chosen gRNA
grna_indicators_full = phenodata_FBM[,match(grna_group, pd_colnames)] # extract the binary grna vector for the particular grna we are looking at
# extract expression of chosen gene
gene_exp_full = expression_FBM[,match(gene_name, gene_names)]
# keep cells that are either in reference or express the gRNA
cells_to_keep = which((cell_names %in% reference_cells | (grna_indicators_full == 1)) &
(confounders_full$guide_count > 0))
# put all information in a data frame
df = confounders_full[cells_to_keep,]
df$grna = grna_indicators_full[cells_to_keep]
df$gene_exp = gene_exp_full[cells_to_keep]
df$Size_Factor = Size_Factors_full[cells_to_keep]
###########################################
# Carry out negative binomial GLM analysis
###########################################
# extract the raw mean and dispersion of gene expression
mean_estimate_raw = disp_table %>% filter(gene_id == gene_name) %>% pull(mu)
disp_estimate_raw = disp_table %>% filter(gene_id == gene_name) %>% pull(disp)
# calculate the shrunken dispersion estimate based on the DESeq2 formula (6)
disp_estimate_shrunk = disp_coeffs[1] + disp_coeffs[2] / mean_estimate_raw
cat(sprintf("The dispersion of the gene was shrunk from %0.2f to %0.2f.\n", disp_estimate_raw, disp_estimate_shrunk))
# normalize expression by the size factor (I think the size factor would be better accounted for using offsets)
df = df %>% mutate(normalized_gene_exp = round(gene_exp/Size_Factor))
# fit the negative binomial model with grna
full_model_fit <- VGAM::vglm(normalized_gene_exp ~ grna + guide_count + prep_batch + percent.mito,
epsilon=1e-1,
family=negbinomial.size(size=1/disp_estimate_shrunk),
data = df)
# fit the negative binomial model without grna
reduced_model_fit <- VGAM::vglm(normalized_gene_exp ~ guide_count + prep_batch + percent.mito,
epsilon=1e-1,
family=negbinomial.size(size=1/disp_estimate_shrunk),
data = df)
# obtain a p-value by comparing these two models via the likelihood ratio test
pvalue1 = lrtest(full_model_fit, reduced_model_fit)@Body["Pr(>Chisq)"][2,]
# compare to p-value downloaded from GEO
pvalue2 = all_deg_results$pvalue.raw[random_row]
cat(sprintf("The LRT p-value is %0.4f.\n", pvalue1))
cat(sprintf("The p-value downloaded from GEO is %0.4f.\n", pvalue2))
#############
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.