Drivers/reproduce_Gasperini.R

####################################################
#
# Reproduce Gasperini et al p-values by re-running
# negative binomial GLM on expression/gRNA data.
#
####################################################

######################
# Set up workspace
######################

require(crisprscreen)

######################
# Read in various data
######################

#####################
# 1. The cell expression matrix. Cells are rows (names given by cell_names) and genes are columns (names given by gene_names). The matrix itself is expression_FBM.
#####################
my_path <- is_r_package$find_file()

# cell names
cell_names = read_tsv(paste0(my_path, "/raw/GSE120861_at_scale_screen.cells.txt"), col_names = FALSE, col_types = "c") %>% pull()
num_cells = length(cell_names)
cell_names2 = read_tsv(paste0(my_path, "/processed/cell_names.tsv"), col_names = FALSE, col_types = "c") %>% pull()

# gene names
gene_names = read_tsv(paste0(my_path, "/core_data/GSE120861_at_scale_screen.genes.txt"), col_names = FALSE, col_types = "c") %>% pull()
num_genes = length(gene_names)

# gene expressions (as a file-backed matrix) -- cell_names gives the row names, and gene_names gives the column names.
expression_FBM = FBM(num_cells, num_genes, type = "integer",
                  backingfile = paste0(my_path, "/processed/expression_matrix"), create_bk = FALSE)

###############################
# 2. Auxillary cell information. Rows (again) are cells. Columns (names given by pd_colnames)  give the confounders for each cell (e.g., prep_batch and guide_count of each grna) as well as information on the other guide RNAs.
###############################

# cell "phenotype" names (phenotypes include including grna and confounders)
pd_colnames = read_tsv(paste0(my_path, "/processed/phenoData_selected_colnames.tsv"), col_names = FALSE, col_types = "c") %>% pull()
num_phenotypes = length(pd_colnames)

# "phenotypes" for each cell, including guide counts and confounders (as a file-backed matrix). Again cells are rows and "phenotypes" are columns.
phenodata_FBM = FBM(num_cells, num_phenotypes,
             backingfile = paste0(my_path, "/processed/phenoData_selected"),
             create_bk = FALSE)
###############################
# 3. Parameters for the statistical model and reference cells
###############################

# dispersion coefficients alpha_0 and alpha_1 from equation (6) in DESeq2 paper
disp_coeffs = as.numeric(read_tsv( paste0(my_path, "/processed/dispCoefficients.tsv"), col_types = "dd"))

## dispersion table (mean expressions and dispersion estimates for each gene)
disp_table = read_tsv(paste0(my_path, "/processed/dispTable.tsv"), col_types = "cdd")

# reference cells used by Gasperini et al for computational purposes
reference_cells = readRDS(paste0(my_path ,"/raw/GSE120861_50k_reference_cells.rds"))

############################
# 4. Finally, results from the original authors
############################

# results of Gasperini et al (command result in a warning that can be ignored)
all_deg_results = suppressWarnings(read_tsv(paste0(my_path, "/raw/GSE120861_all_deg_results.at_scale.txt"), col_types = "cddddddccccciiciiccc", na = "not_applicable"))

###########################################
# Set up for negative binomial GLM analysis
###########################################

# randomly choose a gRNA/gene pair to analyze
random_row = sample(1:nrow(all_deg_results), 1)
grna_group = all_deg_results$gRNA_group[random_row] # select a guide RNA
gene_name = all_deg_results$ENSG[random_row] # select a gene
cat(sprintf("Going to analyze gRNA %s and gene %s.\n", grna_group, gene_name))

# extract confounders
confounder_names = c("guide_count", "prep_batch", "percent.mito") # what are the confounder names?
confounders_full = phenodata_FBM[,match(confounder_names, pd_colnames)] # locate and extract the confounder columns in the auxillary information matrix.
colnames(confounders_full) = confounder_names
confounders_full = as.data.frame(confounders_full) # the confounder data frame; 3 confounders per cell (note -- we need only create this once)

# extract size factors
Size_Factors_full = phenodata_FBM[,match("Size_Factor", pd_colnames)] # the size factors for each cell -- used for dispersion estimation

# extra gRNA indicators for chosen gRNA
grna_indicators_full = phenodata_FBM[,match(grna_group, pd_colnames)] # extract the binary grna vector for the particular grna we are looking at

# extract expression of chosen gene
gene_exp_full = expression_FBM[,match(gene_name, gene_names)]

# keep cells that are either in reference or express the gRNA
cells_to_keep = which((cell_names %in% reference_cells | (grna_indicators_full == 1)) &
                        (confounders_full$guide_count > 0))

# put all information in a data frame
df = confounders_full[cells_to_keep,]
df$grna = grna_indicators_full[cells_to_keep]
df$gene_exp = gene_exp_full[cells_to_keep]
df$Size_Factor = Size_Factors_full[cells_to_keep]

###########################################
# Carry out negative binomial GLM analysis
###########################################

# extract the raw mean and dispersion of gene expression
mean_estimate_raw = disp_table %>% filter(gene_id == gene_name) %>% pull(mu)
disp_estimate_raw = disp_table %>% filter(gene_id == gene_name) %>% pull(disp)

# calculate the shrunken dispersion estimate based on the DESeq2 formula (6)
disp_estimate_shrunk = disp_coeffs[1] + disp_coeffs[2] / mean_estimate_raw

cat(sprintf("The dispersion of the gene was shrunk from %0.2f to %0.2f.\n", disp_estimate_raw, disp_estimate_shrunk))

# normalize expression by the size factor (I think the size factor would be better accounted for using offsets)
df = df %>% mutate(normalized_gene_exp = round(gene_exp/Size_Factor))

# fit the negative binomial model with grna
full_model_fit <- VGAM::vglm(normalized_gene_exp ~ grna + guide_count + prep_batch + percent.mito,
                             epsilon=1e-1,
                             family=negbinomial.size(size=1/disp_estimate_shrunk),
                             data = df)

# fit the negative binomial model without grna
reduced_model_fit <- VGAM::vglm(normalized_gene_exp ~ guide_count + prep_batch + percent.mito,
                                epsilon=1e-1,
                                family=negbinomial.size(size=1/disp_estimate_shrunk),
                                data = df)

# obtain a p-value by comparing these two models via the likelihood ratio test
pvalue1 = lrtest(full_model_fit, reduced_model_fit)@Body["Pr(>Chisq)"][2,]

# compare to p-value downloaded from GEO
pvalue2 = all_deg_results$pvalue.raw[random_row]

cat(sprintf("The LRT p-value is %0.4f.\n", pvalue1))
cat(sprintf("The p-value downloaded from GEO is %0.4f.\n", pvalue2))

#############
Timothy-Barry/crisprscreen documentation built on Feb. 13, 2020, 12:32 a.m.