# Import libraries #setwd("/wehisan/bioinf/bioinf-data/Papenfuss_lab/projects/mangiola.s/PostDoc/RNAseq-noise-model") set.seed(13254) # Import libraries library(tidyverse) library(magrittr) library(rstan) library(tidybayes) library(here) library(foreach) library(doParallel) registerDoParallel() # library(future) # plan(multiprocess) rstan_options(auto_write = TRUE) options(mc.cores = parallel::detectCores()) #devtools::load_all() # Sorry this costs me 30 minutes every day my_theme = theme_bw() + theme( panel.border = element_blank(), axis.line = element_line(), panel.grid.major = element_line(size = 0.2), panel.grid.minor = element_line(size = 0.1), text = element_text(size=12), legend.position="bottom", aspect.ratio=1, axis.text.x = element_text(angle = 90, hjust = 1), strip.background = element_blank(), axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)), axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)) ) # Tools source("https://gist.githubusercontent.com/stemangiola/90a528038b8c52b21f9cfa6bb186d583/raw/dbd92c49fb03fb05ab0b465704b99c0a39e654d5/transcription_tool_kit.R") source("https://gist.githubusercontent.com/stemangiola/dd3573be22492fc03856cd2c53a755a9/raw/e4ec6a2348efc2f62b88f10b12e70f4c6273a10a/tidy_extensions.R")
counts = foreach( f = dir( path = here("big_data", "tibble_cellType_files"), pattern="tibble_cellTypes_t_", full.names = T ), .combine = bind_rows ) %dopar% { read_csv(f) %>% mutate(sample = sample %>% as.character, isoform = isoform %>% as.character) } %>% # Median redundant do_parallel_start(40, "symbol") %>% do({ `%>%` = magrittr::`%>%` library(tidyverse) library(magrittr) (.) %>% group_by(sample, symbol, `Cell type formatted`, `Data base`) %>% summarise(`read count` = `read count` %>% median(na.rm = T)) %>% ungroup() }) %>% do_parallel_end() %>% # Normalise norm_RNAseq( sample_column = "sample", gene_column = "symbol", value_column = "read count",cpm_threshold = 0.5 ) %>% mutate(`read count normalised` = `read count normalised` %>% as.integer) %>% mutate(`read count normalised log` = `read count normalised` %>% `+` (1) %>% log) %>% # Mark the bimodal distributions do_parallel_start(40, "symbol") %>% do({ `%>%` = magrittr::`%>%` library(tidyverse) library(magrittr) (.) %>% group_by(symbol) %>% do( (.) %>% mutate( `bimodal p-value` = (.) %>% pull(`read count normalised log`) %>% diptest::dip.test() %$% `p.value`, `bimodal coefficient` = (.) %>% pull(`read count normalised log`) %>% modes::bimodality_coefficient(), `anova p-value` = ifelse( (.) %>% distinct(`Data base`) %>% nrow > 1, (.) %>% aov(`read count normalised log` ~ `Cell type formatted` + `Data base`, .) %>% anova %$% `Pr(>F)` %>% `[` (1), (.) %>% aov(`read count normalised log` ~ `Cell type formatted`, .) %>% anova %$% `Pr(>F)` %>% `[` (1) ) ) ) %>% ungroup() }) %>% do_parallel_end() %>% mutate( `anova p-value` = ifelse(`anova p-value` == "NaN", 1, `anova p-value`), `bimodal coefficient` = ifelse(`bimodal coefficient` == "NaN", 0, `bimodal coefficient`) ) %>% mutate( `hard bimodality` = (`bimodal p-value` < 0.05) + (`bimodal coefficient` > 0.6666667) + (`anova p-value` < 0.05 ) >= 2 ) %>% mutate(`soft bimodality` = `anova p-value` < 0.0001)
Inference
nb_model = stan_model(here("stan",sprintf("%s.stan", "negBinomial_tidy"))) set.seed(1325364) counts_stan = counts %>% filter(!filt_for_calc) %>% filter(!`hard bimodality`) %>% #inner_join( (.) %>% distinct(symbol) %>% sample_n(500) ) %>% left_join( (.) %>% distinct(sample, symbol, `read count normalised`) %>% count(symbol) %>% mutate(end = cumsum(n)) %>% mutate(start = c(1, .$end %>% rev() %>% `[` (-1) %>% rev %>% `+` (1))) ) %>% arrange(symbol) data_for_stan = list( N = counts_stan %>% nrow, G = counts_stan %>% distinct(symbol) %>% nrow, S = counts_stan %>% distinct(sample) %>% nrow, symbol_start_end = counts_stan %>% distinct(start, end) %>% select(start, end), counts = counts_stan %>% select(`read count normalised`) %>% pull(1), sample_idx = counts_stan %>% mutate(sample = factor(sample)) %>% mutate(sample_idx = as.integer(sample)) %>% select(sample_idx) %>% pull(1), # Info on data set omit_data = 0, generate_quantities = 1, is_prior_asymetric = 1 ) # fit = # sampling( # nb_model, # data = data_for_stan, # chains=4, iter=500 # ) # fit_parsed = fit %>% spread_draws(lambda[G], sigma[G], sigma_raw[G]) #fit %>% summary() %$% summary %>% as_tibble(rownames="par") load("fit_parsed_all_genes_t_cell_18022019.RData") fit_parsed %>% mean_qi() %>% left_join( counts_stan %>% distinct(symbol, `bimodal p-value`) %>% mutate(G=1:n()) ) %>% ggplot(aes(x=lambda, y=sigma_raw, lable=symbol, bimodal=`bimodal p-value`)) + geom_point() + stat_function( fun = function(x) { inflection = fit %>% rstan::extract(pars="sigma_inflection") %$% sigma_inflection %>% median() slope = fit %>% rstan::extract(pars="sigma_slope") %>% as.data.frame %>% as_tibble %>% pull(1) %>% median y_cross = fit %>% rstan::extract(pars="sigma_y_cross") %$% sigma_y_cross %>% median() b0 = -inflection * slope; y_cross * (1 + exp(-b0)) / ( 1 + exp(- ( b0 + x * slope ) ) ) ; }, geom="line" ) fit_predicted = fit_parsed %>% mean_qi() %>% left_join( counts_stan %>% distinct(symbol) %>% mutate(G=1:n()) ) %>% left_join( counts_stan %>% select(symbol, `read count normalised`) ) %>% do_parallel_start(40, "symbol") %>% do({ `%>%` = magrittr::`%>%` library(tidyverse) library(magrittr) (.) %>% group_by(symbol) %>% do( (.) %>% arrange(`read count normalised`) %>% mutate( predicted_NB = qnbinom( ppoints(`read count normalised`), size=.$sigma %>% unique, mu=.$lambda %>% unique %>% exp ) ) ) %>% ungroup() }) %>% do_parallel_end() %>% mutate(`log of error` = (`read count normalised` - predicted_NB) %>% abs %>% `+` (1) %>% log) %>% mutate(`error of log` = (log(`read count normalised` + 1) - log(predicted_NB + 1)) %>% abs )
Plot worst predictions
fit_predicted %>% inner_join( (.) %>% filter(lambda > log(100)) %>% group_by(symbol) %>% summarise(`error mean` = `error of log` %>% mean) %>% arrange(`error mean` %>% desc) %>% select(symbol) %>% head(20) ) %>% ggplot(aes(y = `read count normalised` + 1, x = predicted_NB + 1)) + geom_point() + facet_wrap(~ symbol, scale="free") + geom_abline(slope = 1, intercept=0) + scale_y_log10() + scale_x_log10() + my_theme
Plot errors in rank
fit_predicted %>% group_by(symbol, lambda) %>% summarise(`error mean` = `error of log` %>% mean) %>% ungroup() %>% mutate(symbol = factor(symbol, levels= (.) %>% arrange(`error mean` %>% desc) %>% pull(symbol))) %>% ggplot(aes(x=symbol, y=`error mean`, color = lambda)) + geom_point()
Plot single gene
counts %>% filter(symbol == "HCG23") %>% { ((.) %>% ggplot(aes(x=sample, y = `read count normalised`+1, color=`Cell type formatted`, shape = `Data base`)) + geom_point() + scale_y_log10()) %>% print() (.) } %>% pull(`read count normalised`) %>% `+` (1) %>% log %>% { (.) %>% modes::bimodality_coefficient() %>% print; (.)} %>% diptest::dip.test() %$% `p.value`
counts_source = "Acute_Myeloid_Leukemia_Primary_Blood_Derived_Cancer_-_Peripheral_Blood" counts_TCGA = read_csv(here("big_data", "tibble_TCGA_files", paste0(counts_source, ".csv"))) %>% # Median redundant do_parallel_start(40, "ens_iso") %>% do({ `%>%` = magrittr::`%>%` library(tidyverse) library(magrittr) (.) %>% group_by(sample, ens_iso) %>% summarise(`read count` = `read count` %>% median(na.rm = T)) %>% ungroup() }) %>% do_parallel_end() %>% # Normalise norm_RNAseq( sample_column = "sample", gene_column = "ens_iso", value_column = "read count",cpm_threshold = 10, prop = 9/10 ) %>% mutate(`read count normalised` = `read count normalised` %>% as.integer) %>% mutate(`read count normalised log` = `read count normalised` %>% `+` (1) %>% log)
file("~/.R/Makevars") %>% { writeLines(c("CXX14FLAGS += -O3"), (.)); (.) } %>% close(.) nb_model = stan_model(here("stan",sprintf("%s.stan", "negBinomial_tidy"))) set.seed(1325364) counts_stan = counts_TCGA %>% #filter(!filt_for_calc) %>% inner_join( (.) %>% distinct(ens_iso) %>% sample_n(500) ) %>% left_join( (.) %>% distinct(sample, ens_iso, `read count normalised`) %>% count(ens_iso) %>% mutate(end = cumsum(n)) %>% mutate(start = c(1, .$end %>% rev() %>% `[` (-1) %>% rev %>% `+` (1))) ) %>% arrange(ens_iso) data_for_stan = list( N = counts_stan %>% nrow, G = counts_stan %>% distinct(ens_iso) %>% nrow, S = counts_stan %>% distinct(sample) %>% nrow, symbol_start_end = counts_stan %>% distinct(start, end) %>% select(start, end), counts = counts_stan %>% select(`read count normalised`) %>% pull(1), sample_idx = counts_stan %>% mutate(sample = factor(sample)) %>% mutate(sample_idx = as.integer(sample)) %>% select(sample_idx) %>% pull(1), # Info on data set omit_data = 0, generate_quantities = 1, is_prior_asymetric = 1 ) fit = sampling( nb_model, data = data_for_stan, chains=4, iter=300, warmup=200 ) fit_parsed = fit %>% spread_draws(lambda[G], sigma[G], sigma_raw[G], sigma_raw_gen[G]) fit %>% summary() %$% summary %>% as_tibble(rownames="par") load("fit_parsed_500_genes_TCGA_leukimia_18022019.RData") fit_parsed %>% mean_qi() %>% left_join( fit_parsed %>% group_by(G) %>% do( (.) %>% head(n=1) ) %>% ungroup() %>% select(G, sigma_raw_gen) %>% rename(gen_quant = sigma_raw_gen) ) %>% left_join( counts_stan %>% distinct(ens_iso) %>% mutate(G=1:n()) ) %>% gather(is_real, my_sigma, c("sigma_raw", "gen_quant")) %>% ggplot(aes(x=lambda, y=my_sigma, lable=ens_iso, color=is_real)) + geom_point() + stat_function( fun = function(x) { slope = fit %>% rstan::extract(pars="sigma_slope") %>% as.data.frame %>% as_tibble %>% pull(1) %>% median intercept = fit %>% rstan::extract(pars="sigma_intercept") %$% sigma_intercept %>% median() exp(slope * x + intercept) ; }, geom="line" ) fit_predicted = fit_parsed %>% mean_qi() %>% left_join( counts_stan %>% distinct(ens_iso) %>% mutate(G=1:n()) ) %>% left_join( counts_stan %>% select(ens_iso, `read count normalised`) ) %>% do_parallel_start(40, "ens_iso") %>% do({ `%>%` = magrittr::`%>%` library(tidyverse) library(magrittr) (.) %>% group_by(ens_iso) %>% do( (.) %>% arrange(`read count normalised`) %>% mutate( predicted_NB = qnbinom( ppoints(`read count normalised`), size=.$sigma %>% unique, mu=.$lambda %>% unique %>% exp ) ) ) %>% ungroup() }) %>% do_parallel_end() %>% mutate(`log of error` = (`read count normalised` - predicted_NB) %>% abs %>% `+` (1) %>% log) %>% mutate(`error of log` = (log(`read count normalised` + 1) - log(predicted_NB + 1)) %>% abs )
# Test for near 0 genes y = fit_parsed %>% mean_qi() %>% left_join( counts_stan %>% distinct(ens_iso, filt_for_calc) %>% mutate(G=1:n()) ) %>% filter(!filt_for_calc) %>% pull(lambda) fit_0s = stan( data = list(y = y, N = y %>% length), model_code = " data{ int N; vector[N] y; } parameters{ real mu; real<lower=0> sigma; real skew; } model{ y ~ skew_normal(mu, sigma, skew); } " )
Plot worst predictions
fit_predicted %>% inner_join( (.) %>% filter(lambda > log(100)) %>% group_by(ens_iso) %>% summarise(`error mean` = `error of log` %>% mean) %>% arrange(`error mean` %>% desc) %>% select(ens_iso) %>% head(20) ) %>% ggplot(aes(y = `read count normalised` + 1, x = predicted_NB + 1)) + geom_point() + facet_wrap(~ ens_iso, scale="free") + geom_abline(slope = 1, intercept=0) + scale_y_log10() + scale_x_log10() + my_theme
Plot errors in rank
fit_predicted %>% group_by(ens_iso, lambda) %>% summarise(`error mean` = `error of log` %>% mean) %>% ungroup() %>% mutate(ens_iso = factor(ens_iso, levels= (.) %>% arrange(`error mean` %>% desc) %>% pull(ens_iso))) %>% ggplot(aes(x=ens_iso, y=`error mean`, color = lambda)) + geom_point()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.