# Import libraries

#setwd("/wehisan/bioinf/bioinf-data/Papenfuss_lab/projects/mangiola.s/PostDoc/RNAseq-noise-model")
set.seed(13254)

# Import libraries
library(tidyverse)
library(magrittr)
library(rstan)
library(tidybayes)
library(here)
library(foreach)
library(doParallel)
registerDoParallel()
# library(future)
# plan(multiprocess)

rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

#devtools::load_all() # Sorry this costs me 30 minutes every day

my_theme =  
    theme_bw() +
    theme(
        panel.border = element_blank(),
        axis.line = element_line(),
        panel.grid.major = element_line(size = 0.2),
        panel.grid.minor = element_line(size = 0.1),
        text = element_text(size=12),
        legend.position="bottom",
        aspect.ratio=1,
        axis.text.x = element_text(angle = 90, hjust = 1),
        strip.background = element_blank(),
        axis.title.x  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
        axis.title.y  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10))
    )

# Tools
source("https://gist.githubusercontent.com/stemangiola/90a528038b8c52b21f9cfa6bb186d583/raw/dbd92c49fb03fb05ab0b465704b99c0a39e654d5/transcription_tool_kit.R")
source("https://gist.githubusercontent.com/stemangiola/dd3573be22492fc03856cd2c53a755a9/raw/e4ec6a2348efc2f62b88f10b12e70f4c6273a10a/tidy_extensions.R")

Cell types - t cells

counts = 
    foreach(
        f = dir(
            path =  here("big_data", "tibble_cellType_files"), 
            pattern="tibble_cellTypes_t_",
            full.names = T
        ),
        .combine = bind_rows
    ) %dopar% {
        read_csv(f) %>% mutate(sample = sample %>% as.character, isoform = isoform %>% as.character)
    } %>%

    # Median redundant
  do_parallel_start(40, "symbol") %>%
  do({
        `%>%` = magrittr::`%>%`
        library(tidyverse)
        library(magrittr)

        (.) %>%
            group_by(sample, symbol, `Cell type formatted`,  `Data base`) %>%
            summarise(`read count` = `read count` %>% median(na.rm = T)) %>%
            ungroup() 
  }) %>%
  do_parallel_end() %>%

  # Normalise
    norm_RNAseq( 
            sample_column = "sample", 
            gene_column = "symbol", 
            value_column = "read count",cpm_threshold = 0.5
        ) %>%
  mutate(`read count normalised` = `read count normalised` %>% as.integer) %>%
  mutate(`read count normalised log` = `read count normalised` %>% `+` (1) %>% log) %>%

  # Mark the bimodal distributions
  do_parallel_start(40, "symbol") %>%
    do({
        `%>%` = magrittr::`%>%`
        library(tidyverse)
        library(magrittr)

        (.) %>%
          group_by(symbol) %>%
          do(
            (.) %>%
            mutate(
          `bimodal p-value` =
            (.) %>%
              pull(`read count normalised log`) %>%
              diptest::dip.test() %$%
              `p.value`,

          `bimodal coefficient` = 
             (.) %>%
            pull(`read count normalised log`) %>%
            modes::bimodality_coefficient(),

          `anova p-value` = 
              ifelse(
                (.) %>% distinct(`Data base`) %>% nrow > 1,
                (.) %>% aov(`read count normalised log` ~ `Cell type formatted` + `Data base`, .) %>% anova %$%             `Pr(>F)` %>%  `[` (1),
                (.) %>% aov(`read count normalised log` ~ `Cell type formatted`, .) %>% anova %$% `Pr(>F)` %>% `[` (1)
              ) 
            )
          ) %>%
          ungroup()

    }) %>%
  do_parallel_end() %>% 
  mutate(
    `anova p-value` = ifelse(`anova p-value` == "NaN", 1, `anova p-value`),
    `bimodal coefficient` = ifelse(`bimodal coefficient` == "NaN", 0, `bimodal coefficient`)
  ) %>%
  mutate(
    `hard bimodality` =
      (`bimodal p-value` < 0.05) + 
      (`bimodal coefficient` > 0.6666667) + 
      (`anova p-value` < 0.05 ) >= 
      2
  ) %>%
  mutate(`soft bimodality` = `anova p-value` < 0.0001)

Inference

nb_model = stan_model(here("stan",sprintf("%s.stan", "negBinomial_tidy")))

set.seed(1325364)
counts_stan = 
  counts %>%
  filter(!filt_for_calc) %>%
  filter(!`hard bimodality`) %>%
  #inner_join( (.) %>% distinct(symbol) %>% sample_n(500) ) %>%

  left_join(
    (.) %>% 
    distinct(sample, symbol, `read count normalised`) %>% 
    count(symbol) %>% 
    mutate(end = cumsum(n)) %>%
    mutate(start = c(1, .$end %>% rev() %>% `[` (-1) %>% rev %>% `+` (1)))
  ) %>% 
  arrange(symbol) 

data_for_stan = list(
  N = counts_stan %>% nrow,
  G = counts_stan %>% distinct(symbol) %>% nrow,
  S = counts_stan %>% distinct(sample) %>% nrow,
  symbol_start_end = counts_stan %>% distinct(start, end) %>% select(start, end),
  counts = counts_stan %>% select(`read count normalised`) %>% pull(1),
  sample_idx = 
    counts_stan %>%
    mutate(sample = factor(sample)) %>% 
    mutate(sample_idx = as.integer(sample)) %>% 
    select(sample_idx) %>%
    pull(1),

  # Info on data set
  omit_data = 0,
  generate_quantities = 1,
  is_prior_asymetric = 1
)

# fit =
#   sampling(
#       nb_model,
#     data = data_for_stan, 
#     chains=4, iter=500
# )
# fit_parsed = fit %>% spread_draws(lambda[G], sigma[G], sigma_raw[G]) 
#fit %>% summary() %$% summary %>% as_tibble(rownames="par")


load("fit_parsed_all_genes_t_cell_18022019.RData")


fit_parsed %>% 
  mean_qi() %>% 
  left_join(
    counts_stan %>% distinct(symbol, `bimodal p-value`) %>% mutate(G=1:n())
  ) %>%
  ggplot(aes(x=lambda, y=sigma_raw, lable=symbol, bimodal=`bimodal p-value`)) + geom_point() +
    stat_function(
        fun = function(x) {
          inflection = fit %>% rstan::extract(pars="sigma_inflection") %$% sigma_inflection %>% median()
          slope =  fit %>% rstan::extract(pars="sigma_slope") %>% as.data.frame %>% as_tibble %>% pull(1) %>% median
          y_cross = fit %>% rstan::extract(pars="sigma_y_cross") %$% sigma_y_cross %>% median()
          b0 = -inflection * slope;
            y_cross * (1 + exp(-b0)) / ( 1 + exp(- ( b0 + x * slope ) ) ) ;
        }, 
        geom="line"
    )

fit_predicted = fit_parsed %>% 
  mean_qi() %>%
  left_join(
    counts_stan %>% distinct(symbol) %>% mutate(G=1:n())
  ) %>%
  left_join(
    counts_stan %>% select(symbol, `read count normalised`)
  ) %>% 
  do_parallel_start(40, "symbol") %>%
    do({

      `%>%` = magrittr::`%>%`
        library(tidyverse)
        library(magrittr)

      (.) %>%
    group_by(symbol) %>%
    do(
      (.) %>%
      arrange(`read count normalised`) %>%
      mutate(
        predicted_NB = 
            qnbinom(
                ppoints(`read count normalised`), 
                size=.$sigma %>% unique, 
                mu=.$lambda %>% unique %>% exp
                )
      ) 
    ) %>%
    ungroup() 
    }) %>%
  do_parallel_end() %>%
  mutate(`log of error` = (`read count normalised` - predicted_NB) %>% abs %>% `+` (1) %>% log) %>%
  mutate(`error of log` = (log(`read count normalised` + 1) - log(predicted_NB + 1)) %>% abs )

Plot worst predictions

fit_predicted %>%
  inner_join( 
    (.) %>%
      filter(lambda > log(100)) %>% 
      group_by(symbol) %>% 
      summarise(`error mean` = `error of log` %>% mean) %>% 
      arrange(`error mean` %>% desc) %>% 
      select(symbol) %>% 
      head(20)
  ) %>%
  ggplot(aes(y = `read count normalised` + 1, x = predicted_NB + 1)) + 
  geom_point() + 
  facet_wrap(~ symbol, scale="free") + 
  geom_abline(slope = 1, intercept=0) + scale_y_log10() + scale_x_log10() + my_theme

Plot errors in rank

fit_predicted %>%
    group_by(symbol, lambda) %>% 
    summarise(`error mean` = `error of log` %>% mean) %>%
    ungroup() %>%
    mutate(symbol = factor(symbol, levels= (.) %>% arrange(`error mean` %>% desc) %>% pull(symbol))) %>%
    ggplot(aes(x=symbol, y=`error mean`, color = lambda)) + geom_point()

Plot single gene

counts %>% filter(symbol == "HCG23") %>% 
{
  ((.) %>% ggplot(aes(x=sample, y = `read count normalised`+1, color=`Cell type formatted`, shape = `Data base`)) + geom_point() + scale_y_log10()) %>% print()
  (.)
} %>%
pull(`read count normalised`) %>%
`+` (1) %>%
log %>% 
  { (.) %>% modes::bimodality_coefficient() %>% print; (.)} %>%
  diptest::dip.test() %$% `p.value`

TCGA - leukemia

counts_source = "Acute_Myeloid_Leukemia_Primary_Blood_Derived_Cancer_-_Peripheral_Blood"
counts_TCGA = 

    read_csv(here("big_data", "tibble_TCGA_files", paste0(counts_source, ".csv"))) %>%
    # Median redundant
  do_parallel_start(40, "ens_iso") %>%
  do({
        `%>%` = magrittr::`%>%`
        library(tidyverse)
        library(magrittr)

        (.) %>%
            group_by(sample, ens_iso) %>%
            summarise(`read count` = `read count` %>% median(na.rm = T)) %>%
            ungroup() 
  }) %>%
  do_parallel_end() %>%

  # Normalise
    norm_RNAseq( 
            sample_column = "sample", 
            gene_column = "ens_iso", 
            value_column = "read count",cpm_threshold = 10, prop = 9/10
        ) %>%
  mutate(`read count normalised` = `read count normalised` %>% as.integer) %>%
  mutate(`read count normalised log` = `read count normalised` %>% `+` (1) %>% log) 
file("~/.R/Makevars") %>% {
  writeLines(c("CXX14FLAGS += -O3"), (.)); 
  (.)
} %>%
close(.)
nb_model = stan_model(here("stan",sprintf("%s.stan", "negBinomial_tidy")))

set.seed(1325364)
counts_stan = 
  counts_TCGA %>%
  #filter(!filt_for_calc)  %>%
  inner_join( (.) %>% distinct(ens_iso) %>% sample_n(500) ) %>%

  left_join(
    (.) %>% 
    distinct(sample, ens_iso, `read count normalised`) %>% 
    count(ens_iso) %>% 
    mutate(end = cumsum(n)) %>%
    mutate(start = c(1, .$end %>% rev() %>% `[` (-1) %>% rev %>% `+` (1)))
  ) %>% 
  arrange(ens_iso) 

data_for_stan = list(
  N = counts_stan %>% nrow,
  G = counts_stan %>% distinct(ens_iso) %>% nrow,
  S = counts_stan %>% distinct(sample) %>% nrow,
  symbol_start_end = counts_stan %>% distinct(start, end) %>% select(start, end),
  counts = counts_stan %>% select(`read count normalised`) %>% pull(1),
  sample_idx = 
    counts_stan %>%
    mutate(sample = factor(sample)) %>% 
    mutate(sample_idx = as.integer(sample)) %>% 
    select(sample_idx) %>%
    pull(1),

  # Info on data set
  omit_data = 0,
  generate_quantities = 1,
  is_prior_asymetric = 1
)

fit =
  sampling(
      nb_model,
    data = data_for_stan,
    chains=4, iter=300, warmup=200
)
fit_parsed = fit %>% spread_draws(lambda[G], sigma[G], sigma_raw[G], sigma_raw_gen[G])
fit %>% summary() %$% summary %>% as_tibble(rownames="par")


load("fit_parsed_500_genes_TCGA_leukimia_18022019.RData")


fit_parsed %>% 
  mean_qi() %>% 

  left_join(
    fit_parsed %>% group_by(G) %>% do( (.) %>% head(n=1) ) %>% ungroup() %>% select(G, sigma_raw_gen) %>% rename(gen_quant = sigma_raw_gen)
  ) %>%

  left_join(
    counts_stan %>% distinct(ens_iso) %>% mutate(G=1:n())
  ) %>%
  gather(is_real, my_sigma,  c("sigma_raw", "gen_quant")) %>%
  ggplot(aes(x=lambda, y=my_sigma, lable=ens_iso, color=is_real)) + geom_point() +
    stat_function(
        fun = function(x) {
          slope =  fit %>% rstan::extract(pars="sigma_slope") %>% as.data.frame %>% as_tibble %>% pull(1) %>% median
          intercept = fit %>% rstan::extract(pars="sigma_intercept") %$% sigma_intercept %>% median()
            exp(slope * x + intercept) ;
        }, 
        geom="line"
    )

fit_predicted = fit_parsed %>% 
  mean_qi() %>%
  left_join(
    counts_stan %>% distinct(ens_iso) %>% mutate(G=1:n())
  ) %>%
  left_join(
    counts_stan %>% select(ens_iso, `read count normalised`)
  ) %>% 
  do_parallel_start(40, "ens_iso") %>%
    do({

      `%>%` = magrittr::`%>%`
        library(tidyverse)
        library(magrittr)

      (.) %>%
    group_by(ens_iso) %>%
    do(
      (.) %>%
      arrange(`read count normalised`) %>%
      mutate(
        predicted_NB = 
            qnbinom(
                ppoints(`read count normalised`), 
                size=.$sigma %>% unique, 
                mu=.$lambda %>% unique %>% exp
                )
      ) 
    ) %>%
    ungroup() 
    }) %>%
  do_parallel_end() %>%
  mutate(`log of error` = (`read count normalised` - predicted_NB) %>% abs %>% `+` (1) %>% log) %>%
  mutate(`error of log` = (log(`read count normalised` + 1) - log(predicted_NB + 1)) %>% abs )
# Test for near 0 genes

y = fit_parsed %>% 
    mean_qi() %>% 
    left_join(
        counts_stan %>% 
          distinct(ens_iso, filt_for_calc) %>% 
          mutate(G=1:n())
    ) %>% 
  filter(!filt_for_calc) %>% pull(lambda)

fit_0s = stan(
  data = list(y = y, N = y %>% length),
  model_code = "
    data{
    int N;
    vector[N] y;
    }
    parameters{
    real mu;
    real<lower=0> sigma;
    real skew;
    }
    model{
     y ~ skew_normal(mu, sigma, skew);
    }
  " 
)

Plot worst predictions

fit_predicted %>%
  inner_join( 
    (.) %>%
      filter(lambda > log(100)) %>% 
      group_by(ens_iso) %>% 
      summarise(`error mean` = `error of log` %>% mean) %>% 
      arrange(`error mean` %>% desc) %>% 
      select(ens_iso) %>% 
      head(20)
  ) %>%
  ggplot(aes(y = `read count normalised` + 1, x = predicted_NB + 1)) + 
  geom_point() + 
  facet_wrap(~ ens_iso, scale="free") + 
  geom_abline(slope = 1, intercept=0) + scale_y_log10() + scale_x_log10() + my_theme

Plot errors in rank

fit_predicted %>%
    group_by(ens_iso, lambda) %>% 
    summarise(`error mean` = `error of log` %>% mean) %>%
    ungroup() %>%
    mutate(ens_iso = factor(ens_iso, levels= (.) %>% arrange(`error mean` %>% desc) %>% pull(ens_iso))) %>%
    ggplot(aes(x=ens_iso, y=`error mean`, color = lambda)) + geom_point()


stemangiola/RNAseq-noise-model documentation built on Oct. 18, 2019, 1:47 p.m.