inst/doc/GeneAccord.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  echo = TRUE,
  collapse = TRUE,
  comment = "#>"
)

## ---- eval=FALSE--------------------------------------------------------------
#  install.packages("devtools")
#  
#  # to install its dependencies
#  install.packages(c("biomaRt", "caTools", "dplyr", "ggplot2", "gtools", "ggpubr", "magrittr", "maxLik", "RColorBrewer", "reshape2", "tibble"))
#  
#  library(devtools)
#  
#  install_github("cbg-ethz/GeneAccord")

## -----------------------------------------------------------------------------
library(GeneAccord)

## -----------------------------------------------------------------------------
# defines the path to the external data directory
ext_data_dir <- system.file('extdata', package = 'GeneAccord')

# set the patient id
pat_id <- "50"

# find all csv-files from patient 50, which are in the clonal_genotypes subdirectory, with seeds 5, 10, 15, ..., 100
input_files_50 <- paste(ext_data_dir, "/clonal_genotypes/cloe_seed", seq(5, 100, by = 5), "/", pat_id, ".csv", sep = "")
# 'input_files_50' is now a vector with all 20 csv-files for this patient

## -----------------------------------------------------------------------------
head(read.csv(input_files_50[1]))

## -----------------------------------------------------------------------------
clone_tbl_50_all_trees <- create_tbl_tree_collection(input_files_50)


## -----------------------------------------------------------------------------
clone_tbl_50_all_trees

## -----------------------------------------------------------------------------
# print all 20 tree id's
unique(as.character(clone_tbl_50_all_trees$tree_id))

## -----------------------------------------------------------------------------
rates_clon_excl_50 <- compute_rates_clon_excl(clone_tbl_50_all_trees)

## -----------------------------------------------------------------------------
head(rates_clon_excl_50)

# average rate for patient 50 across trees
m_50 <- mean(rates_clon_excl_50)

## -----------------------------------------------------------------------------
# compute the histogram of how often gene pairs occur in the trees, and how often they are clonally exclusive
hist_of_num_trees_clon_excl_50 <- get_hist_clon_excl(clone_tbl_50_all_trees)

## -----------------------------------------------------------------------------
hist_of_num_trees_clon_excl_50[[1]]
hist_of_num_trees_clon_excl_50[[2]]

## ---- eval = F----------------------------------------------------------------
#  # set the patient identifiers
#  all_pat_ids <- c()
#  for(i in seq(1, 89)){
#    if(i %in% c(3, 34, 51, 53, 58, 65, 87)) {
#      next
#    }
#    all_pat_ids <- c(all_pat_ids, ifelse(i<10, paste0("0", i), as.character(i)))
#  }
#  
#  # this will be the vector with the average rates of clonal exclusivity of each patient
#  avg_rates_m <- c()
#  
#  # these will be the two lists with an entry for each patient with
#  # a histogram of how often pairs occur in the collection of trees
#  list_of_num_trees_all_pats <- list()
#  # a histogram of how ofthen pairs were clonally exclusive
#  list_of_clon_excl_all_pats <- list()
#  # this is the list of tibbles that will contain the gene-to-clone assignments from all patients from all trees
#  clone_tbl_all_pats_all_trees_list <- list()
#  
#  
#  # loop over all patients and prepare the input data
#  cnt <- 0
#  for (this_pat in all_pat_ids){
#    cnt <- cnt + 1
#  
#    # find all csv-files from current patient, which are in the clonal_genotypes subdirectory, with seeds 5, 10, 15, ..., 100
#    input_files_this_pat <- paste0(ext_data_dir, "/clonal_genotypes/cloe_seed", seq(5, 100, by = 5), "/", this_pat, ".csv")
#  
#    # here, there should be files from 20 different seeds
#    stopifnot(length(input_files_this_pat) == 20)
#  
#    # generate the tibble containing the gene-to-clone-assignments
#    clone_tbl_this_pat_all_trees <- create_tbl_tree_collection(input_files_this_pat)
#  
#    # compute the rates of clonal exclusivity
#    rates_clon_excl_this_pat <- compute_rates_clon_excl(clone_tbl_this_pat_all_trees)
#  
#    # average rate for current patient across trees
#    m_this_pat <- mean(rates_clon_excl_this_pat)
#  
#    # compute the histogram of how often gene pairs occur in the trees, and how often they are clonally exclusive
#    hist_of_num_trees_clon_excl_this_pat <- get_hist_clon_excl(clone_tbl_this_pat_all_trees)
#  
#    # save this information into named vectors/lists
#    avg_rates_m <- c(avg_rates_m, m_this_pat)
#    list_of_num_trees_all_pats[[cnt]] <- hist_of_num_trees_clon_excl_this_pat[[1]]
#    list_of_clon_excl_all_pats[[cnt]] <- hist_of_num_trees_clon_excl_this_pat[[2]]
#    names(avg_rates_m)[cnt] <- names(list_of_num_trees_all_pats)[cnt] <- names(list_of_clon_excl_all_pats)[cnt] <- this_pat
#    clone_tbl_all_pats_all_trees_list[[cnt]] <- clone_tbl_this_pat_all_trees
#  }
#  
#  # Now we aggregate the list of tibbles into one big tibble
#  clone_tbl_all_pats_all_trees <- do.call("rbind", clone_tbl_all_pats_all_trees_list)

## -----------------------------------------------------------------------------
# load the data as generated in step 1 from all 82 patients
data("clone_tbl_all_pats_all_trees")
data("avg_rates_m")
data("list_of_num_trees_all_pats")
data("list_of_clon_excl_all_pats")

## -----------------------------------------------------------------------------
# the tibble containing the gene-to-clone assignments from all patients and the whole collection of trees
print(clone_tbl_all_pats_all_trees)

## -----------------------------------------------------------------------------
# the average rates of clonal exclusivity from each patient
head(avg_rates_m)

## -----------------------------------------------------------------------------
# the histograms of how often pairs were mutated across the collection of trees - here printed for patient 02
list_of_num_trees_all_pats[[2]]

## -----------------------------------------------------------------------------
# the histograms of how often pairs were clonally exclusive across the collection of trees - here printed for patient 02
list_of_clon_excl_all_pats[[2]]

## -----------------------------------------------------------------------------
pairs_in_patients_hist(clone_tbl_all_pats_all_trees)

## -----------------------------------------------------------------------------
num_pat_pair_max <- 3

## -----------------------------------------------------------------------------
set.seed(1234)

ecdf_list <- generate_ecdf_test_stat(avg_rates_m, list_of_num_trees_all_pats, list_of_clon_excl_all_pats, num_pat_pair_max, num_pairs_sim = 100)

## -----------------------------------------------------------------------------
data("ecdf_list")

## -----------------------------------------------------------------------------
# perform the clonal exclusivity test
res_pairs <- GeneAccord(clone_tbl_all_pats_all_trees, avg_rates_m, ecdf_list)

## -----------------------------------------------------------------------------
suppressMessages(library(magrittr))
suppressMessages(library(dplyr))

# extract all pairs with an adjusted p-value less than 0.05
sig_pairs <- res_pairs %>% filter(qval < 0.05)

# print the top pairs
sig_pairs[order(sig_pairs$pval),]

## ---- eval = F----------------------------------------------------------------
#  # this will create a tibble with ensembl gene id's and corresponding hgnc gene symbols of the human genes
#  all_genes_tbl <- create_ensembl_gene_tbl_hg()

## -----------------------------------------------------------------------------
data("all_genes_tbl")

## -----------------------------------------------------------------------------
# get the hgnc symbols of the significant pairs
sig_pairs <- map_pairs_to_hgnc_symbols(sig_pairs, all_genes_tbl)

# print the top pairs - here just showing the hgnc gene symbols and not the ensembl gene id's 
sig_pairs[order(sig_pairs$pval),] %>% select(-entity_A, -entity_B, -num_patients)

## -----------------------------------------------------------------------------
# find out in which patients they occur and are clonally exclusive
sig_pairs <- take_pairs_and_get_patients(clone_tbl_all_pats_all_trees, sig_pairs)

# print the top pairs
sig_pairs[order(sig_pairs$pval),] %>% select(-entity_A, -entity_B, -mle_delta, -test_statistic, -num_patients)

## -----------------------------------------------------------------------------
# define output file
sig_pairs_tsv <- "GeneAccord_sig_pairs.tsv"

# save the results into the tsv-file
sig_pairs_table <- write_res_pairs_to_disk(sig_pairs, avg_rates_m, sig_pairs_tsv)

## ---- echo=FALSE, results='asis'----------------------------------------------
rownames(sig_pairs_table) <- NULL
knitr::kable(sig_pairs_table)

## ---- echo=F, message=F, results='hide'---------------------------------------
if(file.exists(sig_pairs_tsv)){
  suppressMessages(file.remove(sig_pairs_tsv))
}

## ---- fig.height=5, fig.width=12, fig.wide=TRUE-------------------------------
# plot the ECDF of the test statistic under the null
plot_ecdf_test_stat(ecdf_list)


## -----------------------------------------------------------------------------
res_pairs %>% select(num_patients) %>% group_by(num_patients) %>% tally()

## ---- fig.height=10, fig.width=5, fig.small=TRUE------------------------------
plot_rates_clon_excl(avg_rates_m, clone_tbl_all_pats_all_trees)

## ---- fig.width=8, fig.height=5-----------------------------------------------
# extract the patient id's in which the pairs are mutated
pairs_of_interest <- sig_pairs %>% filter(qval < 0.02)
pat_ids_of_interest <- unique(unlist(strsplit(as.character(pairs_of_interest$mutated_in), ";")))

# from the gene-to-clone tibble extract the data for one tree
this_tree_id <- 1
clone_tbl_all_pats_tree1 <- clone_tbl_all_pats_all_trees %>% 
                   filter(tree_id == this_tree_id, patient_id %in% pat_ids_of_interest) %>% 
                   select(-tree_id)

# plot the heatmap of clones and genes for the patients and genes of interest
heatmap_clones_gene_pat(pairs_of_interest, clone_tbl_all_pats_tree1, all_genes_tbl, first_clone_is_N = TRUE)


## -----------------------------------------------------------------------------
data("ensg_reactome_path_map")

## -----------------------------------------------------------------------------
clone_tbl_pat01_tree1 <- clone_tbl_all_pats_all_trees %>%
                            filter(patient_id == "01") %>%
                            filter(tree_id == 1) %>%
                            select(-tree_id)

clone_tbl_pat01_tree1_pw <- convert_ensembl_to_reactome_pw_tbl(clone_tbl_pat01_tree1, ensg_reactome_path_map)


clone_tbl_pat01_tree1_pw

## -----------------------------------------------------------------------------
# this tells GeneAccord that both cases should be tested: 
# pairs that tend to co-occur together in the same clone more often than expected,
# and pairs that tend to be clonally exclusive.
alternative <- "two.sided"

# to limit the number of tests done, the genes that should be
# tested can be pre-specified
# here, the genes VHL, PXDN
genes_of_interest <- c("ENSG00000134086", "ENSG00000130508")

# this tells GeneAccord that we only want to test pairs where both gene A AND gene B
# are from the 'genes of interest', so we're only testing the pair {VHL, PXDN}
AND_OR <- "AND"

# run GeneAccord with these options
res_pairs_ts <- suppressMessages(GeneAccord(clone_tbl_all_pats_all_trees, avg_rates_m, ecdf_list, alternative, genes_of_interest, AND_OR))

res_pairs_ts

## -----------------------------------------------------------------------------
AND_OR <- "OR"

# Then, to limit the number of tests performed, it is recommended to only specify a small
# number of genes
# here, only SART3
genes_of_interest <- c("ENSG00000075856")

# run GeneAccord testing all pairs that include the gene SART3
res_pairs_ts <- suppressMessages(GeneAccord(clone_tbl_all_pats_all_trees, avg_rates_m, ecdf_list, alternative, genes_of_interest, AND_OR))

res_pairs_ts

## -----------------------------------------------------------------------------
sessionInfo()

Try the GeneAccord package in your browser

Any scripts or data that you put into this service are public.

GeneAccord documentation built on Nov. 8, 2020, 8:04 p.m.