R/data_handling_functions.R

Defines functions pairs_in_patients_hist write_res_pairs_to_disk map_pairs_to_hgnc_symbols take_pairs_and_get_patients extract_num_clones_tbl merge_clones_identical_ents get_hist_clon_excl_this_pat_this_pair get_hist_clon_excl is_diff_branch_ent_pair get_rate_diff_branch_ent_pair compute_rates_clon_excl create_tbl_tree_collection create_tbl_ent_clones

Documented in compute_rates_clon_excl create_tbl_ent_clones create_tbl_tree_collection extract_num_clones_tbl get_hist_clon_excl get_hist_clon_excl_this_pat_this_pair get_rate_diff_branch_ent_pair is_diff_branch_ent_pair map_pairs_to_hgnc_symbols merge_clones_identical_ents pairs_in_patients_hist take_pairs_and_get_patients write_res_pairs_to_disk

#' Creates a tibble containing the information of which genes/pathways are 
#' altered in a patient in which clone.
#'
#' It expects a comma-separated table where the first column is the name 
#' of the altered gene or pathway. The other columns are for the clones in 
#' the respective tumor. Such a table can be generated with a tool that 
#' identifies clones in tumor samples, e.g. \code{Cloe}.
#'
#' The table is expected to be comma-separated and to have the columns 
#' 'altered_entity', 'clone1', 'clone2', ..., 'cloneN', depending on how 
#' many clones were detected in the respective tumor. Each row then contains
#' in the first column the name of the mutated gene or affected pathway, 
#' e.g. "ENSG00000134086", and in the other columns it has either zeros 
#' or ones, indicating in which clone the respective gene/pathway is altered.
#'
#' @title Get clone alteration tibble.
#' @param path_to_file The path to the file with the table of altered 
#' genes/pathways and their clone affiliation.
#' @param max_num_clones The upper bound for the number of clones that 
#' were found per tumor. Default: 7.
#' @author Ariane L. Moore
#' @return The tibble containing the information of which gene/pathway is 
#' altered in which clone in a patient. Has the columns 'file_name', 
#' 'patient_id', 'altered_entity', 'clone1', 'clone2', ... up to the maximal 
#' number of clones (Default: until 'clone7'). Note that the labelling 
#' of the clones does not matter and only needs to stay fixed within each 
#' patient and tree inference.
#' @export
#' @importFrom tibble tibble add_column
#' @importFrom dplyr is.tbl
#' @importFrom magrittr "%>%"
#' @importFrom utils tail read.csv
#' @examples
#' ext_data_dir <- system.file('extdata', package='GeneAccord')
#' create_tbl_ent_clones(paste(ext_data_dir, 
#'     "/clonal_genotypes/cloe_seed5/01.csv", sep=""))
create_tbl_ent_clones <- function(path_to_file, max_num_clones=7){
    stopifnot(is.character(path_to_file))
    stopifnot(is.numeric(max_num_clones))
    stopifnot(file.exists(path_to_file))

    message("Found the file ", path_to_file)

    ## create tibble
    file_name_tibble <- paste(tail(unlist(strsplit(path_to_file, "/")), n=2), 
        collapse="/")
    this_csv <- read.csv(path_to_file)
    all_ents <- as.character(as.vector(this_csv[,1]))
    this_clone_tbl <- tibble(file_name=file_name_tibble,
        patient_id=sub(".csv", "", 
        basename(path_to_file)),
        altered_entity=all_ents)
    num_mutated_ents <- dim(this_clone_tbl)[1]
    ## minus the columns for file_name, patient_id, and altered_entity
    num_clones <- dim(this_csv)[2] - 1
    ## now add the columns with the clone assignments
    for(i in seq_len(num_clones)){
        clone_name <- paste("clone", i, sep="")
        this_clone_tbl <- this_clone_tbl %>%
            add_column(new_col=as.integer(this_csv[,(i+1)]))
        names(this_clone_tbl)[names(this_clone_tbl) == "new_col"] <- 
            clone_name
    }
    stopifnot(num_clones <= max_num_clones)
    ## to make sure that all tibbles from all patients contain the same 
    ## number of
    ## clones, pseudo-columns with only zeros are added if the number of
    ## clones
    ## is less than 'max_num_clones'
    if(num_clones < max_num_clones){
        next_clone <- num_clones+1
        for(i in seq(next_clone, max_num_clones)){
            clone_name <- paste("clone", i, sep="")
            this_clone_tbl <- this_clone_tbl %>%
                add_column(new_col=as.integer(rep(0, 
                    num_mutated_ents)))
            names(this_clone_tbl)[names(this_clone_tbl) == "new_col"] <-
                clone_name
        }
    }
    
    final_clone_tbl <- this_clone_tbl

    stopifnot(is.tbl(final_clone_tbl))
    num_entries <- dim(final_clone_tbl)[1]
    num_clones <- dim(final_clone_tbl)[2] - 3
    stopifnot(num_clones == max_num_clones)
    message("Found ", num_entries, " altered genes/pathways in total.")

    return(final_clone_tbl)
}



#' Read in the patient's gene-to-clone assignment across a collection of 
#' trees
#'
#' Creates a tibble containing the information of which genes/pathways are 
#' altered in which clone in a patient across a collection of tree inferences. 
#' It expects a list containing the paths to the comma-separated tables where 
#' the first column is the name of the altered gene or pathway. The other 
#' columns are for the clones in the respective tumor. Such tables can be
#' generated by repeatedly performing the phylogenetic tree inference with 
#' e.g. the package \code{Cloe}, or by sampling from the posterior. The 
#' tables are expected to be comma-separated and to have the columns 
#' 'altered_entity', 'clone1', 'clone2', ..., 'cloneN', depending on how many
#'  clones were detected in the respective tumor. Each row then contains in 
#'  the first column the name of the mutated gene or affected pathway, e.g. 
#'  "ENSG00000134086", and in the other columns it has either zeros or ones, 
#'  indicating in which clone the respective gene/pathway is altered.
#' 
#' @title Get clone alteration tibble across the collection of trees.
#' @param input_files A vector containing the paths to the files with the 
#' tables of altered genes/pathways and their clone affiliation from
#' the collection of tree inferences. 
#' @param no_noisy_ents Minimum fraction for genes/pathways of how often they 
#' have to occur across the collection of trees in order to be in the
#' tibble. This makes sure that noisy genes, which were not assigned to many 
#' trees are excluded. Default: 0.9.
#' @param max_num_clones The upper bound for the number of clones that were 
#' found per tumor. Default: 7.
#' @author Ariane L. Moore
#' @return A clean tibble with the information of which gene/pathway is 
#' altered in which clone in the patient, and with an entry for each tree 
#' inference where it occurred. Has the columns 'file_name', 'patient_id', 
#' 'altered_entity', 'clone1', 'clone2', ... up to the maximal number of 
#' clones (Default: until 'clone7'), and 'tree_id' as an indication in which 
#' tree the assignment was found. Note that the labelling of the clones does 
#' not matter and only needs to stay fixed within each patient and 
#' tree inference. 
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr select filter mutate bind_rows tally group_by pull is.tbl
#' @examples
#' ext_data_dir <- system.file('extdata', package='GeneAccord')
#' this_patient <- "01"
#' input_files_01 <- paste(ext_data_dir, 
#'     "/clonal_genotypes/cloe_seed", seq(5, 100, by=5), 
#'     "/", this_patient, ".csv", sep="") 
#' create_tbl_tree_collection(input_files_01)
create_tbl_tree_collection <- function(input_files, no_noisy_ents=0.9, 
    max_num_clones=7){
    altered_entity <- n <- NULL
    stopifnot(is.vector(input_files))
    stopifnot(is.numeric(no_noisy_ents))
    stopifnot(is.numeric(max_num_clones))
    sanity_check <- lapply(input_files, 
        function(x){stopifnot(is.character(x)); 
    stopifnot(file.exists(x))})
  
    num_files <- length(input_files)
    message("Found a collection of tree inferences of ", 
        num_files, " input files.")
  
    ## go over the files, create a tibble with the additional information 
    ## from which tree collection the tbl is
    all_tbl_list <- list()
    for(i in seq_len(num_files)){
        this_tbl <- suppressMessages(create_tbl_ent_clones(input_files[i], 
            max_num_clones=max_num_clones))
       all_tbl_list[[i]] <- this_tbl %>% mutate(tree_id=i)
    }
    ## this is now the clone tibble from just that patient but 
    ## from all tree inferences
    all_trees_clone_tbl <- bind_rows(all_tbl_list)
  
    ## now exclude genes that occur in less than 90% of the trees, 
    ## e.g. less than 18 of 20
    all_ents <- unique(all_trees_clone_tbl$altered_entity)
    histogram_ents <- all_trees_clone_tbl %>% 
        select(altered_entity) %>%
        group_by(altered_entity) %>% 
        tally()
    noisy_ent_thresh <- no_noisy_ents*num_files
    ## get a vector with all genes that are stably assigned to the trees
    stable_ents <- histogram_ents %>% 
        filter(n >= noisy_ent_thresh) %>% 
        pull(altered_entity)
    
    ## filter such that only those are included
    all_trees_clone_tbl <- all_trees_clone_tbl %>% 
        filter(altered_entity %in% stable_ents)
    num_genes_filtered_out <- length(all_ents)-length(stable_ents)
    if (num_genes_filtered_out == 1){
        message("Removed ", num_genes_filtered_out, 
        " gene that is missing in more than ", 
        (num_files-noisy_ent_thresh) ," of gene-to-clone assignments.")
    } else {
        message("Removed ", num_genes_filtered_out, 
        " genes that are missing in more than ", 
        (num_files-noisy_ent_thresh) ," of gene-to-clone assignments.")
    }
  
    stopifnot(is.tbl(all_trees_clone_tbl))
    num_entries <- dim(all_trees_clone_tbl)[1]
    num_clones <- dim(all_trees_clone_tbl)[2] - 4
    stopifnot(num_clones == max_num_clones)
    message("Found ", num_entries, 
        " altered genes/pathways in total, which are from ", 
        num_files," gene-to-clone assignments.")
  
    return(all_trees_clone_tbl)
}



#' Compute the clonal exclusivity rates for each gene-to-clone-assignment 
#' from the collection of tree inferences.
#'
#' Takes the gene-to-clone assignment tibble as created with 
#' \code{\link{create_tbl_tree_collection}} and computes for each instance 
#' from the collection of trees the rate of clonal exclusivity. This rate 
#' is the fraction of gene/pathway pairs that were on a different branch
#' in the tumor phylogeny, i.e. the fraction of pairs that was clonally 
#' exclusive.
#' 
#' @title Get rates of clonal exclusivity for each tree inference
#' @param pat_tbl A tibble with the information of which gene/pathway is 
#' altered in which clone in the patient, and including this information
#' from the collection of trees. Can be created with with 
#' \code{\link{create_tbl_tree_collection}}. 
#' @author Ariane L. Moore
#' @return A vector with all rates of clonal exclusivity from all tree 
#' inferences.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl filter select
#' @importFrom stats median
#' @examples
#' clone_tbl <- dplyr::tibble(file_name =
#'     rep("fn1", 10),
#'     "patient_id"=rep("pat1", 10),
#'     "altered_entity"=paste0("gene", 
#'     LETTERS[seq_len(10)]),
#'     "clone1"=c(0, 1, 0, 1, 0, 1, 0, 1, 1, 1),
#'     "clone2"=c(1, 0, 1, 0, 1, 1, 1, 0, 0, 1),
#'     "tree_id"=c(rep(1, 5), rep(2, 5)))
#' compute_rates_clon_excl(clone_tbl)
compute_rates_clon_excl <- function(pat_tbl){
    tree_id <- file_name <- patient_id <- NULL
    stopifnot(is.tbl(pat_tbl))
    stopifnot("patient_id" %in% colnames(pat_tbl))
    stopifnot("file_name" %in% colnames(pat_tbl))
    stopifnot("altered_entity" %in% colnames(pat_tbl))
    stopifnot("clone1" %in% colnames(pat_tbl))
    stopifnot("clone2" %in% colnames(pat_tbl))
    stopifnot("tree_id" %in% colnames(pat_tbl))
  
    ## extract patient id
    pat_id <- unique(pat_tbl$patient_id)
    stopifnot(length(pat_id) == 1)
  
    ## extract how many different instances are in the collection 
    ## of trees
    all_trees <- unique(pat_tbl$tree_id)
    num_trees <- length(all_trees)
    message("Found ", num_trees, 
    " different tree instances in the gene-to-clone tibble from patient ", 
    pat_id, ".")
    message("Will compute the rate of clonal exclusivity for",
    " each of these...")
  
    ## compute the rate of clonal exclusivity for each tree inference
    all_rates_across_tree_collection<-vapply(all_trees, function(this_tree){
        this_tree_pat_tbl_wo_pat_name <- pat_tbl %>%
            filter(tree_id == this_tree) %>%
            select(-file_name, -patient_id, -tree_id)
        this_tree_rate_this_pat <- 
            suppressMessages(
            get_rate_diff_branch_ent_pair(this_tree_pat_tbl_wo_pat_name))
        return(this_tree_rate_this_pat)
    }, numeric(1))
    
    message("...done")
    mean_rate <- mean(all_rates_across_tree_collection)
    median_rate <- median(all_rates_across_tree_collection)
    rate_is_zero <- length(which(all_rates_across_tree_collection == 0))
    message("Current patient ", pat_id, " has mean rate ", 
        round(mean_rate, digits=3), 
        ", median rate " , round(median_rate, digits=3), 
        ", and the rate is zero in ", rate_is_zero, " out of ", 
        num_trees, " trees.")
  
    names(all_rates_across_tree_collection) <- rep(pat_id, num_trees)

    return(all_rates_across_tree_collection)
}



#' Compute the rate of mutated gene/pathway pairs being in different 
#' branches.
#'
#' Given the output of a tool that identifies clones within tumors and their 
#' phylogenetic history, this function computes the rate of mutated 
#' gene/pathway pairs being in different branches. That is, it will calculate 
#' the number of times mutated gene/pathway pairs are in different 
#' branches/clones divided by the total number of all mutated gene/pathway 
#' pairs.
#'
#' @title Compute rate of being in different branches/clones.
#' @param clone_tbl A tibble containing the columns 'altered_entity', and 
#' then a column for each clone
#' @author Ariane L. Moore
#' in the tumor, e.g. 'clone1', 'clone2', 'clone3'. This tibble can be 
#' generated e.g. from the \code{Cloe} output.
#' @return The rate of occurrence of mutated gene/pathway pairs being in 
#' different clones.
#' @export
#' @importFrom dplyr is.tbl tibble
#' @importFrom caTools combs
#' @examples
#' clone_tbl <- dplyr::tibble(
#'     altered_entity=c(paste("gene", seq_len(10), sep="")),
#'     clone1=c(rep(0,10)),
#'     clone2=c(sample(c(0,1), 10, replace=TRUE)),
#'     clone3=c(sample(c(0,1), 10, replace=TRUE)),
#'     clone4=c(sample(c(0,1), 10, replace=TRUE)))
#' get_rate_diff_branch_ent_pair(clone_tbl)
get_rate_diff_branch_ent_pair <- function(clone_tbl) {
    altered_entity <- NULL
    stopifnot(is.tbl(clone_tbl))
    stopifnot("altered_entity" %in% colnames(clone_tbl))
    stopifnot("clone1" %in% colnames(clone_tbl))
    if("patient_id" %in% colnames(clone_tbl) || 
       "file_name" %in% colnames(clone_tbl) || 
       "tree_id" %in% colnames(clone_tbl)){
       stop("The clone tibble is expected to have only the",
       " altered_entity\n",
       "and clones columns, but not file_name, patient_id, or tree_id!")
    }
  
    ## get all genes/pathways, here referred to as entity (ent)
    all_ents <- unique(as.character(clone_tbl$altered_entity))
    num_ents <- length(all_ents)
    num_entries <- dim(clone_tbl)[1]
    stopifnot(num_entries == num_ents)
    if(num_ents == 1){
        message("Found only 1 mutated gene/pathway in the provided",
            " tibble.",
            " The rate of being on a different branch cannot be determined",
            ", but will be set to 0")
        return(0)
    } else {
        message("Found ", num_ents, 
        " different mutated genes/pathways in the provided tibble.")
    
        num_clones <- dim(clone_tbl)[2]-1  
        ## one column is the gene/pathway name
        message("Found ", num_clones, 
            " different clones in the provided tibble.")
    
        ## get the number of pairs
        num_pairs <- num_ents*(num_ents-1)/2
        all_pairs <- combs(all_ents, 2)
        stopifnot(dim(all_pairs)[1] == num_pairs)
    
        ## then use apply on the pairs with a function that takes two 
        ## genes/pathways, and the clone tibble which invokes 
        ## is_diff_branch_ent_pair(ent1, ent2, clone_tbl)
        ## and returns whether or not the pair is in different 
        ## branches/clones (TRUE or FALSE)
        all_pairs_branch_info <- apply(all_pairs, 1, function(x){
            suppressMessages(is_diff_branch_ent_pair(x[1], 
                                                     x[2], 
                                                     clone_tbl))})
        stopifnot(is.logical(all_pairs_branch_info))
    
        ## get the number and rate of pairs being in 
        ## different branches/clones
        num_pairs_diff_branch <- sum(unlist(all_pairs_branch_info))
        rate_pairs_diff_branch <- num_pairs_diff_branch/num_pairs
        stopifnot(rate_pairs_diff_branch >= 0 && 
            rate_pairs_diff_branch <= 1)
      
        ## message to user
        message("Of ", num_pairs, " total gene/pathway pairs, ", 
            num_pairs_diff_branch, " occur in ",
            "different branches/clones of the tumor.")
        return(rate_pairs_diff_branch)
    }
}



#' Check whether a given pair of mutated genes/pathways is in 
#' different branches/clones.
#'
#' Given two mutated genes or pathways and the clone tibble as 
#' described in \code{\link{get_rate_diff_branch_ent_pair}}, 
#' this function returns \code{TRUE} or \code{FALSE} for whether 
#' the pair is mutated in different branches/clones.
#'
#' @title Check whether pair is in different branches/clones.
#' @param ent1 One mutated gene/pathway from the pair.
#' @param ent2 The other mutated gene/pathway from the pair.
#' @param clone_tbl A tibble containing the columns 
#' 'altered_entity', and then a column for each clone
#' in the tumor, e.g. 'clone1', 'clone2', 'clone3'. This tibble 
#' can be generated e.g. from the \code{Cloe} output.
#' @author Ariane L. Moore
#' @return \code{TRUE} or \code{FALSE} for whether or not the 
#' pair is mutated in different clones/in different
#' branches of the tree.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl filter select
#' @examples
#' clone_tbl <- dplyr::tibble(
#'     altered_entity=c(paste("gene", seq_len(10), sep="")),
#'     clone1=c(rep(0,10)),
#'     clone2=c(sample(c(0,1), 10, replace=TRUE)),
#'     clone3=c(sample(c(0,1), 10, replace=TRUE)),
#'     clone4=c(sample(c(0,1), 10, replace=TRUE)))
#' is_diff_branch_ent_pair("gene1", "gene2", clone_tbl)
is_diff_branch_ent_pair <- function(ent1, ent2, clone_tbl) {
    altered_entity <- NULL
    stopifnot(is.character(ent1))
    stopifnot(is.character(ent2))
    stopifnot(is.tbl(clone_tbl))
    stopifnot("altered_entity" %in% colnames(clone_tbl))
    stopifnot("clone1" %in% colnames(clone_tbl))
    if("patient_id" %in% colnames(clone_tbl) || 
    "file_name" %in% colnames(clone_tbl) || 
    "tree_id" %in% colnames(clone_tbl)){
        stop("The clone tibble is expected to have only",
        " the altered_entity and clones columns, but not file_name,",
        " patient_id, or tree_id!")
    }
  
    ## extract the clone profiles of the two entities
    ent1_profile_tbl <- clone_tbl %>% 
        filter(altered_entity == ent1) %>% 
        select(-altered_entity)
    ent2_profile_tbl <- clone_tbl %>% 
        filter(altered_entity == ent2) %>% 
        select(-altered_entity)
  
    ## turn into a vector
    ent1_profile <- as.numeric(as.vector(unlist(ent1_profile_tbl[1,])))
    ent2_profile <- as.numeric(as.vector(unlist(ent2_profile_tbl[1,])))
  
    ## sanity check: make sure the rows with ent1 and ent2 exist
    if(is.na(ent1_profile)[1] || is.na(ent2_profile)[1]){
        warning("At least one of the two entities ", 
        ent1, " and ", ent2, 
        " is not in the provided tibble.")
        return(FALSE)
    } else {
    
        ## the product of these two profiles will only have ones
        ## where both occur in the same clone
        ## i.e. sum of this will count how many clones they share
        num_shared_clones <- sum(ent1_profile*ent2_profile)
        message("The pair ", ent1, " and ", ent2, 
            " share ", num_shared_clones, 
            " clone(s).")
    
        return(ifelse(num_shared_clones > 0, FALSE, TRUE))
    }
}



#' Compute all values of how often gene pairs were clonally 
#' exclusive/all trees for a patient.
#' 
#' It computes a histogram of the following two values: Amomg 
#' all gene/pathway pairs in a patient, the number of trees in 
#' which the both entities of a pair are assigned to a clone 
#' at all, and the number of trees in which the pair is clonally 
#' exclusive.
#' 
#' @title Compute all values of how often gene pairs were 
#' clonally exclusive across all trees for a patient.
#' @param clone_tbl A tibble containing the columns 
#' 'altered_entity', and then a column for each clone
#' in the tumor, e.g. 'clone1', 'clone2', 'clone3'. It also 
#' contains the column 'tree_id', which specifies
#' which tree of the collection of tree inferences was used. 
#' This tibble can be generated e.g. from the \code{Cloe} output.
#' @author Ariane L. Moore
#' @return A list with two vectors: The numbers of how often 
#' gene pairs were mutated across trees, and the numbers of
#' how often they were clonally exclusive. The order of these 
#' two vectors is matching, i.e. the ith entry in each
#' vector refers to the same gene pair.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom caTools combs
#' @importFrom dplyr is.tbl select tibble
#' @importFrom stats median
#' @examples
#' clone_tbl <- dplyr::tibble(
#'     altered_entity=c(paste("gene", seq_len(10), sep="")),
#'     clone1=c(rep(0,10)),
#'     clone2=c(sample(c(0,1), 10, replace=TRUE)),
#'     clone3=c(sample(c(0,1), 10, replace=TRUE)),
#'     clone4=c(sample(c(0,1), 10, replace=TRUE)),
#'     tree_id=c(rep(5, 5), rep(10, 5)) )
#' get_hist_clon_excl(clone_tbl)
get_hist_clon_excl <- function(clone_tbl) {
    altered_entity <- tree_id <- patient_id <- 
        file_name <- NULL
    stopifnot(is.tbl(clone_tbl))
    stopifnot("altered_entity" %in% colnames(clone_tbl))
    stopifnot("clone1" %in% colnames(clone_tbl))
    stopifnot("tree_id" %in% colnames(clone_tbl))
  
    ## extract number of tree instances
    all_trees <- unique(clone_tbl$tree_id)
    num_trees <- length(all_trees)
  
    ## get the histogram of how often pairs were clonally 
    ## exclusive/all trees
    ## get all genes/pathways, here referred to as entity (ent)
    all_ents <- unique(as.character(clone_tbl$altered_entity))
    num_ents <- length(all_ents)
  
    pat_id<-""
    if("patient_id" %in% colnames(clone_tbl)){
        ## extract patient id
        pat_id <- unique(clone_tbl$patient_id)
        clone_tbl <- clone_tbl %>% select(-patient_id)
    
        message("Found ", num_trees, 
        " different tree instances in the gene-to-clone tibble",
        " from patient ", 
        pat_id, ", ",
        "and ", num_ents, " different mutated genes/pathways in total.")
        stopifnot(length(pat_id) == 1)
    } else {
        message("Found ", num_trees, 
        " different tree instances in the provided gene-to-clone tibble, ",
        "and ", num_ents, " different mutated genes/pathways in total.")
    }
    if("file_name" %in% colnames(clone_tbl)){
        clone_tbl <- clone_tbl %>% select(-file_name)
    }
 
    message("Will compute the histograms for all pairs of how often ",
    "they occur across tree instances, ",
    "and how often they are clonally exclusive...")
  
    # get the number of pairs
    num_pairs <- num_ents*(num_ents-1)/2
    all_pairs <- combs(all_ents, 2)
    stopifnot(dim(all_pairs)[1] == num_pairs)
  
    ## then use apply on the pairs with a function that takes two 
    ## genes/pathways, and the clone tibble which invokes 
    ## get_hist_clon_excl_this_pat_this_pair(ent1, ent2, clone_tbl)
    ## and returns in how many trees they were both mutated and how 
    ## often clonally exclusive
    all_pairs_clon_excl_over_num_trees_info <- 
        apply(all_pairs, 1, function(x){
            this_res_num_trees_num_clon<-
            suppressMessages(get_hist_clon_excl_this_pat_this_pair(x[1], 
                x[2], 
                clone_tbl))
            return(this_res_num_trees_num_clon)}
        )
    message("...done")
    stopifnot(is.matrix(all_pairs_clon_excl_over_num_trees_info))
    stopifnot(dim(all_pairs_clon_excl_over_num_trees_info)[1] == 2)
    stopifnot(dim(all_pairs_clon_excl_over_num_trees_info)[2] == num_pairs)
  
    num_trees_pair_mutated <- all_pairs_clon_excl_over_num_trees_info[1,]
    num_times_clonal_excl <- all_pairs_clon_excl_over_num_trees_info[2,]
  
    ## it can be that there are also zeros in the num_trees_pair_mutated, 
    ## because we compute all pairs by taking all ents from all tree 
    ## inferences. It can be that one gene is only in one tree, and 
    ## the other only in a different tree. Then for this pair, the 
    ## num_trees_pair would be zero. We do not want to include
    ## pairs who are never mutated together in a tree. Hence, we exclude 
    ## those
    idx_pairs_zero_times <- which(num_trees_pair_mutated == 0)
    if(length(idx_pairs_zero_times) > 0){
        num_trees_pair_mutated <- 
            num_trees_pair_mutated[-idx_pairs_zero_times]
        num_times_clonal_excl <- 
            num_times_clonal_excl[-idx_pairs_zero_times]
    }
    if(length(num_trees_pair_mutated) == 0 || 
    length(num_times_clonal_excl) == 0){
       message("It seems like there was no gene/pathway pair at all",
       " that occurs in the same gene-to-clone assignment.",
       " Please make sure that you have at least two ",
       "genes/pathways assigned to clones in at least",
       " one of the tree instances.")
       return(list(0, 0))
    } else {
  
        ## message to user
        message("The median value for how often a gene pair",
        " occurs across the trees is ", median(num_trees_pair_mutated), 
        ", and ",
        median(num_times_clonal_excl) , 
        " is the median value for the pairs being clonally exclusive.")
        return(list(num_trees_pair_mutated, num_times_clonal_excl))
    }
}


#' Check for a pair how often it was mutated in the current patient 
#' across trees, and how often also clonally exclusive.
#'
#' @title Check for a pair how often it was mutated in the current 
#' patient across trees, and how often also clonally exclusive.
#' @param entA One gene/pathway of the pair
#' @param entB The other gene/pathway of the pair
#' @param clone_tbl A tibble containing the columns 'altered_entity', 
#' and then a column for each clone
#' in the tumor, e.g. 'clone1', 'clone2', 'clone3'. It also contains 
#' the column 'tree_id', which specifies
#' which tree of the collection of tree inferences was used. This 
#' tibble can be generated e.g. from the \code{Cloe} output.
#' @author Ariane L. Moore
#' @return A vector with the values of in how many trees the pair 
#' was mutated, and in how many of those it was clonally exclusive.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr is.tbl filter select tibble
#' @examples
#' clone_tbl <- dplyr::tibble(
#'     altered_entity=c(paste("gene", seq_len(10), sep="")),
#'     clone1=c(rep(0,10)),
#'     clone2=c(sample(c(0,1), 10, replace=TRUE)),
#'     clone3=c(sample(c(0,1), 10, replace=TRUE)),
#'     clone4=c(sample(c(0,1), 10, replace=TRUE)),
#'     tree_id=c(rep(5, 5), rep(10, 5)) )
#' get_hist_clon_excl_this_pat_this_pair("gene1", "gene2", clone_tbl)
get_hist_clon_excl_this_pat_this_pair<-function(entA, entB, clone_tbl){
    altered_entity <- tree_id <- NULL
    stopifnot(is.character(entA))
    stopifnot(is.character(entB))
    stopifnot(is.tbl(clone_tbl))
    stopifnot("altered_entity" %in% colnames(clone_tbl))
    stopifnot("clone1" %in% colnames(clone_tbl))
    stopifnot("tree_id" %in% colnames(clone_tbl))
    if("patient_id" %in% colnames(clone_tbl) || 
    "file_name" %in% colnames(clone_tbl)){
        stop("The clone tibble is expected to have only the"," 
        altered_entity, tree_id, and clones columns, but not file_name, ",
        "or patient_id!")
    }
              
    for(this_col in colnames(clone_tbl)){ # sanity check
        if(this_col != "altered_entity" && 
        !grepl("clone", this_col) && this_col != "tree_id"){
            ## if there still is a column in this_clone_tbl that is not 
            ## either a clone or an altered entity, or the tree_id
            ## this will be problematic, because it will be counted as 
            ## a clone and hence may 
            ## lead to erroneous numbers of clonal exclusivity
            warning("The column ", this_col, 
            " in the clone tibble may be accidentally counted as a clone.")
        }
    }
  
    ## get the number of trees
    all_tree_ids <- unique(as.character(clone_tbl$tree_id))
    num_trees <- length(all_tree_ids)
    message("Found ", num_trees, 
    " different tree ids in the clone tbl.")
              
    ## get the info of num_trees and clon excl
    this_pair_across_trees <- vapply(all_tree_ids, function(x){
        this_tree_clone_tbl <- clone_tbl %>% 
            filter(tree_id == x) %>% 
            select(-tree_id)
        all_ents_this_tree <- 
            as.character(this_tree_clone_tbl$altered_entity)
        if(entA %in% all_ents_this_tree && 
        entB %in% all_ents_this_tree){
            this_pair_occurs <- 1
        this_pair_clon_excl <- 
            sum(suppressMessages(is_diff_branch_ent_pair(entA, 
                entB, 
                this_tree_clone_tbl)))
        } else {
        this_pair_occurs <- this_pair_clon_excl <- 0
        }
        return(c(this_pair_occurs, this_pair_clon_excl))
    }, numeric(2))
    ## the columns are the info for each tree_id: occurs?, clon excl?
    stopifnot(dim(this_pair_across_trees)[2] == num_trees)
    stopifnot(dim(this_pair_across_trees)[1] == 2)
    
    num_trees_clon_excl <- apply(this_pair_across_trees, 1, sum)
    num_trees_occurs <- num_trees_clon_excl[1]
    num_clon_excl <- num_trees_clon_excl[2]
    stopifnot(num_clon_excl <= num_trees_occurs)
    stopifnot(num_trees_occurs <= num_trees)
    stopifnot(num_trees_occurs >= 0 && num_clon_excl >= 0)
    
    return(c(num_trees_occurs, num_clon_excl))
}




#' Merge clone profile of identical entities in clone tibble from 
#' one patient
#'
#' Given a clone tibble as created with \code{\link{create_tbl_ent_clones}} 
#' from one patient and where the entities were possibly mapped from genes 
#' to pathways, this function checks whether there were several entities 
#' mapped to the same new entity. If so, the clone profile will be merged. 
#' This can be the case, for instance, of two mutated genes are in the same 
#' pathway(s).
#'
#' @title Merge identical entities in clone tibble from one patient
#' @param clone_tbl The clone tibble as generated with 
#' \code{\link{create_tbl_ent_clones}} from one patient.
#' @author Ariane L. Moore
#' @return The same tibble but in case there were several identical 
#' genes/pathways in the same patient with
#' different clone profiles, their profile will be merged together. This 
#' can happen if, e.g. two genes with different clone profiles are in the 
#' same pathway. When mapping them to the pathways, there will be two 
#' identical 'altered_entities' with different clone profiles. These 
#' profiles would be merged by this function because the pathway is 
#' affected in the union of clones were the two genes were mutated.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl filter select as.tbl
#' @examples
#' clone_tbl <- dplyr::tibble(
#'     file_name=c(rep("fn1", 4)),
#'     patient_id=c(rep("pat1", 4)),
#'     altered_entity=c("pw1", "pw1", 
#'                        "pw2", "pw3"),
#'     clone1=c(1, 0, 1, 0),
#'     clone2=c(0, 1, 0, 1),
#'     clone3=c(1, 1, 0, 1),
#'     clone4=c(0, 1, 0, 0))
#' merge_clones_identical_ents(clone_tbl)
merge_clones_identical_ents <- function(clone_tbl) {
    patient_id <- file_name <- altered_entity <- NULL
    stopifnot(is.tbl(clone_tbl))
    stopifnot("patient_id" %in% colnames(clone_tbl))
    stopifnot("file_name" %in% colnames(clone_tbl))
    stopifnot("altered_entity" %in% colnames(clone_tbl))
    stopifnot("clone1" %in% colnames(clone_tbl))
    stopifnot("clone2" %in% colnames(clone_tbl))
    if("tree_id" %in% colnames(clone_tbl)){
        stop("The clone tibble is expected to have only",
        " the altered_entity, patient_id, file_name, ",
        "and clones columns, but not tree_id!")
    }
  
    ## make sure that this is just from one patient
    this_pat <- unique(as.character(clone_tbl$patient_id))
    this_file_name <- unique(as.character(clone_tbl$file_name))
    stopifnot(length(this_pat) == 1)
    
    ## extract information from tibble
    num_entries <- dim(clone_tbl)[1]
    num_clones <- dim(clone_tbl)[2] - 3
  
    ## message to user
    message("The clone tibble from patient ", 
        this_pat," contains ", num_entries, " entries.")
    message("There are ", num_clones, 
        " different clones specified.")

    ## check whether there are identical mutated_entities
    all_sorted_ents <- sort(as.character(clone_tbl$altered_entity))
    all_sorted_unique_ents <- unique(all_sorted_ents)
    num_all_ents <- length(all_sorted_ents)
    stopifnot(num_all_ents == num_entries)
    num_unique_ents <- length(all_sorted_unique_ents)
    message("There are ", num_unique_ents, 
        " different mutated genes/pathways in the tibble.")
   
    ## and if so, merge them together
    merged_ents_clone_tbl_list <- lapply(all_sorted_unique_ents, 
        function(x){
        ## extract the tibble with just the clone profiles for 
        ## the current entity
        this_ent_tbl <- clone_tbl %>% 
            filter(altered_entity == x) %>%
            select(-file_name, -patient_id, -altered_entity)
        ## make sure that the clone indicators 0 and 1 are 
        ## integers and not factors
        this_ent_tbl <- apply(this_ent_tbl[, seq_len(num_clones)], 2, 
            function(y){as.numeric(as.character(y))})
        ## this is to make sure that the sum over columns works in 
        ## case there is just one row
        this_ent_tbl <- rbind(this_ent_tbl, c(rep(0, num_clones))) 
        ## sum up the rows, i.e. the clone profiles
        merged_clone_profile <- 
            ifelse(apply(this_ent_tbl, 2, sum) > 0, 1, 0)
        stopifnot(length(merged_clone_profile) == num_clones)
        
        # convert the merged_clone_profile to a dataframe
        merged_clone_profile_df<-as.data.frame(t(merged_clone_profile))
        
        ## create the merged tibble
        ## the clone columns are now double instead of factor
        this_ent_merged_tbl <- as.data.frame(cbind(
            file_name=this_file_name,
            patient_id=this_pat,
            altered_entity=x,
            merged_clone_profile_df))
        
        ## in order to use bind_rows, we have to make sure that 
        ## the altered ent is a factor with all levels from all 
        ## patients
        this_ent_merged_tbl$altered_entity <-
            factor(this_ent_merged_tbl$altered_entity, 
            levels = all_sorted_unique_ents)
        
        ## return the final merged tibble for the current entity
        this_ent_merged_tbl
    })
  
    ## append the list of tibbles to one big tibble
    final_merged_clone_tbl <- 
        as.tbl(bind_rows(merged_ents_clone_tbl_list))
  
    stopifnot(is.tbl(final_merged_clone_tbl))
    num_entries <- dim(final_merged_clone_tbl)[1]
    num_cols <- dim(final_merged_clone_tbl)[2]
    stopifnot(num_cols == (num_clones + 3))
    message("Now there are ", num_entries, 
        " different mutated genes/pathways from patient ",
        this_pat, ".")
    stopifnot(num_entries == num_unique_ents)
    return(final_merged_clone_tbl)
}
  
  


#' Extract number of clones in each patient.
#'
#' Given a clone tibble as created with 
#' \code{\link{create_tbl_ent_clones}} this function 
#' extracts the information, how many clones there are 
#' in each patient. The counted clones will be those 
#' with at least one non-zero entry, i.e.
#' at least one gene/pathway assigned to the clone.
#'
#' @title Extract number of clones.
#' @param clone_tbl The tibble generated with 
#'  \code{\link{create_tbl_ent_clones}}.
#' @author Ariane L. Moore
#' @return A named vector with the number of clones 
#' in each patient. The name of each element is the respective
#' patient_id.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl select distinct pull filter
#' @examples
#' clone_tbl <- dplyr::tibble(
#'     file_name=c(rep("fn1", 2), rep("fn2", 2)),
#'     patient_id=c(rep("pat1", 2), rep("pat2", 2)),
#'     altered_entity=c("pw1", "pw2", "pw1", "pw3"),
#'     clone1=c(0, 0, 0, 0),
#'     clone2=c(0, 1, 0, 1),
#'     clone3=c(1, 1, 0, 1),
#'     clone4=c(1, 0, 0, 0))
#' extract_num_clones_tbl(clone_tbl)
extract_num_clones_tbl <- function(clone_tbl) {
    patient_id <- altered_entity <- file_name <- NULL
    stopifnot(is.tbl(clone_tbl))
    stopifnot("patient_id" %in% colnames(clone_tbl))
    stopifnot("altered_entity" %in% colnames(clone_tbl))
    stopifnot("file_name" %in% colnames(clone_tbl))
    stopifnot("clone1" %in% colnames(clone_tbl))
    stopifnot("clone2" %in% colnames(clone_tbl))
    stopifnot(!"tree_id" %in% colnames(clone_tbl)) 
    ## this should not be in here because otherwise it will be 
    ## counted as a clone

    ## message to user
    num_entries <- dim(clone_tbl)[1]
    message(num_entries, 
        " alterations in total found in the provided tibble.")
    num_clones_upper_bound <- dim(clone_tbl)[2]-3
    message(num_clones_upper_bound, 
        " is assumed to be the upper bound of number of clones.")

    ## extract all the patient id's
    all_pats <- clone_tbl %>% 
        select(patient_id) %>% 
        distinct() %>%
        pull(patient_id)
    num_pats <- length(all_pats)
            
    num_clones <- vapply(all_pats, function(x){
        this_pat_clone_tbl <- clone_tbl %>% 
            filter(patient_id == x) %>%
            select(-file_name, -patient_id, -altered_entity)
        ## make sure that the clone indicators 0 and 1 are integers 
        ## and not factors
        this_pat_clone_tbl <- 
            apply(this_pat_clone_tbl[, seq_len(num_clones_upper_bound)],
            2, function(y){as.numeric(as.character(y))})
        ## this is to make sure that the sum over columns works 
        ## in case there is just one row
        this_pat_clone_tbl <- rbind(this_pat_clone_tbl, 
            c(rep(0, num_clones_upper_bound))) 
        clones_summary <- apply(this_pat_clone_tbl, 2, sum)
        stopifnot(length(clones_summary) == num_clones_upper_bound)
        this_num_clones <- sum(ifelse(clones_summary > 0, 1, 0))
        stopifnot(this_num_clones > 0)
        stopifnot(this_num_clones <= num_clones_upper_bound)
        return(this_num_clones)
    }, numeric(1))

    ## message to user
    min_num_clones <- min(num_clones)
    max_num_clones <- max(num_clones)
    message("There are ", 
        min_num_clones, "-", 
        max_num_clones, 
        " clones in each patient.")
    return(num_clones)
}






#' Take the final pairs and return the patients id's, in which 
#' they are mutated, and the patients id's in which they are 
#' clonally exclusive.
#'
#' This function takes the final pairs of interest, and returns a 
#' tibble with the information for each gene pair, in which patient 
#' the pair was mutated, and in which of these patients the pair was 
#' clonally exclusive in the majority of the trees in the tree 
#' inference collection.
#'
#' @title Get the patients in which pairs are mutated
#' @param clone_tbl_all_trees The tibble containing the information 
#' of which gene/pathway is mutated in which clone from which 
#' patient across a collection of trees. Can be generated with 
#' \code{\link{create_tbl_tree_collection}}, repeatedly for 
#' each patient, and then combining them.
#' @param pairs_of_interest_tbl A tibble containing pairs of 
#' mutated genes/pathways. More precisely, it contains the 
#' columns 'entity_A' and 'entity_B'.
#' @author Ariane L. Moore
#' @return A tibble similar to the input 
#' \code{pairs_of_interest_tbl} but with two additional columns, 
#' namely 'mutated_in', and 'clonally_exclusive_in'. The column 
#' 'mutated_in' contains the patient id's that the pair is mutated 
#' in separated by a semicolon. The column 'clonally_exclusive_in' 
#' contains the semicolon separated patient id's of the ones in 
#' which the pairs was also clonally exclusive in the majority of 
#' the trees in the collection of tree inferences.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble distinct filter select group_by 
#' tally pull is.tbl bind_cols
#' @examples
#' clone_tbl <- dplyr::tibble(file_name=rep(c(rep(c("fn1", "fn2"), 
#'     each=3)), 2),
#'     patient_id=rep(c(rep(c("pat1", "pat2"), 
#'     each=3)), 2),
#'     altered_entity=c(rep(c("geneA", "geneB", "geneC"), 
#'     4)),
#'     clone1=c(0, 1, 0, 1, 0, 1, 0, 
#'     1, 1, 1, 0, 0),
#'     clone2=c(1, 0, 1, 0, 1, 1, 1, 
#'     0, 0, 1, 0, 1),
#'     tree_id=c(rep(5, 6), rep(10, 6)))
#' pairs_of_interest <- dplyr::tibble(entity_A=c("geneA", "geneB"), 
#'     entity_B=c("geneB", "geneC"))
#' take_pairs_and_get_patients(clone_tbl, pairs_of_interest)
take_pairs_and_get_patients <- function(clone_tbl_all_trees, 
                                        pairs_of_interest_tbl){
    
    entity_A <- entity_B <- patient_id <- tree_id <- file_name  <-
    n <- altered_entity <- NULL
    stopifnot(is.tbl(clone_tbl_all_trees))
    stopifnot("patient_id" %in% colnames(clone_tbl_all_trees))
    stopifnot("file_name" %in% colnames(clone_tbl_all_trees))
    stopifnot("altered_entity" %in% colnames(clone_tbl_all_trees))
    stopifnot("clone1" %in% colnames(clone_tbl_all_trees))
    stopifnot("clone2" %in% colnames(clone_tbl_all_trees))
    stopifnot("tree_id" %in% colnames(clone_tbl_all_trees))
    stopifnot("entity_A" %in% colnames(pairs_of_interest_tbl))
    stopifnot("entity_B" %in% colnames(pairs_of_interest_tbl))
  
    ## message to user
    all_tree_ids <- unique(clone_tbl_all_trees %>% pull(tree_id))
    num_pats <- 
        length(unique(clone_tbl_all_trees %>% pull(patient_id)))
    num_trees <- length(all_tree_ids)
    num_pairs <- dim(pairs_of_interest_tbl)[1]
    message("Found ", num_pairs, 
        " pairs of interest for which the patients will be extracted.")
    message("There are ", num_pats, 
        " different patients in the clone tibble and ",
        "the number of tree inferences is ", num_trees, ".")
    num_min_trees_clon_excl <- floor((num_trees/2)+1)
    message("A pair is counted as clonally exclusive in a patient,",
        " if it clonally exclusive in the majority",
        " of tree inferences, i.e. in at least ", 
        num_min_trees_clon_excl, " trees of a patient.")
  
    ## find out for each pair separately in which patients it is
    ## mutated and clonally exclusive
    pairs_res_pats <- apply(pairs_of_interest_tbl, 1, function(x){
        entA <- as.character(x[1])
        entB <- as.character(x[2])
        
        ## get the patients in which they are both mutated
        this_pair_pats_and_trees <- clone_tbl_all_trees %>%
            filter(altered_entity %in% x) %>%
            select(patient_id, tree_id)
        pat_ids_all_trees <- lapply(all_tree_ids, function(y){
            pat_ids_in_which_both_mutated <-
                this_pair_pats_and_trees %>% 
                filter(tree_id == y) %>% 
                select(patient_id) %>%
                group_by(patient_id) %>%
                tally() %>%
                filter(n == 2) %>%
                pull(patient_id)
            return(pat_ids_in_which_both_mutated)
        })
        these_pats_mut <- unique(as.vector(unlist(pat_ids_all_trees)))
        stopifnot(!is.list(these_pats_mut))

        ## now we extract from the clone tbl only the patients 
        ## in which they are both mutated
        this_pair_pats_mut_clon_tbl <- clone_tbl_all_trees %>%
            filter(altered_entity %in% x) %>%
            filter(patient_id %in% these_pats_mut)

        ## for each pat in which they are mutated, get number of 
        ## times it was clonally exclusive
        ## just for the current patient
        pair_num_clon_excl <- vapply(these_pats_mut, function(y){
            ## take the clone tibble
            this_clone_tbl <- this_pair_pats_mut_clon_tbl %>%  
                filter(patient_id == y) %>%      
                select(-file_name, -patient_id)
            
            sanity_check <- lapply(colnames(this_clone_tbl), 
            function(this_col){
                if(this_col != "altered_entity" && 
                !grepl("clone", this_col) && this_col != "tree_id"){
                ## if there still is a column in this_clone_tbl that
                ## is not either a clone or an altered entity, or 
                ## the tree_id this will be problematic, because it 
                ## will be counted as a clone and hence may 
                ## lead to erroneous numbers of clonal exclusivity
                warning("The column ", this_col, 
                " in the clone tibble may be accidentally",
                " counted as a clone.")
                }
            })
            stopifnot(is.list(sanity_check))
        
            num_trees_num_clon_excl_this_pat <- 
                suppressMessages(get_hist_clon_excl_this_pat_this_pair(entA, 
                entB, 
                this_clone_tbl))
        
            return(num_trees_num_clon_excl_this_pat[2])
        }, numeric(1))
        stopifnot(length(pair_num_clon_excl) == length(these_pats_mut))
    
        
        ## now we construct the stings to return, that is the string with 
        ## all semicolon separated pat ids in which the pair is mutated
        ## and the string with all semicolon separated pat ids in which they 
        ## are clonally exlcusive in the majority of trees
        this_pair_mutated_in<-
            paste(these_pats_mut, sep="", collapse=";")
        this_pair_clon_excl_in_vec <- 
            names(pair_num_clon_excl[which(pair_num_clon_excl >= 
                num_min_trees_clon_excl)])
        this_pair_clon_excl_in <- 
            paste(this_pair_clon_excl_in_vec, sep="", collapse=";")
        ## in the case where one or both of these strings is empty:
        if(this_pair_mutated_in == ""){
            this_pair_mutated_in <- "."
        }
        if(this_pair_clon_excl_in == ""){
            this_pair_clon_excl_in <- "."
        }

        return(c(this_pair_mutated_in, this_pair_clon_excl_in))
    })
    stopifnot(dim(pairs_res_pats)[1] == 2)
    stopifnot(dim(pairs_res_pats)[2] == num_pairs)
    mutated_in <- pairs_res_pats[1,]
    clonally_exclusive_in <- pairs_res_pats[2,]

    res_tbl<-bind_cols(pairs_of_interest_tbl, 
        mutated_in=mutated_in, 
        clonally_exclusive_in=clonally_exclusive_in)
    return(res_tbl)
}



#' Map the ensembl gene ids to hgnc symbols from a tibble with pairs.
#'
#' After having extracted the pairs of interest, it is of interest to 
#' know the genes hgnc symbols of the pairs. Here, it is assumed that the 
#' current gene identifier of the pairs are ensembl gene ids. They will 
#' be mapped to the corresponding hgnc symbols.
#'
#' @title Map the ensembl gene ids to hgnc symbols from a tibble
#' @param pairs_of_interest_tbl A tibble containing pairs of mutated 
#' genes/pathways. More precisely, it contains the columns
#' 'entity_A' and 'entity_B'.
#' @param all_genes_tbl A tibble with all genes ensembl id's and 
#' hgnc symbols. Can be created with 
#' \code{\link{create_ensembl_gene_tbl_hg}}.
#' @author Ariane L. Moore
#' @return A tibble similar to the input \code{pairs_of_interest_tbl} 
#' but with two additional columns, namely 'hgnc_gene_A', and 'hgnc_gene_B'. 
#' The column 'hgnc_gene_A' contains the hgnc gene symbol of 'entity_A', 
#' and the column 'hgnc_gene_B' the one of 'entity_B'.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl pull bind_cols
#' @examples
#' \dontrun{
#' pairs_of_interest <- dplyr::tibble(
#'     entity_A=c("ENSG00000181143", "ENSG00000163939"),
#'     entity_B=c("ENSG00000141510", "ENSG00000163930"))
#' all_genes_tbl <- create_ensembl_gene_tbl_hg()
#' map_pairs_to_hgnc_symbols(pairs_of_interest, all_genes_tbl)
#' }
map_pairs_to_hgnc_symbols <- function(pairs_of_interest_tbl, 
    all_genes_tbl){
    entity_A <- entity_B  <- ensembl_gene_id <- 
    hgnc_symbol <-  NULL
    stopifnot(is.tbl(pairs_of_interest_tbl))
    stopifnot(is.tbl(all_genes_tbl))
    stopifnot("entity_A" %in% colnames(pairs_of_interest_tbl))
    stopifnot("entity_B" %in% colnames(pairs_of_interest_tbl))
    stopifnot("ensembl_gene_id" %in% colnames(all_genes_tbl))
    stopifnot("hgnc_symbol" %in% colnames(all_genes_tbl))


    ## map the ensembl gene ID's to the gene names
    these_ens_ids_A <- pairs_of_interest_tbl %>% 
        pull(entity_A)
    hgnc_gene_A <- c()
    for(this_ens in these_ens_ids_A){
        this_hgnc <- 
            suppressMessages(ensembl_to_hgnc(this_ens, 
            all_genes_tbl))
        hgnc_gene_A <- c(hgnc_gene_A, this_hgnc)
    }
    these_ens_ids_B <- pairs_of_interest_tbl %>%
        pull(entity_B)
    hgnc_gene_B <- c()
    for(this_ens in these_ens_ids_B){
        this_hgnc <- 
            suppressMessages(ensembl_to_hgnc(this_ens, 
            all_genes_tbl))
        hgnc_gene_B <- c(hgnc_gene_B, this_hgnc)
    }
    
    ## add the columns to the pairs of interest tibble
    pairs_of_interest_tbl_hgnc <- 
        bind_cols(pairs_of_interest_tbl,
                         hgnc_gene_A=hgnc_gene_A,
                         hgnc_gene_B=hgnc_gene_B)
    
    return(pairs_of_interest_tbl_hgnc)
}



#' Write the resulting significant pairs tibble to disk as 
#' a tab-separated file.
#'
#' After having extracted the significant pairs. The tibble 
#' can be saved as a tab-separated file. It is assumed that the
#' input tibble has the columns 'hgnc_gene_A', 'hgnc_gene_B', 
#' 'pval', 'qval', 'mutated_in', 'clonally_exclusive_in'.
#'
#' @title Write resulting significant pairs to disk
#' @param sig_pairs The tibble containing the significant pairs 
#' of mutated genes/pathways.
#' @param avg_rates_m The average rates of clonal exclusivity for 
#' each patient. The name of each rate is the respective 
#' patient identifier.
#' @param tsv_file The path to the tsv-file to which the results 
#' should be written.
#' @param num_digits The number of digits after the comma of the 
#' average rate m, the p-value and the q-value (adjusted p-value). 
#' Default: 2.
#' @author Ariane L. Moore
#' @return The tibble that is written to disk. It has the columns 'Gene A', 
#' 'Gene B', 'P-value', 'Adjusted p-value', 
#' 'Mutated in (rate)', 'Clonally exclusive in'.
#' @export
#' @importFrom dplyr is.tbl tibble
#' @importFrom utils write.table
#' @examples
#' sig_pairs <- dplyr::tibble(hgnc_gene_A=c("VHL", "BAP1"),
#'     hgnc_gene_B=c("PTEN", "KIT"),
#'     pval=c(0.001, 0.002),
#'     qval=c(0.01, 0.02),
#'     mutated_in=c("pat1; pat2", "pat1; pat2"),
#'     clonally_exclusive_in=c("pat1; pat2", 
#'     "pat2"))
#' avg_rates_m <- c(pat1=0.0034, pat2=0.0021)
#' write_res_pairs_to_disk(sig_pairs, avg_rates_m, "test.tsv")
#' file.remove("test.tsv")
write_res_pairs_to_disk <- function(sig_pairs, avg_rates_m, 
    tsv_file, num_digits=2){
    stopifnot(is.tbl(sig_pairs))
    stopifnot("hgnc_gene_A" %in% colnames(sig_pairs))
    stopifnot("hgnc_gene_B" %in% colnames(sig_pairs))
    stopifnot("pval" %in% colnames(sig_pairs))
    stopifnot("qval" %in% colnames(sig_pairs))
    stopifnot("mutated_in" %in% colnames(sig_pairs))
    stopifnot("clonally_exclusive_in" %in% colnames(sig_pairs))
    stopifnot(is.numeric(avg_rates_m))
    stopifnot(is.character(tsv_file))
    stopifnot(is.numeric(num_digits))
    stopifnot(num_digits >= 0)
  
    ## instead of just the patient id (e.g. "pat1; pat2") we want also 
    ## the string Patient and the rate in parenthesis 
    ## (e.g. "Patient pat1 (0.0034); Patient pat2 (0.0021)")
    mutated_in_with_patient_and_rates <- vapply(sig_pairs$mutated_in,
        function(x){
            these_pats_label <- unlist(strsplit(x, ";"))
            
            these_pats_label_with_rates <- vapply(these_pats_label,
            function(this_pat){
                this_pat <- trimws(this_pat, which="both")
                ## make sure that the patients labels are also 
                ## among the named rates
                stopifnot(this_pat %in% names(avg_rates_m))
                this_new_label <-paste0("Patient ", 
                    this_pat, " (", 
                    round(avg_rates_m[which(names(avg_rates_m) == this_pat)], 
                    digits=num_digits), ")")
                return(this_new_label)
            }, character(1))
            
        return(paste(sort(these_pats_label_with_rates), collapse="; "))
    },
    character(1))
    ## instead of just the patient id (e.g. "pat1; pat2") we want also 
    ## the string Patient (e.g. "Patient pat1; Patient pat2")
    clonally_excl_in_with_patient <- vapply(sig_pairs$clonally_exclusive_in,
    function(x){
        these_pats_label <- unlist(strsplit(x, ";"))
      
        these_pats_label_without_rates<-vapply(these_pats_label,
        function(this_pat){
            this_pat <- trimws(this_pat, which="both")
            this_new_label <-paste0("Patient ", this_pat)
            return(this_new_label)
        }, character(1))
        
        return(paste(sort(these_pats_label_without_rates), collapse="; "))
    },
    character(1))
    ## remove the names, because otherwise the results tibble has 
    ## weird rownames
    names(mutated_in_with_patient_and_rates) <- NULL
    names(clonally_excl_in_with_patient) <- NULL
  
    ## prepare the results table
    table_to_save <- data.frame(GeneA=sig_pairs$hgnc_gene_A,
        GeneB=sig_pairs$hgnc_gene_B,
        Pvalue=round(as.numeric(as.vector(sig_pairs$pval)), 
        digits=num_digits),
        AdjustedPvalue=round(as.numeric(as.vector(sig_pairs$qval)), 
        digits=num_digits),
        MutatedIn=mutated_in_with_patient_and_rates,
        ClonallyExclusiveIn=clonally_excl_in_with_patient)
    ## the order should be without rounding it
    orderOfPairs <- order(as.numeric(as.vector(sig_pairs$pval)))
    table_to_save <- table_to_save[orderOfPairs, ]
    colnames(table_to_save) <- c("Gene A", "Gene B", "P-value", 
        "Adjusted p-value",
        "Mutated in (rate)", "Clonally exclusive in")
    ## write it to disk
    write.table(table_to_save,
        file=tsv_file,
        quote=FALSE, col.names=TRUE, 
        row.names=FALSE, sep="\t")
    return(table_to_save)
}



#' Check in how many patients pairs are mutated in
#' 
#' After having created the tibble with all gene-to-clone 
#' assignments from all patients and the whole collection
#' of trees, we're interested in how many patients tha 
#' pairs are mutated in. This function creats a histogram
#' that shows in how many patients the pairs are mutated in.
#' 
#' @title Pairs in how many patients histogram
#' @param clone_tbl The tibble containing the information of 
#' which gene/pathway is mutated in which
#' clone from which patient and in which tree from the collection 
#' of trees. Can be generated with \code{\link{create_tbl_tree_collection}}
#' for each patient separately and then appended.
#' @author Ariane L. Moore
#' @return The tibble that summarizes the number of pairs that
#' occur in how many patients.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr filter select is.tbl mutate distinct pull
#' tibble bind_rows tally group_by
#' @importFrom caTools combs
#' @examples
#' clone_tbl <- dplyr::tibble("file_name" =
#'     rep(c(rep(c("fn1", "fn2"), each=3)), 2),
#'     "patient_id"=rep(c(rep(c("pat1", "pat2"), each=3)), 2),
#'     "altered_entity"=c(rep(c("geneA", "geneB", "geneC"), 4)),
#'     "clone1"=c(0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0),
#'     "clone2"=c(1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1),
#'     "tree_id"=c(rep(5, 6), rep(10, 6)))
#' pairs_in_patients_hist(clone_tbl)
pairs_in_patients_hist <- function(clone_tbl){
    patient_id <- altered_entity <- file_name <- 
    tree_id <- pairs <- n <- pairs_count <- 
    patient_count <- NULL
    stopifnot(is.tbl(clone_tbl))
    stopifnot("file_name" %in% colnames(clone_tbl))
    stopifnot("patient_id" %in% colnames(clone_tbl))
    stopifnot("altered_entity" %in% colnames(clone_tbl))
    stopifnot("clone1" %in% colnames(clone_tbl))
    stopifnot("clone2" %in% colnames(clone_tbl))
    stopifnot("tree_id" %in% colnames(clone_tbl))
  
    ## get all patients
    all_pats <- unique(as.character(clone_tbl$patient_id))
    num_pats <- length(all_pats)
    message("Found ", num_pats, 
        " different patient id's in the provided tibble.")
    
    ## get the pairs from each patient separately, and then check 
    ## which pairs re-occur across at least two patients
    list_all_pairs_all_pats <- lapply(all_pats, function(x){
        all_ents_this_pat <- clone_tbl %>% 
            filter(patient_id == x) %>%
            select(altered_entity) %>%
            distinct() %>%
            pull(altered_entity)
        num_ents_this_pat <- length(all_ents_this_pat)
        num_pairs_this_pat <- 
        num_ents_this_pat*(num_ents_this_pat-1)/2
        ## get all pairs of entities of that patient efficiently
        all_pairs_this_pat <- combs(all_ents_this_pat, 2)
        ## to make sure the gene pairs are all in the same order such that
        ## later, when we check which pairs occur in multiple patients,
        ## that we do not miss pairs just because they are in a different
        ## order
        all_pairs_this_pat <- as.data.frame(t(apply(all_pairs_this_pat, 1, 
            function(pair){sort(pair)})))
        stopifnot(num_pairs_this_pat == dim(all_pairs_this_pat)[1])
        return(all_pairs_this_pat)
    })
    all_pairs_all_pats <- bind_rows(list_all_pairs_all_pats)
    unique_all_pairs_all_pats <- unique(all_pairs_all_pats)
    num_pairs <- dim(unique_all_pairs_all_pats)[1]
    all_pairs <- unique_all_pairs_all_pats
    message("There are in total ", 
    num_pairs, 
    " pairs of genes/pathways mutated in at least one patient.")
  
    ## get the pairs and patients histogram
    all_pairs_all_pats_one_string <- apply(all_pairs_all_pats, 1, 
        function(x){paste0(x, collapse="__")})
    pairs_num_pats_tally <- 
        tibble(pairs=all_pairs_all_pats_one_string) %>% 
        group_by(pairs) %>% 
        tally() %>%
        mutate(patient_count=n) %>%
        select(pairs, patient_count)
  
    num_pairs_num_pats_tally <- 
        pairs_num_pats_tally %>% 
        select(patient_count) %>%
        group_by(patient_count) %>%
        tally() %>%
        mutate(pairs_count=n) %>%
        select(pairs_count, patient_count)
  
    pairs_c <- num_pairs_num_pats_tally$pairs_count
    patient_c <- num_pairs_num_pats_tally$patient_count
  
    num_pairs_atLeast2 <- sum(pairs_c[which(patient_c >= 2)])
    message("There are in total ", num_pairs_atLeast2, 
    " pairs of genes/pathways mutated in at least 2 patients.")
  
    ## check what is the maximum n that pairs are mutated in how many 
    ## patients
    max_num_pat_pairs <- max(patient_c)
  
    message("The maximum number of patients that a pair is ",
        "mutated in is ", 
    max_num_pat_pairs, ". Please run the function",
    " 'generate_ecdf_test_stat' ",
    "with ",
    "num_pat_pair_max=", max_num_pat_pairs, ".")
    return(num_pairs_num_pats_tally)
}

Try the GeneAccord package in your browser

Any scripts or data that you put into this service are public.

GeneAccord documentation built on Nov. 8, 2020, 8:04 p.m.