#' Construct a REVOLVER cohort.
#'
#' The input is validated with \code{check_input}, optionally restricted to
#' driver events, and filtered by cluster size. For every remaining patient
#' the function then stores a per-mutation table with the CCF values parsed
#' by \code{CCF_parser} (one column per sample) and a per-clone table with the
#' cluster size, driver/clonal flags and the median CCF of each cluster.
#'
#' @param dataset A dataframe in the specified format (see the package vignettes).
#' @param CCF_parser A function to parse the format for the encoding of CCF
#' or binary values for each sequenced region. A possible function is available
#' inside REVOLVER; \code{\link{CCF_parser}} (the default of this parameter).
#' @param ONLY.DRIVER If true, uses only annotated driver events.
#' @param MIN.CLUSTER.SIZE Discard clusters that have fewer than this number of
#' entries. The filter is skipped when this value is not positive.
#' @param annotation Brief cohort description.
#'
#' @return An object of the S3 class \code{"rev_cohort"} that represents a \code{REVOLVER} cohort.
#'
#' @import tidyverse
#' @import tidygraph
#' @import pio
#' @import easypar
#' @import RColorBrewer
#' @import crayon
#' @import cluster
#' @import dendextend
#' @import dynamicTreeCut
#' @import igraph
#' @import ggpubr
#' @import ggrepel
#' @import clisymbols
#' @import evoverse.datasets
#'
#' @export
#' @family Cohort creation
#'
#' @examples
#' # Example cohort creation with the TRACERx data
#' data(TRACERx_data)
#'
#' # To speed up the process we use only 2 patients
#' TRACERx_data = TRACERx_data %>%
#' filter(patientID %in% c('CRUK0001', 'CRUK0002'))
#'
#' cohort = revolver_cohort(TRACERx_data, annotation = 'A toy REVOLVER dataset')
#'
#' # The S3 print/ plot for this cohort
#' print(cohort)
#' plot(cohort)
revolver_cohort = function(dataset,
                           CCF_parser = revolver::CCF_parser,
                           ONLY.DRIVER = FALSE,
                           MIN.CLUSTER.SIZE = 10,
                           annotation = 'My REVOLVER dataset')
{
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
  # Some header print and the output object are created, before checking for input consistency
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

  # Named `opts` (not `options`) so that base::options is not shadowed
  opts = list(ONLY.DRIVER = ONLY.DRIVER, MIN.CLUSTER.SIZE = MIN.CLUSTER.SIZE)

  pio::pioHdr(paste('REVOLVER ~ Cohort constructor'))
  cat("\n")

  # The banner must agree with the filter applied further below:
  # "only driver" is reported exactly when ONLY.DRIVER is set.
  used_mutations = if (opts$ONLY.DRIVER) 'only driver' else 'all annotated'

  cli::cli_alert_info(paste(
    "Using",
    used_mutations %>% crayon::bold(),
    'mutations. '
  ))

  cli::cli_alert_info(paste(
    "Rejecting clusters with less than",
    opts$MIN.CLUSTER.SIZE %>% crayon::bold(),
    'mutations. '
  ))

  # The output object will be this, of class rev_cohort
  obj <-
    structure(
      list(
        patients = NULL,
        variantIDs = NULL,
        variantIDs.driver = NULL,
        numVariants = NULL,
        annotation = annotation,
        dataset = NULL,
        CCF = NULL,
        n = NULL,
        CCF_parser = CCF_parser
      ),
      class = "rev_cohort",
      call = match.call()
    )

  # Check input and stop on error
  dataset = check_input(dataset, CCF_parser)

  # Unique per-row identifier; seq_len() is safe for a zero-row input
  dataset$id = paste0('__mut_id_', seq_len(nrow(dataset)))

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
  # Input data is subset according to the options parameter
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
  if (opts$ONLY.DRIVER)
    dataset = dataset %>% filter(is.driver)

  # by cluster size: annotate every entry with the size of its (patient, cluster)
  grp = dataset %>%
    group_by(patientID, cluster) %>%
    summarize(cluster_size = n(), .groups = 'keep') %>%
    ungroup()

  dataset = left_join(dataset, grp, by = c('patientID', 'cluster'))

  if (opts$MIN.CLUSTER.SIZE > 0)
  {
    cli::cli_h1('Filtering small clusters (starting with {.field {nrow(dataset)}} entries)')

    cat('Removing\n\n')
    print(dataset %>%
            filter(cluster_size < opts$MIN.CLUSTER.SIZE))

    dataset = dataset %>%
      filter(cluster_size >= opts$MIN.CLUSTER.SIZE)

    cli::cli_alert_success('\nAfter filtering: {.field {nrow(dataset)}} entries\n')
  }

  if (nrow(dataset) == 0)
    stop("Your dataset is empty, aborting.")

  cli::cli_h1('\nREVOLVER input data')
  cat('\n')
  pio::pioDisp(dataset)

  # Cohort-level summaries
  obj$patients = unique(dataset$patientID)
  obj$dataset = dataset
  obj$variantIDs = unique(dataset$variantID)
  obj$variantIDs.driver = unique(dataset %>% filter(is.driver) %>% pull(variantID))
  obj$numVariants = nrow(dataset)

  obj$n = list(
    patients = length(obj$patients),
    variants = nrow(obj$dataset),
    drivers = length(obj$variantIDs.driver)
  )

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
  # Data of each patient needs to be unwrapped and stored
  # This takes some times
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
  cli::cli_h3('\nPreprocessing data (this may take some time)')
  cat('\n')

  # One nested table per patient. The list is named from the key column of the
  # nested table itself, so each table is paired with its own patient ID
  # whatever row order nest() produces.
  nested = obj$dataset %>%
    group_by(patientID) %>%
    nest() %>%
    ungroup()

  obj$dataset = nested %>%
    select(data) %>%
    unlist(recursive = FALSE)

  names(obj$dataset) = nested$patientID

  # For each mutation, parse its CCF string into a one-row table with one
  # column per sample (column names are the names returned by CCF_parser)
  parse_ccf = function(w)
  {
    values = CCF_parser(w)

    tibble::tibble(value = as.numeric(values), sample = names(values)) %>%
      spread(sample, value)
  }

  # For each patient extract its explicit region CCF data
  obj$dataset = lapply(obj$patients,
                       function(pat)
                       {
                         d = obj$dataset[[pat]] %>%
                           mutate(patientID = pat) %>%
                           select(id,
                                  Misc,
                                  patientID,
                                  variantID,
                                  is.driver,
                                  is.clonal,
                                  cluster,
                                  cluster_size,
                                  CCF)

                         # Progress indicator, one dot per patient
                         cat('.')

                         CCF_values = d %>%
                           group_by(id) %>%
                           do(parse_ccf(.$CCF)) %>%
                           ungroup()

                         d %>%
                           full_join(CCF_values, by = 'id')
                       })
  names(obj$dataset) = obj$patients

  cat('\n')

  cli::cli_h1('\nExtracting clones table')
  cat('\n')

  # Then we also create a table with the clones for each patient
  obj$CCF = lapply(names(obj$dataset),
                   function(pat)
                   {
                     d = obj$dataset[[pat]]

                     # Keep only the cluster label, the two flags and the per-sample CCF columns
                     d = d %>%
                       select(-Misc, -variantID, -CCF, -id, -patientID, -cluster_size)

                     # Per-clone size and flags, largest clone first
                     CCFclone_headers = d %>%
                       group_by(cluster) %>%
                       summarise(
                         nMuts = n(),
                         is.driver = any(is.driver),
                         is.clonal = all(is.clonal)
                       ) %>%
                       arrange(desc(nMuts))

                     # Per-clone median CCF in each sample
                     CCFclone_values = d %>%
                       select(-is.driver, -is.clonal) %>%
                       reshape2::melt(id.vars = 'cluster') %>%
                       group_by(cluster, variable) %>%
                       summarise(CCF = median(as.numeric(value)), .groups = 'drop_last') %>%
                       spread(variable, CCF) %>%
                       ungroup()

                     cli::cli_alert('{.field {pat}} : {.count {nrow(d)}} entries, {.count {nrow(CCFclone_headers)}} clone(s).')

                     CCFclone_headers %>% full_join(CCFclone_values, by = 'cluster')
                   })
  names(obj$CCF) = names(obj$dataset)

  return(obj)
}
# Check input format - fails on error
#
# Arguments:
#   dataset     Candidate input. Must be a data.frame holding the columns
#               Misc, patientID, variantID, cluster, CCF (character) and
#               is.driver, is.clonal (logical).
#   CCF.parser  The CCF parsing function supplied by the caller; here it is
#               only checked to be a function.
#
# Returns the required columns of `dataset`, in the order of `required.cols`,
# as a tibble. Stops with an error if any check fails.
check_input = function(dataset, CCF.parser)
{
  # Dataset with certain columns required
  required.cols = c('Misc',
                    'patientID',
                    'variantID',
                    'cluster',
                    'is.driver',
                    'is.clonal',
                    'CCF')

  character.cols = c('Misc', 'patientID', 'variantID', 'cluster', 'CCF')
  logical.cols = c('is.driver', 'is.clonal')

  # check for types; && short-circuits, so columns are only inspected once we
  # know `dataset` is a data.frame that actually contains them
  valid = is.data.frame(dataset) &&
    all(required.cols %in% colnames(dataset)) &&
    all(vapply(dataset[character.cols], is.character, logical(1))) &&
    all(vapply(dataset[logical.cols], is.logical, logical(1)))

  if (!valid)
  {
    # str() prints by itself (wrapping it in print() would add a stray NULL)
    str(dataset)

    stop(
      'Input `dataset` should be a dataframe in with the following columns:\n
\t- Misc: any annotation (string),
\t- patientID: a unique ID to identify a patient (string)
\t- variantID: a unique ID to identify a variant across patients (string)
\t- cluster: to putative clone in this patient\'s tumour where the variant is found (string)
\t- is.driver: if the mutation is a driver to be correlated (logical)
\t- is.clonal: if the mutation is a clonal (truncal) in this patient (logical)
\t- CCF: Cancer Cell Fraction or binary annotation for this mutations across the biopsies of this patient\'s tumour (string)
For more information see the manual: https://github.com/caravagn/revolver'
    )
  }

  # Every required column except `Misc` (the first one) must be filled in
  mandatory = dataset[, required.cols[-1]]

  # na.rm keeps the condition a proper TRUE/FALSE when NAs are present
  if (any(mandatory == '', na.rm = TRUE))
    stop("Entries empty string ('') in the dataset are allowed only in the `Misc` field.")

  # Missing values used to crash the check above with an obscure
  # "missing value where TRUE/FALSE needed"; reject them explicitly instead
  if (anyNA(mandatory))
    stop("Missing values (NA) in the dataset are allowed only in the `Misc` field.")

  # Validate the parser that was actually passed in
  if (!is.function(CCF.parser))
  {
    stop(
      'You need to provide a function to parse CCFs, see "CCF_parser" available in this package.'
    )
  }

  dataset[, required.cols] %>% as_tibble()
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.