#' Pipe operator
#'
#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @export
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
#' @param lhs A value or the magrittr placeholder.
#' @param rhs A function call using the magrittr semantics.
#' @return The result of calling `rhs(lhs)`.
NULL
#' Create a dataset.
#'
#' @description
#'
#' This function creates a dataset (an object of class \code{rcongasplus}) by assembling multiple single-cell input measurements
#' (ATAC and/or RNA data modalities), the input segmentation (from bulk DNA sequencing),
#' and the per-cell normalisation factors for the data.
#'
#' All input data are passed as tibbles; the input formats are as follows:
#'
#' * for single-cell ATAC/RNA data, the \code{cell} identifier, the genomic coordinates
#' (\code{chr}, \code{from}, \code{to}) which refer either to an ATAC peak, or an RNA gene
#' identifier, and a \code{value} reporting the reads mapped.
#'
#' * for the input segmentation, the genomic coordinates
#' (\code{chr}, \code{from}, \code{to}) which refer to the segment, and the number of
#' \code{copies} (i.e., DNA ploidy) of the segment.
#'
#' * for normalization factors the \code{cell} identifier, the actual \code{normalisation_factor}
#' and the \code{modality} to wihch the factor refers to
#'
#' This function receives also other parameters - e.g., the models likelihoods - which
#' will determine the overall behaviour of the underlying model, and how data are preared for inference.
#'
#' * A Negative Binomial likelihood (\code{"NB"}), which works directly from raw counts data
#'
#' * A Gaussian likelihood (\code{"G"}), which requires a z-score transformation of the data. This consists
#' in :
#' * scaling raw counts by the input normalization factors;
#' * computing z-scores per cell;
#' * summing up z-scores per segment;
#' * computing z-scores per segment;
#' * center the z-scores mean to the input ploidy.
#'
#' @param rna A tibble with single-cell RNA data.
#' @param atac A tibble with single-cell ATAC data.
#' @param segmentation A tibble with the input segmentation.
#' @param rna_normalisation_factors The RNA tibble with the input per-cell normalisation factors.
#' By default these are computed by function \code{auto_normalisation_factor}.
#' @param atac_normalisation_factors The ATAC tibble with the input per-cell normalisation factors.
#' By default these are computed by function \code{auto_normalisation_factor}.
#' @param rna_likelihood Type of likelihood used for RNA data (\code{"G"} for Gaussian and
#' \code{""NB} for Negative Binomial). The RNA default is \code{"G"}.
#' @param atac_likelihood Type of likelihood used for ATAC data, with default \code{"NB"}.
#' @param reference_genome Either \code{"GRCh38"} or \code{"hg19"}.
#' @param description A model in-words description.
#' @param smooth If yes, input segments are smootheed by joining per chromosome segments that
#' have the same ploidy.
#' @param mutiome Default to FALSE. Flag indicating whether the RNA and ATAC observations are the result of a matched RNA-ATAC sequencing assay such as 10x multiome assay.
#' (i.e., there is a 1:1 correspondence between barcodes of the two modalities.)
#'
#' @return An object of class \code{rcongasplus}
#'
#' @importFrom tidyr separate pivot_longer
#' @importFrom tibble rownames_to_column
#' @importFrom crayon bgCyan bgBlue bgGreen bgRed bgMagenta bgWhite underline bgYellow blue red
#' @importFrom cli cli_rule cli_h3 cli_alert cli_alert_info cli_alert_warning cli_alert_danger
#' @importFrom reshape2 acast melt
#' @importFrom stats4 mle
#' @importFrom cowplot plot_grid
#' @importFrom progress progress_bar
#' @importFrom tidyr unite
#' @importFrom graphics curve hist
#' @importFrom stats complete.cases dgamma quantile rnorm sd
#' @importFrom utils head object.size
#' @importFrom gtools mixedsort
#' @import dplyr
#' @import ggplot2
#' @import CNAqc
#'
#' @export
#'
#' @examples
#' data("example_input")
#'
#' # For instance, RNA data
#' example_input$x_rna %>% print
#'
#' # .. or ATAC data
#' example_input$x_atac %>% print
#'
#' # .. and segmentation
#' example_input$x_segmentation %>% print
#'
#' # .. and normalisation factors can be computed (default)
#' example_input$x_rna %>% auto_normalisation_factor()
#'
#' x = init(
#' rna = example_input$x_rna,
#' atac = example_input$x_atac,
#' segmentation = example_input$x_segmentation,
#' rna_likelihood = "G",
#' atac_likelihood = 'NB',
#' description = 'My model')
#'
#' print(x)
init = function(
rna,
atac,
segmentation,
rna_normalisation_factors = rna %>% auto_normalisation_factor(),
atac_normalisation_factors = atac %>% auto_normalisation_factor(),
rna_likelihood = "NB",
atac_likelihood = "NB",
reference_genome = 'GRCh38',
description = "(R)CONGAS+ model",
smooth = FALSE,
# out.rm = TRUE,
multiome = FALSE
)
{
if(is.null(rna) & is.null(atac))
stop("Cannot have both assays null.")
# Output object
ret_obj = list()
class(ret_obj) = 'rcongasplus'
ret_obj$description = description
ret_obj$reference_genome = reference_genome
cli::boxx(paste("(R)CONGAS+:", description),
background_col = 'orange',
padding = 1,
float = 'center') %>% cat
cat("\n")
# Sanitise by data type and required columns
if(!is.null(rna))
rna = rna %>% filter(!is.na(chr))
sanitize_input(rna,
required_input_columns = c("chr", "from", "to", "value", "cell"),
types_required = c("character", "integer", "integer", "integer", "character")
)
sanitize_input(atac,
required_input_columns = c("chr", "from", "to", "value", "cell"),
types_required = c("character", "integer", "integer", "integer", "character")
)
sanitize_input(segmentation,
required_input_columns = c("chr", "from", "to", "copies"),
types_required = c("character", "integer", "integer", "integer")
)
sanitize_input(rna_normalisation_factors,
required_input_columns = c("cell", "normalisation_factor"),
types_required = c("character", "numeric")
)
sanitize_input(atac_normalisation_factors,
required_input_columns = c("cell", "normalisation_factor"),
types_required = c("character", "numeric")
)
# Reference
if(!(reference_genome %in% c("hg19", 'GRCh37', 'hg38', "GRCh38")))
stop("Unsupported reference, use any of 'hg19'/'GRCh37' or 'hg38'/'GRCh38'")
if (multiome){
print("Running in multiome mode.")
res = validate_multiome(rna, atac)
atac = res$atac
rna = res$rna
rna_normalisation_factors = rna_normalisation_factors %>%
mutate(multiome_barcode = cell, cell = paste0(cell, '-RNA'))
atac_normalisation_factors = atac_normalisation_factors %>%
mutate(multiome_barcode = cell, cell = paste0(cell, '-ATAC'))
} else {
# Non unique cell ids
nu_ids = intersect(rna$cell, atac$cell)
if(!is.null(nu_ids) & (length(nu_ids) > 0))
{
stop("ATAC and RNA cells have shared ids, this is not possibile.")
}
}
# Check that normalization factors are available for all cells
if(!all(rna$cell %in% rna_normalisation_factors$cell))
{
message("Error with this RNA tibble")
rna_normalisation_factors %>% print()
stop("Missing normalisation factors for some input cells.")
}
if(!all(atac$cell %in% atac_normalisation_factors$cell))
{
message("Error with this ATAC tibble")
atac_normalisation_factors %>% print()
stop("Missing normalisation factors for some input cells.")
}
# Check likelihood
if(!(rna_likelihood %in% c("NB", "P", "G")))
{
stop("Unsupported RNA likelihood, use:
- NB (Negative-Binomial),
- P (Poisson),
- G (Gaussian, with z-score).")
}
if(!(atac_likelihood %in% c("NB", "P", "G")))
{
stop("Unsupported RNA likelihood, use:
- NB (Negative-Binomial),
- P (Poisson),
- G (Gaussian, with z-score).")
}
# Prepare segment
if(smooth)
segmentation = smoothie(segments = segmentation, reference = reference_genome)
segmentation = segmentation %>% idify()
segmentation$RNA_genes =
segmentation$RNA_nonzerovals =
segmentation$ATAC_peaks =
segmentation$ATAC_nonzerovals =
segmentation$ATAC_nonzerocells =
segmentation$RNA_nonzerocells = 0
# Create RNA modality data
rna_modality_data = create_modality(
modality = "RNA",
data = rna,
segmentation = segmentation,
normalisation_factors = rna_normalisation_factors,
likelihood = rna_likelihood,
multiome = multiome,
out.rm = TRUE)
if(!is.null(rna))
{
rna = rna_modality_data$data
segmentation = rna_modality_data$segmentation
rna_normalisation_factors = rna_modality_data$normalisation
}
# Create ATAC modality data
atac_modality_data = create_modality(
modality = "ATAC",
data = atac,
segmentation = segmentation,
normalisation_factors = atac_normalisation_factors,
likelihood = atac_likelihood,
multiome = multiome,
out.rm = TRUE)
if(!is.null(atac))
{
atac = atac_modality_data$data
segmentation = atac_modality_data$segmentation
atac_normalisation_factors = atac_modality_data$normalisation
}
# The last thing we do is to sanitise the segmentation, this removes useless segments
cli::cli_h3("Checking segmentation")
cat("\n")
# First, test if some segments have NO events associated
s_subset_zero = segmentation %>% filter(RNA_nonzerovals == 0 & ATAC_nonzerovals == 0)
segmentation = segmentation %>% filter(RNA_nonzerovals > 0 | ATAC_nonzerovals > 0)
if (nrow(s_subset_zero) > 0)
{
cli::cli_alert_warning("These segments have no events associated and will be removed - we suggest you to check if these can be further smoothed.")
cli::cli_alert_info("{crayon::bgCyan(crayon::white(' Hint '))} {crayon::italic('Check if the reduced segments can be further smoothed!')}")
s_subset_zero %>% print
}
else cli::cli_alert_success("Nothing to process.")
# Add to the return object
ret_obj$input$dataset = bind_rows(rna, atac)
ret_obj$input$normalisation = bind_rows(rna_normalisation_factors,
atac_normalisation_factors)
ret_obj$input$segmentation = segmentation
ret_obj$input$multiome = multiome
ret_obj$log = paste('-', Sys.time(), "Created input object.")
return(ret_obj %>% sanitize_obj %>% sanitize_zeroes
)
}
create_modality = function(modality, data, segmentation, normalisation_factors, likelihood, multiome, out.rm=T)
{
# Special case, data are missing
if(is.null(data)) {
return(list(data = NULL, segmentation = segmentation))
}
if (multiome) {
barcode_mapping = data %>% select(cell, multiome_barcode) %>% distinct()
}
normalisation_factors = normalisation_factors %>% mutate(modality = !!modality)
os = object.size(data)/1e6
# Report input information for RNA
cli::cli_h3("{.field {modality}} modality ({.value {os} Mb})")
cat("\n")
cli::cli_alert("Input events: {.field {data %>% nrow}}")
cli::cli_alert("Input cells: {.field {data %>% distinct(cell) %>% nrow}}")
cli::cli_alert("Input locations: {.field {data %>% distinct(chr, from, to) %>% nrow}}")
# Computing mapping for RNA
segmentation = segmentation %>% Rcongas:::idify()
evt_lbs = paste0(modality, '_nonzerovals')
loc_lbs = ifelse(
modality == 'RNA',
paste0(modality, '_genes'),
paste0(modality, '_peaks')
)
cells_lbs = paste0(modality, '_nonzerocells')
segmentation[[evt_lbs]] = 0
segmentation[[loc_lbs]] = 0
segmentation[[cells_lbs]] = 0
data$segment_id = NA
pb <- progress::progress_bar$new(total = nrow(segmentation))
pb$tick(0)
for(i in 1:nrow(segmentation))
{
pb$tick()
what_maps = which(
data$chr == segmentation$chr[i] &
data$from >= segmentation$from[i] &
data$to <= segmentation$to[i]
)
if(length(what_maps) == 0) next;
data$segment_id[what_maps] = segmentation$segment_id[i]
segmentation[[evt_lbs]][i] = what_maps %>% length
segmentation[[loc_lbs]][i] = data[what_maps, ] %>%
distinct(chr, from, to) %>%
nrow()
segmentation[[cells_lbs]][i] = data[what_maps, ] %>%
pull(cell) %>% unique %>% length
}
n_na = is.na(data$segment_id) %>% sum()
nn_na = (data %>% nrow) - n_na
if (out.rm)
data = Rcongas:::clean_outliers_persegment(modality, data, normalisation_factors)$data_cleaned
cli::cli_alert("Entries mapped: {.field {nn_na}}, with {.field {n_na}} outside input segments that will be discarded.")
if(n_na > 0) data = data %>% filter(!is.na(segment_id))
cli::cli_alert("Using likelihood: {.field {likelihood}}.")
# Compute z-score per mapped gene/peak according to the required likelihood
if(likelihood %in% c("G"))
{
cli::cli_alert_warning("Gaussian likelihood requires z-score representation of input values.")
# Normalise by factor - divide counts by per-cell normalization_factor
data = normalise_modality(data %>% mutate(modality = !!modality), normalisation_factors)
if ('gene' %in% colnames(data)){
# data = data %>% mutate(idFeature = paste0(gene,chr,from,to))
features = data %>%
select(gene, chr, from, to) %>%
distinct() %>%
unite(idFeature, c(gene, chr, from, to), remove = F)
data = data %>% left_join(features)
}
else {
features = data %>%
select(chr, from, to) %>%
distinct() %>%
unite(idFeature, c(chr, from, to), remove = F)
data = data %>% left_join(features)
}
# Compute z-score
cli::cli_alert("Computing z-score.")
gene_segments = data %>% group_by(idFeature, segment_id) %>% summarise(n_cells = n()) %>%
select(-n_cells) %>% tibble::column_to_rownames('idFeature')
inp = reshape2::acast(data,
cell ~ idFeature,
value.var = "value")
inp[is.na(inp)] <- 0
data_scaled = scale(inp)
data_scaled[is.na(data_scaled)] <- 0
mapped = sapply(segmentation$segment_id, function (x) {
curr_genes = rownames(gene_segments %>% filter(segment_id == x))
tmp = data_scaled[,curr_genes,drop=F]
tmp = rowSums(tmp)
return(tmp)
}, USE.NAMES = T )
mapped = as.data.frame(mapped) %>% rownames_to_column(var = 'cell') %>%
pivot_longer(cols = setdiff(colnames(mapped), 'cell'), names_to = 'segment_id')
mapped$modality = modality
# # Set to 1 normalisation factors
cli::cli_alert_warning("With Gaussian likelihood normalisation factors are changed to 1.")
normalisation_factors$normalisation_factor = 1
} else {
# Mapped counts
mapped = data %>%
group_by(segment_id, cell) %>%
summarise(value = sum(value), .groups = 'keep') %>%
ungroup() %>%
mutate(modality = !!modality)
}
if (multiome) {
mapped = mapped %>% left_join(barcode_mapping)
}
# Center the new scores to the ploidy value
if(likelihood %in% c("G")){
cli::cli_alert("Centering the new scores around input ploidy values.")
inp = reshape2::acast(mapped,
cell ~ segment_id,
value.var = "value")
inp[is.na(inp)] <- 0
data_scaled = scale(inp)
data_scaled[is.na(data_scaled)] <- 0
mapped = as.data.frame(data_scaled) %>% rownames_to_column(var = 'cell') %>%
pivot_longer(cols = setdiff(colnames(data_scaled), 'cell'), names_to = 'segment_id') %>%
mutate(modality = !!modality)
print(colnames(mapped))
mapped = mapped %>%
left_join(segmentation, by = c("segment_id")) %>%
mutate(
value = value + copies
) %>% select(segment_id, cell, value, modality)
}
# Handle data type request: convert to z-score if required
mapped$value_type = likelihood
what_lik = case_when(
likelihood == "NB" ~ "Negative Binomial",
likelihood == "P" ~ "Poisson",
likelihood == "G" ~ "Gaussian"
)
if (multiome) {
mapped = mapped %>%
select(segment_id, cell, value, modality, value_type, multiome_barcode)
} else {
mapped = mapped %>%
select(segment_id, cell, value, modality, value_type)
}
return(
list(
data = mapped,
normalisation = normalisation_factors,
segmentation = segmentation
)
)
}
validate_multiome = function(rna, atac) {
if (length(unique(rna$cell)) != length(unique(atac$cell)))
stop("ATAC ana RNA have different number of cells. Error for multiome mode, returning.")
same_ids = intersect(rna$cell, atac$cell)
if (length(same_ids) != length(unique(rna$cell)))
stop("Running in multiome mode requires ATAC and RNA to have the same cell ids.")
atac = atac %>% mutate(multiome_barcode = cell,
cell = paste0(cell, '-ATAC'))
rna = rna %>% mutate(multiome_barcode = cell,
cell = paste0(cell, '-RNA'))
return(list(rna = rna, atac = atac))
}
# Segments smoothing function
smoothie = function(segments, reference = 'GRCh38')
{
cli::cli_alert("Smoothing {.field {nrow(segments)}} input segments.")
reference = CNAqc:::get_reference(ref = reference)
smoothed_segments = NULL
for (chr in unique(segments$chr))
{
chr_segments = segments %>% filter(chr == !!chr)
if (nrow(chr_segments) == 1) {
smoothed_segments = bind_rows(smoothed_segments,
chr_segments)
next
}
index = 1
repeat {
template = chr_segments[index, ]
j = index
repeat {
if (j == nrow(chr_segments))
break
copies_match = chr_segments$copies[j + 1] == chr_segments$copies[j]
if (copies_match)
j = j + 1
else
break
}
template$to = chr_segments$to[j]
template$length = template$to - template$from
smoothed_segments = bind_rows(smoothed_segments,
template)
if (j == nrow(chr_segments))
break
index = j + 1
}
}
cli::cli_alert("After smoothing there are {.field {nrow(smoothed_segments)}} input segments.")
return(smoothed_segments)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.