#' Create cicero input CDS
#'
#' Function to generate an aggregated input CDS for cicero. \code{run_cicero}
#' takes as input an aggregated cicero CDS object. This function will generate
#' the CDS given an input CDS (perhaps generated by \code{make_atac_cds}) and
#' a value for k, which is the number of cells to be aggregated per bin. The
#' default value for k is 50.
#'
#' @param cds Input CDS object.
#' @param reduced_coordinates A data frame with columns representing the
#' coordinates of each cell in reduced dimension space (generally 2-3
#' dimensions). \code{row.names(reduced_coordinates)} should match the cell
#' names in the CDS object. If dimension reduction was done using monocle,
#' tSNE coordinates can be accessed by \code{t(reducedDimA(cds))}, and
#' DDRTree coordinates can be accessed by \code{t(reducedDimS(cds))}.
#' @param k Number of cells to aggregate per bin.
#' @param summary_stats Which numeric \code{pData(cds)} columns you would like
#' summarized (mean) by bin in the resulting CDS object.
#' @param size_factor_normalize Logical, should accessibility values be
#' normalized by size factor?
#' @param silent Logical, should warning and info messages be printed?
#' @param return_agg_info Logical, should a list of the assignments of cells to
#' aggregated bins be output? When \code{TRUE}, this function returns a list
#' of two items, first, the aggregated CDS object and second, a data.frame
#' with the binning information.
#'
#' @details Aggregation of similar cells is done using a k-nearest-neighbors
#' graph and a randomized "bagging" procedure. Details are available in the
#' publication that accompanies this package. Run \code{citation("cicero")}
#' for publication details. KNN is calculated using
#' \code{\link[FNN]{knn.index}}
#'
#' @return Aggregated CDS object. If return_agg_info is \code{TRUE}, a list
#' of the aggregated CDS object and a data.frame of aggregation info.
#' @export
#'
#' @examples
#' \dontrun{
#' data("cicero_data")
#'
#' input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#' input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#' reduction_method = 'tSNE',
#' norm_method = "none")
#' tsne_coords <- t(reducedDimA(input_cds))
#' row.names(tsne_coords) <- row.names(pData(input_cds))
#' cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#' }
#'
make_cicero_cds <- function(cds,
reduced_coordinates,
k=50,
summary_stats = NULL,
size_factor_normalize = TRUE,
silent = FALSE,
return_agg_info = FALSE) {
assertthat::assert_that(is(cds, "CellDataSet"))
assertthat::assert_that(is.data.frame(reduced_coordinates) |
is.matrix(reduced_coordinates))
assertthat::assert_that(assertthat::are_equal(nrow(reduced_coordinates),
nrow(pData(cds))))
assertthat::assert_that(setequal(row.names(reduced_coordinates),
colnames(cds)))
assertthat::assert_that(assertthat::is.count(k) & k > 1)
assertthat::assert_that(is.character(summary_stats) | is.null(summary_stats))
if(!is.null(summary_stats)) {
assertthat::assert_that(all(summary_stats %in% names(pData(cds))),
msg = paste("One of your summary_stats is missing",
"from your pData table. Either add a",
"column with the name in",
"summary_stats, or remove the name",
"from the summary_stats parameter.",
collapse = " "))
assertthat::assert_that(sum(vapply(summary_stats, function(x) {
!(is(pData(cds)[,x], "numeric") | is(pData(cds)[,x], "integer"))}, 1)) == 0,
msg = paste("All columns in summary_stats must be",
"of class numeric or integer.",
collapse = " "))
}
assertthat::assert_that(is.logical(size_factor_normalize))
assertthat::assert_that(is.logical(silent))
assertthat::assert_that(is.logical(return_agg_info))
reduced_coordinates <- as.data.frame(reduced_coordinates)
reduced_coordinates <- reduced_coordinates[colnames(cds),]
# Create a k-nearest neighbors map
nn_map <- FNN::knn.index(reduced_coordinates, k=(k-1)) # no data.frame wrapper
row.names(nn_map) <- row.names(reduced_coordinates)
nn_map <- cbind(nn_map, seq_len(nrow(nn_map)))
good_choices <- seq_len(nrow(nn_map))
choice <- sample(seq_len(length(good_choices)), size = 1, replace = FALSE)
chosen <- good_choices[choice]
good_choices <- good_choices[good_choices != good_choices[choice]]
it <- 0
k2 <- k * 2 # Compute once
# function for sapply
get_shared <- function(other, this_choice) {
k2 - length(union(cell_sample[other,], this_choice))
}
while (length(good_choices) > 0 & it < 5000) { # slow
it <- it + 1
choice <- sample(seq_len(length(good_choices)), size = 1, replace = FALSE)
new_chosen <- c(chosen, good_choices[choice])
good_choices <- good_choices[good_choices != good_choices[choice]]
cell_sample <- nn_map[new_chosen,]
others <- seq_len(nrow(cell_sample) - 1)
this_choice <- cell_sample[nrow(cell_sample),]
shared <- sapply(others, get_shared, this_choice = this_choice)
if (max(shared) < .9 * k) {
chosen <- new_chosen
}
}
cell_sample <- nn_map[chosen,]
if(!silent) {
# Only need this slow step if !silent
combs <- combn(nrow(cell_sample), 2)
shared <- apply(combs, 2, function(x) { #slow
k2 - length(unique(as.vector(cell_sample[x,])))
})
message(paste0("Overlap QC metrics:\nCells per bin: ", k,
"\nMaximum shared cells bin-bin: ", max(shared),
"\nMean shared cells bin-bin: ", mean(shared),
"\nMedian shared cells bin-bin: ", median(shared)))
if (mean(shared)/k > .1)
warning("On average, more than 10% of cells are shared between paired bins.")
}
exprs_old <- exprs(cds)
mask <- sapply(seq_len(nrow(cell_sample)),
function(x) seq_len(ncol(exprs_old)) %in% cell_sample[x,,drop=FALSE])
if (return_agg_info) {
row.names(mask) <- colnames(exprs_old)
colnames(mask) <- paste0("agg_", chosen)
agg_map <- reshape2::melt(mask)
agg_map <- agg_map[agg_map$value,]
agg_map$value <- NULL
names(agg_map) <- c("cell", "agg_cell")
}
mask <- Matrix::Matrix(mask)
new_exprs <- exprs_old %*% mask
new_exprs <- Matrix::t(new_exprs)
new_exprs <- as.matrix(new_exprs)
pdata <- pData(cds)
new_pcols <- "agg_cell"
if(!is.null(summary_stats)) {
new_pcols <- c(new_pcols, paste0("mean_",summary_stats))
}
new_pdata <- plyr::adply(cell_sample,1, function(x) {
sub <- pdata[x,]
df_l <- list()
df_l["temp"] <- 1
for (att in summary_stats) {
df_l[paste0("mean_", att)] <- mean(sub[,att])
}
data.frame(df_l)
})
new_pdata$agg_cell <- paste("agg", chosen, sep="")
new_pdata <- new_pdata[,new_pcols, drop = FALSE] # fixes order, drops X1 and temp
row.names(new_pdata) <- new_pdata$agg_cell
row.names(new_exprs) <- new_pdata$agg_cell
new_exprs <- as.matrix(t(new_exprs))
fdf <- fData(cds)
new_pdata$temp <- NULL
fd <- new("AnnotatedDataFrame", data = fdf)
pd <- new("AnnotatedDataFrame", data = new_pdata)
cicero_cds <- suppressWarnings(newCellDataSet(new_exprs,
phenoData = pd,
featureData = fd,
expressionFamily=negbinomial.size(),
lowerDetectionLimit=0))
cicero_cds <- monocle::detectGenes(cicero_cds, min_expr = .1)
cicero_cds <- estimateSizeFactorsSimp(cicero_cds)
#cicero_cds <- suppressWarnings(BiocGenerics::estimateDispersions(cicero_cds))
if (any(!c("chr", "bp1", "bp2") %in% names(fData(cicero_cds)))) {
fData(cicero_cds)$chr <- NULL
fData(cicero_cds)$bp1 <- NULL
fData(cicero_cds)$bp2 <- NULL
fData(cicero_cds) <- cbind(fData(cicero_cds),
df_for_coords(row.names(fData(cicero_cds))))
}
if (size_factor_normalize) {
Biobase::exprs(cicero_cds) <-
t(t(Biobase::exprs(cicero_cds))/Biobase::pData(cicero_cds)$Size_Factor)
}
if (return_agg_info) {
return(list(cicero_cds, agg_map))
}
cicero_cds
}
#' Run Cicero
#'
#' A wrapper function that runs the primary functions of the Cicero pipeline
#' with default parameters. Runs \code{\link{estimate_distance_parameter}},
#' \code{\link{generate_cicero_models}} and \code{\link{assemble_connections}}.
#' See the manual pages of these functions for details about their function and
#' parameter options. Defaults in this function are designed for mammalian data,
#' those with non-mammalian data should read about parameters in the above
#' functions.
#'
#' @param cds Cicero CDS object, created using \code{\link{make_cicero_cds}}
#' @param window Size of the genomic window to query, in base pairs.
#' @param silent Whether to print progress messages
#' @param sample_num How many sample genomic windows to use to generate
#' \code{distance_parameter} parameter. Default: 100.
#' @param genomic_coords Either a data frame or a path (character) to a file
#' with chromosome lengths. The file should have two columns, the first is
#' the chromosome name (ex. "chr1") and the second is the chromosome length
#' in base pairs. See \code{data(human.hg19.genome)} for an example. If a
#' file, should be tab-separated and without header.
#'
#' @return A table of co-accessibility scores
#' @export
#'
#' @examples
#' data("cicero_data")
#' data("human.hg19.genome")
#' sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#' sample_genome$V2[1] <- 100000
#' input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#' input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#' reduction_method = 'tSNE',
#' norm_method = "none")
#' tsne_coords <- t(reducedDimA(input_cds))
#' row.names(tsne_coords) <- row.names(pData(input_cds))
#' cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#' cons <- run_cicero(cicero_cds, sample_genome, sample_num = 2)
#'
run_cicero <- function(cds,
genomic_coords,
window = 500000,
silent=FALSE,
sample_num = 100) {
# Check input
assertthat::assert_that(is(cds, "CellDataSet"))
assertthat::assert_that(is.logical(silent))
assertthat::assert_that(assertthat::is.number(window))
assertthat::assert_that(assertthat::is.count(sample_num))
if (!is.data.frame(genomic_coords)) {
assertthat::is.readable(genomic_coords)
}
if (!silent) print("Starting Cicero")
if (!silent) print("Calculating distance_parameter value")
distance_parameters <- estimate_distance_parameter(cds, window=window,
maxit=100, sample_num = sample_num,
distance_constraint = 250000,
distance_parameter_convergence = 1e-22,
genomic_coords = genomic_coords)
mean_distance_parameter <- mean(unlist(distance_parameters))
if (!silent) print("Running models")
cicero_out <-
generate_cicero_models(cds,
distance_parameter = mean_distance_parameter,
window = window,
genomic_coords = genomic_coords)
if (!silent) print("Assembling connections")
all_cons <- assemble_connections(cicero_out, silent=silent)
if (!silent) print("Done")
all_cons
}
#' Calculate distance penalty parameter
#'
#' Function to calculate distance penalty parameter (\code{distance_parameter})
#' for random genomic windows. Used to choose \code{distance_parameter} to pass
#' to \code{\link{generate_cicero_models}}.
#'
#' @param cds A cicero CDS object generated using \code{\link{make_cicero_cds}}.
#' @param window Size of the genomic window to query, in base pairs.
#' @param maxit Maximum number of iterations for distance_parameter estimation.
#' @param s Power law value. See details for more information.
#' @param sample_num Number of random windows to calculate
#' \code{distance_parameter} for.
#' @param distance_constraint Maximum distance of expected connections. Must be
#' smaller than \code{window}.
#' @param distance_parameter_convergence Convergence step size for
#' \code{distance_parameter} calculation.
#' @param max_elements Maximum number of elements per window allowed. Prevents
#' very large models from slowing performance.
#' @param genomic_coords Either a data frame or a path (character) to a file
#' with chromosome lengths. The file should have two columns, the first is
#' the chromosome name (ex. "chr1") and the second is the chromosome length
#' in base pairs. See \code{data(human.hg19.genome)} for an example. If a
#' file, should be tab-separated and without header.
#' @param max_sample_windows Maximum number of random windows to screen to find
#' sample_num windows for distance calculation. Default 500.
#'
#' @examples
#' data("cicero_data")
#' data("human.hg19.genome")
#' sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#' sample_genome$V2[1] <- 100000
#' input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#' input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#' reduction_method = 'tSNE',
#' norm_method = "none")
#' tsne_coords <- t(reducedDimA(input_cds))
#' row.names(tsne_coords) <- row.names(pData(input_cds))
#' cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#' distance_parameters <- estimate_distance_parameter(cicero_cds,
#' sample_num=5,
#' genomic_coords = sample_genome)
#'
#' @seealso \code{\link{generate_cicero_models}}
#' @return A list of results of length \code{sample_num}. List members are
#' numeric \code{distance_parameter} values.
#'
#' @details The purpose of this function is to calculate the distance scaling
#' parameter used to adjust the distance-based penalty function used in
#' Cicero's model calculation. The scaling parameter, in combination with the
#' power law value \code{s} determines the distance-based penalty.
#'
#' This function chooses random windows of the genome and calculates a
#' \code{distance_parameter}. The function returns a vector of values
#' calculated on these random windows. We recommend using the mean value of
#' this vector moving forward with Cicero analysis.
#'
#' The function works by finding the minimum distance scaling parameter such
#' that no more than 5% of pairs of sites at a distance greater than
#' \code{distance_constraint} have non-zero entries after graphical lasso
#' regularization and such that fewer than 80% of all output entries are
#' nonzero.
#'
#' If the chosen random window has fewer than 2 or greater than
#' \code{max_elements} sites, the window is skipped. In addition, the random
#' window will be skipped if there are insufficient long-range comparisons
#' (see below) to be made. The \code{max_elements} parameter exist to prevent
#' very dense windows from slowing the calculation. If you expect that your
#' data may regularly have this many sites in a window, you will need to
#' raise this parameter.
#'
#' Calculating the \code{distance_parameter} in a sample window requires
#' peaks in that window that are at a distance greater than the
#' \code{distance_constraint} parameter. If there are not enough examples at
#' high distance have been found, the function will return the warning
#' \code{"Warning: could not calculate sample_num distance_parameters - see
#' documentation details"}.When looking for \code{sample_num} example
#' windows, the function will search \code{max_sample_windows} windows. By
#' default this is set at 500, which should be well beyond the 100 windows
#' that need to be found. However, in very sparse datasets, increasing
#' \code{max_sample_windows} may help avoid the above warning. Increasing
#' \code{max_sample_windows} may slow performance in sparse datasets. If you
#' are still not able to get enough example windows, even with a large
#' \code{max_sample_windows} paramter, this may mean your \code{window}
#' parameter needs to be larger or your \code{distance_constraint} parameter
#' needs to be smaller. A less likely possibility is that your
#' \code{max_elements} parameter needs to be larger. This would occur if your
#' data is particularly dense.
#'
#' The parameter \code{s} is a constant that captures the power-law
#' distribution of contact frequencies between different locations in the
#' genome as a function of their linear distance. For a complete discussion
#' of the various polymer models of DNA packed into the nucleus and of
#' justifiable values for s, we refer readers to (Dekker et al., 2013) for a
#' discussion of justifiable values for s. We use a value of 0.75 by default
#' in Cicero, which corresponds to the “tension globule” polymer model of DNA
#' (Sanborn et al., 2015). This parameter must be the same as the s parameter
#' for generate_cicero_models.
#'
#' Further details are available in the publication that accompanies this
#' package. Run \code{citation("cicero")} for publication details.
#'
#' @references
#' \itemize{
#' \item Dekker, J., Marti-Renom, M.A., and Mirny, L.A. (2013). Exploring
#' the three-dimensional organization of genomes: interpreting chromatin
#' interaction data. Nat. Rev. Genet. 14, 390–403.
#' \item Sanborn, A.L., Rao, S.S.P., Huang, S.-C., Durand, N.C., Huntley,
#' M.H., Jewett, A.I., Bochkov, I.D., Chinnappan, D., Cutkosky, A., Li, J.,
#' et al. (2015). Chromatin extrusion explains key features of loop and
#' domain formation in wild-type and engineered genomes. Proc. Natl. Acad.
#' Sci. U. S. A. 112, E6456–E6465.
#' }
#' @export
estimate_distance_parameter <- function(cds,
window=500000,
maxit=100,
s=0.75,
sample_num = 100,
distance_constraint = 250000,
distance_parameter_convergence = 1e-22,
max_elements = 200,
genomic_coords = cicero::human.hg19.genome,
max_sample_windows = 500) {
assertthat::assert_that(is(cds, "CellDataSet"))
assertthat::assert_that(assertthat::is.number(window))
assertthat::assert_that(assertthat::is.count(maxit))
assertthat::assert_that(assertthat::is.number(s), s < 1, s > 0)
assertthat::assert_that(assertthat::is.count(sample_num))
assertthat::assert_that(assertthat::is.count(distance_constraint))
assertthat::assert_that(distance_constraint < window)
assertthat::assert_that(assertthat::is.number(distance_parameter_convergence))
if (!is.data.frame(genomic_coords)) {
assertthat::is.readable(genomic_coords)
}
assertthat::assert_that(assertthat::is.count(max_sample_windows))
grs <- generate_windows(window, genomic_coords)
fData(cds)$chr <- gsub("chr", "", fData(cds)$chr)
fData(cds)$bp1 <- as.numeric(as.character(fData(cds)$bp1))
fData(cds)$bp2 <- as.numeric(as.character(fData(cds)$bp2))
distance_parameters <- list()
distance_parameters_calced <- 0
it <- 0
while(sample_num > distance_parameters_calced & it < max_sample_windows) {
it <- it + 1
win <- sample(seq_len(length(grs)), 1)
GL <- "Error"
win_range <- get_genomic_range(grs, cds, win)
if (nrow(exprs(win_range))<=1) {
next()
}
if (nrow(exprs(win_range)) > max_elements) {
next()
}
dist_matrix <- calc_dist_matrix(win_range)
distance_parameter <- find_distance_parameter(dist_matrix,
win_range,
maxit = maxit,
null_rho = 0,
s,
distance_constraint = distance_constraint,
distance_parameter_convergence =
distance_parameter_convergence)
if (!is(distance_parameter, "numeric")) next()
distance_parameters = c(distance_parameters, distance_parameter)
distance_parameters_calced <- distance_parameters_calced + 1
}
if(length(distance_parameters) < sample_num)
warning(paste0("Could not calculate sample_num distance_parameters (",
length(distance_parameters), " were calculated) - see ",
"documentation details"))
if(length(distance_parameters) == 0)
stop("No distance_parameters calculated")
unlist(distance_parameters)
}
#' Generate cicero models
#'
#' Function to generate graphical lasso models on all sites in a CDS object
#' within overlapping genomic windows.
#'
#' @param cds A cicero CDS object generated using \code{\link{make_cicero_cds}}.
#' @param distance_parameter Distance based penalty parameter value. Generally,
#' the mean of the calculated \code{distance_parameter} values from
#' \code{\link{estimate_distance_parameter}}.
#' @param s Power law value. See details.
#' @param window Size of the genomic window to query, in base pairs.
#' @param max_elements Maximum number of elements per window allowed. Prevents
#' very large models from slowing performance.
#' @param genomic_coords Either a data frame or a path (character) to a file
#' with chromosome lengths. The file should have two columns, the first is
#' the chromosome name (ex. "chr1") and the second is the chromosome length
#' in base pairs. See \code{data(human.hg19.genome)} for an example. If a
#' file, should be tab-separated and without header.
#'
#' @details The purpose of this function is to compute the raw covariances
#' between each pair of sites within overlapping windows of the genome.
#' Within each window, the function then estimates a regularized correlation
#' matrix using the graphical LASSO (Friedman et al., 2008), penalizing pairs
#' of distant sites more than proximal sites. The scaling parameter,
#' \code{distance_parameter}, in combination with the power law value \code{s}
#' determines the distance-based penalty.
#'
#' The parameter \code{s} is a constant that captures the power-law
#' distribution of contact frequencies between different locations in the
#' genome as a function of their linear distance. For a complete discussion
#' of the various polymer models of DNA packed into the nucleus and of
#' justifiable values for s, we refer readers to (Dekker et al., 2013) for a
#' discussion of justifiable values for s. We use a value of 0.75 by default
#' in Cicero, which corresponds to the “tension globule” polymer model of DNA
#' (Sanborn et al., 2015). This parameter must be the same as the s parameter
#' for \code{\link{estimate_distance_parameter}}.
#'
#' Further details are available in the publication that accompanies this
#' package. Run \code{citation("cicero")} for publication details.
#'
#' @return A list of results for each window. Either a \code{glasso} object, or
#' a character description of why the window was skipped. This list can be
#' directly input into \code{\link{assemble_connections}} to create a
#' reconciled list of cicero co-accessibility scores.
#' @examples
#' data("cicero_data")
#' data("human.hg19.genome")
#' sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#' sample_genome$V2[1] <- 100000
#' input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#' input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#' reduction_method = 'tSNE',
#' norm_method = "none")
#' tsne_coords <- t(reducedDimA(input_cds))
#' row.names(tsne_coords) <- row.names(pData(input_cds))
#' cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#' model_output <- generate_cicero_models(cicero_cds,
#' distance_parameter = 0.3,
#' genomic_coords = sample_genome)
#'
#' @references
#' \itemize{
#' \item Dekker, J., Marti-Renom, M.A., and Mirny, L.A. (2013). Exploring
#' the three-dimensional organization of genomes: interpreting chromatin
#' interaction data. Nat. Rev. Genet. 14, 390–403.
#' \item Friedman, J., Hastie, T., and Tibshirani, R. (2008). Sparse
#' inverse covariance estimation with the graphical lasso. Biostatistics 9,
#' 432–441.
#' \item Sanborn, A.L., Rao, S.S.P., Huang, S.-C., Durand, N.C., Huntley,
#' M.H., Jewett, A.I., Bochkov, I.D., Chinnappan, D., Cutkosky, A., Li, J.,
#' et al. (2015). Chromatin extrusion explains key features of loop and
#' domain formation in wild-type and engineered genomes. Proc. Natl. Acad.
#' Sci. U. S. A. 112, E6456–E6465.
#' }
#'
#' @seealso \code{\link{estimate_distance_parameter}}
#' @export
#'
generate_cicero_models <- function(cds,
distance_parameter,
s = 0.75,
window = 500000,
max_elements = 200,
genomic_coords = cicero::human.hg19.genome) {
assertthat::assert_that(is(cds, "CellDataSet"))
assertthat::assert_that(assertthat::is.number(distance_parameter))
assertthat::assert_that(assertthat::is.number(s), s < 1, s > 0)
assertthat::assert_that(assertthat::is.number(window))
assertthat::assert_that(assertthat::is.count(max_elements))
if (!is.data.frame(genomic_coords)) {
assertthat::is.readable(genomic_coords)
}
grs <- generate_windows(window, genomic_coords)
fData(cds)$chr <- gsub("chr", "", fData(cds)$chr)
fData(cds)$bp1 <- as.numeric(as.character(fData(cds)$bp1))
fData(cds)$bp2 <- as.numeric(as.character(fData(cds)$bp2))
outlist <- parallel::mclapply(seq_len(length(grs)), mc.cores = 1, function(win) {
GL <- "Error"
win_range <- get_genomic_range(grs, cds, win)
if (nrow(exprs(win_range))<=1) {
return("Zero or one element in range")
}
if (nrow(exprs(win_range)) > max_elements) {
return("Too many elements in range")
}
dist_matrix <- calc_dist_matrix(win_range)
rho_mat <- get_rho_mat(dist_matrix, distance_parameter, s)
vals <- exprs(win_range)
cov_mat <- cov(t(vals))
diag(cov_mat) <- diag(cov_mat) + 1e-4
GL <- glasso::glasso(cov_mat, rho_mat)
colnames(GL$w) <- row.names(GL$w) <- row.names(vals)
colnames(GL$wi) <- row.names(GL$wi) <- row.names(vals)
return(GL)
})
names_df <- as.data.frame(grs)
names(outlist) <- paste(names_df$seqnames,
names_df$start,
names_df$end, sep="_")
#FIXME add warning about how many regions removed due to too many elements
outlist
}
#' Combine and reconcile cicero models
#'
#' Function which takes the output of \code{\link{generate_cicero_models}} and
#' assembles the connections into a data frame with cicero co-accessibility
#' scores.
#'
#' This function combines glasso models computed on overlapping windows of the
#' genome. Pairs of sites whose regularized correlation was calculated twice
#' are first checked for qualitative concordance (both zero, positive or
#' negative). If they not concordant, NA is returned. If they are concordant
#' the mean is returned.
#'
#' @param cicero_model_list A list of cicero output objects, generally, the
#' output of \code{\link{generate_cicero_models}}.
#' @param silent Logical, should the function run silently?
#'
#' @return A data frame of connections with their cicero co-accessibility
#' scores.
#' @examples
#' data("cicero_data")
#' data("human.hg19.genome")
#' sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#' sample_genome$V2[1] <- 100000
#' input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#' input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#' reduction_method = 'tSNE',
#' norm_method = "none")
#' tsne_coords <- t(reducedDimA(input_cds))
#' row.names(tsne_coords) <- row.names(pData(input_cds))
#' cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#' model_output <- generate_cicero_models(cicero_cds,
#' distance_parameter = 0.3,
#' genomic_coords = sample_genome)
#' cicero_cons <- assemble_connections(model_output)
#'
#' @seealso \code{\link{generate_cicero_models}}
#' @importFrom data.table melt.data.table
#' @export
assemble_connections <- function(cicero_model_list, silent = FALSE) {
types <- vapply(cicero_model_list, FUN=class, FUN.VALUE="character")
char_hbn <- cicero_model_list[types=="character"]
gl_only <- cicero_model_list[types=="list"]
if(!silent) {
print(paste("Successful cicero models: ", length(gl_only)))
print("Other models: ")
print(table(unlist(char_hbn)))
print(paste("Models with errors: ", sum(is.null(cicero_model_list))))
}
cors <- lapply(gl_only, function(gl) {
cors <- stats::cov2cor(gl$w)
data.table::melt(as.data.table(cors, keep.rownames=TRUE),
measure=patterns("[0-9]"))
})
cors <- data.table::rbindlist(cors)
names(cors) <- c("Var1", "Var2", "value")
data.table::setkey(cors, "Var1", "Var2")
cors_rec <- as.data.frame(cors[,list(mean_coaccess = reconcile(value)),
by="Var1,Var2"])
names(cors_rec) <- c("Peak1", "Peak2", "coaccess")
cors_rec <- cors_rec[cors_rec$Peak1 != cors_rec$Peak2,]
return(cors_rec)
}
reconcile <- function(values) {
if (length(values) == 1) return(values)
if (sum(values >= 0) == length(values)) return(mean(values))
if (sum(values <= 0) == length(values)) return(mean(values))
if (sum(values == 0) == length(values)) return(0)
return(NA_real_)
}
generate_windows <- function(window, genomic_coords) {
if(!is(genomic_coords, "data.frame")) {
chr_maxes <- read.table(genomic_coords)
} else {
chr_maxes <- genomic_coords
}
names(chr_maxes) <- c("V1", "V2")
win_ranges <- plyr::ddply(chr_maxes, plyr::.(V1), function(x) {
r <- seq(from = 1, to = x$V2[1], by = window/2)
l <- r + window - 1
data.frame(start = r, end = l)
})
gr <- GenomicRanges::GRanges(win_ranges$V1,
ranges=IRanges::IRanges(win_ranges$start,
win_ranges$end))
return(gr)
}
get_genomic_range <- function(grs, cds, win) {
end1 <- as.numeric(as.character(GenomicRanges::end(grs[win])))
end2 <- as.numeric(as.character(GenomicRanges::start(grs[win])))
win_range <- cds[(fData(cds)$bp1 < end1 &
fData(cds)$bp1 > end2) |
(fData(cds)$bp2 < end1 &
fData(cds)$bp2 > end2), ]
win_range <-
win_range[as.character(fData(win_range)$chr) ==
gsub("chr", "",
as.character(GenomicRanges::seqnames(grs[win]))),]
fData(win_range)$mean_bp <-
(as.numeric(as.character(fData(win_range)$bp1)) +
as.numeric(as.character(fData(win_range)$bp2)))/2
return(win_range)
}
find_distance_parameter <- function(dist_mat,
gene_range,
maxit,
null_rho,
s,
distance_constraint,
distance_parameter_convergence) {
if (sum(dist_mat > distance_constraint)/2 < 1) {
return("No long edges")
}
found <- FALSE
starting_max <- 2
distance_parameter <- 2
distance_parameter_max <- 2
distance_parameter_min <- 0
it <- 0
while(found != TRUE & it < maxit) {
vals <- exprs(gene_range)
cov_mat <- cov(t(vals))
diag(cov_mat) <- diag(cov_mat) + 1e-4
rho <- get_rho_mat(dist_mat, distance_parameter, s)
GL <- glasso::glasso(cov_mat, rho)
big_entries <- sum(dist_mat > distance_constraint)
if (((sum(GL$wi[dist_mat > distance_constraint] != 0)/big_entries) > 0.05) |
(sum(GL$wi == 0)/(nrow(GL$wi)^2) < 0.2 ) ) {
longs_zero <- FALSE
} else {
longs_zero <- TRUE
}
if (longs_zero != TRUE | (distance_parameter == 0)) {
distance_parameter_min <- distance_parameter
} else {
distance_parameter_max <- distance_parameter
}
new_distance_parameter <- (distance_parameter_min +
distance_parameter_max)/2
if(new_distance_parameter == starting_max) {
new_distance_parameter <- 2 * starting_max
starting_max <- new_distance_parameter
}
if (distance_parameter_convergence > abs(distance_parameter -
new_distance_parameter)) {
found <- TRUE
} else {
distance_parameter <- new_distance_parameter
}
it <- it + 1
}
if (maxit == it) warning("maximum iterations hit")
return(distance_parameter)
}
get_rho_mat <- function(dist_matrix, distance_parameter, s) {
xmin <- 1000
out <- (1-(xmin/dist_matrix)^s) * distance_parameter
out[!is.finite(out)] <- 0
out[out < 0] <- 0
return(out)
}
calc_dist_matrix <- function(gene_range) {
dist_mat <- as.matrix(dist(fData(gene_range)$mean_bp))
row.names(dist_mat) <- colnames(dist_mat) <- row.names(fData(gene_range))
return(dist_mat)
}
make_ccan_graph <- function(connections_df, coaccess_cutoff) {
connections_df <- as.data.frame(connections_df)
#make graph
cons_info_gr <- connections_df[!is.na(connections_df$coaccess) &
connections_df$coaccess > coaccess_cutoff,]
if(nrow(cons_info_gr) == 0) stop("No connections for graph")
cons_graph <- make_sparse_matrix(cons_info_gr, x.name = "coaccess")
site_graph <- igraph::graph.adjacency(cons_graph,
mode = "undirected",
weighted = TRUE)
return(site_graph)
}
#' Generate cis-co-accessibility networks (CCANs)
#'
#' Post process cicero co-accessibility scores to extract modules of sites that
#' are co-accessible.
#'
#' @param connections_df Data frame of connections with columns: Peak1, Peak2,
#' coaccess. Generally, the output of \code{\link{run_cicero}} or
#' \code{\link{assemble_connections}}
#' @param coaccess_cutoff_override Numeric, co-accessibility score threshold to
#' impose. Overrides automatic calculation.
#' @param tolerance_digits The number of digits to calculate cutoff to. Default
#' is 2 (0.01 tolerance)
#'
#' @details CCANs are calculated by first specifying a minimum co-accessibility
#' score and then using the Louvain community detection algorithm on the
#' subgraph induced by excluding edges below this score. For this function,
#' either the user can specify the minimum co-accessibility using
#' \code{coaccess_cutoff_override}, or the cutoff can be calculated
#' automatically by optimizing for CCAN number. The cutoff calculation can be
#' slow, so users may wish to use the \code{coaccess_cutoff_override} after
#' initially calculating the cutoff to speed future runs.
#'
#' @return Data frame with two columns - Peak and CCAN. CCAN column indicates
#' CCAN assignment. Peaks not included in a CCAN are not returned.
#' @export
#'
#' @examples
#' \dontrun{
#' data("cicero_data")
#' set.seed(18)
#' data("human.hg19.genome")
#' sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#' sample_genome$V2[1] <- 100000
#' input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#' input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#' reduction_method = 'tSNE',
#' norm_method = "none")
#' tsne_coords <- t(reducedDimA(input_cds))
#' row.names(tsne_coords) <- row.names(pData(input_cds))
#' cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#' cicero_cons <- run_cicero(cicero_cds, sample_genome, sample_num = 2)
#' ccan_assigns <- generate_ccans(cicero_cons)
#' }
#'
generate_ccans <- function(connections_df,
coaccess_cutoff_override = NULL,
tolerance_digits = 2) {
assertthat::assert_that(is.data.frame(connections_df))
assertthat::assert_that(assertthat::has_name(connections_df, "Peak1"),
assertthat::has_name(connections_df, "Peak2"),
assertthat::has_name(connections_df, "coaccess"))
assertthat::assert_that(assertthat::is.number(tolerance_digits))
assertthat::assert_that(assertthat::is.number(coaccess_cutoff_override) |
is.null(coaccess_cutoff_override),
msg = paste("coaccess_cutoff_override must be a",
"number or NULL", collapse = " "))
if (!is.null(coaccess_cutoff_override)) {
assertthat::assert_that(coaccess_cutoff_override <= 1 &
coaccess_cutoff_override >= 0,
msg = paste("coaccess_cutoff_override must be",
"between 0 and 1 (or NULL)",
collapse = " "))
}
if (!is.null(coaccess_cutoff_override)) {
coaccess_cutoff <- coaccess_cutoff_override
} else {
coaccess_cutoff <- find_ccan_cutoff(connections_df, tolerance_digits)
}
print(paste("Coaccessibility cutoff used:", coaccess_cutoff))
ccan_graph <- make_ccan_graph(connections_df,
coaccess_cutoff = coaccess_cutoff)
comp_membership <- igraph::cluster_louvain(ccan_graph)
sizes <- igraph::sizes(comp_membership) > 2
comps_list <- unlist(as.list(igraph::membership(comp_membership)))
df <- data.frame(Peak = names(comps_list), CCAN = comps_list)
df$CCAN[!df$CCAN %in% names(sizes[sizes])] <- NA
df <- df[!is.na(df$CCAN),]
return(df)
}
find_ccan_cutoff <- function(connection_df, tolerance_digits) {
connection_df <- connection_df[connection_df$coaccess > 0,]
tolerance <- 10^-(tolerance_digits)
bottom <- 0
top <- max(connection_df$coaccess, na.rm = TRUE)
while ((top - bottom) > tolerance) {
test_val <- bottom + round((top - bottom)/2, digits = tolerance_digits + 1)
ccan_num_test <- number_of_ccans(connection_df, test_val)
next_step <- test_val
repeat{
next_step <- next_step + (top - bottom)/10
ccan_num_test2 <- number_of_ccans(connection_df, next_step)
if(ccan_num_test2 != ccan_num_test){
break
}
}
if (ccan_num_test > ccan_num_test2) {
top <- test_val
} else {
bottom <- test_val
}
}
return(round((top + bottom)/2, digits = tolerance_digits))
}
number_of_ccans <- function(connections_df, coaccess_cutoff) {
ccan_graph <- make_ccan_graph(connections_df,
coaccess_cutoff = coaccess_cutoff)
comp_membership <- igraph::cluster_louvain(ccan_graph)
return(sum(igraph::sizes(comp_membership) > 2))
}
#' Find CCANs that overlap each other in genomic coordinates
#'
#' @param ccan_assignments A data frame where the first column is the peak and
#' the second is the CCAN assignment. For example, output of
#' \code{generate_ccans}.
#' @param min_overlap The minimum base pair overlap to count as overlapping.
#'
#' @return A data frame with two columns, CCAN1 and CCAN2. CCANs in this list
#' are overlapping. The data frame is reciprocal (if CCAN 2 overlaps CCAN 1,
#' there will be two rows, 1,2 and 2,1).
#'
#' @examples
#' ccan_df <- data.frame(peak = c("chr18_1408345_1408845", "chr18_1779830_1780330",
#' "chr18_1929095_1929595", "chr18_1954501_1954727",
#' "chr18_2049865_2050884", "chr18_2083726_2084102",
#' "chr18_2087935_2088622", "chr18_2104705_2105551",
#' "chr18_2108641_2108907"),
#' CCAN = c(1,2,2,2,3,3,3,3,2))
#' olap_ccans <- find_overlapping_ccans(ccan_df)
#'
#'
#' @export
find_overlapping_ccans <- function(ccan_assignments, min_overlap=1) {
ccan_assignments <- ccan_assignments[,c(1,2)]
names(ccan_assignments) <- c("Peak", "CCAN")
ccans <- df_for_coords(ccan_assignments$Peak)
ccans$CCAN <- ccan_assignments$CCAN
ccan_info <- plyr::ddply(ccans, plyr::.(CCAN), function(ccan) {
return(data.frame(ccan_coords = paste(ccan$chr[1], bp1 = min(ccan$bp1),
bp2 = max(ccan$bp2), sep="_")))
})
ccan_ranges <- ranges_for_coords(ccan_info$ccan_coords,
meta_data_df = ccan_info)
ol <- GenomicRanges::findOverlaps(ccan_ranges, ccan_ranges,
minoverlap=min_overlap, #maxgap = 0,
select="all")
olaps <- data.frame(
CCAN1 = GenomicRanges::mcols(ccan_ranges[
S4Vectors::queryHits(ol)])@listData$CCAN,
CCAN2 = GenomicRanges::mcols(ccan_ranges[
S4Vectors::subjectHits(ol)])@listData$CCAN)
olaps <- olaps[!duplicated(olaps),]
olaps <- olaps[olaps$CCAN1 != olaps$CCAN2, ]
return(olaps)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.