R/runCicero.R

Defines functions generate_windows reconcile assemble_connections generate_cicero_models estimate_distance_parameter run_cicero make_cicero_cds

Documented in assemble_connections estimate_distance_parameter generate_cicero_models make_cicero_cds run_cicero

#' Create cicero input CDS
#'
#' Function to generate an aggregated input CDS for cicero. \code{run_cicero}
#' takes as input an aggregated cicero CDS object. This function will generate
#' the CDS given an input CDS (perhaps generated by  \code{make_atac_cds}) and
#' a value for k, which is the number of cells to be aggregated per bin. The
#' default value for k is 50.
#'
#' @param cds Input CDS object.
#' @param reduced_coordinates A data frame with columns representing the
#'   coordinates of each cell in reduced dimension space (generally 2-3
#'   dimensions). \code{row.names(reduced_coordinates)} should match the cell
#'   names in the CDS object. If dimension reduction was done using monocle,
#'   tSNE coordinates can be accessed by \code{t(reducedDimA(cds))}, and
#'   DDRTree coordinates can be accessed by \code{t(reducedDimS(cds))}.
#' @param k Number of cells to aggregate per bin.
#' @param summary_stats Which numeric \code{pData(cds)} columns you would like
#'   summarized (mean) by bin in the resulting CDS object.
#' @param size_factor_normalize Logical, should accessibility values be
#'   normalized by size factor?
#' @param silent Logical, should warning and info messages be printed?
#' @param return_agg_info Logical, should a list of the assignments of cells to
#'   aggregated bins be output? When \code{TRUE}, this function returns a list 
#'   of two items, first, the aggregated CDS object and second, a data.frame 
#'   with the binning information.
#'
#' @details Aggregation of similar cells is done using a k-nearest-neighbors
#'   graph and a randomized "bagging" procedure. Details are available in the
#'   publication that accompanies this package. Run \code{citation("cicero")}
#'   for publication details. KNN is calculated using
#'   \code{\link[FNN]{knn.index}}
#'
#' @return Aggregated CDS object. If return_agg_info is \code{TRUE}, a list 
#'   of the aggregated CDS object and a data.frame of aggregation info.
#' @export
#'
#' @examples
#' \dontrun{
#'   data("cicero_data")
#'
#'   input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#'   input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#'                                reduction_method = 'tSNE',
#'                                norm_method = "none")
#'   tsne_coords <- t(reducedDimA(input_cds))
#'   row.names(tsne_coords) <- row.names(pData(input_cds))
#'   cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#' }
#'
make_cicero_cds <- function(cds,
                            reduced_coordinates,
                            k=50,
                            summary_stats = NULL,
                            size_factor_normalize = TRUE,
                            silent = FALSE, 
                            return_agg_info = FALSE) {
  
  assertthat::assert_that(is(cds, "CellDataSet"))
  assertthat::assert_that(is.data.frame(reduced_coordinates) |
                            is.matrix(reduced_coordinates))
  assertthat::assert_that(assertthat::are_equal(nrow(reduced_coordinates),
                                                nrow(pData(cds))))
  assertthat::assert_that(setequal(row.names(reduced_coordinates),
                                   colnames(cds)))
  assertthat::assert_that(assertthat::is.count(k) & k > 1)
  assertthat::assert_that(is.character(summary_stats) | is.null(summary_stats))
  if(!is.null(summary_stats)) {
    assertthat::assert_that(all(summary_stats %in% names(pData(cds))),
                            msg = paste("One of your summary_stats is missing",
                                        "from your pData table. Either add a",
                                        "column with the name in",
                                        "summary_stats, or remove the name",
                                        "from the summary_stats parameter.",
                                        collapse = " "))
    assertthat::assert_that(sum(vapply(summary_stats, function(x) {
      !(is(pData(cds)[,x], "numeric") | is(pData(cds)[,x], "integer"))}, 1)) == 0,
                            msg = paste("All columns in summary_stats must be",
                                        "of class numeric or integer.",
                                        collapse = " "))
  }
  assertthat::assert_that(is.logical(size_factor_normalize))
  assertthat::assert_that(is.logical(silent))
  assertthat::assert_that(is.logical(return_agg_info))
  
  reduced_coordinates <- as.data.frame(reduced_coordinates)
  
  reduced_coordinates <- reduced_coordinates[colnames(cds),]
  
  # Create a k-nearest neighbors map
  nn_map <- FNN::knn.index(reduced_coordinates, k=(k-1)) # no data.frame wrapper
  
  row.names(nn_map) <- row.names(reduced_coordinates)
  
  nn_map <- cbind(nn_map, seq_len(nrow(nn_map)))
  
  good_choices <- seq_len(nrow(nn_map))
  choice <- sample(seq_len(length(good_choices)), size = 1, replace = FALSE)
  chosen <- good_choices[choice]
  good_choices <- good_choices[good_choices != good_choices[choice]]
  it <- 0
  k2 <- k * 2 # Compute once
  
  # function for sapply
  get_shared <- function(other, this_choice) {
    k2 - length(union(cell_sample[other,], this_choice))
  }
  
  while (length(good_choices) > 0 & it < 5000) { # slow
    it <- it + 1
    choice <- sample(seq_len(length(good_choices)), size = 1, replace = FALSE)
    new_chosen <- c(chosen, good_choices[choice])
    good_choices <- good_choices[good_choices != good_choices[choice]]
    cell_sample <- nn_map[new_chosen,]

    others <- seq_len(nrow(cell_sample) - 1)
    this_choice <- cell_sample[nrow(cell_sample),]
    shared <- sapply(others, get_shared, this_choice = this_choice)
    
    if (max(shared) < .9 * k) {
      chosen <- new_chosen
    }
  }
  
  cell_sample <- nn_map[chosen,]
  
  if(!silent) {
    # Only need this slow step if !silent
    combs <- combn(nrow(cell_sample), 2)
    
    shared <- apply(combs, 2, function(x) {  #slow
      k2 - length(unique(as.vector(cell_sample[x,])))
    })
    
    message(paste0("Overlap QC metrics:\nCells per bin: ", k,
                   "\nMaximum shared cells bin-bin: ", max(shared),
                   "\nMean shared cells bin-bin: ", mean(shared),
                   "\nMedian shared cells bin-bin: ", median(shared)))
    
    if (mean(shared)/k > .1)
      warning("On average, more than 10% of cells are shared between paired bins.")
  }
  
  exprs_old <- exprs(cds)
  
  mask <- sapply(seq_len(nrow(cell_sample)), 
                 function(x) seq_len(ncol(exprs_old)) %in% cell_sample[x,,drop=FALSE])
  
  if (return_agg_info) {
    row.names(mask) <- colnames(exprs_old)
    colnames(mask) <- paste0("agg_", chosen)
    agg_map <- reshape2::melt(mask)
    agg_map <- agg_map[agg_map$value,]
    agg_map$value <- NULL
    names(agg_map) <- c("cell", "agg_cell")
  }
  
  mask <- Matrix::Matrix(mask)
  new_exprs <- exprs_old %*% mask
  
  new_exprs <- Matrix::t(new_exprs)
  new_exprs <- as.matrix(new_exprs)
  
  pdata <- pData(cds)
  new_pcols <- "agg_cell"
  if(!is.null(summary_stats)) { 
    new_pcols <- c(new_pcols, paste0("mean_",summary_stats)) 
  }
  
  new_pdata <- plyr::adply(cell_sample,1, function(x) {
    sub <- pdata[x,]
    df_l <- list()
    df_l["temp"] <- 1
    for (att in summary_stats) {
      df_l[paste0("mean_", att)] <- mean(sub[,att])
    }
    data.frame(df_l)
  })
  
  new_pdata$agg_cell <- paste("agg", chosen, sep="")
  new_pdata <- new_pdata[,new_pcols, drop = FALSE] # fixes order, drops X1 and temp
  
  row.names(new_pdata) <- new_pdata$agg_cell
  row.names(new_exprs) <- new_pdata$agg_cell
  new_exprs <- as.matrix(t(new_exprs))
  
  fdf <- fData(cds)
  new_pdata$temp <- NULL
  
  fd <- new("AnnotatedDataFrame", data = fdf)
  pd <- new("AnnotatedDataFrame", data = new_pdata)
  
  cicero_cds <-  suppressWarnings(newCellDataSet(new_exprs,
                                                 phenoData = pd,
                                                 featureData = fd,
                                                 expressionFamily=negbinomial.size(),
                                                 lowerDetectionLimit=0))
  
  cicero_cds <- monocle::detectGenes(cicero_cds, min_expr = .1)
  cicero_cds <- estimateSizeFactorsSimp(cicero_cds)
  #cicero_cds <- suppressWarnings(BiocGenerics::estimateDispersions(cicero_cds))

  if (any(!c("chr", "bp1", "bp2") %in% names(fData(cicero_cds)))) {
    fData(cicero_cds)$chr <- NULL
    fData(cicero_cds)$bp1 <- NULL
    fData(cicero_cds)$bp2 <- NULL
    fData(cicero_cds) <- cbind(fData(cicero_cds),
                               df_for_coords(row.names(fData(cicero_cds))))
  }
  
  if (size_factor_normalize) {
    Biobase::exprs(cicero_cds) <-
      t(t(Biobase::exprs(cicero_cds))/Biobase::pData(cicero_cds)$Size_Factor)
  }
  
  if (return_agg_info) {
    return(list(cicero_cds, agg_map))
  }
  cicero_cds
}


#' Run Cicero
#'
#' A wrapper function that runs the primary functions of the Cicero pipeline
#' with default parameters. Runs \code{\link{estimate_distance_parameter}},
#' \code{\link{generate_cicero_models}} and \code{\link{assemble_connections}}.
#' See the manual pages of these functions for details about their function and
#' parameter options. Defaults in this function are designed for mammalian data,
#' those with non-mammalian data should read about parameters in the above
#' functions.
#'
#' @param cds Cicero CDS object, created using \code{\link{make_cicero_cds}}
#' @param window Size of the genomic window to query, in base pairs.
#' @param silent Whether to print progress messages
#' @param sample_num How many sample genomic windows to use to generate
#'   \code{distance_parameter} parameter. Default: 100.
#' @param genomic_coords Either a data frame or a path (character) to a file
#'   with chromosome lengths. The file should have two columns, the first is
#'   the chromosome name (ex. "chr1") and the second is the chromosome length
#'   in base pairs. See \code{data(human.hg19.genome)} for an example. If a
#'   file, should be tab-separated and without header.
#'
#' @return A table of co-accessibility scores
#' @export
#'
#' @examples
#'   data("cicero_data")
#'   data("human.hg19.genome")
#'   sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#'   sample_genome$V2[1] <- 100000
#'   input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#'   input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#'                                reduction_method = 'tSNE',
#'                                norm_method = "none")
#'   tsne_coords <- t(reducedDimA(input_cds))
#'   row.names(tsne_coords) <- row.names(pData(input_cds))
#'   cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#'   cons <- run_cicero(cicero_cds, sample_genome, sample_num = 2)
#'
run_cicero <- function(cds,
                       genomic_coords,
                       window = 500000,
                       silent=FALSE,
                       sample_num = 100) {
  # Check input
  assertthat::assert_that(is(cds, "CellDataSet"))
  assertthat::assert_that(is.logical(silent))
  assertthat::assert_that(assertthat::is.number(window))
  assertthat::assert_that(assertthat::is.count(sample_num))
  if (!is.data.frame(genomic_coords)) {
    assertthat::is.readable(genomic_coords)
  }

  if (!silent) print("Starting Cicero")
  if (!silent) print("Calculating distance_parameter value")
  distance_parameters <- estimate_distance_parameter(cds, window=window,
                                  maxit=100, sample_num = sample_num,
                                   distance_constraint = 250000,
                                   distance_parameter_convergence = 1e-22,
                                   genomic_coords = genomic_coords)

  mean_distance_parameter <- mean(unlist(distance_parameters))

  if (!silent) print("Running models")
  cicero_out <-
    generate_cicero_models(cds,
                           distance_parameter = mean_distance_parameter,
                           window = window,
                           genomic_coords = genomic_coords)

  if (!silent) print("Assembling connections")
  all_cons <- assemble_connections(cicero_out, silent=silent)

  if (!silent) print("Done")
  all_cons
  }


#' Calculate distance penalty parameter
#'
#' Function to calculate distance penalty parameter (\code{distance_parameter})
#' for random genomic windows. Used to choose \code{distance_parameter} to pass
#' to \code{\link{generate_cicero_models}}.
#'
#' @param cds A cicero CDS object generated using \code{\link{make_cicero_cds}}.
#' @param window Size of the genomic window to query, in base pairs.
#' @param maxit Maximum number of iterations for distance_parameter estimation.
#' @param s Power law value. See details for more information.
#' @param sample_num Number of random windows to calculate
#'   \code{distance_parameter} for.
#' @param distance_constraint Maximum distance of expected connections. Must be
#'   smaller than \code{window}.
#' @param distance_parameter_convergence Convergence step size for
#'   \code{distance_parameter} calculation.
#' @param max_elements Maximum number of elements per window allowed. Prevents
#'   very large models from slowing performance.
#' @param genomic_coords Either a data frame or a path (character) to a file
#'   with chromosome lengths. The file should have two columns, the first is
#'   the chromosome name (ex. "chr1") and the second is the chromosome length
#'   in base pairs. See \code{data(human.hg19.genome)} for an example. If a
#'   file, should be tab-separated and without header.
#' @param max_sample_windows Maximum number of random windows to screen to find
#'   sample_num windows for distance calculation. Default 500.
#'
#' @examples
#'   data("cicero_data")
#'   data("human.hg19.genome")
#'   sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#'   sample_genome$V2[1] <- 100000
#'   input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#'   input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#'                                reduction_method = 'tSNE',
#'                                norm_method = "none")
#'   tsne_coords <- t(reducedDimA(input_cds))
#'   row.names(tsne_coords) <- row.names(pData(input_cds))
#'   cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#'   distance_parameters <- estimate_distance_parameter(cicero_cds,
#'                                                      sample_num=5,
#'                                                      genomic_coords = sample_genome)
#'
#' @seealso \code{\link{generate_cicero_models}}
#' @return A list of results of length \code{sample_num}. List members are
#'   numeric \code{distance_parameter} values.
#'
#' @details The purpose of this function is to calculate the distance scaling
#'   parameter used to adjust the distance-based penalty function used in
#'   Cicero's model calculation. The scaling parameter, in combination with the
#'   power law value \code{s} determines the distance-based penalty.
#'
#'   This function chooses random windows of the genome and calculates a
#'   \code{distance_parameter}. The function returns a vector of values
#'   calculated on these random windows. We recommend using the mean value of
#'   this vector moving forward with Cicero analysis.
#'
#'   The function works by finding the minimum distance scaling parameter such
#'   that no more than 5% of pairs of sites at a distance greater than
#'   \code{distance_constraint} have non-zero entries after graphical lasso
#'   regularization and such that fewer than 80% of all output entries are
#'   nonzero.
#'
#'   If the chosen random window has fewer than 2 or greater than
#'   \code{max_elements} sites, the window is skipped. In addition, the random
#'   window will be skipped if there are insufficient long-range comparisons
#'   (see below) to be made. The \code{max_elements} parameter exist to prevent
#'   very dense windows from slowing the calculation. If you expect that your
#'   data may regularly have this many sites in a window, you will need to
#'   raise this parameter.
#'
#'   Calculating the \code{distance_parameter} in a sample window requires
#'   peaks in that window that are at a distance greater than the
#'   \code{distance_constraint} parameter. If there are not enough examples at
#'   high distance have been found, the function will return the warning 
#'   \code{"Warning: could not calculate sample_num distance_parameters - see 
#'   documentation details"}.When looking for \code{sample_num} example 
#'   windows, the function will search \code{max_sample_windows} windows. By 
#'   default this is set at 500, which should be well beyond the 100 windows 
#'   that need to be found. However, in very sparse datasets, increasing 
#'   \code{max_sample_windows} may help avoid the above warning. Increasing 
#'   \code{max_sample_windows} may slow performance in sparse datasets. If you 
#'   are still not able to get enough example windows, even with a large
#'   \code{max_sample_windows} paramter, this may mean your \code{window} 
#'   parameter needs to be larger or your \code{distance_constraint} parameter 
#'   needs to be smaller. A less likely possibility is that your 
#'   \code{max_elements} parameter needs to be larger. This would occur if your 
#'   data is particularly dense.
#'
#'   The parameter \code{s} is a constant that captures the power-law
#'   distribution of contact frequencies between different locations in the
#'   genome as a function of their linear distance. For a complete discussion
#'   of the various polymer models of DNA packed into the nucleus and of
#'   justifiable values for s, we refer readers to (Dekker et al., 2013) for a
#'   discussion of justifiable values for s. We use a value of 0.75 by default
#'   in Cicero, which corresponds to the “tension globule” polymer model of DNA
#'   (Sanborn et al., 2015). This parameter must be the same as the s parameter
#'   for generate_cicero_models.
#'
#'   Further details are available in the publication that accompanies this
#'   package. Run \code{citation("cicero")} for publication details.
#'
#' @references
#'   \itemize{
#'     \item Dekker, J., Marti-Renom, M.A., and Mirny, L.A. (2013). Exploring
#'     the three-dimensional organization of genomes: interpreting chromatin
#'     interaction data. Nat. Rev. Genet. 14, 390–403.
#'     \item Sanborn, A.L., Rao, S.S.P., Huang, S.-C., Durand, N.C., Huntley,
#'     M.H., Jewett, A.I., Bochkov, I.D., Chinnappan, D., Cutkosky, A., Li, J.,
#'     et al. (2015). Chromatin extrusion explains key features of loop and
#'     domain formation in wild-type and engineered genomes. Proc. Natl. Acad.
#'     Sci. U. S. A. 112, E6456–E6465.
#'   }
#' @export
estimate_distance_parameter <- function(cds,
                                   window=500000,
                                   maxit=100,
                                   s=0.75,
                                   sample_num = 100,
                                   distance_constraint = 250000,
                                   distance_parameter_convergence = 1e-22,
                                   max_elements = 200,
                                   genomic_coords = cicero::human.hg19.genome,
                                   max_sample_windows = 500) {

  assertthat::assert_that(is(cds, "CellDataSet"))
  assertthat::assert_that(assertthat::is.number(window))
  assertthat::assert_that(assertthat::is.count(maxit))
  assertthat::assert_that(assertthat::is.number(s), s < 1, s > 0)
  assertthat::assert_that(assertthat::is.count(sample_num))
  assertthat::assert_that(assertthat::is.count(distance_constraint))
  assertthat::assert_that(distance_constraint < window)
  assertthat::assert_that(assertthat::is.number(distance_parameter_convergence))
  if (!is.data.frame(genomic_coords)) {
    assertthat::is.readable(genomic_coords)
  }
  assertthat::assert_that(assertthat::is.count(max_sample_windows))

  grs <- generate_windows(window, genomic_coords)

  fData(cds)$chr <- gsub("chr", "", fData(cds)$chr)
  fData(cds)$bp1 <- as.numeric(as.character(fData(cds)$bp1))
  fData(cds)$bp2 <- as.numeric(as.character(fData(cds)$bp2))

  distance_parameters <- list()
  distance_parameters_calced <- 0
  it <- 0

  while(sample_num > distance_parameters_calced & it < max_sample_windows) {
    it <- it + 1
    win <- sample(seq_len(length(grs)), 1)
    GL <- "Error"
    win_range <- get_genomic_range(grs, cds, win)

    if (nrow(exprs(win_range))<=1) {
      next()
    }
    if (nrow(exprs(win_range)) > max_elements) {
      next()
    }

    dist_matrix <- calc_dist_matrix(win_range)

    distance_parameter <- find_distance_parameter(dist_matrix,
                        win_range,
                        maxit = maxit,
                        null_rho = 0,
                        s,
                        distance_constraint = distance_constraint,
                        distance_parameter_convergence =
                          distance_parameter_convergence)

    if (!is(distance_parameter, "numeric")) next()
    distance_parameters = c(distance_parameters, distance_parameter)
    distance_parameters_calced <- distance_parameters_calced + 1
  }

  if(length(distance_parameters) < sample_num)
    warning(paste0("Could not calculate sample_num distance_parameters (",
                   length(distance_parameters), " were calculated) - see ",
                   "documentation details"))
  if(length(distance_parameters) == 0)
    stop("No distance_parameters calculated")

  unlist(distance_parameters)
}


#' Generate cicero models
#'
#' Function to generate graphical lasso models on all sites in a CDS object
#' within overlapping genomic windows.
#'
#' @param cds A cicero CDS object generated using \code{\link{make_cicero_cds}}.
#' @param distance_parameter Distance based penalty parameter value. Generally,
#'   the mean of the calculated \code{distance_parameter} values from
#'   \code{\link{estimate_distance_parameter}}.
#' @param s Power law value. See details.
#' @param window Size of the genomic window to query, in base pairs.
#' @param max_elements Maximum number of elements per window allowed. Prevents
#'   very large models from slowing performance.
#' @param genomic_coords Either a data frame or a path (character) to a file
#'   with chromosome lengths. The file should have two columns, the first is
#'   the chromosome name (ex. "chr1") and the second is the chromosome length
#'   in base pairs. See \code{data(human.hg19.genome)} for an example. If a
#'   file, should be tab-separated and without header.
#'
#' @details The purpose of this function is to compute the raw covariances
#'   between each pair of sites within overlapping windows of the genome.
#'   Within each window, the function then estimates a regularized correlation
#'   matrix using the graphical LASSO (Friedman et al., 2008), penalizing pairs
#'   of distant sites more than proximal sites. The scaling parameter,
#'   \code{distance_parameter}, in combination with the power law value \code{s}
#'   determines the distance-based penalty.
#'
#'   The parameter \code{s} is a constant that captures the power-law
#'   distribution of contact frequencies between different locations in the
#'   genome as a function of their linear distance. For a complete discussion
#'   of the various polymer models of DNA packed into the nucleus and of
#'   justifiable values for s, we refer readers to (Dekker et al., 2013) for a
#'   discussion of justifiable values for s. We use a value of 0.75 by default
#'   in Cicero, which corresponds to the “tension globule” polymer model of DNA
#'   (Sanborn et al., 2015). This parameter must be the same as the s parameter
#'   for \code{\link{estimate_distance_parameter}}.
#'
#'   Further details are available in the publication that accompanies this
#'   package. Run \code{citation("cicero")} for publication details.
#'
#' @return A list of results for each window. Either a \code{glasso} object, or
#'   a character description of why the window was skipped. This list can be
#'   directly input into \code{\link{assemble_connections}} to create a
#'   reconciled list of cicero co-accessibility scores.
#' @examples
#'   data("cicero_data")
#'   data("human.hg19.genome")
#'   sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#'   sample_genome$V2[1] <- 100000
#'   input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#'   input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#'                                reduction_method = 'tSNE',
#'                                norm_method = "none")
#'   tsne_coords <- t(reducedDimA(input_cds))
#'   row.names(tsne_coords) <- row.names(pData(input_cds))
#'   cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#'   model_output <- generate_cicero_models(cicero_cds,
#'                                          distance_parameter = 0.3,
#'                                          genomic_coords = sample_genome)
#'
#' @references
#'   \itemize{
#'     \item Dekker, J., Marti-Renom, M.A., and Mirny, L.A. (2013). Exploring
#'     the three-dimensional organization of genomes: interpreting chromatin
#'     interaction data. Nat. Rev. Genet. 14, 390–403.
#'     \item Friedman, J., Hastie, T., and Tibshirani, R. (2008). Sparse
#'     inverse covariance estimation with the graphical lasso. Biostatistics 9,
#'     432–441.
#'     \item Sanborn, A.L., Rao, S.S.P., Huang, S.-C., Durand, N.C., Huntley,
#'     M.H., Jewett, A.I., Bochkov, I.D., Chinnappan, D., Cutkosky, A., Li, J.,
#'     et al. (2015). Chromatin extrusion explains key features of loop and
#'     domain formation in wild-type and engineered genomes. Proc. Natl. Acad.
#'     Sci. U. S. A. 112, E6456–E6465.
#'   }
#'
#' @seealso \code{\link{estimate_distance_parameter}}
#' @export
#'
generate_cicero_models <- function(cds,
                                   distance_parameter,
                                   s = 0.75,
                                   window = 500000,
                                   max_elements = 200,
                                   genomic_coords = cicero::human.hg19.genome) {

  assertthat::assert_that(is(cds, "CellDataSet"))
  assertthat::assert_that(assertthat::is.number(distance_parameter))
  assertthat::assert_that(assertthat::is.number(s), s < 1, s > 0)
  assertthat::assert_that(assertthat::is.number(window))
  assertthat::assert_that(assertthat::is.count(max_elements))
  if (!is.data.frame(genomic_coords)) {
    assertthat::is.readable(genomic_coords)
  }

  grs <- generate_windows(window, genomic_coords)

  fData(cds)$chr <- gsub("chr", "", fData(cds)$chr)
  fData(cds)$bp1 <- as.numeric(as.character(fData(cds)$bp1))
  fData(cds)$bp2 <- as.numeric(as.character(fData(cds)$bp2))

  outlist <- parallel::mclapply(seq_len(length(grs)), mc.cores = 1, function(win) {
    GL <- "Error"

    win_range <- get_genomic_range(grs, cds, win)

    if (nrow(exprs(win_range))<=1) {
      return("Zero or one element in range")
    }
    if (nrow(exprs(win_range)) > max_elements) {
      return("Too many elements in range")
    }

    dist_matrix <- calc_dist_matrix(win_range)

    rho_mat <- get_rho_mat(dist_matrix, distance_parameter, s)

    vals <- exprs(win_range)
    cov_mat <- cov(t(vals))
    diag(cov_mat) <- diag(cov_mat) + 1e-4

    GL <- glasso::glasso(cov_mat, rho_mat)
    colnames(GL$w) <- row.names(GL$w) <- row.names(vals)
    colnames(GL$wi) <- row.names(GL$wi) <- row.names(vals)
    return(GL)
  })
  names_df <- as.data.frame(grs)
  names(outlist) <- paste(names_df$seqnames,
                          names_df$start,
                          names_df$end, sep="_")

  #FIXME add warning about how many regions removed due to too many elements
  outlist
}

#' Combine and reconcile cicero models
#'
#' Function which takes the output of \code{\link{generate_cicero_models}} and
#' assembles the connections into a data frame with cicero co-accessibility
#' scores.
#'
#' This function combines glasso models computed on overlapping windows of the
#' genome. Pairs of sites whose regularized correlation was calculated twice
#' are first checked for qualitative concordance (both zero, positive or
#' negative). If they not concordant, NA is returned. If they are concordant
#' the mean is returned.
#'
#' @param cicero_model_list A list of cicero output objects, generally, the
#'   output of \code{\link{generate_cicero_models}}.
#' @param silent Logical, should the function run silently?
#'
#' @return A data frame of connections with their cicero co-accessibility
#'   scores.
#' @examples
#'   data("cicero_data")
#'   data("human.hg19.genome")
#'   sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#'   sample_genome$V2[1] <- 100000
#'   input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#'   input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#'                                reduction_method = 'tSNE',
#'                                norm_method = "none")
#'   tsne_coords <- t(reducedDimA(input_cds))
#'   row.names(tsne_coords) <- row.names(pData(input_cds))
#'   cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#'   model_output <- generate_cicero_models(cicero_cds,
#'                                          distance_parameter = 0.3,
#'                                          genomic_coords = sample_genome)
#'   cicero_cons <- assemble_connections(model_output)
#'
#' @seealso \code{\link{generate_cicero_models}}
#' @importFrom data.table melt.data.table
#' @export
assemble_connections <- function(cicero_model_list, silent = FALSE) {
  types <- vapply(cicero_model_list, FUN=class, FUN.VALUE="character")
  char_hbn <- cicero_model_list[types=="character"]
  gl_only <- cicero_model_list[types=="list"]
  if(!silent) {
    print(paste("Successful cicero models: ", length(gl_only)))
    print("Other models: ")
    print(table(unlist(char_hbn)))
    print(paste("Models with errors: ", sum(is.null(cicero_model_list))))
  }

  cors <- lapply(gl_only, function(gl)  {
    cors <- stats::cov2cor(gl$w)
    data.table::melt(as.data.table(cors, keep.rownames=TRUE), 
                     measure=patterns("[0-9]"))
  })

  cors <- data.table::rbindlist(cors)
  names(cors) <- c("Var1", "Var2", "value")
  data.table::setkey(cors, "Var1", "Var2")

  cors_rec <- as.data.frame(cors[,list(mean_coaccess = reconcile(value)),
                                 by="Var1,Var2"])

  names(cors_rec) <- c("Peak1", "Peak2", "coaccess")
  cors_rec <- cors_rec[cors_rec$Peak1 != cors_rec$Peak2,]
  return(cors_rec)
}

reconcile <- function(values) {
  if (length(values) == 1) return(values)
  if (sum(values >= 0) == length(values)) return(mean(values))
  if (sum(values <= 0) == length(values)) return(mean(values))
  if (sum(values == 0) == length(values)) return(0)
  return(NA_real_)
}

generate_windows <- function(window, genomic_coords) {
  if(!is(genomic_coords, "data.frame")) {
    chr_maxes <- read.table(genomic_coords)
  } else {
    chr_maxes <- genomic_coords
  }
  names(chr_maxes) <- c("V1", "V2")
  win_ranges <- plyr::ddply(chr_maxes, plyr::.(V1), function(x) {
    r <- seq(from = 1, to = x$V2[1], by = window/2)
    l <- r + window - 1
    data.frame(start = r, end = l)
  })
  gr <- GenomicRanges::GRanges(win_ranges$V1,
                               ranges=IRanges::IRanges(win_ranges$start,
                                                       win_ranges$end))
  return(gr)
}

get_genomic_range <- function(grs, cds, win) {
  end1 <- as.numeric(as.character(GenomicRanges::end(grs[win])))
  end2 <- as.numeric(as.character(GenomicRanges::start(grs[win])))
  win_range <- cds[(fData(cds)$bp1 < end1 &
                                fData(cds)$bp1 > end2) |
                               (fData(cds)$bp2 < end1 &
                                  fData(cds)$bp2 > end2), ]
  win_range <-
    win_range[as.character(fData(win_range)$chr) ==
                gsub("chr", "",
                     as.character(GenomicRanges::seqnames(grs[win]))),]
  fData(win_range)$mean_bp <-
    (as.numeric(as.character(fData(win_range)$bp1)) +
       as.numeric(as.character(fData(win_range)$bp2)))/2

  return(win_range)
}

find_distance_parameter <- function(dist_mat,
                       gene_range,
                       maxit,
                       null_rho,
                       s,
                       distance_constraint,
                       distance_parameter_convergence) {
  if (sum(dist_mat > distance_constraint)/2 < 1) {
    return("No long edges")
  }

  found <- FALSE
  starting_max <- 2
  distance_parameter <- 2
  distance_parameter_max <- 2
  distance_parameter_min <- 0
  it <- 0
  while(found != TRUE & it < maxit) {
    vals <- exprs(gene_range)
    cov_mat <- cov(t(vals))
    diag(cov_mat) <- diag(cov_mat) + 1e-4

    rho <- get_rho_mat(dist_mat, distance_parameter, s)

    GL <- glasso::glasso(cov_mat, rho)
    big_entries <- sum(dist_mat > distance_constraint)

    if (((sum(GL$wi[dist_mat > distance_constraint] != 0)/big_entries) > 0.05) |
        (sum(GL$wi == 0)/(nrow(GL$wi)^2) < 0.2 ) ) {
      longs_zero <- FALSE
    } else {
      longs_zero <- TRUE
    }

    if (longs_zero != TRUE | (distance_parameter == 0)) {
      distance_parameter_min <- distance_parameter
    } else {
      distance_parameter_max <- distance_parameter
    }
    new_distance_parameter <- (distance_parameter_min +
                                 distance_parameter_max)/2

    if(new_distance_parameter == starting_max) {
      new_distance_parameter <- 2 * starting_max
      starting_max <- new_distance_parameter
    }

    if (distance_parameter_convergence > abs(distance_parameter -
                                             new_distance_parameter)) {
      found <- TRUE
    } else {
      distance_parameter <- new_distance_parameter
    }
    it <- it + 1
  }
  if (maxit == it) warning("maximum iterations hit")
  return(distance_parameter)
}

get_rho_mat <- function(dist_matrix, distance_parameter, s) {
  xmin <- 1000
  out <- (1-(xmin/dist_matrix)^s) * distance_parameter
  out[!is.finite(out)] <- 0
  out[out < 0] <- 0
  return(out)
}

calc_dist_matrix <- function(gene_range) {
  dist_mat <- as.matrix(dist(fData(gene_range)$mean_bp))
  row.names(dist_mat) <- colnames(dist_mat) <- row.names(fData(gene_range))

  return(dist_mat)
}

make_ccan_graph <- function(connections_df, coaccess_cutoff) {
  connections_df <- as.data.frame(connections_df)
  #make graph
  cons_info_gr <- connections_df[!is.na(connections_df$coaccess) &
                                   connections_df$coaccess > coaccess_cutoff,]
  if(nrow(cons_info_gr) == 0) stop("No connections for graph")
  cons_graph <- make_sparse_matrix(cons_info_gr, x.name = "coaccess")
  site_graph <- igraph::graph.adjacency(cons_graph,
                                        mode = "undirected",
                                        weighted = TRUE)
  return(site_graph)
}

#' Generate cis-co-accessibility networks (CCANs)
#'
#' Post process cicero co-accessibility scores to extract modules of sites that
#' are co-accessible.
#'
#' @param connections_df Data frame of connections with columns: Peak1, Peak2,
#'   coaccess. Generally, the output of \code{\link{run_cicero}} or
#'   \code{\link{assemble_connections}}
#' @param coaccess_cutoff_override Numeric, co-accessibility score threshold to
#'   impose. Overrides automatic calculation.
#' @param tolerance_digits The number of digits to calculate cutoff to. Default
#'   is 2 (0.01 tolerance)
#'
#' @details CCANs are calculated by first specifying a minimum co-accessibility
#'   score and then using the Louvain community detection algorithm on the
#'   subgraph induced by excluding edges below this score. For this function,
#'   either the user can specify the minimum co-accessibility using
#'   \code{coaccess_cutoff_override}, or the cutoff can be calculated
#'   automatically by optimizing for CCAN number. The cutoff calculation can be
#'   slow, so users may wish to use the \code{coaccess_cutoff_override} after
#'   initially calculating the cutoff to speed future runs.
#'
#' @return Data frame with two columns - Peak and CCAN. CCAN column indicates
#'   CCAN assignment. Peaks not included in a CCAN are not returned.
#' @export
#'
#' @examples
#' \dontrun{
#'   data("cicero_data")
#'   set.seed(18)
#'   data("human.hg19.genome")
#'   sample_genome <- subset(human.hg19.genome, V1 == "chr18")
#'   sample_genome$V2[1] <- 100000
#'   input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
#'   input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
#'                                reduction_method = 'tSNE',
#'                                norm_method = "none")
#'   tsne_coords <- t(reducedDimA(input_cds))
#'   row.names(tsne_coords) <- row.names(pData(input_cds))
#'   cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
#'   cicero_cons <- run_cicero(cicero_cds, sample_genome, sample_num = 2)
#'   ccan_assigns <- generate_ccans(cicero_cons)
#'  }
#'
generate_ccans <- function(connections_df,
                           coaccess_cutoff_override = NULL,
                           tolerance_digits = 2) {
  assertthat::assert_that(is.data.frame(connections_df))
  assertthat::assert_that(assertthat::has_name(connections_df, "Peak1"),
                          assertthat::has_name(connections_df, "Peak2"),
                          assertthat::has_name(connections_df, "coaccess"))
  assertthat::assert_that(assertthat::is.number(tolerance_digits))
  assertthat::assert_that(assertthat::is.number(coaccess_cutoff_override) |
                            is.null(coaccess_cutoff_override),
                          msg = paste("coaccess_cutoff_override must be a",
                                      "number or NULL", collapse = " "))
  if (!is.null(coaccess_cutoff_override)) {
    assertthat::assert_that(coaccess_cutoff_override <= 1 &
                              coaccess_cutoff_override >= 0,
                            msg = paste("coaccess_cutoff_override must be",
                                        "between 0 and 1 (or NULL)",
                                        collapse = " "))
  }

  if (!is.null(coaccess_cutoff_override)) {
    coaccess_cutoff <- coaccess_cutoff_override
  } else {
    coaccess_cutoff <- find_ccan_cutoff(connections_df, tolerance_digits)
  }
  print(paste("Coaccessibility cutoff used:", coaccess_cutoff))
  ccan_graph <- make_ccan_graph(connections_df,
                                coaccess_cutoff = coaccess_cutoff)
  comp_membership <- igraph::cluster_louvain(ccan_graph)
  sizes <- igraph::sizes(comp_membership) > 2
  comps_list <- unlist(as.list(igraph::membership(comp_membership)))
  df <- data.frame(Peak = names(comps_list), CCAN = comps_list)
  df$CCAN[!df$CCAN %in% names(sizes[sizes])] <- NA
  df <- df[!is.na(df$CCAN),]
  return(df)
}

find_ccan_cutoff <- function(connection_df, tolerance_digits) {
  connection_df <- connection_df[connection_df$coaccess > 0,]
  tolerance <- 10^-(tolerance_digits)
  bottom <- 0
  top <- max(connection_df$coaccess, na.rm = TRUE)
  while ((top - bottom) > tolerance) {
    test_val <- bottom + round((top - bottom)/2, digits = tolerance_digits + 1)
    ccan_num_test <- number_of_ccans(connection_df, test_val)
    next_step <- test_val
    repeat{
      next_step <- next_step + (top - bottom)/10
      ccan_num_test2 <- number_of_ccans(connection_df, next_step)
      if(ccan_num_test2 != ccan_num_test){
        break
      }
    }
    if (ccan_num_test > ccan_num_test2) {
      top <- test_val
    } else {
      bottom <- test_val
    }
  }
  return(round((top + bottom)/2, digits = tolerance_digits))
}

number_of_ccans <- function(connections_df, coaccess_cutoff) {
  ccan_graph <- make_ccan_graph(connections_df,
                                coaccess_cutoff = coaccess_cutoff)
  comp_membership <- igraph::cluster_louvain(ccan_graph)
  return(sum(igraph::sizes(comp_membership) > 2))
}

#' Find CCANs that overlap each other in genomic coordinates
#'
#' @param ccan_assignments A data frame where the first column is the peak and
#'   the second is the CCAN assignment. For example, output of
#'   \code{generate_ccans}.
#' @param min_overlap The minimum base pair overlap to count as overlapping.
#'
#' @return A data frame with two columns, CCAN1 and CCAN2. CCANs in this list
#'   are overlapping. The data frame is reciprocal (if CCAN 2 overlaps CCAN 1,
#'   there will be two rows, 1,2 and 2,1).
#'
#' @examples
#'   ccan_df <- data.frame(peak = c("chr18_1408345_1408845", "chr18_1779830_1780330", 
#'                                  "chr18_1929095_1929595", "chr18_1954501_1954727",
#'                                  "chr18_2049865_2050884", "chr18_2083726_2084102",
#'                                  "chr18_2087935_2088622", "chr18_2104705_2105551",
#'                                  "chr18_2108641_2108907"), 
#'                         CCAN = c(1,2,2,2,3,3,3,3,2))
#'   olap_ccans <- find_overlapping_ccans(ccan_df)
#' 
#'
#' @export
find_overlapping_ccans <- function(ccan_assignments, min_overlap=1) {
  ccan_assignments <- ccan_assignments[,c(1,2)]
  names(ccan_assignments) <- c("Peak", "CCAN")
  ccans <- df_for_coords(ccan_assignments$Peak)
  ccans$CCAN <- ccan_assignments$CCAN
  ccan_info <- plyr::ddply(ccans, plyr::.(CCAN), function(ccan) {
    return(data.frame(ccan_coords = paste(ccan$chr[1], bp1 = min(ccan$bp1),
                                          bp2 = max(ccan$bp2), sep="_")))
  })

  ccan_ranges <- ranges_for_coords(ccan_info$ccan_coords,
                                   meta_data_df = ccan_info)
  ol <- GenomicRanges::findOverlaps(ccan_ranges, ccan_ranges,
                                    minoverlap=min_overlap, #maxgap = 0,
                                    select="all")
  olaps <- data.frame(
    CCAN1 = GenomicRanges::mcols(ccan_ranges[
      S4Vectors::queryHits(ol)])@listData$CCAN,
    CCAN2 = GenomicRanges::mcols(ccan_ranges[
      S4Vectors::subjectHits(ol)])@listData$CCAN)
  olaps <- olaps[!duplicated(olaps),]
  olaps <- olaps[olaps$CCAN1 != olaps$CCAN2, ]
  return(olaps)
}
cole-trapnell-lab/cicero-release documentation built on Sept. 4, 2024, 1:49 p.m.