R/pair_cells.R

Defines functions splitAndFetch GetPairList CreatePairedObject PairCells CoembedData

Documented in CoembedData CreatePairedObject GetPairList PairCells

#' Integrate data
#'
#' This function wraps a number of functions from the Seurat package.
#' It integrates and projects the single-cell multi-omics (e.g., scRNA-seq
#' and scATAC-seq) data into a common low-dimensional space via CCA approach.
#' The details can be found:
#' \url{https://satijalab.org/seurat/articles/seurat5_atacseq_integration_vignette}
#'
#' @param obj.rna A Seurat object including gene expression data
#' @param obj.atac A Seurat object including chromatin accessibility data
#' @param gene.activity A sparse matrix containing gene activity score per cell
#' @param reference.assay The reference assay from gene expression data
#' @param reduction The reduction name used for function FindTransferAnchors.
#' @param weight.reduction Dimensional reduction to use for the weighting anchors
#' @param dims Set of dimensions to use in the anchor weighting procedure.
#' @param verbose Print progress bars and output
#'
#' @return An integrated Seurat object
#' @export
#'
#' @examples
#' \dontrun{
#' obj.coembed <- CoembedData(
#'    obj.rna = obj.rna,
#'    obj.atac = obj.atac,
#'    gene.activity = gene.activity,
#'    weight.reduction = "harmony"
#' )
#'
#' }
CoembedData <-
  function(obj.rna,
           obj.atac,
           gene.activity,
           reference.assay = "RNA",
           reduction = "cca",
           weight.reduction = NULL,
           verbose = TRUE,
           dims = 1:30) {
    ## make sure there are the same number of cells in atac and gene activity data
    if (ncol(obj.atac) != ncol(gene.activity)) {
      stop("The number of cells in ATAC-seq and Gene activity are not the same!")
    }

    if (is.null(weight.reduction)) {
      stop("Please specify dimensional reduction to use for the weighting anchors!")
    }

    gene.use <- intersect(rownames(gene.activity),
                          rownames(obj.rna))

    n_genes <- length(gene.use)
    message(glue::glue("Find {n_genes} common genes between ATAC and RNA"))

    message("Subset ATAC and RNA data")
    obj.atac[['GeneActivity']] <-
      CreateAssayObject(counts = gene.activity[gene.use,])
    DefaultAssay(obj.atac) <- "GeneActivity"

    obj.rna <- subset(obj.rna, features = gene.use)

    message("Normalize gene activity score for ATAC data")
    obj.atac <- obj.atac %>%
      NormalizeData(verbose = verbose) %>%
      ScaleData(verbose = verbose)

    message("Performing data integration using Seurat...")
    transfer.anchors <- FindTransferAnchors(
      reference = obj.rna,
      query = obj.atac,
      features = gene.use,
      reference.assay = reference.assay,
      query.assay = "GeneActivity",
      reduction = reduction,
      verbose = verbose
    )

    # we here restrict the imputation to the selected genes
    refdata <- LayerData(obj.rna, assay = reference.assay, layer = "data")

    # refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells.
    # imputation (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
    obj.atac[["RNA"]] <- TransferData(
      anchorset = transfer.anchors,
      refdata = refdata,
      weight.reduction = obj.atac[[weight.reduction]],
      dims = dims
    )

    DefaultAssay(obj.atac) <- "RNA"

    obj.rna$tech <- "RNA"
    obj.atac$tech <- "ATAC"

    # merge the objects
    coembed <- merge(x = obj.atac, y = obj.rna)
    coembed <- JoinLayers(coembed)

    # Finally, we run PCA and UMAP on this combined object, 
    # to visualize the co-embedding of both datasets
    message("Coemebdding the data...")
    coembed <- coembed %>%
      ScaleData(do.scale = FALSE) %>%
      RunPCA(verbose = verbose) %>%
      RunUMAP(dims = 1:30, verbose = verbose)

    return(coembed)
  }


#' Pair the cells
#'
#' This function will create the pairs of cells between different modalities based on
#' the integrated data generated by \code{\link{CoembedData}}.
#' It was modified from \url{https://github.com/buenrostrolab/stimATAC_analyses_code/blob/master/R/optMatching_functions.R}
#' to use Seurat object as input. \cr
#' Author: Vinay Kartha, Yan Hu \cr
#' Contact: <vinay_kartha@g.harvard.edu> \cr
#' Affiliation: Buenrostro Lab, Department of Stem Cell and Regenerative Biology, Harvard University \cr
#'
#' @param object The integrated Seurat object generated by the function CoembedData
#' @param reduction Dimensional reduction to use for the pairing cells
#' @param pair.by Name of one metadata column to split the object for pairing;
#' @param ident1 Specify how to split the object, must be an value of pair.by
#' @param ident2 Specify how to split the object, must be an value of pair.by
#' @param assay Assay name based on which a KNN graph is constructed
#' @param pair.mode Pair mode. Available options are: "greedy" and "geodesic".
#' @param tol Tol times the number of subjects to be matched specifies the extent to which fullmatch's
#' @param search.range This determines the size of the search knn. search_range * total number of cells = size of knn.
#' @param max.multimatch Maximum number of cells allowed to be matched to each cell
#' @param min_subgraph_size Minimum number of cells (ATAC/RNA each) needed for
#' pairing in a given subgraph. Will skip subgraphs with fewer than these cells
#' @param seed Random seed
#' @param k k-NN parameter used for applying constraints on ATAC-RNA pairs
#' @return A data frame containing the cell pairs
#' @export
#'
#' @examples
#' \dontrun{
#' df.pair <- PairCells(
#'   object = obj.coembed,
#'   reduction = "harmony",
#'   pair.by = "tech",
#'   ident1 = "ATAC",
#'   ident2 = "RNA"
#'   )
#' }
PairCells <- function(object,
                       reduction = NULL,
                       pair.by = NULL,
                       ident1 = "ATAC",
                       ident2 = "RNA",
                       assay = "RNA",
                       pair.mode = "greedy",
                       tol = 0.0001,
                       search.range = 0.2,
                       max.multimatch = 5,
                       min_subgraph_size = 50,
                       seed = 42,
                       k = 300) {
  if (is.null(reduction)) {
    stop("Please specify dimensional reduction to use for the pairing cells!")
  }

  if (is.null(pair.by)) {
    stop("Please specify how to split the data for pairing!")
  }

  message("Getting dimensional reduction data for pairing cells...")
  obj.1 <- object[, object@meta.data[[pair.by]] == ident1]
  obj.2 <- object[, object@meta.data[[pair.by]] == ident2]

  embedding.atac <-
    Seurat::Embeddings(object = obj.1, reduction = reduction)
  embedding.rna <-
    Seurat::Embeddings(object = obj.2, reduction = reduction)

  embedding <- rbind(embedding.atac, embedding.rna)
  n.cells <- dim(embedding)[1]

  n.cells.atac <- dim(embedding.atac)[1]
  n.cells.rna <- dim(embedding.rna)[1]

  message(glue::glue("Number of ATAC cells: {n.cells.atac}, number of RNA cells: {n.cells.rna}"))

  # release memory
  rm(obj.1)
  rm(obj.2)
  gc()

  message(glue::glue("Pairing cells using {pair.mode} mode..."))

  if (pair.mode == "geodesic") {
    message("Constructing KNN graph for computing geodesic distance ..")
    optmatch::setMaxProblemSize(size = Inf)

    object <-
      Seurat::FindNeighbors(object,
                            reduction = reduction,
                            assay = assay,
                            verbose = FALSE)

    adjmatrix <- object@graphs$RNA_nn
    diag(adjmatrix) <- 0
    knn.graph <-
      igraph::graph_from_adjacency_matrix(adjmatrix = adjmatrix,
                                          mode = "max",
                                          weighted = NULL)

    message("Computing graph-based geodesic distance ..")
    # Compute shortest paths between all cell pairs.
    # We checked that this returns a symmetric matrix
    shortest.paths <- igraph::shortest.paths(knn.graph)

    # Find connected subgraphs
    sub.graphs <- igraph::clusters(knn.graph)
    message(
      glue::glue(
        "# KNN subgraphs detected: {length(unique(sub.graphs$membership))}"
      )
    )

    object$subgraph <- sub.graphs$membership

    all.pairs <- NULL

    message("Skipping subgraphs with either ATAC/RNA cells fewer than: ",
            min_subgraph_size)

    # Go through each subgraph
    for (sub.graph.idx in unique(sub.graphs$membership)) {
      message("Pairing cells for subgraph No.", sub.graph.idx)

      # Retrieve the subgraph
      sub.graph.nodes <- sub.graphs$membership == sub.graph.idx
      knn.sub.graph <-
        igraph::induced_subgraph(knn.graph, which(sub.graph.nodes))

      # Use down-sampling to make sure in this subgraph the number of ATAC and RNA cells are balanced
      subgraph_cells <- colnames(object)[sub.graph.nodes]
      n_ATAC <-
        sum(subgraph_cells %in% rownames(embedding.atac))
      n_RNA <- sum(subgraph_cells %in% rownames(embedding.rna))

      message("Total ATAC cells in subgraph: ", n_ATAC)
      message("Total RNA cells in subgraph: ", n_RNA)

      embedding.atac.sub <-
        embedding.atac[sub.graph.nodes[1:dim(embedding.atac)[1]], ]
      embedding.rna.sub <-
        embedding.rna[sub.graph.nodes[(dim(embedding.atac)[1] + 1):n.cells],]

      if (n_ATAC > n_RNA) {
        set.seed(seed)
        embedding.atac.sub <-
          embedding.atac.sub[sample(1:n_ATAC, n_RNA, replace = FALSE), ]
      } else if (n_ATAC < n_RNA) {
        set.seed(seed)
        embedding.rna.sub <-
          embedding.rna.sub[sample(1:n_RNA, n_ATAC, replace = FALSE), ]
      }

      if (is.null(nrow(embedding.atac.sub)) |
          is.null(nrow(embedding.rna.sub))) {
        message("Down-sampling within subgraph between assays led to 0 cells in one assay..")
        message("Skipping current subgraph")
        next
      }

      # Subset the original geodesic distance matrix to get the geodesic distance matrix for the subgraph
      subgraph_geodist <-
        shortest.paths[match(rownames(embedding.atac.sub), rownames(embedding)),
                       match(rownames(embedding.rna.sub), rownames(embedding))]
      subgraph_size <- dim(subgraph_geodist)[1]

      message("Subgraph size: ", subgraph_size)

      # TO AVOID MAJOR SUBGRAPH(S) WERE BEING SKIPPED SOMETIMES
      size_threshold <-
        ceiling((nrow(embedding.atac.sub) + nrow(embedding.rna.sub)) * search.range)
      k_pairing <- size_threshold

      message("Search threshold being used: ", k_pairing)

      if (subgraph_size < size_threshold |
          subgraph_size < min_subgraph_size) {
        message("Insufficient number of cells in subgraph. Skipping current subgraph")
        next
      }

      # We also calculate euclidean distance matrix
      subgraph_eucdist <-
        pracma::distmat(embedding.atac.sub, embedding.rna.sub)

      # Find KNN based on geodesic distances.
      message("Constructing KNN based on geodesic distance to reduce search pairing search space")
      geodist_knn <- array(-1, dim = dim(subgraph_geodist))

      for (i in 1:subgraph_size) {
        # Find RNA cells in the KNN of each ATAC cell
        geodist_threshold <- sort(subgraph_geodist[i,])[k_pairing]
        knn_ind <- subgraph_geodist[i, ] < geodist_threshold
        geodist_knn[i, knn_ind] <- subgraph_eucdist[i, knn_ind]

        # Find ATAC cells in the KNN of each RNA cell
        geodist_threshold <- sort(subgraph_geodist[, i])[k_pairing]
        knn_ind <- subgraph_geodist[, i] < geodist_threshold
        geodist_knn[knn_ind, i] <- subgraph_eucdist[knn_ind, i]

      }

      # For an ATAC-RNA cell pair, if neither of them are in the KNN of the other, we set their distance to be inf
      geodist_knn[geodist_knn < 0] <- Inf

      # Add rownames to the matrix
      rownames(geodist_knn) <- paste0("ATAC_", 1:subgraph_size)
      colnames(geodist_knn) <- paste0("RNA_", 1:subgraph_size)

      message(paste(
        "Number of cells being paired:",
        dim(geodist_knn)[1],
        "ATAC and",
        dim(geodist_knn)[1],
        " RNA cells"
      ))

      message("Determing pairs through optimized bipartite matching ..")
      cell_matches <-
        suppressWarnings(
          optmatch::fullmatch(
            optmatch::as.InfinitySparseMatrix(as.matrix(geodist_knn)),
            tol = tol,
            min.controls = 1 / max.multimatch,
            max.controls = max.multimatch
          )
        )

      pair.list <- GetPairList(cell_matches,
                               rownames(embedding.atac.sub),
                               rownames(embedding.rna.sub))

      message("Finished!\n")

      # Append the results for this subgraph to the list of all results
      all.pairs <- rbind(all.pairs, pair.list)
    }
  } else if(pair.mode == "greedy"){
    n_ATAC <- dim(embedding.atac)[1]
    n_RNA <- dim(embedding.rna)[1]

    if (n_ATAC >= n_RNA){
      print("Pairing all RNA cells to nearest ATAC cells")

      ATAC_paired <- vector("character", length = n_RNA)

      for(i in 1:n_RNA){
        query <- t(as.matrix(embedding.rna[i, ]))
        pair_knn <- FNN::get.knnx(data = embedding.atac,
                                  query = query,
                                  k = 1)
        ATAC_paired[i] <- rownames(embedding.atac)[pair_knn$nn.index]
        embedding.atac <- embedding.atac[-pair_knn$nn.index, ]

      }
      RNA_paired <- rownames(embedding.rna)


    } else{
      print("Pairing all ATAC cells to nearest RNA cells")

      RNA_paired <- vector("character", length = n_ATAC)

      for(i in 1:n_ATAC){
        query <- t(as.matrix(embedding.atac[i, ]))
        pair_knn <- FNN::get.knnx(data = embedding.rna,
                                  query = query,
                                  k = 1)
        RNA_paired[i] <- rownames(embedding.rna)[pair_knn$nn.index]
        embedding.rna <- embedding.rna[-pair_knn$nn.index, ]
      }

      ATAC_paired <- rownames(embedding.atac)
    }

    message("Finished!\n")

    all.pairs <- data.frame(ATAC=ATAC_paired, RNA=RNA_paired, stringsAsFactors=FALSE)

  }
  all.pairs$cell_name <- paste0("cell_", 1:nrow(all.pairs))

  return(all.pairs)
}

#' Create paired object
#'
#' This function will create a Seurat object including single-cell multimodal
#' data based on the paired cells.
#'
#' @param df.pair A data frame containing the cell pairing results
#' generated by the function \code{\link{PairCells}}.
#' @param object Seurat object generated by the function \code{\link{CoembedData}}
#' @param use.assay1 A string indicating the first assay
#' @param use.assay2 A string indicating the first assay
#' @param sep Separators to use for strings encoding genomic coordinates.
#' First element is used to separate the chromosome from the coordinates,
#' second element is used to separate the start from end coordinate.
#'
#' @return A Seurat object
#' @export
#'
#' @examples
#' \dontrun{
#' obj <- CreatePairedObject(
#'    df.pair = df.pair,
#'    object = obj.coembed,
#'    use.assay1 = "RNA",
#'    use.assay2 = "ATAC",
#' )
#' }
CreatePairedObject <- function(df.pair,
                                obj.coembed = NULL,
                                obj.rna = NULL,
                                obj.atac = NULL,
                                rna.assay = NULL,
                                atac.assay = NULL,
                                sep = c("-", "-")) {
  message("Merging objects...")
  rna.counts <-
    LayerData(obj.rna, assay = rna.assay, layer = "counts", cells = df.pair$RNA)
  atac.counts <-
    LayerData(obj.atac, assay = atac.assay, layer = "counts", cells = df.pair$ATAC)

  colnames(rna.counts) <- df.pair$cell_name
  colnames(atac.counts) <- df.pair$cell_name

  # create a Seurat object containing the RNA adata
  obj.pair <- CreateSeuratObject(counts = rna.counts,
                                 assay = rna.assay)

  # create ATAC assay and add it to the object
  obj.pair[[atac.assay]] <-
    CreateChromatinAssay(counts = atac.counts,
                         sep = sep,
                         min.cells = 10)

  for (reduction in names(obj.coembed@reductions)) {
    embedding <- Embeddings(
      obj.coembed, 
      reduction = reduction)[df.pair$RNA, ]

    rownames(embedding) <- df.pair$cell_name

    obj.pair[[reduction]] <- CreateDimReducObject(
      embeddings = embedding,
      assay = DefaultAssay(obj.pair))
  }

  # add metadata, here we use the metadata from RNA assay
  meta.data <- obj.coembed@meta.data[df.pair$RNA,]
  rownames(meta.data) <- df.pair$cell_name

  obj.pair <- AddMetaData(obj.pair, metadata = meta.data)

  return(obj.pair)

}


#' Get list of paired cells
#'
#' This function takes in the output of the \code{\link{fullmatch}} function and sort the
#' results into a list of ATAC-RNA pairs.
#' It was modified from \url{https://github.com/buenrostrolab/stimATAC_analyses_code/blob/master/R/optMatching_functions.R} \cr
#' Author: Vinay Kartha, Yan Hu \cr
#' Contact: <vinay_kartha@g.harvard.edu> \cr
#' Affiliation: Buenrostro Lab, Department of Stem Cell and Regenerative Biology, Harvard University \cr
#'
#' @param cell_matches The output object of \code{\link{fullmatch}} in the package optmatch
#' @param ATAC_barcodes  Barcode of ATAC cells, must match the order of IDs in cell_matches.
#' @param RNA_barcodes Barcode of RNA cells, must match the order of IDs in cell_matches.
#'
#' @export
#'
#' @return A data frame containing the cell pairs
GetPairList <- function(cell_matches,
                        ATAC_barcodes,
                        RNA_barcodes)
{
  cell_matches <- sort(cell_matches) # Sort to get ATAC, RNA tuples

  if (length(cell_matches) == 0) {
    stop(
      "Matches could not be found .. Perhaps try adjusting the constraints to allow optimal matching to be solved?\n"
    )
  }


  if (any(is.na(cell_matches))) {
    warning("NA pairs exist ..\n")
  }

  # Currently the result list contain cells paired to multiple other cells
  # If a cell is paired to k cells, we duplicate this cell k times to make k pairs
  # Thus we generate a new list consisting pairs of 1 ATAC - 1 RNA cells
  # We also make sure that in a pair, the first ID is ATAC and the second ID is RNA
  matches <- character()
  pair_ids <- unique(unname(cell_matches))
  for (pair_id in 1:length(pair_ids)) {
    new_match <-
      names(cell_matches[unname(cell_matches) == pair_ids[pair_id]])
    new_match_ATAC <-
      new_match[splitAndFetch(new_match, "_", 1) == "ATAC"]
    new_match_RNA <-
      new_match[splitAndFetch(new_match, "_", 1) == "RNA"]
    new_match <- vector()
    if (length(new_match_ATAC) > length(new_match_RNA)) {
      new_match[seq(1, 2 * length(new_match_ATAC), 2)] <- new_match_ATAC
      new_match[seq(2, 2 * length(new_match_ATAC), 2)] <-
        rep(new_match_RNA, length(new_match_ATAC))
    } else {
      new_match[seq(1, 2 * length(new_match_RNA), 2)] <-
        rep(new_match_ATAC, length(new_match_RNA))
      new_match[seq(2, 2 * length(new_match_RNA), 2)] <-
        new_match_RNA
    }
    matches <- c(matches, new_match)
  }

  # Make sure pair groupings (factors) are adjacent
  #stopifnot(all.equal(cell_matches[seq(1,length(cell_matches),2)],cell_matches[seq(2,length(cell_matches),2)],check.attributes=FALSE))

  ATAC_IDs <-
    matches[seq(1, length(matches), 2)] # Names of ATAC cells
  RNA_IDs <-
    matches[seq(2, length(matches), 2)] # Names of RNA cells

  # Checking if ATAC and RNA tuples are concordant
  stopifnot(all(splitAndFetch(ATAC_IDs, "_", 1) %in% "ATAC"))
  stopifnot(all(splitAndFetch(RNA_IDs, "_", 1) %in% "RNA"))

  # This is just to make sure 1-1, with the names we gave
  # (can still comprise actual doublets from upsampling if any)
  # stopifnot(all(unique(unique(ATACp))) & all(unique(RNAp)))

  # Get corresponding index relative to input matrix order
  ATAC_inds <- as.numeric(splitAndFetch(ATAC_IDs, "_", 2))
  RNA_inds <- as.numeric(splitAndFetch(RNA_IDs, "_", 2))

  matches_mat <-
    matrix(c(ATAC_inds, RNA_inds), ncol = 2, byrow = FALSE) # ATAC col1, RNA col2

  message("Assembling pair list ..")
  # Make data frame of matches

  pair_df <- data.frame("ATAC" = ATAC_inds,
                        "RNA" = RNA_inds)

  pair_df <- pair_df %>% arrange(ATAC)

  # Convert to labels if they exist
  pair_df$ATAC <-
    ATAC_barcodes[pair_df$ATAC] # ATAC cell labels
  pair_df$RNA <-
    RNA_barcodes[pair_df$RNA] # RNA cell labels

  pair_df
}

splitAndFetch <- function(vec,
                          delim,
                          part) {
  if (length(part) == 1) {
    sapply(strsplit(as.character(vec), delim, fixed = TRUE), "[[", part)
  } else {
    sapply(strsplit(as.character(vec), delim, fixed = TRUE), function(x)
      paste(x[part], collapse = delim))
  }
}
CostaLab/scMEGA documentation built on Sept. 25, 2024, 6:11 a.m.