R/DataParsingFunctions.R

Defines functions loadSampleData downloadSampleData findMarkers tuneFilterWeights tuneEdgeWeights walkCells computeLabelEdges mapCiceroToGenes mapArchRToGenes mapSnapATACToGenes mapPeaksToGenes getRegions computeCellSim

Documented in computeCellSim computeLabelEdges downloadSampleData findMarkers getRegions loadSampleData mapArchRToGenes mapCiceroToGenes mapPeaksToGenes mapSnapATACToGenes tuneEdgeWeights tuneFilterWeights walkCells

#' Compute Cell Similarity
#'
#' \code{computeCellSim} computes a cell-to-cell similarity matrix using the given method.
#' Currently only Jaccard similarity is implemented.
#'
#' @param ATACMat either a cell-by-peak matrix, a SnapATAC object, an ArchR project, or a Cicero count table
#' @param method function, the method used to compute cell similarity, should operate on rows of a matrix. The default
#' is Jaccard similiarty, but a second built in option is PCAdist which computes the euclidean distance in PCA space
#' @return matrix of cell-to-cell similarity
#' @export
#' @examples
#' data("SampleCellWalkRData")
#' computeCellSim(SampleCellWalkRData$ATACMat)
#'
computeCellSim = function(ATACMat, method=sparseJaccard){
  if(missing(ATACMat)){
    stop("Must provide either a cell-by-peak matrix or a SnapATAC object")
  }
  if(!is(method, "function")){
    stop("Must provide a similarity function")
  }

  if(is(ATACMat,"snap")){
    if(!requireNamespace("SnapATAC", quietly = TRUE)){
      stop("Must install SnapATAC")
    }
    if(dim(ATACMat@bmat)[1]==0){
      stop("No bmat")
    }
    ATACMat = SnapATAC::makeBinary(ATACMat, mat="bmat")
    method(ATACMat@bmat)
  }
  else if(is(ATACMat,"ArchRProject")){
    if(!requireNamespace("ArchR", quietly = TRUE)){
      stop("Must install ArchR")
    }
    if(!"TileMatrix" %in% ArchR::getAvailableMatrices(ATACMat)){
      stop("TileMatrix missing from ArchR project")
    }
    ATACData = ArchR::getMatrixFromProject(ATACMat, "TileMatrix", verbose=FALSE, binarize = TRUE)
    ATACMat = Matrix::t(assay(ATACData))
    method(ATACMat)
  }
  else if(is(ATACMat,"Matrix")|is(ATACMat,"matrix")){
    if(min(dim(ATACMat))==0){
      stop("Not enough cells or peaks in matrix")
    }
    if(dim(ATACMat)[1]>dim(ATACMat)[2]){
      warning("ATACMat has more cells than peaks, does it need to be transposed?")
    }
    method(ATACMat>0)
  }
  else if(is(ATACMat, "data.frame")){
    peaks = unique(ATACMat[,1])
    cells = unique(ATACMat[,2])
    rowIndex = match(ATACMat[,2], cells)
    colIndex = match(ATACMat[,1], peaks)
    ATACMat = Matrix::sparseMatrix(rowIndex, colIndex, x=ATACMat[,3])
    simMat = method(ATACMat>0)
    dimnames(simMat) = list(cells,cells)
    simMat
  }
  else{
    stop("Must provide either a cell-by-peak matrix, SnapATAC object, ArchR project, or Cicero count table")
  }
}

#' Get genomic regions
#'
#' \code{getRegions} returns a GRanges object of how gene name identifiers map to
#' genomic regions either based on gene promoters or the full gene body
#'
#' @param geneBody boolean, use gene body or just promoter
#' @param genome which genome build (hg38, mm10)
#' @param names which gene names to use (Entrez, Ensembl)
#' @return GRanges list of regions with gene_id
#' @export
#' @examples
#' getRegions()
#'
getRegions = function(geneBody=TRUE, genome="hg38", names="Entrez"){
  if(genome=="hg38"){
    if(names=="Ensembl"){
      stop("Not an implemented gene name identifier")
    }
    else if(names=="Entrez"){
      pathToGenes <- system.file("extdata", "Hg38_knownGene.rds", package = "CellWalkR")
      genes <- readRDS(pathToGenes)
    }
    else{
      stop("Not a valid gene name identifier")
    }
  }
  else if(genome=="mm10"){
    if(names=="Ensembl"){
      pathToGenes <- system.file("extdata", "Mm10_ensGene.rds", package = "CellWalkR")
      genes <- readRDS(pathToGenes)
    }
    else if(names=="Entrez"){
      pathToGenes <- system.file("extdata", "Mm10_knownGene.rds", package = "CellWalkR")
      genes <- readRDS(pathToGenes)
    }
    else{
      stop("Not a valid gene name identifier")
    }
  }
  else{
    stop("Not an implemented genome build")
  }

  regions = GenomicRanges::promoters(genes)
  if(geneBody){
    regions = c(regions, genes)
  }

  regions
}


#' Map peaks to genes
#'
#' \code{mapPeaksToGenes} Generates a mapping from genes to peaks per label
#'
#' @param labelGenes data.frame with genes of interest in first column and corresponding
#' labels in second column
#' @param ATACMat cell-by-peak matrix
#' @param peaks GRanges object of peaks (equal to number of rows in ATACMat)
#' @param regions GRanges object of genomic regions associated with genes
#' @return list of genes and corresponding peaks by label
#' @export
#' @examples
#' data("SampleCellWalkRData")
#' regions <- getRegions()
#' mapPeaksToGenes(SampleCellWalkRData$labelGenes,
#'                 SampleCellWalkRData$ATACMat,
#'                 SampleCellWalkRData$peaks,
#'                 regions)
#'
mapPeaksToGenes = function(labelGenes, ATACMat, peaks, regions){
  if(missing(labelGenes)){
    stop("Must provide a table with genes of interest in first column and corresponding labels in second column")
  }
  if(is.null(dim(labelGenes)) || dim(labelGenes)[2]<2){
    stop("Must provide a table with genes of interest in first column and corresponding labels in second column")
  }
  labels = as.character(unique(labelGenes[,2]))
  if(length(labels)==1){
    warning("Only one label exists")
  }
  if(missing(ATACMat)){
    stop("Must provide either a cell-by-peak matrix or a SnapATAC object")
  }
  if(is(ATACMat,"snap")){
    if(!requireNamespace("SnapATAC", quietly = TRUE)){
      stop("Must install SnapATAC")
    }
    if(dim(ATACMat@gmat)[1]==0){
      stop("No gmat, call SnapATAC function createGmatFromMat first")
    }
    whichGenes = match(unique(labelGenes[,1]),colnames(ATACMat@gmat))
    if(length(whichGenes)<10){
       warning("Fewer than 10 gene names match")
    }
    stop("No need to map to genes, use gmat")
  }
  else{
    if(missing(regions)){
      stop("Must provide a GRanges object of genomic regions associatated with genes")
    }
    if(!is(regions,"GRanges")){
      stop("regions must be a GRanges object")
    }
    if(length(regions)==0){
      stop("regions must contain at least one genomic range")
    }
    if(length(regions$gene_id)!=length(regions)){
      stop("Each region must have an associated gene_id")
    }
    whichGenes = match(unique(labelGenes[,1]),regions$gene_id)
    if(length(whichGenes)<10){
      warning("Fewer than 10 gene names match")
    }
    if(missing(peaks)){
      stop("Must provide a GRanges object of peaks")
    }
    if(!is(peaks,"GRanges")){
      stop("peaks must be a GRanges object")
    }
    if(!is(ATACMat,"Matrix") & !is(ATACMat,"matrix")){
      stop("Must provide either a cell-by-peak matrix or a SnapATAC object")
    }
    if(min(dim(ATACMat))==0){
      stop("Not enough cells or peaks in matrix")
    }
    if(dim(ATACMat)[1]>dim(ATACMat)[2]){
      warning("ATACMat has more cells than peaks, does it need to be transposed?")
    }
    if(!length(peaks)==dim(ATACMat)[2]){
      stop("Number of peaks must be the same as the number of columns in ATACMat")
    }
    cellTypePeakMap = sapply(labels, function(x) {
      markerGenes = labelGenes[labelGenes[,2]==x,1]
      markerGenes = markerGenes[markerGenes %in% regions$gene_id]
      markerLocs = regions[match(markerGenes, regions$gene_id)]
      markerOverlaps = GenomicRanges::findOverlaps(peaks,markerLocs)
      list(peak=markerOverlaps@from,gene=markerGenes[(markerOverlaps@to-1)%%length(markerGenes)+1])
    })
  }

}


#' Map SnapATAC peaks to genes
#'
#' \code{mapSnapATACToGenes} Generates a mapping from genes to peaks per label
#'
#' @param labelGenes data.frame with genes of interest in first column and corresponding
#' labels in second column
#' @param snap a snap object
#' @param whichMat string determing whether to use "bmat" or "gmat" from snap object
#' @param regions GRanges object of genomic regions associated with genes (only needed for "bmat")
#' @return list of genes and corresponding peaks by label
#' @export
#'
mapSnapATACToGenes = function(labelGenes, snap, whichMat = "bmat", regions){
  if(missing(snap)){
    stop("Must provide a SnapATAC object")
  }
  if(!is(snap,"snap")){
    stop("Must provide a SnapATAC object")
  }
  if(!requireNamespace("SnapATAC", quietly = TRUE)){
    stop("Must install SnapATAC")
  }
  if(whichMat=="bmat"){
    mapPeaksToGenes(labelGenes, ATACMat = snap@bmat, peaks=snap@feature, regions)
  }
  else if(whichMat=="gmat"){
    whichGenes = match(unique(labelGenes[,1]),colnames(snap@gmat))
    if(length(which(!is.na(whichGenes)))<10){
      warning("Fewer than 10 gene names match")
    }
    sapply(as.character(unique(labelGenes[,2])), function(x) {
      markerGenes = labelGenes[labelGenes[,2]==x,1]
      markerGenes = markerGenes[markerGenes %in% colnames(snap@gmat)]
      list(peak=match(markerGenes,colnames(snap@gmat)),
           gene=markerGenes)
    })
  }
  else{
    stop("Must use bmat or gmat")
  }
}

#' Map ArchR peaks to genes
#'
#' \code{mapArchRToGenes} Generates a mapping from genes to peaks per label
#'
#' @param labelGenes data.frame with genes of interest in first column and corresponding
#' labels in second column
#' @param ArchRproj an ArchR project
#' @param whichMat string determing whether to use "TileMatrix" or "GeneScoreMatrix" from ArchR project
#' @param regions GRanges object of genomic regions associated with genes (only needed for "TileMatrix")
#' @return list of genes and corresponding peaks by label
#' @export
#'
mapArchRToGenes = function(labelGenes, ArchRproj, whichMat = "TileMatrix", regions){
  if(missing(ArchRproj)){
    stop("Must provide an ArchR project")
  }
  if(!is(ArchRproj,"ArchRProject")){
    stop("Must provide an ArchR project")
  }
  if(!requireNamespace("ArchR", quietly = TRUE)){
    stop("Must install ArchR")
  }
  if(whichMat=="TileMatrix"){
    if(!"TileMatrix" %in% ArchR::getAvailableMatrices(ArchRproj)){
      stop("TileMatrix missing from ArchR project")
    }
    ATACData = ArchR::getMatrixFromProject(ArchRproj, "TileMatrix", verbose=FALSE, binarize = TRUE)
    peaks = GenomicRanges::GRanges(ATACData@elementMetadata$seqnames, IRanges(start = ATACData@elementMetadata$start, width = 500))
    ATACMat = Matrix::t(assay(ATACData))
    mapPeaksToGenes(labelGenes, ATACMat = ATACMat, peaks=peaks, regions)
  }
  else if(whichMat=="GeneScoreMatrix"){
    if(!"GeneScoreMatrix" %in% ArchR::getAvailableMatrices(ArchRproj)){
      stop("GeneScoreMatrix missing from ArchR project")
    }
    ATACData = ArchR::getMatrixFromProject(ArchRproj, "GeneScoreMatrix", verbose=FALSE)
    whichGenes = match(unique(labelGenes[,1]),rowData(ATACData)$name)
    if(length(which(!is.na(whichGenes)))<10){
      warning("Fewer than 10 gene names match")
    }
    sapply(as.character(unique(labelGenes[,2])), function(x) {
      markerGenes = labelGenes[labelGenes[,2]==x,1]
      markerGenes = markerGenes[markerGenes %in% rowData(ATACData)$name]
      list(peak=match(markerGenes,rowData(ATACData)$name),
           gene=markerGenes)
    })
  }
  else{
    stop("Must use TileMatrix or GeneScoreMatrix")
  }
}

#' Map peaks to genes based on Cicero coaccessibility
#'
#' \code{mapCiceroToGenes} Generates a mapping from genes to peaks per label
#'
#' @param labelGenes data.frame with genes of interest in first column and corresponding
#' labels in second column
#' @param cicero_gene_activities gene activity matrix
#' @return list of genes and corresponding peaks by label
#' @export
#'
mapCiceroToGenes = function(labelGenes, cicero_gene_activities){
  if(missing(cicero_gene_activities)){
    stop("Must provide a gene activity matrix")
  }
  if(!is(cicero_gene_activities,"Matrix")){
    stop("Must provide a gene activity matrix")
  }
  whichGenes = match(unique(labelGenes[,1]),rownames(cicero_gene_activities))
  if(length(which(!is.na(whichGenes)))<10){
    warning("Fewer than 10 gene names match")
  }
  sapply(as.character(unique(labelGenes[,2])), function(x) {
    markerGenes = labelGenes[labelGenes[,2]==x,1]
    markerGenes = markerGenes[markerGenes %in% rownames(cicero_gene_activities)]
    list(peak=match(markerGenes,rownames(cicero_gene_activities)),
         gene=markerGenes)
  })
}

#' Compute Label Edges
#'
#' \code{computeLabelEdges} generates a matrix of edges from each label to each cell.
#' Edges are scaled by the passed weights and filters. Each filter can either apply
#' just to specific peaks or to whole genes (filterGene, TRUE by default) and can be permissive or
#' filtering out (filterOut, FALSE by default). Regions mapping peaks to genes must be
#' provided to use filters. If filters are applied to specific peaks, peaks for the ATAC
#' data must also be provided.
#'
#' @param labelGenes data.frame with genes of interest in first column and corresponding
#' labels in second column, optionally log-fold change in expression values in the third
#' @param ATACMat either a cell-by-peak matrix, a SnapATAC object, or an ArchR project
#' @param ATACGenePeak per label mapping of peaks to genes returned by mapPeaksToGenes()
#' @param method scaling method, either "Expression","Correlation", or "None"
#' @param filters list of GRanges lists for locations that pass filters
#' @param filterWeights numeric vector of weights assigned to each filter
#' @param filterOut boolean for each filter of whether those regions are permitted or filtered out
#' @param filterGene boolean for each filter of whether it applies to genes as a whole or just overlapping peaks (note that weights do not apply to peaks)
#' @param regions regions map peaks to genes
#' @param peaks GRanges of peaks in ATACMat
#' @param whichMat string determing whether to use "bmat" or "gmat" from snap object or "TileMatrix" or "GeneScoreMatrix" from ArchR project
#' @return matrix of weights from each label to each cell
#' @export
#' @examples
#' data("SampleCellWalkRData")
#' computeLabelEdges(SampleCellWalkRData$labelGenes,
#'                   SampleCellWalkRData$ATACMat,
#'                   SampleCellWalkRData$ATACGenePeak)
#'
computeLabelEdges = function(
  labelGenes,
  ATACMat,
  ATACGenePeak,
  method="Expression",
  filters,
  filterWeights,
  filterOut,
  filterGene,
  regions,
  peaks,
  whichMat="bmat"){
  if(missing(labelGenes)){
    stop("Must provide a table with genes of interest in first column and corresponding labels in second column")
  }
  if(is.null(dim(labelGenes)) || dim(labelGenes)[2]<2){
    stop("Must provide a table with genes of interest in first column and corresponding labels in second column")
  }
  labels = unique(labelGenes[,2])
  if(length(labels)==1){
    warning("Only one label exists")
  }

  if(missing(ATACMat)){
    stop("Must provide either a cell-by-peak matrix, a SnapATAC object, or an ArchR project")
  }
  else{
    if(is(ATACMat,"snap")){
      if(whichMat == "bmat"){
        peaks = ATACMat@feature
        ATACMat = ATACMat@bmat
      }
      else if(whichMat == "gmat"){
        ATACMat = ATACMat@gmat
      }
      else{
        stop("Must use bmat or gmat")
      }
    }
    if(is(ATACMat,"ArchRProject")){
      if(whichMat == "TileMatrix"){
        ATACData = ArchR::getMatrixFromProject(ATACMat, "TileMatrix", verbose=FALSE, binarize = TRUE)
        peaks = GenomicRanges::GRanges(ATACData@elementMetadata$seqnames, IRanges(start = ATACData@elementMetadata$start, width = 500))
        ATACMat = Matrix::t(assay(ATACData))
      }
      else if(whichMat == "GeneScoreMatrix"){
        ATACData = ArchR::getMatrixFromProject(ATACMat, "GeneScoreMatrix", verbose=FALSE)
        ATACMat = Matrix::t(assay(ATACData))
      }
      else{
        stop("Must use TileMatrix or GeneScoreMatrix")
      }
    }
    if(!is(ATACMat,"Matrix") & !is(ATACMat,"matrix")){
      stop("Must provide either a cell-by-peak matrix")
    }
    if(min(dim(ATACMat))==0){
      stop("Not enough cells or peaks in matrix")
    }
    if(dim(ATACMat)[1]>dim(ATACMat)[2]){
      warning("ATACMat has more cells than peaks, does it need to be transposed?")
    }
    if(missing(ATACGenePeak)){
      stop("Must provide a per label mapping of peaks to genes returned by mapPeaksToGenes()")
    }

    if(!missing(filters)){
      if(missing(regions)){
        stop("Must provide a GRanges object of genomic regions associatated with genes to apply filters")
      }
      if(!is(regions,"GRanges")){
        stop("regions must be a GRanges object")
      }
      if(length(regions)==0){
        stop("regions must contain at least one genomic range")
      }
      if(length(regions$gene_id)!=length(regions)){
        stop("Each region must have an associated gene_id")
      }
      if(missing(filterWeights) || length(filters)!=length(filterWeights)){
        warning("Assuming all filters have full weight")
        filterWeights = rep(1, length(filters))
      }
      if(missing(filterOut) || length(filters)!=length(filterOut)){
        warning("Assuming all filters are permissive")
        filterOut = rep(FALSE, length(filters))
      }
      if(missing(filterGene) || length(filters)!=length(filterGene)){
        warning("Assuming all filters apply at the gene level")
        filterGene = rep(TRUE, length(filters))
      }
      geneFilter = rep(1, length(labelGenes[,1]))
      for(filterIndex in which(filterGene)){
        filterRanges = filters[[filterIndex]]
        regionFilterOverlap = GenomicRanges::countOverlaps(regions, filterRanges)
        if(filterOut[filterIndex]){
          geneFilterHit = labelGenes[,1] %in% names(which(regionFilterOverlap>0))
          geneFilter[geneFilterHit] = geneFilter[geneFilterHit]*(1-filterWeights[filterIndex])
        }
        else{
          geneFilterHit = labelGenes[,1] %in% names(which(regionFilterOverlap>0))
          geneFilter[!geneFilterHit] = geneFilter[!geneFilterHit]*(1-filterWeights[filterIndex])
        }
      }
    }
    else{
      geneFilter = rep(1, length(labelGenes[,1]))
    }

    cellPeakCounts = Matrix::rowSums(ATACMat)
    cellsInMarkers = c()
    for(label in labels){
      if(!label %in% dimnames(ATACGenePeak)[[2]]){
        stop("A label is missing from the peak to gene mapping. Is it the right mapping?")
      }
      peaksInMarkers = ATACGenePeak[["peak",label]]
      genesInMarkers = ATACGenePeak[["gene",label]]
      if(max(peaksInMarkers)>dim(ATACMat)[2]){
        stop("Indecies dont match in peak to gene mapping. Is it the right mapping?")
      }
      if(length(which(genesInMarkers %in% labelGenes[,1]))!=length(genesInMarkers)){
        stop("Genes dont match in peak to gene mapping. Is it the right mapping?")
      }
      if(method=="None"){
        geneExp = rep(1, length(genesInMarkers))
      }
      else if(method=="Correlation"){
        stop("Correlation not yet implemented")
      }
      else if(method=="Expression"){
        if(dim(labelGenes)[2]<3){
          stop("Must provide a table with genes of interest in first column and corresponding labels in
               second column and log fold gene expression in the third")
        }
        # geneExp = labelGenes[match(genesInMarkers,labelGenes[labelGenes[,2]==label,1]),3]
        geneExp = labelGenes[labelGenes[,2]==label,3][match(genesInMarkers,labelGenes[labelGenes[,2]==label,1])]
      }
      else{
        stop("Not a recognized scaling method")
      }

      # geneFilterScale = geneFilter[match(genesInMarkers,labelGenes[labelGenes[,2]==label,1])]
      geneFilterScale = geneFilter[labelGenes[,2]==label][match(genesInMarkers,labelGenes[labelGenes[,2]==label,1])]

      if(!missing(filters) && sum(!filterGene)>0){
        if(missing(peaks) || !is(peaks,"GRanges")){
          stop("Must provide a GRanges object of peaks to apply filters to peaks")
        }
        if(!length(peaks)==dim(ATACMat)[2]){
          stop("Number of peaks must be the same as the number of columns in ATACMat")
        }

        for(filterIndex in which(!filterGene)){
          filterRanges = filters[[filterIndex]]
          peakFilterOverlap = GenomicRanges::countOverlaps(peaks, filterRanges)
          if(filterOut[filterIndex]){
            peaksInMarkers = peaksInMarkers[peakFilterOverlap==0]
            geneExp = geneExp[peakFilterOverlap==0]
            geneFilterScale = geneFilterScale[peakFilterOverlap==0]
          }
          else{
            peaksInMarkers = peaksInMarkers[peakFilterOverlap>0]
            geneExp = geneExp[peakFilterOverlap>0]
            geneFilterScale = geneFilterScale[peakFilterOverlap>0]
          }
        }
      }
      cellsInLabelMarkers = Matrix::colSums(Matrix::t(ATACMat[,peaksInMarkers])*geneExp*geneFilterScale)/cellPeakCounts
      cellsInLabelMarkers[cellsInLabelMarkers<0] = 0
      cellsInMarkers = cbind(cellsInMarkers,cellsInLabelMarkers)
    }
    colnames(cellsInMarkers) = labels
    cellsInMarkers = cellsInMarkers[,colSums(cellsInMarkers)!=0]
    cellsInMarkers
  }
}

#' Run CellWalker Method
#'
#' \code{walkCells} applies the CellWalker method to the given cell-to-cell and
#' cell-to-label edges.
#'
#' @param cellEdges matrix of cell-to-cell similarity (c-by-c)
#' @param labelEdgesList list of cell-to-label similarity matrices each with dimensions c-by-l
#' @param labelEdgeWeights vector of weights to test for label edges
#' @param sampleDepth depth to subsample cells to for faster calculation
#' @param steps integer indicating number of steps to take if walk should not be run to convergence
#' @param tensorflow boolean to indicate whether to compute on GPU
#' @return cellWalk object with influence matrix and labels
#' @export
#' @examples
#' data("SampleCellWalkRData")
#' labelEdgesList <- list(SampleCellWalkRData$labelEdges)
#' cellWalk <- walkCells(SampleCellWalkRData$cellEdges, labelEdgesList, 1)
#'
walkCells = function(cellEdges, labelEdgesList, labelEdgeWeights, sampleDepth, steps, tensorflow=FALSE){
  if(missing(cellEdges)){
    stop("Must provide a matrix of cell-to-cell similarity")
  }
  if(!is(cellEdges, "matrix") & !is(cellEdges,"Matrix")){
    stop("Cell-to-cell similarity must be a matrix")
  }
  if(dim(cellEdges)[1] != dim(cellEdges)[2]){
    stop("Cell-to-cell similarity matrix must have dimensions c-by-c")
  }

  c = dim(cellEdges)[1]
  if(missing(sampleDepth)){
    sampleDepth = c
  }
  else if(sampleDepth>c){
    warning("Sample depth is greater than number of cells, will use all cells")
    sampleDepth = c
  }

  if(missing(labelEdgesList) || !is(labelEdgesList, "list")){
    stop("Must provide a list of cell-to-label similarity matrices")
  }
  if(sum(sapply(labelEdgesList, function(x) dim(x)[1])!=c)!=0){
    stop("Each cell-to-label similarity matrix must have dimensions c-by-l, number of rows equal to the number of cells")
  }

  if(missing(labelEdgeWeights) || !length(labelEdgeWeights)==length(labelEdgesList)){
    stop("Must provide a weight for each set of cell-to-label edges")
  }

  l = sum(sapply(labelEdgesList, function(x) dim(x)[2]))
  combinedGraph = combineMultiLabelGraph(labelEdgesList, cellEdges, labelEdgeWeights)

  if(! is.null(colnames(cellEdges))){
    colnames(combinedGraph) = c( c(unlist(sapply(labelEdgesList, function(x) colnames(x)))), colnames(cellEdges))
    rownames(combinedGraph) = c( c(unlist(sapply(labelEdgesList, function(x) colnames(x)))), colnames(cellEdges))
  }

  if(!sampleDepth == c){
    selectCells = sample(dim(cellEdges)[1], sampleDepth)+l
    combinedGraph = combinedGraph[c(1:l,selectCells),c(1:l,selectCells)]
  }

  combinedGraph = combinedGraph[Matrix::colSums(combinedGraph)!=0, Matrix::colSums(combinedGraph)!=0]
  infMat = randomWalk(combinedGraph, tensorflow=tensorflow, steps=steps)

  if(! is.null(colnames(cellEdges))){
    rownames(infMat) = rownames(combinedGraph)
    colnames(infMat) = colnames(combinedGraph)
  }

  normMat = normalizeInfluence(infMat[-(1:l),(1:l)])
  colnames(normMat) = c(unlist(sapply(labelEdgesList, function(x) colnames(x))))
  if(max(table(colnames(normMat)))>1){
    warning("Multiple labels share the same name")
  }
  cellLabels = apply(normMat, 1, function(x) colnames(normMat)[order(x, decreasing = TRUE)][1])

  cellWalk = list(infMat=infMat, normMat=normMat, cellLabels=cellLabels)
  class(cellWalk) = "cellWalk"
  cellWalk
}

#' Tune edge weights
#'
#' \code{tuneEdgeWeights} generates cell homogeneity scores for possible edge
#' weight settings across all labelEdge matrices. To speed up calculations cells
#' can be subsampled to a specified sampleDepth (default 10000).
#'
#' @param cellEdges matrix of cell-to-cell similarity (c-by-c)
#' @param labelEdgesList list of cell-to-label similarity matrices each with dimensions c-by-l
#' @param labelEdgeOpts vector of weights to test for label edges
#' @param sampleDepth integer depth to subsample cells to for faster calculation
#' @param steps integer indicating number of steps to take if walk should not be run to convergence
#' @param cellTypes list of names of cell types, if not provided unique cell labels are used
#' @param parallel boolean, execute in parallel
#' @param numCores integer, number of cores to use for parallel execution
#' @param trackProgress boolean, print percent run completed
#' @param tensorflow boolean to indicate whether to compute on GPU
#' @return data frame of weights with corresponding cell homogeneity
#' @export
#' @examples
#' data("SampleCellWalkRData")
#' labelEdgesList <- list(SampleCellWalkRData$labelEdges)
#' tuneEdgeWeights(SampleCellWalkRData$cellEdges, labelEdgesList, 10^seq(1,7,1))
#'
tuneEdgeWeights = function(
  cellEdges,
  labelEdgesList,
  labelEdgeOpts = 10^seq(-4,4,1),
  sampleDepth = 10000,
  steps,
  cellTypes,
  parallel = FALSE,
  numCores = 1,
  trackProgress = FALSE,
  tensorflow=FALSE){

  if(missing(cellEdges)){
    stop("Must provide a matrix of cell-to-cell similarity")
  }
  if(!is(cellEdges, "matrix") & !is(cellEdges,"Matrix")){
    stop("Cell-to-cell similarity must be a matrix")
  }
  if(dim(cellEdges)[1] != dim(cellEdges)[2]){
    stop("Cell-to-cell similarity matrix must have dimensions c-by-c")
  }
  c = dim(cellEdges)[1]
  if(sampleDepth>c){
    warning("Sample depth is greater than number of cells, will use all cells")
    sampleDepth = c
  }

  if(missing(labelEdgesList) || !is(labelEdgesList, "list")){
    stop("Must provide a list of cell-to-label similarity matrices")
  }
  if(sum(sapply(labelEdgesList, function(x) dim(x)[1])!=c)!=0){
    stop("Each cell-to-label similarity matrix must have dimensions c-by-l, number of rows equal to the number of cells")
  }

  numLabelLists = length(labelEdgesList)
  l = sum(sapply(labelEdgesList, function(x) dim(x)[2]))

  parameterCombos = expand.grid(rep(list(labelEdgeOpts),numLabelLists))

  cellTypesMissing = FALSE
  if(missing(cellTypes)){
    cellTypesMissing = TRUE
  }

  if(parallel){
    if(!requireNamespace("parallel", quietly = TRUE)){
      stop("Must install parallel")
    }
    if(numCores >= parallel::detectCores()) {
      numCores = parallel::detectCores() - 1
      warning("numCores greater than avaialble cores, using available cores")
    }
    if(trackProgress){
      warning("Cant track progress in parallel")
    }
    if(tensorflow){
      warning("Cant run both on GPU and in parallel, running in parallel")
    }

    cellHomogeneity = parallel::mcmapply(function(param){
      theseWeights = as.numeric(parameterCombos[param,])

      cellWalk = walkCells(cellEdges, labelEdgesList, theseWeights, sampleDepth, steps)

      if(cellTypesMissing){
        cellTypes = unique(cellWalk[["cellLabels"]])
      }

      computeCellHomogeneity(cellWalk, cellTypes)

    }, param=1:dim(parameterCombos)[1], mc.cores = numCores)
  }
  else{
    cellHomogeneity = c()
    for(param in 1:dim(parameterCombos)[1]){
      theseWeights = as.numeric(parameterCombos[param,])

      cellWalk = walkCells(cellEdges, labelEdgesList, theseWeights, sampleDepth, steps, tensorflow=tensorflow)

      if(cellTypesMissing){
        cellTypes = unique(cellWalk[["cellLabels"]])
      }

      cellHomogeneity = c(cellHomogeneity, computeCellHomogeneity(cellWalk, cellTypes))

      if(trackProgress & sum(sapply(seq(.1,.9,.1), function(x)
        (param)/dim(parameterCombos)[1] >= x &
        (param-1)/dim(parameterCombos)[1] <= x))>=1){
        print(paste0(as.integer(param/dim(parameterCombos)[1]*100), "% done"))
      }

    }
  }

  data.frame(parameterCombos, cellHomogeneity)

}

#' Tune filter weights
#'
#' \code{tuneFilterWeights} generates cell homogeneity scores for filter weight settings (by default
#' 0 and 1 for on and off)
#' across all filters. Each filter can either apply just to specific peaks or to whole genes
#' (filterGene, TRUE by default) and can be permissive or
#' filtering out (filterOut, FALSE by default). Regions mapping peaks to genes must be
#' provided to use filters. If filters are applied to specific peaks, peaks for the ATAC
#' data must also be provided. To speed up calculations cells
#' can be subsampled to a specified sampleDepth (default 5000).
#'
#' @param cellEdges matrix of cell-to-cell similarity (c-by-c)
#' @param labelGenesList list of tables with genes of interest in first column and corresponding
#' labels in second column, optionally log-fold change in expression values in the third
#' @param labelEdgesList list of cell-to-label similarity matrices each with dimensions c-by-l
#' @param labelEdgeWeights vector of edge weights corresponding to list of cell-to-label similarity matrices
#' @param ATACMat either a cell-by-peak matrix or a SnapATAC object
#' @param ATACGenePeak per label mapping of peaks to genes returned by mapPeaksToGenes()
#' @param method scaling method, either "Expression","Correlation", or "None"
#' @param filters list of GRanges for locations that pass filters
#' @param filterWeightOpts vector of parameters to test for weights assigned to each filter
#' @param filterOut boolean for each filter of whether those regions are permitted or filtered out
#' @param filterGene boolean for each filter of whether it applies to genes as a whole or just overlapping peaks (note that weights do not apply to peaks)
#' @param regions regions map peaks to genes
#' @param peaks GRanges of peaks in ATACMat
#' @param sampleDepth integer, depth to subsample cells to for faster calculation
#' @param steps integer indicating number of steps to take if walk should not be run to convergence
#' @param cellTypes list of names of cell types, if not provided unique cell labels are used
#' @param parallel boolean, execute in parallel
#' @param numCores integer, number of cores to use for parallel execution
#' @param trackProgress boolean, print percent run completed
#' @param tensorflow boolean to indicate whether to compute on GPU
#' @return data frame of parameters with corresponding cell homogeneity
#' @export
#' @examples
#' data("SampleCellWalkRData")
#' regions <- getRegions()
#' filters <- list(SampleCellWalkRData$filter)
#' labelGenesList <- list(SampleCellWalkRData$labelGenes)
#' labelEdgesList <- list(SampleCellWalkRData$labelEdges)
#' \dontrun{tuneFilterWeights(SampleCellWalkRData$cellEdges,
#'                   labelGenesList,
#'                   labelEdgesList,
#'                   1,
#'                   SampleCellWalkRData$ATACMat,
#'                   SampleCellWalkRData$ATACGenePeak,
#'                   filters=filters,
#'                   regions=regions)}
#'
tuneFilterWeights = function(
  cellEdges,
  labelGenesList,
  labelEdgesList,
  labelEdgeWeights,
  ATACMat,
  ATACGenePeak,
  method = "Expression",
  filters,
  filterWeightOpts=c(0,1),
  filterOut,
  filterGene,
  regions,
  peaks,
  sampleDepth=5000,
  steps,
  cellTypes,
  parallel = FALSE,
  numCores = 1,
  trackProgress = FALSE,
  tensorflow = FALSE){

  if(missing(cellEdges)){
    stop("Must provide a matrix of cell-to-cell similarity")
  }
  if(!is(cellEdges, "matrix") & !is(cellEdges,"Matrix")){
    stop("Cell-to-cell similarity must be a matrix")
  }
  if(dim(cellEdges)[1] != dim(cellEdges)[2]){
    stop("Cell-to-cell similarity matrix must have dimensions c-by-c")
  }
  c = dim(cellEdges)[1]
  if(sampleDepth>c){
    warning("Sample depth is greater than number of cells, will use all cells")
    sampleDepth = c
  }

  if(missing(labelGenesList) || !is(labelGenesList, "list")){
    stop("Must provide a  list of tables with genes of interest in first column and corresponding
         labels in second column, optionally log-fold change in expression values in the third")
  }

  if(missing(labelEdgesList) || !is(labelEdgesList, "list")){
    stop("Must provide a list of cell-to-label similarity matrices")
  }
  if(sum(sapply(labelEdgesList, function(x) dim(x)[1])!=c)!=0){
    stop("Each cell-to-label similarity matrix must have dimensions c-by-l, number of rows equal to the number of cells")
  }

  if(missing(labelEdgeWeights) || !length(labelEdgeWeights)==length(labelEdgesList)){
    stop("Must provide a weight for each set of cell-to-label edges")
  }

  if(missing(ATACMat)){
    stop("Must provide either a cell-by-peak matrix or a SnapATAC object")
  }
  if(is(ATACMat,"snap")){
    stop("Not yet implemented")
  }
  if(missing(ATACGenePeak)){
    stop("Must provide a per label mapping of peaks to genes returned by mapPeaksToGenes()")
  }

  if(missing(filters) || !is(filters, "list")){
    stop("Must provide list of GRanges for locations that pass filters")
  }
  if(missing(regions)){
      stop("Must provide a GRanges object of genomic regions associatated with genes to apply filters")
  }

  numFilters = length(filters)
  parameterCombos = expand.grid(rep(list(filterWeightOpts),numFilters))

  cellTypesMissing = FALSE
  if(missing(cellTypes)){
    cellTypesMissing = TRUE
  }

  if(parallel){
    if(!requireNamespace("parallel", quietly = TRUE)){
      stop("Must install parallel")
    }
    if(numCores >= parallel::detectCores()) {
      numCores = parallel::detectCores() - 1
      warning("numCores greater than avaialble cores, using available cores")
    }
    if(trackProgress){
      warning("Cant track progress in parallel")
    }
    if(tensorflow){
      warning("Cant run both on GPU and in parallel, running in parallel")
    }
    if(missing(filterOut)){
      filterOut = rep(FALSE, length(filters))
    }
    if(missing(filterGene)){
      filterGene = rep(TRUE, length(filters))
    }
    if(missing(peaks)){
      if(sum(!filterGene)>0 && missing(peaks)){
        stop("Must provide a GRanges object of peaks to apply filters to peaks")
      }
      else{
        peaks = NULL
      }
    }
    cellHomogeneity = parallel::mcmapply(function(param){
      theseWeights = as.numeric(parameterCombos[param,])
      labelEdgesList = list()
      for(labelGenes in labelGenesList){
        newEdges = computeLabelEdges(labelGenes, ATACMat, ATACGenePeak, method,
                                     filters, theseWeights,filterOut,
                                     filterGene, regions,peaks)
        labelEdgesList = c(labelEdgesList,list(newEdges))
      }
      l = sum(sapply(labelEdgesList, function(x) dim(x)[2]))

      cellWalk = walkCells(cellEdges, labelEdgesList, labelEdgeWeights, sampleDepth, steps)

      if(cellTypesMissing){
        cellTypes = unique(cellWalk[["cellLabels"]])
      }

      computeCellHomogeneity(cellWalk, cellTypes)

    }, param=1:dim(parameterCombos)[1], mc.cores = numCores)
  }
  else{
    cellHomogeneity = c()
    for(param in 1:dim(parameterCombos)[1]){
      #add option for filter to only apply to some cells? to some labels?
      theseWeights = as.numeric(parameterCombos[param,])
      labelEdgesList = list()
      for(labelGenes in labelGenesList){
        newEdges = computeLabelEdges(labelGenes, ATACMat, ATACGenePeak, method,
                                     filters, theseWeights,filterOut,
                                     filterGene, regions,peaks)
        labelEdgesList = c(labelEdgesList,list(newEdges))
      }
      l = sum(sapply(labelEdgesList, function(x) dim(x)[2]))

      cellWalk = walkCells(cellEdges, labelEdgesList, labelEdgeWeights, sampleDepth, steps, tensorflow=tensorflow)
      infMat = cellWalk[["infMat"]]
      normMat = cellWalk[["normMat"]]
      cellLabels = cellWalk[["cellLabels"]]

      if(cellTypesMissing){
        cellTypes = unique(cellWalk[["cellLabels"]])
      }

      cellHomogeneity = c(cellHomogeneity, computeCellHomogeneity(cellWalk, cellTypes))

      if(trackProgress & sum(sapply(seq(.1,.9,.1), function(x)
        (param)/dim(parameterCombos)[1] >= x &
        (param-1)/dim(parameterCombos)[1] <= x))>=1){
        print(paste0(as.integer(param/dim(parameterCombos)[1]*100), "% done"))
      }

    }
  }

  data.frame(parameterCombos, cellHomogeneity)
}

#' Find markers in scRNA-seq data
#'
#' \code{findMarkers} Uses Seurat to find markers in scRNA-seq data and returns
#' a table of markers and labels that can be used by CellWalker. This implementation
#' uses default parameters and is designed for a simple first pass for creating
#' labels.
#'
#' @param RNAMat gene-by-barcode matrix of scRNA data
#' @param genes character vector of gene names
#' @param barcodes character vector of barcodes
#' @param dims integer vector of PCA dimensions to use
#' @param resolution numeric resolution to use in Louvain clustering
#' @param only.pos only return positive markers
#' @return table of marker genes, labels, and logFC in expression
#' @export
#'
findMarkers = function(RNAMat, genes, barcodes, dims=1:10, resolution=0.5, only.pos = FALSE){
  if(!requireNamespace("Seurat", quietly = TRUE)){
    stop("Must install Seurat")
  }
  if(missing(RNAMat) || !is(RNAMat, "matrix")){
    stop("Must provide a matrix of RNA data")
  }
  if(missing(genes) || length(genes)!=dim(RNAMat)[1]){
    stop("Must provide a vector of gene names equal to the number of rows in the RNA matrix")
  }
  if(missing(barcodes) || length(barcodes)!=dim(RNAMat)[2]){
    stop("Must provide a vector of barcodes equal to the number of columns in the RNA matrix")
  }
  rownames(RNAMat) = genes
  colnames(RNAMat) = barcodes
  RNASeurat = Seurat::CreateSeuratObject(counts = RNAMat)
  RNASeurat = Seurat::NormalizeData(RNASeurat)
  RNASeurat = Seurat::FindVariableFeatures(RNASeurat)
  RNASeurat = Seurat::ScaleData(RNASeurat)
  RNASeurat = Seurat::RunPCA(RNASeurat)
  RNASeurat = Seurat::FindNeighbors(RNASeurat, dims = dims)
  RNASeurat = Seurat::FindClusters(RNASeurat, resolution = resolution)
  RNASeuratMarkers = Seurat::FindAllMarkers(RNASeurat, only.pos = only.pos)
  data.frame(gene=RNASeuratMarkers$gene,
             cluster=as.character(RNASeuratMarkers$cluster),
             logFC=RNASeuratMarkers$avg_logFC)
}

#' Download Sample Data
#'
#' \code{downloadSampleData} Download sample data for use with vignette
#'
#' @param dest folder to download data to
#' @return destination folder
#' @export
#'
downloadSampleData = function(dest="inst/extdata"){
  if(!dir.exists(dest)){
    dir.create(dest, showWarnings = FALSE)
  }
  downloadFiles = download.file(url = "https://figshare.com/ndownloader/files/32630126", destfile = file.path(dest,"SamplePeakMat.mtx"))
  downloadFiles = download.file(url = "https://figshare.com/ndownloader/files/32670425", destfile = file.path(dest,"SamplePeaks.txt"))
  dest
}

#' Load Sample Data
#'
#' \code{loadSampleData} Download sample data for use with vignette, only used for testing.
#' To load sampel data, simply call \code{data("SampleCellWalkRData")}
#' @export
#' @return sample data
#' @examples
#' loadSampleData()
#'
loadSampleData = function(){
  # data("SampleCellWalkRData")
  # SampleCellWalkRData
}
PFPrzytycki/CellWalkR documentation built on Oct. 26, 2023, 1:50 p.m.