R/GroupCoverages.R

Defines functions .writeCoverageToBed .getCoverageInsertionSites .getCoverageRle .getCoverageParams .getCoverageMetadata .addKmerBiasToCoverage .leastOverlapCells .identifyGroupsForPseudoBulk .createCoverages addGroupCoverages

Documented in addGroupCoverages

#' Add Group Coverages to an ArchRProject object
#' 
#' This function will merge cells within each designated cell group for the generation of pseudo-bulk replicates and then
#' merge these replicates into a single insertion coverage file.
#'
#' @param ArchRProj An `ArchRProject` object.
#' @param groupBy The name of the column in `cellColData` to use for grouping multiple cells together prior to generation of the insertion coverage file.
#' @param useLabels A boolean value indicating whether to use sample labels to create sample-aware subgroupings during as pseudo-bulk replicate generation.
#' @param sampleLabels The name of a column in `cellColData` to use to identify samples. In most cases, this parameter should be left as `NULL` and you
#' should only use this parameter if you do not want to use the default sample labels stored in `cellColData$Sample`. However, if your individual Arrow
#' files do not map to individual samples, then you should set this parameter to accurately identify your samples. This is the case in (for example)
#' multiplexing applications where cells from different biological samples are mixed into the same reaction and demultiplexed based on a lipid barcode or genotype.
#' @param minCells The minimum number of cells required in a given cell group to permit insertion coverage file generation.
#' @param maxCells The maximum number of cells to use during insertion coverage file generation.
#' @param maxFragments The maximum number of fragments per cell group to use in insertion coverage file generation. This prevents the generation
#' of excessively large files which would negatively impact memory requirements.
#' @param minReplicates The minimum number of pseudo-bulk replicates to be generated.
#' @param maxReplicates The maximum number of pseudo-bulk replicates to be generated.
#' @param sampleRatio The fraction of the total cells that can be sampled to generate any given pseudo-bulk replicate.
#' @param kmerLength The length of the k-mer used for estimating Tn5 bias.
#' @param threads The number of threads to be used for parallel computing.
#' @param returnGroups A boolean value that indicates whether to return sample-guided cell-groupings without creating coverages.
#' This is used mainly in `addReproduciblePeakSet()` when MACS2 is not being used to call peaks but rather peaks are called from a
#' TileMatrix (`peakMethod = "Tiles"`).
#' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing.
#' @param force A boolean value that indicates whether or not to overwrite the relevant data in the `ArchRProject` object if
#' insertion coverage / pseudo-bulk replicate information already exists.
#' @param verbose A boolean value that determines whether standard output includes verbose sections.
#' @param logFile The path to a file to be used for logging ArchR output.
#' @export
addGroupCoverages <- function(
  ArchRProj = NULL,
  groupBy = "Clusters",
  useLabels = TRUE,
  sampleLabels = "Sample",
  minCells = 40,
  maxCells = 500,
  maxFragments = 25*10^6,
  minReplicates = 2,
  maxReplicates = 5,
  sampleRatio = 0.8,
  kmerLength = 6,
  threads = getArchRThreads(),
  returnGroups = FALSE,
  parallelParam = NULL,
  force = FALSE,
  verbose = TRUE,
  logFile = createLogFile("addGroupCoverages")
  ){

  .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
  .validInput(input = groupBy, name = "groupBy", valid = c("character"))
  .validInput(input = useLabels, name = "useLabels", valid = c("boolean"))
  .validInput(input = sampleLabels, name = "sampleLabels", valid = c("character"))
  .validInput(input = minCells, name = "minCells", valid = c("integer"))
  .validInput(input = maxCells, name = "maxCells", valid = c("integer"))
  .validInput(input = maxFragments, name = "maxFragments", valid = c("integer"))
  .validInput(input = minReplicates, name = "minReplicates", valid = c("integer"))
  .validInput(input = maxReplicates, name = "maxReplicates", valid = c("integer"))
  .validInput(input = sampleRatio, name = "sampleRatio", valid = c("numeric"))
  .validInput(input = kmerLength, name = "kmerLength", valid = c("integer"))
  .validInput(input = threads, name = "threads", valid = c("integer"))
  .validInput(input = returnGroups, name = "returnGroups", valid = c("boolean"))
  .validInput(input = parallelParam, name = "parallelParam", valid = c("parallelparam","null"))
  .validInput(input = force, name = "force", valid = c("boolean"))
  .validInput(input = verbose, name = "verbose", valid = c("boolean"))
  .validInput(input = logFile, name = "logFile", valid = c("character"))

  if(minReplicates < 2){
    stop("minReplicates must be at least 2!")
  }

  if(sampleLabels %ni% colnames(ArchRProj@cellColData)) {
    stop("sampleLabels is not a column in cellColData!")
  }

  tstart <- Sys.time()
  .startLogging(logFile = logFile)
  .logThis(mget(names(formals()),sys.frame(sys.nframe())), "addGroupCoverages Input-Parameters", logFile = logFile)

  Params <- SimpleList(
    groupBy = groupBy,
    minCells = minCells,
    maxCells = maxCells,
    minReplicates = minReplicates,
    sampleRatio = sampleRatio,
    kmerLength = kmerLength
  )

  if(is.null(ArchRProj@projectMetadata$GroupCoverages)){
    ArchRProj@projectMetadata$GroupCoverages <- SimpleList()
  }

  if(!returnGroups){
    if(!is.null(ArchRProj@projectMetadata$GroupCoverages[[groupBy]])){
      if(!force){
        stop("Group Coverages Already Computed, Set force = TRUE to continue!")
      }
    }
  }else{
    if(!is.null(ArchRProj@projectMetadata$GroupCoverages[[groupBy]])){
      if(!force){
        message("Group Coverages Already Computed Returning Groups, Set force = TRUE to Recompute!")
        return(ArchRProj@projectMetadata$GroupCoverages[[groupBy]])
      }
    }      
  }

  #####################################################
  #Groups 
  #####################################################
  cellNames <- rownames(getCellColData(ArchRProj))
  groups <- getCellColData(ArchRProj, groupBy, drop = TRUE)
  if(any(is.na(groups))){
      cellNames <- cellNames[!is.na(groups)]
      groups <- groups[!is.na(groups)]
  }
  uniqueGroups <- gtools::mixedsort(unique(groups))
  tableGroups <- table(groups)[uniqueGroups]

  #####################################################
  #Create Cell Groups
  #####################################################
  cellGroups <- lapply(seq_along(uniqueGroups), function(x){
      subColDat <- getCellColData(ArchRProj)[which(groups==uniqueGroups[x]),]
      cellNamesx <- rownames(subColDat)
      #if(length(cellNamesx) < minCells){
      #  outListx <- SimpleList(LowCellGroup = cellNamesx) or NULL
      #}
      if(useLabels){
        sampleLabelsx <- paste0(subColDat[,sampleLabels])
      } else {
        sampleLabelsx <- NULL
      }
      outListx <- .identifyGroupsForPseudoBulk(
        cells = cellNamesx, 
        sampleLabels = sampleLabelsx,
        useLabels = useLabels,
        minCells = minCells, 
        maxCells = maxCells,
        minReplicates = minReplicates, 
        maxReplicates = maxReplicates,
        sampleRatio = sampleRatio,
        prefix = sprintf("%s (%s of %s) :", uniqueGroups[x], x, length(uniqueGroups)),
        logFile = logFile
      )
      if(is.null(outListx)){
        return(NULL)
      }
      if(is.null(names(outListx))){
          names(outListx) <- paste0("Rep", seq_along(outListx))
      }else if(any(names(outListx)=="")){
          names(outListx)[which(names(outListx)=="")] <- paste0("Rep", which(names(outListx)==""))
      }
      outListx
  }) %>% SimpleList
  names(cellGroups) <- uniqueGroups
  Params$cellGroups <- cellGroups

  #####################################################
  #Check For Max Fragments!
  #####################################################
  it <- 0
  for(i in seq_along(cellGroups)){
    for(j in seq_along(cellGroups[[i]])){
      if(sum(getCellColData(ArchRProj, "nFrags")[cellGroups[[i]][[j]],]) > maxFragments){
        it <- it + 1
        nFrags <- getCellColData(ArchRProj, "nFrags")[cellGroups[[i]][[j]],]
        cells <- cellGroups[[i]][[j]][order(nFrags)]
        nFrags <- nFrags[order(nFrags)]
        cellGroups[[i]][[j]] <- cells[which(cumsum(nFrags) < maxFragments)]
      }
    }
  }
  if(it > 0){
    .logDiffTime(sprintf("Further Sampled %s Groups above the Max Fragments!", it), tstart)
  }

  if(returnGroups){
    return(cellGroups)
  }

  #####################################################
  # Arguments for Coverages
  #####################################################

  dir.create(file.path(getOutputDirectory(ArchRProj), "GroupCoverages"), showWarnings = FALSE)
  dir.create(file.path(getOutputDirectory(ArchRProj), "GroupCoverages", groupBy), showWarnings = FALSE)

  unlistGroups <- lapply(seq_along(cellGroups), function(x){
    if(is.null(cellGroups[[x]])){
      NULL
    }else{
      names(cellGroups[[x]]) <- paste0(names(cellGroups)[x], "._.", names(cellGroups[[x]]))
      cellGroups[[x]]
    }
  }) %>% SimpleList %>%unlist()

  args <- list()
  args$X <- seq_along(unlistGroups)
  args$FUN <- .createCoverages
  args$cellGroups <- unlistGroups
  args$genome <- getGenome(ArchRProj)
  args$kmerLength <- kmerLength
  args$ArrowFiles <- getArrowFiles(ArchRProj)
  args$availableChr <- .availableSeqnames(getArrowFiles(ArchRProj))
  args$chromLengths <- getChromLengths(ArchRProj)
  #args$cellsInArrow <- split(rownames(getCellColData(ArchRProj)), getCellColData(ArchRProj)$Sample)
  args$cellsInArrow <-   cellsInArrow <- split(
    rownames(getCellColData(ArchRProj)), 
    stringr::str_split(rownames(getCellColData(ArchRProj)), pattern="\\#", simplify=TRUE)[,1]
  )
  args$covDir <- file.path(getOutputDirectory(ArchRProj), "GroupCoverages", groupBy)
  args$parallelParam <- parallelParam
  args$threads <- threads
  args$verbose <- verbose
  args$tstart <- tstart
  args$logFile <- logFile
  args$registryDir <- file.path(getOutputDirectory(ArchRProj), "GroupCoverages", "batchRegistry")

  #####################################################
  # Batch Apply to Create Insertion Coverage Files
  #####################################################

  #Disable Hdf5 File Locking
  h5disableFileLocking()

  #Batch Apply
  .logDiffTime(sprintf("Creating Coverage Files!"), tstart, addHeader = FALSE)
  batchOut <- .batchlapply(args)
  coverageFiles <- lapply(seq_along(batchOut),function(x) batchOut[[x]]$covFile) %>% unlist
  nCells <- lapply(seq_along(batchOut),function(x) batchOut[[x]]$nCells) %>% unlist
  nFragments <- lapply(seq_along(batchOut),function(x) batchOut[[x]]$nFragments) %>% unlist

  #Add To Project
  coverageMetadata <- DataFrame(
    Group = stringr::str_split(names(unlistGroups), pattern = "\\._.", simplify=TRUE)[,1],
    Name = names(unlistGroups), 
    File = coverageFiles,
    nCells = nCells,
    nInsertions = nFragments * 2
  )

  #####################################################
  # Compute Kmer Bias for each coverage file!
  #####################################################
  .logDiffTime(sprintf("Adding Kmer Bias to Coverage Files!"), tstart, addHeader = FALSE)
  o <- .addKmerBiasToCoverage(
    coverageMetadata = coverageMetadata, 
    genome = getGenome(ArchRProj), 
    kmerLength = kmerLength, 
    threads = threads,
    verbose = FALSE,
    logFile = logFile
  )

  ArchRProj@projectMetadata$GroupCoverages[[groupBy]] <- SimpleList(Params = Params, coverageMetadata = coverageMetadata)

  #Enable Hdf5 File Locking
  h5enableFileLocking()

  .logDiffTime(sprintf("Finished Creation of Coverage Files!"), tstart, addHeader = FALSE)
  .endLogging(logFile = logFile)

  ArchRProj

}

#####################################################################################################
# Creating Insertion (1bp) Coverage Hdf5 Files for downstream group analyses
#####################################################################################################

.createCoverages <- function(
  i = NULL, 
  cellGroups,
  kmerBias = NULL, 
  kmerLength = 6, 
  genome = NULL,
  ArrowFiles = NULL, 
  cellsInArrow = NULL, 
  availableChr = NULL,
  chromLengths = NULL, 
  covDir = NULL, 
  tstart = NULL, 
  subThreads = 1,
  verbose = TRUE,
  logFile = NULL
  ){

  prefix <- sprintf("Group %s (%s of %s) :", names(cellGroups)[i], i, length(cellGroups))

  #Cells
  cellGroupi <- cellGroups[[i]]
  
  #Coverage File!
  namei <- make.names(names(cellGroups)[i]) #Maybe naming convention is weird
  covFile <- file.path(covDir, paste0(namei, ".insertions.coverage.h5"))
  rmf <- .suppressAll(file.remove(covFile))

  .logDiffTime(sprintf("%s Creating Group Coverage File : %s", prefix, basename(covFile)), tstart, verbose = verbose, logFile = logFile)

  #Dealing with sampling w/o replacement!
  tableGroupi <- table(cellGroupi)

  .logThis(cellGroupi, paste0(prefix, " cellGroupi"), logFile = logFile)

  .logMessage(paste0("Number of Cells = ", length(cellGroups[[i]])), logFile = logFile)

  #Create Hdf5 File!
  o <- tryCatch({
    o <- h5closeAll()
    o <- h5createFile(covFile)   
  }, error = function(e){
    rmf <- .suppressAll(file.remove(covFile))
    o <- h5closeAll()
    o <- h5createFile(covFile)
  })

  if(file.exists(covFile)){
    .logMessage("Coverage File Exists!", logFile = logFile)
  }else{
    .logMessage("Coverage File Does Not Exist!", logFile = logFile)
  }

  o <- h5closeAll()
  o <- h5createGroup(covFile, paste0("Coverage"))
  .logMessage("Added Coverage Group", logFile = logFile)

  o <- h5closeAll()
  o <- h5createGroup(covFile, paste0("Metadata"))
  .logMessage("Added Metadata Group", logFile = logFile)

  o <- h5closeAll()
  o <- h5write(obj = "ArrowCoverage", file = covFile, name = "Class")
  .logMessage("Added ArrowCoverage Class", logFile = logFile)

  o <- h5closeAll()
  o <- h5createGroup(covFile, paste0("Coverage/Info"))
  .logMessage("Added Coverage/Info", logFile = logFile)

  o <- h5closeAll()
  o <- h5write(as.character(cellGroupi), covFile, "Coverage/Info/CellNames")
  .logMessage("Added Coverage/Info/CellNames", logFile = logFile)

  #We need to dump all the cells into a coverage file
  nFragDump <- 0
  nCells <- c()
  for(k in seq_along(availableChr)){
    
    .logDiffTime(sprintf("%s Processed Fragments Chr (%s of %s)", prefix, k, length(availableChr)), tstart, verbose = FALSE, logFile = logFile)
    
    it <- 0
    
    for(j in seq_along(ArrowFiles)){
      
      cellsInI <- sum(cellsInArrow[[names(ArrowFiles)[j]]] %in% cellGroupi)
      
      if(cellsInI > 0){
      
        it <- it + 1
      
        if(it == 1){
      
          fragik <- .getFragsFromArrow(ArrowFiles[j], chr = availableChr[k], out = "GRanges", cellNames = cellGroupi)
      
        }else{
      
          fragik <- c(fragik, .getFragsFromArrow(ArrowFiles[j], chr = availableChr[k], out = "GRanges", cellNames = cellGroupi))
      
        }
      
      }

    }

    #Dealing with sampling w/o replacement!
    matchRG <- as.vector(S4Vectors::match(mcols(fragik)$RG, names(tableGroupi)))
    fragik <- rep(fragik, tableGroupi[matchRG])
    nCells <- c(nCells, unique(mcols(fragik)$RG))

    #Compute Rle Coverage
    covk <- coverage(IRanges(start = c( start(fragik), end(fragik) ), width = 1), width = chromLengths[availableChr[k]])
        nFragDump <- nFragDump + length(fragik)
    rm(fragik)

    #Write To Hdf5
    chrLengths <- paste0("Coverage/",availableChr[k],"/Lengths")
    chrValues <- paste0("Coverage/",availableChr[k],"/Values")
    lengthRle <- length(covk@lengths)
    o <- h5createGroup(covFile, paste0("Coverage/",availableChr[k]))
    o <- .suppressAll(h5createDataset(covFile, chrLengths, storage.mode = "integer", dims = c(lengthRle, 1), level = 0))
    o <- .suppressAll(h5createDataset(covFile, chrValues, storage.mode = "integer", dims = c(lengthRle, 1), level = 0))
    o <- h5write(obj = covk@lengths, file = covFile, name = chrLengths)
    o <- h5write(obj = covk@values, file = covFile, name = chrValues)

    gc()

  }

  if(length(unique(cellGroupi)) != length(unique(nCells))){
    .logMessage(paste0("Not all cells (", length(unique(cellGroupi)), ") were found for coverage creation (", length(unique(nCells)), ")!"), logFile = logFile)
    stop("Not all cells (", length(unique(cellGroupi)), ") were found for coverage creation (", length(unique(nCells)), ")!")
  }

  out <- list(covFile = covFile, nCells = length(cellGroupi), nFragments = nFragDump)

  return(out)

}

#####################################################################################################
# Creating Groups of Cells For Pseudobulk Coverage Files
#####################################################################################################

.identifyGroupsForPseudoBulk <- function(
  cells = NULL,
  sampleLabels = NULL,
  useLabels = TRUE,
  minCells = 50,
  maxCells = 500,
  filterGroups = FALSE,
  minReplicates = 2,
  maxReplicates = NULL,
  sampleRatio = 0.8,
  prefix = NULL,
  logFile = NULL
  ){

  if(is.null(sampleLabels)){
    sampleLabels <- rep("A", length(cells))
  }else{
    if(length(cells) != length(sampleLabels)){
      .logMessage("Length of cells need to be same length as sample labels!", logFile = logFile)
      stop("Length of cells need to be same length as sample labels!")
    }
  }
  nCells <- length(cells)
  nCellsPerSample <- table(sampleLabels)
  nCellsPerSample <- nCellsPerSample[sample(seq_along(nCellsPerSample), length(nCellsPerSample))]
  #Samples Passing Min Filter
  samplesPassFilter <- sum(nCellsPerSample >= minCells)   
  samplesThatCouldBeMergedToPass <- floor(sum(nCellsPerSample[nCellsPerSample < minCells]) / minCells)

  errorList <- mget(names(formals()),sys.frame(sys.nframe()))
  errorList$nCells <- nCells
  errorList$nCellsPerSample <- nCellsPerSample
  errorList$samplesPassFilter <- samplesPassFilter
  errorList$samplesThatCouldBeMergedToPass <- samplesThatCouldBeMergedToPass

  cellGroupsPass2 <- tryCatch({

    if(nCells >= minCells * minReplicates & useLabels){
        ############################################################
        # Identifying High-Quality peaks when Cells and Fragments are abundant
        ############################################################
        #Samples Passing Min Filter
        samplesPassFilter <- sum(nCellsPerSample >= minCells) 
        samplesThatCouldBeMergedToPass <- floor(sum(nCellsPerSample[nCellsPerSample < minCells]) / minCells)
        #First Group Cells By Sample
        cellGroups <- split(cells, sampleLabels)
        #Identify Samples That Pass Min Cells
        samples <- names(nCellsPerSample)[nCellsPerSample > minCells]
        cellGroupsPass <- cellGroups[samples]
        #Samples That Do Not Pass
        if(!all(names(cellGroups) %in% names(cellGroupsPass))){
          cellGroupsNotPass <- cellGroups[names(cellGroups) %ni% samples]
        }else{
          cellGroupsNotPass <- list()
        }
        if(samplesPassFilter >= minReplicates){
          ############################################################
          # If there are at least minReplicates with > minCells
          ############################################################
          #If we look at the remaining cells and see that there are enough to make an additional replicate
          nCellsRemaining <- length(unlist(cellGroupsNotPass))
          if(nCellsRemaining >= minCells){
            cellGroupsPass$Other <- unlist(cellGroupsNotPass)
          }
        }else if(samplesPassFilter + samplesThatCouldBeMergedToPass >= minReplicates){
          cellsRemaining <- unlist(cellGroupsNotPass, use.names = FALSE)
          nGroupsRemaining <- minReplicates - samplesPassFilter
          cellsRemaining <- sample(cellsRemaining, length(cellsRemaining))
          cellGroupsSample <- split(cellsRemaining, ceiling(seq_along(cellsRemaining)/ ceiling(length(cellsRemaining)/nGroupsRemaining)))
          #Add to Groups Pass QC
          if(samplesPassFilter == 0){
            cellGroupsPass <- cellGroupsSample
            names(cellGroupsPass) <- paste0("Rep", seq_along(cellGroupsPass))
          }else{
            cellGroupsPass <- append(cellGroupsPass, cellGroupsSample)
          }
        }else{
          cellsRemaining <- unlist(cellGroupsNotPass, use.names = FALSE)
          nCellsRemaining <- length(cellsRemaining)
          cellsNeeded <- minCells * (minReplicates - samplesPassFilter) - nCellsRemaining
          cellsFromPass <- sample(unlist(cellGroupsPass, use.names = FALSE), cellsNeeded)
          cellGroupsPass <- lapply(cellGroupsPass, function(x){
            x[x %ni% cellsFromPass]
          })
          cellGroupsPass[[length(cellGroupsPass) + 1]] <- c(cellsRemaining, cellsFromPass)
          names(cellGroupsPass) <- paste0("Rep", seq_along(cellGroupsPass))
        }
    }else{
        ############################################################
        # Identifying High-Quality peaks when Cells are not abundant
        ############################################################
        if(nCells >= minCells / sampleRatio){
            ############################################################
            # When there are more cells than min cells but not enough for robust reproducibility
            ############################################################
            cellGroupsPass <- .leastOverlapCells(x = cells, n = minReplicates, nSample = minCells) #length(cells) * sampleRatio)
        }else{
            ############################################################
            # Sampling With Replacement, Not Super Desirable
            ############################################################
            if(filterGroups){
              return(NULL)
            }else{            
              cellGroupsPass <- .leastOverlapCells(x = cells, n = minReplicates, nSample = minCells, replace = TRUE) #length(cells) * sampleRatio)
            }
        }
    }
    cellGroupsPass <- as(cellGroupsPass, "SimpleList")

    for(i in seq_along(cellGroupsPass)){
        if(length(cellGroupsPass[[i]]) > maxCells){
            cellGroupsPass[[i]] <- sample(cellGroupsPass[[i]], maxCells)
        }
    }

    #Rank Group by Unique # of Cells
    nUnique <- lapply(cellGroupsPass, function(x){
      length(unique(x))
    }) %>% unlist

    cellGroupsPass <- cellGroupsPass[order(nUnique, decreasing = TRUE)]

    if(!is.null(maxReplicates)){
      if(length(cellGroupsPass) > maxReplicates){
        cellGroupsPass <- cellGroupsPass[seq_len(maxReplicates)]
      }    
    }

    cellGroupsPass

  }, error = function(e){

   .logError(e, fn = ".identifyGroupsForPseudoBulk", info = prefix, errorList = errorList, logFile = logFile) 

  })

  .logMessage(paste0(prefix, " CellGroups N = ", length(cellGroupsPass2)), logFile = logFile)
  #.logThis(cellGroupsPass2, paste0(prefix, " CellGroups"), logFile = logFile)

  return(cellGroupsPass2)
  
}

.leastOverlapCells <- function(x = NULL, n = 2, nSample = 0.8 * length(l), iterations = 100, replace = FALSE){   
    set.seed(1)
    maxMat <- matrix(0, nrow = length(x), ncol = n)
    for(i in seq_len(iterations)){
      currentMat <- matrix(0, nrow = length(x), ncol = n)
      for(j in seq_len(n)){
        currentMat[sample(seq_along(x), nSample, replace = replace), j] <- 1
      }
      disti <- max(dist(t(currentMat), method = "euclidean"))
      if(i==1){
        maxMat <- currentMat
        maxDist <- disti
      }else{
        if(disti > maxDist){
          maxMat <- currentMat
          maxDist <- disti        
        }
      }
    }
    out <- lapply(seq_len(ncol(maxMat)), function(i){
        x[which(maxMat[,i]==1)]
    })
    return(out)
}

#####################################################################################################
# Add Kmer Tn5 Bias Values to Each Coverage File!
#####################################################################################################
.addKmerBiasToCoverage <- function(
  coverageMetadata = NULL,
  genome = NULL,
  kmerLength = NULL,
  threads = NULL,
  verbose = TRUE,
  tstart = NULL,
  logFile = NULL
  ){

  .logThis(append(args, mget(names(formals()),sys.frame(sys.nframe()))), "kmerBias-Parameters", logFile = logFile)
  
  .requirePackage("Biostrings", source = "bioc")
  BSgenome <- eval(parse(text = genome))
  BSgenome <- validBSgenome(BSgenome)

  if(is.null(tstart)){
    tstart <- Sys.time()
  }
  
  coverageFiles <- coverageMetadata$File
  names(coverageFiles) <- coverageMetadata$Name
  availableChr <- .availableSeqnames(coverageFiles, "Coverage")

  biasList <- .safelapply(seq_along(availableChr), function(x){
    .logMessage(sprintf("Kmer Bias %s (%s of %s)", availableChr[x], x, length(availableChr)), logFile = logFile)
    message(availableChr[x]," ", appendLF = FALSE)
    chrBS <- BSgenome[[availableChr[x]]]
    exp <- Biostrings::oligonucleotideFrequency(chrBS, width = kmerLength)
    obsList <- lapply(seq_along(coverageFiles), function(y){
      .logMessage(sprintf("Coverage File %s (%s of %s)", availableChr[x], y, length(coverageFiles)), logFile = logFile)
      tryCatch({
        obsx <- .getCoverageInsertionSites(coverageFiles[y], availableChr[x]) %>%
          {BSgenome::Views(chrBS, IRanges(start = . - floor(kmerLength/2), width = kmerLength))} %>%
          {Biostrings::oligonucleotideFrequency(., width = kmerLength, simplify.as="collapsed")}
        tryCatch({gc()}, error = function(e){})
        obsx
      }, error = function(e){
        errorList <- list(
          y = y, 
          coverageFile = coverageFiles[y], 
          chr = availableChr[x], 
          iS = tryCatch({
            .getCoverageInsertionSites(coverageFiles[y], availableChr[x])
            }, error = function(e){
              "Error .getCoverageInsertionSites"
            })
        )
        .logError(e, fn = ".addKmerBiasToCoverage", info = "", errorList = errorList, logFile = logFile)
      })
    }) %>% SimpleList
    names(obsList) <- names(coverageFiles)
    SimpleList(expected = exp, observed = obsList)
  }, threads = threads) %>% SimpleList
  names(biasList) <- availableChr
  .logMessage("Completed Kmer Bias Calculation", logFile = logFile)

  #Summarize Bias
  for(i in seq_along(biasList)){
    if(i == 1){
      expAll <- biasList[[i]]$expected
      obsAll <- biasList[[i]]$observed
    }else{
      expAll <- expAll + biasList[[i]]$expected 
      for(j in seq_along(obsAll)){
        obsAll[[j]] <- obsAll[[j]] + biasList[[i]]$observed[[names(obsAll)[j]]]
      }
    }
  }

  #Write Bias to Coverage Files
  for(i in seq_along(coverageFiles)){

    .logMessage(sprintf("Adding Kmer Bias (%s of %s)", i, length(coverageFiles)), logFile = logFile)

    obsAlli <- obsAll[[names(coverageFiles)[i]]]
    if(!identical(names(expAll), names(obsAlli))){
      .logMessage("Kmer Names in Exp and Obs not Identical!", logFile = logFile)
      stop("Kmer Names in Exp and Obs not Identical!")
    }
    o <- h5createGroup(coverageFiles[i], "KmerBias")
    o <- h5createGroup(coverageFiles[i], "KmerBias/Info")
    o <- h5write(obj = genome, file = coverageFiles[i], name = "KmerBias/Info/Genome")
    o <- h5write(obj = kmerLength, file = coverageFiles[i], name = "KmerBias/Info/KmerLength")
    o <- h5write(obj = paste0(names(obsAlli)), file = coverageFiles[i], name = "KmerBias/Kmer")
    o <- h5write(obj = obsAlli, file = coverageFiles[i], name = "KmerBias/ObservedKmers")
    o <- h5write(obj = expAll, file = coverageFiles[i], name = "KmerBias/ExpectedKmers")

  }

  return(0)

}

#####################################################################################################
# Get Coverage Metadata and Params from ArchR Project
#####################################################################################################

.getCoverageMetadata <- function(ArchRProj = NULL, groupBy = NULL, useGroups = NULL, minCells = NULL){
  coverageMetadata <- ArchRProj@projectMetadata$GroupCoverages[[groupBy]]$coverageMetadata
  if(is.null(coverageMetadata)){
    stop("No Coverage Metadata found for : ", groupBy, ". Please run addGroupCoverages!")
  }
  if(!is.null(useGroups)){
    if(sum(coverageMetadata[,1] %in% useGroups) == 0){
      stop("No Groups found matching useGroups!")
    }
    coverageMetadata <- coverageMetadata[coverageMetadata[,1] %in% useGroups, , drop = FALSE]
  }
  if(!is.null(minCells)){
    coverageMetadata <- coverageMetadata[coverageMetadata$nCells >= minCells, , drop = FALSE]
  }
  if(nrow(coverageMetadata)==0){
    stop("No coverages remain!")
  }
  coverageMetadata
}

.getCoverageParams <- function(ArchRProj = NULL, groupBy = NULL, useGroups = NULL){
  coverageParams <- ArchRProj@projectMetadata$GroupCoverages[[groupBy]]$Params
  if(is.null(coverageParams)){
    stop("No Coverage Metadata found for : ", groupBy)
  }
  coverageParams
}

#####################################################################################################
# Create Coverage Rle List of all chr
#####################################################################################################

.getCoverageRle <- function(coverageFile = NULL, allChr = NULL){
  cov <- lapply(seq_along(allChr), function(x){
    Rle(
      lengths = h5read(coverageFile, paste0("Coverage/",allChr[x],"/Lengths")), 
      values = h5read(coverageFile, paste0("Coverage/",allChr[x],"/Values"))
    )
  }) %>% {as(., "RleList")}
  names(cov) <- allChr
  cov
}

#####################################################################################################
# Get All Non-Zero Insertion Sites and N
#####################################################################################################

.getCoverageInsertionSites <- function(coverageFile = NULL, chr = NULL){
    cov <- Rle(
      lengths = h5read(coverageFile, paste0("Coverage/", chr, "/Lengths")), 
      values = h5read(coverageFile, paste0("Coverage/", chr, "/Values"))
    )
    rV <- runValue(cov)
    cov <- ranges(cov)
    mcols(cov)$values <- rV
    cov <- cov[mcols(cov)$values > 0]
    cov <- unlist(start(slidingWindows(rep(cov, mcols(cov)$values), width = 1, step = 1)))
    cov
}

#####################################################################################################
# Write Coverage To Bed File for MACS2
#####################################################################################################

.writeCoverageToBed <- function(coverageFile = NULL, out = NULL, excludeChr = NULL, logFile = NULL){
  rmf <- .suppressAll(file.remove(out))
  allChr <- .availableSeqnames(coverageFile, "Coverage")
  if(!is.null(excludeChr)){
    allChr <- allChr[allChr %ni% excludeChr]
  }
  if(length(allChr)==0){
    stop("No Chromosomes in Coverage after Excluding Chr!")
  }
  ##Note that there was a bug with data.table vs data.frame with stacking
  for(x in seq_along(allChr)){
    o <- tryCatch({
      iS <- .getCoverageInsertionSites(coverageFile = coverageFile, chr = allChr[x])
      if(x == 1) .logThis(iS, "InsertionSites", logFile = logFile)
      iS <- data.table(seqnames = allChr[x], start = iS - 1L, end = iS)
      if(x == 1) .logThis(iS, "InsertionSites-DT", logFile = logFile)
      if(!any(is.na(iS$start))) {
        data.table::fwrite(iS, out, sep = "\t", col.names = FALSE, append = TRUE)
      } else {
        message(paste0("Warning - No insertions found on seqnames ", allChr[x], " for coverageFile ", coverageFile,"."))
        .logMessage(paste0("Warning - No insertions found on seqnames ", allChr[x], " for coverageFile ", coverageFile,"."), logFile = logFile)
      }
    }, error = function(e){
      errorList <- list(
        x = x, 
        coverageFile = coverageFile, 
        allChr = allChr, 
        iS = if(exists("iS", inherits = FALSE)) iS else "Error with insertion sites!"
      )
      .logError(e, fn = ".writeCoverageToBed", info = "", errorList = errorList, logFile = logFile)
    })
  }
  if(!file.exists(out)){
    stop("Error in writing coverage to bedfile")
  }
  out
}
GreenleafLab/ArchR documentation built on Feb. 28, 2024, 4:17 p.m.