R/import_atac_archr.R

Defines functions DsATACsc.archr

Documented in DsATACsc.archr

#-------------------------------------------------------------------------------
#' DsATACsc.archr
#' 
#' Create a DsATACsc dataset from an \code{ArchR} project
#' @param ap	\code{ArchR} project object
#' @param useTiling flag indicating whether to use tiling information from the ArchR project
#' @param keepInsertionInfo flag indicating whether to maintain the insertion information in the resulting object.
#' @param diskDump.fragments Keep fragment coordinates stored on disk rather than in main memory. This saves memory, but increases runtime and I/O.
#' @return \code{\linkS4class{DsATACsc}} object
#' @author Fabian Mueller
#' @export
DsATACsc.archr <- function(ap, useTiling=TRUE, keepInsertionInfo=FALSE, diskDump.fragments=keepInsertionInfo){
	require(ArchR)
	if (class(ap)!="ArchRProject") logger.error("Invalid input: expected ArchRProject")
	afs <- ArchR::getArrowFiles(ap)
	if (!all(file.exists(afs))) logger.error("Could not find all arrow files")

	gg <- ArchR::getGenome(ap)
	ga <- ArchR::getGenomeAnnotation(ap)
	genomeAss <- ""
	if (grepl("Hsapiens.*hg38", gg)){
		genomeAss <- "hg38"
	} else {
		logger.error(c("unsupported genome:", gg))
	}

	# check genome compatibility
	logger.status("Checking genome compatibility")
	sls <- muRtools::getSeqlengths4assembly(genomeAss, onlyMainChrs=TRUE, adjChrNames=TRUE)
	chromSizes_archr <- ga$chromSizes
	chromNames_archr <- as.character(seqnames(chromSizes_archr))

	chromSizes_archr <- muRtools::setGenomeProps(chromSizes_archr, genomeAss, dropUnknownChrs=TRUE, onlyMainChrs=TRUE, adjChrNames=TRUE, silent=FALSE)
	chromNames <- as.character(seqnames(chromSizes_archr))
	if (!all(chromNames_archr %in% chromNames)){
		logger.error(c("Could not find the following chromosomes in the annotation:", paste(chromNames_archr[!(chromNames_archr %in% chromNames)], collapse=",")))
	}

	sls_archr <- end(chromSizes_archr)
	names(sls_archr) <- chromNames

	if (any(sls_archr != sls[chromNames])){
		logger.error("Incompatible chromosome lengths detected")
	}

	logger.start("Preparing cell annotation")
		cellAnnot <- as.data.frame(ArchR::getCellColData(ap))
		# rename the annotation columns to "archr."
		colnames(cellAnnot) <- paste0("archr.", colnames(cellAnnot)) 

		cellIds_archr <- rownames(cellAnnot)
		cellIds <- gsub("#", "_", cellIds_archr)
		rownames(cellAnnot) <- cellIds
		cellBcs <- gsub("(.+)#(.+)$", "\\2", cellIds_archr)

		# map special column names to annotation
		cellAnnot[,"cellId"] <- cellIds
		cellAnnot[,"cellBarcode"] <- cellBcs
		cellAnnot[,"archr.cellId"] <- cellIds_archr
		cn_map <- c(
			"archr.TSSEnrichment"=".tssEnrichment",
			"archr.Sample"=".sampleId",
			"archr.nFrags"="nFrags"
		)
		idx <- colnames(cellAnnot) %in% names(cn_map)
		colnames(cellAnnot)[idx] <- cn_map[colnames(cellAnnot)[idx]]

		sampleIds <- ArchR::getSampleNames(ap)
	logger.completed()

	# TODO: add gene activity scores to cell annotation, if desired

	# blacklistGr <- ga$blacklist
	getTileGr <- function(tileSize){
		# tGr <- muRtools::getTilingRegions(genomeAss, width=tileSize, onlyMainChrs=TRUE, adjChrNames=TRUE)
		# the above line is of by one basepair. 
		# make sure that the intervals start at *0 and end at *9 (except the first range of each chromosome (starts at 1) and the last range (ends at length))
		tGr <- do.call("c", lapply(chromNames, FUN=function(chrom){
			ss <- seq(0, sls[chrom], by=tileSize) # [-1] : exclude the 0
			ss[1] <- 1
			ee <- c(ss[2:length(ss)] - 1, sls[chrom])
			names(ee) <- NULL
			gr <- GRanges(seqnames=chrom, ranges=IRanges(start=ss, end=ee))
			gr <- muRtools::setGenomeProps(gr, genomeAss, dropUnknownChrs=TRUE, onlyMainChrs=TRUE, adjChrNames=TRUE)
			return(gr)
		}))
		# # sanity check
		# nTiles <- trunc(sls[chromNames] / tileSize) + 1
		# nTiles2 <- elementNROWS(split(tGr, seqnames(tGr)))[chromNames]
		# if (any(nTiles != nTiles2)) logger.error("Incompatible tile numbers")
		return(tGr)
	}
	logger.start("Preparing region sets")
		regionSets <- list()
		tileGr <- NULL
		tileDf <- ArchR:::.getFeatureDF(afs, "TileMatrix")
		if (!is.null(tileDf) && is.element(class(tileDf), c("DataFrame", "DFrame"))){
			logger.status("Preparing tiling regions ...")
			x <- tileDf[,"start"]
			tileW <- median(x[2:length(x)]-x[1:(length(x)-1)])
			logger.info(c("Determined tiling width as", tileW, "bp"))
			tileGr <- getTileGr(tileW)
			if (useTiling){
				regionSets[[paste0("tiling", tileW, "bp")]] <- tileGr
			} else {
				logger.info("Not adding tiling count matrix")
			}
			
		}
		geneAnnot <- ArchR::getGeneAnnotation(ap)
		if (!is.null(geneAnnot) && all(c("TSS", "genes") %in% names(geneAnnot))){
			logger.status("Preparing gene regions ...")
			regionSets[["tssWindow"]] <- promoters(geneAnnot$TSS, upstream=100, downstream=100)
			regionSets[["promoter"]] <- promoters(geneAnnot$genes, upstream=1500, downstream=500)
		}
		peakGr <- ArchR::getPeakSet(ap)
		if (!is.null(peakGr) && class(peakGr) == "GRanges"){
			logger.status("Preparing peak regions ...")
			regionSets[["archr.peaks"]] <- peakGr
		}
		redDim <- ArchR::getReducedDims(ap, reducedDims="IterativeLSI", returnMatrix=FALSE)
		if (is.element("LSIFeatures", names(redDim))){
			logger.status("Preparing iterativeLSI feature regions ...")
			featDf <- redDim$LSIFeatures
			if (is.element(class(featDf), c("DataFrame", "DFrame")) && !is.null(tileGr)){
				logger.info("Assuming iterativeLSI has been applied to tiling windows")
				tileIdxL <- tapply(featDf[,"idx"], featDf[,"seqnames"], c)
				tileGrl <- split(tileGr, seqnames(tileGr))
				cns <- intersect(names(tileGrl), names(tileIdxL))
				gr <- do.call("c", lapply(cns, FUN=function(chrom){
					tileGrl[[chrom]][tileIdxL[[chrom]]]
				}))
				regionSets[["archr.itlsi.feats"]] <- gr
			}
		}
		for (rt in names(regionSets)){
			regionSets[[rt]] <- muRtools::setGenomeProps(regionSets[[rt]], genomeAss, dropUnknownChrs=TRUE, onlyMainChrs=TRUE, adjChrNames=TRUE, silent=TRUE)
		}
	logger.completed()


	logger.start("Creating DsATAC object")
		obj <- DsATACsc(cellAnnot, genomeAss, diskDump=FALSE, diskDump.fragments=diskDump.fragments, sparseCounts=TRUE)
		for (rt in names(regionSets)){
			logger.info(c("Including region set:", rt))
			obj <- regionAggregation(obj, regionSets[[rt]], rt, signal=NULL, dropEmpty=FALSE)
		}
		if (obj@diskDump.fragments && .hasSlot(obj, "diskDump.fragments.nSamplesPerFile")){
			obj@diskDump.fragments.nSamplesPerFile <- 500L
		}
	logger.completed()

	logger.start("Preparing fragment data")
		for (i in seq_along(sampleIds)){
			sid <- sampleIds[i]
			logger.start(c("Importing arrow file for sample", ":", sid, paste0("(", i, " of ", length(sampleIds), ")")))
				fragGrl <- ArchR::getFragmentsFromArrow(afs[sid], cellNames=ArchR::getCellNames(ap), verbose=FALSE)
				fragGrl <- muRtools::setGenomeProps(fragGrl, genomeAss, dropUnknownChrs=TRUE, onlyMainChrs=TRUE, adjChrNames=TRUE)
				cids <- elementMetadata(fragGrl)[,"RG"]
				cids <- gsub("#", "_", cids)
				elementMetadata(fragGrl) <- NULL
				elementMetadata(fragGrl)[,"sampleId"] <- sid
				elementMetadata(fragGrl)[,"cellId"] <- cids
				if (!all(cids %in% cellIds)) logger.error("Fragment files contain unknown cells")
				fragGrl <- split(fragGrl, elementMetadata(fragGrl)[,"cellId"])
				fragGrl <- as.list(fragGrl)
				logger.info(c("Found", length(fragGrl), "cells"))

				logger.status("Preparing insertion data ...")
				insGrl <- lapply(fragGrl, getInsertionSitesFromFragmentGr)
				logger.status("Summarizing count data ...")
				obj <- addCountDataFromGRL(obj, insGrl)
			logger.completed()
			if (keepInsertionInfo) {
				chunkedFragmentFiles <- obj@diskDump.fragments && .hasSlot(obj, "diskDump.fragments.nSamplesPerFile") && obj@diskDump.fragments.nSamplesPerFile > 1
				logger.start("Adding fragment data to data structure")
					nCells <- length(fragGrl)
					if (chunkedFragmentFiles){
						chunkL <- split(1:nCells, rep(1:ceiling(nCells/obj@diskDump.fragments.nSamplesPerFile), each=obj@diskDump.fragments.nSamplesPerFile)[1:nCells])
						names(chunkL) <- NULL
						for (k in 1:length(chunkL)){
							iis <- chunkL[[k]]
							cids <- names(fragGrl)[iis]
							fn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
							saveRDS(fragGrl[iis], fn, compress=TRUE)
							obj@fragments[cids] <- rep(list(fn), length(iis))
						}
					} else {
						cids <- names(fragGrl)
						if (obj@diskDump.fragments){
							obj@fragments[cids] <- lapply(fragGrl, FUN=function(x){
								fn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
								saveRDS(x, fn, compress=TRUE)
								return(fn)
							})
						} else {
							obj@fragments[cids] <- fragGrl
						}
					}
				logger.completed()
			}
		}
		# just to make sure: reorder fragment list
		if (keepInsertionInfo) {
			obj@fragments <- obj@fragments[getSamples(obj)]
		}
	logger.completed()
	
	logger.start("Removing uncovered regions from annotation")
		obj <- filterLowCovg(obj, thresh=1L, reqSamples=1)
	logger.completed()

	return(obj)
}
GreenleafLab/ChrAccR documentation built on March 22, 2023, 11:42 p.m.