R/Preprocessing.R

Defines functions .FindLayersToJoin .InferMatrixDir .RunSoupX MergeSplitLayers HasSplitLayers .MergeSplitLayersIfNeeded .DoMergeSimple PerformEmptyDrops PerformEmptyDropletFiltering .PrintQcPlots CalculatePercentMito CreateSeuratObj

Documented in CalculatePercentMito CreateSeuratObj HasSplitLayers MergeSplitLayers PerformEmptyDropletFiltering

utils::globalVariables(
	names = c('x', 'y', 'lsr'),
	package = 'CellMembrane',
	add = TRUE
)



#' @title Wrapper around Seurat::CreateSeuratObject
#'
#' @description Create Seurat object from count data (usually from Read10X()). This also sets pct.mito.
#' @param seuratData Seurat input data, usually from Read10X().
#' @param datasetId This will be used as a prefix for barcodes, and stored in metadata. Also used as the project name for the Seurat object.
#' @param datasetName An optional print-friendly name that will be stored in metadata
#' @param minFeatures Include cells where at least this many features are detected.
#' @param minCells Include features detected in at least this many cells.
#' @param mitoGenesPattern The expression to use when identifying mitochondrial genes
#' @param annotateMitoFromReferenceIfNoHitsFound If true, a list of mitochondrial genes, taken from (https://en.wikipedia.org/wiki/Category:Human_mitochondrial_genes) will be used to calculate p.mito
#' @return A Seurat object with p.mito calculated.
#' @export
#' @importFrom Matrix colSums
CreateSeuratObj <- function(seuratData, datasetId, datasetName = NULL, minFeatures = 25, minCells = 0, mitoGenesPattern = "^MT-", annotateMitoFromReferenceIfNoHitsFound = TRUE){
	seuratObj <- Seurat::CreateSeuratObject(counts = seuratData, min.cells = minCells, min.features = minFeatures, project = datasetId)
	seuratObj <- .PossiblyAddBarcodePrefix(seuratObj, datasetId = datasetId, datasetName = datasetName)
	seuratObj<- CalculatePercentMito(seuratObj, mitoGenesPattern = mitoGenesPattern, annotateMitoFromReferenceIfNoHitsFound = annotateMitoFromReferenceIfNoHitsFound)

	return(seuratObj)
}

#' @title Calculate Mitochrondrial Percentage
#'
#' @description This will identify mitochrondrial genes and calculate p.mito for each cell
#' @param seuratObj The seurat object
#' @param mitoGenesPattern The expression to use when identifying mitochondrial genes
#' @param annotateMitoFromReferenceIfNoHitsFound If true, a list of mitochondrial genes, taken from (https://www.genedx.com/wp-content/uploads/crm_docs/Mito-Gene-List.pdf) will be used to calculate p.mito
#' @param outputColName The name of the output column to hold p.mito
#' @return A Seurat object with p.mito calculated.
#' @export
CalculatePercentMito <- function(seuratObj, mitoGenesPattern = "^MT-", annotateMitoFromReferenceIfNoHitsFound = TRUE, outputColName = 'p.mito') {
	mito.features <- grep(pattern = mitoGenesPattern, x = rownames(x = seuratObj), value = TRUE)
	if (length(mito.features) == 0 && annotateMitoFromReferenceIfNoHitsFound) {
		print('There were no genes matching mitoGenesPattern, so attempting to identify MT genes using name')
		mito.features <- c('ATP6','ATP8','COX1','COX2','COX3','CYTB','ND1','ND2','ND3','ND4','ND4L','ND5','ND6')
		mito.features.prefix <- c('MT-', mito.features)
		i1 <- length(intersect(mito.features, rownames(seuratObj)))
		i2 <- length(intersect(mito.features.prefix, rownames(seuratObj)))
		if (i2 > 0 && i2 > i1) {
			print('Selecting using reference gene set with MT- prefix')
			mito.features <- mito.features.prefix
		}

		mito.features <- intersect(mito.features, rownames(seuratObj))
	}

	print(paste0('Total mito features: ', length(mito.features)))
	mito.features <- intersect(mito.features, rownames(seuratObj))
	print(paste0('Total intersecting with seurat rownames (total: ', length(rownames(seuratObj)),'): ', length(mito.features)))

	if (all(is.null(mito.features)) || length(mito.features) == 0) {
		print('No mito features found')
		seuratObj[[outputColName]] <- 0
	} else {
		p.mito <- Matrix::colSums(x = GetAssayData(object = seuratObj, layer = 'counts')[mito.features, ]) / Matrix::colSums(x = GetAssayData(object = seuratObj, layer = 'counts'))
		seuratObj[[outputColName]] <- p.mito
	}

	return(seuratObj)
}


#' @importFrom Matrix colSums
.PrintQcPlots <- function(seuratObj) {
	if ('p.mito' %in% colnames(seuratObj@meta.data)) {
		totalPMito <- length(unique(seuratObj$p.mito))
	} else {
		totalPMito <- -1
	}

	assayName <- Seurat::DefaultAssay(seuratObj)
	nCountField <- paste0('nCount_', assayName)
	nFeatureField <- paste0('nFeature_', assayName)

	feats <- c(nFeatureField, nCountField)
	if (totalPMito > 1) {
		feats <- c(feats, "p.mito")
	}

	suppressWarnings(print(VlnPlot(object = seuratObj, features = feats, ncol = length(feats))))

	if (totalPMito > 1) {
		print(FeatureScatter(object = seuratObj, feature1 = nCountField, feature2 = "p.mito"))
	} else {
		print("p.mito absent or identical across all cells, will not plot")
	}
	print(FeatureScatter(object = seuratObj, feature1 = nCountField, feature2 = nFeatureField))

	#10x-like plot
	nUMI <- Matrix::colSums(suppressWarnings(GetAssayData(object = seuratObj, layer = "counts")))
	nUMI <- sort(nUMI)

	countAbove <-sapply(nUMI, function(x){
		sum(nUMI >= x)
	})

	print(ggplot(data.frame(x = log10(countAbove), y = log(nUMI)), aes(x = x, y = y)) +
		geom_point() + ylab("UMI/Cell") + xlab("log10(# Cells)") +
		egg::theme_presentation()
	)
}


#' @title PerformEmptyDropletFiltering
#'
#' @param seuratRawData Raw data
#' @param fdrThreshold FDR threshold, passed directly to PerformEmptyDrops()
#' @param emptyDropNIters Number of iterations, passed directly to PerformEmptyDrops()
#' @param emptyDropsLower Passed directly to emptyDrops(). The lower bound on the total UMI count, at or below which all barcodes are assumed to correspond to empty droplets.
#' @param useEmptyDropsCellRanger If TRUE, will use DropletUtils emptyDropsCellRanger instead of emptyDrops
#' @param nExpectedCells Only applied if emptyDropsCellRanger is selected. Passed to n.expected.cells argument
#' @return Plot
#' @importFrom DropletUtils barcodeRanks
PerformEmptyDropletFiltering <- function(seuratRawData, fdrThreshold=0.001, emptyDropNIters=10000, emptyDropsLower=200, useEmptyDropsCellRanger = FALSE, nExpectedCells = 8000) {
	br.out <- DropletUtils::barcodeRanks(seuratRawData)
	plot(br.out$rank, br.out$total+1, log="xy", xlab="Rank", ylab="Total")

	o <- order(br.out$rank)
	lines(br.out$rank[o], br.out$fitted[o], col="red")
	abline(h=br.out@metadata$knee, col="dodgerblue", lty=2)
	abline(h=br.out@metadata$inflection, col="forestgreen", lty=2)
	legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), legend=c("knee", "inflection"))

	print(paste0('Knee: ', br.out@metadata$knee))
	print(paste0('Inflection: ', br.out@metadata$inflection))
	e.out <- PerformEmptyDrops(seuratRawData, emptyDropNIters = emptyDropNIters, fdrThreshold = fdrThreshold, emptyDropsLower = emptyDropsLower, useEmptyDropsCellRanger = useEmptyDropsCellRanger, nExpectedCells = nExpectedCells)

	toPlot <- e.out[is.finite(e.out$LogProb),]
	if (nrow(toPlot) > 0) {
		plot(toPlot$Total, -toPlot$LogProb, col=ifelse(toPlot$is.cell, "red", "black"), log = "x", xlab="log(Total UMI count)", ylab="-Log Probability")
	} else {
		print('Probabilities all -Inf, unable to plot')
	}

	if (nrow(toPlot) != nrow(e.out)) {
		print(paste0('Total rows with non-finite probabilities: ', (nrow(e.out) - nrow(toPlot))))
	}

	print(paste0('Min UMI count in a droplet called a cell: ', min(e.out$Total[e.out$is.cell])))
  	print(paste0('Max UMI count in a droplet not called a cell: ', max(e.out$Total[!e.out$is.cell])))

	passingCells <- rownames(e.out)[e.out$is.cell]

	return(seuratRawData[,passingCells])
}

PerformEmptyDrops <- function(seuratRawData, emptyDropNIters, fdrThreshold=0.001, emptyDropsLower = 100, useEmptyDropsCellRanger = FALSE, nExpectedCells = 8000, seed = GetSeed()){
	print(paste0('Performing ', ifelse(useEmptyDropsCellRanger, yes = 'emptyDropsCellRanger', no = 'emptyDrops'), ' with ', emptyDropNIters, ' iterations'))

	if (!is.null(seed)) {
		set.seed(seed)
	}

	if (useEmptyDropsCellRanger) {
		e.out <- DropletUtils::emptyDropsCellRanger(seuratRawData, niters = emptyDropNIters, n.expected.cells = nExpectedCells)
	} else {
		e.out <- DropletUtils::emptyDrops(seuratRawData, niters = emptyDropNIters, lower = emptyDropsLower)
	}

	print(paste0('Input cells: ', nrow(e.out)))
	badRows <- !is.finite(e.out$FDR)
	if (sum(badRows) > 0) {
		print(paste0('Cells with non-finite FDR: ', sum(badRows)))
		e.out <- e.out[!badRows,]
	}

	e.out$is.cell <- e.out$FDR <= fdrThreshold
	print(paste0('Cells passing FDR (', fdrThreshold, '): ', sum(e.out$is.cell, na.rm=TRUE)))
	print(paste0('Cells failing FDR: ', sum(!e.out$is.cell, na.rm=TRUE)))

	totalLimited <- 0
	if (!useEmptyDropsCellRanger) {
		#If there are any entries with FDR above the desired threshold and Limited==TRUE, it indicates that npts should be increased in the emptyDrops call.
		print(table(Limited=e.out$Limited, Significant=e.out$is.cell))
		totalLimited <- sum(e.out$Limited == T & e.out$Significant == F)
	}

	if (totalLimited == 0){
		return(e.out)
	} else {
		print('Repeating emptyDrops with more iterations')
		return(PerformEmptyDrops(seuratRawData, emptyDropNIters = emptyDropNIters * 2, fdrThreshold = fdrThreshold, emptyDropsLower = emptyDropsLower, seed = seed, useEmptyDropsCellRanger = useEmptyDropsCellRanger, nExpectedCells = nExpectedCells))
	}
}


.DoMergeSimple <- function(seuratObjs, projectName, merge.data = FALSE, expectedDefaultAssay = NULL, doGC = FALSE){
	seuratObj <- NULL

	for (datasetId in names(seuratObjs)) {
		if (is.null(seuratObj)) {
			seuratObj <- .MergeSplitLayersIfNeeded(seuratObjs[[datasetId]])
			if (!is.null(expectedDefaultAssay)) {
				DefaultAssay(seuratObj) <- expectedDefaultAssay
			}
		} else {
			assayName <- DefaultAssay(seuratObj)
			DefaultAssay(seuratObjs[[datasetId]]) <- assayName

			assayObj <- GetAssay(seuratObjs[[datasetId]], assay = assayName)
			hasGeneId <- 'GeneId' %in% names(slot(assayObj, GetAssayMetadataSlotName(assayObj)))

			if (any(rownames(seuratObj[[assayName]]) != rownames(seuratObjs[[datasetId]][[assayName]]))) {
				missing <- rownames(seuratObj[[assayName]])[!(rownames(seuratObj[[assayName]]) %in% rownames(seuratObjs[[datasetId]][[assayName]]))]
				missing <- c(missing, rownames(seuratObjs[[datasetId]][[assayName]])[!(rownames(seuratObjs[[datasetId]][[assayName]]) %in% rownames(seuratObj[[assayName]]))])
				if (length(missing) > 0) {
					stop(paste0('Gene names are not equal! Missing: ', paste0(missing, collapse = ',')))
				} else {
					sel <- rownames(seuratObj[[assayName]]) != rownames(seuratObjs[[datasetId]][[assayName]])
					genes1 <- rownames(seuratObj[[assayName]])[sel]
					genes2 <- rownames(seuratObjs[[datasetId]][[assayName]])[sel]

					message(paste0('Gene names not identical between objects using assay: ', assayName, '. Problem genes:'))
					message(paste0(genes1, sep =', ', collapse = ','))
					message(paste0(genes2, sep =', ', collapse = ','))
				}
			}

			if (hasGeneId) {
				assayObj <- Seurat::GetAssay(seuratObj, assay = assayName)

				# This can occur if the first object lacks GeneId but the second has it.
				if (!'GeneId' %in% names(slot(assayObj, GetAssayMetadataSlotName(assayObj)))) {
					message('GeneId was present in the second merged object, but not the first. Adding NAs')
					slot(assayObj, GetAssayMetadataSlotName(assayObj))$GeneId <- NA
				}

				geneIds1 <- slot(assayObj, GetAssayMetadataSlotName(assayObj))$GeneId
				geneIds2 <- slot(assayObj, GetAssayMetadataSlotName(assayObj))$GeneId
				names(geneIds1) <- rownames(seuratObj[[assayName]])
				names(geneIds2) <- rownames(seuratObjs[[datasetId]][[assayName]])
			}

			# NOTE: if collapse = TRUE is ever supported, we should use this.
			seuratObj <- merge(x = seuratObj, y = .MergeSplitLayersIfNeeded(seuratObjs[[datasetId]]), project = projectName, merge.data = merge.data)
			seuratObj <- .MergeSplitLayersIfNeeded(seuratObj)

			seuratObjs[[datasetId]] <- NULL
			if (doGC) {
				gc()
			}

			if (hasGeneId) {
				if (any(is.na(geneIds1)) & !any(is.na(geneIds2))) {
					if (length(geneIds2) != nrow(seuratObj[[assayName]])) {
						stop(paste0('Adding geneIds2 of length that differs from assay: ', length(geneIds2), ' vs. ', nrow(seuratObj[[assayName]])))
					}
					names(geneIds2) <- rownames(seuratObj[[assayName]])
					seuratObj[[assayName]] <- AddMetaData(seuratObj[[assayName]], metadata = geneIds2, col.name = 'GeneId')
				} else if (!any(is.na(geneIds1)) & any(is.na(geneIds2))) {
					if (length(geneIds1) != nrow(seuratObj[[assayName]])) {
						stop(paste0('Adding geneIds1 of length that differs from assay: ', length(geneIds1), ' vs. ', nrow(seuratObj[[assayName]])))
					}
					names(geneIds1) <- rownames(seuratObj[[assayName]])

					seuratObj[[assayName]] <- AddMetaData(seuratObj[[assayName]], metadata = geneIds1, col.name = 'GeneId')
				} else if (!any(is.na(geneIds1)) & !any(is.na(geneIds2))) {
					if (any(geneIds1 != geneIds2)) {
						stop('Gene IDs did not match between seurat objects!')
					}

					names(geneIds1) <- rownames(seuratObj[[assayName]])
					seuratObj[[assayName]] <- AddMetaData(seuratObj[[assayName]], metadata = geneIds1, col.name = 'GeneId')
				} else {
					names(geneIds1) <- rownames(seuratObj[[assayName]])
					seuratObj[[assayName]] <- AddMetaData(seuratObj[[assayName]], metadata = geneIds1, col.name = 'GeneId')
				}
			}
		}
	}

	seuratObj <- .MergeSplitLayersIfNeeded(seuratObj)

	print(paste0('Merge complete, layers:'))
	for (assayName in names(seuratObj@assays)) {
		print(paste0(assayName, ': ', paste0(SeuratObject::Layers(seuratObj, assay = assayName), collapse = ', ')))
	}

	return(seuratObj)
}

.MergeSplitLayersIfNeeded <- function(seuratObj) {
	if (HasSplitLayers(seuratObj)) {
		return(MergeSplitLayers(seuratObj))
	}

	return(seuratObj)
}

#' @title HasSplitLayers
#'
#' @param seuratObj The seurat object
#' @param assaysToTest An option list of assays to test
#' @return A boolean indicating whether the object has split layers
#' @export
HasSplitLayers <- function(seuratObj, assaysToTest = NULL) {
	if (all(is.null(assaysToTest))) {
		assaysToTest <- Seurat::Assays(seuratObj)
	}

	for (assayName in assaysToTest) {
		if (length(suppressWarnings(SeuratObject::Layers(seuratObj, assay = assayName, search = 'counts'))) > 1) {
			return(TRUE)
		}

		if (length(suppressWarnings(SeuratObject::Layers(seuratObj, assay = assayName, search = 'data'))) > 1) {
			return(TRUE)
		}
	}

	return(FALSE)
}

#' @title MergeSplitLayers
#'
#' @param seuratObj The seurat object
#' @return The updated seurat object
#' @export
MergeSplitLayers <- function(seuratObj) {
	# NOTE: in Seurat 5.x, the default is to rename layers (i.e. counts.1 and counts.2). Collapse=TRUE avoids this, but this is not supported yet
	for (assayName in Seurat::Assays(seuratObj)) {
		print(paste0('Inspecting assay layers: ', assayName))
		print(paste0('Layers: ', paste0(SeuratObject::Layers(seuratObj[[assayName]]), collapse = ',')))
		print(paste0('Class: ', class(seuratObj[[assayName]])[1]))

		if (inherits(seuratObj[[assayName]], 'Assay5')) {
			toJoin <- .FindLayersToJoin(seuratObj, assayName)
			print(paste0('Joining layers: ', assayName, ', ', paste0(toJoin, collapse = ';')))
			seuratObj[[assayName]] <- SeuratObject::JoinLayers(seuratObj[[assayName]], layers = toJoin)
			print(paste0('After join: ', paste0(SeuratObject::Layers(seuratObj[[assayName]]), collapse = ',')))
		} else {
			print(paste0('Not an assay5 object, not joining layers: ', assayName))
			print(seuratObj)
		}

		if (HasSplitLayers(seuratObj, assaysToTest = assayName)) {
			stop(paste0('Layers were not joined: ', assayName, ', ', paste0(SeuratObject::Layers(seuratObj[[assayName]]), collapse = ',')))
		}
	}

	if (HasSplitLayers(seuratObj)) {
		stop(paste0('This seurat object has split layers'))
	}

	return(seuratObj)
}

.RunSoupX <- function(outsDir) {
	sc <- SoupX::load10X(outsDir)
	sc <- SoupX::autoEstCont(sc, doPlot = T)
	seuratRawData <- SoupX::adjustCounts(sc)
	print(paste0('Passing cells: ', ncol(seuratRawData)))
	print(SoupX::plotMarkerMap(sc, geneSet = sc$fit$markersUsed$gene))
	print(SoupX::plotChangeMap(sc, cleanedMatrix = seuratRawData, geneSet = sc$fit$markersUsed$gene))

	# Drop suffixes:
	colnames(seuratRawData) <- sapply(colnames(seuratRawData), function(x){
		return(unlist(strsplit(x, split = '-'))[1])
	})

	return(seuratRawData)
}

.InferMatrixDir <- function(dataDir) {
	matrixFile <- paste0(dataDir, 'matrix.mtx')
	matrixFileGz <- paste0(dataDir, 'matrix.mtx.gz')
	if (file.exists(matrixFile) || file.exists(matrixFileGz)) {
		return(dataDir)
	}

	dirWithFeatureMatrix <- paste0(dataDir, 'raw_feature_bc_matrix')
	matrixFile <- paste0(dirWithFeatureMatrix, '/matrix.mtx')
	matrixFileGz <- paste0(dirWithFeatureMatrix, '/matrix.mtx.gz')
	if (file.exists(matrixFile) || file.exists(matrixFileGz)) {
		return(dirWithFeatureMatrix)
	}

	stop(paste0('Unable to find matrix file in: ', dataDir, ' or ', dirWithFeatureMatrix))
}

.FindLayersToJoin <- function(seuratObj, assayName) {
	layerNames <- SeuratObject::Layers(seuratObj, assay = assayName)
	layerBasenames <- sapply(layerNames, function(x){
		x <- unlist(strsplit(x, split = '\\.'))
		if (length(x) == 1) {
			return(NA)
		}

		# Drop the final suffix, on the assumption the initial piece is the layer basename, and the final element is projectName:
		return(paste0(x[-length(x)], collapse = '.'))
	})

	# Any duplicated basenames inducate a split layer:
	layerBasenames <- layerBasenames[!is.na(layerBasenames)]
	layerBasenames <- unique(layerBasenames[duplicated(layerBasenames)])

	return(unlist(layerBasenames))
}
bimberlabinternal/CellMembrane documentation built on June 9, 2025, 6:59 p.m.