#' Load Reference Atlas
#'
#' Load or download the reference map for dataset projection.
#' By the default it downloads a reference atlas of tumour-infiltrating
#' lymphocytes (TILs) from mouse.
#'
#' @param ref Reference atlas as a Seurat object (by default downloads a mouse reference TIL atlas).
#' To use a custom reference atlas, provide a .rds object or a URL to a .rds object, storing a Seurat object
#' prepared using \link{make.reference}
#'
#' @examples
#' # consider increasing downloading timeout, if downloading Default reference atlas or large reference
#' options(timeout = 1000)
#'
#' # Download and load default reference map
#' ref <- load.reference.map()
#'
#' # download reference map
#' ref.web <- load.reference.map(ref = url)
#'
#' # Load any reference map
#' ref <- load.reference.map(ref = "path/to/ref")
#'
#' @export load.reference.map
load.reference.map <- function(ref="referenceTIL") {
hash <- NULL
if(identical(ref,"referenceTIL")){
print("Loading Default Reference Atlas...")
# file path of the reference map
refFileName <- paste0(getwd(),"/ref_TILAtlas_mouse_v1.rds")
# url and hash of TIL atlas (default)
refUrl <- "https://ndownloader.figshare.com/files/41398167"
hash <- "679c7fe3cb1737e43cc2f84350331253"
if (file.exists(refFileName)){
print(refFileName)
ref <- load.helper(refFileName)
} else {
print(paste0(refFileName," not found; downloading reference TIL map from the server..."))
try.download(url = refUrl,
destfile = refFileName,
hash = hash,
warn = F)
ref <- load.helper(refFileName)
}
} else {
if (grepl("^(ftp|http)", ref, perl = T)) {
refUrl <- ref
refFileName <- paste0(getwd(),"/custom_reference.rds")
print(sprintf("Trying to download custom reference from %s...", refUrl))
try.download(url = refUrl,
destfile = refFileName,
hash = hash,
warn = F)
} else if (file.exists(ref)) {
refFileName <- ref
} else {
stop("Provide ref is not a valid reference or a valid URL.")
}
print("Loading Custom Reference Atlas...")
ref <- load.helper(refFileName)
}
return(ref)
}
#' Read to memory a query expression matrix
#'
#' Load a query expression matrix to be projected onto the reference atlas. Several formats (10x, hdf5, raw and log counts)
#' are supported - see \code{type} parameter for details
#'
#' @param filename Path to expression matrix file or folder
#' @param type Expression matrix format (10x, hdf5, raw, raw.log2)
#' @param project.name Title for the project
#' @param min.cells Only keep genes represented in at least min.cells number of cells
#' @param min.features Only keep cells expressing at least min.features genes
#' @param gene.column.10x For 10x format - which column of genes.tsv or features.tsv to use for gene names
#' @param raw.rownames For raw matrix format - A vector of row names, or a single number giving the column of the table which contains the row names
#' @param raw.sep For raw matrix format - Separator for raw expression matrix
#' @param raw.header For raw matrix format - Use headers in expression matrix
#' @param use.readmtx Use ReadMtx function to read in 10x files with custom names
#' @return A Seurat object populated with raw counts and normalized counts for single-cell expression
#' @examples
#' fname <- "./sample_data"
#' querydata <- read.sc.query(fname, type="10x")
#' @importFrom Matrix readMM
#' @export read.sc.query
read.sc.query <- function(filename,
type=c("10x","hdf5","raw","raw.log2"),
project.name="Query",
min.cells = 3,
min.features = 50,
gene.column.10x=2,
raw.rownames=1,
raw.sep=c("auto"," ","\t",","),
raw.header=TRUE,
use.readmtx=TRUE) {
if (is.null(filename)) {stop("Please provide a query dataset in one of the supported formats")}
type = tolower(type[1])
if (type == "10x") {
fl <- list.files(filename)
matrix.file <- grep("matrix.mtx", fl, value=TRUE)[1]
feature.file <- grep("features.tsv|genes.tsv", fl, value=TRUE)[1]
barcode.file <- grep("barcodes.tsv", fl, value=TRUE)[1]
if (is.na(matrix.file)) stop("Cannot find matrix file")
if (is.na(feature.file)) stop("Cannot find genes file")
if (is.na(barcode.file)) stop("Cannot find barcode file")
matrix.file <- sprintf("%s/%s", filename, matrix.file)
feature.file <- sprintf("%s/%s", filename, feature.file)
barcode.file <- sprintf("%s/%s", filename, barcode.file)
if (use.readmtx) {
query.exp <- ReadMtx.fix(mtx=matrix.file,
cells=barcode.file,
features=feature.file,
feature.column=gene.column.10x)
} else {
query.exp <- Read10X(filename, gene.column = gene.column.10x)
}
} else if (type == "hdf5") {
query.exp <- Read10X_h5(filename)
} else if (type == "raw" | type == "raw.log2") {
raw.sep <- raw.sep[1]
if (raw.sep == "auto") {
raw.sep <- guess_raw_separator(f=filename)
if (is.null(raw.sep)) {
stop("Could not guess separator for raw matrix format. Try specifying manually with raw.sep parameter")
}
}
p <- regexpr("\\.([[:alnum:]]+)$", filename)
extension <- ifelse(p > -1L, substring(filename, p + 1L), "")
if (extension == "gz") {
query.exp <- read.table(gzfile(filename),row.names=raw.rownames, sep=raw.sep, header=raw.header)
} else {
query.exp <- read.table(filename,row.names=raw.rownames, sep=raw.sep, header=raw.header)
}
query.exp[is.na(query.exp)] <- 0
if (type == "raw.log2") {
query.exp <- 2^(query.exp)-1
}
#Also try to determine whether genes are on rows or columns
data(Hs2Mm.convert.table)
gnames <- c(Hs2Mm.convert.table$Gene.MM, Hs2Mm.convert.table$Gene.stable.ID.HS, Hs2Mm.convert.table$Gene.HS)
gr <- length(intersect(rownames(query.exp), gnames))
gc <- length(intersect(colnames(query.exp), gnames))
gmax <- max(gr, gc)
if (gmax==0) {
stop("Could not find gene names in matrix. Check matrix format")
}
if (gc>gr) { #flip rows and columns
query.exp <- t(query.exp)
}
} else {
stop("Please provide a query dataset in one of the supported formats")
}
query.seurat <- CreateSeuratObject(counts=query.exp, project=project.name, min.cells=min.cells, min.features=min.features)
query.seurat<- NormalizeData(query.seurat)
return(query.seurat)
}
#' Project a query scRNA-seq dataset onto a reference atlas
#'
#' This function allows projecting ("query") single-cell RNA-seq datasets onto a reference map
#' (i.e. a curated and annotated scRNA-seq dataset).
#' To project multiple datasets, submit a list of Seurat objects with the query parameter.
#' The projection consists of 3 steps:
#' \itemize{
#' \item{pre-processing: optional steps which might include pre-filtering of cells by markers using `scGate`,
#' data normalization, and ortholog conversion.}
#' \item{batch-effect correction: uses built-in STACAS algorithm to detect and correct for batch effects
#' (this step assumes that at least a fraction of the cells in the query are in the same state than cells in
#' the reference)}
#' \item{embedding of corrected query data in the reduced-dimensionality spaces (PCA and UMAP) of the reference map.}
#' }
#'
#' See \link{load.reference.map} to load or download a reference atlas. See
#' also \link{ProjecTILs.classifier} to use ProjecTILs as a cell type classifier.
#'
#' @param query Query data, either as single Seurat object or as a list of Seurat object
#' @param ref Reference Atlas - if NULL, downloads the default TIL reference atlas
#' @param query.assay Which assay slot to use for the query (defaults to DefaultAssay(query))
#' @param direct.projection If true, apply PCA transformation directly without alignment
#' @param STACAS.anchor.coverage Focus on few robust anchors (low STACAS.anchor.coverage) or on a large amount
#' of anchors (high STACAS.anchor.coverage). Must be number between 0 and 1.
#' @param STACAS.correction.scale Slope of sigmoid function used to determine strength of batch effect correction.
#' @param STACAS.k.anchor Integer. For alignment, how many neighbors (k) to use when picking anchors.
#' @param STACAS.k.weight Number of neighbors to consider when weighting anchors.
#' Default is "max", which disables local anchor weighting.
#' @param skip.normalize By default, log-normalize the count data.
#' If you have already normalized your data, you can skip normalization.
#' @param filter.cells Pre-filter cells using `scGate`. Only set to FALSE if the dataset has
#' been previously subset to cell types represented in the reference.
#' @param scGate_model scGate model used to filter target cell type from query data
#' (if NULL use the model stored in \code{ref@@misc$scGate})
#' @param ortholog_table Dataframe for conversion between ortholog genes
#' (by default package object \code{Hs2Mm.convert.table})
#' @param fast.umap.predict Fast approximation for UMAP projection. Uses coordinates of nearest neighbors in
#' PCA space to assign UMAP coordinates (credits to Changsheng Li for the implementation)
#' @param ncores Number of cores for parallel execution (requires \link{BiocParallel})
#' @param progressbar Whether to show a progress bar for projection process or not (requires \link{BiocParallel})
#' @return An augmented Seurat object with projected UMAP coordinates on the reference map
#' @examples
#' data(query_example_seurat)
#' ref <- load.reference.map()
#' make.projection(query_example_seurat, ref=ref)
#' @import Seurat
#' @import SeuratObject
#' @importFrom STACAS FindAnchors.STACAS IntegrateData.STACAS
#' @importFrom scGate scGate
#' @importFrom BiocParallel MulticoreParam bplapply
#' @importFrom stats aggregate quantile sd
#' @export make.projection
make.projection <- function(query, ref=NULL,
filter.cells=TRUE,
query.assay=NULL,
direct.projection=FALSE,
STACAS.anchor.coverage=0.7,
STACAS.correction.scale=100,
STACAS.k.anchor=5,
STACAS.k.weight="max",
skip.normalize=FALSE,
fast.umap.predict=FALSE,
ortholog_table=NULL,
scGate_model=NULL,
ncores=1,
progressbar = TRUE) {
if (is.null(query)) {
return(NULL)
}
if(is.null(ref)){
ref <- load.reference.map()
}
#ref assays should be in Seurat v4 format
assays <- Assays(ref)
for (a in assays) {
ref <- convert_to_v3(ref, assay = a, layer="data")
}
projected.list <- list()
if (is.null(ortholog_table)) {
data(Hs2Mm.convert.table)
ortholog_table <- Hs2Mm.convert.table
}
if(!is.list(query)) {
query.list <- list(query=query)
} else {
query.list <- query
if (is.null(names(query.list))) {
names(query.list) <- paste0("query",c(1:length(query.list)))
}
}
rm(query)
#Parallelize (ncores>1)
ncores <- min(ncores, length(query.list))
param <- set_parall(ncores, progressbar=progressbar)
#Projection over list of datasets
projected.list <- BiocParallel::bplapply(
X = 1:length(query.list),
BPPARAM = param,
FUN = function(i) {
projection.helper(query=query.list[[i]], ref=ref,
filter.cells=filter.cells,
query.assay=query.assay,
direct.projection=direct.projection,
fast.umap.predict=fast.umap.predict,
STACAS.k.anchor=STACAS.k.anchor,
STACAS.k.weight=STACAS.k.weight,
STACAS.anchor.coverage=STACAS.anchor.coverage,
STACAS.correction.scale=STACAS.correction.scale,
remove.thr=0,
alpha=0.5,
ncores=ncores,
ortholog_table=ortholog_table,
skip.normalize=skip.normalize,
id=names(query.list)[i],
scGate_model=scGate_model)
}
)
names(projected.list) <- names(query.list)
#De-list if single object was submitted
if(length(projected.list)==1)
projected.list <- projected.list[[1]]
return(projected.list)
}
#' Predict cell states of a projected dataset
#'
#' This function uses a nearest-neighbor algorithm to predict a feature (e.g. the cell state) of the query cells. Distances between
#' cells in the reference map and cells in the query are calculated in a reduced space (PCA or UMAP) and the feature is assigned to
#' query cells based on a consensus of its nearest neighbors in the reference object.
#'
#' @param ref Reference Atlas
#' @param query Seurat object with query data
#' @param reduction The dimensionality reduction used to calculate pairwise distances. One of "pca" or "umap"
#' @param ndim How many dimensions in the reduced space to be used for distance calculations
#' @param k Number of neighbors to assign the cell type
#' @param nn.decay Weight decay for internal nearest neighbors (between 0 and 1)
#' @param min.confidence Minimum confidence score to return cell type labels (otherwise NA)
#' @param labels.col The metadata field of the reference to annotate the clusters (default: functional.cluster)
#' @return The query object submitted as parameter, with two additional metadata slots for predicted state and its confidence score
#' @examples
#' data(query_example_seurat)
#' ref <- load.reference.map()
#' q <- make.projection(query_example_seurat, ref=ref)
#' q <- cellstate.predict(ref, query=q)
#' table(q$functional.cluster)
#' @import Seurat
#' @importFrom BiocNeighbors queryKNN AnnoyParam
#' @export cellstate.predict
cellstate.predict <- function(ref, query,
reduction="pca",
ndim=NULL,
k=5,
min.confidence=0.2,
nn.decay=0.1,
labels.col="functional.cluster") {
if (is.null(query)) {
return(NULL)
}
if (is.null(ndim)) {
if (!is.null(ref@misc$umap_object$data)) {
ndim <- ncol(ref@misc$umap_object$data)
} else {
stop("Please specify ndim parameter.")
}
}
ref.red <- Reductions(ref, slot=reduction)
query.red <- Reductions(query, slot=reduction)
tdim <- ncol(ref.red@cell.embeddings)
if (ndim > tdim) {
warning(sprintf("Number of dimensions ndim=%i is larger than the dimensions in reduction %s - Using only first %i dimensions",ndim,reduction,tdim))
ndim = tdim
}
labels <- ref[[labels.col]][,1]
levs <- levels(labels)
ref.space <- ref.red@cell.embeddings[,1:ndim]
query.space <- query.red@cell.embeddings[,1:ndim]
querysize <- nrow(query.space)
pred.type <- rep("Unknown", dim(query.space)[1])
pred.conf <- numeric(dim(query.space)[1])
#Use NN-search implemented in BiocNeighbors
nn.toref <- queryKNN(ref.space, query.space, k=k, BNPARAM=AnnoyParam())
nn.self <- queryKNN(query.space, query.space, k=min(k,querysize), BNPARAM=AnnoyParam())
nn.w <- (1-nn.decay)^(seq(0, min(k,querysize)-1))
#External kNN (to reference)
ext.nn <- apply(nn.toref$index, 1, function(x) {
top.k.lab <- factor(labels[x], levels=levs)
table(top.k.lab)/k
})
#Sum of confidence weights of internal neighbors
comb.nn <- apply(nn.self$index, 1, function(x) {
wgts <- vapply(x, FUN.VALUE=vector("numeric", length=length(levs)),
FUN = function(y) {
ext.nn[,y]
})
(wgts %*% nn.w) / sum(nn.w)
})
rownames(comb.nn) <- row.names(ext.nn)
pred.type <- rownames(comb.nn)[apply(comb.nn, 2, which.max)]
pred.conf <- apply(comb.nn, 2, max)
pred.type[pred.conf < min.confidence] <- NA
pred.conf[pred.conf < min.confidence] <- NA
pred <- as.data.frame(cbind(row.names(query.space), pred.type, pred.conf), stringsAsFactors = FALSE)
row.names(pred) <- row.names(query.space)
colnames(pred) <- c("id","pred.state","confidence")
pred <- transform(pred, confidence=as.numeric(confidence))
query <- AddMetaData(query, metadata=pred$pred.state, col.name = labels.col)
query <- AddMetaData(query, metadata=pred$confidence, col.name = paste0(labels.col,".conf"))
return(query)
}
# Function plot.projection
#' Show UMAP projection of query on reference map
#'
#' Plots the UMAP representation of the reference map, together with the projected coordinates of a query dataset.
#'
#' @param ref Reference object
#' @param query Seurat object with query data
#' @param labels.col The metadata field to annotate the clusters (default: functional.cluster)
#' @param cols Custom color palette for clusters
#' @param linesize Contour line thickness for projected query
#' @param pointsize Point size for cells in projected query
#' @param ref.alpha Transparency parameter for reference cells
#' @param ref.size Adjust point size for reference cells
#' @param ... Additional parameters for \code{DimPlot}, e.g. raster=T to
#' limit image size
#' @return UMAP plot of reference map with projected query set in the same space
#' @examples
#' data(query_example_seurat)
#' ref <- load.reference.map()
#' q <- Run.ProjecTILs(query_example_seurat, ref=ref, fast.umap.predict=TRUE)
#' plot.projection(ref=ref, query=q)
#' @import Seurat
#' @import ggplot2
#' @importFrom scales alpha
#' @importFrom grDevices rainbow
#' @export plot.projection
#'
plot.projection <- function(ref, query=NULL, labels.col="functional.cluster",
cols=NULL, linesize=1, pointsize=1,
ref.alpha=0.3, ref.size=NULL, ...) {
labels <- ref[[labels.col]][,1]
states_all <- levels(factor(labels))
nstates <- length(states_all)
if (!is.null(cols)) { #custom palette
if (nstates>length(cols)) {
warning("Not enough colors provided. Making an automatic palette")
palette <- rainbow(n=nstates)
} else {
palette <- cols
}
} else { #default palette
if (!is.null(ref@misc$atlas.palette)) { #read directly from atlas, if stored
palette <- ref@misc$atlas.palette
} else {
palette <- c("#edbe2a","#A58AFF","#53B400","#F8766D","#00B6EB","#d1cfcc","#FF0000","#87f6a5","#e812dd")
if (nstates > length(palette)) {
palette <- rainbow(n=nstates)
}
}
}
#apply transparency to ref cells
cols_use <- scales::alpha(palette, ref.alpha)
if (is.null(query)) {
p <- DimPlot(ref, reduction="umap", label = FALSE, group.by = labels.col,
repel = TRUE, pt.size=ref.size, cols=cols_use, ...) +
ggtitle ("Reference map") + theme(aspect.ratio=1)
} else {
query.red <- Reductions(query, slot="umap")
p <- DimPlot(ref, reduction="umap", label = FALSE, group.by = labels.col,
repel = TRUE, pt.size=ref.size, cols=cols_use, ...) +
geom_point(data.frame(query.red@cell.embeddings),
mapping=aes(x=UMAP_1,y=UMAP_2),alpha=0.6, size=pointsize,shape=17, color="gray10") +
geom_density_2d(data=data.frame(query.red@cell.embeddings),
mapping=aes(x=UMAP_1,y=UMAP_2),color="black",n=200,h=2,linewidth=linesize) +
ggtitle ("Projection of query on reference map") + theme(aspect.ratio=1)
}
return(p)
}
# Function plot.statepred.composition
#' Summarize the predicted cell states of an object
#'
#' Makes a barplot of the frequency of cell states in a query object.
#'
#' @param ref Reference object
#' @param query Seurat object with query data
#' @param labels.col The metadata field used to annotate the clusters (default: functional.cluster)
#' @param metric One of `Count` or `Percent`. `Count` plots the absolute number of cells, `Percent` the fraction on the total number of cells.
#' @param cols Custom color palette for clusters
#' @return Barplot of predicted state composition
#' @examples
#' plot.statepred.composition(query_example.seurat)
#' @importFrom reshape2 melt
#' @import ggplot2
#' @export plot.statepred.composition
#'
plot.statepred.composition <- function(ref, query,
labels.col="functional.cluster",cols=NULL,
metric=c("Count","Percent")) {
metric <- tolower(metric[1])
labels <- ref[[labels.col]][,1]
states_all <- levels(factor(labels))
nstates <- length(states_all)
if (!is.null(cols)) { #custom palette
if (nstates<=length(cols)) {
palette <- cols[1:nstates]
} else {
warning("Not enough colors provided. Making an automatic palette")
palette <- rainbow(n=nstates)
}
} else { #default palette
if (!is.null(ref@misc$atlas.palette)) { #read directly from atlas, if stored
palette <- ref@misc$atlas.palette
} else {
palette <- c("#edbe2a","#A58AFF","#53B400","#F8766D","#00B6EB","#d1cfcc","#FF0000","#87f6a5","#e812dd")
if (nstates<=length(palette)) {
palette <- palette[1:nstates]
} else { #make a new palette
palette <- rainbow(n=nstates)
}
}
}
names(palette) <- states_all
cols_use <- palette[states_all]
tb <- table(factor(query[[labels.col]][,1], levels=states_all))
if (metric=="percent") { #normalize
tb <- tb*100/sum(tb)
tb.m <- reshape2::melt(tb)
colnames(tb.m) <- c("Cell_state","Perc_cells")
p <- ggplot(tb.m, aes(x=Cell_state, y=Perc_cells, fill=Cell_state)) + geom_bar(stat="identity") +
theme_bw() + scale_fill_manual(values=cols_use) +
theme(axis.text.x=element_blank(), legend.position="left")
} else if (metric=="count") {
tb.m <- reshape2::melt(tb)
colnames(tb.m) <- c("Cell_state","Ncells")
p <- ggplot(tb.m, aes(x=Cell_state, y=Ncells, fill=Cell_state)) + geom_bar(stat="identity") +
theme_bw() + scale_fill_manual(values=cols_use) +
theme(axis.text.x=element_blank(), legend.position="left")
} else {
stop("Unknown metric specified (Must be either 'count' or 'percent')")
}
return(p)
}
# Function plot.states.radar
#' Show expression level of key genes
#'
#' Makes a radar plot of the expression level of a set of genes. It can be useful to compare
#' the gene expression profile of different cell states in the reference atlas vs. a projected set.
#'
#' @param ref Reference object
#' @param query Query data, either as a Seurat object or as a list of Seurat objects
#' @param labels.col The metadata field used to annotate the clusters
#' @param genes4radar Which genes to use for plotting
#' @param meta4radar Which metadata columns (numeric) to use for plotting. If not NULL, \code{genes4radar} are ignored
#' @param min.cells Only display cell states with a minimum number of cells
#' @param norm.factor Normalization factor for rescaling expression or metadata values
#' @param cols Custom color palette for samples in radar plot
#' @param ref.assay The assay to pull the reference expression data
#' @param query.assay The assay to pull the query expression data
#' @param return Return the combined plots instead of printing them to the default device (deprecated)
#' @param return.as.list Return plots in a list, instead of combining them in a single plot
#' @return Radar plot of gene expression of key genes by cell subtype
#' @usage plot.states.radar(ref)
#' @examples
#' ref <- load.reference.map()
#' plot.states.radar(ref)
#' @import ggplot2
#' @importFrom scales hue_pal
#' @importFrom patchwork wrap_plots plot_annotation
#' @export plot.states.radar
#'
plot.states.radar <- function(ref, query=NULL,
labels.col="functional.cluster",
ref.assay='RNA', query.assay='RNA',
genes4radar=c("Foxp3","Cd4","Cd8a","Tcf7","Ccr7","Gzmb",
"Gzmk","Pdcd1","Havcr2","Tox","Mki67"),
meta4radar=NULL,
norm.factor=1,
min.cells=20, cols=NULL,
return=FALSE, return.as.list=FALSE) {
#Make sure query is a list
if(!is.null(query) & !is.list(query)) {
query <- list(Query=query)
}
#Check assays exist
if (!ref.assay %in% Assays(ref)) {
stop(sprintf("Assay %s not found in reference object. Please check ref.assay parameter", ref.assay))
}
#Whether to use gene expression or metadata
if (!is.null(meta4radar)) {
refmat <- t(ref[[]])
feat.use <- intersect(meta4radar, row.names(refmat))
if (length(feat.use)==0) {
stop("None of the provided meta columns were found - check option 'meta4radar'")
}
feat.missing <- setdiff(meta4radar, feat.use)
if (length(feat.missing)>0) {
to.print <- paste(feat.missing, sep=",", collapse = ",")
warning(sprintf("Some metadata columns were not found:\n%s", to.print))
}
} else {
refmat <- GetAssayData(ref, assay=ref.assay, layer="data")
#Check gene names/feature names
feat.use <- intersect(genes4radar, row.names(refmat))
#If overlap is zero, first check whether wrong species was used (upper case to human)
if (length(feat.use)==0) {
genes4radar <- toupper(genes4radar)
feat.use <- intersect(genes4radar, row.names(refmat))
if (length(feat.use)==0) {
stop("None of the provided genes were found - check option 'genes4radar'")
}
}
feat.missing <- setdiff(genes4radar, feat.use)
if (length(feat.missing)>0) {
to.print <- paste(feat.missing, sep=",", collapse = ",")
warning(sprintf("Some gene symbols were not found:\n%s", to.print))
}
}
order <- match(feat.use, row.names(refmat))
rr <- as.matrix(refmat[order,])
rr <- matrix(as.numeric(rr), ncol=ncol(rr))
rownames(rr) <- feat.use
colnames(rr) <- colnames(refmat)
#Set colors
ncolors <- 1+length(query)
if (ncolors==1) {
radar.colors <- "black"
} else {
if (is.null(cols)) {
radar.colors <- c("black", hue_pal()(ncolors-1))
} else {
cols <- c("black", cols)
if (ncolors <= length(cols)) {
radar.colors <- cols[1:ncolors]
} else {
warning("Not enough colors provided. Making an automatic palette")
radar.colors <- c("black", hue_pal()(ncolors-1))
}
}
}
names(radar.colors) <- c("Reference", names(query))
#Convert NAs to unknown, if any
use.unknown <- FALSE
labels <- ref@meta.data[,labels.col]
is.na <- is.na(labels)
states_all <- levels(factor(labels))
nstates <- length(states_all)
if (sum(is.na)>0) {
ref@meta.data[,labels.col] <- factor(labels, levels=c(levels(labels), "Unknown"))
ref@meta.data[is.na, labels.col] <- "Unknown"
labels <- ref@meta.data[,labels.col]
use.unknown <- TRUE
}
if (!is.null(query)) {
if (is.null(names(query))) {
for (i in 1:length(query)) {
names(query)[[i]] <- paste0("Query",i)
}
}
labels.q <- list()
qq <- list()
for (i in 1:length(query)) {
this <- query[[i]]
if (!labels.col %in% colnames(this@meta.data)) {
message1 <- sprintf("Could not find %s column in query object metadata.",labels.col)
message2 <- "Did you run cellstate.predict() on this object to predict cell states?"
stop(paste(message1, message2, sep="\n"))
}
this.lab <- this@meta.data[,labels.col]
is.na <- is.na(this.lab)
if (sum(is.na)>0) {
this@meta.data[,labels.col] <- factor(this.lab, levels=c(levels(labels), "Unknown"))
this@meta.data[is.na, labels.col] <- "Unknown"
this.lab <- this@meta.data[,labels.col]
use.unknown <- TRUE
}
labels.q[[i]] <- this.lab
if (!is.null(meta4radar)) {
qmat <- t(this[[]])
} else {
if (!query.assay %in% Assays(this)) {
stop(sprintf("Assay %s not found in query object. Please check ref.assay parameter", query.assay))
}
qmat <- GetAssayData(this, assay=query.assay, layer="data")
}
order <- match(feat.use, row.names(qmat))
qq[[i]] <- as.matrix(qmat[order,])
qq[[i]] <- matrix(as.numeric(qq[[i]]), ncol=ncol(qq[[i]]))
}
}
if (use.unknown) {
states_all <- c(states_all, "Unknown")
nstates <- nstates+1
}
#Get raw expression means, to normalize by gene
m <- matrix(, nrow = length(states_all), ncol = length(feat.use))
rownames(m) <- states_all
colnames(m) <- feat.use
for (i in 1:length(states_all)) {
s <- states_all[i]
m[i,] <- apply(rr[, labels == s], MARGIN=1, function(x){mean(x, na.rm=T)})
}
normfacs <- apply(m, MARGIN=2, function(x) {max(c(norm.factor,x), na.rm=T)})
pll <- list()
for (j in 1:length(states_all)) {
s <- states_all[j]
this.df <- data.frame(Gene=character(0),Expression=numeric(0),Dataset=character(0))
if (sum(labels==s) > min.cells) {
this.mean <- apply(rr[, labels == s], MARGIN=1, function(x){mean(x, na.rm=T)})
this.mean <- this.mean/normfacs
ref.df <- data.frame(t(rbind(names(this.mean), this.mean, "Reference")))
colnames(ref.df) <- c("Gene","Expression","Dataset")
ref.df$Expression <- as.numeric(as.character(ref.df$Expression))
this.df <- rbind(this.df, ref.df)
}
i <- 1
while (i <= length(query)) {
ll <- labels.q[[i]]
m <- as.matrix(qq[[i]][, !is.na(ll) & ll == s])
if (ncol(m) >= min.cells) {
q.mean <- apply(m, MARGIN=1, function(x){mean(x, na.rm=T)})
q.mean <- q.mean/normfacs
q.df <- data.frame(t(rbind(names(q.mean), q.mean, names(query)[[i]])))
colnames(q.df) <- c("Gene","Expression","Dataset")
q.df$Expression <- as.numeric(as.character(q.df$Expression))
this.df <- rbind(this.df, q.df)
}
i=i+1
}
ymin <- min(c(-0.1, min(this.df$Expression)))
ymax <- max(c(1, max(this.df$Expression)))
levs <- unique(this.df$Dataset)
this.df$Dataset <- factor(this.df$Dataset, levels=levs)
this.df$Gene <- factor(this.df$Gene, levels=feat.use)
pll[[j]] <- ggplot(data=this.df, aes(x=Gene, y=Expression, group= Dataset, colour=Dataset, fill=Dataset)) +
geom_bar(stat="identity", alpha=0.7, color=NA, width=1, position = position_dodge(width = 0.5)) +
coord_radial() +
ylim(ymin, ymax) + ggtitle(s) +
scale_x_discrete() +
scale_fill_manual(values= radar.colors) +
theme_light() +
theme(axis.text.x=element_blank()) +
annotate(geom="text", x=seq(1,length(feat.use)), y=ymax-0.05*ymax, label=feat.use, size=3)
}
#Return plots
if (return.as.list) {
return(pll)
} else {
g <- wrap_plots(pll) + plot_annotation(paste0("Radar plots for ", labels.col))
return(g)
}
}
# Function find.discriminant.dimensions
#' Find discriminant dimensions
#'
#' Searches PCA or ICA dimensions where the query set deviates the most from a control set or from the reference map. It can
#' be useful to suggest novel cell states that escape from the main axes of diversity of the UMAP
#'
#' @param ref Seurat object with reference atlas
#' @param query Seurat object with query data
#' @param query.control Optionally, you can compare your query with a control sample, instead of the reference
#' @param query.assay The data slot to be used for enrichment analysis
#' @param state Perform discriminant analysis on this cell state. Can be either:
#' \itemize{
#' \item{"largest" - Performs analysis on the cell state most represented in the query set(s)}
#' \item{"all" - Performs analysis on the complete dataset, using all cells}
#' \item{A specific cell state, one of the states in metadata field labels.col}
#' }
#' @param labels.col The metadata field used to annotate the clusters (default: functional.cluster)
#' @param reduction Which dimensionality reduction to use (either ICA or PCA)
#' @param test Which test to perform between the dataset distributions in each ICA/PCA dimension. One of `ks` (Kolmogorov-Smirnov) or `t.test` (T-test)
#' @param ndim How many dimensions to consider in the reduced ICA/PCA space
#' @param print.n The number of top dimensions to return to STDOUT
#' @param verbose Print results to STDOUT
#' @return A dataframe, where rows are ICA/PCA dimensions. ICA/PCAs are ranked by statistical significance when comparing their distribution between query and control (or query vs. reference map)
#' @examples
#' find.discriminant.dimensions(ref, query=query.set)
#' find.discriminant.dimensions(ref, query=query.set, query.control=control.set)
#' @importFrom stats t.test ks.test
#' @export find.discriminant.dimensions
find.discriminant.dimensions <- function(ref, query, query.control=NULL, query.assay="RNA",
state="largest", labels.col="functional.cluster",
reduction="ICA", test=c("ks","t.test"), ndim=50, print.n=3, verbose=T) {
reduction=tolower(reduction)
test=test[1]
if (is.null(ref)) {stop("Please provide the reference object (ref")}
if (is.null(query)) {stop("Please provide a query object (query)")}
#Determine cell state for analysis
if (is.null(state) | state=="largest") {
if (!is.null(query.control)) {
ss <- table(rbind(query[[labels.col]], query.control[[labels.col]]))
} else {
ss <- table(query[[labels.col]])
}
state <- names(sort(ss, decreasing = T))[1]
message(paste0("Performing discriminant analysis using the most abundant cell state in the query - ", state))
} else if (state=="all") {
message("Performing discriminant analysis using all cells")
} else {
if (!state %in% query[[labels.col]][,1]) {
stop(sprintf("State %s not found in query metadata colum %s", state, labels.col))
}
if (!is.null(query.control) & !state %in% query.control[[labels.col]][,1]) {
stop(sprintf("State %s not found in query.control metadata colum %s", state, labels.col))
}
message(paste0("Performing discriminant analysis with user-specified state - ", state))
}
#Subset query data on specific state
if (state=="all") {
ref.cells=seq(1, dim(ref)[2])
query.cells=seq(1, dim(query)[2])
if (!is.null(query.control)) {
query.c.cells=seq(1, dim(query.control)[2])
}
} else {
ref.cells=which(ref[[labels.col]]==state)
query.cells=which(query[[labels.col]]==state)
if (!is.null(query.control)) {
query.c.cells=which(query.control[[labels.col]]==state)
}
}
if (reduction=="ica") {
if (!"ica" %in% Reductions(ref)) {
message("Reduction ICA not found. Calculating ICA for reference object")
ref <- run.ica(ref, ndim=ndim)
}
if (!requireNamespace("fastICA", quietly = TRUE)) {
stop("Please install package 'fastICA' to run this function.", call. = FALSE)
}
ref_dimRed <- ref@misc$ica
perturb_dimRed <- apply.ica.obj(query=query, query.assay=query.assay, ica.obj=ref_dimRed)
if (!is.null(query.control)) {
control_dimRed <- apply.ica.obj(query=query.control, query.assay=query.assay, ica.obj=ref_dimRed)
}
} else if (reduction=="pca") {
ref_dimRed <- ref@misc$pca_obj
perturb_dimRed <- apply.pca.obj.2(query=query, query.assay=query.assay, pca.obj=ref_dimRed)
if (!is.null(query.control)) {
control_dimRed <- apply.pca.obj.2(query=query.control, query.assay=query.assay, pca.obj=ref_dimRed)
}
} else {
stop(paste0("Unrecognized reduction slot: ", reduction))
}
ndim <- min(ndim, length(colnames(perturb_dimRed)))
df <- data.frame(matrix(ncol = 3, nrow = ndim))
colnames(df) <- c("stat","stat_abs","p_val")
rownames(df) <- colnames(perturb_dimRed)[1:ndim]
if (!is.null(query.control)) {
message("Query and control datasets was provided. Determining discriminant components of Query vs. Control...")
} else {
message("Single query dataset was provided. Determining discriminant components of Query vs. Reference...")
}
refemb <- Reductions(ref, slot=reduction)
for (pc in 1:ndim) {
d1 <- perturb_dimRed[query.cells,pc]
if (!is.null(query.control)) {
d2 <- control_dimRed[query.c.cells,pc]
} else {
d2 <- refemb@cell.embeddings[ref.cells, pc]
}
ttest <- t.test(d1, d2)
if (test=="ks") {
this.test <- ks.test(d1, d2, alternative="two.sided")
this.test.signed <- this.test$statistic * sign(ttest$statistic) #KS test statistic has no sign
} else {
this.test <- ttest
this.test.signed <- this.test$statistic
}
df[pc, "stat"] <- this.test.signed
df[pc, "stat_abs"] <- abs(this.test.signed)
df[pc, "p_val"] <- this.test$p.value * ndim #multiple testing
}
df <- df[with(df, order(stat_abs, decreasing=T)), ]
buffer <- ""
for (i in 1:print.n) {
topPC <- rownames(df)[i]
pc.index <- match(topPC, colnames(perturb_dimRed))
feats <- refemb@feature.loadings[,pc.index]
topgenes <- names(head(sort(abs(feats), decreasing = T), 10))
topgenes.sign <- feats[topgenes]
if (df[i, "stat"]>0) {
topgenes.p <- topgenes.sign[topgenes.sign>=0]
topgenes.n <- topgenes.sign[topgenes.sign<0]
} else {
topgenes.p <- topgenes.sign[topgenes.sign<0]
topgenes.n <- topgenes.sign[topgenes.sign>=0]
}
pval2print <- ifelse(df[i, "p_val"]<0.001, sprintf("%.1e",df[i, "p_val"]), sprintf("%.4f",df[i, "p_val"]))
buffer <- paste0(buffer, sprintf("-----------------------\nTop %s component %i: %s Stat %.3f p.val %s\n",
reduction, i, topPC, df[i, "stat"], pval2print))
buffer <- paste0(buffer, "Driver genes for this component:\n")
buffer <- paste0(buffer, paste(c("Higher in query +++ ", names(topgenes.p)), collapse = " "), "\n")
buffer <- paste0(buffer, paste(c("Lower in query --- ", names(topgenes.n)), collapse = " "), "\n")
}
if (verbose) {
message(buffer)
}
return(df)
}
# Function plot.discriminant.3d
#' 3D plot of reference map with extra discriminant dimension
#'
#' Add an extra dimension to the reference map (it can be suggested by `find.discriminant.dimensions`), to explore additional axes of variability
#' in a query dataset compared to the reference map.
#'
#' @param ref Seurat object with reference object
#' @param query Seurat object with query data
#' @param query.control Optionally, you can compare your query with a control sample, instead of the reference
#' @param query.assay The data slot to be used for enrichment analysis
#' @param labels.col The metadata field used to annotate the clusters
#' @param query.state Only plot the query cells from this specific state
#' @param extra.dim The additional dimension to be added on the z-axis of the plot. Can be either:
#' \itemize{
#' \item{An ICA or PCA dimension (e.g. ICA_10). See `find.discriminant.dimensions`}
#' \item{Any numeric metadata field associated to the cells (e.g. 'cycling.score')}
#' }
#' @return A three dimensional plot with UMAP_1 and UMAP_2 on the x and y axis respectively, and the specified `extra.dim` on the z-axis.
#' @examples
#' plot.discriminant.3d(ref, query=query, extra.dim="ICA_19")
#' plot.discriminant.3d(ref, query=treated.set, query.control=control.set, extra.dim="ICA_2")
#'
#' @export plot.discriminant.3d
#'
plot.discriminant.3d <- function(ref, query, query.control=NULL, query.assay="RNA",
labels.col="functional.cluster", extra.dim="ICA_1", query.state=NULL) {
require(plotly)
reduction=NULL
message(paste0("Generating UMAP with 3rd dimension on ",extra.dim))
if (grepl("^ica_\\d+", tolower(extra.dim), perl=T)) {
reduction="ica"
} else if (grepl("^pca_\\d+", tolower(extra.dim), perl=T)) {
reduction="pca"
}
ref.3d <- ref
ref.3d$queryGroup <- "Reference"
#Only show cells of a specific state for the query
if(!is.null(query.state)) {
query.cells=colnames(query)[which(query[[labels.col]]==query.state)]
query <- subset(query, cells=query.cells)
if (!is.null(query.control)) {
query.c.cells=colnames(query.control)[which(query.control[[labels.col]]==query.state)]
query.control <- subset(query.control, cells=query.c.cells)
}
}
#Prepare embeddings
if (!is.null(query.control)) {
q.labs <- c(rep("Control", dim(query.control)[2]), rep("Query", dim(query)[2]))
q.umaps <- rbind(query.control@reductions$umap@cell.embeddings, query@reductions$umap@cell.embeddings)
query <- merge(query.control, query)
query$queryGroup <- q.labs
query[["umap"]] <- CreateDimReducObject(embeddings = q.umaps, key = "UMAP_", assay = query.assay)
} else {
query$queryGroup <- "Query"
}
query@meta.data[,labels.col] <- "Query"
metacols <- intersect(names(ref.3d@meta.data),names(query@meta.data))
ref.3d@meta.data <- rbind(ref.3d@meta.data[,metacols], query@meta.data[,metacols])
ref.3d@reductions$umap@cell.embeddings <- rbind(ref.3d@reductions$umap@cell.embeddings,
query@reductions$umap@cell.embeddings)
ref.3d@meta.data <- cbind(ref.3d@meta.data, ref.3d@reductions$umap@cell.embeddings)
#Calculate ICA or PCA embeddings for query
if (!is.null(reduction)) {
if (reduction=="ica"){
projected <- apply.ica.obj(query=query, query.assay=query.assay, ica.obj=ref.3d@misc$ica)
ref.3d@reductions[[reduction]]@cell.embeddings <- rbind(ref.3d@reductions[[reduction]]@cell.embeddings, projected)
} else if (reduction=="pca") {
projected <- apply.pca.obj.2(query=query, query.assay=query.assay, pca.obj=ref.3d@misc$pca_object)
ref.3d@reductions[[reduction]]@cell.embeddings <- rbind(ref.3d@reductions[[reduction]]@cell.embeddings, projected)
}
}
# Add metadata column
if (is.null(reduction)) {
if (extra.dim %in% colnames(ref.3d@meta.data)){
ref.3d@meta.data <- cbind(ref.3d@meta.data, Discriminant = ref.3d@meta.data[,extra.dim])
} else {
stop(sprintf("extra.dim %s not present in meta.data", extra.dim))
}
} else {
ref.3d@meta.data <- cbind(ref.3d@meta.data,Discriminant=ref.3d@reductions[[reduction]]@cell.embeddings[,extra.dim])
}
if (is.null(query.control)) {
cols <- c(Reference="gray50",Query="red")
} else {
cols <- c(Reference="gray50",Control="green",Query="red")
}
plotting.data <- ref.3d@meta.data[,c("UMAP_1", "UMAP_2","Discriminant", labels.col,"queryGroup")]
plotting.data$size <- ifelse(plotting.data$queryGroup == "Reference",0.3,6)
g <- plotly::plot_ly(data = plotting.data,
x = ~UMAP_1, y = ~UMAP_2, z = ~Discriminant,
color = ~queryGroup,
type = "scatter3d",
mode = "markers",
text=~queryGroup,
hoverinfo="text",
alpha=0.6,
alpha_stroke=0.6,
size=~size,
colors=cols ) |> plotly::layout(
title = sprintf("Projection of query on reference map + dimension %s", extra.dim),
scene = list(
xaxis = list(title = "UMAP_1"),
yaxis = list(title = "UMAP_2"),
zaxis = list(title = extra.dim)
))
print(g)
return(g)
}
# Function find.discriminant.genes
#' Find discriminant genes
#'
#' Based on `FindMarkers`. It performs differential expression analysis between a projected query and a control (either the reference map or a control sample), for
#' a given cell type. Useful to detect whether specific cell states over/under-express genes between conditions or with respect to the reference.
#'
#' @param ref Seurat object with reference atlas
#' @param query Seurat object with query data
#' @param query.control Optionally, you can compare your query with a control sample, instead of the reference
#' @param ref.assay The referece assay to be used for DE analysis
#' @param query.assay The query assay to be used for DEG analyis, if comparing to the reference
#' @param state Perform discriminant analysis on this cell state. Can be either:
#' \itemize{
#' \item{"largest" - Performs analysis on the cell state most represented in the query set(s)}
#' \item{"all" - Performs analysis on the complete dataset, using all cells}
#' \item{A specific cell state, one of the states in metadata field labels.col}
#' }
#' @param labels.col The metadata field used to annotate the clusters (default: functional.cluster)
#' @param test Type of test for DE analysis. See help for `FindMarkers` for implemented tests.
#' @param min.cells Minimum number of cells in the cell type to proceed with analysis.
#' @param genes.use What subset of genes to consider for DE analysis:
#' \itemize{
#' \item{"variable" - Only consider variable genes of the reference}
#' \item{"all" - Use intersection of all genes in query and control}
#' \item{A custom list of genes}
#' }
#' @param ... Adding parameters for `FindMarkers`
#' @return A dataframe with a ranked list of genes as rows, and statistics as columns (e.g. log fold-change, p-values). See help for `FindMarkers` for more details.
#' @examples
#' # Discriminant genes between query and reference in cell type "Tex"
#' markers <- find.discriminant.genes(ref, query=query.set, state="Tex")
#'
#' # Discriminant genes between query and control sample in most represented cell type
#' markers <- find.discriminant.genes(ref, query=query.set, query.control=control.set)
#'
#' # Pass results to EnhancedVolcano for visual results
#' library(EnhancedVolcano)
#' EnhancedVolcano(markers, lab = rownames(markers), x = 'avg_logFC', y = 'p_val')
#'
#' @import Seurat
#' @export find.discriminant.genes
#'
find.discriminant.genes <- function(ref, query, query.control=NULL, ref.assay="RNA", query.assay="RNA",
state="largest", labels.col="functional.cluster",
test="wilcox", min.cells=10, genes.use=c("variable","all"), ...)
{
if (is.null(ref)) {stop("Please provide the reference object (ref")}
if (is.null(query)) {stop("Please provide a query object (query)")}
#Determine cell state for analysis
if (is.null(state) | state=="largest") {
if (!is.null(query.control)) {
ss <- table(rbind(query[[labels.col]], query.control[[labels.col]]))
} else {
ss <- table(query[[labels.col]])
}
state <- names(sort(ss, decreasing = T))[1]
message(paste0("Performing differential expression analysis using the most abundant cell state in the query - ", state))
} else if (state=="all") {
message("Performing differential expression analysis using all cells")
} else {
if (!state %in% query[[labels.col]][,1]) {
stop(sprintf("State %s not found in query metadata colum %s", state, labels.col))
}
if (!is.null(query.control) & !state %in% query.control[[labels.col]][,1]) {
stop(sprintf("State %s not found in query.control metadata colum %s", state, labels.col))
}
message(paste0("Performing differential expression analysis with user-specified state - ", state))
}
#Subset query data on specific state
if (state=="all") {
s1.cells <- colnames(query)
if (!is.null(query.control)) {
s2.cells <- colnames(query.control)
} else {
s2.cells <- colnames(ref)
}
} else {
s1.cells <- colnames(query)[which(query[[labels.col]]==state)]
if (!is.null(query.control)) {
s2.cells <- colnames(query.control)[which(query.control[[labels.col]]==state)]
} else {
s2.cells <- colnames(ref)[which(ref[[labels.col]]==state)]
}
}
#check we have enough cells
if (length(s1.cells)<min.cells) {
stop(sprintf("Too few cells for state %s in query. Exit.", state))
}
if (!is.null(query.control) & length(s2.cells)<min.cells ) {
stop(sprintf("Too few cells for state %s in query control. Exit.", state))
}
#Subset on subtype
DefaultAssay(query) <- query.assay
s1 <- subset(query, cells=s1.cells)
s1$Group <- "Query"
if (!is.null(query.control)) {
DefaultAssay(query.control) <- query.assay
s2 <- subset(query.control, cells=s2.cells)
} else {
DefaultAssay(ref) <- ref.assay
s2 <- subset(ref, cells=s2.cells)
if (ref.assay != query.assay) {
s2[[query.assay]] <- s2[[ref.assay]]
}
}
s2$Group <- "Control"
s.m <- merge(s1, s2)
s.m <- DietSeurat(s.m, assays = query.assay)
if (inherits(s.m[[query.assay]],"Assay5")) {
s.m <- JoinLayers(s.m)
}
Idents(s.m) <- "Group"
#use all genes or only variable genes from the reference
if (genes.use[1] == "all") {
which.genes <- NULL
} else if (genes.use[1] == "variable") {
which.genes <- intersect(VariableFeatures(ref, assay="integrated"), rownames(s.m))
} else {
which.genes <- intersect(genes.use, rownames(s.m))
}
markers <- FindMarkers(s.m, slot="data", ident.1="Query", ident.2="Control", only.pos = F, test.use=test, assay=query.assay,
features = which.genes, ...)
return(markers)
}
#' Make a ProjecTILs reference
#'
#' Converts a Seurat object to a ProjecTILs reference atlas. You can preserve your low-dimensionality embeddings
#' (e.g. UMAP) in the reference atlas by setting `recalculate.umap=FALSE`, or recalculate the UMAP using one of
#' the two methods (\link[umap]{umap::umap} or \link[uwot]{uwot::umap}). Recalculation allows exploting the
#' 'predict' functionalities of these methods for embedding of new points; skipping recalculation will
#' make the projection use an approximation for UMAP embedding of the query.
#'
#' @param ref Seurat object with reference atlas
#' @param assay The assay storing the reference expression data (e.g. "integrated")
#' @param assay.raw The assay storing raw expression data (e.g. "RNA")
#' @param atlas.name An optional name for your reference
#' @param annotation.column The metadata column with the cluster annotations for this atlas
#' @param recalculate.umap If TRUE, run the `umap` or `uwot` algorithm to generate embeddings.
#' Otherwise use the embeddings stored in the `dimred` slot.
#' @param umap.method Which method to use for calculating the umap reduction
#' @param metric Distance metric to use to find nearest neighbors for UMAP
#' @param min_dist Effective minimum distance between UMAP embedded points
#' @param n_neighbors Size of local neighborhood for UMAP
#' @param ndim Number of PCA dimensions
#' @param dimred Use the pre-calculated embeddings stored at `Embeddings(ref, dimred)`
#' @param nfeatures Number of variable features (only calculated if not already present)
#' @param color.palette A (named) vector of colors for the reference plotting functions.
#' One color for each cell type in 'functional.cluster'
#' @param scGate.model.human A human \link[scGate]{scGate} model to purify the cell types represented in the
#' map. For example, if the map contains CD4 T cell subtype, specify an scGate model for CD4 T cells.
#' @param scGate.model.human A mouse \link[scGate]{scGate} model to purify the cell types represented in the
#' map.
#' @param store.markers Whether to store the top differentially expressed genes in `ref@@misc$gene.panel`
#' @param n.markers Store the top `n.markers` for each subtype given by differential
#' expression analysis
#' @param seed Random seed
#' @param layer1_link Broad cell type contained in this reference atlas (i.e. CD4T, CL:0000624...) to link with broad cell type annotation (layer1).
#' @return A reference atlas compatible with ProjecTILs
#' @examples
#' custom_reference <- ProjecTILs::make.reference(myref, recalculate.umap=T)
#' @importFrom stats prcomp
#' @importFrom uwot umap
#' @importFrom dplyr group_by top_n
#' @export make.reference
#'
make.reference <- function(ref,
assay=NULL,
assay.raw="RNA",
atlas.name="custom_reference",
annotation.column="functional.cluster",
recalculate.umap=FALSE,
umap.method=c("umap","uwot"),
metric="cosine",
min_dist=0.3,
n_neighbors = 30,
ndim=20,
dimred="umap",
nfeatures=1000,
color.palette=NULL,
scGate.model.human=NULL,
scGate.model.mouse=NULL,
store.markers=FALSE,
n.markers=10,
seed=123,
layer1_link = NULL) {
if (is.null(assay)) {
assay=DefaultAssay(ref)
}
if (!assay %in% Assays(ref)) {
stop(sprintf("Assay %s not found in reference object. Select a different assay", assay))
}
# Add layer1 link
if (is.null(layer1_link)){
warning("layer1_link not specificied. If using this reference for HiTME annotation this reference will not be used.\n")
}
ref@misc$layer1_link <- layer1_link
#make a copy of default assay
if (assay != "integrated") {
ref[["integrated"]] <- ref[[assay]]
}
DefaultAssay(ref) <- "integrated"
varfeat <- VariableFeatures(ref, assay=assay)
if (is.null(varfeat) | length(varfeat)==0) {
ref <- FindVariableFeatures(ref, assay = assay, nfeatures = nfeatures, verbose=FALSE)
varfeat <- VariableFeatures(ref, assay=assay)
}
#Remove cells with no annotation
notna <- colnames(ref)[!is.na(ref@meta.data[,annotation.column])]
# warning for removing cells
if(length(notna) != ncol(ref)){
na.rm <- 1-(length(notna)/ncol(ref))
warning("Removing ", na.rm, "% cells with NA on annotation.column: ", annotation.column)
ref <- subset(ref, cells=notna)
}
#Recompute PCA embeddings using prcomp
ref <- prcomp_seurat(ref, ndim=ndim, assay=assay)
#Recalculate UMAP, or use an already-present dimensionality reduction
if (!recalculate.umap) {
if (dimred %in% Reductions(ref)) {
ref.pca <- ref@misc$pca_object
cell.order = rownames(ref.pca$x)
low.emb <- Embeddings(ref, reduction=dimred)[cell.order,]
colnames(low.emb) <- c("UMAP_1","UMAP_2")
#Save these embeddings
ref@misc$umap_object <- list()
ref@misc$umap_object$data <- ref.pca$x
ref@misc$umap_object$layout <- low.emb
} else {
stop(sprintf("Dimred %s not found in reference object. Select a different dimensionality reduction, or set recalculate.umap=TRUE to compute UMAP coordinates", dimred))
}
} else {
umap.method = umap.method[1]
ref.pca <- ref@misc$pca_object
if (umap.method == "umap") {
#generate UMAP embeddings
ref.umap <- run.umap.2(ref.pca, ndim=ndim, seed=seed,
n.neighbors=n_neighbors, min.dist=min_dist,
metric=metric)
#Save UMAP object
ref@misc$umap_object <- ref.umap
ref[["umap"]] <- CreateDimReducObject(embeddings = ref.umap$layout,
key = "UMAP_", assay = assay)
} else if (umap.method == "uwot") {
warning("There are known issues with saving a loading uwot models. If you plan to save your reference as an .rds file, please use umap.method='umap'")
#generate UMAP embeddings
ref.umap <- run.umap.uwot(ref.pca, ndim=ndim, seed=seed,
n.neighbors=n_neighbors, min.dist=min_dist,
metric=metric)
ref.umap$data <- ref.pca$x
#Save UMAP object
ref[["umap"]] <- CreateDimReducObject(embeddings = ref.umap$embedding,
key = "UMAP_", assay = assay)
} else {
stop("Unsupported UMAP method.")
}
}
if (!annotation.column == "functional.cluster") {
ref$functional.cluster <- ref@meta.data[,annotation.column]
}
ref$functional.cluster <- factor(ref$functional.cluster)
levs <- levels(ref$functional.cluster)
#Reference name
ref@misc$projecTILs=atlas.name
#Add color palette
if (!is.null(color.palette)) {
if (length(color.palette) != length(levs)) {
stop("Length of color palette must match number of cell types")
}
if (!is.null(names(color.palette))) {
d <- setdiff(names(color.palette), levels(ref$functional.cluster))
if (length(d)>0) {
stop("Names of color palette do not match annotation levels")
}
} else {
names(color.palette) <- levs
}
} else {
color.palette <- rainbow(n=length(levs))
names(color.palette) <- levs
}
ref@misc$atlas.palette <- color.palette
#Add scGate models
ref@misc$scGate <- list()
if (!is.null(scGate.model.human)) {
ref@misc$scGate$human <- scGate.model.human
}
if (!is.null(scGate.model.mouse)) {
ref@misc$scGate$mouse <- scGate.model.mouse
}
#Add DEGs
Idents(ref) <- "functional.cluster"
if (store.markers) {
DefaultAssay(ref) <- assay.raw
cluster.markers <- FindAllMarkers(ref, only.pos = T, min.pct = 0.1, min.diff.pct=0.1,
logfc.threshold = 0.25, max.cells.per.ident = 500,
test.use="wilcox",base=exp(1), verbose=F)
all <- cluster.markers |> dplyr::group_by(cluster) |>
dplyr::top_n(n = n.markers, wt = abs(avg_logFC))
markers <- list()
for (i in levels(ref@active.ident)) {
markers[[i]] <- subset(all, cluster==i)$gene
}
ref@misc$gene.panel <- markers
}
DefaultAssay(ref) <- "integrated"
return(ref)
}
#' Merge Seurat objects, including reductions (e.g. PCA, UMAP, ICA)
#'
#' Given two Seurat objects, merge counts and data as well as dim reductions (PCA, UMAP, ICA, etc.)
#'
#' @param x First object to merge
#' @param y Second object to merge
#' @param merge.dr How to handle merging dimensional reductions (see merge.Seurat)
#' @param ... More parameters to \link{merge} function
#' @return A merged Seurat object
#' @examples
#' seurat.merged <- merge.Seurat.embeddings(obj.1, obj.2)
#' #To merge multiple object stored in a list
#' seurat.merged <- Reduce(f=merge.Seurat.embeddings, x=obj.list)
#' @import Seurat
#' @export merge.Seurat.embeddings
merge.Seurat.embeddings <- function(x=NULL, y=NULL, merge.dr=TRUE, ...)
{
#Use Seurat merge function, inheriting parameters
m <- merge(x, y, merge.dr=merge.dr, ...)
return(m)
}
#' Recalculate low dimensional embeddings after projection
#'
#' Given a reference object and a (list of) projected objects, recalculate low-dim
#' embeddings accounting for the projected cells
#'
#' @param ref Reference map
#' @param projected A projected object (or list of projected objects) generated using \link{make.projection}
#' @param ref.assay Assay for reference object
#' @param proj.assay Assay for projected object(s)
#' @param ndim Number of dimensions for recalculating dimensionality reductions
#' @param n.neighbors Number of neighbors for UMAP algorithm
#' @param min.dist Tightness parameter for UMAP embedding
#' @param metric Distance metric to use to find nearest neighbors for UMAP
#' @param recalc.pca Whether to recalculate the PCA embeddings with the combined reference and projected data
#' @param umap.method Which method should be used to calculate UMAP embeddings
#' @param resol Resolution for unsupervised clustering
#' @param k.param Number of nearest neighbors for clustering
#' @param seed Random seed for reproducibility
#' @return A combined reference object of reference and projected object(s), with new low dimensional embeddings
#' @examples
#' combined <- recalculate.embeddings(ref, projected, ndim=10)
#' @export recalculate.embeddings
recalculate.embeddings <- function(ref, projected, ref.assay="integrated", proj.assay="integrated",
ndim=NULL, n.neighbors=20, min.dist=0.3, recalc.pca=FALSE,
resol=0.4, k.param=15, metric="cosine",
umap.method=c('umap','uwot'), seed=123){
if (is.null(ref) | is.null(projected)) {
stop("Please provide a reference and a projected object (or list of projected objects)")
}
if (is.list(projected) | is.vector(projected)) {
projected <- Reduce(f=merge.Seurat.embeddings, x=projected)
}
projected$ref_or_query <- "query"
ref$ref_or_query <- "ref"
umap.method <- umap.method[1]
projected <- RenameCells(object = projected, add.cell.id = "Q")
merged <- merge.Seurat.embeddings(ref, projected)
DefaultAssay(merged) <- proj.assay
DefaultAssay(ref) <- ref.assay
if (is.null(ndim)) {
ndim <- ncol(ref@misc$umap_object$data)
if (is.null(ndim)) {
stop("Please provide number of dimensions for dimensionality reduction (ndim)")
}
}
VariableFeatures(merged) <- VariableFeatures(ref)
merged@misc <- ref@misc
merged@misc$pca_object$x <- merged@reductions$pca@cell.embeddings
if (recalc.pca) {
message("Recalculating PCA embeddings...")
merged <- prcomp_seurat(merged, assay=proj.assay, ndim=ndim)
}
ref.pca <- merged@misc$pca_object
if (umap.method=="uwot") {
message("Recalculating UMAP embeddings using uwot...")
warning("There are known issues with saving a loading uwot models. If you plan to save your reference as an .rds file, please use umap.method='umap'")
ref.umap <- run.umap.uwot(ref.pca, ndim=ndim,
n.neighbors = n.neighbors,
min.dist=min.dist,
seed=seed, metric=metric)
ref.umap$data <- ref.pca$x
#Save UMAP object
merged@misc$umap_object <- ref.umap
merged@reductions$umap@cell.embeddings <- ref.umap$embedding
} else if (umap.method=="umap") {
message("Recalculating UMAP embeddings using 'umap' package...")
ref.umap <- run.umap.2(ref.pca, ndim=ndim,
n.neighbors = n.neighbors,
min.dist=min.dist,
seed=seed, metric=metric)
#Save UMAP object
merged@misc$umap_object <- ref.umap
merged@reductions$umap@cell.embeddings <- ref.umap$layout
} else {
stop("UMAP method not supported.")
}
#Did any new clusters arise after adding projected data?
DefaultAssay(merged) <- ref.assay
merged <- FindNeighbors(merged, reduction = "pca", dims = 1:ndim,
k.param = k.param, verbose=FALSE)
merged <- FindClusters(merged, resolution = resol, verbose=FALSE)
tab <- table(merged$seurat_clusters, merged$ref_or_query)
glob.freq <- table(merged$ref_or_query)["query"]/ncol(merged)
freq <- apply(tab, 1, function(x){x/sum(x)})["query",] - glob.freq
freq[freq<0] <- 0
cl <- as.numeric(as.character(merged$seurat_clusters))
names(cl) <- names(merged$seurat_clusters)
fcl <- freq[as.character(cl)]
names(fcl) <- names(cl)
merged$newclusters <- fcl
Idents(merged) <- "functional.cluster"
return(merged)
}
#' Calculate Silhouette coefficient
#'
#' Given a projected object and its reference, calculate silhouette coefficient for query cells with respect
#' to reference cells with the same cell labels.
#'
#' @param ref Reference object
#' @param query Query object. If not specified, the silhouette coefficient of only the reference will be calculated
#' @param reduction Which dimensionality reduction to use for euclidian distance calculation
#' @param ndim Number of dimensions in the dimred to use for distance calculation. If NULL, use all dimensions.
#' @param label_col Metadata column with cell type annotations. Must be present both in reference and query
#' @param normalize.scores Whether to normalize silhouette scores by the average cell type silhouettes of the reference
#' @param min.cells Only report silhouette scores for cell type with at least this number of cells
#' @return A dataframe with average silhouette coefficient for each cell type
#' @examples
#' data(query_example_seurat)
#' ref <- load.reference.map()
#' q <- Run.ProjecTILs(query_example_seurat, ref=ref, fast.umap.predict=TRUE)
#' combined <- compute_silhouette(ref, query=q)
#' @importFrom pracma distmat
#' @export compute_silhouette
compute_silhouette <- function(ref, query=NULL,
reduction="pca",
ndim=NULL,
label_col="functional.cluster",
normalize.scores=FALSE,
min.cells=20) {
y <- Reductions(ref, slot=reduction)
if (is.null(ndim)) {
ndim <- ncol(y@cell.embeddings)
}
y <- y@cell.embeddings[,1:ndim]
levs <- levels(as.factor(ref@meta.data[,label_col]))
labs.y <- ref@meta.data[,label_col]
if (is.null(query)) {
x <- y
labs.x <- labs.y
} else {
subtypes <- table(query@meta.data[,label_col])
subtypes <- names(subtypes[subtypes > min.cells])
cid <- Cells(query)[query@meta.data[,label_col] %in% subtypes]
query <- subset(query, cells = cid)
x <- Reductions(query, slot=reduction)
x <- x@cell.embeddings[,1:ndim]
labs.x <- factor(query@meta.data[,label_col], levels=levs)
}
dists <- pracma::distmat(x,y)
sil <- silhouette_2sets(dists, labs.x, labs.y)
#summarize by cluster
means <- aggregate(sil$sil_width, list(sil$cluster), FUN=mean)
colnames(means) <- c("Cluster","Silhouette")
if (normalize.scores) { #normalize by silhouette of the reference
dist.ref <- pracma::distmat(y,y)
sil.ref <- silhouette_2sets(dist.ref, labs.y, labs.y)
means.ref <- aggregate(sil.ref$sil_width, list(sil.ref$cluster), FUN=mean)
colnames(means.ref) <- c("Cluster","Silhouette")
for (i in 1:nrow(means)) {
subset <- means[i,"Cluster"]
j <- which(means.ref$Cluster == subset)
means[i,"Silhouette.norm"] <- means[i,"Silhouette"]/means.ref[j,"Silhouette"]
if (means[i,"Silhouette.norm"] > 1) {means[i,"Silhouette.norm"]=1}
if (means[i,"Silhouette.norm"] < 0) {means[i,"Silhouette.norm"]=0}
}
}
return(means)
}
#' Project a query scRNA-seq dataset onto a reference atlas
#'
#' This function allows projecting ("query") single-cell RNA-seq datasets onto a reference map
#' (i.e. a curated and annotated scRNA-seq dataset).
#' To project multiple datasets, submit a list of Seurat objects with the query parameter.
#' The projection consists of 3 steps:
#' \itemize{
#' \item{pre-processing: optional steps which might include pre-filtering of cells by markers using `scGate`,
#' data normalization, and ortholog conversion.}
#' \item{batch-effect correction: uses built-in STACAS algorithm to detect and correct for batch effects
#' (this step assumes that at least a fraction of the cells in the query are in the same state than cells in
#' the reference)}
#' \item{embedding of corrected query data in the reduced-dimensionality spaces (PCA and UMAP) of the reference map.}
#' }
#' This function acts as a wrapper for \link{make.projection} and \link{cellstate.predict}
#'
#' See \link{load.reference.map} to load or download a reference atlas. See
#' also \link{ProjecTILs.classifier} to use ProjecTILs as a cell type classifier.
#'
#' @param query Query data, either as single Seurat object or as a list of Seurat object
#' @param ref Reference Atlas - if NULL, downloads the default TIL reference atlas
#' @param filter.cells Pre-filter cells using `scGate`. Only set to FALSE if the dataset has
#' been previously subset to cell types represented in the reference.
#' @param split.by Grouping variable to split the query object (e.g. if the object contains multiple samples)
#' @param reduction The dimensionality reduction used to assign cell type labels, based on
#' majority voting of nearest neighbors between reference and query.
#' @param ndim The number of dimensions used for cell type classification
#' @param k Number of neighbors for cell type classification
#' @param labels.col The metadata field of the reference to annotate the clusters
#' @param nn.decay Weight decay for internal nearest neighbors (between 0 and 1)
#' @param min.confidence Minimum confidence score to return cell type labels (otherwise NA)
#' @param ... Additional parameters to \link[ProjecTILs]{make.projection}
#' @return An augmented Seurat object with projected UMAP coordinates on the reference map and cell classifications
#' @examples
#' data(query_example_seurat)
#' ref <- load.reference.map()
#' q <- Run.ProjecTILs(query_example_seurat, ref=ref, fast.umap.predict=TRUE)
#' plot.projection(ref=ref, query=q)
#' @export Run.ProjecTILs
#'
Run.ProjecTILs <- function(query, ref=NULL,
filter.cells = TRUE,
split.by = NULL,
reduction="pca",
ndim=NULL, k=5,
nn.decay=0.1,
min.confidence=0.2,
labels.col="functional.cluster", ...) {
if (!is.null(split.by)) {
if (!split.by %in% colnames(query[[]])) {
stop(sprintf("No grouping variable %s available in metadata", split.by))
}
query <- SplitObject(query, split.by = split.by)
}
#Run projection
query <- make.projection(query=query, ref=ref, filter.cells=filter.cells, ...)
if(!is.list(query)) {
query <- list(query=query)
}
#Cell type classification
query <- lapply(query, function(x) {
cellstate.predict(ref=ref, query=x,
reduction=reduction,
ndim=ndim, k=k,
nn.decay=nn.decay,
min.confidence = min.confidence,
labels.col = labels.col)
})
pass <- lapply(query, function(x) {
!is.null(x) && ncol(x)>1
})
query <- query[unlist(pass)]
#Merge embeddings
if (!is.null(split.by)) {
query <- suppressMessages(Reduce(merge.Seurat.embeddings, query))
}
if (is.list(query) & length(query)==1) {
query <- query[[1]]
}
query
}
#' Annotate query dataset using a reference object
#'
#' Apply label transfer to annotate a query dataset with the cell types of a reference object.
#' Compared to \link{Run.ProjecTILs}, only cell labels are returned. The low-dim embeddings of
#' the query object (PCA, UMAP) are not modified.
#'
#' See \link{load.reference.map} to load or download a reference atlas.
#' See \link{Run.ProjecTILs} to embed the query in the same space of the reference
#'
#' @param query Query data, either as single Seurat object or as a list of Seurat object
#' @param ref Reference Atlas - if NULL, downloads the default TIL reference atlas
#' @param filter.cells Pre-filter cells using `scGate`. Only set to FALSE if the dataset has
#' been previously subset to cell types represented in the reference.
#' @param split.by Grouping variable to split the query object (e.g. if the object contains multiple samples)
#' @param reduction The dimensionality reduction used to assign cell type labels
#' @param ndim The number of dimensions used for cell type classification
#' @param k Number of neighbors for cell type classification
#' @param labels.col The metadata field with label annotations of the reference, which will
#' be transferred to the query dataset
#' @param nn.decay Weight decay for internal nearest neighbors (between 0 and 1)
#' @param min.confidence Minimum confidence score to return cell type labels (otherwise NA)
#' @param overwrite Replace any existing labels in \code{labels.col} with new labels.
#' This may be useful for predicting cell types using multiple reference maps; run
#' this function with \code{overwrite=FALSE} to combine existing labels
#' with new labels from a second reference map.
#' @param ncores Number of cores for parallel processing
#' @param ... Additional parameters to \link[ProjecTILs]{make.projection}
#' @return The query object with a additional metadata columns containing predicted cell labels
#' and confidence scores for the predicted cell labels
#' If cells were filtered prior to projection, they will be labeled as 'NA'
#' @examples
#' data(query_example_seurat)
#' ref <- load.reference.map()
#' q <- ProjecTILs.classifier(query_example_seurat, ref=ref)
#' table(q$functional.cluster, useNA="ifany")
#' @importFrom BiocParallel MulticoreParam SerialParam SnowParam bplapply
#' @export ProjecTILs.classifier
ProjecTILs.classifier <- function(query, ref=NULL,
filter.cells = TRUE,
split.by = NULL,
reduction="pca",
ndim=NULL, k=5,
nn.decay=0.1,
min.confidence=0.2,
labels.col="functional.cluster",
overwrite=TRUE,
ncores=1,
...) {
if (is.list(query)) {
query.list <- query
input.is.list <- TRUE
if (!is.null(split.by)) {
split.by <- NULL
message("List of query objects provided. Ignoring 'split.by' argument")
}
} else {
input.is.list <- FALSE
}
if (!is.null(split.by)) {
if (!split.by %in% colnames(query[[]])) {
stop(sprintf("No grouping variable %s available in metadata", split.by))
}
query.list <- SplitObject(query, split.by = split.by)
} else if (!is.list(query)) {
query.list <- list(query=query)
}
ncores <- min(ncores, length(query.list))
param <- set_parall(ncores)
pred.labels <- BiocParallel::bplapply(
X = 1:length(query.list),
BPPARAM = param,
FUN = function(i) {
classifier.singleobject(query=query.list[[i]], ref=ref,
filter.cells = filter.cells,
reduction=reduction,
ndim=ndim, k=k,
nn.decay=nn.decay,
min.confidence=min.confidence,
labels.col=labels.col,
overwrite=overwrite,
ncores=1, ...)
}
)
names(pred.labels) <- names(query.list)
#Add new labels to original 'query' object
labels.col.conf <- paste0(labels.col, ".conf")
if (input.is.list) {
query <- lapply(seq_along(query), function(i){
new.labs <- pred.labels[[i]]
q <- query[[i]]
q@meta.data[,labels.col] <- NA
q@meta.data[,labels.col.conf] <- NA
q@meta.data[rownames(new.labs),labels.col] <- new.labs[[labels.col]]
q@meta.data[rownames(new.labs),labels.col.conf] <- new.labs[[labels.col.conf]]
q
})
names(query) <- names(pred.labels)
} else {
pred.labels.bind <- Reduce(rbind, pred.labels)
query@meta.data[,labels.col] <- NA
query@meta.data[,labels.col.conf] <- NA
query@meta.data[rownames(pred.labels.bind),labels.col] <- pred.labels.bind[[labels.col]]
query@meta.data[rownames(pred.labels.bind),labels.col.conf] <- pred.labels.bind[[labels.col.conf]]
}
return(query)
}
#' Plot a averaged expression heatmap from a Seurat object
#'
#' This function allows to calculate and plot pseudo-bulk gene expression by cell type and
#' custom grouping variables. Data can be split in principle by any metadata present in the
#' starting Seurat object (e.g. patient, tissue, study, etc.). This can be useful to evaluate
#' consistency of expression profiles for different cell types across samples, studies or
#' other grouping variables.
#'
#' @param data A Seurat object to be used for the heatmap
#' @param assay A string indicating the assay type, default is "RNA"
#' @param genes A vector of genes to be used in the heatmap
#' @param ref A ProjecTILs reference Seurat object to define the order of functional.cluster
#' @param scale A string indicating the scale of the heatmap, default is "row"
#' @param method A string or vector of strings indicating the clustering method to be used, default is "ward.D2"
#' @param brewer.palette A string indicating the color palette to be used, default is "RdBu"
#' @param palette_reverse A boolean indicating if color palette should be reversed, default is FALSE
#' @param cluster.col The metadata column name containing the cell type labels
#' @param group.by The metadata column names used as grouping variables
#' @param flip A boolean indicating if the heatmap should be flipped, default is FALSE
#' @param show_samplenames A boolean indicating whether the heatmap should display the sample names or not, default is FALSE
#' @param cluster_genes A boolean indicating if genes should be clustered, default is FALSE
#' @param cluster_samples A boolean indicating if samples should be clustered, default is FALSE
#' @param min.cells A value defining the minimum number of cells a sample should have to be kept, default is 10
#' @param remove.NA.meta A boolean indicating if missing samples with missing metadata should be plotted, default is TRUE
#' @param return.matrix If true, return the pseudo-bulk data matrix instead of graphical output
#' @param palette A named list containing colors vectors compatible with pheatmap. The list is named by the metadata names, default is taking these palettes to plot metadata: "Paired","Set2","Accent","Dark2","Set1","Set3".
#' @return A pheatmap plot, displaying averaged expression values across genes for each selected genes and samples.
#' @import pheatmap
#' @importFrom tidyr drop_na
#' @import RColorBrewer
#' @examples
#' library(Seurat)
#' ref <- load.reference.map(ref = "https://figshare.com/ndownloader/files/38921366")
#' celltype.heatmap(ref, assay = "RNA", genes = c("LEF1","SELL","GZMK","FGFBP2"),
#' ref = ref, cluster.col = "functional.cluster", metadata = c("orig.ident", "Tissue"))
#' @export celltype.heatmap
celltype.heatmap <- function(data, assay="RNA", slot="data", genes, ref = NULL, scale="row",
method=c("ward.D2","ward.D", "average"), brewer.palette="RdBu",
palette_reverse=F, palette = NULL,
cluster.col = "functional.cluster", group.by = NULL,
flip=FALSE, cluster_genes = FALSE, cluster_samples=FALSE, min.cells = 10,
show_samplenames = FALSE, remove.NA.meta = TRUE,
breaks = seq(-2, 2, by = 0.1), return.matrix=FALSE, ...) {
set.seed(123)
# Select clustering method to be used
method = method[1]
# Select desired metadata
if (is.null(group.by)) {
stop("Must at least provide one metadata column")
} else {
meta.cols <- c(cluster.col, group.by)
if (!all(meta.cols %in% colnames(data[[]]))) {
stop("Metadata columns not found in object. Please check cluster.col and group.by")
}
meta.sub <- data@meta.data[,meta.cols]
}
# Transform "NA" into true NAs
meta.sub[meta.sub=="NA"] = NA
# Remove NAs from metadata
if(remove.NA.meta == TRUE){
meta.sub <- meta.sub |> drop_na()
}
# Filters the data set to only include samples that have at least "min.cells" in the "metaSubset" variable.
data$metaSubset <- factor(apply(meta.sub,1,paste,collapse="!"))
t <- table(data$metaSubset)
accept <- names(t)[t>min.cells]
data <- subset(data, subset=metaSubset %in% accept)
# Calculate mean expression by cluster
genes.use <- intersect(genes, rownames(GetAssay(data, assay=assay)))
genes.removed <- setdiff(genes, genes.use)
if(length(genes.removed)>0){
message("These genes were not found in the assay: ", paste0(genes.removed, collapse = ","))
}
this.mat <- GetAssayData(data, assay=assay, layer=slot)[genes.use,]
m <- lapply(unique(genes.use), function(g) {
tapply(this.mat[g,], data$metaSubset, mean)
})
names(m) <- genes.use
m <- as.data.frame(m)
m <- m[accept,]
# Compute metadata for the annotation colors
m.subset <- factor(unlist(lapply(strsplit(rownames(m),"!",perl = T),function(x) x[[1]])))
m.meta <- list()
for (i in 1:length(group.by)){
m.meta[[i]] <- factor(unlist(lapply(strsplit(rownames(m),"!",perl = T),function(x) x[[i+1]])))
}
names(m.meta) <- group.by
m.meta <- as.data.frame(m.meta)
# Reorder dataframe if "ref" is defined
m <- cbind(m, m.subset, m.meta)
if (!is.null(ref)) {
m$m.subset <- factor(m$m.subset, levels = levels(ref$functional.cluster))
m <- m[order(m$m.subset), ]
# Reappend good annotation order
m.subset <-
factor(unlist(lapply(strsplit(rownames(m), "!", perl = T), function(x)
x[[1]])))
}
m <- m[,1:length(genes.use)]
if(return.matrix) {
return(m)
}
# Setup color palette list
if(palette_reverse){
color = colorRampPalette(brewer.pal(n = 7, name = brewer.palette))(length(breaks))
} else{
color = colorRampPalette(rev(brewer.pal(n = 7, name = brewer.palette)))(length(breaks))
}
palettes.default <- c("Paired","Set2","Accent","Dark2","Set1","Set3")
if (is.null(palette)) {
palette <- list()
palette[["Subtype"]] <- colorRampPalette(brewer.pal(n=8, name="Set1"))(length(unique(unlist(m.subset))))
names(palette[["Subtype"]]) <- c(unique(m.subset))
for (i in 1:length(group.by)){
meta <- group.by[i]
palette[[meta]] <- colorRampPalette(brewer.pal(n=6, name=palettes.default[i]))(length(unique(m.meta[[meta]])))
names(palette[[meta]]) <- levels(m.meta[[meta]])
}
}
# Define the colors of functional.cluster if ref is given
if (!is.null(ref)) {
palette[["Subtype"]] <- ref@misc$atlas.palette
}
# Compute annotation dataframe
annotation_col = data.frame(
Subtype = m.subset
)
annotation_col <- cbind(annotation_col, m.meta)
rownames(annotation_col) = rownames(m)
# Plot heatmap
if (flip) {
h <- pheatmap::pheatmap(m, cluster_rows = cluster_samples,
cluster_cols = cluster_genes,scale = scale,
breaks = breaks, color=color,
annotation_row = annotation_col,
show_rownames = show_samplenames,
border_color = NA,
annotation_colors = palette,
fontsize_row=6,fontsize = 7,
clustering_method=method, ...)
} else {
h <- pheatmap::pheatmap(t(m),cluster_rows = cluster_genes,
cluster_cols = cluster_samples,scale = scale,
breaks = breaks, color=color,
annotation_col = annotation_col,
show_colnames = show_samplenames,
border_color = NA,
annotation_colors = palette,
fontsize_row=6,fontsize = 7,
clustering_method=method, ... )
}
return(h)
}
#' Gene expression markers shared by multiple groups of cells
#'
#' This function expands \link[Seurat]{FindAllMarkers} to find markers that are differentially expressed across multiple
#' datasets or samples. Given a Seurat object with identity classes (for example annotated clusters) and a grouping
#' variable (for example a Sample ID), it calculate differentially expressed genes (DEGs) individually for each sample.
#' Then it determines the fraction of samples for which the gene was found to be differentially expressed.
#'
#' This function can be useful to find marker genes that are specific for individual cell types, and that are found
#' to be so consistently across multiple samples.
#'
#' @param object A Seurat object
#' @param split.by A metadata column name - the data will be split by this column to calculate \link[Seurat]{FindAllMarkers}
#' separately for each data split
#' @param only.pos Only return positive markers (TRUE by default)
#' @param features Genes to test. Default is to use all genes
#' @param min.cells.group Minimum number of cells in the group - if lower the group is skipped
#' @param min.freq Only return markers which are differentially expressed in at least this fraction of datasets.
#' @param ... Additional paramters to \link[Seurat]{FindAllMarkers}
#' @return A list of marker genes for each identity class (typically clusters), with two associated numerical values:
#' i) the fraction of datasets for which the marker was found to be differentially expressed; ii) the
#' average log-fold change for the genes across datasets
#' @importFrom dplyr filter select
#' @examples
#' ref <- load.reference.map(ref = "https://figshare.com/ndownloader/files/38921366")
#' Idents(ref) <- "functional.cluster"
#' FindAllMarkers.bygroup(ref, split.by = "Sample", min.cells.group=30, min.freq=0.8)
#' @export FindAllMarkers.bygroup
FindAllMarkers.bygroup <- function(object,
split.by = NULL,
only.pos = TRUE,
features = NULL,
min.cells.group = 10,
min.freq = 0.5,
...) {
if (is.null(split.by)) {
stop("Please provide a grouping variable with 'split.by' parameter")
}
if (!is.null(features)) {
features <- intersect(features, rownames(object))
}
notNA <- colnames(object)[!is.na(Idents(object))]
object <- subset(object, cells=notNA)
ids <- names(table(Idents(object)))
meta <- object[[]]
obj.list <- SplitObject(object, split.by = split.by)
deg <- lapply(obj.list, function(x){
d <- suppressWarnings(FindAllMarkers(x, only.pos = only.pos,
min.cells.group = min.cells.group, ...))
})
degsize <- unlist(lapply(deg, nrow))
deg <- deg[degsize>0]
genes <- lapply(ids, function(i) {
#count groups with at least min cells
cells <- Idents(object) == i
t <- table(meta[cells, split.by])
max.c <- sum(t>=min.cells.group)
if (max.c == 0) {
return(NULL)
}
#count occurrences
fq <- lapply(deg, function(x) {
x[x$cluster==i, "gene"]
})
sums <- table(unlist(fq))
freqs <- sums / max.c
#minimum frequency
sums <- sums[freqs >= min.freq]
freqs <- freqs[freqs >= min.freq]
if (length(freqs) == 0) {
return(NULL)
}
fc <- lapply(deg, function(x) {
x[x$cluster==i & x$gene %in% names(freqs), c("gene","avg_log2FC")]
})
avg.fc <- rep(0, length(freqs))
names(avg.fc) <- names(freqs)
for (ds in names(fc)) {
avg.fc[fc[[ds]]$gene] <- avg.fc[fc[[ds]]$gene] + fc[[ds]]$avg_log2FC
}
avg.fc <- avg.fc/sums
df <- data.frame(Gene = names(freqs), Freq=as.numeric(freqs))
df$avg.FC <- avg.fc
df$Gene <- as.character(df$Gene)
df <- df[order(df$avg.FC, decreasing = T),]
df
})
names(genes) <- ids
genes
}
#' Retrieve and load reference atlas
#'
#' Download and load reference atlases.
#'
#' @param collection Collection to download and load. See available collection using \link{list.reference.maps}. If NULL, all are downloaded and loaded (default)
#' @param reference References to download and load. See available collection using \link{list.reference.maps}. If NULL, all are downloaded and loaded (default)
#' @param update Boolean whether to delete current reference maps and download them again
#' @param directory Directory where to download and load from reference maps. By default a directory named "ProjecTILs_references" is created in working directory.
#' @param as.list Boolean whether to simplify list (\code{FALSE}) or, by default, keep a list of lists for each collection (\code{TRUE}).
#' @param verbose Inform of the status of processes
#'
#' @examples
#' # explore available reference maps
#' list.reference.maps()
#'
#' # consider increasing downloading timeout
#' options(timeout = 1000)
#'
#' # get all available reference maps
#' ref.maps <- get.reference.maps()
#'
#' # get certain collections or reference maps
#' # all human references maps
#' ref.maps.human <- get.reference.maps(collection = "human")
#'
#' # only some references
#' ref.maps <- get.reference.maps(reference = "DC")
#' ref.maps.CD4 <- get.reference.maps(reference = c("CD4", "Virus_CD4T"))
#'
#' # update previously downloaded maps
#' ref.maps <- get.reference.maps(update = TRUE)
#'
#' @importFrom digest digest
#' @importFrom jsonlite fromJSON
#' @importFrom dplyr filter pull left_join %>%
#' @export get.reference.maps
get.reference.maps <- function(collection = NULL,
reference = NULL,
update = FALSE,
directory = "./ProjecTILs_references",
as.list = TRUE,
verbose = TRUE){
# Normalize destination/directory path
directory <- suppressWarnings(
{normalizePath(directory,
winslash = "/")
})
if(!dir.exists(directory) && !update){
if(verbose){
message("Provided directory doesn't exists. Creating directory and downloading reference maps\n")
}
update <- TRUE
}
# get list of download links
links <- list.reference.maps()
# check collections
if(is.null(collection)){
if(verbose){message("Retrieving all ProjecTILs reference atlases\n")}
} else{
if(!any(tolower(collection) %in% links$collection.CSI)){
stop("Please provide one of the available collections to retrieve reference maps:\n\n",
paste(capture.output(print(links)), collapse = "\n"))
}
links <- links %>%
dplyr::filter(collection.CSI %in% collection)
if(verbose){message("Retrieving ", paste(collection, collapse = ", "),
" ProjecTILs reference atlases\n")}
}
# check references
if(is.null(reference)){
if(verbose){message("Retrieving all ProjecTILs reference atlases\n")}
} else{
if(!any(reference %in% links$reference.atlas)){
stop("Please provide one of the available references.atlas to retrieve:\n\n",
paste(capture.output(print(links)), collapse = "\n"))
}
links <- links %>%
dplyr::filter(reference.atlas %in% reference)
if(verbose){message("Retrieving ", paste(reference, collapse = ", "),
" ProjecTILs reference atlases\n")}
}
# filter links only for wanted to collections
# create directories for each species
collecs <- unique(links$collection.CSI)
refs <- lapply(collecs,
function(d){
# get figshare id
article_id <- links %>%
dplyr::filter(collection.CSI == d) %>%
dplyr::pull(figshare_id) %>%
unique()
# create directory and subdirectory
suppressWarnings({
dir.create(file.path(directory, d),
recursive = T)
})
ref.cols <- list()
for (aid in article_id){
# get metadata of downloading url and hash
figshare_md <- get_figshare_metadata(aid)
# left_join with customed list of desired maps to download
md <- dplyr::left_join(links %>%
dplyr::filter(figshare_id == aid),
figshare_md,
by = "name")
# download and check hashing
for(i in 1:nrow(md)){
filename <- md[i,"name"]
destfile <- file.path(directory, d, filename)
url <- md[i,"download_url"]
hash <- md[i, "computed_md5"]
obj_name <- md[i, "reference.atlas"]
if(update || !file.exists(destfile)){
try.download(url = url,
destfile = destfile,
hash = hash,
verbose = verbose,
warn = T)
}
# load files
tryCatch({
ref.cols[[obj_name]] <- load.helper(destfile)
cat("-------\n")
}, error = function(e){
cat("Loading failed for", destfile,
"\nPlease rerun get.reference.maps(update=T) to download again references. Also, Consider increasing downloading timeout running:\n `options(timeout = 1000)`\n")
}
)
}
}
return(ref.cols)
})
# return list
if(as.list){
names(refs) <- collecs
} else {
refs <- unlist(refs)
}
return(refs)
}
#' Available reference atlas for ProjecTILs
#'
#' Obtain the list of available reference atlas for ProjecTILs to then download and load them using \link{get.reference.maps}.
#'
#' @examples
#' # explore available reference maps
#' list.reference.maps()
#' @export list.reference.maps
list.reference.maps <- function(){
# get list of download links
links_path <- system.file("extdata",
"reference_links.csv",
package = "ProjecTILs")
# return the available links to reference maps
links <- read.csv(links_path,
stringsAsFactors = FALSE)
return(links)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.