#' Scale data in each group
#'
#' @param data expression matrix of feature by cell, usually normalized (in log scale) but not feature scaled.
#' @param group_by a vector including cell annotations
#' @param by_log calculate in log scale or in normalized scale; If TRUE log_base will be ignored.
#' @param log_base base::exp(1) or 2 or other else, depends on your normalization method.
#' @param center a logical value used in base::scale
#' @param scale a logical value used in base::scale
#'
#' @return scaled data
#' @export
#'
#' @examples
#'
pp_centerData <- function(
data,
group_by = NULL,
by_log = F,
log_base = base::exp(1),
center = T,
scale = T){
if(!by_log) data <- log_base^data
data <- as.matrix(data)
if(IsNULLorNA(group_by)) group_by <- rep("A", ncol(data))
if(is.factor(group_by)){
group_by <- droplevels(group_by)
}else{
group_by <- factor(group_by)
}
for(i in levels(group_by)){
data[,group_by == i] <- t(base::scale(t(data[,group_by == i]), center = center, scale = scale))
}
return(data)
}
#' A Wrapper of harmony::RunHarmony
#'
#' RunHarmony did not support dims.use actually, so I wrote this wrapper function to support that.
#'
#' @param object Seurat object
#' @param dims.use Which PCA dimensions to use for Harmony. By default, use all
#' @param group.by.vars Which variable(s) to remove (character vector).
#' @param ... other parameters to be used in harmony::HarmonyMatrix
#'
#' @return Seurat (version 3) object.
#' @export
#'
#' @examples
#'
pp_runHarmony <- function(object, dims.use, group.by.vars,...){
embedding <- Seurat::Embeddings(object)[,dims.use]
harmonyEmbed <- harmony::HarmonyMatrix(embedding, object@meta.data, group.by.vars,
FALSE, 0, ...)
rownames(harmonyEmbed) <- row.names(embedding)
colnames(harmonyEmbed) <- paste0("harmony_", seq_len(ncol(harmonyEmbed)))
suppressWarnings({
harmonydata <- Seurat::CreateDimReducObject(embeddings = harmonyEmbed,
stdev = as.numeric(apply(harmonyEmbed, 2, stats::sd)), key = "harmony")
})
object[["harmony"]] <- harmonydata
if (FALSE) {
object <- Seurat::ProjectDim(object, reduction = "harmony",
overwrite = TRUE, verbose = FALSE)
}
return(object)
}
#' Find variable features across batches
#'
#' Identifies Highly variable features (HVFs) for each batch, then combine them into a single vector of HVFs by count.
#'
#' @param object Seurat object
#' @param split_by a colname of meta.data
#' @param groups only some groups of split_by are considered.
#' @param nfeatures Number of features to select as top variable features; only used when selection.method is set to 'dispersion' or 'vst'
#' @param ... Arguments passed to Seurat::FindVariableFeatures
#'
#' @return A vector of variable features
#' @export
#'
#' @examples
#'
pp_findVariableFeatures <- function(object, split_by = "ident", groups = NULL, nfeatures = 2000, assay = "RNA", log_var = F, ...){
object_list <- Seurat::SplitObject(object, split_by)
if(!is.null(groups)) object_list <- object_list[groups]
hvgs <- data.frame(row.names = rownames(object))
for (i in seq_along(object_list)) {
object_list[[i]] <- Seurat::FindVariableFeatures(object_list[[i]], nfeatures = nfeatures, assay = assay,...)
var_data <- object_list[[i]]@assays[[assay]]@meta.features
var_ind <- grep("variance", colnames(var_data), value = T, ignore.case = T)
hvgs <- cbind(hvgs, var_data[,var_ind[length(var_ind)]])
}
if(log_var){
names(sort(rowMeans(log(hvgs)), decreasing = T)[1:nfeatures])
}else{
names(sort(rowMeans(hvgs), decreasing = T)[1:nfeatures])
}
}
#' render rmarkdown file for de-multiplex hash-tag data of h5file generated by cellranger
#'
#' @param h5file .h5 file generated by cellranger
#' @param seuratfile save seurat obejct into this file, eg. 'hashtag.seurat.Rdata'
#' @param hashtag hash-tag names used in the feature-ref file of your cellranger pipeline
#' @param sample real sample names represented by hash-tag, must be corresponding to hashtag each by each.
#' @inheritParams rmarkdown::render
#'
#' @details
#' Given that this package is designed for human, mouse and macaca,
#' when Mitochondria genes did not start with "mt-" (case was ignored. This situation should only occur in Macaca),
#' Ensembl gene id (already stored in this package) of Macaca mulatta will be used.
#'
#' @seealso \code{\link{rmarkdown::render}}
#'
#' @return derived html file
#' @export
#'
#' @examples
#'
pp_hashtagDemultiplex <- function(
h5file,
seuratfile,
sample,
hashtag = stringr::str_glue("B025{1:6}_anti-human_Hashtag_{1:6}"),
output_format = "html_document",
output_file = NULL,
output_dir = "./") {
rmdfile <- system.file("rmd", "hashtag_demux.Rmd", package = "convgene")
rmarkdown::render(rmdfile, output_format = output_format, output_file = output_file, output_dir = output_dir,
knit_root_dir = getwd(),
params = list(
h5file = h5file,
seuratfile = seuratfile,
sample = sample,
hashtag = hashtag
))
}
#' Annotate seurat data derived by pp_hashtagDemultiplex function
#'
#' @param files seurat Rdata files produced by pp_hashtagDemultiplex
#' @param metadata annotation information for each file/seurat_object.
#' e.g. list(c(Batch = "b1"), c(Batch = "b2"), NULL, c(Batch = "b4"))
#'
#' @return a list of seurat objects
#' @export
#'
#' @examples
#'
pp_annotateHashtagSeurat <- function(files, metadata = NULL){
message("Be carefull the correspondence between files and metadata!")
if(!is.null(metadata) && length(files) != length(metadata)){
stop("metadata should be NULL, or with equal length of files!")
}
lapply(seq_along(files), function(i){
seurat_obj <- load(files[i])
seurat_obj <- get(seurat_obj)
if(!is.null(metadata[[i]])){
for(x in metadata[i]){
seurat_obj[[names(x)]] <- x
}
}
return(seurat_obj)
})
}
#' A wrapper function to quickly run standard processing pipeline
#'
#' Run NormalizeData, CellCycleScoring, FindVariableFeatures, ScaleData, RunUMAP/TSNE and FindClusters.
#'
#' @param object a Seurat object
#' @param scale.factor Sets the scale factor for cell-level normalization. (NormalizeData)
#' @param s.features A vector of features associated with S phase (CellCycleScoring)
#' @param g2m.features A vector of features associated with G2M phase (CellCycleScoring)
#' @param do.regress.cc regress out cell cycle scores (ScaleData)
#' @param pp do SCTransform or standard RNA process
#' @param nfeatures Number of features to return (SCTransform/FindVariableFeatures)
#' @param ndim Number of dimensions to use for dimensionality reduction and clustering (RunUMAP/TSNE/FindClusters)
#' @param skip_tsne you may set TRUE to skip RunTSNE step for large dataset.
#' @param resolution Value of the resolution parameter (FindClusters)
#'
#' @return a Seurat object
#' @export
#'
#' @examples
#'
pp_preprocess <- function(object,
scale.factor = 1e4,
s.features = cc.genes$s.genes,
g2m.features = cc.genes$g2m.gene,
do.regress.cc = T,
pp = c("SCT", "RNA"),
nfeatures = 2000,
ndim = 20,
skip_tsne = F,
resolution = 0.5){
DefaultAssay(object) <- "RNA"
object %<>% NormalizeData(scale.factor = scale.factor) %>%
CellCycleScoring(s.features = s.features, g2m.features = g2m.features, set.ident = T)
vars.to.regress <- if(do.regress.cc) c("S.Score", "G2M.Score") else NULL
if(pp == "SCT"){
object %<>% SCTransform(variable.features.n = nfeatures, vars.to.regress = vars.to.regress)
}else{
object %<>% FindVariableFeatures(nfeatures = nfeatures) %>%
ScaleData(vars.to.regress = vars.to.regress)
}
object %<>% RunPCA()
object %<>% RunUMAP(dims = 1:ndim)
if(!skip_tsne) object %<>% RunTSNE(dims = 1:ndim)
object %<>% FindNeighbors(dims = 1:ndim) %>% FindClusters(resolution = resolution)
return(object)
}
#' A wrapper function to quickly run fastMNN using SeuratWrappers::RunFastMNN
#'
#' Run NormalizeData, CellCycleScoring, SelectIntegrationFeatures, RunFastMNN, RunUMAP/TSNE, FindClusters and FindMarkers.
#' And save object_mnn and object_degs to xls and Rdata files.
#'
#' @param object_list a list of Seurat objects
#' @param genes_used Set NULL to use the intersection of all Seurat objects.
#' @param batch_by Attribute for splitting. Default is "Batch". (SplitObject)
#' @param scale.factor Sets the scale factor for cell-level normalization. Calculated in RNA assay. (NormalizeData)
#' @param s.features A vector of features associated with S phase. Calculated in RNA assay. (CellCycleScoring)
#' @param g2m.features A vector of features associated with G2M phase. Calculated in RNA assay. (CellCycleScoring)
#' @param assay which assay to be used for integration. default RNA.
#' @param nfeatures Number of features to return (SelectIntegrationFeatures)
#' @param integration_features if NULL use nfeatures, else use these feature for fastMNN
#' @param remove_ccgenes_by_cor whether to exclude cell cycle-related genes using correlation method.
#' @param cc_cor_th feature genes with correlation coefficient larger than this threshold will be removed.
#' @param mnn_d Number of dimensions to use for dimensionality reduction in multiBatchPCA. (batchelor::fastMNN)
#' @param seed random seed for fastMNN
#' @param skip_tsne you may set TRUE to skip RunTSNE step for large dataset.
#' @param resolution Value of the resolution parameter (FindClusters)
#' @param save_object_prefix Set NULL to skip save files and FindAllMarker steps. Or a character setting the object prefix, e.g. AAA will get AAA_mnn and AAA_degs being returned.
#' @param save_degs_dir {save_object_prefix}_degs.seurat_clusters.{sys.time}.xls will be written to here.
#' @param logfc.threshold Limit testing to genes which show at least X-fold difference (log-scale) between the two groups of cells. (FindAllMarkers)
#' @param min.pct only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations. (FindAllMarkers)
#' @param save_rdata_dir {save_object_prefix}_mnn.seurat.{sys.time}.Rdata will be written to here.
#' @param save_add.sys.time whether to add sys.time to file names.
#' @param ... Extra parameters passed to batchelor::fastMNN.
#'
#' @return seurat object if save_object_prefix is NULL, else saved files.
#' @export
#'
#' @examples
#'
pp_fastMNN <- function(object_list, genes_used = NULL,
batch_by = "Batch",
scale.factor = 1e5,
s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.gene,
assay = c("RNA", "SCT"),
nfeatures = 2000,
integration_features = NULL,
remove_ccgenes_by_cor = T, cc_cor_th = 0.3,
mnn_d = 20, seed = 666, skip_tsne = F,
resolution = 0.5,
save_object_prefix = NULL,
save_degs_dir = "tables/",
logfc.threshold = log(2), min.pct = 0.25,
save_rdata_dir = "seuratData/",
save_add.sys.time = F,
...){
if(is.null(genes_used)){
genes_used <- Reduce(intersect, lapply(object_list, rownames))
}
for(i in 1:length(object_list)){
object_list[[i]]$Dataset <- paste0("Dataset_", i)
}
object_merge <- merge(object_list[[1]], object_list[-1])[genes_used, ]
DefaultAssay(object_merge) <- "RNA"
object_merge %<>% NormalizeData(scale.factor = scale.factor) %>%
CellCycleScoring(s.features = s.features, g2m.features = g2m.features, set.ident = T)
assay <- match.arg(assay)
if(assay == "SCT"){
object_merge %<>% SCTransform(variable.features.n = nfeatures)
}
# the following function should use default assay.
object_merge_list <- SplitObject(object_merge, split.by = batch_by)
object_merge_features <- SelectIntegrationFeatures(
object_merge_list,
nfeatures = nfeatures,
fvf.nfeatures = nfeatures#, assay = rep("RNA",length(object_merge_list))
)
if(!is.null(integration_features)) object_merge_features <- integration_features
## CC genes
if(remove_ccgenes_by_cor){
object_merge_cc_cor <- apply(GetAssayData(object_merge)[object_merge_features,], 1,
function(x){
max(abs(cor(x, object_merge$S.Score)),
abs(cor(x, object_merge$G2M.Score)))
}
)
object_merge_features <- setdiff(object_merge_features, names(which(object_merge_cc_cor > cc_cor_th)))
}
object_merge_mnn_d <- mnn_d
set.seed(seed) # seems like the following function produce random result
object_merge_mnn <- SeuratWrappers::RunFastMNN(
object_merge_list,
#assay = "RNA",
features = object_merge_features,
d = object_merge_mnn_d,
...
)
object_merge_mnn %<>% RunUMAP(reduction = "mnn", dims = 1:object_merge_mnn_d)
if(!skip_tsne) object_merge_mnn %<>% RunTSNE(reduction = "mnn", dims = 1:object_merge_mnn_d)
object_merge_mnn %<>% FindNeighbors(dims = 1:object_merge_mnn_d, reduction = "mnn")
object_merge_mnn %<>% FindClusters(resolution = resolution)
if(!is.null(save_object_prefix)){
time_suffix <- ifelse(save_add.sys.time, base::Sys.time(), "")
degs_file <- file.path(save_degs_dir, paste0(save_object_prefix, ".degs.seurat_clusters.", time_suffix,".xls"))
rdata_file <- file.path(save_rdata_dir, paste0(save_object_prefix, "_mnn.seurat.", time_suffix,".Rdata"))
object_merge_degs <- FindAllMarkers(object_merge_mnn, assay = "RNA",
logfc.threshold = logfc.threshold, min.pct = min.pct, only.pos = T)
WriteXLS::WriteXLS(object_merge_degs, ExcelFileName = degs_file)
assign(paste0(save_object_prefix, "_mnn"), object_merge_mnn)
assign(paste0(save_object_prefix, "_degs"), object_merge_degs)
save(list = paste0(save_object_prefix, c("_mnn", "_degs")), file = rdata_file)
}else{
return(object_merge_mnn)
}
}
#' A wrapper function to quickly run fastMNN using SeuratWrappers::RunFastMNN
#'
#' Run NormalizeData, CellCycleScoring, SelectIntegrationFeatures, RunFastMNN, RunUMAP/TSNE, FindClusters and FindMarkers.
#' And save object_mnn and object_degs to xls and Rdata files.
#'
#' @param object_list a list of Seurat objects
#' @param genes_used Set NULL to use the intersection of all Seurat objects.
#' @param batch_by Attribute for splitting. Default is "Batch". (SplitObject)
#' @param scale.factor Sets the scale factor for cell-level normalization. (NormalizeData)
#' @param s.features A vector of features associated with S phase (CellCycleScoring)
#' @param g2m.features A vector of features associated with G2M phase (CellCycleScoring)
#' @param nfeatures Number of features to return (SelectIntegrationFeatures)
#' @param do.regress.cc regress out cell cycle scores (ScaleData)
#' @param ndim Number of dimensions to use for dimensionality reduction in multiBatchPCA. (batchelor::fastMNN)
#' @param skip_tsne you may set TRUE to skip RunTSNE step for large dataset.
#' @param resolution Value of the resolution parameter (FindClusters)
#' @param save_object_prefix Set NULL to skip save files and FindAllMarker steps. Or a character setting the object prefix, e.g. AAA will get AAA_mnn and AAA_degs being returned.
#' @param save_degs_dir {save_object_prefix}_degs.seurat_clusters.{sys.time}.xls will be written to here.
#' @param logfc.threshold Limit testing to genes which show at least X-fold difference (log-scale) between the two groups of cells. (FindAllMarkers)
#' @param min.pct only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations. (FindAllMarkers)
#' @param save_rdata_dir {save_object_prefix}_mnn.seurat.{sys.time}.Rdata will be written to here.
#' @param save_add.sys.time whether to add sys.time to file names.
#' @param ... Extra parameters passed to IntegrateData
#'
#' @return seurat object if save_object_prefix is NULL, else saved files.
#' @export
#'
#' @examples
#'
pp_seuratV3Integration <- function(object_list, genes_used = NULL,
batch_by = "Batch",
scale.factor = 1e5,
s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.gene,
nfeatures = 2000, do.regress.cc = T,
ndim = 20, skip_tsne = F,
resolution = 0.5,
save_object_prefix = NULL,
save_degs_dir = "tables/",
logfc.threshold = log(2), min.pct = 0.25,
save_rdata_dir = "seuratData/",
save_add.sys.time = F,
...){
if(is.null(genes_used)){
genes_used <- Reduce(intersect, lapply(object_list, rownames))
}
for(i in 1:length(object_list)){
object_list[[i]]$Dataset <- paste0("Dataset_", i)
}
object_merge <- merge(object_list[[1]], object_list[-1])[genes_used, ]
DefaultAssay(object_merge) <- "RNA"
object_merge %<>% NormalizeData(scale.factor = scale.factor) %>%
CellCycleScoring(s.features = s.features, g2m.features = g2m.features, set.ident = T)
object_merge_list <- SplitObject(object_merge, split.by = batch_by)
for (i in 1:length(object_merge_list)) {
object_merge_list[[i]] <- FindVariableFeatures(
object_merge_list[[i]],
nfeatures = nfeatures, verbose = FALSE)
}
object_merge_anchors <- FindIntegrationAnchors(object.list = object_merge_list)
object_merge_integrated <- IntegrateData(anchorset = object_merge_anchors, ...)
vars.to.regress <- if(do.regress.cc) c("S.Score", "G2M.Score") else NULL
object_merge_integrated <- ScaleData(object_merge_integrated, verbose = FALSE,
vars.to.regress = vars.to.regress)
object_merge_integrated %<>% RunPCA()
object_merge_integrated %<>% RunUMAP(reduction = "pca", dims = 1:ndim)
if(!skip_tsne) object_merge_integrated %<>% RunTSNE(reduction = "pca", dims = 1:ndim)
object_merge_integrated %<>% FindNeighbors(dims = 1:ndim, reduction = "pca")
object_merge_integrated %<>% FindClusters(resolution = resolution)
if(!is.null(save_object_prefix)){
time_suffix <- ifelse(save_add.sys.time, base::Sys.time(), "")
degs_file <- file.path(save_degs_dir, paste0(save_object_prefix, ".degs.seurat_clusters.", time_suffix,".xls"))
rdata_file <- file.path(save_rdata_dir, paste0(save_object_prefix, "_integrated.seurat.", time_suffix,".Rdata"))
object_merge_degs <- FindAllMarkers(object_merge_integrated, assay = "RNA",
logfc.threshold = logfc.threshold, min.pct = min.pct, only.pos = T)
WriteXLS::WriteXLS(object_merge_degs, ExcelFileName = degs_file)
assign(paste0(save_object_prefix, "_integrated"), object_merge_integrated)
assign(paste0(save_object_prefix, "_degs"), object_merge_degs)
save(list = paste0(save_object_prefix, c("_integrated", "_degs")), file = rdata_file)
}else{
return(object_merge_integrated)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.