Nothing
#' Loads neural network models from GitHub
#'
#' \code{GetModels_HPCA} returns a list of neural network models that were trained with the HPCA training data.
#' The HPCA training data has 18 classification tasks with bootstrapped training data.
#' The training data are split into two distinct cellular phenotypes (i.e., immune and nonimmune).
#' \code{GetModels_HPCA} downloads pre-computed neural networks trained to classify cells for each task (n = 100 models trained for each tasks).
#'
#' @return list of 1,800 neural network models stacked to solve 18 classification problems.
#' @export
#' @seealso \code{\link{SignacFast}}, \code{\link{ModelGenerator}}
#' @examples
#' \dontrun{
#' P = GetModels()
#' }
GetModels_HPCA <- function(){
return(readRDS(url("https://github.com/mathewchamberlain/SignacX/blob/master/assets/Models_HPCA.rds?raw=TRUE","rb")))
}
#' Loads bootstrapped HPCA training data from GitHub
#'
#' \code{GetTrainingData_HPCA} returns a list of bootstrapped HPCA training data.
#' The HPCA training data has 18 classification tasks, each split into two distinct cellular phenotypes (i.e., immune and nonimmune).
#' \code{GetTrainingData_HPCA} downloads bootstrapped training data to use for model-building.
#'
#' @return A list with two elements; 'Reference' are the training data, and 'genes' -- the union of all features present in the training data.
#' @export
#' @examples
#' \dontrun{
#' P = GetTrainingData_HPCA()
#' }
GetTrainingData_HPCA <- function(){
return(readRDS(url("https://github.com/mathewchamberlain/SignacX/blob/master/assets/training_HPCA.rds?raw=TRUE","rb")))
}
#' Generates cellular phenotype labels
#'
#' \code{GenerateLabels} returns a list of cell type and cell state labels, as well as novel cellular phenotypes and unclassified cells.
#'
#' @param cr list returned by \code{\link{Signac}} or by \code{\link{SignacFast}}.
#' @param E a sparse gene (rows) by cell (column) matrix, or a Seurat object. Rows are HUGO symbols.
#' @param spring.dir If using SPRING, directory to categorical_coloring_data.json. Default is NULL.
#' @param smooth if TRUE, smooths the cell type classifications. Default is TRUE.
#' @param new_populations Character vector specifying any new cell types that were learned by Signac. Default is NULL.
#' @param new_categories If new_populations are set to a cell type, new_category is a corresponding character vector indicating the population that the new population belongs to. Default is NULL.
#' @param min.cells If desired, any cell population with equal to or less than N cells is set to "Unclassified." Default is 10 cells.
#' @param graph.used If using Seurat object by default, Signac uses the nearest neighbor graph in the graphs field of the Seurat object. Other options are "wnn" to use weighted nearest neighbors, as well as "snn" to use shared nearest neighbors.
#' @return A list of cell type labels for cell types, cell states and novel populations.
#' @export
#' @examples
#' \dontrun{
#' # download single cell data for classification
#' file.dir = "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/"
#' file = "pbmc_1k_v3_filtered_feature_bc_matrix.h5"
#' download.file(paste0(file.dir, file), "Ex.h5")
#'
#' # load data, process with Seurat
#' library(Seurat)
#' E = Read10X_h5(filename = "Ex.h5")
#' pbmc <- CreateSeuratObject(counts = E, project = "pbmc")
#'
#' # run Seurat pipeline
#' pbmc <- SCTransform(pbmc, verbose = FALSE)
#' pbmc <- RunPCA(pbmc, verbose = FALSE)
#' pbmc <- FindNeighbors(pbmc, dims = 1:30, verbose = FALSE)
#'
#' # classify cells
#' labels = SignacFast(E = pbmc)
#' celltypes = GenerateLabels(labels, E = pbmc)
#' }
GenerateLabels = function(cr, E = NULL, smooth = TRUE, new_populations = NULL, new_categories = NULL, min.cells = 10, spring.dir = NULL, graph.used = "nn")
{
# if using SPRING, load data
if (!is.null(spring.dir)){
edges = CID.LoadEdges(data.dir = spring.dir)
edges = CID.GetDistMat(edges)
}
# check for Seurat object
flag = class(E) == "Seurat"
if (flag) {
default.assay <- Seurat::DefaultAssay(E)
edges = E@graphs[[which(grepl(paste0("_", graph.used), names(E@graphs)))]]
if (ncol(edges) > 100000) {
edges = list(edges)
} else {
edges = CID.GetDistMat(edges)
}
}
# remove louvain clusters
res = list("")
louvain = cr$louvain
cr = cr[-which(names(cr) == "louvain")]
# remove all fields from input
if ("probability" %in% names(cr[[1]]))
cr = lapply(cr, function(x) {x$celltypes})
# change format to character
cr = lapply(cr, function(x) {as.character(x)})
# label categories according to cell type hierarchy
for (j in 1:length(cr))
{
if (names(cr)[j] == "All"){
res[[j]] = cr[[j]]
} else {
qq = res[[j - 1]]
logik = qq == names(cr)[j]
qq[logik] = cr[[j]][logik]
res[[j]] = qq
}
}
# define cell states and cell types
cellstates = res[[length(res)]]
celltypes = cellstates
celltypes[celltypes %in% c("B.memory", "B.naive")] = "B"
celltypes[celltypes %in% c("DC", "Mon.Classical", "Mon.NonClassical", "Neutrophils", "Monocytes", "Macrophages")] = "MPh"
celltypes[celltypes %in% c("NK", "T.CD4.naive", "T.CD4.memory", "T.regs", "T.CD8.naive", "T.CD8.memory", "T.CD8.cm","T.CD8.em")] = "TNK"
celltypes[celltypes %in% c("Endothelial", "Fibroblasts", "Epithelial")] = "NonImmune"
immune = res[[1]]
# allow for new cell populations
if (!is.null(new_populations))
{
for (j in 1:length(new_populations))
celltypes[celltypes %in% new_populations[j]] = new_categories[j]
}
# assign Unclassifieds
if (!is.null(spring.dir) | flag){
celltypes = CID.entropy(celltypes, edges)
immune = CID.entropy(immune, edges)
# smooth
if (smooth) {
celltypes= CID.smooth(celltypes, edges[[1]])
immune = CID.smooth(immune, edges[[1]])
}
}
# set consistent unclassified cells
logik = immune == "Unclassified" | celltypes == "Unclassified" | cellstates == "Unclassified"
cellstates[logik] = "Unclassified"
celltypes[logik] = "Unclassified"
immune[logik] = "Unclassified"
# set consistent cell type annotations
celltypes[cellstates %in% c("B.memory", "B.naive")] = "B"
celltypes[cellstates %in% c("DC", "Mon.Classical", "Mon.NonClassical", "Neutrophils", "Monocytes", "Macrophages")] = "MPh"
celltypes[cellstates %in% c("NK", "T.CD4.naive", "T.CD4.memory", "T.regs", "T.CD8.naive", "T.CD8.memory", "T.CD8.cm","T.CD8.em")] = "TNK"
celltypes[cellstates %in% c("Endothelial", "Fibroblasts", "Epithelial")] = "NonImmune"
immune[cellstates %in% c("Endothelial", "Fibroblasts", "Epithelial")] = "NonImmune"
immune[! cellstates %in% c("NonImmune", "Unclassified")] = "Immune"
res$Immune = immune
if (!is.null(spring.dir) | flag)
{
do = data.frame(table(louvain[cellstates == "Unclassified"]))
df = data.frame(table(louvain[louvain %in% do$Var1]))
logik = (1 - phyper(do$Freq, df$Freq , length(cellstates) - do$Freq, sum(cellstates == "Unclassified"))) < 0.01;
if (any(logik)){
do = do[logik,]
logik = do$Freq > 3; # require at least 3 cell communities
if (any(logik)){
celltypes_novel = celltypes;
cellstates_novel = cellstates;
cat(" Signac found", sum(logik), "novel celltypes!\n");
lbls = rep("All", ncol(E))
logik = louvain %in% do$Var1[logik] & cellstates == "Unclassified";
lbls[logik] = louvain[logik]
cellstates_novel[lbls != "All"] = paste0("+ Novel cluster", lbls[lbls != "All"])
celltypes_novel[lbls != "All"] = paste0("+ Novel cluster", lbls[lbls != "All"])
res$CellTypes_novel = celltypes_novel
res$CellStates_novel = cellstates_novel
}
}
}
df = data.frame(table(celltypes))
logik = df$Freq < (min.cells + 1)
if (any(logik))
celltypes[celltypes %in% as.character(df[,1][logik])] <- "Unclassified"
df = data.frame(table(cellstates))
logik = df$Freq < (min.cells + 1)
if (any(logik))
cellstates[cellstates %in% as.character(df[,1][logik])] <- "Unclassified"
res$CellTypes = celltypes
res$CellStates = cellstates
names(res)[names(res) == ""] = paste0("L", 1:sum(names(res) == ""))
res$clusters = louvain
# factor encode for Seurat visualization
if (flag){
res = lapply(res, function(x){
xx = factor(x)
levels(xx) <- sort(unique(xx))
return(xx)
})
}
return(res)
}
#' Generates bootstrapped single cell data
#'
#' \code{SignacBoot} uses a Seurat object or an expression matrix and performs feature selection, normalization and bootstrapping
#' to generate a training data set to be used for cell type or cluster classification.
#'
#' @param E a gene (rows) by cell (column) matrix, sparse matrix or a Seurat object. Rows are HUGO symbols.
#' @param L cell type categories for learning.
#' @param labels cell type labels corresponding to the columns of E.
#' @param size Number of bootstrapped samples for machine learning. Default is 1,000.
#' @param logfc.threshold Cutoff for feature selection. Default is 0.25.
#' @param p.val.adj Cutoff for feature selection. Default is 0.05.
#' @param impute if TRUE, performs imputation prior to bootstrapping (see \code{\link{KSoftImpute}}). Default is TRUE.
#' @param verbose if TRUE, code speaks. Default is TRUE.
#' @param spring.dir if using SPRING, directory to categorical_coloring_data.json. Default is NULL.
#' @return Training data set (data.frame) to be used for building new models=.
#' @seealso \code{\link{ModelGenerator}}
#' @export
#' @examples
#' \dontrun{
#' # load Seurat object from SignacFast example
#' P <- readRDS("pbmcs.rds")
#'
#' # run feature selection + bootstrapping to generate 2,000 bootstrapped cells
#' x = P@meta.data$celltypes
#' R_learned = SignacBoot(P, L = c("B.naive", "B.memory"), labels = x)
#' }
SignacBoot <- function (E, L, labels, size = 1000, impute = TRUE, spring.dir = NULL, logfc.threshold = 0.25, p.val.adj = 0.05, verbose = TRUE)
{
# check for Seurat object
flag = class(E) == "Seurat"
if (flag) {
default.assay <- Seurat::DefaultAssay(E)
edges = E@graphs[[which(grepl(paste0(default.assay, "_nn"), names(E@graphs)))]]
dM = CID.GetDistMat(edges)
}
if (verbose)
{
cat(" .......... Entry in SignacBoot \n");
ta = proc.time()[3];
# main function
cat(ifelse(!flag, c(" .......... Running SignacBoot on input data matrix :\n"), c(" .......... Running SignacBoot on input Seurat object :\n")))
cat(" nrow = ", nrow(E), "\n", sep = "");
cat(" ncol = ", ncol(E), "\n", sep = "");
}
# set up imputation matrices
if (impute & !flag){
edges = CID.LoadEdges(data.dir = spring.dir)
dM = CID.GetDistMat(edges)
louvain = CID.Louvain(edges = edges)
} else {
louvain = CID.Louvain(edges = edges)
}
# keep only unique row names
logik = CID.IsUnique(rownames(E))
if (!flag) {
V = E[logik,]
colnames(V) <- 1:ncol(V)
} else {
V = E@assays[[default.assay]]@counts[logik,]
}
# run feature selection
ctrl <- Seurat::CreateSeuratObject(counts = V[,labels %in% L], project = "Signac", min.cells = 0)
ctrl <- Seurat::NormalizeData(ctrl)
ctrl <- Seurat::AddMetaData(ctrl, metadata=labels[labels %in% L], col.name = "celltypes")
ctrl <- Seurat::SetIdent(ctrl, value='celltypes')
mrks = Seurat::FindMarkers(ctrl, ident.1 = L[1], ident.2 = L[2], only.pos = FALSE, logfc.threshold = logfc.threshold)
mrks = mrks[mrks$p_val_adj < p.val.adj,]
# bootstrap data
dat = V[rownames(V) %in% rownames(mrks),]
mrks$cluster = L[1]
mrks$cluster[mrks$avg_logFC < 0] = L[2]
mrks$gene = rownames(mrks)
xx = labels
# run imputation (if desired)
if (impute)
Z = KSoftImpute(E = dat, dM = dM, verbose = FALSE)
cts = split.data.frame(mrks, f = mrks$cluster)
N = lapply(cts, function(x){
# first sample from cells in cluster 1, size cells
logik = rownames(dat) %in% x$gene
dummy = dat[logik,]
logik = as.character(x$cluster[1]) == labels
dummy = dummy[,logik]
dd = t(apply(dummy, 1, function(z) {
sample(z, size = size, replace = TRUE)}))
dd = t(apply(dd, 1, function(z){
stats::rnorm(n = length(z), mean = mean(z), sd = sd(z))
}))
# now sample from cells in cluster 2, size cells with True Negative Expr.
logik = !rownames(dat) %in% x$gene
dummy = dat[logik,]
logik = as.character(x$cluster[1]) == labels
dummy = dummy[,logik]
dd2 = t(apply(dummy, 1, function(z) {
sample(z, size = size, replace = TRUE)}))
dd2 = t(apply(dd2, 1, function(z){
stats::rnorm(n = length(z), mean = mean(z), sd = sd(z))
}))
rbind(dd, dd2)
})
N2 = merge(N[[1]],N[[2]],by="row.names")
rownames(N2) <- N2$Row.names
N2 = t(N2[,-1])
# normalize
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
N2 = apply(N2, 2, function(x){
normalize(x)
})
boot = data.frame(N2, celltypes = c(rep(names(cts)[1], size), rep(names(cts)[2], size)))
colnames(boot) <- gsub(pattern = "\\.", replacement = "-", x = colnames(boot))
boot = list(genes = colnames(boot)[-ncol(boot)],
Reference = list(boot))
if (verbose) {
tb = proc.time()[3] - ta;
cat("\n .......... Exit SignacBoot.\n");
cat(" Execution time = ", tb, " s.\n", sep = "");
}
return(boot)
}
#' Load edges from edge list for single cell network
#'
#' \code{CID.LoadEdges} loads edges, typically after running the SPRING pipeline.
#'
#' @param data.dir A directory where "edges.csv" file is located
#' @return The edge list in data frame format
#' @export
#' @examples
#' \dontrun{
#' # Loads edges
#' file.dir = "https://kleintools.hms.harvard.edu/tools/client_datasets/"
#' file = "CITESEQ_EXPLORATORY_CITESEQ_5K_PBMCS/FullDataset_v1_protein/edges.csv"
#' download.file(paste0(file.dir, file, "?raw=true"), destfile = "edges.csv")
#'
#' # data.dir is your path to the "edges.csv" file
#' edges = CID.LoadEdges(data.dir = ".")
#' }
CID.LoadEdges <- function(data.dir)
{
edges = paste(data.dir, "edges.csv", sep = "/")
file.exists(edges)
flag = file.exists(edges);
if (!flag) {
cat("ERROR: from CID.LoadEdges:\n");
cat("edges = ", edges, " does not exist.\n", sep = "");
stop()
}
# read in knn graph edges
cat(" .......... Reading in graph edges from edgelist: \n")
cat(" ", edges, "\n")
edges <- read.delim(edges,sep = ";" , stringsAsFactors = FALSE, header = FALSE)
if(min(edges) == 0)
edges = edges + 1
colnames(edges) <- c("V1", "V2")
edges
}
#' Library size normalize
#'
#' \code{CID.Normalize} normalizes the expression matrix to the mean library size.
#'
#' @param E Expression matrix
#' @return Normalized expression matrix where each cell sums to the mean total counts
#' @export
#' @examples
#' \dontrun{
#' # download single cell data for classification
#' file.dir = "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/"
#' file = "pbmc_1k_v3_filtered_feature_bc_matrix.h5"
#' download.file(paste0(file.dir, file), "Ex.h5")
#'
#' # load data, process with Seurat
#' library(Seurat)
#' E = Read10X_h5(filename = "Ex.h5")
#'
#' # observe average total counts
#' mean(Matrix::colSums(E))
#'
#' # run normalization
#' E_norm = CID.Normalize(E)
#'
#' # check normalization
#' kmu = mean(Matrix::colSums(E_norm))
#' head(kmu)
#' }
CID.Normalize <- function(E)
{
xx = NULL
if (!is.null(colnames(E)))
xx <- colnames(E)
if (class(E) == "matrix")
{
m = Matrix::Matrix(0, ncol(E), ncol(E))
tots_use = Matrix::colSums(E)
target_mean = mean(tots_use)
diag(m) <- target_mean / tots_use
E = as.matrix(E %*% m)
if (!is.null(xx))
colnames(E) <- xx
} else {
m = Matrix::Matrix(0, ncol(E), ncol(E))
tots_use = Matrix::colSums(E)
target_mean = mean(tots_use)
diag(m) <- target_mean / tots_use
E = Matrix::Matrix(E %*% m, sparse = TRUE)
if (!is.null(xx))
colnames(E) <- xx
}
return(E)
}
#' Smoothing function
#'
#' \code{CID.smooth} uses k-nearest neighbors to identify cells which
#' correspond to a different label than the majority of their first-degree neighbors. If so,
#' those annotations are "smoothed."
#'
#' @param ac list containing a character vector where each element is a cell type or cell state assignment.
#' @param dM distance matrix (see ?CID.GetDistMat).
#' @return A character vector with smoothed labels
#' @export
#' @examples
#' \dontrun{
#' # load data classified previously (see SignacFast)
#' P <- readRDS("celltypes.rds")
#' S <- readRDS("pbmcs.rds")
#'
#' # get edges from default assay from Seurat object
#' default.assay <- Seurat::DefaultAssay(S)
#' edges = S@graphs[[which(grepl(paste0(default.assay, "_nn"), names(S@graphs)))]]
#'
#' # get distance matrix
#' D = CID.GetDistMat(edges)
#'
#' # smooth labels
#' smoothed = CID.smooth(ac = P$CellTypes, dM = D[[1]])
#' }
CID.smooth <- function(ac, dM)
{
Y = ac;
# remove any unconnected cells from consideration
logik = Matrix::rowSums(dM) != 0
if(any(!logik)){
dM = dM[logik,logik]
Y = Y[logik]
}
if (sum(dM) == 0)
return(ac)
m = Matrix::Matrix(0, nrow = length(Y), ncol = length(unique(Y)), sparse = TRUE)
m[cbind(1:nrow(m), as.numeric(factor(Y)))] <- 1
res = dM %*% m
res = res / Matrix::rowSums(res)
mx.idx = apply(res, 1, function(x) which.max(x))
mx = apply(res, 1, max)
Y[mx > 0.5] = levels(factor(Y))[mx.idx[mx > 0.5]]
ac[logik] = Y
return(ac)
}
#' Normalized Shannon entropy-based "unclassified" assignment
#'
#' \code{CID.entropy} calculates the normalized Shannon entropy of labels for each cell
#' among k-nearest neighbors less than four-degrees apart, and then sets cells with statistically
#' significant large Shannon entropy to be "Unclassified."
#'
#' @param ac a character vector of cell type labels
#' @param distM the distance matrix, see ?CID.GetDistMat
#' @return A character vector like 'ac' but with cells type labels set to "Unclassified" if there was high normalized Shannon entropy.
#' @export
#' @examples
#' \dontrun{
#' # load data classified previously (see \code{SignacFast})
#' P <- readRDS("celltypes.rds")
#' S <- readRDS("pbmcs.rds")
#'
#' # get edges from default assay from Seurat object
#' default.assay <- Seurat::DefaultAssay(S)
#' edges = S@graphs[[which(grepl(paste0(default.assay, "_nn"), names(S@graphs)))]]
#'
#' # get distance matrix
#' D = CID.GetDistMat(edges)
#'
#' # entropy-based unclassified labels labels
#' entropy = CID.entropy(ac = P$L2, distM = D)
#' }
CID.entropy <- function(ac,distM)
{
# "Y" labels will be ac but with some "Unclassified" cells
Y = ac
# Calculate normalized shannon entropy for each cell j for all connections with shortest path < N
shannon = rep(0, length(Y))
if (class(distM) == "list")
{
dM = Reduce('+', distM) > 0;
# create identity matrix to subtract off self
I <- methods::new("ngTMatrix",
i = as.integer(1:nrow(dM) - 1L),
j = as.integer(1:nrow(dM) - 1L),
Dim = as.integer(c(nrow(dM), nrow(dM))))
dM = dM - I
} else {
dM = distM
}
# create matrix for cell labels
m = Matrix::Matrix(0, nrow = length(ac), ncol = length(unique(ac)), sparse = TRUE)
m[cbind(1:nrow(m), as.numeric(factor(ac)))] <- 1
res = dM %*% m
res = res / Matrix::rowSums(res)
N_unique = length(unique(Y))
shannon = apply(res, 1, function(freqs) {-sum(freqs[freqs != 0] * log(freqs[freqs != 0])) / log(2) / log2(N_unique)})
q = c(shannon, -shannon)
Y[shannon > 2 * sd(q)] = "Unclassified"
return(Y)
}
#' Writes JSON file for SPRING integration
#'
#' \code{CID.writeJSON} is a SPRING-integrated function for writing a JSON file.
#'
#' @param cr output from \code{\link{GenerateLabels}}
#' @param json_new Filename where annotations will be saved for new SPRING color tracks. Default is "categorical_coloring_data_new.json".
#' @param spring.dir Directory where file 'categorical_coloring_data.json' is located. If set, will add Signac annotations to the file.
#' @param new_populations Character vector specifying any new cell types that Signac has learned. Default is NULL.
#' @param new_colors Character vector specifying the HEX color codes for new cell types. Default is NULL.
#' @return A categorical_coloring_data.json file with Signac annotations and Louvain clusters added.
#' @export
#' See \url{https://github.com/AllonKleinLab/SPRING_dev} for more details.
CID.writeJSON <- function(cr, json_new = "categorical_coloring_data.json", spring.dir, new_populations = NULL, new_colors = NULL)
{
if (!is.null(spring.dir))
{
spring.dir = gsub("\\/$", "", spring.dir, perl = TRUE);
if (file.exists(paste(spring.dir, 'categorical_coloring_data.json', sep = "/")))
{
json_file = 'categorical_coloring_data.json'
gJ <- paste(spring.dir,json_file,sep = "/")
json_data <- RJSONIO::fromJSON(gJ)
json_out = jsonlite::toJSON(json_data, auto_unbox = TRUE)
write(json_out,paste(spring.dir,'categorical_coloring_data_legacy.json',sep="/"))
} else {
json_data = list("")
}
}
if ("clusters" %in% names(cr))
{
Q = as.character(cr$clusters)
json_data$Clusters_Louvain$label_list = Q
Ntypes = length(unique(Q))
qual_col_pals = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(RColorBrewer::brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
col_palette <- as.list(col_vector[1:Ntypes]);
names(col_palette) <- unique(Q)
json_data$Clusters_Louvain$label_colors = col_palette
}
if ("CellTypes" %in% names(cr))
{
Q = cr$CellTypes
json_data$CellTypes$label_list = Q
C = get_colors(Q)
json_data$CellTypes$label_colors = as.list(C[[1]])
}
if ("CellStates" %in% names(cr))
{
Q = cr$CellStates
json_data$CellStates$label_list = Q
C = get_colors(Q)
json_data$CellStates$label_colors = as.list(C[[1]])
}
if ("CellTypes_novel" %in% names(cr))
{
Q = cr$CellTypes_novel
json_data$CellTypes_novel$label_list = Q
C = get_colors(Q)
Ntypes = sum(C[[1]] == "")
qual_col_pals = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(RColorBrewer::brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
C[[1]][C[[1]] == ""] <- col_vector[1:Ntypes];
json_data$CellTypes_novel$label_colors = as.list(C[[1]])
}
if ("CellStates_novel" %in% names(cr))
{
Q = cr$CellStates_novel
json_data$CellStates_novel$label_list = Q
C = get_colors(Q)
Ntypes = sum(C[[1]] == "")
qual_col_pals = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(RColorBrewer::brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
C[[1]][C[[1]] == ""] <- col_vector[1:Ntypes]; # or sample if you wish
json_data$CellStates_novel$label_colors = as.list(C[[1]])
}
if ("Immune" %in% names(cr))
{
Q = cr$Immune
json_data$Immune$label_list = Q
C = get_colors(Q)
json_data$Immune$label_colors = as.list(C[[1]])
}
json_data = json_data[order(names(json_data))]
json_data_backup = json_data;
if (!is.null(new_populations)) {
json_data = lapply(json_data, function(x) {
for (j in 1:length(new_populations))
x$label_colors[names(x$label_colors) == new_populations[j]] = new_colors[j]
x
})
}
fn = json_new
json_out = jsonlite::toJSON(json_data, auto_unbox = TRUE)
write(json_out,paste(spring.dir,fn,sep="/"))
cat(paste(spring.dir,fn,sep="/"), "has been written to directory! \n")
return(json_data)
}
#' Detects community substructure by Louvain community detection
#'
#' \code{CID.Louvain} determines Louvain clusters from an edge list.
#'
#' @param edges A data frame or matrix with edges
#' @return A character vector with the community substructures of the graph corresponding to Louvain clusters.
#' @export
#' @examples
#' \dontrun{
#' # Loads edges
#' file.dir = "https://kleintools.hms.harvard.edu/tools/client_datasets/"
#' file = "CITESEQ_EXPLORATORY_CITESEQ_5K_PBMCS/FullDataset_v1_protein/edges.csv"
#' download.file(paste0(file.dir, file, "?raw=true"), destfile = "edges.csv")
#'
#' # data.dir is your path to the "edges.csv" file
#' edges = CID.LoadEdges(data.dir = ".")
#'
#' # get louvain clusters from edge list
#' clusters = CID.Louvain(edges)
#' }
CID.Louvain <- function(edges)
{
# convert edgelist to adjacency matrix
if (nrow(edges) != ncol(edges))
{
adjmatrix <- methods::new("ngTMatrix",
i = c(as.integer(edges$V1)-1L, as.integer(edges$V2)-1L),
j = c(as.integer(edges$V2)-1L, as.integer(edges$V1)-1L),
Dim = as.integer(c(max(edges), max(edges))))
} else {
adjmatrix = edges; # just load straight adjacency matrix
}
g = igraph::graph_from_adjacency_matrix(adjmatrix * 1, mode = c("undirected"), weighted = NULL, diag = TRUE, add.colnames = NULL, add.rownames = NA)
as.character(igraph::cluster_louvain(g)$membership)
}
#' Computes distance matrix from edge list
#'
#' \code{CID.GetDistMat} returns the distance matrix (i.e., adjacency matrix, second-degree adjacency matrix, ..., etc.)
#' from an edge list. Here, edges is the edge list, and n is the order of the connetions.
#'
#' @param edges a data frame with two columns; V1 and V2, specifying the cell-cell edge list for the network
#' @param n maximum network distance to subtend (n neighbors)
#' @return adjacency matrices for distances < n
#' @export
#' @examples
#' \dontrun{
#' # Loads edges
#' file.dir = "https://kleintools.hms.harvard.edu/tools/client_datasets/"
#' file = "CITESEQ_EXPLORATORY_CITESEQ_5K_PBMCS/FullDataset_v1_protein/edges.csv"
#' download.file(paste0(file.dir, file, "?raw=true"), destfile = "edges.csv")
#'
#' # data.dir is your path to the "edges.csv" file
#' edges = CID.LoadEdges(data.dir = ".")
#'
#' # get distance matrix (adjacency matrices with up to fourth-order connections) from edge list
#' distance_matrices = CID.GetDistMat(edges)
#' }
CID.GetDistMat <- function(edges, n = 4)
{
"%^%" <- function(A, n) {if(n == 1) A else A %*% (A %^% (n-1)) }
# create adjacency matrix
if (nrow(edges) != ncol(edges))
{
m <- methods::new("ngTMatrix",
i = c(as.integer(edges$V1)-1L, as.integer(edges$V2)-1L),
j = c(as.integer(edges$V2)-1L, as.integer(edges$V1)-1L),
Dim = as.integer(c(max(edges), max(edges))))
} else {
m = edges
}
dm = list("") # initialize distance matrix
for (j in 1:n)
if(j == 1) dm[[j]] = m else dm[[j]] = m %^% j
return(dm)
}
#' Extracts unique elements
#'
#' \code{CID.IsUnique} returns a Boolean for the unique elements of a vector.
#'
#' @param x A character vector
#' @return boolean, unique elements are TRUE
#' @export
#' @examples
#' # generate a dummy variable
#' dummy = c("A", "A", "B", "C")
#'
#' # get only unique elements
#' logik = CID.IsUnique(dummy)
#' dummy[logik]
CID.IsUnique <- function (x)
{
rv = rep(TRUE, length(x))
if (length(x) >= 2) {
ord = order(x)
ox = x[ord]
neq = (ox[-length(ox)] != ox[-1])
rv[ord] = c(neq, TRUE) & c(TRUE, neq)
}
return(rv)
}
#' Save count_matrix.h5 files for SPRING integration
#'
#' To integrate with SPRING, \code{SaveCountsToH5} saves expression matrices
#' in a sparse ".h5" format to be read with SPRING notebooks in Jupyter.
#'
#' @param D A list of count matrices
#' @param data.dir directory (will be created if it does not exist) where results are saved
#' @param genome default is GRCh38.
#' @importFrom methods as
#' @importClassesFrom Matrix dgCMatrix
#' @return matrix.h5 file, where each is background corrected
#' @export
#' @importFrom rhdf5 h5createFile h5createGroup h5write
#' @examples
#' \dontrun{
#' # download single cell data for classification
#' file.dir = "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/"
#' file = "pbmc_1k_v3_filtered_feature_bc_matrix.h5"
#' download.file(paste0(file.dir, file), "Ex.h5")
#'
#' # load data
#' library(Seurat)
#' E = Read10X_h5(filename = "Ex.h5")
#'
#' # save counts to h5 files
#' SaveCountsToH5(E, data.dir = "counts_h5")
#'
#' }
SaveCountsToH5 <- function(D, data.dir, genome = "GRCh38")
{
data.dir = gsub("\\/$", "", data.dir, perl = TRUE);
if (!dir.exists(data.dir))
dir.create(data.dir)
if (!"list" %in% class(D))
D = list(D)
cat(" .......... Entry in SaveCountsToH5: \n");
ta = proc.time()[3];
cat(" Number of cells:", sum(sapply(D, ncol)), "\n");
cat(" Number of genes:", nrow(D[[1]]), "\n");
cat(" Number of samples:", length(D), "\n");
if (is.null(names(D)))
names(D) <- paste0("Sample", seq_along(1:length(D)))
flag = rownames(D[[1]])[1] != gsub( "_.*$", "", rownames(D[[1]])[1])
if (flag)
{
genes = strsplit(x = rownames(D[[1]]), split = "_._")
gene_names = sapply(genes, `[[`, 1)
genes = sapply(genes, `[[`, 2)
}
D = lapply(D, function(x)
{
x@Dimnames[[1]] = rownames(D[[1]])
x
})
data.dirs = paste(data.dir, names(D), sep = "/")
q = lapply(data.dirs, function(x){
if (!dir.exists(x))
dir.create(x)})
d = suppressWarnings(
mapply(function(x,y){
fn = "count_matrix.h5"
fn = paste(y, fn, sep = "/")
rhdf5::h5createFile(fn)
rhdf5::h5createGroup(fn, genome)
rhdf5::h5write(x@Dimnames[[2]], file = fn, name = paste0(genome, "/barcodes"))
if (flag)
{
rhdf5::h5write(genes, file = fn, name=paste0(genome, "/genes"))
rhdf5::h5write(gene_names, file = fn, name=paste0(genome, "/gene_names"))
} else {
rhdf5::h5write(x@Dimnames[[1]], file = fn, name=paste0(genome, "/genes"))
rhdf5::h5write(x@Dimnames[[1]], file = fn, name=paste0(genome, "/gene_names"))
}
rhdf5::h5write(x@x, file = fn, name=paste0(genome, "/data"))
rhdf5::h5write(dim(x), file = fn, name=paste0(genome, "/shape"))
rhdf5::h5write(x@i, file = fn, name=paste0(genome, "/indices")) # already zero-indexed.
rhdf5::h5write(x@p, file = fn, name=paste0(genome, "/indptr"))
}, x = D, y = data.dirs)
)
rhdf5::h5closeAll()
tb = proc.time()[3] - ta;
cat(" .......... Exit SaveCountsToH5 \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
}
#' Get HEX colors
#'
#' @param P A character vector of cell type labels
#' @return A vector of HEX colors for each category of cell type labels
#' @keywords internal
get_colors <- function(P)
{
P <- sort(unique(as.character(P)));
colorcount = length(unique(P))
cs = character(colorcount)
# main cell type categories will be consistently labeled:
main_types = c("B" , "Epithelial", "Fibroblasts" ,
"MPh" , "Plasma.cells" , "Endothelial",
"TNK" , "Unclassified" , "NonImmune" ,
"Immune")
main_colors = c("#aa3596" , "#387f50" , "#a2ba37" ,
"#f1ff51" , "#d778de" , "#73de97" ,
"#90c5f4" , "#c0c0c0" , "#7fc97f" ,
"#bb7fc7" )
# sub cell types will be consistently labeled:
sub_types = c("B.memory", "B.naive", "DC" , "Macrophages", "Mon.Classical", "Mon.NonClassical", "Monocytes", "Neutrophils", "NK", "T.CD4.memory", "T.CD4.naive", "T.CD8.cm", "T.CD8.em", "T.CD8.naive", "T.gd", "T.regs" )
sub_colors = c("#aa3596", "#edc5e6", "#f9a702" , "#f97501", "#d6b171", "#9e4c05", "#f1ff51", "#f7f2b2", "#ac9bf2", "#bb7fc7", "#8a5e83", "#c9c9ff", "#8c9fe1", "#90c5f4", "#64058a", "#2038b0" )
for (i in 1:length(sub_types))
{
cs[P == sub_types[i]] = sub_colors[i]
cs[P == main_types[i]] = main_colors[i]
}
cs[P == "None"] = "#808080"
outs = list(cs)
names(outs[[1]]) <- P
return(outs)
}
#' Load data file from directory
#'
#' Loads 'matrix.mtx' and 'genes.txt' files from a directory.
#'
#' @param data.dir Directory containing matrix.mtx and genes.txt.
#' @param mfn file name; default is 'matrix.mtx'
#' @return A sparse matrix with rownames equivalent to the names in genes.txt
#' @export
#' @examples
#' \dontrun{
#' # Loads data from SPRING
#'
#' # dir is your path to the "categorical_coloring_data.json" file
#' dir = "./FullDataset_v1"
#'
#' # load expression data
#' E = CID.LoadData(data.dir = dir)
#' }
CID.LoadData <- function(data.dir, mfn = "matrix.mtx")
{
data.dir = gsub("\\/$", "", data.dir, perl = TRUE);
if (! (file.exists(paste(data.dir, mfn, sep = "/")) & file.exists(paste(data.dir, "genes.txt", sep = "/"))))
data.dir = dirname(data.dir)
gE <- paste(data.dir,mfn,sep="/")
flag = file.exists(gE);
if (!flag) {
cat("ERROR: from CID.LoadData:\n");
cat("file = ", gE, " does not exist.\n", sep = "");
stop()
}
E <- Matrix::readMM(gE)
# read genes
fn <-"genes.txt"
gG <- paste(data.dir,fn, sep = "/")
flag = file.exists(gG);
if (!flag) {
cat("ERROR: from CID.LoadData:\n");
cat("file = ", gG, " does not exist.\n", sep = "");
stop()
}
genes <- read.delim(gG, stringsAsFactors = FALSE, header = FALSE)$V1
if (genes[1] != gsub( "_.*$", "", genes[1] ))
{
genes[!grepl("Total", genes)] = gsub( "_.*$", "", genes [!grepl("Total", genes)])
genes[grepl("Total", genes)] = gsub( "_", "-", genes [grepl("Total", genes)])
}
if (any(grepl(pattern = "-ENSG00", x = genes)))
{
genes <-gsub(pattern = "-ENSG00", replacement = "_ENSG00",x = genes, fixed = TRUE)
genes[!grepl("Total", genes)] = gsub( "_.*$", "", genes [!grepl("Total", genes)])
genes[grepl("Total", genes)] = gsub( "_", "-", genes [grepl("Total", genes)])
}
if (grepl("^1", genes[1]))
genes = do.call(rbind, strsplit(genes, " "))[,2]
flag = length(genes) %in% c(nrow(E), ncol(E));
if (!flag) {
cat("ERROR: from CID.LoadData:\n");
cat("length of genes in genes.txt = ", length(genes), " is not equal to nrow(E) = ", nrow(E), ", or ncol(E) = ", ncol(E), "\n", sep = "");
stop()
}
if (nrow(E) != length(genes))
E = Matrix::t(E)
rownames(E) <- genes
E
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.