#' Generate data files required for shiny app
#'
#' Generate data files required for shiny app. Five files will be generated,
#' namely (i) the shinycell config \code{prefix_conf.rds}, (ii) the gene
#' mapping object config \code{prefix_gene.rds}, (iii) the single-cell gene
#' expression \code{prefix_gexpr.h5}, (iv) the single-cell metadata
#' \code{prefix_meta.rds} and (v) the defaults for the Shiny app
#' \code{prefix_def.rds}. A prefix is specified in each file to allow for the
#' use of multiple single-cell datasets in one Shiny app. Note that both
#' \code{makeShinyFiles} and \code{makeShinyCodes} functions are ran when
#' running the wrapper function \code{makeShinyApp}.
#'
#' @param obj input single-cell object for Seurat (v3+) / SingleCellExperiment
#' data or input file path for h5ad / loom files
#' @param scConf shinycell config data.table
#' @param gex.assay assay in single-cell data object to use for plotting
#' gene expression, which must match one of the following:
#' \itemize{
#' \item{Seurat objects}: "RNA" or "integrated" assay,
#' default is "RNA"
#' \item{SCE objects}: "logcounts" or "normcounts" or "counts",
#' default is "logcounts"
#' \item{h5ad files}: "X" or any assay in "layers",
#' default is "X"
#' \item{loom files}: "matrix" or any assay in "layers",
#' default is "matrix"
#' }
#' @param gex.slot slot in single-cell assay to plot. This is only used
#' for Seurat objects (v3+). Default is to use the "data" slot
#' @param gene.mapping specifies whether to convert human / mouse Ensembl gene
#' IDs (e.g. ENSG000xxx / ENSMUSG000xxx) into "user-friendly" gene symbols.
#' Set this to \code{TRUE} if you are using Ensembl gene IDs. Default is
#' \code{FALSE} which is not to perform any conversion. Alternatively, users
#' can supply a named vector where \code{names(gene.mapping)} correspond
#' to the actual gene identifiers in the gene expression matrix and
#' \code{gene.mapping} correspond to new identifiers to map to
#' @param shiny.prefix specify file prefix
#' @param shiny.dir specify directory to create the shiny app in
#' @param default.gene1 specify primary default gene to show
#' @param default.gene2 specify secondary default gene to show
#' @param default.multigene character vector specifying default genes to
#' show in bubbleplot / heatmap
#' @param default.dimred character vector specifying the two default dimension
#' reductions. Default is to use UMAP if not TSNE embeddings
#' @param chunkSize number of genes written to h5file at any one time. Lower
#' this number to reduce memory consumption. Should not be less than 10
#'
#' @return data files required for shiny app
#'
#' @author John F. Ouyang
#'
#' @import data.table hdf5r reticulate hdf5r
#'
#' @examples
#' makeShinyFiles(seu, scConf, gex.assay = "RNA", gex.slot = "data",
#' shiny.prefix = "sc1", shiny.dir = "shinyApp/",
#' default.gene1 = "GATA3", default.gene2 = "DNMT3L",
#' default.multigene = c("ANPEP","NANOG","ZIC2","NLGN4X","DNMT3L",
#' "DPPA5","SLC7A2","GATA3","KRT19"),
#' default.dimred = c("UMAP_1", "UMAP_2"))
#'
#' @export
makeShinyFiles <- function(
obj, scConf, gex.assay = NA, gex.slot = c("data", "scale.data", "counts"),
gene.mapping = FALSE, shiny.prefix = "sc1", shiny.dir = "shinyApp/",
default.gene1 = NA, default.gene2 = NA, default.multigene = NA,
default.dimred = NA, chunkSize = 500){
### Preprocessing and checks
# Generate defaults for gex.assay / gex.slot
if(class(obj)[1] == "Seurat"){
# Seurat Object
if(is.na(gex.assay[1])){gex.assay = "RNA"}
if(class(obj@assays[[gex.assay[1]]]) == "Assay5"){
# Seurat v5
# check if layers are joined
if(!(gex.slot[1] %in% names(obj@assays[[gex.assay[1]]]@layers))){
stop(paste0("gex.slot not found in gex.assay. ",
"Are layers joined? run obj <- JoinLayers(obj)"))
}
gex.matdim = dim(obj@assays[[gex.assay[1]]]@layers[[gex.slot[1]]])
gex.rownm = rownames(obj)
gex.colnm = colnames(obj)
}else{
gex.matdim = dim(slot(obj@assays[[gex.assay[1]]], gex.slot[1]))
gex.rownm = rownames(slot(obj@assays[[gex.assay[1]]], gex.slot[1]))
gex.colnm = colnames(slot(obj@assays[[gex.assay[1]]], gex.slot[1]))
}
# defGenes = obj@assays[[gex.assay[1]]]@var.features[1:10]
defGenes = Seurat::VariableFeatures(obj)[1:10]
if(is.na(defGenes[1])){
warning(paste0("Variable genes for seurat object not found! Have you ",
"ran `FindVariableFeatures` or `SCTransform`?"))
defGenes = gex.rownm[1:10]
}
sc1meta = data.table(sampleID = rownames(obj@meta.data), obj@meta.data)
} else if (class(obj)[1] == "SingleCellExperiment"){
# SCE Object
if(is.null(colnames(obj)[1])){
colnames(obj) = paste0("cell_", seq(ncol(obj)))
} # Populate cell IDs if they are not present
if(is.na(gex.assay[1])){gex.assay = "logcounts"}
gex.matdim = dim(SummarizedExperiment::assay(obj, gex.assay[1]))
gex.rownm = rownames(SummarizedExperiment::assay(obj, gex.assay[1]))
gex.colnm = colnames(SummarizedExperiment::assay(obj, gex.assay[1]))
defGenes = gex.rownm[1:10]
sc1meta = SingleCellExperiment::colData(obj)
sc1meta = data.table(sampleID = rownames(sc1meta),
as.data.frame(sc1meta))
} else if (tolower(tools::file_ext(obj)) == "h5ad"){
# h5ad file
if(is.na(gex.assay[1])){gex.assay = "X"}
# Can just check X since inpH5$layers should have same dimensions
ad <- import("anndata", convert = FALSE)
sp <- import('scipy.sparse', convert = FALSE)
inpH5 = ad$read_h5ad(obj)
gex.matdim = rev(unlist(py_to_r(inpH5$X$shape)))
gex.rownm = py_to_r(inpH5$var_names$values)
gex.colnm = py_to_r(inpH5$obs_names$values)
defGenes = gex.rownm[1:10]
sc1meta = data.table(sampleID = gex.colnm)
sc1meta = cbind(sc1meta, data.table(py_to_r(inpH5$obs$values)))
colnames(sc1meta) = c("sampleID", py_to_r(inpH5$obs$columns$values))
for(i in colnames(sc1meta)[-1]){
sc1meta[[i]] = unlist(sc1meta[[i]]) # unlist and refactor
if(as.character(inpH5$obs[i]$dtype) == "category"){
sc1meta[[i]] = factor(sc1meta[[i]], levels =
py_to_r(inpH5$obs[i]$cat$categories$values))
}
}
} else if (tolower(tools::file_ext(obj)) == "loom"){
# loom file
if(is.na(gex.assay[1])){gex.assay = "matrix"}
# Can just check matrix since inpLM[["layers"]] should have same dimensions
inpLM = H5File$new(obj, mode = "r+")
gex.matdim = rev(inpLM[["matrix"]]$dims)
gex.rownm = inpLM[["row_attrs"]][["Gene"]]$read()
for(i in unique(gex.rownm[duplicated(gex.rownm)])){
gex.rownm[gex.rownm == i] = paste0(i, "-", seq(sum(gex.rownm == i)))
} # make unique gene names
gex.colnm = inpLM[["col_attrs"]][["CellID"]]$read()
defGenes = gex.rownm[1:10]
cellIdx = which(inpLM[["col_attrs"]]$names == "CellID")
sc1meta = data.table(sampleID = gex.colnm)
for(i in inpLM[["col_attrs"]]$names[-cellIdx]){
tmp = inpLM[["col_attrs"]][[i]]$read()
if(length(tmp) == nrow(sc1meta)){sc1meta[[i]] = tmp}
}
} else {
stop("Only Seurat/SCE objects or h5ad/loom file paths are accepted!")
}
# Perform gene.mapping if specified (also map defGenes)
if(gene.mapping[1] == TRUE){
if(sum(grepl("^ENSG000", gex.rownm)) >= sum(grepl("^ENMUSG000", gex.rownm))){
tmp1 = fread(system.file("extdata", "geneMapHS.txt.gz",
package = "ShinyCell"))
} else {
tmp1 = fread(system.file("extdata", "geneMapMM.txt.gz",
package = "ShinyCell"))
}
gene.mapping = tmp1$geneName
names(gene.mapping) = tmp1$geneID
}
# Check if gene.mapping is partial or not
if(gene.mapping[1] == FALSE){
gene.mapping = gex.rownm
names(gene.mapping) = gex.rownm # Basically no mapping
} else {
if(!all(gex.rownm %in% names(gene.mapping))){
# warning("Mapping for some gene identifiers are not provided!")
tmp1 = gex.rownm[gex.rownm %in% names(gene.mapping)]
tmp1 = gene.mapping[tmp1]
tmp2 = gex.rownm[!gex.rownm %in% names(gene.mapping)]
names(tmp2) = tmp2
gene.mapping = c(tmp1, tmp2)
}
gene.mapping = gene.mapping[gex.rownm]
}
defGenes = gene.mapping[defGenes]
# Check default.gene1 / default.gene2 / default.multigene
default.gene1 = default.gene1[1]
default.gene2 = default.gene2[1]
if(is.na(default.gene1)){default.gene1 = defGenes[1]}
if(is.na(default.gene2)){default.gene2 = defGenes[2]}
if(is.na(default.multigene[1])){default.multigene = defGenes}
if(default.gene1 %in% gene.mapping){
default.gene1 = default.gene1
} else if(default.gene1 %in% names(gene.mapping)){
default.gene1 = gene.mapping[default.gene1]
} else {
warning("default.gene1 doesn't exist in gene expression, using defaults...")
default.gene1 = defGenes[1]
}
if(default.gene2 %in% gene.mapping){
default.gene2 = default.gene2
} else if(default.gene2 %in% names(gene.mapping)){
default.gene2 = gene.mapping[default.gene2]
} else {
warning("default.gene2 doesn't exist in gene expression, using defaults...")
default.gene2 = defGenes[2]
}
if(all(default.multigene %in% gene.mapping)){
default.multigene = default.multigene
} else if(all(default.multigene %in% names(gene.mapping))){
default.multigene = gene.mapping[default.multigene]
} else {
warning(paste0("default.multigene doesn't exist in gene expression, ",
"using defaults..."))
default.multigene = defGenes
}
### Actual object generation
# Make XXXmeta.rds and XXXconf.rds (updated with dimred info)
sc1conf = scConf
sc1conf$dimred = FALSE
sc1meta = sc1meta[, c("sampleID", as.character(sc1conf$ID)), with = FALSE]
# Factor metadata again
for(i in as.character(sc1conf[!is.na(fID)]$ID)){
sc1meta[[i]] = factor(sc1meta[[i]],
levels = strsplit(sc1conf[ID == i]$fID, "\\|")[[1]])
levels(sc1meta[[i]]) = strsplit(sc1conf[ID == i]$fUI, "\\|")[[1]]
sc1conf[ID == i]$fID = sc1conf[ID == i]$fUI
}
# Extract dimred and append to both XXXmeta.rds and XXXconf.rds...
if(class(obj)[1] == "Seurat"){
# Seurat Object
for(iDR in names(obj@reductions)){
drMat = obj@reductions[[iDR]]@cell.embeddings
if(ncol(drMat) > 5){drMat = drMat[, 1:5]} # Take first 5 components only
drMat = drMat[sc1meta$sampleID, ] # Ensure ordering
drMat = as.data.table(drMat)
sc1meta = cbind(sc1meta, drMat)
# Update sc1conf accordingly
tmp = data.table(ID = colnames(drMat), UI = colnames(drMat),
fID = NA, fUI = NA, fCL = NA, fRow = NA,
default = 0, grp = FALSE, dimred = TRUE)
tmp$UI = gsub("_", "", tmp$UI)
sc1conf = rbindlist(list(sc1conf, tmp))
}
} else if (class(obj)[1] == "SingleCellExperiment"){
# SCE Object
for(iDR in SingleCellExperiment::reducedDimNames(obj)){
drMat = SingleCellExperiment::reducedDim(obj, iDR)
if(ncol(drMat) > 5){drMat = drMat[, 1:5]} # Take first 5 components only
if(is.null(colnames(drMat))){
colnames(drMat) = paste0(iDR, seq(ncol(drMat)))
}
drMat = drMat[sc1meta$sampleID, ] # Ensure ordering
drMat = as.data.table(drMat)
sc1meta = cbind(sc1meta, drMat)
# Update sc1conf accordingly
tmp = data.table(ID = colnames(drMat), UI = colnames(drMat),
fID = NA, fUI = NA, fCL = NA, fRow = NA,
default = 0, grp = FALSE, dimred = TRUE)
tmp$UI = gsub("_", "", tmp$UI)
sc1conf = rbindlist(list(sc1conf, tmp))
}
} else if (tolower(tools::file_ext(obj)) == "h5ad"){
# h5ad file
for(iDR in py_to_r(inpH5$obsm_keys())){
drMat = py_to_r(inpH5$obsm[iDR])
tmpName = gsub("pca", "pc", gsub("X_", "", iDR))
tmpName = paste0(tmpName, "_", 1:ncol(drMat))
colnames(drMat) = tmpName
if(ncol(drMat) > 5){drMat = drMat[, 1:5]} # Take first 5 components only
drMat = as.data.table(drMat)
sc1meta = cbind(sc1meta, drMat)
# Update sc1conf accordingly
tmp = data.table(ID = colnames(drMat), UI = colnames(drMat),
fID = NA, fUI = NA, fCL = NA, fRow = NA,
default = 0, grp = FALSE, dimred = TRUE)
tmp$UI = gsub("_", "", tmp$UI)
sc1conf = rbindlist(list(sc1conf, tmp))
}
} else if (tolower(tools::file_ext(obj)) == "loom"){
# loom file
nDR = inpLM[["col_attrs"]]$names[
grep("pca|tsne|umap", inpLM[["col_attrs"]]$names, ignore.case = TRUE)]
for(iDR in nDR){
drMat = t(inpLM[["col_attrs"]][[iDR]]$read())
colnames(drMat) = paste0(iDR, "_", 1:ncol(drMat))
if(ncol(drMat) > 5){drMat = drMat[, 1:5]} # Take first 5 components only
drMat = as.data.table(drMat)
sc1meta = cbind(sc1meta, drMat)
# Update sc1conf accordingly
tmp = data.table(ID = colnames(drMat), UI = colnames(drMat),
fID = NA, fUI = NA, fCL = NA, fRow = NA,
default = 0, grp = FALSE, dimred = TRUE)
tmp$UI = gsub("_", "", tmp$UI)
sc1conf = rbindlist(list(sc1conf, tmp))
}
}
sc1conf$ID = as.character(sc1conf$ID) # Remove levels
# Make XXXgexpr.h5
if(!dir.exists(shiny.dir)){dir.create(shiny.dir)}
filename = paste0(shiny.dir, "/", shiny.prefix, "gexpr.h5")
sc1gexpr <- H5File$new(filename, mode = "w")
sc1gexpr.grp <- sc1gexpr$create_group("grp")
sc1gexpr.grp.data <- sc1gexpr.grp$create_dataset(
"data", dtype = h5types$H5T_NATIVE_FLOAT,
space = H5S$new("simple", dims = gex.matdim, maxdims = gex.matdim),
chunk_dims = c(1,gex.matdim[2]))
chk = chunkSize
while(chk > (gex.matdim[1]-8)){
chk = floor(chk / 2) # Account for cases where nGene < chunkSize
}
if(class(obj)[1] == "Seurat"){
# Seurat Object
if(class(obj@assays[[gex.assay[1]]]) == "Assay5"){
# Seurat v5
for(i in 1:floor((gex.matdim[1]-8)/chk)){
sc1gexpr.grp.data[((i-1)*chk+1):(i*chk), ] <- as.matrix(
obj@assays[[gex.assay[1]]]@layers[[gex.slot[1]]][((i-1)*chk+1):(i*chk),])
}
sc1gexpr.grp.data[(i*chk+1):gex.matdim[1], ] <- as.matrix(
obj@assays[[gex.assay[1]]]@layers[[gex.slot[1]]][(i*chk+1):gex.matdim[1],])
}else{
for(i in 1:floor((gex.matdim[1]-8)/chk)){
sc1gexpr.grp.data[((i-1)*chk+1):(i*chk), ] <- as.matrix(
slot(obj@assays[[gex.assay[1]]], gex.slot[1])[((i-1)*chk+1):(i*chk),])
}
sc1gexpr.grp.data[(i*chk+1):gex.matdim[1], ] <- as.matrix(
slot(obj@assays[[gex.assay[1]]], gex.slot[1])[(i*chk+1):gex.matdim[1],])
}
} else if (class(obj)[1] == "SingleCellExperiment"){
# SCE Object
for(i in 1:floor((gex.matdim[1]-8)/chk)){
sc1gexpr.grp.data[((i-1)*chk+1):(i*chk), ] <- as.matrix(
SummarizedExperiment::assay(obj, gex.assay[1])[((i-1)*chk+1):(i*chk),])
}
sc1gexpr.grp.data[(i*chk+1):gex.matdim[1], ] <- as.matrix(
SummarizedExperiment::assay(obj, gex.assay[1])[(i*chk+1):gex.matdim[1],])
} else if (tolower(tools::file_ext(obj)) == "h5ad"){
# h5ad file
if(gex.assay == "X"){
scGEX = Matrix::t(py_to_r(sp$csc_matrix(inpH5$X)))
} else {
scGEX = Matrix::t(py_to_r(sp$csc_matrix(inpH5$layers[[gex.assay]])))
}
for(i in 1:floor((gex.matdim[1]-8)/chk)){
sc1gexpr.grp.data[((i-1)*chk+1):(i*chk), ] <- as.matrix(
scGEX[((i-1)*chk+1):(i*chk),])
}
sc1gexpr.grp.data[(i*chk+1):gex.matdim[1], ] <- as.matrix(
scGEX[(i*chk+1):gex.matdim[1],])
} else if (tolower(tools::file_ext(obj)) == "loom"){
# loom file
if(gex.assay == "matrix"){
for(i in 1:floor((gex.matdim[1]-8)/chk)){
sc1gexpr.grp.data[((i-1)*chk+1):(i*chk), ] <- t(
inpLM[["matrix"]][, ((i-1)*chk+1):(i*chk)])
}
sc1gexpr.grp.data[(i*chk+1):gex.matdim[1], ] <- t(
inpLM[["matrix"]][, (i*chk+1):gex.matdim[1]])
} else {
for(i in 1:floor((gex.matdim[1]-8)/chk)){
sc1gexpr.grp.data[((i-1)*chk+1):(i*chk), ] <- t(
inpLM[["layers"]][[gex.assay]][, ((i-1)*chk+1):(i*chk)])
}
sc1gexpr.grp.data[(i*chk+1):gex.matdim[1], ] <- t(
inpLM[["layers"]][[gex.assay]][, (i*chk+1):gex.matdim[1]])
}
}
# sc1gexpr.grp.data[, ] <- as.matrix(gex.matrix[,])
sc1gexpr$close_all()
if(!isTRUE(all.equal(sc1meta$sampleID, gex.colnm))){
sc1meta$sampleID = factor(sc1meta$sampleID, levels = gex.colnm)
sc1meta = sc1meta[order(sampleID)]
sc1meta$sampleID = as.character(sc1meta$sampleID)
}
# Make XXXgenes.rds
sc1gene = seq(gex.matdim[1])
names(gene.mapping) = NULL
names(sc1gene) = gene.mapping
sc1gene = sc1gene[order(names(sc1gene))]
sc1gene = sc1gene[order(nchar(names(sc1gene)))]
# Make XXXdef.rds (list of defaults)
if(all(default.dimred %in% sc1conf[dimred == TRUE]$ID)){
default.dimred[1] = sc1conf[ID == default.dimred[1]]$UI
default.dimred[2] = sc1conf[ID == default.dimred[2]]$UI
} else if(all(default.dimred %in% sc1conf[dimred == TRUE]$UI)) {
default.dimred = default.dimred # Nothing happens
} else {
warn = TRUE
if(is.na(default.dimred[1])){
default.dimred = "umap"
warn = FALSE
}
# Try to guess... and give a warning
guess = gsub("[0-9]", "", default.dimred[1])
if(length(grep(guess, sc1conf[dimred == TRUE]$UI, ignore.case=TRUE)) >= 2){
default.dimred = sc1conf[dimred == TRUE]$UI[
grep(guess, sc1conf[dimred == TRUE]$UI, ignore.case = TRUE)[1:2]]
} else {
nDR = length(sc1conf[dimred == TRUE]$UI)
default.dimred = sc1conf[dimred == TRUE]$UI[(nDR-1):nDR]
}
if(warn){
warning(paste0("default.dimred not found, switching to ",
default.dimred[1], " and ", default.dimred[1]))
} # Warn if user-supplied default.dimred is not found
}
# Note that we stored the display name here
sc1def = list()
sc1def$meta1 = sc1conf[default == 1]$UI # Use display name
sc1def$meta2 = sc1conf[default == 2]$UI # Use display name
sc1def$gene1 = default.gene1 # Actual == Display name
sc1def$gene2 = default.gene2 # Actual == Display name
sc1def$genes = default.multigene # Actual == Display name
sc1def$dimred = default.dimred # Use display name
tmp = nrow(sc1conf[default != 0 & grp == TRUE])
if(tmp == 2){
sc1def$grp1 = sc1def$meta1
sc1def$grp2 = sc1def$meta2
} else if(tmp == 1){
sc1def$grp1 = sc1conf[default != 0 & grp == TRUE]$UI
if(nrow(sc1conf[default == 0 & grp == TRUE]) == 0){
sc1def$grp2 = sc1def$grp1
} else {
sc1def$grp2 = sc1conf[default == 0 & grp == TRUE]$UI[1]
}
} else {
sc1def$grp1 = sc1conf[default == 0 & grp == TRUE]$UI[1]
if(nrow(sc1conf[default == 0 & grp == TRUE]) < 2){
sc1def$grp2 = sc1def$grp1
} else {
sc1def$grp2 = sc1conf[default == 0 & grp == TRUE]$UI[2]
}
}
sc1conf = sc1conf[, -c("fUI", "default"), with = FALSE]
### Saving objects
saveRDS(sc1conf, file = paste0(shiny.dir, "/", shiny.prefix, "conf.rds"))
saveRDS(sc1meta, file = paste0(shiny.dir, "/", shiny.prefix, "meta.rds"))
saveRDS(sc1gene, file = paste0(shiny.dir, "/", shiny.prefix, "gene.rds"))
saveRDS(sc1def, file = paste0(shiny.dir, "/", shiny.prefix, "def.rds"))
return(sc1conf)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.