# loom.R
#
#*******************# #
# Constants ----
#*******************# #
# Column attributes
CA_CELLID <- "CellID"
CA_DFLT_CLUSTERS_NAME <- "ClusterName"
CA_DFLT_CLUSTERS_ID <- "ClusterID"
CA_EMBEDDING_NAME <- "Embedding"
CA_EMBEDDING_DFLT_CNAMES <- c("_X","_Y")
CA_EXTRA_EMBEDDINGS_X_NAME <- "Embeddings_X"
CA_EXTRA_EMBEDDINGS_Y_NAME <- "Embeddings_Y"
CA_EXTRA_EMBEDDINGS_NAMES <- c(CA_EXTRA_EMBEDDINGS_X_NAME, CA_EXTRA_EMBEDDINGS_Y_NAME)
CA_CLUSTERINGS_NAME <- "Clusterings"
# CA_REGULONS_AUC <- "RegulonsAUC" # Deprecated?
# Row attributes
RA_CLUSTERING_MARKERS_NAME <- "ClusterMarkers"
RA_GENE_NAME <- "Gene"
# RA_REGULONS <- "Regulons"# Deprecated?
# Global attributes
GA_METADATA_NAME <- "MetaData"
GA_METADATA_ANNOTATIONS_NAME <- "annotations"
GA_METADATA_METRICS_NAME <- "metrics"
GA_METADATA_CLUSTERINGS_NAME <- "clusterings"
GA_METADATA_EMBEDDINGS_NAME <- "embeddings"
GA_METADATA_CLUSTERINGS_CLUSTER_MARKER_METRICS_NAME <- "clusterMarkerMetrics"
GA_TITLE_NAME <- "title"
GA_TITLE_GENOME <- "Genome"
GA_CREATION_DATE_NAME <- "CreationDate"
GA_R_VERSION_NAME <- "RVersion"
GA_LOOM_SPEC_VERSION <- "LOOM_SPEC_VERSION"
# Miscellaneous
BASE64_PATTERN <- "^([A-Za-z0-9+/]{4})*([A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{2}==)?$"
#*******************#
#*******************#
# DATA INSERTION #
#*******************#
#*******************#
#*******************#
# Global attribute data functions ----
#*******************#
#'@title create_hierarchy
#'@description Create hierarchy as named list of tree levels (SCopeTreeL1, SCopeTreeL2, SCopeTreeL3)
#'@param level.1.name Name of top level of the hierarchy.
#'@param level.2.name Name of the second level of the hierarchy.
#'@param level.3.name Name of the third level of the hierarchy.
#'@export
create_hierarchy <- function(
level.1.name = NULL,
level.2.name = NULL,
level.3.name = NULL
) {
if(is.null(x = level.1.name))
stop("You need to define at least the first level when creating a hierarchy.")
if(is.null(x = level.2.name) && !is.null(x = level.3.name))
stop("You cannot define the third level while the second level is not defined.")
if(is.null(x = level.1.name))
level.1.name<-""
if(is.null(x = level.2.name))
level.2.name<-""
if(is.null(x = level.3.name))
level.3.name<-""
return (
list(
"SCopeTreeL1"=level.1.name,
"SCopeTreeL2"=level.2.name,
"SCopeTreeL3"=level.3.name
)
)
}
#'@title add_hierarchy
#'@description Add a 3-level hierarchy as global attributes to the given loom.
#'@param loom The loom file handler.
#'@param hierarchy A named list of the 3-levels names. Use create_hierarchy() to build it.
#'@export
add_hierarchy <- function(
loom,
hierarchy,
overwrite = FALSE
) {
if(loom$mode=="r") stop("File open as read-only.")
he <- hierarchy_exists(loom = loom)
if(he && !overwrite) {
stop("Hierarchy already exists for the given loom. You can overwrite the hierarchy in the given loom by the given hierarchy by setting overwrite option to TRUE.")
}
for(idx in seq_along(along.with = hierarchy)) {
if(!he | !overwrite) {
add_global_attr(
loom = loom,
key = names(x = hierarchy)[idx],
value = hierarchy[[idx]]
)
} else {
update_global_attr(
loom = loom,
key = names(x = hierarchy)[idx],
value = hierarchy[[idx]]
)
}
}
}
#'@title hierarchy_exists
#'@description Check if hierarchy exists for the given loom.
#'@param loom The loom file handler.
#'@export
hierarchy_exists <- function(
loom
) {
if(is_loom_spec_version_3_or_greater(loom = loom))
return(
loom$link_exists(name = paste0("attrs/", "SCopeTreeL1"))
| loom$link_exists(name = paste0("attrs/", "SCopeTreeL2"))
| loom$link_exists(name = paste0("attrs/", "SCopeTreeL3"))
)
return (
loom$attr_exists(attr_name = "SCopeTreeL1")
| loom$attr_exists(attr_name = "SCopeTreeL2")
| loom$attr_exists(attr_name = "SCopeTreeL3")
)
}
#*******************#
# Global Meta data functions ----
#*******************#
is_loom_spec_version_3_or_greater <- function(loom) {
return (get_global_loom_spec_version(loom = loom) >= 3)
}
get_global_loom_spec_version <- function(loom) {
if(loom$link_exists(name = "attrs")) {
loom.spec.version <- loom[["attrs"]][[GA_LOOM_SPEC_VERSION]][]
if(loom.spec.version < 3)
stop(paste0("Corrupted loom file: expecting LOOM_SPEC_VERSION 3 but it's different (",loom.spec.version,") !"))
return (loom.spec.version)
} else {
# If not exists, add attribute
if(GA_LOOM_SPEC_VERSION %in% list.attributes(object = loom)) {
loom.spec.version <- h5attr(x=loom, which=GA_LOOM_SPEC_VERSION)
} else {
tryCatch(
{
warning("LOOM_SPEC_VERSION attribute not detected. This loom file has probably been generated SCopeLoomR version < 0.6.0.
Adding this attribute to the loom file to follow Loompy standards...")
h5attr(x=loom, which=GA_LOOM_SPEC_VERSION) <- "2.0.0"
# add_global_loom_spec_version(loom=loom, loom.spec.version=2) # why not this?
},
error = function(e) {
e$message <- paste0("It was not possible to update the attribute. Maybe the file is open in mode 'read-only'? (mode='r')\n",
e$message)
stop(e)
}
)
loom.spec.version <- h5attr(x=loom, which=GA_LOOM_SPEC_VERSION)
}
if(loom.spec.version >= 3)
stop(paste0("Corrupted loom file: expecting LOOM_SPEC_VERSION 2 but it's different (",loom.spec.version,") !"))
return (loom.spec.version)
}
}
add_global_loom_spec_version <- function(
loom,
loom.spec.version
) {
dtype <- guess_dtype(x = "foo")
# Encode character vector to UTF-8
dtype <- hdf5_utf8_encode(
value = "foo",
dtype = dtype
)
if(loom.spec.version == 3) {
loom$create_dataset(
name = paste0("attrs/", GA_LOOM_SPEC_VERSION),
robj = "3.0.0",
dtype = dtype,
space = get_dspace(x = "scalar"),
chunk_dims = NULL
)
} else if(loom.spec.version == 2) {
loom$create_attr(
attr_name = GA_LOOM_SPEC_VERSION,
robj = "2.0.0",
dtype = dtype,
space = get_dspace(x = "scalar")
)
} else {
stop("Invalid loom.spec.version!")
}
}
#'@title add_global_md_metric
#'@description Add the metric with the given name to the global MetaData attribute.
#'@param loom The loom file handler.
#'@param name The name of the annotation.
#'@param values The values of the annotation to be added.
add_global_md_metric <- function(
loom,
name
) {
if(loom$mode=="r") stop("File open as read-only.")
gmd <- get_global_meta_data(loom = loom)
a <- gmd[[GA_METADATA_METRICS_NAME]]
a[[length(a)+1]] <- list(name = name)
gmd[[GA_METADATA_METRICS_NAME]] <- NULL
gmd[[GA_METADATA_METRICS_NAME]] <- a
update_global_meta_data(
loom = loom,
meta.data.json = rjson::toJSON(x = gmd)
)
}
#'@title add_global_md_annotation
#'@description Add the annotation with the given name to the global MetaData attribute.
#'@param loom The loom file handler.
#'@param name The name of the annotation.
#'@param values The values of the annotation to be added.
add_global_md_annotation <- function(
loom,
name,
values
) {
if(loom$mode=="r") stop("File open as read-only.")
gmd <- get_global_meta_data(loom = loom)
a <- gmd[[GA_METADATA_ANNOTATIONS_NAME]]
a[[length(a)+1]] <- list(
name = name,
values = as.list(
x = as.character(
x = unique(
x = values
)
)
)
)
gmd[[GA_METADATA_ANNOTATIONS_NAME]] <- NULL
gmd[[GA_METADATA_ANNOTATIONS_NAME]] <- a
update_global_meta_data(
loom = loom,
meta.data.json = rjson::toJSON(x = gmd)
)
}
#'@title init_global_md_embeddings
#'@description Remove all embeddings from the given loom.
#'@param loom The loom file handler.
#'@export
init_global_md_embeddings <- function(loom) {
gmd <- get_global_meta_data(loom = loom)
gmd[[GA_METADATA_EMBEDDINGS_NAME]] <- NULL
gmd[[GA_METADATA_EMBEDDINGS_NAME]] <- list()
update_global_meta_data(
loom = loom,
meta.data.json = rjson::toJSON(x = gmd)
)
}
#'@title add_global_md_embedding
#'@description Add the embedding with the given name to the global MetaData attribute.
#'@param loom The loom file handler.
#'@param name The name of the embedding to add.
#'@param is.default Is the given embedding the default embedding to use in the .loom file.
#'@param trajectory Adata.frame storing the X (first column) and Y (second column) coordinates of the trajectory.
add_global_md_embedding <- function(
loom,
id,
name,
is.default = F,
trajectory = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
gmd <- get_global_meta_data(loom = loom)
e <- gmd[[GA_METADATA_EMBEDDINGS_NAME]]
idx <- length(x = e)+1
e[[idx]] <- list(
id = as.character(x = id),
name = name
)
if(!is.null(x = trajectory)) {
e[[idx]][["trajectory"]] <- trajectory
}
gmd[[GA_METADATA_EMBEDDINGS_NAME]] <- NULL
gmd[[GA_METADATA_EMBEDDINGS_NAME]] <- e
update_global_meta_data(
loom = loom,
meta.data.json = rjson::toJSON(x = gmd)
)
}
get_global_md_clusterings <- function(
loom
) {
return (get_global_meta_data(loom = loom)[[GA_METADATA_CLUSTERINGS_NAME]])
}
#'@title get_cluster_info_by_id
#'@description Get the value of the given field from the cluster with the given clustering_idx and cluster_idx
#'@param loom The loom file handler.
#'@param clustering_id The ID of the clustering you want to get the field value from.
#'@param cluster_id `The ID of the cluster you want to get the field value from.
#'@param field The name of the field you want to get the value from.
#'@export
get_cluster_info_by_id <- function(
loom,
clustering.id,
cluster.idx,
field = NULL
) {
if(is.null(x = field)) {
stop("Missing field argument in get_cluster_info_by_idx.")
}
gmd_clusterings <- get_global_md_clusterings(loom = loom)
gmd_clustering <- get_global_md_clustering_by_id(
loom = loom,
clustering.id = clustering.id
)
gmd_cluster <- gmd_clustering$clusters[[cluster.idx]]
if(!(field %in% names(x = gmd_cluster))) {
stop(paste0("The given field", field, " is not a valid field for the given cluster ", cluster.idx, "within the clustering wiith the given ID ", clustering.id, "."))
}
return (gmd_cluster[[field]])
}
#'@title get_cluster_names
#'@description Get the cluster names of the given clustering object
#'@param loom The loom file handler.
#'@param clustering The clustering object from the global MetaData attribute
#'@export
get_cluster_names <- function(
clustering
) {
return (
sapply(X = clustering$clusters, FUN = function(cluster) return (cluster$description))
)
}
#'@title get_global_md_clustering_by_id
#'@description Get the clustering object from the global MetaData attribute with the given clustering.id
#'@param loom The loom file handler.
#'@param clustering.id The ID of the clustering you want to get the field value from.
#'@export
get_global_md_clustering_by_id <- function(
loom,
clustering.id
) {
gmd_clusterings <- get_global_md_clusterings(loom = loom)
gmd_clustering <- list.find(
.data = gmd_clusterings,
id == clustering.id
)
if(length(x = gmd_clustering) != 1) {
stop(paste0("The clustering with the given ID ", clustering.id, " does not exist in this loom file."))
}
return (gmd_clustering[[1]])
}
#'@title get_global_md_clustering_by_name
#'@description Get the clustering object from the global MetaData attribute with the given clustering.name
#'@param loom The loom file handler.
#'@param clustering.name The ID of the clustering you want to get the field value from.
#'@export
get_global_md_clustering_by_name <- function(
loom,
clustering.name
) {
gmd_clusterings <- get_global_md_clusterings(loom = loom)
gmd_clustering <- list.find(
.data = gmd_clusterings,
name == clustering.name
)
if(length(x = gmd_clustering) != 1) {
stop(paste0("The given clustering ", clustering.name, " does not exist in this loom file."))
}
return (gmd_clustering[[1]])
}
#'@title get_global_md_clustering_info_by_name
#'@description Get the clustering object from the global MetaData attribute with the given clustering.name
#'@param loom The loom file handler.
#'@param clustering.name The name of the clustering you want to get the field value from.
#'@param field The field name to get the value from.
#'@export
get_global_md_clustering_info_by_name <- function(
loom,
clustering.name,
field = NULL
) {
gmd_clustering <- get_global_md_clustering_by_name(
loom = loom,
clustering.name = clustering.name
)
if(!(field %in% names(x = gmd_clustering))) {
stop(paste0("The given field", field, " is not a valid field for the given clustering ", clustering.name, "."))
}
return (gmd_clustering[[field]])
}
#'@title add_global_md_clustering_kv
#'@description Add given value with the given key to the clustering with the given clustering.id of the global MetaData attribute
#'@param loom The loom file handler.
#'@param clustering.id The clustering ID of the clustering which the key/value is added to.
#'@param key The name of the attribute to add.
#'@param value The value of the attribute to add.
add_global_md_clustering_kv <- function(
loom,
clustering.id,
key,
value
) {
if(loom$mode=="r") stop("File open as read-only.")
gmd <- get_global_meta_data(loom = loom)
c <- gmd[[GA_METADATA_CLUSTERINGS_NAME]]
# Filter the clusterings based on the given clustering.id
mask <- lapply(
X = c,
FUN = function(x) {
return (x[["id"]])
}
) == clustering.id
if(sum(mask) == 0) {
stop("Cannot add key/value to clustering that does not exist.")
}
idx <- which(mask == T)
tmp.clustering <- c[mask][[1]]
tmp.clustering[[key]] <- value
gmd[[GA_METADATA_CLUSTERINGS_NAME]][[idx]] <- tmp.clustering
update_global_meta_data(
loom = loom,
meta.data.json = rjson::toJSON(x = gmd)
)
flush(loom = loom)
}
#'@title add_global_md_clustering
#'@description Add the clustering annotation to the global MetaData attribute.
#'@param loom The loom file handler.
#'@param group The name of the group of clusterings.
#'@param name The name given to the given clustering.
#'@param clusters A list of the the cluster id for each cell present in the same order as in the columns of gene expression matrix.
add_global_md_clustering <- function(
loom,
id,
group,
name,
clusters,
annotation = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
if(sum(is.na(x = names(x = clusters))) > 0) {
stop("The names of the given clusters contains NAs.")
}
gmd <- get_global_meta_data(loom = loom)
c <- gmd[[GA_METADATA_CLUSTERINGS_NAME]]
unique.clusters <- get_unique_clusters(clusters = clusters)
clusters <- lapply(
X = seq_along(
along.with = unique.clusters
),
FUN = function(cluster.idx) {
cluster.id <- unique.clusters[[cluster.idx]]
if(is.numeric(x = cluster.id)) {
description <- paste("NDA - Cluster", cluster.id)
} else if(is.character(x = cluster.id)) {
cluster.id <- cluster.idx - 1
description <- cluster.id
} else {
stop("Cluster labels are required to be of class character or numeric.")
}
if(!is.null(x = annotation)) {
# Force to have the same order
annotation <- annotation[names(x = clusters)]
# If annotation for the current cluster not empty then add
# d<-annotation[annotation[[annotation.cluster.id.cl]] == cluster.id, annotation.cluster.description.cl]
# Convert from factor to character vector to be used with nchar
d <- as.character(x = unique(x = annotation[clusters == cluster.id]))
if(length(x = d) > 1) {
stop("Annotation is not unique: multiple annotation correspond to a cluster ID.")
}
if(nchar(x = d)>0) {
description <- paste0(d, " (",cluster.id,")")
}
}
# IDs should start from 0
return (
list(
id = cluster.id,
description = description
)
)
}
)
clusterings <- get_clusterings(loom = loom)
clustering <- list(
id = id,
group = group,
name = name,
clusters = clusters
)
c[[length(c)+1]] <- clustering
gmd[[GA_METADATA_CLUSTERINGS_NAME]] <- NULL
gmd[[GA_METADATA_CLUSTERINGS_NAME]] <- c
update_global_meta_data(
loom = loom,
meta.data.json = rjson::toJSON(x = gmd)
)
flush(loom = loom)
}
#'@title update_cluster_descriptions_by_cluster_annotation_mapping_df
#'@description Update the cluster descriptions of the clusters of the given clustering_name
#'@param loom The loom file handler.
#'@param clustering.name The name of the clustering.
#'@param annotation.df The data.frame containing the cluster annotatiion mapping
#'@param annotation.df.cluster.id.column.name The column name of the given annotation.df containing the cluster ID values
#'@param annotation.df.annotation.column.name The column name of the given annotation.df containing the annotation values.
#'@param in.place If true update the given loom, otherwise return the update global meta data.
#'@export
update_cluster_descriptions_by_cluster_annotation_mapping_df <- function(
loom,
clustering.name,
annotation.df = NULL,
annotation.df.cluster.id.column.name = NULL,
annotation.df.annotation.column.name = NULL,
in.place = TRUE
) {
if(loom$mode=="r") stop("File open as read-only.")
## make sure annotation.df cluster id column is numeric!
annotation.df[, annotation.df.cluster.id.column.name] <- as.numeric(annotation.df[, annotation.df.cluster.id.column.name])
gmd <- get_global_meta_data(loom = loom)
gmd_clusterings <- gmd[[GA_METADATA_CLUSTERINGS_NAME]]
c <- gmd[[GA_METADATA_CLUSTERINGS_NAME]]
gmd_clustering <- rlist::list.filter(
.data = gmd_clusterings,
name == clustering.name
)[[1]]
gmd_clustering_idx <- list.findi(
.data = gmd_clusterings,
cond = name == clustering.name
)[[1]]
if(is.null(x = annotation.df)
| is.null(x = annotation.df.cluster.id.column.name)
| is.null(x = annotation.df.annotation.column.name)) {
cluster_ids <- do.call(
what = "c",
args = lapply(
X = gmd_clustering$clusters,
FUN = function(cluster) {
cluster$id
}
)
)
reset <- TRUE
} else {
if(!(annotation.df.cluster.id.column.name %in% colnames(x = annotation.df))) {
stop(paste0("The given '",annotation.df.cluster.id.column.name,"' does not exist in the given annotation.df."))
}
if(!(annotation.df.annotation.column.name %in% colnames(x = annotation.df))) {
stop(paste0("The given '",annotation.df.annotation.column.name,"' does not exist in the given annotation.df."))
}
annotation_names <- annotation.df[[annotation.df.annotation.column.name]]
cluster_ids <- annotation.df[[annotation.df.cluster.id.column.name]]
annotation_as_named_list <- setNames(
object = annotation_names,
nm = cluster_ids
)
reset <- FALSE
}
clusters <- lapply(
X = cluster_ids,
FUN = function(cluster.id) {
cluster <- rlist::list.filter(gmd_clustering$clusters, id == cluster.id)[[1]]
description <- paste("NDA - Cluster", cluster$id)
if(!reset) {
# If annotation for the current cluster not empty then add
# Convert from factor to character vector to be used with nchar
d <- as.character(x = annotation_as_named_list[as.character(cluster.id)])
if(length(d) > 1) {
stop("Annotation is not unique: multiple annotation correspond to a cluster ID.")
}
if(nchar(d)>0) {
description <- paste0(d, " (",cluster.id,")")
}
}
return (
list(
id = cluster.id,
description = description
)
)
}
)
gmd_clustering$clusters <- clusters
c[[gmd_clustering_idx]] <- gmd_clustering
gmd[[GA_METADATA_CLUSTERINGS_NAME]] <- NULL
gmd[[GA_METADATA_CLUSTERINGS_NAME]] <- c
if(in.place) {
update_global_meta_data(
loom = loom,
meta.data.json = rjson::toJSON(x = gmd)
)
flush(loom = loom)
} else {
return (gmd)
}
}
#'@title add_global_md_regulon_thresholds
#'@description Add the regulon thresholds annotation to the global MetaData attribute.
#'@param loom The loom file handler.
#'@param regulon.threshold.assignments The automated regulon cell assignments object generated by SCENIC.
#'@param regulon.enrichment.table The regulon enrichment table generated by SCENIC.
add_global_md_regulon_thresholds <-function(
loom,
regulon.threshold.assignments,
regulon.enrichment.table = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
gmd <- get_global_meta_data(loom = loom)
rT <- gmd[["regulonThresholds"]]
if(!is.null(x = regulon.enrichment.table)) {
# Store the motif name
if(!("NES"%in%colnames(x = regulon.enrichment.table))) {
stop("Error: NES column is not provided in the regulon enrichment table!")
}
if(!("motif"%in%colnames(x = regulon.enrichment.table))) {
stop("Error: motif column is not provided in the regulon enrichment table!")
}
regulon_enrichment_table <- regulon.enrichment.table[with(data = regulon.enrichment.table, order(NES, decreasing = T)), ]
regulon_enrichment_table <- regulon_enrichment_table[which(x = regulon_enrichment_table$TFinDB!=""),]
}
for(regulon in names(x = regulon.threshold.assignments)) {
regulon_name <- strsplit(x = regulon, split = " ")[[1]][1]
# Store the threshold assignments
rta <- regulon.threshold.assignments[[regulon]]
AUC_thresholds <- rta$aucThr$thresholds
all_thresholds <- do.call(
what = "c",
args = lapply(
X = row.names(x = AUC_thresholds),
FUN = function(threshold.name) {
l <- list()
l[[threshold.name]] <- AUC_thresholds[threshold.name,"threshold"]
return (l)
}
)
)
AUC_selected_threshold <- rta$aucThr$selected
# Add regulon thresholds
regulon_tresholds <- list(
regulon = gsub(pattern = " ", replacement = "_", x = regulon),
defaultThresholdValue = as.numeric(x = AUC_selected_threshold),
defaultThresholdName = names(x = AUC_selected_threshold),
allThresholds = all_thresholds
)
# Add the motif data
if(!is.null(x = regulon.enrichment.table)) {
regulon_tf <- gsub(
pattern = "_extended",
replacement = "",
x = regulon_name
)
met_subset <- regulon_enrichment_table[regulon_enrichment_table$highlightedTFs == regulon_tf,]
motif_by_type <- split(
x = met_subset$motif,
f = met_subset$TFinDB
)
motif_name = NULL
if(grepl("_extended",regulon_name)) {
motif_name = motif_by_type[["*"]][1] ## Extended
} else{
motif_name = motif_by_type[["**"]][1] ## High confidence annotation
}
motif_data <- paste0(motif_name,".png")
regulon_tresholds <- append(
x = regulon_tresholds,
values = list("motifData"=motif_data)
)
} else {
warning("Argument 'regulon.enrichment.table' is not provided. Motif data will not be stored in the given .loom.")
}
rT[[length(x = rT)+1]] <- regulon_tresholds
}
gmd[["regulonThresholds"]] <- NULL
gmd[["regulonThresholds"]] <- rT
update_global_meta_data(
loom = loom,
meta.data.json = rjson::toJSON(x = gmd)
)
flush(loom = loom)
}
#'@title check_global_meta_data
#'@description Perform some sanity checks on the MetaData global attribute
#'@param loom The loom file handler.
check_global_meta_data <- function(
loom
) {
if(!global_meta_data_exists(loom = loom)) {
stop("The MetaData global attribute does not exist in the given .loom. To be able to run this function please first initialize the MetaData global attribute by running init_global_meta_data(loom = loom).")
}
}
#'@title global_meta_data_exists
#'@description Check if global MetaData attribute exists for the given loom.
#'@param loom The loom file handler.
global_meta_data_exists <- function(
loom
) {
return (
loom$attr_exists(
attr_name = GA_METADATA_NAME
) | loom$link_exists(name = paste0("attrs/", GA_METADATA_NAME))
)
}
load_global_meta_data <- function(
meta.data
) {
if(!is_base64_encoded(value = meta.data)) {
if(!is_json(value = meta.data)) {
stop("The global MetaData attribute in the given loom is corrupted.")
}
meta_data <- meta.data
} else {
meta_data <- decompress_gzb64(gzb64c = meta.data)
}
return (
rjson::fromJSON(
json_str = meta_data
)
)
}
#'@title get_global_meta_data
#'@description Get the global MetaData attribute as a R object.
#'@param loom The loom file handler.
#'@export
get_global_meta_data <- function(
loom
) {
# Check if the given loom is v3
if(loom$link_exists(name = "attrs")) {
md <- loom[["attrs"]][["MetaData"]][]
} else {
md <- h5attr(x = loom, which = GA_METADATA_NAME)
}
return (
load_global_meta_data(
meta.data = md
)
)
}
#'@title update_global_meta_data
#'@description Update the global MetaData attribute with the given meta.data.json.
#'@param loom The loom file handler.
#'@param meta.data.json The meta data stored as a json string.
update_global_meta_data <- function(
loom,
meta.data.json
) {
if(is_loom_spec_version_3_or_greater(loom = loom)) {
meta_data_json <- as.character(x = meta.data.json)
} else {
meta_data_json <- compress_gzb64(c = as.character(x = meta.data.json))
}
update_global_attr(
loom = loom,
key = GA_METADATA_NAME,
value = meta_data_json
)
}
#'@title init_global_meta_data
#'@description Initialize the global MetaData attribute.
#'@param loom The loom file handler.
#'@param loom.spec.version The version of the loom specification. Choose on of: 2 or 3
#'@export
init_global_meta_data <- function(
loom,
loom.spec.version
) {
if(!(loom.spec.version %in% c(2,3)))
stop("Invalid loom.spec.version!")
meta_data <- list(
annotations = list(),
metrics = list(),
embeddings = list(),
clusterings = list(),
regulonThresholds = list()
)
meta_data_json <- rjson::toJSON(x = meta_data)
if(GA_METADATA_NAME %in% list.attributes(object = loom))
stop("The MetaData global attribute already exists.")
if(loom.spec.version < 3) {
md <- compress_gzb64(
c = as.character(x = meta_data_json)
)
} else {
md <- meta_data_json
}
add_global_attr(
loom = loom,
key = GA_METADATA_NAME,
value = as.character(x = md)
)
}
#'@title clear_global_meta_data
#'@description Empty the global MetaData attribute.
#'@param loom The loom file handler.
clear_global_meta_data <- function(
loom
) {
meta_data <- list(
annotations = list(),
metrics = list(),
embeddings = list(),
clusterings = list(),
regulonThresholds = list()
)
meta_data_json <- rjson::toJSON(x = meta_data)
if(!(GA_METADATA_NAME %in% list.attributes(object = loom))) {
compressed.meta.data <- compress_gzb64(
c = as.character(x = meta_data_json)
)
add_global_attr(
loom = loom,
key = GA_METADATA_NAME,
value = as.character(x = compressed.meta.data)
)
} else {
update_global_meta_data(
loom = loom,
meta.data.json = as.character(x = meta_data_json)
)
}
}
#*******************#
# Embedding functions ----
#*******************#
#'@title add_embedding_dyno
#'@description Add the given embedding as a column attribute and meta data related to the given embeddding to the given .loom file handler.
#'@param loom The loom file handler.
#'@param dyno A dyno object generated by dyno::infer_trajectory.
#'@param name A name to give to the embedding/trajectory inferred by the trajectory method.
#'@export
add_embedding_from_dyno <- function(loom, dyno, name = NULL) {
if(is.null(x = name)) {
stop("Please specify a name for the given analysis run with dyno.")
}
add_embedding(
loom = loom,
embedding = dyno$dimred,
name = name,
trajectory = create_trajectory_from_dyno(dyno = dyno),
is.default = !col_attrs_exists_by_key(loom = loom, key = "Embedding")
)
}
#'@title add_default_embedding
#'@description Add the main (default) embedding to the given loom file handler.
#'@param loom The loom file handler.
#'@param embedding The embedding to add.
add_default_embedding <- function(
loom,
embedding
) {
if(loom$mode=="r") stop("File open as read-only.")
embedding <- as.data.frame(x = embedding)
colnames(embedding) <- CA_EMBEDDING_DFLT_CNAMES
add_col_attr(
loom = loom,
key = CA_EMBEDDING_NAME,
value = embedding
)
}
#'@title get_next_embedding_id
#'@description Get the next embedding ID
#'@param loom The loom file handler.
#'@export
get_next_embedding_id <- function(loom) {
ca_extra_embeddings_dims <- do.call(what = "c", args = lapply(CA_EXTRA_EMBEDDINGS_NAMES, function(ca_key) {
ca.embeddings <- get_col_attr_by_key(
loom = loom,
key = ca_key
)
return (max(as.integer(x = colnames(x = ca.embeddings))))
}))
if(ca_extra_embeddings_dims[1] != ca_extra_embeddings_dims[2]) {
stop("Embeddings_X and Embeddings_Y have different dimensions. Make sure they the same dimensions.")
}
return (ca_extra_embeddings_dims[1]+1)
}
#'@title add_embedding
#'@description Add the given embedding as a column attribute and meta data related to the given embeddding to the given .loom file handler.
#'@param loom The loom file handler.
#'@param embedding A M-by-2 data.frame of the embeddings with M cells.
#' If the given trajectory is not NULL, it should correspond to the projection of the cells onto the given trajectory.
#'@param name The name of the given embedding.
#'@param is.default Default embedding to use in the .loom file.
#'@param trajectory A named list with the node names, the edges, and the x and y coordinates of the nodes of trajectory. Use the function create_trajectory().
#'@export
add_embedding <- function(
loom,
embedding,
name,
is.default = F,
trajectory = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
# Check if any NA values
if(any(is.na(x = embedding))) {
stop("Please make sure that the embeddings do not contain any NA values.")
}
# Check MetaData global attribute
check_global_meta_data(loom = loom)
if(is.null(x = row.names(x = embedding))) {
stop("Please, make sure row.names are correctly filled in with cell IDs.")
}
# Check if some cells are missing betweem cells in embedding and the cells in the expression matrix
check_missing_cells <- check_missings(
x = get_cell_ids(loom = loom),
y = row.names(x = embedding)
)
if(!check_missing_cells$result) {
if(length(x = check_missing_cells$x.missing.in.y) > 0) {
stop(paste0("Some cells (",length(x = check_missing_cells$x.missing.in.y),") from expression matrix are missing in the embeddings: ", paste0(check_missing_cells$x.missing.in.y, collapse = ","), "."))
}
if(length(check_missing_cells$y.missing.in.x) > 0) {
stop(paste0("Some cells (",length(x = check_missing_cells$y.missing.in.x),") from embeddings are missing in the expression matrix: ", paste0(check_missing_cells$y.missing.in.x, collapse = ","), "."))
}
}
# Check if cells in embedding are in the same order as in the expression matrix
if(!is_order_conserved(
x = get_cell_ids(loom = loom),
y = row.names(x = embedding))
) {
stop("Please, make sure that row.names of the given embedding are in the same order as the column names of the expression matrix in the given loom.")
}
# Add the default embedding also to Embeddings_X and Embeddings_Y
if(is.default) {
add_default_embedding(
loom = loom,
embedding = embedding
)
} else {
print(paste0("Adding embedding ", name, "..."))
}
ca <- loom[["col_attrs"]]
# Add a main embedding
if(sum(CA_EXTRA_EMBEDDINGS_NAMES%in%names(ca)) != 2) {
for(i in seq_along(along.with = CA_EXTRA_EMBEDDINGS_NAMES)) {
e <- as.data.frame(x = embedding[,i])
id <- "-1"
colnames(x = e) <- id
add_col_attr(
loom = loom,
key = CA_EXTRA_EMBEDDINGS_NAMES[i],
value = e
)
}
} else {
id <- as.character(x = get_next_embedding_id(loom = loom))
for(i in seq_along(along.with = CA_EXTRA_EMBEDDINGS_NAMES)) {
ca.embeddings <- get_col_attr_by_key(
loom = loom,
key = CA_EXTRA_EMBEDDINGS_NAMES[i]
)
e <- as.data.frame(x = embedding[,i])
id <- as.character(x = ncol(x = ca.embeddings))
colnames(x = e) <- id
ca.embeddings <- cbind(ca.embeddings, e)
# Update the current coordinates Embeddings
update_col_attr(
loom = loom,
key = CA_EXTRA_EMBEDDINGS_NAMES[i],
value = as.data.frame(x = ca.embeddings)
)
}
}
flush(loom = loom)
# Add the given embedding to the global attribute MetaData
add_global_md_embedding(
loom = loom,
id = id,
name = name,
is.default = is.default,
trajectory = trajectory
)
flush(loom = loom)
}
#*******************#
# Trajectories functions ----
#*******************#
#'@title create_trajectory_from_dyno
#'@description Create a trajectory object from dyno TI output.
#'@param dyno A dyno object generated by dyno::infer_trajectory.
#'@export
create_trajectory_from_dyno <- function(
dyno
) {
# get cell positions
cell_positions <- dyno$dimred %>%
as.data.frame() %>%
rownames_to_column("cell_id")
# assign cells to closest milestone
cell_positions <- left_join(
cell_positions,
dyno$milestone_percentages %>%
group_by(cell_id) %>%
arrange(desc(percentage)) %>%
filter(dplyr::row_number() == 1) %>%
select(cell_id, milestone_id),
"cell_id"
)
print("Projecting the waypoints... (this may take some time)")
waypoint_projection <- project_waypoints(
traj = dyno,
cell_positions = cell_positions
)
edges <- as.data.frame(x = waypoint_projection$edges[, c("from","to")]) # "arrow"
coordinates <- data.frame(
"x"=waypoint_projection$positions[, "comp_1"],
"y"=waypoint_projection$positions[, "comp_2"]
)
# Set the row.names
row.names(x = coordinates) <- waypoint_projection$positions$waypoint_id
return (
create_trajectory(
edges = edges,
coordinates = coordinates
)
)
}
#'@title create_trajectory_from_monocle
#'@description Create a trajectory object from CellDataSet monocle output.
#'@param cds CellDataSet object from monocle.
#'@export
create_trajectory_from_monocle <- function(
cds
) {
edges <- as.data.frame(
x = igraph::get.edgelist(
graph = cds@minSpanningTree
)
)
coordinates <- data.frame(
"x"=t(x = reducedDimK(cds = cds))[,1],
"y"=t(x = reducedDimK(cds = cds))[,2]
)
return (
create_trajectory(
edges = edges,
coordinates = coordinates
)
)
}
#'@title create_trajectory
#'@description Create a trajectory object from the given edges and coordinates of the nodes.
#'@param cds CellDataSet object from monocle.
#'@export
create_trajectory <- function(
edges,
coordinates
) {
if(!is.data.frame(x = edges))
stop("The given edges should be a data.frame.")
if(!is.data.frame(x = coordinates))
stop("The given coordinates should be a data.frame.")
if(all(
as.character(x = seq(
from = 1,
to = nrow(x = coordinates))
) == row.names(
x = coordinates
)) | is.null(x = row.names(coordinates))
) {
stop("The row.names of the given coordinates should have the node names.")
}
colnames(x = edges)[c(1,2)] <- c("source","target")
row.names(x = edges) <- NULL
edges.list <- apply(
X = edges,
MARGIN = 1,
FUN = as.list
)
nodes <- row.names(x = coordinates)
colnames(x = coordinates) <- c("x","y")
row.names(x = coordinates) <- NULL
coord.list <- apply(
X = coordinates,
MARGIN = 1,
FUN = as.list
)
return (
list(
"nodes"=nodes,
"edges"=edges.list,
"coordinates"=coord.list
)
)
}
#*******************#
# Clusterings functions ----
#*******************#
get_unique_clusters <- function(
clusters
) {
if(is.factor(x = clusters)) {
return (
sort(
x = as.numeric(levels(x = clusters)),
decreasing = F)
)
} else {
return (
sort(
x = as.numeric(unique(x = clusters)),
decreasing = F
)
)
}
}
#'@title get_seurat_clustering_resolutions
#'@description Get list of all computed clusterings resolutions in the given seurat object.
#'@param seurat A Seurat object.
#'@param prefix Prefix of the clustering to extract the resolutions from (only for Seurat v3 objects).
#'@export
get_seurat_clustering_resolutions <- function(
seurat,
prefix = NULL
) {
if(seurat@version >= 3) {
clustering.ids <- grep(
pattern = "res\\.",
x = colnames(x = seurat@meta.data)
)
clustering.prefix <- paste0(unique(x = stringr::str_split_fixed(string = colnames(x = seurat@meta.data)[clustering.ids], pattern = "res.", n = 2)[,1]), "res.")
# Check if there are multiple prefixes for clustering information
# Seurat v3 uses integrated_snn_ prefix for storing clustering information from its integration method
if(length(x = clustering.prefix) > 1 && is.null(x = prefix)) {
stop(paste0("Detected multiple prefixes for clustering information: ", paste(clustering.prefix, collapse = ", "), " . Please select one of them using the prefix argument."))
} else if(length(x = clustering.prefix) == 1) {
prefix <- clustering.prefix
} else if(length(x = clustering.prefix) == 0) {
stop("It seems that clustering data has not been computed. ")
}
if(length(x = prefix) == 1 && !(prefix %in% clustering.prefix)) {
stop("The selected prefix does not exist.")
}
return (
as.numeric(
x = stringr::str_split_fixed(
string = colnames(x = seurat@meta.data)[grep(pattern = paste0("^", prefix), x = colnames(x = seurat@meta.data))],
pattern = prefix,
n = 2
)[,2]
)
)
} else if (seurat@version >= 2 && seurat@version < 3){
return (
as.numeric(x = stringr::str_split_fixed(
string = names(x = seurat@calc.params)[grep(pattern = "FindClusters", x = names(x = seurat@calc.params))],
pattern = "FindClusters.res.",
n = 2
)[,2])
)
} else {
stop(paste0("This version of Seurat ", seurat@version," is not supported."))
}
}
#'@title add_seurat_clustering
#'@description Add all the Seurat clusterings in the given seurat object to the given .loom file handler.
#'@param loom The loom file handler.
#'@param seurat The Seurat object
#'@param seurat.assay The assay to access the data from (only if you're using Seurat version 3). Option used only for sanity checks.
#'@param seurat.markers.file.path.list The named list of file paths to the markers saved in RDS format. The names should be the resolution id of the corresponding clustering (e.g.: SCT_snn_res.0.5). Default is NULL.
#'@param default.clustering.resolution The clustering resolution (i.e.: '0.5' or 'SCT_snn_res.0.5') of the clustering that should be set as the default which an annotation can be set for.
#'@param annotation A data.frame with annotation for the clusters of the default clustering. Default is NULL.
#'@param annotation.cluster.id.cn The column name to use for the IDs of the clusters found by the given clustering group. Default is NULL.
#'@param annotation.cluster.description.cn The column name to use for the description of the clusters found by the given clustering group. Default is NULL.
#'@export
add_seurat_clustering <- function(
loom,
seurat,
seurat.assay = "RNA",
seurat.clustering.prefix = NULL,
seurat.markers.file.path.list = NULL,
seurat.marker.metric.accessors = NULL,
seurat.marker.metric.names = NULL,
seurat.marker.metric.description = NULL,
default.clustering.resolution = NULL,
default.clustering.overwrite = NULL,
annotation = NULL,
annotation.cluster.id.cn = NULL,
annotation.cluster.description.cn = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
# Check MetaData global attribute
check_global_meta_data(loom = loom)
if(!is.null(x = seurat.markers.file.path.list)) {
if(is.null(x = names(x = seurat.markers.file.path.list))) {
stop("Argument 'seurat.markers.file.path.list' is not a named list. The names should correspond to the clustering ID in Seurat object (e.g.: res.2).")
}
}
# Access data
if(seurat@version >= 3) {
counts <- seurat@assays[[seurat.assay]]@counts
data <- seurat@assays[[seurat.assay]]@data
} else if (seurat@version >= 2 && seurat@version < 3){
counts <- seurat@raw.data
data <- seurat@data
} else {
stop(paste0("This version of Seurat ", seurat@version," is not supported."))
}
# Check that all cells present in loom digital expression matrix are present in the given seurat@data and seurat@scale.data objects
cells <- get_cell_ids(loom = loom)
n_cells <- length(x = cells)
if(sum(cells %in% colnames(x = counts)) != n_cells | sum(cells %in% colnames(x = data)) != n_cells) {
stop(paste0("Some cells are missing. Please check that all cells from the digital expression matrix (dgem) in the given loom are present in the data slots i.e.: seurat@raw.data, seurat@data or seurat@assays[[", seurat.assay, "]]@counts, seurat@assays[[", seurat.assay, "]]@data"))
}
clustering.resolutions <- get_seurat_clustering_resolutions(
seurat = seurat,
prefix = seurat.clustering.prefix
)
if(length(x = clustering.resolutions) > 0) {
for(res in clustering.resolutions) {
# Get cluster IDs for the current clustering
resolution_id <- paste0(seurat.clustering.prefix,res)
print(paste0("Seurat, ", seurat.clustering.prefix, res))
if(seurat@version >= 3) {
if(is.null(x = seurat.clustering.prefix)) {
stop("The given seurat.clustering.prefix is required. E.g.: RNA_snn_res.")
}
cluster_ids <- seurat@meta.data[, paste0(seurat.clustering.prefix, res)]
names(x = cluster_ids) <- rownames(x = seurat@meta.data)
} else if (seurat@version >= 2 && seurat@version < 3){
seurat <- Seurat::SetAllIdent(
object = seurat,
id=resolution_id
)
cluster_ids <- seurat@ident
} else {
stop(paste0("This version of Seurat ", seurat@version," is not supported."))
}
is_default_clustering <- F
# Add the Seurat clusters
print("Adding Seurat clusters...")
a <- NULL
ac_id_cn <- NULL
ac_description_cn <- NULL
if(!is.null(x = default.clustering.resolution)) {
if(res == default.clustering.resolution | resolution_id == default.clustering.resolution) {
print("Adding default Seurat clusters...")
if(!is.null(x = annotation) && !is.null(x = annotation.cluster.id.cn) && !is.null(x = annotation.cluster.description.cn)) {
a <- annotation
ac_id_cn <- annotation.cluster.id.cn
ac_description_cn <- annotation.cluster.description.cn
}
is_default_clustering <- T
}
} else {
# If only one clustering resolution computed then set this one as default
if(length(x = clustering.resolutions) == 1) {
is_default_clustering <- T
}
}
flush(loom = loom)
cluster_annotation <- create_cluster_annotation(
clusters = cluster_ids,
cluster.meta.data.df = a,
cluster.id.cn = ac_id_cn,
cluster.description.cn = ac_description_cn
)
clid <- add_annotated_clustering(
loom = loom,
group = "Seurat",
name = paste("Seurat,", paste0(seurat.clustering.prefix, res)),
clusters = cluster_ids,
annotation = cluster_annotation,
is.default = is_default_clustering,
overwrite.default = default.clustering.overwrite
)
print(paste0("Clustering ID: ", clid))
flush(loom = loom)
# Add the Seurat markers if not empty
if(!is.null(x = seurat.markers.file.path.list)) {
print("Adding Seurat markers...")
if(resolution_id %in% names(x = seurat.markers.file.path.list)) {
if(file.exists(seurat.markers.file.path.list[[resolution_id]])) {
seurat_markers <- readRDS(file = seurat.markers.file.path.list[[resolution_id]])
seurat_markers_by_cluster <- split(
x = seurat_markers,
f = seurat_markers$cluster
)
add_clustering_markers(
loom = loom,
clustering.id = clid,
clustering.markers = seurat_markers_by_cluster,
marker.metric.accessors = seurat.marker.metric.accessors,
marker.metric.names = seurat.marker.metric.names,
marker.metric.descriptions = seurat.marker.metric.description
)
flush(loom = loom)
} else {
warning(paste0("Missing Seurat markers file for clustering resolution ", res, "."))
}
} else {
warning(paste0("Seurat markers for clustering resolution ", res, " have not been computed."))
}
} else {
print("No Seurat markers added.")
}
}
} else {
warning("It seems that you are using an old version of Seurat.")
print("Adding Seurat clusters...")
cluster_annotation <- create_cluster_annotation(
clusters = seurat@ident,
cluster.meta.data.df = annotation,
cluster.id.cn =annotation.cluster.id.cn,
cluster.description.cn = annotation.cluster.description.cn
)
clid <- add_annotated_clustering(
loom = loom,
group = "Seurat",
name = "Seurat, resolution undefined",
clusters = seurat@ident,
annotation = cluster_annotation,
is.default = T,
overwrite.default = default.clustering.overwrite
)
}
}
#'@title append_clustering_update_ca
#'@description Append the given clustering with the given clustering.id.
#'@param loom The loom file handler.
#'@param clustering.id The id of the embedding group.
#'@param clustering The embedding to add.
append_clustering_update_ca <- function(
loom,
clustering.id,
clustering
) {
if(loom$mode=="r") stop("File open as read-only.")
ca_clusterings <- get_clusterings(loom = loom)
colnames(clustering) <- as.character(clustering.id)
# Append this clustering
ca_clusterings <- cbind(
ca_clusterings,
clustering
)
update_col_attr(
loom = loom,
key = CA_CLUSTERINGS_NAME,
value = as.data.frame(x = ca_clusterings)
)
}
#'@title create_cluster_annotation
#'@description Create a named list where names are the cell IDs and the values are the annotation stored in the given cluster.description.cn column of the given cluster.meta.data.df.
#' If cluster.meta.data.df is set to NULL, annotation of cells are set to "NDA - Cluster X" where X is the cluster ID.
#'@param cluster.meta.data.df A data.frame object with at least 2 columns named cluster.id.cn and cluster.description.cn.
#'@param clusters A named list of the cell id and the assigned cluster id generated by a clustering method (e.g.: Seurat).
#'@param cluster.id.cn The column name to use for the IDs of the clusters.
#'@param cluster.description.cn The column name to use for the description of the clusters.
#'@export
create_cluster_annotation <- function(
clusters,
cluster.meta.data.df = NULL,
cluster.id.cn = NULL,
cluster.description.cn = NULL
) {
unique_clusters <- get_unique_clusters(clusters = clusters)
annotation <- setNames(
object = rep(NA, length(x = clusters)),
nm = names(clusters)
)
for(cluster.idx in seq_along(along.with = unique_clusters)) {
cluster_id <- unique_clusters[[cluster.idx]]
if(is.numeric(x = cluster_id)) {
description <- paste("NDA - Cluster", cluster_id)
} else if(is.character(x = cluster_id)) {
description <- cluster_id
} else {
stop("Cluster labels are required to be of class character or numeric.")
}
if(!is.null(x = cluster.meta.data.df)) {
if(!(cluster.description.cn %in% colnames(x = cluster.meta.data.df))) {
stop(paste0("The given column ",cluster.description.cn, " does not exist in the annotation provided."))
}
description <- as.vector(x = unlist(x = cluster.meta.data.df[cluster.meta.data.df[[cluster.id.cn]] == cluster_id, cluster.description.cn]))
}
annotation[as.vector(x = unlist(x = clusters)) == cluster_id] <- description
}
annotation <- factor(x = annotation)
return (annotation)
}
#'@title add_clustering
#'@description Add the given clusters in the given group column attribute and meta data related to the given clustering to the given .loom file handler.
#'@param loom The loom file handler.
#'@param group The for the given clustering group to which the given clusters have to be added.
#'@param name The name given to this clustering.
#'@param clusters A named list of the cell id and assigned the cluster id.
#'@param is.default Set this clustering be set as default one.
#'@param overwrite.default Overwrite the default clustering in the given loom if set to TRUE. Otherwise, it's not updated.
#'@export
add_clustering <- function(
loom,
group,
name,
clusters,
is.default = F,
overwrite.default = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
annotation <- create_cluster_annotation(clusters = clusters)
add_annotated_clustering(
loom = loom,
group = group,
name = name,
clusters = clusters,
annotation = annotation,
is.default = is.default,
overwrite.default = overwrite.default
)
}
clusters_as_factor_ids <- function(clusters) {
# Convert character clusters to to numeric clusters
unique_clusters <- get_unique_clusters(clusters = clusters)
if(any(is.character(x = unique_clusters)))
return (
# IDs should start from 0
setNames(
object = factor(x = plyr::mapvalues(
x = clusters,
from = unique_clusters,
to = seq_along(along.with = unique_clusters)-1
)),
nm = names(x = clusters)
)
)
return (clusters)
}
#'@title add_annotated_clustering
#'@description Add the given clusters in the given group column attribute and meta data related to the given clustering to the given .loom file handler.
#'@param loom The loom file handler.
#'@param group The for the given clustering group to which the given clusters have to be added.
#'@param name The name given to this clustering
#'@param clusters A named list of the cell id and assigned the cluster id.
#'@param annotation A named list of the cell id and the corresponding annotation.
#'@param is.default Set this clustering be set as default one.
#'@param overwrite.default Overwrite the default clustering in the given loom if set to TRUE. Otherwise, it's not updated.
#'@export
add_annotated_clustering <- function(
loom,
group,
name,
clusters,
annotation,
is.default = F,
overwrite.default = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
id <- 0
if(length(x = unique(x = clusters)) == length(x = unique(x = annotation))) {
# Make sure the order are the same
annotation <- annotation[names(x = clusters)]
} else {
# Does not seem that cluster IDs and cluster annotation correspond
# Remap to have the same number of unique IDs as the number of unique annotation
clusters <- factor(x = plyr::mapvalues(
x = annotation,
from = unique(x = annotation),
to = seq_along(along.with = unique(x = annotation))-1
))
names(x = clusters) <- names(x = annotation)
}
# Convert clusters to numeric values if needed clusters are of class character
# This is required since the column attribute Clusterings expects numeric values
clusters_as_factor_ids <- clusters_as_factor_ids(clusters = clusters)
# Check if all the cells are present in the given clusters
cell_ids <- get_cell_ids(loom = loom)
n_mismatches <- sum(!(names(x = clusters) %in% cell_ids))
if(n_mismatches > 0) {
stop(paste0("Mismatches detected between the cell IDs (",n_mismatches,") in the given clusters object and in CellID column attribute of the .loom. Please do not use special characters for cell IDs except '_'."))
}
# Order the clusters in the order defined CellID column attribute
clusters <- clusters[cell_ids]
clusters_as_factor_ids <- clusters_as_factor_ids[cell_ids]
# Convert clusters_as_factor_ids to integer (not numeric since this will be interpreted as float) otherwise H5T_ENUM will be used leading to
# Datatype: H5T_COMPOUND {
# H5T_ENUM {
# H5T_STD_U8LE;
# "0" 1;
# "1" 2;
# } "0" : 0;
# }
clusters_as_numeric_ids <- as.integer(x = as.character(x = clusters_as_factor_ids))
# If the clustering is the default one
# Add it as the generic column attributes ClusterID and ClusterName
if(is.default) {
if(col_attrs_exists_by_key(
loom = loom, key = CA_DFLT_CLUSTERS_ID
) | col_attrs_exists_by_key(
loom = loom,
key = CA_DFLT_CLUSTERS_NAME)
) {
if(is.null(x = overwrite.default)) {
stop("A default clustering has already been set. It can be overwritten by setting the corresponding *overwrite.default to TRUE or skipped by setting it to FALSE.")
} else {
if(overwrite.default) {
warning("A default clustering has already been set. The current default clustering will be overwritten.")
# Check for each if it exists. If it exists update otherwise create new.
if(col_attrs_exists_by_key(loom = loom, key = CA_DFLT_CLUSTERS_ID)) {
update_col_attr(
loom = loom,
key = CA_DFLT_CLUSTERS_ID,
value = clusters_as_numeric_ids
)
} else {
add_col_attr(
loom = loom,
key = CA_DFLT_CLUSTERS_ID,
value = clusters_as_numeric_ids
)
}
# Check if annotation is not null
if(!is.null(x = annotation)) {
if(col_attrs_exists_by_key(loom = loom, key = CA_DFLT_CLUSTERS_NAME))
update_col_attr(
loom = loom,
key = CA_DFLT_CLUSTERS_NAME,
value = as.character(x = annotation)
)
else {
add_col_attr(
loom = loom,
key = CA_DFLT_CLUSTERS_NAME,
value = as.character(x = annotation)
)
}
} else {
# As we overwrite ClusterID and annotation is null, if ClusterName is already present, to be consistent we should delete it.
if(col_attrs_exists_by_key(loom = loom, key = CA_DFLT_CLUSTERS_NAME)) {
remove_col_attr(
loom = loom,
key = CA_DFLT_CLUSTERS_NAME
)
}
}
} else {
warning("A default clustering has already been set. It won't be overwritten.")
}
}
} else {
add_col_attr(
loom = loom,
key = CA_DFLT_CLUSTERS_ID,
value = clusters_as_numeric_ids
)
add_col_attr(
loom = loom,
key = CA_DFLT_CLUSTERS_NAME, value = as.character(x = annotation)
)
}
}
# Adding the clustering data
if(col_attrs_exists_by_key(loom = loom, key = CA_CLUSTERINGS_NAME)) {
print(paste0("col_attr '", CA_CLUSTERINGS_NAME, "' already exists..."))
ca_clusterings <- get_clusterings(loom = loom)
# Set the clustering id
id <- ncol(x = ca_clusterings) # n clusterings (start at 0)
clustering <- data.frame(
"x" = clusters_as_numeric_ids,
stringsAsFactors = F
)
append_clustering_update_ca(
loom = loom,
clustering.id = id,
clustering = clustering
)
} else {
print(paste(CA_CLUSTERINGS_NAME, "created..."))
clustering <- data.frame(
"x" = clusters_as_numeric_ids,
stringsAsFactors = F
)
colnames(x = clustering) <- as.character(x = id)
add_col_attr(
loom = loom,
key = CA_CLUSTERINGS_NAME,
value = as.data.frame(x = clustering)
)
}
flush(loom = loom)
# Adding the clustering meta data
add_global_md_clustering(
loom = loom,
id = id,
group = group,
name = name,
clusters = clusters_as_factor_ids,
annotation = annotation
)
flush(loom = loom)
return(id)
}
#*******************#
# SCENIC functions ----
#*******************#
#'@title add_regulons
#'@description Add the regulons with their target genes generated by SCENIC as a row attribute to the given .loom file handler.
#'@param loom The loom file handler.
#'@param regulons A list of list of the regulons and their target genes generated by SCENIC.
#'#'@param column.attr.name Type of regulon to retrieve; Usually 'MotifRegulons' for motif-based regulons, or 'TrackRegulons' for track-based (e.g. ChIP-seq) regulons.
#' (Might change according to the SCENIC/pySCENIC pipeline version and settings used).
#'@param regulon.threshold.assignments The automated regulon cell assignments object generated by SCENIC.
#'@param regulon.enrichment.table The regulon enrichment table generated by SCENIC.
#'@export
add_regulons <- function(
loom,
regulons,
column.attr.name="MotifRegulons",
regulon.threshold.assignments = NULL,
regulon.enrichment.table = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
if(!is.null(x = regulon.threshold.assignments)) {
cell_ids <- get_cell_ids(loom = loom)
cells_not_found <- lapply(
X = seq_along(along.with = regulon.threshold.assignments),
FUN = function(i) {
tf <- names(regulon.threshold.assignments)[i]
x <- regulon.threshold.assignments[[i]]
# Get all cells from regulon cell assignments not found in dgem
cells_not_found <- x$assignment[!(x$assignment%in%cell_ids)]
if(length(x = cells_not_found) > 0) {
return (
list(
name=tf,
cells_not_found=cells_not_found
)
)
}
}
)
# Remove all empty entries
cells_not_found <- Filter(
f = Negate(
f = is.null
),
x = cells_not_found
)
if(length(x = cells_not_found) > 0) {
msg <- sapply(
X = cells_not_found,
FUN = function(x) {
return (paste0("Cells not found in expression matrix for ", x$name ," regulon: ", paste0(x$cells_not_found, collapse = ", ")))
}
)
cat(paste(msg, collapse="\n\n"))
}
}
# Get cells IDs from expression matrix
genes <- get_genes(loom = loom)
# Add regulons
regulons_mask <- do.call(
what = "cbind",
args = lapply(
X = seq_along(along.with = regulons),
FUN = function(regulon.idx) {
reg.name <- names(x = regulons)[regulon.idx]
reg.genes <- regulons[[regulon.idx]]
reg.mask <- data.frame("x"=genes%in%reg.genes, stringsAsFactors = F)
colnames(x = reg.mask) <- reg.name
return (reg.mask)
}
)
)
row.names(x = regulons_mask) <- genes
colnames(x = regulons_mask) <- gsub(
pattern = " ",
replacement = "_",
x = colnames(x = regulons_mask)
)
add_row_attr(
loom = loom,
key = column.attr.name,
value = as.data.frame(x = regulons_mask)
)
flush(loom = loom)
# Add regulon thresholds
if(!is.null(x = regulon.threshold.assignments)) {
add_global_md_regulon_thresholds(
loom = loom,
regulon.threshold.assignments = regulon.threshold.assignments,
regulon.enrichment.table = regulon.enrichment.table
)
}
flush(loom = loom)
}
#'@export
add_scenic_regulons <- add_regulons
#'@title add_regulons_AUC
#'@description Add the regulons AUC matrix generated by SCENIC as a column attribute to the given .loom file handler.
#'@param loom The loom file handler.
#'@param regulons.AUC A matrix of the regulons AUC values with M regulons as rows and N cells as columns.
#' @param column.attr.name Type of regulon to retrieve; Usually 'MotifRegulonsAUC' for motif-based regulons, or 'TrackRegulonsAUC' for track-based (e.g. ChIP-seq) regulons.
#' (Might change according to the SCENIC/pySCENIC pipeline version and settings used).
#'@export
add_regulons_AUC <- function(
loom,
regulons.AUC,
column.attr.name="MotifRegulonsAUC"
) {
if(loom$mode=="r") stop("File open as read-only.")
# Check if some cells are missing betweem cells in embedding and the cells in the expression matrix
check_missing_cells <- check_missings(
x = get_cell_ids(loom = loom),
y = colnames(x = regulons.AUC)
)
if(!check_missing_cells$result) {
if(length(x = check_missing_cells$x.missing.in.y) > 0) {
stop(paste0("Some cells (",length(x = check_missing_cells$x.missing.in.y),") from expression matrix are missing in the regulon AUC matrix: ", paste0(check_missing_cells$x.missing.in.y, collapse = ","), "."))
}
if(length(x = check_missing_cells$y.missing.in.x) > 0) {
stop(paste0("Some cells (",length(x = check_missing_cells$y.missing.in.x),") from regulon AUC matrix are missing in the expression matrix: ", paste0(check_missing_cells$y.missing.in.x, collapse = ","), "."))
}
}
# Check if cells in embedding are in the same order as in the expression matrix
if(!is_order_conserved(x = get_cell_ids(loom = loom), y = colnames(x = regulons.AUC))) {
stop("Please, make sure that row.names of the given embedding are in the same order as the column names of the expression matrix in the given loom.")
}
add_col_attr(
loom = loom,
key = column.attr.name,
value = as.data.frame(x = t(x = regulons.AUC))
)
flush(loom = loom)
}
#'@export
add_scenic_regulons_auc_matrix <- add_regulons_AUC
#*******************#
# Row data functions ----
#*******************#
#'@title add_clustering_markers
#'@description Add the clustering markers as a row attribute to the given .loom file handler.
#'@param loom The loom file handler.
#'@param dgem A matrix of the gene expression with M genes as rows and N cells as columns.
#'@param clustering.id The clustering id that the given clustering.markers are specific for.
#'@param clustering.markers A list of markers of each cluster found for the given clustering.id.
#'@param marker.metric.accessors A list of the column names of the metrics to extract from the cluster marker data.frame in the given clustering.markers.
#'@param marker.metric.names A list of names to attribute to the given marker.metric.accessor.
#'@param marker.metric.descriptions A list of description to attribute to the given marker.metric.accessor.
#'@export
add_clustering_markers <- function(
loom,
clustering.id,
clustering.markers,
marker.metric.accessors = NULL,
marker.metric.names = NULL,
marker.metric.descriptions = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
# Check if a gene column is present
tmp <- colnames(x = clustering.markers[[1]])
if(!("gene" %in% tmp)) {
stop("Cannot find a 'gene' column. Please use 'gene' as column name")
}
# Add the clustering markers as a mask
genes <- get_genes(loom = loom, is.flybase.gn = F)
clustering_markers_mask <- do.call(
what = "cbind",
args = lapply(
X = seq_along(along.with = clustering.markers),
FUN = function(cluster.idx) {
cluster_name <- names(x = clustering.markers)[cluster.idx]
cluster_markers <- clustering.markers[[cluster.idx]][["gene"]]
cm_mask <- data.frame("x" = genes %in% cluster_markers, stringsAsFactors = F)
colnames(x = cm_mask) <- cluster_name
return (cm_mask)
}
)
)
row.names(x = clustering_markers_mask) <- genes
print(paste0("Adding markers for clustering ", clustering.id, "..."))
add_row_attr(
loom = loom,
key = paste0(RA_CLUSTERING_MARKERS_NAME, "_",clustering.id),
value = as.data.frame(x = clustering_markers_mask)
)
flush(loom = loom)
# Add the marker metrics
# Check all marker metric vectors have the same length
if(!is.null(x = marker.metric.accessors) && !is.null(x = marker.metric.names) && !is.null(x = marker.metric.descriptions)) {
if(!all(lapply(X = list(marker.metric.accessors,marker.metric.names,marker.metric.descriptions), FUN = length) == length(marker.metric.accessors))) {
stop("The given marker.metric.accessors, marker.metric.names and marker.metric.descriptions should have the same length.")
}
print(paste0("Adding metrics for clustering ", clustering.id, "..."))
metrics_av <- list()
for(metric.idx in seq_along(along.with = marker.metric.names)) {
metric_accessor <- marker.metric.accessors[metric.idx]
metric_name <- marker.metric.names[metric.idx]
metric_description <- marker.metric.descriptions[metric.idx]
clustering_marker_metric <- do.call(
what = "cbind",
args = lapply(
X = seq_along(along.with = clustering.markers),
FUN = function(cluster.idx) {
cluster_name <- names(clustering.markers)[cluster.idx]
# Get the current metric_name in the cluster marker table of the current cluster_name
if(!("gene" %in% colnames(x = clustering.markers[[cluster.idx]]))) {
stop(paste0("'gene' column does not exist in the given clustering table(s)."))
}
if(!(metric_accessor %in% colnames(x = clustering.markers[[cluster.idx]]))) {
stop(paste0("'", metric_accessor, "' column does not exist in the given clustering table(s)."))
}
cluster_markers <- clustering.markers[[cluster.idx]][, c("gene", metric_accessor)]
genes_df <- data.frame("gene" = genes, stringsAsFactors = F)
metric_df <- merge(
x = genes_df,
y = cluster_markers,
by = "gene",
all = T
)
metric_df[is.na(x = metric_df)] <- 0
row.names(x = metric_df) <- metric_df$gene
# Order by order of genes stored in .loom
metric_df <- metric_df[match(x = genes, table = metric_df$gene),]
metric_df[, "gene"] <- NULL
colnames(x = metric_df) <- cluster_name
return (metric_df)
}
)
)
add_row_attr(
loom = loom,
key = paste0(RA_CLUSTERING_MARKERS_NAME, "_",clustering.id,"_",metric_accessor),
value = as.data.frame(x = clustering_marker_metric)
)
flush(loom = loom)
metrics_av[[length(metrics_av)+1]] <- list(
"accessor"=metric_accessor,
"name"=metric_name,
"description"=metric_description
)
}
add_global_md_clustering_kv(
loom = loom,
clustering.id = clustering.id,
key = GA_METADATA_CLUSTERINGS_CLUSTER_MARKER_METRICS_NAME,
value = metrics_av
)
}
}
#'@title add_fbgn
#'@description Add the Flybase gene as a row attribute to the given .loom file handler.
#'@param loom The loom file handler.
#'@param dgem A matrix of the gene expression with M genes as rows and N cells as columns.
#'@param fbgn.gn.mapping.file.path A N-by-2 data.frame containing the mapping between the Flybase gene and the gene symbol.
#'@export
add_fbgn <- function(
loom,
dgem,
fbgn.gn.mapping.file.path
) {
if(loom$mode=="r") stop("File open as read-only.")
fbgn.gn.mapping <- utils::read.table(
file = fbgn.gn.mapping.file.path,
header = F,
sep = "\t",
quote = '',
stringsAsFactors = F
)
colnames(x = fbgn.gn.mapping) <- c("FBgn", RA_GENE_NAME)
tmp <- data.frame(row.names(x = dgem))
colnames(x = tmp) <- RA_GENE_NAME
genes <- merge(
x = tmp,
y = fbgn.gn.mapping,
by = RA_GENE_NAME
)
add_row_attr(
loom = loom,
key = "FBgn",
value = genes$FBgn
)
}
#*******************#
# Col data functions ----
#*******************#
#'@title add_annotation_by_cluster_annotation_mapping_df
#'@description Add annotation as column attribute using cluster annotation mapping data.frame
#'@param loom The loom file handler.
#'@param key The name of the annotation
#'@param clustering.name The name of the clustering.
#'@param annotation.df The data.frame containing the cluster annotatiion mapping
#'@param annotation.df.cluster.id.column.name The column name of the given annotation.df containing the cluster ID values
#'@param annotation.df.annotation.column.name The column name of the given annotation.df containing the annotation values.
#'@export
add_annotation_by_cluster_annotation_mapping_df <- function(
loom,
key,
clustering.name,
annotation.df = NULL,
annotation.df.cluster.id.column.name = NULL,
annotation.df.annotation.column.name = NULL
) {
md <- get_global_meta_data(loom = loom)
gmd_clusterings <- md[["clusterings"]]
gmd_clustering <- rlist::list.filter(
.data = gmd_clusterings,
name == clustering.name
)[[1]]
ca_clusterings <- get_col_attr_by_key(
loom = loom,
key = "Clusterings"
)
cluster_annotation <- plyr::mapvalues(
ca_clusterings[, as.character(x = gmd_clustering$id)],
from = as.character(x = annotation[[annotation.df.cluster.id.column.name]]),
to = annotation[[annotation.df.annotation.column.name]]
)
add_col_attr(
loom = loom,
key = key,
value = cluster_annotation,
as.annotation = TRUE
)
}
#*******************#
# Generic functions ----
#*******************#
#'@title lookup_all_global_attr
#'@description List all global attributes of the given .loom file handler.
#'@param loom The loom file handler.
#'@export
lookup_all_global_attr <- function(
loom
) {
if(is_loom_spec_version_3_or_greater(loom = loom))
return (loom[["attrs"]]$names)
return (list.attributes(object = loom))
}
#'@title update_global_attr
#'@description Update the global attribute with the given key and the given value.
#'@param loom The loom file handler.
#'@param key The key of the global attribute to update.
#'@param value The new value
update_global_attr <- function(
loom,
key,
value
) {
if(loom$mode=="r") stop("Cannot update the given global attribute: file open as read-only.")
remove_global_attr(
loom = loom,
key = key
)
add_global_attr(
loom = loom,
key = key,
value = value
)
}
#'@title remove_global_attr
#'@description Remove the global attribute with the given key.
#'@param loom The loom file handler.
#'@param key The key of the global attribute to remove.
remove_global_attr <- function(
loom,
key
) {
if(loom$mode=="r") stop("Cannot renove the given global attribute: file open as read-only.")
if(is_loom_spec_version_3_or_greater(loom = loom)) {
loom$link_delete(name = paste0("attrs/",key))
} else {
loom$attr_delete(attr_name = key)
}
flush(loom = loom)
}
#'@title get_global_attr
#'@description Get the global attribute with the given key.
#'@param loom The loom file handler.
#'@param key The key of the global attribute to get value from.
#'@export
get_global_attr <- function(
loom,
key
) {
if(is_loom_spec_version_3_or_greater(loom = loom))
return (loom[["attrs"]][[key]][])
return (h5attr(x = loom, which = key))
}
#'@title add_global_attr
#'@description Add a new global attribute to the given .loom object accessible by the given key and containing the given value.
#'@param loom The loom file handler.
#'@param key The name of the new added attribute.
#'@param value The value of the new added attribute.
#'@export
add_global_attr <- function(
loom,
key,
value,
dtype = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
if(is.null(dtype)) {
dtype <- guess_dtype(x = value)
}
# Encode character vector to UTF-8
dtype <- hdf5_utf8_encode(
value = value,
dtype = dtype
)
if(is_loom_spec_version_3_or_greater(loom = loom)) {
loom$create_dataset(
name = paste0("attrs/", key),
robj = value,
dtype = dtype,
space = get_dspace(x = "scalar"),
chunk_dims = NULL
)
} else {
loom$create_attr(
attr_name = key,
robj = value,
dtype = dtype,
space = get_dspace(x = "scalar")
)
}
flush(loom = loom)
}
#'@title get_row_attr_by_key
#'@description Get the row attribute with the given key.
#'@param loom The loom file handler.
#'@param key The name of the row attribute.
#'@export
get_row_attr_by_key <- function(
loom,
key
) {
ra <- loom[["row_attrs"]]
if(key == "Gene") {
genes <- ra[[key]][]
if(any(as.numeric(x = names(x = table(table(genes))))>1)) {
stop("Some gene names are duplicated")
}
}
return (ra[[key]][])
}
#'@title remove_row_attr
#'@description Remove the row attribute with the given key.
#'@param key The name of the row attribute.
#'@export
remove_row_attr <- function(
loom,
key
) {
if(loom$mode=="r") stop("File open as read-only.")
loom$link_delete(name = paste0("row_attrs/", key))
flush(loom = loom)
}
#'@title update_row_attr
#'@description Update the row attribute with the given key and the given value.
#'@param key The name of the row attribute to remove.
#'@param value The new value to update.
update_row_attr <- function(
loom,
key,
value
) {
if(loom$mode=="r") stop("File open as read-only.")
loom$link_delete(name = paste0("row_attrs/",key))
add_col_attr(
loom = loom,
key = key,
value = value
)
}
#'@title add_row_attr
#'@description Add a new row attribute to the given .loom object accessible by the given key and containing the given value.
#'@param loom The loom file handler.
#'@param key The name of the new added attribute.
#'@param value The value of the new added attribute.
#'@export
add_row_attr <- function(
loom,
key,
value,
dtype = NULL
) {
if(loom$mode=="r") stop("File open as read-only.")
if(is.null(x = dtype)) {
dtype <- guess_dtype(x = value)
}
# Encode character vector to UTF-8
dtype <- hdf5_utf8_encode(
value = value,
dtype = dtype
)
loom$create_dataset(
name = paste0("row_attrs/",key),
robj = value,
dtype = dtype
)
flush(loom = loom)
}
#'@title col_attrs_exists_by_key
#'@description Check if the column attribute with the given key exists.
#'@param loom The loom file handler.
#'@param key The name of the new added attribute.
col_attrs_exists_by_key <- function(
loom,
key
) {
ca <- loom[["col_attrs"]]
return (key %in% names(ca))
}
#'@title get_col_attr_by_key
#'@description Get the column attribute with the given key.
#'@param loom The loom file handler.
#'@param key The name of the column attribute.
#'@export
get_col_attr_by_key <- function(
loom,
key
) {
ca <- loom[["col_attrs"]]
return (ca[[key]][])
}
#'@title remove_col_attr
#'@description Remove the column attribute with the given key.
#'@param loom The loom file handler.
#'@param key The name of column attribute to remove.
#'@export
remove_col_attr <- function(
loom,
key
) {
if(loom$mode=="r") stop("File open as read-only.")
loom$link_delete(name = paste0("col_attrs/", key))
flush(loom = loom)
}
#'@title update_col_attr
#'@description Update the column attribute with the given key and with the given value.
#'@param loom The loom file handler.
#'@param key The name of column attribute to update.
#'@param value The new value to update.
update_col_attr <- function(
loom,
key,
value
) {
if(loom$mode=="r") stop("File open as read-only.")
loom$link_delete(name = paste0("col_attrs/",key))
add_col_attr(
loom = loom,
key = key,
value = value
)
}
#'@title add_col_attr
#'@description Add a new column attribute to the given .loom object accessible by the given key and containing the given value.
#'@param loom The loom file handler.
#'@param key The name of the new added attribute.
#'@param value The value of the new added attribute.
#'@param as.annotation Define this attribute as a discrete attribute. This attribute will be visible in the compare tab.
#'@param as.metric Define this attribute as a continues
#'@export
add_col_attr <- function(
loom,
key,
value,
dtype = NULL,
as.annotation = F,
as.metric = F
) {
if(loom$mode=="r") stop("File open as read-only.")
if(as.annotation && as.metric) {
stop("Cannot add column attribute that is both of type metric and annotation.")
}
if(as.annotation && length(unique(value)) > 245) {
stop("Cannot add column attribute as an annotation with more than 245 unique values.")
}
if(is.null(x = dtype)) {
dtype <- guess_dtype(x = value)
}
# Encode character vector to UTF-8
dtype <- hdf5_utf8_encode(
value = value,
dtype = dtype
)
if(as.annotation) {
value <- as.character(x = value)
add_global_md_annotation(
loom = loom,
name = key,
values = as.character(x = value)
)
flush(loom = loom)
}
if(as.metric) {
value <- as.numeric(x = value)
add_global_md_metric(loom = loom, name = key)
flush(loom = loom)
}
loom$create_dataset(
name = paste0("col_attrs/",key),
robj = value,
dtype = dtype
)
flush(loom = loom)
}
#'@title add_matrix
#'@description Add the given gene expression matrix dgem to the given .loom object.
#'@param loom The loom file handler.
#'@param dgem A matrix of the gene expression with M genes as rows and N cells as columns.
add_matrix <- function(
loom,
dgem,
chunk.size,
display.progress
) {
if(loom$mode=="r") stop("File open as read-only.")
# Check if any NA values
if(any(is.na(x = dgem))) {
stop("Please make sure that the expression matrix (dgem) does not contain any NA values.")
}
dtype <- get_dtype(x = dgem[1, 1])
# Remove row names and column names
row.names(x = dgem) <- NULL
colnames(x = dgem) <- NULL
loom$create_dataset(
name = 'matrix',
dtype = dtype,
dims = rev(x = dim(x = dgem))
)
chunk_points <- chunk_points(
data.size = dim(x = dgem)[2],
chunk.size = chunk.size
)
if (display.progress) {
pb <- utils::txtProgressBar(char = '=', style = 3)
}
for (col in 1:ncol(x = chunk_points)) {
row_start <- chunk_points[1, col]
row_end <- chunk_points[2, col]
loom[['matrix']][row_start:row_end, ] <- t(x = as.matrix(x = dgem[, row_start:row_end]))
if(display.progress) {
utils::setTxtProgressBar(pb = pb, value = col / ncol(x = chunk_points))
}
}
flush(loom = loom)
}
#'@title flush
#'@description flush
#'@param loom The loom file handler.
#'@export
flush <- function(
loom
) {
if(loom$mode=="r") stop("File open as read-only.")
gc()
loom$flush()
}
#'@title finalize
#'@description Save and close the given .loom file handler.
#'@param loom The loom file handler.
#'@export
finalize <- function(
loom
) {
if(loom$mode=="r") {
warning("File open as read-only. Only closing file (not flushed)...")
flush(loom = loom)
}
loom$close_all()
}
#'@title close_loom
#'@description Close the given .loom file handler. Equivalent to finalize but without flushing.
#'@param loom The loom file handler.
#'@export
close_loom <- function(
loom
) {
tryCatch({
loom$close_all()
}, error=function(e) {
invisible(e)
})
}
#'@title build_loom
#'
#'@description Write the data given as arguments to the given file.name .loom file.
#'@param file.name A string naming the .loom file to be generated.
#'@param title A short description of content of loom.
#'@param genome The genome used for the mapping.
#'@param dgem A matrix of the gene expression with M genes as rows and N cells as columns.
#'@param default.embedding A M-by-2 data.frame of the embedding (X and Y coordinates) of the cells.
#'@param default.embedding.name A description name for the given default.embedding
#'@param hierarchy A named list of the 3 hierarchy levels that can be used to group looms in SCope. Use create_hierarchy().
#'@param fbgn.gn.mapping.file.path A N-by-2 data.frame containing the mapping between the Flybase gene and the gene symbol.
#'@param chunk.size The size of chunk of the gene expression matrix.
#'@param display.progress Display progress when adding the gene expression matrix.
#'@export
build_loom <- function(
file.name,
dgem,
title = NULL,
genome = NULL,
default.embedding = NULL,
default.embedding.name = NULL,
hierarchy = NULL,
fbgn.gn.mapping.file.path = NULL,
chunk.size = 1000,
display.progress = T,
loom.spec.version = 3
) {
is_dgem_sparse <- F
loom <- H5File$new(
filename = file.name,
mode = "w"
)
tryCatch({
# global attrs
print("Adding global attributes...")
if(loom.spec.version > 2) {
loom$create_group("attrs")
}
# Add LOOM_SPEC_VERSION
add_global_loom_spec_version(
loom = loom,
loom.spec.version = loom.spec.version
)
# title
if(!is.null(x = title)) {
add_global_attr(
loom = loom,
key = GA_TITLE_NAME,
value = as.character(x = title)
)
}
# Genome
if(!is.null(x = genome)) {
add_global_attr(
loom = loom,
key = GA_TITLE_GENOME,
value = as.character(x = genome)
)
}
# Creation data
add_global_attr(
loom = loom,
key = GA_CREATION_DATE_NAME,
value = as.character(x = Sys.time())
)
# R version
add_global_attr(
loom = loom,
key = GA_R_VERSION_NAME,
value = as.character(x = R.version.string)
)
cn <- colnames(x = dgem)
rn <- row.names(x = dgem)
# global MetaData attribute
init_global_meta_data(
loom = loom,
loom.spec.version = loom.spec.version
)
# Add hierarchy levels
if(!is.null(x = hierarchy)) {
add_hierarchy(
loom = loom,
hierarchy = hierarchy
)
}
# matrix
# Check the type of the sparse matrix
# convert to dgCMatrix if necessary to speedup populating the matrix slot
if("dgTMatrix" %in% class(x = dgem)) {
print("Converting to dgCMatrix...")
dgem <- methods::as(
object = dgem,
Class = "dgCMatrix"
)
}
# Check if sparse matrix
if(sum(class(x = dgem) %in% c("dgCMatrix","dgTMatrix")) > 0) {
is_dgem_sparse <- T
}
print("Adding matrix...")
add_matrix(
loom = loom,
dgem = dgem,
chunk.size = chunk.size,
display.progress = display.progress
)
# col_attrs
print("Adding column attributes...")
loom$create_group("col_attrs")
add_col_attr(
loom = loom,
key = CA_CELLID,
value = as.character(x = cn)
)
# Check if matrix is raw counts
if(sum(dgem%%1!=0) == 0) {
print("Adding default metrics nUMI...")
if(is_dgem_sparse) {
nUMI <- Matrix::colSums(x = dgem)
} else {
nUMI <- colSums(x = dgem)
}
add_col_attr(
loom = loom,
key = "nUMI",
value = nUMI,
as.metric = T
)
} else {
warning("Default metric nUMI was not added because the input matrix does not seem to be the raw counts.")
}
print("Adding default metrics nGene...")
if(is_dgem_sparse) {
nGene <- Matrix::colSums(x = dgem > 0)
} else {
nGene <- colSums(x = dgem > 0)
}
add_col_attr(
loom = loom,
key = "nGene",
value = nGene,
as.metric = T
)
if(!is.null(x = default.embedding)) {
print("Adding default embedding...")
# Add the default embedding
if(is.null(x = default.embedding.name)) {
default.embedding.name <- "Embedding"
}
add_embedding(
loom = loom,
embedding = as.data.frame(x = default.embedding),
name = default.embedding.name,
is.default = T
)
} else {
warning("No default embedding set for the loom. You'll not be able to visualize it in SCope.")
}
# row_attrs
print("Adding row attributes...")
loom$create_group("row_attrs")
add_row_attr(
loom = loom,
key = RA_GENE_NAME,
value = as.character(x = rn)
)
# Check if Flybase gene mapping is not empty
if(!is.null(x = fbgn.gn.mapping.file.path)) {
add_fbgn(
loom = loom,
dgem = dgem,
fbgn.gn.mapping.file.path = fbgn.gn.mapping.file.path
)
}
# col_graphs
print("Adding columns graphs...")
col.graphs <- loom$create_group("col_graphs")
# row_edges
print("Adding row graphs...")
row.graphs <- loom$create_group("row_graphs")
# layers
print("Adding layers...")
layers <- loom$create_group("layers")
}, error = function(e) {
flush(loom = loom)
loom$close()
stop(e)
}, finally = {
flush(loom = loom)
})
flush(loom = loom)
}
#'@title lookup_loom
#'@description List the content of the given .loom file handler.
#'@param loom The loom file handler.
#'@export
lookup_loom <- function(
loom
) {
loom$ls(recursive=TRUE)
}
#'@title open_loom
#'@description Open loom file and return a .loom file handler.
#'@param file.path The file path to the .loom file.
#'@param mode "r" for read-only (default). "r+" for read/write
#'@return A loom file handler
#'@export
open_loom <- function(
file.path,
mode="r"
) {
if(mode != "r" && mode != 'r+') {
stop("'r' (read) and 'r+' (read/write) modes are only allowed.")
}
loom <- tryCatch(
{
H5File$new(
filename = file.path,
mode = mode
)
},
error = function(e) {
msg <- paste0("It is not possible to open the file. ")
if(tolower(mode)=="w") msg <- paste0(msg, "Do you have permissions to open it in WRITE mode? (mode='w')")
msg <- paste0(msg, "\n", e$message)
e$message <- msg
stop(e)
}
)
if(is_loom_spec_version_3_or_greater(loom = loom)) {
# warning("Loom specification version 3 or greater detected!")
} else {
warning("Loom specification version 2 or smaller detected!")
}
return (loom)
}
#'@title convert_to_loom_v3_spec
#'@description Convert a .loom file to version 3 specification of the Loom format
#'@param loom The loom file handler.
#'@return A loom file handler
#'@export
convert_to_loom_v3_spec <- function(
loom
) {
if(is_loom_spec_version_3_or_greater(loom = loom)) {
stop("The given loom is already in Loom version 3 specification.")
}
gmd <- get_global_meta_data(loom = loom)
# Update loom spec version to 3
print("Updating to Loom v3 specification...")
if(loom$attr_exists(attr_name = GA_LOOM_SPEC_VERSION)) {
remove_global_attr(loom = loom, GA_LOOM_SPEC_VERSION)
}
# Create the global attrs group
loom$create_group("attrs")
add_global_loom_spec_version(
loom = loom,
loom.spec.version = 3
)
for(global_attr_key in list.attributes(object = loom)) {
print(paste0("Converting ", global_attr_key, " global attribute to Loom v3 specification..."))
if(global_attr_key == "MetaData") {
value <- rjson::toJSON(x = gmd)
} else {
value <- h5attr(
x = loom,
which = global_attr_key
)
}
loom$attr_delete(attr_name = global_attr_key)
add_global_attr(
loom = loom,
key = global_attr_key,
value = value
)
}
flush(loom = loom)
return (loom)
print("Done")
}
#*******************#
# Utils ----
#*******************#
#'@title check_missings
#'@description Check if some elements are missing between given vectors x an y
#'@param x A vector of elements
#'@param y Another vector of elements
#'@return A list with "result", a logical whether the two vectors are conserved
#' "x.missing.in.y", a vector of elements in x that are missing in y
#' "y.missing.in.x", a vector of elements in y that are missing in x
check_missings <- function(
x,
y,
check.both = TRUE
) {
x_diff_y <- setdiff(x = x, y = y)
if(check.both) {
y_diff_x <- setdiff(x = y, y = x)
} else {
y_diff_x <- NULL
}
return (
list(
"result"=length(x = x_diff_y)==0 && length(x = y_diff_x)==0,
"x_missing_in_y"=x_diff_y,
"y_missing_in_x"=y_diff_x)
)
}
#'@title is_order_conserved
#'@description Check if order between elements of the given vectors x an y are the same
#'@param x A vector of elements
#'@param y Another vector of elements
is_order_conserved <- function(x,y) {
return (
isTRUE(
x = all.equal(
current = x,
target = y
)
)
)
}
#'@title compress_gzb64
#'@description Open loom file and return a .loom file handler.
#'@param file.path The file path to the .loom file.
compress_gzb64 <- function(c) {
return (
base64enc::base64encode(
what = memCompress(
from = c,
type = "gzip"
)
)
)
}
#'@title decompress_gzb64
#'@description Open loom file and return a .loom file handler.
#'@param file.path The file path to the .loom file.
decompress_gzb64 <- function(gzb64c) {
return (
rawToChar(
x = memDecompress(
from = base64enc::base64decode(what = gzb64c),
type = "gzip",
asChar = F
),
multiple = F
)
)
}
#'@title hdf5_utf8_encode
#'@description Open loom file and return a .loom file handler.
#'@param value The value from which the dtype should be UTF8 encoded.
#'@param dtype The dtype of the given value.
hdf5_utf8_encode <- function(
value,
dtype
) {
if(class(x = dtype)[1] == "H5T_STRING") {
# Set to UTF-8 encoding otherwise read as byte array
dtype <- dtype$set_cset('UTF-8')
} else if(class(dtype)[1] == "H5T_COMPOUND") {
dtypes <- unique(x = sapply(X = value, FUN = class))
if(length(x = dtypes) > 1) {
stop("Adding a data.frame as column requires the values to be the same type.")
}
if(dtypes == "character") {
# Set all the columns to UTF-8 encoding otherwise read as byte array
dtype <- H5T_COMPOUND$new(
labels = colnames(x = value),
dtypes = lapply(
X = seq_along(along.with = colnames(x = value)),
FUN = function(x) {
return (
H5T_STRING$new(size = Inf)$set_cset(cset = "UTF-8"))
}
)
)
}
}
return (dtype)
}
#*******************#
# Utils ----
#*******************#
#' @title chunk_points
#' @description Generate chunk points.
#' @author mojaveazure
#' @param data.size How big is the data being chunked.
#' @param chunk.size How big should each chunk be.
#' @return A matrix where each column is a chunk, row 1 is start points, row 2 is end points.
#' @export
chunk_points <- function(
data.size,
chunk.size
) {
return(vapply(
X = 1L:ceiling(x = data.size / chunk.size),
FUN = function(i) {
return(c(
start = (chunk.size * (i - 1L)) + 1L,
end = min(chunk.size * i, data.size)
))
},
FUN.VALUE = numeric(length = 2L)
))
}
#'@title get_dtype
#'@description Get HDF5 data types.
#'@author mojaveazure
#'@param x An R object or string describing HDF5 datatype.
#'@return The corresponding HDF5 data type.
#'@import hdf5r
#'@seealso \link{hdf5r::h5types}
#'@export
get_dtype <- function(x) {
return(switch(
EXPR = class(x = x),
'numeric' = h5types$double,
'integer' = h5types$int,
'character' = H5T_STRING$new(size = Inf),
'logical' = H5T_LOGICAL$new(),
stop(paste("Unknown data type:", class(x = x)))
))
}
#'@title get_dspace
#'@description Get the HDF5 dataspace interface object given x.
#'@param x The dataspace interface name. Either scalar or simple.
#'@return The corresponding HDF5 dataspace interface object.
get_dspace <- function(x) {
dspaces <- c("scalar", "simple")
if(!(x %in% dspaces)) {
stop(paste("Wrong dspace. Choose either scalar or simple."))
}
return (
H5S$new(
type = x,
dims = NULL,
maxdims = NULL
)
)
}
#'@title is_base64_encoded
#'@description Checks whether the given value is base64 encoded.
#'@param value The value to check.
#'@return T is base64 encoded otherwise F.
is_base64_encoded <- function(value) {
return (
grepl(
pattern = BASE64_PATTERN,
x = value,
perl=TRUE
)
)
}
#'@title is_gzipped
#'@description Checks whether the given value is gzip compressed.
#'@param value The value to check.
#'@return T if gzip compressed otherwise F.
is_gzipped <- function(value) {
tryCatch({
memDecompress(
from = value,
type = "gzip",
asChar = F
)
return (T)
}, error = function(e) {
return (F)
}, finally = {
})
}
#'@title is_json
#'@description Checks whether the given value is a json character
#'@param value The value to check.
#'@return T if json string character otherwise F.
is_json <- function(value) {
tryCatch({
rjson::fromJSON(json_str = value)
return (T)
}, error = function(e) {
return (F)
}, finally = {
})
}
#*******************#
#*******************#
# DATA EXTRACTION ----
#*******************#
#*******************#
#'@title get_cell_ids
#'@description Get the cell names
#'@param loom The loom file handler.
#'@export
get_cell_ids <- function(
loom,
is.flybase.gn = F
) {
ra <- loom[["col_attrs"]]
return(ra[[CA_CELLID]][])
}
#'@title get_genes
#'@description Get the gene names either symbols or Flybase gene identifiers.
#'@param loom The loom file handler.
#'@param is.flybase.gn Whether to retrieve the Flybase gene identifiers or not.
#'@export
get_genes <- function(
loom,
is.flybase.gn = F
) {
ra <- loom[["row_attrs"]]
if(is.flybase.gn) {
return (ra[["FBgn"]])
}
return (ra[[RA_GENE_NAME]][])
}
#**********#
# Matrix ----
#**********#
#'@title get_dgem
#'@description Get the expression matrix for the given .loom.
#'@param loom The loom file handler.
#'@return The gene expression matrix (genes as rows, cells as columns).
#'@export
get_dgem <- function(
loom
) {
dgem <- t(x = loom[["matrix"]][,])
cell_ids <- get_cell_ids(loom = loom)
# Set the Cell IDs
colnames(x = dgem) <- cell_ids
genes <- get_genes(loom = loom)
row.names(x = dgem) <- genes
return (dgem)
}
#*******************#
# Embeddings ----
#*******************#
#'@title get_embedding_by_name
#'@description Get the embedding with the given embedding.name in the given .loom.
#'@param loom The loom file handler.
#'@param embedding.name The name of the given embedding to retrieve
#'@return The embedding with the given embedding.name.
#'@export
get_embedding_by_name <- function(
loom,
embedding.name
) {
return (get_embeddings(loom = loom)[[embedding.name]])
}
#'@title get_default_embedding
#'@description Get the default embedding for the given .loom.
#'@param loom The loom file handler.
#'@return The default embedding.
#'@export
get_default_embedding <- function(
loom
) {
return (loom[["col_attrs"]][[CA_EMBEDDING_NAME]][])
}
#' @title get_embeddings
#' @description Get embeddings (e.g. t-SNE, UMAP) from the given loom file
#' @param loom The loom file handler.
#' @return List with the embeddings (each of them as data.frame)
#' @examples
#' loomPath <- file.path(system.file('extdata', package='SCopeLoomR'), "example.loom")
#' loom <- open_loom(loomPath)
#'
#' embeddings <- get_embeddings(loom)
#' names(embeddings)
#' head(embeddings[[1]])
#' @export
get_embeddings <- function(
loom
){
embeddings <- list()
for(embedding in get_global_meta_data(loom = loom)$embeddings)
{
if(embedding$id == -1){
tmp <- as.matrix(x = get_default_embedding(loom = loom)[])
rownames(x = tmp) <- get_cell_ids(loom = loom)
embeddings[[embedding$name]] <- tmp
} else {
tmp <- cbind(
loom[["col_attrs"]][["Embeddings_X"]][][,as.character(embedding$id)],
loom[["col_attrs"]][["Embeddings_Y"]][][,as.character(embedding$id)]
)
rownames(x = tmp) <- get_cell_ids(loom = loom)
embeddings[[embedding$name]] <- tmp
}
}
return(embeddings)
}
#*******************#
# Clusterings ----
#*******************#
#'@title get_clustering_annotations
#'@description Get crowd annotations from each clustering of the clustering with the given clustering.name
#'@param loom The loom file handler.
#'@param clustering.name The name of the clustering.
#'@param curator_orcid (Optional) If provided, only annotations with this ORCID will be selected.
#'@param out_file_path (Optional) If provided, the output results will be saved to this file path.
#'@return A cell-based data.frame containing information about the annotations associated to each cluster of the clustering.
get_clustering_annotations <- function(
loom,
clustering_name = "Annotation",
curator_orcid = NULL,
curation_timestamp = NULL,
skip_without_timestamp = FALSE,
out_file_path = NULL
) {
md <- get_global_meta_data(loom = loom)
md_clusterings <- md$clusterings
md_clustering <- rlist::list.filter(.data = md_clusterings, name == clustering_name)
if(length(x = md_clustering) != 1) {
stop(paste0("Clustering ", clustering_name,"does not exist. list_clusterings_names() function can be used to list the existing clusterings."))
}
md_clustering <- md_clustering[[1]]
# Get the cell-cluster association for the given clustering
clusterings <- get_clusterings_with_name(loom = loom)
clustering <- clusterings[[as.character(x = md_clustering$id)]]
clustering_annotations <- NULL
is_extracting <- FALSE
for(cluster in md_clustering$clusters) {
if(length(x = cluster$cell_type_annotation) == 0) {
next
}
if(length(x = cluster$cell_type_annotation) > 1) {
stop(paste0("Extracting annotations with cluster having multiple annotations has not been implemented. Cluster ", cluster$id," (", cluster$description,") has multiple annotations."))
}
cta <- cluster$cell_type_annotation[[1]]
if(is.null(x = cta$data$annotation_label)) {
warning(paste0("Skipping cluster with ID ", cluster$id," and description '", cluster$description,"'. No annotation detected."))
next
}
curators <- rlist::list.apply(.data = cta$votes$votes_for$voters, function(x) { x$voter_id })[[1]]
# If ORCID provided, only annotations with the given ORCID will be selected
if(!is.null(x = curator_orcid) && !(curator_orcid %in% curators)) {
next
}
# If curation timestamp provided, only annotations after the given timestamp will be selected
if(!is.numeric(x = curation_timestamp)) {
stop("Expects curation_timestamp to be an integer and in milleseconds.")
}
if(cta$data$timestamp < 0 && skip_without_timestamp) {
next
}
if(!is.null(x = curator_orcid) && cta$data$timestamp > 0 && cta$data$timestamp < curation_timestamp) {
print(cta$data$timestamp)
next
}
is_extracting <- TRUE
# Make annotation data.frame for the current "cluster"
cell_ids <- get_cell_ids(loom = loom)[clustering == cluster$description]
annotations_df <- data.frame("CellID"=cell_ids)
annotations_df$AnnotationLabel <- cta$data$annotation_label
annotations_df$AnnotationFBbt <- cta$data$obo_id
annotations_df$CuratorMarkers <- paste(cta$data$markers, collapse = ", ")
annotations_df$Publication <- cta$data$publication
annotations_df$Curators <- paste0(curators, collapse = ", ")
annotations_df$Comment <- cta$data$comment
annotations_df$ClusteringID <- md_clustering$id
annotations_df$ClusteringName <- md_clustering$name
annotations_df$ClusterID <- cluster$id
# Concatenate the results
if(is.null(x = clustering_annotations)) {
clustering_annotations <- annotations_df
} else {
clustering_annotations <- rbind(clustering_annotations, annotations_df)
}
}
if(is_extracting) {
print(paste0("Extracting annotations from clustering '", clustering_name, "'"))
}
if(!is.null(x = out_file_path)) {
print(paste0("Writing output to CSV file: ", out_file_path))
write.csv(x = clustering_annotations, file = out_file_path, row.names = FALSE)
}
return(clustering_annotations)
}
#'@title get_all_clustering_annotations
#'@description Get crowd annotations from each clustering of the clustering with the given clustering.name
#'@param loom The loom file handler.
#'@param clustering.name The name of the clustering.
#'@param curator_orcid Only annotations with this ORCID will be selected.
#'@param from_date Only annotations that were added after this data (YYYY-mm-dd) will be selected. By default, it will use the current date.
#'@param skip_without_timestamp Don't keep annotations with negative timestamp (these are usually final/custom generated annotations).
#'@return A cell-based data.frame containing information about the annotations associated to each cluster of all the clusterings.
get_all_clustering_annotations <- function(
loom,
curator_orcid = NULL,
from_date = format(Sys.time(), "%Y-%m-%d"),
skip_without_timestamp = TRUE
) {
if(is.null(x = curator_orcid)) {
stop("The curator_orcid parameter is required.")
}
d <- try(as.Date(x = from_date, format="%Y-%m-%d"))
if("try-error" %in% class(x = d) || is.na(x = d)) {
print("The given data is not valid. Use YYYY-mm-dd format.")
}
date <- as.POSIXct(paste0(from_date, "00:00:00"))
# Convert date to milliseconds
curation_timestamp <- as.integer(x = date)*1000
all_clustering_annotations <- NULL
for(clustering_name in list_clusterings_names(loom = loom)) {
annotations_df <- get_clustering_annotations(
loom = loom,
clustering_name = clustering_name,
curator_orcid = curator_orcid,
curation_timestamp = curation_timestamp,
skip_without_timestamp = skip_without_timestamp
)
# Concatenate the results
if(is.null(x = all_clustering_annotations)) {
all_clustering_annotations <- annotations_df
} else {
all_clustering_annotations <- rbind(
all_clustering_annotations,
annotations_df
)
}
}
return (all_clustering_annotations)
}
#'@title get_clustering_idx_by_cluster_name
#'@description Get index of the clutering related to the given cluster.name.
#'@param loom The loom file handler.
#'@param cluster.name The name of the cluster.
#'@return The index of the clustering in the clusterings metadata global attribute corresponding to the clustering where the given cluster.name is found.
get_clustering_idx_by_cluster_name <- function(
loom,
cluster.name
) {
library(rlist)
# Get global meta data
md <- get_global_meta_data(loom = loom)
# Unlist the nested meta data tree
tmp <- list.flatten(x = md$clusterings)
# Look for the given cluster.name
idx = match(
x = cluster.name,
table = tmp
)
if(is.na(x = idx)) {
return (idx)
}
# Reverse search from idx till the first key id is found
for(i in idx:1) {
if(names(x = tmp)[i] == "id") id <- tmp[[i]]; break
}
return (
list.findi(
.data = md$clusterings,
id == id
)
)
}
#'@title get_cluster_info_by_cluster_name
#'@description Get cluster information (Clustering ID, Clustering Name, Clustering Group, Cluster ID, Cluster Name) of the given cluster.name.
#'@param loom The loom file handler.
#'@param cluster.name The name of the cluster.
#'@return The index of the clustering in the clusterings metadata global attribute corresponding to the clustering where the given cluster.name is found.
#'@export
get_cluster_info_by_cluster_name <- function(
loom,
cluster.name
) {
library(rlist)
# Get global meta data
md <- get_global_meta_data(loom = loom)
# Get the index of the clustering in the meta data clusterings
clustering.idx <- get_clustering_idx_by_cluster_name(
loom = loom,
cluster.name = cluster.name
)
if(is.na(x = clustering.idx)) {
stop(paste0("The given cluster ", cluster.name, " does not exist in this .loom."))
}
clustering.id <- md$clusterings[[clustering.idx]]$id
cluster.idx <- list.findi(
.data = md$clusterings[[clustering.idx]]$clusters,
cond = description == cluster.name
)
return (
list(
clustering.id=clustering.id,
clustering.name=md$clusterings[[clustering.idx]]$name,
clustering.group=md$clusterings[[clustering.idx]]$group,
cluster.id=md$clusterings[[clustering.idx]]$clusters[[cluster.idx]]$id,
cluster.name=md$clusterings[[clustering.idx]]$clusters[[cluster.idx]]$description
)
)
}
#'@title get_clusterings
#'@description Get clustering of the given loom.
#'@param loom The loom file handler.
#'@param cluster.name The name of the cluster.
#'@return A N-by-M data.frame containing the clusterings of the given loom. N represents the cells and M the clusterings.
#'@export
get_clusterings <- function(
loom
) {
clusterings <- get_col_attr_by_key(
loom = loom,
key = CA_CLUSTERINGS_NAME
)
row.names(x = clusterings) <- get_cell_ids(loom = loom)
return (clusterings)
}
#'@title get_global_md_clustering_by_id
#'@description Get clustering with the given clustering.id of the given loom.
#'@param loom The loom file handler.
#'@param clustering.id The ID of the clustering.
#'@return A N-by-1 vector containing the cell assignments to each of the clusters of the clustering.
#'@export
get_clustering_by_id <- function(
loom,
clustering.id
) {
ca.clusterings <- get_clusterings(loom = loom)
return (ca.clusterings[, colnames(x = ca.clusterings) %in% clustering.id])
}
#' @title get_clusterings_with_name
#' @description Get cell clusterings stored in the given loom file
#' @param loom .loom file name
#' @return data.frame containing the clusterings in the given loom (cells as rows, clustering alternatives as columns).
#' @examples
#' # clusters <- get_clusterings_with_name(loom)
#' # head(clusters)
#' @export
get_clusterings_with_name <- function(
loom
){
md_clusterings <- get_global_meta_data(loom = loom)$clusterings
clustering_ids <- sapply(X = md_clusterings, FUN = function(x) { return (x$id)})
clusterings <- get_clusterings(loom = loom)
# Filter out clusterings not present in global metadata attribute
clusterings <- clusterings[, as.character(x = clustering_ids), drop = FALSE]
for(clustering_id in clustering_ids){
md_clustering <- rlist::list.filter(.data = md_clusterings, id == clustering_id)[[1]]
cluster_levels <- sapply(
X = md_clustering$clusters,
FUN = function(cluster) {
return (
setNames(
object = cluster$description,
nm = cluster$id
)
)
}
)
clusterings[, as.character(x = clustering_id)] <- factor(
x = clusterings[, as.character(x = clustering_id)],
levels = names(x = cluster_levels),
labels = cluster_levels
)
}
return (clusterings)
}
#'@title get_cell_mask_by_cluster_name
#'@description Get a cell mask for the given cluster.name of the given loom.
#'@param loom The loom file handler.
#'@param cluster.name The name of the cluster.
#'@return A N-by-1 boolean vector specifying which cells belong to the given cluster.name in the given loom.
get_cell_mask_by_cluster_name <- function(
loom,
cluster.name
) {
# Get the cluster info given the cluster.name
cluster_info <- get_cluster_info_by_cluster_name(
loom = loom,
cluster.name = cluster.name
)
# Get the clustering related to the given cluster.name
ca_clustering <- get_global_md_clustering_by_id(
loom = loom,
clustering.id = cluster_info$clustering.id
)
# Create the mask
return (ca_clustering %in% cluster_info$cluster.id)
}
#'@title get_cluster_markers
#'@description Get a list of data.frame each containing the statistical markers for the clusters from the clustering with the given clustering.name and filtered with the given log.fc.threshold and given adj.p.value.
#'@param loom The loom file handler.
#'@param clustering.name The name/description of the clustering.
#'@param log.fc.threshold The log fold change threshold
#'@param adj.p.value.threshold The adjusted p-value threshold
#'@return A list of L data.frame of N-by-M containing the statistical markers for all the clusters of from the clusteirng with the given clustering.name. L represents the number of clusters. N represents the number of statistical markers for a given cluster.
#'@export
get_cluster_markers <- function(
loom,
clustering.id = NULL,
clustering.name = NULL,
log.fc.threshold = 1.5,
adj.p.value.threshold = 0.05,
n.signif.digits = NULL
) {
clustering <- NULL
if(!is.null(x = clustering.name)) {
# Get clustering ID
clustering <- get_global_md_clustering_by_name(
loom = loom,
clustering.name = clustering.name
)
} else {
if(is.null(x = clustering.id)) {
stop("The clustering.id or clustering.name argument is required.")
}
clustering <- get_global_md_clustering_by_id(
loom = loom,
clustering.id = clustering.id
)
}
genes <- get_row_attr_by_key(loom = loom, key = "Gene")
clustering_marker_mask <- get_row_attr_by_key(
loom = loom,
key = paste0("ClusterMarkers_", clustering$id)
)
rownames(x = clustering_marker_mask) <- genes
colnames(x = clustering_marker_mask) <- rep('is_marker', ncol(x = clustering_marker_mask))
clustering_avg_log_fc <- get_row_attr_by_key(
loom = loom,
key = paste0(RA_CLUSTERING_MARKERS_NAME, "_", clustering$id, '_avg_logFC')
)
rownames(x = clustering_avg_log_fc) <- genes
colnames(x = clustering_avg_log_fc) <- rep('avg_logFC', ncol(x = clustering_avg_log_fc))
clustering_pval <- get_row_attr_by_key(
loom = loom,
key = paste0(RA_CLUSTERING_MARKERS_NAME, "_", clustering$id, '_pval')
)
rownames(x = clustering_pval) <- genes
colnames(x = clustering_pval) <- rep('adj_pval', ncol(x = clustering_pval))
# Get cluster names
cluster_names <- get_cluster_names(clustering = clustering)
# Create list
marker_list <- lapply(
X = 1:ncol(x = clustering_avg_log_fc),
FUN = function (i) {
cbind(
clustering_marker_mask[,i, drop=FALSE],
clustering_avg_log_fc[,i, drop=FALSE],
clustering_pval[,i, drop=FALSE]
)
}
)
# Rounding
if(!is.null(x = n.signif.digits)) {
marker_list <- lapply(
X = marker_list,
FUN = function(x) {
return (
signif(
x = x,
digits = n.signif.digits
)
)
}
)
}
# Filter
marker_list <- lapply(
X = marker_list,
FUN = function(x) x <- x[which(x[,1] == TRUE),]
)
marker_list <- lapply(
X = marker_list,
FUN = function(x) x <- x[which(x[,2] > log.fc.threshold),]
)
marker_list <- lapply(
X = marker_list,
FUN = function(x) x <- x[which(x[,3] < adj.p.value.threshold),]
)
marker_list <- lapply(
X = marker_list,
FUN = function(x) x <- x[with(x, order(-x[,'avg_logFC'], x[,'adj_pval'])),]
)
names(x = marker_list) <- cluster_names
return(marker_list)
}
#'@title get_all_cluster_markers
#'@description Get cluster markers from all clusterings fromthe given loom file with filter applied by the given log.fc.threshold and adj.p.value.threshold.
#'@param loom The loom file handler.
#'@param log.fc.threshold The log fold change threshold
#'@param adj.p.value.threshold The adjusted p-value threshold
#'@param n.signif.digits Number of significant digits in the output.
#'@return A list of data.frames containing all the clusterings with their cluster markers (adj. p-value and log fold changes). The list names refer to the different clustering names.
#'@export
get_all_cluster_markers <- function(
loom,
log.fc.threshold = 1.5,
adj.p.value = 0.05,
n.signif.digits = NULL
) {
all_markers <- list()
clustering_ids <- sapply(
X = get_global_md_clusterings(loom = loom),
FUN = function(clustering) return (clustering$id)
)
for(clustering_id in clustering_ids) {
clustering_name <- get_global_md_clustering_by_id(
loom = loom,
clustering.id = clustering_id
)$name
clustering_markers <- get_cluster_markers(
loom = loom,
clustering.id = clustering_id,
log.fc.threshold = log.fc.threshold,
adj.p.value = adj.p.value,
n.signif.digits = n.signif.digits
)
all_markers[[clustering_name]] <- clustering_markers
}
return (all_markers)
}
#'@title list_clusterings_names
#'@description Get a list of all clustering names in the given loom
#'@param loom The loom file handler.
#'@return A list of all clustering names
#'@export
list_clusterings_names <- function(
loom
) {
md_clusterings <- get_global_md_clusterings(loom = loom)
return (sapply(
X = 1:length(x = md_clusterings),
FUN = function (i) md_clusterings[[i]]$name)
)
}
#'@title get_cluster_dgem_by_name
#'@description Get a subset of the digital gene expression matrix containing only the cells in the cluster annotated by the given cluster.name.
#'@param loom The loom file handler.
#'@param cluster.name The name/description of the cluster.
#'@return A N-by-M matrix containing the gene expression levels of the cells in the cluster annotated by the given cluster.name. N represents the genes and M the cells.
#'@export
get_cluster_dgem_by_name <- function(
loom,
cluster.name
) {
# Get the cell mask for the given cluster.name
mask <- get_cell_mask_by_cluster_name(
loom = loom,
cluster.name = cluster.name
)
dgem <- get_dgem(loom = loom)
return (dgem[, mask])
}
#*******************#
# Annotations ----
#*******************#
#' @title get_cell_annotation
#' @description Get cell annotations from the given loom file
#' @param loom .loom file name
#' @return data.frame containing the cell annotations in the given loom (cells as rows, annotation variables as columns).
#' @examples
#' loomPath <- file.path(system.file('extdata', package='SCopeLoomR'), "example.loom")
#' loom <- open_loom(loomPath)
#'
#' cell_annotations <- get_cell_annotation(loom)
#' head(cellAnnots)
#' @export
get_cell_annotation <- function(
loom,
annotations.columns = NULL
) {
exclude_columns <- c("CellID", "ClusterID", "Clusterings", "Embedding", "Embeddings_X", "Embeddings_Y", "RegulonsAUC")
if(is.null(x = annotations.columns)) {
annotations.columns <- names(x = loom[["col_attrs"]])[which(x = !names(x = loom[["col_attrs"]]) %in% exclude_columns)]
}
cell_annotations <- list()
for (annoation_column in annotations.columns){
cell_annotations[[annoation_column]] <- get_col_attr_by_key(
loom = loom,
key = annoation_column
)
}
cell_annotations <- data.frame(cell_annotations)
rownames(x = cell_annotations) <- get_cell_ids(loom = loom)
return (cell_annotations)
}
#*******************#
# Regulons ----
#*******************#
#' @title get_regulons_AUC
#' @description Get AUCell matrix from the given loom file
#' @param loom .loom file name
#' @param column.attr.name Type of regulon to retrieve; Usually 'MotifRegulonsAUC' for motif-based regulons, or 'TrackRegulonsAUC' for track-based (e.g. ChIP-seq) regulons.
#' (Might change according to the SCENIC/pySCENIC pipeline version and settings used).
#' @param rows Type of data stored as rows (only for informative purposes) Default: "regulons"
#' @param columns Type of data stored as columns (only for informative purposes) Default: "cells"
#' @return AUC matrix in the slot \code{loom[["col_attrs"]][["RegulonsAUC"]][]}.
#' If AUCell is installed, it will be stored as aucellResults object.
#' @examples
#' loom_file_pathath <- file.path(system.file('extdata', package='SCopeLoomR'), "example.loom")
#' loom <- open_loom(loomPath)
#'
#' regulons_AUC <- get_regulons_AUC(loom)
#' @export
get_regulons_AUC <- function(
loom,
column.attr.name="MotifRegulonsAUC",
rows="regulons",
columns="cells"
) {
if(!column.attr.name %in% names(loom[["col_attrs"]]))
{
msg <- paste("The attribute '", column.attr.name, "' is not available in this loom file.", sep='')
possible_values <- grep("egulon", names(x = loom[["col_attrs"]]), value=T)
if(length(x = possible_values)>0) {
msg <- c(
msg,
" Possible values include: ",
paste(possible_values, collapse=", "),
paste(". Try setting the 'column.attr.name' argument to one of these values (i.e., get_regulons_AUC(loom, column.attr.name='", possible_values[1], "')).",sep="")
)
}
if(length(x = possible_values) == 0) {
msg <- c(
msg,
" The loom doesn't contain regulon information."
)
}
stop(msg)
}
mtx <- loom[["col_attrs"]][[column.attr.name]][]
rownames(x = mtx) <- get_cell_ids(loom = loom)
mtx <- t(x = mtx)
names(x = dimnames(x = mtx)) <- c(rows, columns)
if("AUCell" %in% rownames(x = installed.packages())) {
require(AUCell)
mtx <- new(
Class = "aucellResults",
SummarizedExperiment::SummarizedExperiment(assays=list(AUC=mtx))
)
}
return(mtx)
}
#' @title get_regulons
#' @description Get regulons from the given loom file
#' @param loom .loom file name
#' @param column.attr.name Type of regulon to retrieve; Usually 'MotifRegulons' for motif-based regulons, or 'TrackRegulons' for track-based (e.g. ChIP-seq) regulons.
#' (Might change according to the SCENIC/pySCENIC pipeline version and settings used).
#' @param tfAsName Whether to return only the TF name (calls \code{SCENIC::getTF()}), or the regulon name as stored in the loom file.
#' @param tfSep Character used as separator for the TF name and extra values stored in the regulon name. To be passed to SCENIC::getTF()
#' @return The regulons as incidence matrix or list.
#' @examples
#' loomPath <- file.path(system.file('extdata', package='SCopeLoomR'), "example.loom")
#' loom <- open_loom(loomPath)
#'
#' regulons <- get_regulons(loom)
#' head(regulons)
#' @export
get_regulons <- function(
loom,
column.attr.name="MotifRegulons",
tf.as.name=TRUE,
tf.sep="_"
){
if(!column.attr.name %in% names(loom[["row_attrs"]]))
{
msg <- paste("The attribute '", column.attr.name, "' is not available in this loom file.", sep='')
possible_values <- grep("egulon", names(loom[["row_attrs"]]), value=T)
if(length(x = possible_values) > 0) {
msg <- c(
msg,
" Possible values include: ",
paste(possible_values, collapse=", "),
paste(". Try setting the 'column.attr.name' argument to one of these values (i.e., get_regulons(loom, column.attr.name='", possible_values[1], "')).", sep="")
)
}
if(length(x = possible_values) == 0) {
msg <- c(msg, " The loom doesn't contain regulon information.")
}
stop(msg)
}
incidence_matrix <- loom[["row_attrs"]][[column.attr.name]][] # incid mat
rownames(x = incidence_matrix) <- get_genes(loom = loom)
incidence_matrix <- t(x = incidence_matrix)
if(tf.as.name) {
rownames(x = incidence_matrix) <- SCENIC::getTF(
regulonName = rownames(x = incidence_matrix),
sep = tf.sep
)
}
return(incidence_matrix)
}
#' @title get_regulon_thresholds
#' @description Get the AUC thresholds for the regulons in the given loom file
#' @param loom .loom file name
#' @param onlySelected Whether to return only the selected threshold for each regulon (as vector),
#' or also the alternative thresholds calculated by AUCell (as list).
#' @examples
#' loomPath <- file.path(system.file('extdata', package='SCopeLoomR'), "example.loom")
#' loom <- open_loom(loomPath)
#'
#' thresholds <- get_regulon_thresholds(loom)
#' head(thresholds)
#' @export
get_regulon_thresholds <- function(
loom,
only.selected=TRUE
) {
if(only.selected) {
thresholds <- sapply(
X = get_global_meta_data(loom = loom)$regulonThresholds,
function(x) {
setNames(
object = x$regulon,
nm = x$defaultThresholdValue
)
}
)
} else{
thresholds <- sapply(
X = get_global_meta_data(loom)$regulonThresholds,
function(x) {
list(
aucThr=list(
selected=x$defaultThresholdValue,
thresholds=data.frame(x$allThresholds) # if pySCENIC: transpose?
)
)
}
)
}
return(thresholds)
}
#'@title get_cell_mask_by_cluster_name
#'@description Get a cell mask for the given cluster.name of the given loom.
#'@param loom The loom file handler.
#'@param cluster.name The name of the cluster.
#'@return A N-by-1 boolean vector specifying which cells belong to the given cluster.name in the given loom.
get_cell_mask_by_cluster_name <- function(
loom,
cluster.name
) {
# Get the cluster info given the cluster.name
cluster_info <- get_cluster_info_by_cluster_name(
loom = loom,
cluster.name = cluster.name
)
# Get the clustering related to the given cluster.name
ca.clustering <- get_clustering_by_id(
loom = loom,
clustering.id = cluster_info$clustering.id
)
# Create the mask
return (ca.clustering %in% cluster_info$cluster.id)
}
#'@title get_cluster_dgem_by_name
#'@description Get a subset of the digital gene expression matrix containing only the cells in the cluster annotated by the given cluster.name.
#'@param loom The loom file handler.
#'@param cluster.name The name/description of the cluster.
#'@return A N-by-M matrix containing the gene expression levels of the cells in the cluster annotated by the given cluster.name. N represents the genes and M the cells.
#'@export
get_cluster_dgem_by_name <- function(
loom,
cluster.name
) {
# Get the cell mask for the given cluster.name
mask <- get_cell_mask_by_cluster_name(
loom = loom,
cluster.name = cluster.name
)
dgem <- get_dgem(loom = loom)
return (dgem[, mask])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.