# ____________________________________________________________________
# Seurat.Utils.Metadata ----
# ____________________________________________________________________
# file.edit("~/GitHub/Packages/Seurat.utils/R/Seurat.Utils.Metadata.R")
# file.edit("~/GitHub/Packages/Seurat.utils/R/Seurat.utils.less.used.R")
# devtools::load_all(path = '~/GitHub/Packages/Seurat.utils');
# devtools::document("~/GitHub/Packages/Seurat.utils"); devtools::load_all("~/GitHub/Packages/Seurat.utils")
# source("~/GitHub/Packages/Seurat.utils/R/Seurat.Utils.R")
# source('~/.pack.R')
# _________________________________________________________________________________________________
# Extract and check metadata columns ______________________________ ----
# _________________________________________________________________________________________________
#' @title Add Translated Metadata to a Seurat Object
#'
#' @description
#' This function translates a specified metadata vector in a Seurat object using a named vector of old
#' and new values and adds it to the Seurat object with a specified suffix. The function also generates
#' UMAP plots for the new metadata.
#'
#' @param obj A Seurat object to be updated. Default: combined.obj.
#' @param orig.ident A character string specifying the original metadata to be translated. Default: "RNA_snn_res.0.4".
#' @param translation_as_named_vec A named vector where names are old values and values are new translations. Default: None.
#' @param new_col_name A character string specifying the name of the new metadata column. Default: "translation_as_named_vec".
# #' @param NA.as.character A logical indicating whether to convert NAs to character. Default: TRUE.
#' @param suffix A character string specifying the suffix for the new metadata column name. Default: ".".
#' @param plot A logical indicating whether to plot the UMAP for the new metadata. Default: FALSE.
#'
#' @return An updated Seurat object.
#'
#' @importFrom CodeAndRoll2 translate
#' @export
addTranslatedMetadata <- function(obj = combined.obj,
orig.ident = "RNA_snn_res.0.4",
translation_as_named_vec,
new_col_name = substitute_deparse(translation_as_named_vec),
# NA.as.character = T,
suffix = NULL,
plot = F,
...) {
# Input assertions
stopifnot(is(obj, "Seurat"),
is.character(orig.ident) && length(orig.ident) == 1,
is.character(suffix) | is.null(suffix),
"Not a named vec was provided!" = !is.null(names(translation_as_named_vec))
)
message("new_col_name: ", new_col_name)
# Translate metadata
obj@meta.data[[new_col_name]] <- new <- CodeAndRoll2::translate(
vec = as.character(obj@meta.data[[orig.ident]]),
old = names(translation_as_named_vec),
new = translation_as_named_vec
)
print(table(new, useNA = "ifany"))
if(plot) clUMAP(ident = new_col_name, obj = obj, caption = "New metadata column", ...)
return(obj)
}
# _________________________________________________________________________________________________
#' @title Get Metadata Column Names Matching Pattern
#'
#' @description Retrieves column names from an object's metadata that match a specified pattern.
#'
#' @param pattern A character string containing a regular expression to match against the
#' column names. Default: "RNA".
#' @param obj An object containing a `meta.data` slot, typically from combined datasets.
#' Default: `combined.obj`.
#'
#' @return A character vector of column names matching the pattern.
#'
#' @examples
#' # Assuming `combined.obj` is an object with a meta.data slot
#' getMetaColnames()
#'
#' @export
getMetaColnames <- function(obj = combined.obj,
pattern = "RNA") {
stopifnot(inherits(obj, "Seurat"))
# Retrieve column names matching the pattern
matchedColnames <- grep(pattern = pattern, x = colnames(obj@meta.data), value = TRUE)
# Output assertion
if (is.null(matchedColnames)) {
warning("No matching meta data!", immediate. = TRUE)
} else {
message(length(matchedColnames), " columns matching pattern '", pattern, "'.")
}
dput(matchedColnames)
invisible(matchedColnames)
}
# _________________________________________________________________________________________________
#' @title Check if a Column Exists in the Metadata of an S4 Object
#'
#' @description This function checks whether a given column exists in the meta.data of a Seurat object.
#' @param obj A Seurat object.
#' @param col_name A character string specifying the name of the column.
#'
#' @return A logical value indicating whether the column exists (TRUE) or not (FALSE).
#' @export
metaColnameExists <- function(col_name, obj = combined.obj) {
col_name %in% colnames(obj@meta.data)
}
# _________________________________________________________________________________________________
#' @title getMetadataColumn
#'
#' @description Retrieves a specified metadata column from a Seurat object and returns it as a named vector.
#' @param col A string specifying the name of the metadata column to be retrieved. Default: 'batch'.
#' @param obj A Seurat object from which the metadata column will be retrieved. Default: combined.obj.
#' @param as_numeric A logical flag indicating whether the returned values should be converted to numeric format. Default: `FALSE`. (FALSE).
#' @return A named vector containing the values from the specified metadata column. If 'as_numeric' is TRUE, the values are converted to numeric format.
#' @examples
#' \dontrun{
#' if (interactive()) {
#' # Example usage:
#' batch_metadata <- getMetadataColumn(col = "batch", obj = combined.obj, as_numeric = TRUE)
#' }
#' }
#' @export
getMetadataColumn <- function(col = "batch", obj = combined.obj, as_numeric = FALSE) {
stopifnot(col %in% colnames(obj@meta.data))
x <- df.col.2.named.vector(df = obj@meta.data, col = col)
if (as_numeric) {
as.numeric.wNames(x) + 1
} else {
x
}
}
# mmeta <- getMetadataColumn
# _________________________________________________________________________________________________
#' @title Get Unique Levels of a Seurat Object Ident Slot
#'
#' @description This function extracts the unique levels present in the 'ident' slot of a Seurat object.
#' The function throws an error if the number of levels exceeds 'max_levels'.
#' The function optionally prints the R code to recreate the 'Levels' vector using 'dput'.
#'
#' @param obj A Seurat object.
#' @param ident A character string representing the name of the slot in the Seurat object.
#' @param max_levels An integer that sets the maximum number of levels allowed. Default is 100.
#' @param dput A logical that decides whether to print the R code to recreate the 'Levels' vector. Default is TRUE.
#'
#' @return A vector of unique levels present in the 'ident' slot of the Seurat object.
#'
#' @importFrom tibble deframe
#' @import Seurat
#' @export
get_levels_seu <- function(obj, ident, max_levels = 100, dput = TRUE) {
Levels <- unique(deframe(obj[[ident]]))
stopifnot(length(Levels) < max_levels)
if (dput) {
cat("Levels <- ")
dput(Levels)
}
return(Levels)
}
#' @title Calculate Average Metadata for Seurat Object
#'
#' @description Computes specified metrics (e.g., median, mean) for given metadata features across each category
#' defined by an identity column in a Seurat object's metadata. This function allows for flexible
#' metric calculation on specified features, providing insights into the data distribution.
#'
#' @param obj A Seurat object containing metadata to be analyzed. Defaults to `combined.obj`.
#' @param meta.features A character vector specifying which metadata features to calculate metrics for.
#' Defaults to c("nFeature_RNA", "percent.ribo", "percent.mito").
#' @param ident The name of the identity column used to group the data before calculating metrics.
#' The default is the second entry from `GetNamedClusteringRuns()`.
#' @param metrics A list of named metrics to calculate for the metadata features, where names are
#' the metric names (e.g., 'median', 'mean') and values are the corresponding functions.
#' Defaults to list('median' = median, 'mean' = mean).
#' @param verbose Logical flag indicating whether to print detailed information about the metrics
#' calculation process. Defaults to TRUE.
#' @param max.categ max number of groups in ident.
#'
#' @return A list containing data frames with calculated metrics for each specified metadata feature,
#' grouped by the identity categories. Each data frame corresponds to one of the specified metrics.
#'
#' @examples
#' # Assuming `combined.obj` is a Seurat object with relevant metadata columns:
#' results <- calculateAverageMetaData(
#' obj = combined.obj,
#' meta.features = c("nFeature_RNA", "nCount_RNA"),
#' metrics = list("median" = median, "mean" = mean),
#' verbose = TRUE
#' )
#' # This will return a list with data frames containing the median and mean
#' # of "nFeature_RNA" and "percent.ribo" for each category in "ident_column_name".
#'
#' @export
calculateAverageMetaData <- function(
obj = combined.obj,
meta.features = c("nFeature_RNA", "percent.ribo", "percent.mito"),
ident = GetClusteringRuns()[1],
metrics = list("median" = median, "mean" = mean),
verbose = TRUE, max.categ = 30) {
stopifnot(
is(obj, "Seurat"),
"ident not found in object" = ident %in% colnames(obj@meta.data),
"Not all meta.features found in object" = all(meta.features %in% colnames(obj@meta.data)),
length(unique(obj@meta.data[, ident])) < max.categ
)
# Initialize list to store results
results <- list()
# Calculate metrics for each meta.feature within each ident category
for (m in names(metrics)) {
results[[m]] <- obj@meta.data |>
group_by(!!sym(ident)) |>
summarise(across(all_of(meta.features), metrics[[m]], na.rm = TRUE), .groups = "drop")
}
# Verbose output
if (verbose) {
cat("Calculated metrics:", paste(names(metrics), collapse = ", "), "\n")
cat("For features:", paste(meta.features, collapse = ", "), "\n")
cat("Based on identifier:", ident, "\n")
}
return(results)
}
# _________________________________________________________________________________________________
#' @title Calculate the Percentage of Matches per Category
#'
#' @description This function calculates the percentage of matches for specified metadata features
#' against provided match values within each category of an identifier in a Seurat object.
#'
#' @param obj A Seurat object containing the data to be analyzed. Default: combined.obj.
#' @param ident A string specifying the column in the metadata that identifies the categories.
#' Default: first element of `GetClusteringRuns()`.
#' @param meta.features A vector of strings specifying which metadata features to analyze.
#' Default: c("AAV.detected.min2", "AAV.detected").
#' @param match.values A named vector where names correspond to `meta.features` and values are
#' the strings to match against. Default: c("AAV.detected.min2" = "AAV", "AAV.detected" = "AAV").
#' @param verbose A logical value indicating whether to print detailed output. Default: `TRUE`.
#' @param max.categ The maximum number of categories allowed before stopping. Default: 30.
#'
#' @return A data frame with the category as the first column and the subsequent columns showing
#' the percentage of matches for each metadata feature.
#' @export
#'
#' @examples
#' calculatePercentageMatch(obj = combined.obj, ident = "Simple_Celltypes")
calculatePercentageMatch <- function(
obj,
ident = GetClusteringRuns()[1],
meta.features = c("AAV.detected.min2", "AAV.detected"),
match.values = c("AAV.detected.min2" = "AAV", "AAV.detected" = "AAV"), # Named vector for matches
verbose = TRUE,
max.categ = 100) {
# Check for preconditions
stopifnot(
is(obj, "Seurat"),
"ident not found in object" = ident %in% colnames(obj@meta.data),
"Not all meta.features found in object" = all(meta.features %in% colnames(obj@meta.data)),
"Too many categories" = length(unique(obj@meta.data[, ident])) < max.categ,
length(match.values) == length(meta.features), # Check if match.values has the same length as meta.features
all(names(match.values) == meta.features) # Ensure match.values has names corresponding to meta.features
)
# Initialize a data frame to store results
results <- data.frame(Category = unique(obj@meta.data[[ident]]))
# Calculate the percentage of matches for each meta.feature within each ident category
for (feature in meta.features) {
results[[paste0("pct_match_", feature)]] <- sapply(results$Category, function(cat) {
idx <- obj@meta.data[[ident]] == cat
subset_data <- obj@meta.data[idx, feature, drop = TRUE]
pct_match <- mean(subset_data == match.values[feature], na.rm = TRUE)
return(pct_match)
})
}
# Verbose output
if (verbose) {
cat("Calculated percentage of matches for values:", paste(match.values, collapse = ", "), "\n")
cat("Corresponding to features:", paste(meta.features, collapse = ", "), "\n")
cat("Based on identifier:", ident, "\n")
}
return(results)
}
# _________________________________________________________________________________________________
#' @title getMedianMetric.lsObj
#'
#' @description Get the median values of different columns in meta.data, can iterate over a list of Seurat objects.
#' @param ls.obj List of Seurat objects, Default: ls.Seurat
#' @param n.datasets lenght of list (n objects), Default: length(ls.Seurat)
#' @param mColname Metadata column name to calculate on. Default: 'percent.mito'
#' @examples
#' \dontrun{
#' if (interactive()) {
#' ls.Seurat <- getMedianMetric.lsObj(
#' ls.obj = ls.Seurat, n.datasets = length(ls.Seurat),
#' mColname = "percent.mito"
#' )
#' }
#' }
#' @export
getMedianMetric.lsObj <- function(ls.obj = ls.Seurat, n.datasets = length(ls.Seurat), mColname = "percent.mito") {
medMetric <- vec.fromNames(names(ls.obj))
for (i in 1:n.datasets) {
medMetric[i] <- median(ls.obj[[i]]@meta.data[, mColname])
}
return(medMetric)
}
# _________________________________________________________________________________________________
#' @title getCellIDs.from.meta
#'
#' @description Retrieves cell IDs from a specified metadata column of a Seurat object, where the cell ID matches a provided list of values. The matching operation uses the `%in%` operator.
#' @param ident A string specifying the name of the metadata column from which to retrieve cell IDs. Default: 'res.0.6'.
#' @param ident_values A vector of values to match in the metadata column. Default: `NA`.
#' @param obj The Seurat object from which to retrieve the cell IDs. Default: combined.obj.
#' @param inverse A boolean value indicating whether to inverse the match, i.e., retrieve cell IDs that do not match the provided list of ident_values. Default: `FALSE`.
#' @return A vector of cell IDs that match (or don't match, if `inverse = TRUE`) the provided list of values.
#' @examples
#' \dontrun{
#' if (interactive()) {
#' # Example usage:
#' getCellIDs.from.meta()
#' }
#' }
#' @export
getCellIDs.from.meta <- function(ident = GetClusteringRuns()[1],
ident_values = NA, obj = combined.obj,
inverse = FALSE) {
mdat <- obj@meta.data[, ident]
cells.pass <- mdat %in% ident_values
if (inverse) cells.pass <- !cells.pass
iprint(sum(cells.pass), "cells found.")
return(rownames(obj@meta.data)[which(cells.pass)])
}
# _________________________________________________________________________________________________
# Add new metadata ______________________________ ----
# _________________________________________________________________________________________________
#' @title Add Metadata to a Seurat object, safely with Checks
#'
#' @description Wrapper function for `AddMetaData` that includes additional checks and assertions.
#'
#' @param obj Seurat object to which metadata will be added.
#' @param metadata The metadata to be added.
#' @param col.name The name of the new metadata column.
#' @param overwrite Logical; if TRUE, overwrites the existing column.
#' @param verbose Logical; if TRUE, prints additional information.
#' @param strict ...
#'
#' @return Modified Seurat object with additional metadata.
#' @importFrom Seurat AddMetaData
#' @export
addMetaDataSafe <- function(obj, metadata, col.name, overwrite = FALSE, verbose = FALSE,
strict = TRUE) {
if (verbose) message("Running addMetaDataSafe...")
stopifnot(
is(obj, "Seurat"), is.vector(metadata), is.character(col.name), is.logical(overwrite),
"Column already exists" = ((!col.name %in% colnames(obj@meta.data)) | overwrite)
)
equal_length <- length(metadata) == ncol(obj)
iprint("strict", strict)
if (strict) stopifnot("Metadata or object too short" = equal_length)
if (!is.null(names(metadata))) {
if (verbose) {
iprint("cells in metadata:", head(names(metadata)), "...")
iprint("cells in object:", head(colnames(obj)), "...")
}
if (strict) stopifnot(names(metadata) == colnames(obj))
} else {
message("No CBCs associated with new metadata. Assuming exact match.")
if (!equal_length) stop("Not equal lenght, no CBCs")
}
if (any(is.na(names(metadata))) ) {
warning("Metadata contains NA values.", immediate. = T)
metadata_orig <- metadata
metadata <- vec.fromNames(colnames(obj), fill = NA)
cells_found <- na.omit.strip(names(metadata_orig))
metadata[cells_found] <- metadata_orig[cells_found]
}
# Perform the operation
stopif(any(is.na(names(metadata))))
obj <- Seurat::AddMetaData(object = obj, metadata = metadata, col.name = col.name )
prefix <- paste("New column", col.name)
# Check for NA or NaN values
if (all(is.na(metadata) | is.nan(metadata))) {
warning(paste(prefix, "contains only NA or NaN values."))
} else if (any(is.na(metadata) | is.nan(metadata))) {
message(paste(prefix, "contains NA or NaN values."))
}
return(obj)
}
# _________________________________________________________________________________________________
#' @title Create a Metadata Vector
#'
#' @description This function creates a metadata vector from an input vector and a Seurat object.
#' The resulting vector contains values from 'vec' for the intersecting cell names between 'vec' and 'obj'.
#' It also checks if the intersection between the cell names in 'vec' and 'obj' is more than a
#' minimum intersection size.
#' @param vec A named vector where the names represent cell IDs. This vector should have partial
#' overlap with the cells in a Seurat object.
#' @param obj A Seurat object that contains cell IDs which partially overlap with 'vec'.
#' @param fill The value to fill for non-intersecting cell names in 'obj'. Default is NA.
#' @param min.intersect The minimum number of cells to find in both 'vec' and 'obj'.
#' The function will stop if the intersection is less than this number. Default is 100.
#' @return A named vector of length equal to the number of cells in 'obj', with names from 'obj' and
#' values from 'vec' for intersecting cell names.
#' @examples
#' \dontrun{
#' create.metadata.vector(vec = my_vector, obj = my_seurat_object, min.intersect = 50)
#' }
#' @export
create.metadata.vector <- function(vec, obj = combined.obj, fill = NA,
min.intersect = min(length(vec), ncol(obj), 100)) {
stopifnot(is.vector(vec), is(obj, "Seurat"))
cells_in_obj <- colnames(obj)
cells_in_vec <- names(vec)
stopifnot(is.character(cells_in_obj), is.character(cells_in_vec))
cells_in_both <- length(intersect(cells_in_vec, cells_in_obj))
if (cells_in_both < min.intersect) {
message(
ncol(obj), " cells in obj: ", kppc(cells_in_obj[1:3]), "\n",
length(vec), " cells in vec: ", kppc(cells_in_vec[1:3]), "\n",
cells_in_both, " cells in both.\n"
)
stop("Intersection between vec and obj is less than min.intersect.")
}
cells.vec <- names(vec)
cells.obj <- colnames(obj)
cells.in.both <- intersect(cells.vec, cells.obj)
iprint(
length(cells.in.both), "cells in both;",
length(cells.vec), "cells in vec;",
length(cells.obj), "cells in obj",
"intersect, e.g.:", head(cells.in.both, 5)
)
new_assignment <- CodeAndRoll2::vec.fromNames(cells.obj, fill = fill)
new_assignment[cells.in.both] <- vec[cells.in.both]
return(new_assignment)
}
# _________________________________________________________________________________________________
#' @title addMetaFraction
#'
#' @description Add a new metadata column to a Seurat object, representing the fraction of a gene set in the transcriptome (expressed as a percentage).
#' @param obj Seurat object to which the new metadata column will be added. Default: ls.Seurat[[1]]
#' @param col.name Name of the new metadata column to be added. Default: 'percent.mito'
#' @param gene.symbol.pattern Regular expression pattern to match gene symbols. Default: c("^MT\\.|^MT-", FALSE)[1]
#' @param assay Name of the assay to be used. Default: 'RNA'
#' @param layer Name of the layer to be used. Default: 'data'
#' @param gene.set A set of gene symbols. If specified, it will be used instead of gene.symbol.pattern. Default: `FALSE`.
#' @param verbose Logical indicating whether to display detailed messages (TRUE) or not (FALSE). Default: `TRUE`.
#' @examples
#' \dontrun{
#' if (interactive()) {
#' ls.Seurat[[1]] <- addMetaFraction(col.name = "percent.mito", gene.symbol.pattern = "^MT\\.|^MT-", obj = ls.Seurat[[1]])
#' ls.Seurat[[1]] <- addMetaFraction(col.name = "percent.ribo", gene.symbol.pattern = "^RPL|^RPS", obj = ls.Seurat[[1]])
#' ls.Seurat[[1]] <- addMetaFraction(col.name = "percent.AC.GenBank", gene.symbol.pattern = "^AC[0-9]{6}\\.", obj = ls.Seurat[[1]])
#' ls.Seurat[[1]] <- addMetaFraction(col.name = "percent.AL.EMBL", gene.symbol.pattern = "^AL[0-9]{6}\\.", obj = ls.Seurat[[1]])
#' ls.Seurat[[1]] <- addMetaFraction(col.name = "percent.LINC", gene.symbol.pattern = "^LINC0", obj = ls.Seurat[[1]])
#' ls.Seurat[[1]] <- addMetaFraction(col.name = "percent.MALAT1", gene.symbol.pattern = "^MALAT1", obj = ls.Seurat[[1]])
#' colnames(ls.Seurat[[1]]@meta.data)
#' HGA_MarkerGenes <- c(
#' "ENO1", "IGFBP2", "WSB1", "DDIT4", "PGK1", "BNIP3", "FAM162A", "TPI1",
#' "VEGFA", "PDK1", "PGAM1", "IER2", "FOS", "BTG1", "EPB41L4A-AS1", "NPAS4", "HK2", "BNIP3L",
#' "JUN", "ENO2", "GAPDH", "ANKRD37", "ALDOA", "GADD45G", "TXNIP"
#' )
#' sobj <- addMetaFraction(col.name = "percent.HGA", gene.set = HGA_MarkerGenes, obj = sobj)
#' }
#' }
#' @seealso
#' \code{\link[Matrix]{colSums}}
#' @export
#' @importFrom Matrix colSums
#' @importFrom CodeAndRoll2 grepv
addMetaFraction <- function(
obj,
col.name = "percent.mito",
gene.symbol.pattern = c("^MT\\.|^MT-", FALSE)[1],
assay = "RNA",
layer = "counts",
gene.set = FALSE,
verbose = TRUE) {
#
message("Should rather use the default `Seurat::PercentageFeatureSet`")
message("Assay: ", assay)
message("Layer: ", layer)
stopifnot(
is(obj, "Seurat"),
is.character(col.name), is.character(assay), is.character(layer), is.logical(verbose),
assay %in% Assays(obj), layer %in% Layers(obj)
)
stopif(condition = isFALSE(gene.set) && isFALSE(gene.symbol.pattern), "Either gene.set OR gene.symbol.pattern has to be defined (!= FALSE).")
if (!isFALSE(gene.set) && !isFALSE(gene.symbol.pattern) && verbose) print("Both gene.set AND gene.symbol.pattern are defined. Only using gene.set.")
# if (!isFALSE(gene.set)) geneset <- check.genes(genes = gene.set, obj = obj)
total_expr <- Matrix::colSums(GetAssayData(object = obj, assay = assay, layer = layer))
all.genes <- Features(obj, assay = assay)
genes.matching <- if (!isFALSE(gene.set)) {
intersect(gene.set, all.genes)
} else {
CodeAndRoll2::grepv(pattern = gene.symbol.pattern, x = all.genes)
}
genes.expr <- GetAssayData(object = obj, assay = assay, layer = layer)[genes.matching, ]
target_expr <- if (length(genes.matching) > 1) Matrix::colSums(genes.expr) else genes.expr
iprint(length(genes.matching), "genes found, eg:", head(genes.matching, 10))
obj <- AddMetaData(object = obj, metadata = target_expr / total_expr, col.name = col.name)
colnames(obj@meta.data)
return(obj)
}
# _________________________________________________________________________________________________
#' @title Add Meta Data for Gene-Class Fractions
#'
#' @description
#' This function adds meta data for various gene-class fractions such as percent.mito, percent.ribo,
#' percent.AC.GenBank, percent.AL.EMBL, percent.LINC, percent.MALAT1, and percent.HGA to a Seurat object.
#' If the meta data already exists, a message will be displayed.
#'
#' @param obj A Seurat object to be updated. Default: None.
#' @param gene_fractions A named list containing gene symbol patterns for each meta column name.
#' Default: List of predefined gene fractions.
#' @param add_hga A logical value indicating whether to add percent.HGA meta data. Default: `TRUE`.
#'
#' @return An updated Seurat object.
#' @export
#'
#' @importFrom SeuratObject UpdateSeuratObject
#'
addGeneClassFractions <- function(obj,
gene_fractions = list(
"percent.mito" = "^MT\\.|^MT-",
"percent.ribo" = "^RPL|^RPS",
"percent.AC.GenBank" = "^AC[0-9]{6}\\.",
"percent.AL.EMBL" = "^AL[0-9]{6}\\.",
"percent.LINC" = "^LINC0",
"percent.MALAT1" = "^MALAT1"
),
add_hga = TRUE) {
message("Adding meta data for gene-class fractions, e.g., percent.mito, etc.")
for (col_name in names(gene_fractions)) {
message(col_name, "...")
if (!metaColnameExists(col_name = col_name, obj = obj)) {
gene_data <- gene_fractions[[col_name]]
message("Adding ", col_name, " to @meta.data...", gene_data)
obj <- addMetaFraction(
col.name = col_name, gene.symbol.pattern = gene_data, obj = obj,
assay = "RNA"
)
} else {
message(paste(col_name, "already present."))
}
}
if (add_hga) {
message("Adding percent.HGA to @meta.data...")
HGA_MarkerGenes <- c(
"ENO1", "IGFBP2", "WSB1", "DDIT4", "PGK1", "BNIP3", "FAM162A", "TPI1",
"VEGFA", "PDK1", "PGAM1", "IER2", "FOS", "BTG1", "EPB41L4A-AS1", "NPAS4", "HK2", "BNIP3L",
"JUN", "ENO2", "GAPDH", "ANKRD37", "ALDOA", "GADD45G", "TXNIP"
)
if (!metaColnameExists(col_name = "percent.HGA", obj = obj)) {
obj <- addMetaFraction(col.name = "percent.HGA", gene.set = HGA_MarkerGenes, obj = obj)
} else {
message("percent.HGA already present.")
}
}
return(obj)
}
# _________________________________________________________________________________________________
#' @title add.meta.tags
#'
#' @description Add metadata tags to a Seurat object dataset.
#' @param list.of.tags A list of tags to be added as metadata. Default: tags
#' @param obj A Seurat object to which the metadata tags are to be added. Default: ls.Seurat[[1]]
#' @param n The index specifying the dataset for which the tags should be applied. Default: 1
#' @examples
#' \dontrun{
#' if (interactive()) {
#' ls.Seurat[[1]] <- add.meta.tags(list.of.tags = tags, obj = ls.Seurat[[1]], n = 1)
#' }
#' }
#' @export
add.meta.tags <- function(list.of.tags = tags, obj = ls.Seurat[[1]], n = 1) { # N is the for which dataset
stopifnot(length(names(tags)) == length(tags))
nCells <- nrow(obj@meta.data)
for (i in 1:length(list.of.tags)) {
tagX <- list.of.tags[[i]]
new.meta.tag.per.cell <- rep(tagX[n], nCells)
obj <- AddMetaData(object = obj, metadata = new.meta.tag.per.cell, col.name = names(tags)[i])
}
return(obj)
}
# _________________________________________________________________________________________________
#' @title seu.add.meta.from.table
#'
#' @description Add multiple new metadata columns to a Seurat object from a table. #
#' @param obj Seurat object, Default: seu.ORC
#' @param meta Metadata data frame.
#' @param suffix A suffix added to the filename, Default: '.fromMeta'
#' @examples
#' \dontrun{
#' if (interactive()) {
#' combined.obj <- seu.add.meta.from.table()
#' }
#' }
#' @export
seu.add.meta.from.table <- function(obj = combined.obj, meta, suffix = ".fromMeta") { # Add multiple new metadata columns to a Seurat object from a table.
NotFound <- setdiff(colnames(obj), rownames(meta))
Found <- intersect(colnames(obj), rownames(meta))
if (length(NotFound)) iprint(length(NotFound), "cells were not found in meta, e.g.: ", trail(NotFound, N = 10))
mCols.new <- colnames(meta)
mCols.old <- colnames(obj@meta.data)
overlap <- intersect(mCols.new, mCols.old)
if (length(overlap)) {
iprint(length(overlap), "metadata columns already exist in the seurat object: ", overlap, ". These are tagged as: *", suffix)
colnames(meta)[overlap] <- paste0(overlap, suffix)
}
mCols.add <- colnames(meta)
obj@meta.data[Found, mCols.add] <- meta[Found, ]
return(obj)
}
# _________________________________________________________________________________________________
#' @title seu.map.and.add.new.ident.to.meta
#'
#' @description Adds a new metadata column to a Seurat object based on an identity mapping table.
#' @param obj The Seurat object to which the new metadata column will be added. Default: combined.obj.
#' @param ident.table A data frame or matrix with identity mapping data. This parameter is used to map the old identities to the new ones. Default: clusterIDs.GO.process.
#' @param orig.ident The original identities of the Seurat object. Default: Idents(obj).
#' @param metaD.colname A string specifying the name of the new metadata column. The default value is the name of the provided ident.table.
#' @return A Seurat object with the new metadata column added.
#' @examples
#' \dontrun{
#' if (interactive()) {
#' # Example usage:
#' combined.obj <- seu.map.and.add.new.ident.to.meta(
#' obj = combined.obj,
#' ident.table = clusterIDs.GO.process
#' )
#' }
#' }
#' @export
seu.map.and.add.new.ident.to.meta <- function(
obj = combined.obj, ident.table = clusterIDs.GO.process,
orig.ident = Idents(obj),
metaD.colname = substitute_deparse(ident.table)) {
# identities should match
{
Idents(obj) <- orig.ident
ident.vec <- df.col.2.named.vector(ident.table)
ident.X <- names(ident.vec)
ident.Y <- as.character(ident.vec)
ident.Seu <- gtools::mixedsort(levels(Idents(obj)))
iprint("ident.Seu: ", ident.Seu)
OnlyInIdentVec <- setdiff(ident.X, ident.Seu)
OnlyInSeuratIdents <- setdiff(ident.Seu, ident.X)
msg.IdentVec <- kollapse("Rownames of 'ident.table' have entries not found in 'Idents(obj)':",
OnlyInIdentVec, " not found in ", ident.Seu,
collapseby = " "
)
msg.Seu <- kollapse("Rownames of 'Idents(obj)' have entries not found in 'ident.table':",
OnlyInSeuratIdents, " not found in ", ident.X,
collapseby = " "
)
stopif(length(OnlyInIdentVec), message = msg.IdentVec)
stopif(length(OnlyInSeuratIdents), message = msg.Seu)
}
# identity mapping
{
new.ident <- CodeAndRoll2::translate(vec = as.character(Idents(obj)), old = ident.X, new = ident.Y)
obj@meta.data[[metaD.colname]] <- new.ident
iprint(metaD.colname, "contains the named identitites. Use Idents(combined.obj) = '...'. The names are:")
cat(paste0("\t", ident.Y, "\n"))
}
}
# _________________________________________________________________________________________________
# Replace / overwrite / remove metadata ______________________________ ----
# _________________________________________________________________________________________________
# _________________________________________________________________________________________________
#' @title fix.orig.ident
#'
#' @description Remove the string "filtered_feature_bc_matrix." from "orig.ident". Helper function.
#' @param obj Seurat object, Default: merged.obj
#' @examples
#' \dontrun{
#' if (interactive()) {
#' merged.obj$orig.ident <- fix.orig.ident(obj = merged.obj)
#' table(merged.obj$orig.ident)
#' }
#' }
#' @export
fix.orig.ident <- function(obj = merged.obj) {
fixed <- sub(obj$"orig.ident", pattern = "filtered_feature_bc_matrix.", replacement = "")
return(fixed)
}
# _________________________________________________________________________________________________
#' @title seu.RemoveMetadata
#'
#' @description Remove specified metadata columns from a Seurat object.
#' @param obj A Seurat object from which metadata columns will be removed. Default: combined.obj
#' @param cols_remove A character vector specifying metadata column names to remove. By default, it will remove all columns that do not start with "integr" or "cl.names".
#' @return A Seurat object with specified metadata columns removed.
#' @export
#' @examples
#' \dontrun{
#' combined.obj <- seu.RemoveMetadata(obj = combined.obj, cols_remove = c("column1", "column2"))
#' }
seu.RemoveMetadata <- function(
obj = combined.obj,
cols_remove = grepv(colnames(obj@meta.data), pattern = "^integr|^cl.names", perl = TRUE)) {
CNN <- colnames(obj@meta.data)
iprint("cols_remove:", cols_remove)
print("")
(cols_keep <- setdiff(CNN, cols_remove))
obj@meta.data <- obj@meta.data[, cols_keep]
iprint("meta.data colnames kept:", colnames(obj@meta.data))
return(obj)
}
# _________________________________________________________________________________________________
# Export or Transfer metadata ______________________________ ----
# _________________________________________________________________________________________________
#' @title Save Metadata from a List of Seurat Objects
#'
#' @description This function takes a list of Seurat objects, extracts their metadata, and saves it to a file with a specified suffix.
#'
#' @param ls.obj A list of Seurat objects.
#' @param suffix A character string to append to the filename when saving metadata.
#' @return Invisible list of metadata frames
#' @export
saveLsSeuratMetadata <- function(ls.obj, suffix) {
stopifnot(is.list(ls.obj)) # Check if input is a list
message(length(ls.obj), " objects")
ls.meta <- setNames(lapply(ls.obj, function(x) x@meta.data), names(ls.obj))
ncolz <- unique(sapply(ls.meta, ncol))
message(ncolz, " columns in meta.data")
if (length(ncolz) > 1) warning("Different column counts across meta.data!", immediate. = TRUE)
xsave(ls.meta, suffix = suffix)
invisible(ls.meta)
}
# _________________________________________________________________________________________________
#' @title Transfer Multiple Metadata Columns Between Two Seurat Objects
#'
#' @description Transfers specified metadata columns from one Seurat object to another,
#' with options for verbose output and overwriting existing columns. Checks for cell overlap and
#' reports percentages of matching and unique cells.
#'
#' @param from The source Seurat object from which metadata will be transferred.
#' @param to The destination Seurat object to which metadata will be added.
#' @param colnames_from Vector of names for the columns in the source object's metadata to transfer.
#' @param colnames_to Vector of names for the columns in the destination object's metadata.
#' Defaults to the same names as `colnames_from`. Must be the same length as `colnames_from` unless
#' it is the same as `colnames_from`.
#' @param overwrite Logical, indicating whether to overwrite the column in the destination object
#' if it already exists. Defaults to FALSE.
#' @param plotUMAP Logical, indicating whether to plot UMAPs of the destination object with
#' the new identity.
#' @param strict Logical, indicating whether to fail if the destination object have cells not found in the source object.
#' @param verbose Logical, indicating whether to print details about the transfer, including the
#' number and percentage of matching cells between objects, and unique cells in each object.
#' @param ... Additional arguments to be passed to `transferMetadata`.
#'
#' @return Returns the destination Seurat object (`to`) with the new metadata columns added.
#'
#' @examples
#' # Assuming `object1` and `object2` are Seurat objects, and you want to transfer
#' # metadata columns named 'patientID' and 'treatmentGroup' from `object1` to `object2`:
#' object2 <- transferMetadata(
#' from = object1, to = object2,
#' colnames_from = c("patientID", "treatmentGroup")
#' )
#'
#' @details This function is useful for merging related data from separate Seurat objects,
#' ensuring that relevant metadata is consistent across datasets. The function checks for
#' the existence of the specified columns in the source object and whether the columns
#' can be overwritten in the destination object. It also reports on cell overlap between
#' the two objects, which can be useful for understanding the relationship between datasets.
#'
#' @export
transferMetadata <- function(from, to,
colnames_from,
colnames_to = colnames_from,
overwrite = FALSE,
plotUMAP = TRUE,
strict = TRUE,
verbose = TRUE,
...) {
#
stopifnot(
is(from, "Seurat"), is(to, "Seurat"),
is.character(colnames_from), is.character(colnames_to),
all(colnames_from %in% colnames(from@meta.data)),
"Length of 'colnames_from' and 'colnames_to' must be equal" =
length(colnames_from) == length(colnames_to)
)
# Check cell overlaps
cells_in_both <- intersect(colnames(from), colnames(to))
cells_only_in_from <- setdiff(colnames(from), colnames(to))
cells_only_in_to <- setdiff(colnames(to), colnames(from))
nr.cells.both <- length(cells_in_both)
nr.cells.only.from <- length(cells_only_in_from)
nr.cells.only.to <- length(cells_only_in_to)
# Print cell overlap information _______________________________________________________
if (verbose) {
cat(
"Cells matching between objects:", nr.cells.both,
"(", sprintf("%.2f%%", nr.cells.both / length(colnames(from)) * 100), "of from and",
sprintf("%.2f%%", nr.cells.both / length(colnames(to)) * 100), "of to)\n"
)
cat(
"Cells only in obj1 (from):", length(cells_only_in_from),
"(", sprintf("%.2f%%", nr.cells.only.from / length(colnames(from)) * 100), ")\n"
)
cat(
"Cells only in obj2 (to):", nr.cells.only.to,
"(", sprintf("%.2f%%", nr.cells.only.to / length(colnames(to)) * 100), ")\n"
)
}
if (strict) {
stopifnot("There are cells ONLY present in the destination object. Cannot transfer metadata." = (nr.cells.only.to == 0))
} else {
warning("There are cells ONLY present in the destination object. Filled with NA", immediate. = TRUE)
}
if (nr.cells.only.from > 0 & verbose) warning("There are cells ONLY present in the FROM object. These will be ignored.", immediate. = TRUE)
# Transfer metadata columns _______________________________________________________
for (i in seq_along(colnames_from)) {
# Check if to-column exists in destination object OR you overwrite anyway
if (!(colnames_to[i] %in% colnames(to@meta.data)) || overwrite) {
# Check if column exists in source object
if (colnames_from[i] %in% colnames(from@meta.data)) {
# Transfer the metadata column
# to[[colnames_to[i]]] <- from[[colnames_from[i]]]
metadata_from <- getMetadataColumn(obj = from, col = colnames_from[i])
to <- addMetaDataSafe(
obj = to, col.name = colnames_to[i], metadata = metadata_from[colnames(to)],
strict = strict, verbose = verbose
)
message(sprintf("Transferred '%s' to '%s'.", colnames_from[i], colnames_to[i]))
} else {
warning(sprintf("Column '%s' not found in source object.", colnames_from[i]), immediate. = TRUE)
}
} else {
warning(sprintf(
"Column '%s' already exists in destination object. Set 'overwrite = TRUE' to overwrite.",
colnames_to[i]
), immediate. = TRUE)
}
# Plot umap _______________________________________________________
if (plotUMAP) {
metaX <- getMetadataColumn(col = colnames_to[i], obj = to)
if(is.numeric(metaX) && nr.unique(metaX) > 10) {
x <- qUMAP(obj = to, feature = colnames_to[i], suffix = "transferred.ident", ...)
} else {
x <- clUMAP(obj = to, ident = colnames_to[i], suffix = "transferred.ident", ...)
}
print(x)
}
} # for
return(to)
}
# _________________________________________________________________________________________________
# Subset metadata ______________________________ ----
# _________________________________________________________________________________________________
# _________________________________________________________________________________________________
#' @title Sample N % of a dataframe (obj@metadata), and return rownames (cell IDs).
#'
#' @description This function samples a specified percentage of a dataframe (specifically a subset
#' of the metadata of a Seurat object) and returns the corresponding cell IDs.
#' @param metaDF A dataframe representing a subset of the metadata of a Seurat object. Default:
#' Subset of 'MetaData' for which 'Pass' is TRUE.
#' @param pc The percentage of the dataframe to sample, expressed as a decimal. Default: 0.1.
#' @return A vector of sampled cell IDs.
#' @examples
#' \dontrun{
#' if (interactive()) {
#' # Example usage:
#' # Suppose 'MetaData' is a dataframe and 'Pass' is a boolean vector with the same length.
#' # The following example will sample 10% of the rows of 'MetaData' where 'Pass' is TRUE.
#' sampleNpc(metaDF = MetaData[which(Pass), ], pc = 0.1)
#' }
#' }
#' @export
sampleNpc <- function(metaDF = MetaData[which(Pass), ], pc = 0.1) {
cellIDs <- rownames(metaDF)
nr_cells <- floor(length(cellIDs) * pc)
cellIDs.keep <- sample(cellIDs, size = nr_cells, replace = FALSE)
return(cellIDs.keep)
}
# _________________________________________________________________________________________________
# Combine metadata ______________________________ ----
# _________________________________________________________________________________________________
#' @title Merge Seurat Metadata
#' @description Merges the `@metadata` from a list of Seurat objects, binds them by row, and applies optional inclusion/exclusion of columns.
#' @param ls_obj A list of Seurat objects.
#' @param include_cols A character vector of column names to include (default NULL for all columns).
#' @param exclude_cols A character vector of column names to exclude (default NULL for no exclusions).
#' @return A merged dataframe of metadata from all Seurat objects.
#' @importFrom dplyr bind_rows select
#' @export
merge_seurat_metadata <- function(ls_obj, include_cols = NULL, exclude_cols = NULL) {
# Assert that ls_obj is a list and all elements are Seurat objects
stopifnot(
is.list(ls_obj), length(ls_obj) > 0,
all(sapply(ls_obj, function(x) inherits(x, "Seurat")))
) # Ensure all are Seurat objects
# Extract metadata from each Seurat object
metadata_list <- lapply(ls_obj, function(seurat_obj) seurat_obj@meta.data)
# Assert that the number of columns are the same in all metadata
col_counts <- sapply(metadata_list, ncol)
stopifnot(length(unique(col_counts)) == 1) # Ensure all metadata have the same number of columns
# Assert that rownames (cell names) are unique across all metadata
cell_names <- unlist(lapply(metadata_list, rownames))
stopifnot(length(unique(cell_names)) == length(cell_names)) # Ensure all cell names are unique
# Optionally select columns to include or exclude
if (!is.null(include_cols)) {
metadata_list <- lapply(metadata_list, function(md) dplyr::select(md, all_of(include_cols)))
} else if (!is.null(exclude_cols)) {
metadata_list <- lapply(metadata_list, function(md) dplyr::select(md, -all_of(exclude_cols)))
}
# Bind the metadata by row
merged_metadata <- dplyr::bind_rows(metadata_list)
# Message all column names to console
message("Columns: ", kppc(colnames(merged_metadata)), "\n")
message(length(merged_metadata), " merged columns and ", nrow(merged_metadata), " cells from ", length(ls_obj), " objects.")
return(merged_metadata)
}
# Example usage:
# merged_metadata <- merge_seurat_metadata(ls_obj, include_cols = c("nFeature_RNA", "nCount_RNA"))
#' @title Combine Metadata from a list of Seurat objects and Write to TSV
#'
#' @description
#' Formerly `writeMetadataToTsv`. `writeCombinedMetadataToTsvFromLsObj` takes a list of ls.Obj, extracts their `@meta.data` slots,
#' removes specified columns, checks for column consistency, creates a barplot showing the number
#' of rows per object, and finally merges these into one large data frame.
#'
#' @param ls.Obj A list of objects, each containing a `@meta.data` slot.
#' @param cols.remove A character vector of column names to be removed from each metadata data frame.
#' Default is an empty character vector, meaning no columns will be removed.
#' @param save_as_qs A logical indicating whether to save the merged metadata as a .qs object.
#' @param save_as_tsv A logical indicating whether to save the merged metadata as a .tsv file.
#' @param ... Additional arguments to be passed to `write.table`.
#'
#' @details
#' The function starts by validating the input to ensure it's a list. It then extracts the `@meta.data`
#' from each object, removing the specified columns. It checks if all data frames have the same columns
#' and issues a warning if not. A barplot is created to visualize the number of rows (cells) per object.
#' Finally, it merges all the metadata into one large data frame and prints its dimensions.
#'
#' @return A large data frame that is the row-wise merge of all `@meta.data` data frames.
#'
#' @examples
#' # Assuming a list of Seurat objects with meta.data
#' mergedMetaData <- writeMetadataToTsv(seuratObjectsList, cols.remove = c("column1", "column2"))
#'
#' @note
#' This function is intended for use with S4 objects that have a `@meta.data` slot.
#' The function currently contains a `browser()` call for debugging purposes, which should be removed in production.
#'
#' @export
writeCombinedMetadataToTsvFromLsObj <- function(ls.Obj, cols.remove = character(),
save_as_qs = TRUE, save_as_tsv = TRUE, ...) {
warning("writeMetadataToTsv is EXPERIMENTAL. It writes out subset of columns", immediate. = TRUE)
stopifnot(is.list(ls.Obj)) # Validate that input is a list
# Extract metadata from each object and remove specified columns
metadataList <- lapply(ls.Obj, function(obj) {
stopifnot("meta.data" %in% slotNames(obj)) # Check for meta.data slot
metaData <- obj@meta.data
metaData[, !(names(metaData) %in% cols.remove)]
})
# Find common columns and subset
commonCols <- CodeAndRoll2::intersect.ls(lapply(metadataList, names))
metadataList <- lapply(metadataList, function(df) df[, commonCols, drop = FALSE])
# Check if qbarplot is available and create a barplot showing the number of rows per object
metadata.cells.per.obj <- sapply(metadataList, nrow)
print(metadata.cells.per.obj)
pobj <- ggExpress::qbarplot(metadata.cells.per.obj,
label = metadata.cells.per.obj, ylab = "cells",
save = FALSE
)
print(pobj)
# Merge metadata into one big data frame
mergedMetaData <- do.call(rbind, metadataList)
# Print dimensions of the merged data frame
print(dim(mergedMetaData))
if (save_as_qs) xsave(mergedMetaData)
if (save_as_tsv) ReadWriter::write.simple.tsv(mergedMetaData, ...)
# Return the merged data frame
invisible(mergedMetaData)
}
# _________________________________________________________________________________________________
# Plot metadata ______________________________ ----
# _________________________________________________________________________________________________
#' @title Plot Metadata Correlation Heatmap
#'
#' @description This function plots a heatmap of metadata correlation values. It accepts a Seurat object
#' and a set of metadata columns to correlate. The correlations are calculated using either Pearson
#' or Spearman methods, and the resulting heatmap can include the principal component (PCA) values
#' and be saved with a specific suffix.
#'
#' @param columns A vector of metadata column names to calculate correlations.
#' Default: c("nCount_RNA", "nFeature_RNA", "percent.mito", "percent.ribo").
#' @param obj The main Seurat object used for calculations. No default value.
#' @param cormethod The method to calculate correlations. Can be either "pearson" or "spearman". Default: "pearson".
#' @param main The main title for the plot. Default: "Metadata correlations" followed by the correlation method.
#' @param show_numbers Logical, determines if correlation values should be displayed on the plot. Default: `FALSE`.
#' @param digits The number of decimal places for displayed correlation values. Default: 1.
#' @param suffix A suffix added to the output filename. Default: NULL.
#' @param add_PCA Logical, determines if PCA values should be included in the correlation calculation. Default: `TRUE`.
#' @param n_PCs The number of PCA components to be included if 'add_PCA' is TRUE. Default: 8.
#' @param w The width of the plot in inches. Default: ceiling((length(columns)+n_PCs)/2).
#' @param h The height of the plot in inches. Default: the value of w.
#' @param use_ggcorrplot Logical, determines if the ggcorrplot package should be used for plotting. Default: `FALSE`.
#' @param n_cutree The number of clusters to be used in hierarchical clustering. Default: the number of PCs.
#' @param ... Additional parameters passed to the internally called ggcorrplot function.
#'
#' @seealso
#' \code{\link[ggcorrplot]{ggcorrplot}}
#' @importFrom ggcorrplot ggcorrplot
#' @importFrom pheatmap pheatmap
#' @export
plotMetadataCorHeatmap <- function(
columns = c("nCount_RNA", "nFeature_RNA", "percent.mito", "percent.ribo"),
obj,
cormethod = c("pearson", "spearman")[1],
main = paste("Metadata", cormethod, "correlations"),
show_numbers = FALSE,
digits = 1,
suffix = NULL,
add_PCA = TRUE,
n_PCs = 8,
w = ceiling((length(columns) + n_PCs) / 2), h = w,
use_ggcorrplot = FALSE,
n_cutree = (n_PCs),
...) {
meta.data <- obj@meta.data
columns.found <- intersect(colnames(meta.data), columns)
columns.not.found <- setdiff(columns, colnames(meta.data))
if (length(columns.not.found)) iprint("columns.not.found:", columns.not.found)
meta.data <- meta.data[, columns.found]
if (add_PCA) {
stopif(is.null(obj@reductions$"pca"), "PCA not found in @reductions.")
main <- paste("Metadata and PC", cormethod, "correlations")
suffix <- FixPlotName(suffix, "w.PCA")
PCs <- obj@reductions$pca@cell.embeddings
stopifnot(nrow(meta.data) == nrow(PCs))
meta.data <- cbind(PCs[, 1:n_PCs], meta.data)
}
corX <- cor(meta.data, method = cormethod)
if (use_ggcorrplot) {
pl <- ggcorrplot::ggcorrplot(corX,
title = main,
hc.order = TRUE,
digits = digits,
lab = show_numbers,
type = "full",
...
)
ggExpress::qqSave(pl, fname = FixPlotName(make.names(main), suffix, "pdf"), w = w, h = h)
} else {
pl <- pheatmap::pheatmap(corX,
main = main, treeheight_row = 2, treeheight_col = 2,
cutree_rows = n_cutree, cutree_cols = n_cutree
)
wplot_save_pheatmap(
x = pl, width = w,
plotname = FixPlotName(make.names(main), suffix, "pdf")
)
}
pl
}
# _________________________________________________________________________________________________
#' @title Calculate and plot heatmap of cluster medians
#'
#' @description This function calculates the median of specified variables in a dataframe,
#' grouped by a column ('ident'). The function also provides an option to scale the medians,
#' subset the ident levels, and either return a matrix of median values or plot a heatmap.
#'
#' @param meta A dataframe containing metadata from a Seurat object.
#' @param ident A character string representing the column name to be used for grouping the data.
#' @param subset_ident_levels An optional vector of ident levels to subset. Default is FALSE.
#' @param variables A character vector containing the names of columns for which to calculate the median.
#' @param scale A logical indicating whether to scale the medians. Default is TRUE.
#' @param suffix A character string added to the plot file name if not returning a matrix. Default is NULL.
#' @param return_matrix A logical indicating whether to return a matrix of medians, or plot a heatmap. Default is FALSE.
#' @param plotname A character string representing the main title for the plot. Default is "Median metadata values by cluster".
#' @param n_cut_row The number of row rows to cut the tree into on the `pheatmap`. Default is NA (none).
#' @param n_cut_col The number of column clusters to cut the tree into on the `pheatmap`. Default is NA (none)
#' @param w The width of the plot (if not returning a matrix). Default is half the number of variables.
#' @param ... Additional parameters passed to the pheatmap function.
#'
#' @return If 'return_matrix' is TRUE, a matrix where rows correspond to the unique values of 'ident',
#' and columns correspond to 'variables'. Each element of the matrix represents the median of a specific
#' variable for a specific group. If 'return_matrix' is FALSE, it saves a heatmap plot and returns the plot object.
#'
#' @importFrom dplyr group_by summarize_at
#' @importFrom ReadWriter FirstCol2RowNames
#' @importFrom pheatmap pheatmap
#' @import tidyverse
#' @export
heatmap_calc_clust_median <- function(
meta, ident, subset_ident_levels = FALSE,
variables, scale = TRUE,
suffix = NULL,
return_matrix = FALSE,
plotname = "Median metadata values by cluster",
n_cut_row = NA,
n_cut_col = NA,
w = ceiling(length(variables) / 2),
...) {
# Ensure that 'meta' is a dataframe, 'ident' is a column in 'meta', and 'variables' are columns in 'meta'
stopifnot(
is.data.frame(meta),
is.character(ident),
is.character(variables),
ident %in% colnames(meta),
all(variables %in% colnames(meta))
)
# Group by 'ident' and calculate median for each variable
df_cluster_medians <- meta |>
group_by(meta[[ident]]) |>
summarize_at(vars(variables), median, na.rm = TRUE)
df_cluster_medians <- ReadWriter::column.2.row.names(df_cluster_medians)
if (!isFALSE(subset_ident_levels)) {
stopifnot(all(subset_ident_levels %in% rownames(df_cluster_medians)))
suffix <- FixPlotName(suffix, "subset")
df_cluster_medians <- df_cluster_medians[subset_ident_levels, ]
}
if (scale) {
df_cluster_medians <- scale(df_cluster_medians)
suffix <- sppp(suffix, "scaled")
}
if (return_matrix) {
return(df_cluster_medians)
} else {
plot_name <- FixPlotName(plotname, suffix)
pl <- pheatmap::pheatmap(df_cluster_medians,
main = plot_name,
cutree_rows = n_cut_row,
cutree_cols = n_cut_col,
...
)
wplot_save_pheatmap(
x = pl, width = w,
plotname = FixPlotName(make.names(plot_name), suffix, "pdf")
)
# Now plot correlation heatmap between the identites
corX <- cor(t(df_cluster_medians), method = "spearman")
pl <- pheatmap::pheatmap(corX,
main = paste0("Correlation between ", plot_name),
treeheight_row = 2, treeheight_col = 2,
# cluster_cols = FALSE, cluster_rows = FALSE,
cutree_rows = n_cut_row, cutree_cols = n_cut_col
)
wplot_save_pheatmap(
x = pl, width = w,
plotname = FixPlotName(make.names(plot_name), suffix, "correlation.pdf")
)
}
}
# _________________________________________________________________________________________________
#' @title plotMetadataMedianFractionBarplot
#'
#' @description Generates a barplot of metadata median values.
#' @param columns A vector of column names to consider for the barplot. Default: c("percent.mito", "percent.ribo").
#' @param suffix A suffix added to the output filename. Default: NULL.
#' @param group.by The variable to group by for calculations. Default: Second result of GetClusteringRuns(obj).
#' @param method Method used for calculations, either "median" or "mean". Default: "median".
#' @param min.thr Minimum threshold percentage for a cluster. Default: 2.5.
#' @param return.matrix Logical; if TRUE, returns a matrix. Default: `FALSE`.
#' @param main Main title for the plot. Default: "read fractions per transcript class and cluster" followed by the method and suffix.
#' @param ylab Label for the y-axis. Default: "Fraction of transcriptome (%)".
#' @param percentify Logical. If TRUE, multiplies the fraction by 100. Default: `TRUE`.
#' @param subt Subtitle for the plot. Default: NULL.
#' @param position Position adjustment for geoms. Default: position_stack().
#' @param w The width of the plot. Default: 10.
#' @param h The height of the plot. Default: 6.
#' @param obj The main Seurat object used for calculations. Default: combined.obj.
#' @param ... Additional parameters passed to the internally called functions.
#' @seealso
#' \code{\link[dplyr]{summarise_all}}
#' \code{\link[reshape2]{melt}}
#' @export plotMetadataMedianFractionBarplot
#' @importFrom dplyr summarize_all
#' @importFrom reshape2 melt
plotMetadataMedianFractionBarplot <- function(
columns = c("percent.mito", "percent.ribo"),
suffix = NULL,
group.by = GetClusteringRuns(obj = obj)[2],
method = c("median", "mean")[1],
min.thr = 2.5 # At least this many percent in at least 1 cluster
, return.matrix = FALSE,
main = paste(method, "read fractions per transcript class and cluster", suffix),
ylab = "Fraction of transcriptome (%)",
percentify = TRUE,
subt = NULL,
position = position_stack(),
w = 10, h = 6,
obj = combined.obj,
...) {
meta.data <- obj@meta.data
stopifnot(group.by %in% colnames(meta.data))
columns.found <- intersect(colnames(meta.data), c(group.by, columns))
(mat.cluster.medians1 <- meta.data[, columns.found] |>
group_by_at(group.by) |>
dplyr::summarize_all(median)
)
if (min.thr > 0) {
pass.cols <- colMax(mat.cluster.medians1[, -1]) > (min.thr / 100)
cols.OK <- which_names(pass.cols)
cols.FAIL <- which_names(!pass.cols)
subt <- paste(length(cols.FAIL), "classed do not reach", min.thr, "% :", kpps(cols.FAIL))
iprint(subt)
mat.cluster.medians1 <- mat.cluster.medians1[, c(group.by, cols.OK)]
}
mat.cluster.medians <- mat.cluster.medians1 |>
reshape2::melt(id.vars = c(group.by), value.name = "Fraction")
if (percentify) mat.cluster.medians$"Fraction" <- 100 * mat.cluster.medians$"Fraction"
pl <- ggbarplot(mat.cluster.medians,
x = group.by, y = "Fraction", fill = "variable",
position = position,
title = main, subtitle = subt, ylab = ylab
)
ggExpress::qqSave(pl, fname = ppp(make.names(main), "pdf"), w = w, h = h)
pl
if (return.matrix) mat.cluster.medians1 else pl
}
# _________________________________________________________________________________________________
#' @title Plot Metadata Category Pie Chart
#'
#' @description Generates a pie chart visualizing the distribution of categories within a specified
#' metadata column of a Seurat object.
#'
#' @param metacol The metadata column to visualize.
#' @param plot_name Name of the plot to generate.
#' @param obj Seurat object containing the metadata. Default: `combined.obj`.
#' @param max.categs The maximum number of categories to display in the pie chart.
#' If the number of categories exceeds this value, an error is thrown.
#' @param both_pc_and_value If `TRUE`, labels on the pie chart will show both the percentage
#' and the count of each category. If `FALSE`, only the percentage is shown.
#' @param subtitle Optional subtitle for the pie chart.
#' @param labels Optional labels for the pie chart.
#'
#' @param ... Additional arguments to pass to the pie chart plotting function.
#'
#' @examples
#' \dontrun{
#' plotMetadataCategPie(
#' metacol = "Singlet.status",
#' plot_name = "Singlet Status Distribution",
#' obj = combined.obj,
#' max.categs = 20,
#' both_pc_and_value = TRUE
#' )
#' }
#'
#' @return A pie chart visualizing the distribution of categories within the specified metadata column.
#' @export
plotMetadataCategPie <- function(
metacol = "Singlet.status",
plot_name = paste(metacol, "distribution"),
obj = combined.obj,
max.categs = 20,
both_pc_and_value = TRUE,
subtitle = NULL,
labels = NULL,
LegendSide = FALSE,
...) {
#
categ_pivot <- table(obj[[metacol]])
stopifnot(length(categ_pivot) < max.categs)
qpie(categ_pivot,
plotname = FixPlotName(make.names(plot_name)),
both_pc_and_value = both_pc_and_value,
LegendSide = LegendSide, labels = labels,
LegendTitle = "", subtitle = subtitle,
...
)
}
# _________________________________________________________________________________________________
# Label / identity transfer across objects ______________________________ ----
# _________________________________________________________________________________________________
#' @title Rename Azimuth Columns in Seurat Object
#'
#' @description Dynamically renames specified metadata columns in a Seurat object, particularly those
#' prefixed with "predicted." and the "mapping.score" column, by applying a new prefix
#' that combines a user-defined prefix and a reference name.
#'
#' @param obj A Seurat object containing metadata in `meta.data` that needs column names to be renamed.
#' @param ref A character string specifying the reference; defaults to "humancortexref".
#' The "ref" part of the string will be removed in the new column names.
#' @param prefix A character string to prefix to the column names, defaulting to "azi".
#' This prefix is combined with the modified `ref` to form the new column names.
#' @param azim_cols Azimuth columns
#' @return Returns the Seurat object with renamed metadata columns.
#'
#' @examples
#' # Assuming `obj` is a Seurat object with metadata columns following the "predicted." pattern:
#' obj <- renameAzimuthColumns(obj, ref = "humancortexref", prefix = "azi")
#' # This will rename columns like "predicted.class" to "azi.humancortex.class"
#' # and include "mapping.score" as "azi.humancortex.mapping.score"
#'
#' @export
renameAzimuthColumns <- function(obj, ref = c("humancortexref", "fetusref")[1],
prefix = "azi",
azim_cols = CodeAndRoll2::grepv(
x = tail(colnames(obj@meta.data), 10),
pattern = "predicted."
)) {
stopifnot(
"obj must be a Seurat object" = is(obj, "Seurat"),
"azim_cols must be non-empty" = length(azim_cols) > 0
)
ref <- sub(pattern = "ref", replacement = "", x = ref)
iprint(length(azim_cols), "azim_cols:", azim_cols)
# Extract the column names of meta.data
meta_col_names <- colnames(obj@meta.data)
# Loop through the azim_cols and replace the prefix if they exist in meta.data
for (azim_col in azim_cols) {
if (azim_col %in% meta_col_names) {
# Create the new column name by replacing "predicted." with the new prefix
new_col_name <- sub(pattern = "^predicted\\.", replacement = kpp(prefix, ref, ""), x = azim_col)
names(obj@meta.data)[names(obj@meta.data) == azim_col] <- new_col_name
}
}
if ("mapping.score" %in% colnames(obj@meta.data)) {
names(obj@meta.data)[names(obj@meta.data) == "mapping.score"] <- kpp(prefix, ref, "mapping.score")
}
print(tail(colnames(obj@meta.data), 10))
return(obj)
}
# _________________________________________________________________________________________________
#' @title Rename Small Categories in Seurat Object Metadata
#'
#' @description This function renames categories within a specified identity column of a
#' Seurat object's metadata that have fewer cells than a specified minimum threshold.
#' Categories below this threshold are renamed to a common name, typically "unclear",
#' to clean up small, potentially noisy categories.
#' @param obj A Seurat object containing the metadata with categories to be cleaned.
#' @param idents A character vector specifying the names of the identity columns within
#' `obj@meta.data` where categories are to be renamed.
#' @param min.cells An integer specifying the minimum number of cells a category must have
#' to retain its original name. Categories with fewer cells than this threshold will be
#' renamed. Defaults to the greater of the total number of columns divided by 2000 or 10.
#' @param new.name A character string specifying the new name to assign to small categories.
#' Defaults to "unclear".
#'
#' @return Returns the Seurat object with renamed categories in the specified metadata columns.
#'
#' @examples
#' # Assuming obj is a Seurat object with identity columns "ident1" and "ident2":
#' idents <- c("ident1", "ident2")
#' obj <- renameSmallCategories(obj, idents = idents)
#'
#' @export
renameSmallCategories <- function(
obj,
idents = c("predicted.class", "predicted.cluster", "predicted.subclass"),
min.cells = max(round((ncol(obj)) / 2000), 10),
new.name = "unclear") {
stopifnot("obj must be a Seurat object" = is(obj, "Seurat"))
for (ident in idents) {
if (ident %in% colnames(obj@meta.data)) {
# Count the number of cells per category in the specified identity column
category_counts <- table(obj@meta.data[[ident]])
# Identify categories with fewer cells than min.cells
small_categories <- names(category_counts[category_counts < min.cells])
# Initial number of categories
initial_categories <- length(unique(obj@meta.data[[ident]]))
# Rename the categories in the ident column that have fewer cells than min.cells to new.name
obj@meta.data[[ident]] <- ifelse(obj@meta.data[[ident]] %in% small_categories, new.name, obj@meta.data[[ident]])
# Report to console
cells_renamed <- sum(obj@meta.data[[ident]] == new.name)
categories_removed <- length(small_categories)
remaining_categories <- length(unique(obj@meta.data[[ident]]))
message(
"For ident '", ident, "':\n",
cells_renamed, " cells were renamed.\n",
remaining_categories, " of initial ", initial_categories, " categories remained.\n\n",
"Removed categories: ", paste(head(small_categories, 10), collapse = ", "), "\n"
)
} else {
message("Ident column '", ident, "' does not exist in obj@meta.data.")
}
}
return(obj)
}
# _________________________________________________________________________________________________
#' @title Transfer labels from a reference Seurat object to a query Seurat object
#'
#' @description Function to transfer labels from a reference Seurat object to a query Seurat object
#' using anchoring and transfer data methods from the Seurat package. It then visualizes the
#' reference and the combined objects using Uniform Manifold Approximation and Projection (UMAP).
#'
#' @param query_obj A Seurat object for which the labels are to be transferred.
#' @param reference_obj Alternative to `reference_path`. If provided, the path is not used to load
#' the reference data.
#' @param reference_path A character string indicating the file path to the reference Seurat object.
#' The path must exist.
#' @param reference_ident A character string specifying the name of the identity class to be used
#' from the reference Seurat object. Default is 'RNA_snn_res.0.3.ordered.ManualNames'.
#' @param anchors A list of anchors obtained from the FindTransferAnchors function. If NULL, the
#' @param new_ident A character string specifying the name of the new identity class to be
#' created in the query Seurat object. Default is obtained by replacing 'ordered' with
#' 'transferred' in reference_ident.
#' @param predictions_col A character string specifying the column in the metadata of the transferred
#' Seurat object containing the transferred labels. Default is 'predicted.id'.
#' @param predictions_score A character string specifying the column in the metadata of the transferred
#' Seurat object containing the scores of the transferred labels. Default is 'transferred.score'.
#' @param save_anchors A logical indicating whether to save the anchors as an RDS file. Default is TRUE.
#' @param reference_suffix A character string to be used as in the subtitle of the reference UMAP plot.
#' Default is 'REFERENCE.obj'.
#' @param plot_suffix A character string to be added to the UMAP with the new identity. Default is NULL.
#' @param plot_reference A logical indicating whether to plot the reference UMAP. Default is TRUE.
#' @param h Height for the saved image. Default: `12`
#' @param w Width for the saved image. Default: `9`
#' @param ... Additional arguments passed to the `Seurat.utils::clUMAP` function.
#'
#' @return The modified query Seurat object with the transferred labels as a new identity class.
#'
#' @examples
#' # combined.objX <- transferLabelsSeurat(reference_ident = 'RNA_snn_res.0.3.ordered.ManualNames',
#' # reference_obj = reference_obj,
#' # query_obj = combined.obj)
#'
#' @importFrom readr read_rds
#' @importFrom Seurat FindTransferAnchors TransferData AddMetaData
#'
#' @export
transferLabelsSeurat <- function(
query_obj,
reference_obj,
reference_path = NULL,
reference_ident,
anchors = NULL,
new_ident = gsub(
pattern = "ordered",
replacement = "transferred",
x = reference_ident
),
predictions_col = "predicted.id",
predictions_score = sppp(new_ident, "score"),
save_anchors = TRUE,
reference_suffix = "REFERENCE.obj",
plot_suffix = NULL,
plot_reference = TRUE,
w = 12, h = 9,
...) {
#
if (is.null(reference_obj)) {
iprint("Loading reference object:", basename(reference_path))
stopifnot(file.exists(reference_path))
reference_obj <- readr::read_rds(reference_path)
} else {
stopifnot(
inherits(reference_obj, "Seurat"),
min(dim(reference_obj)) > 10,
reference_ident %in% colnames(reference_obj@meta.data)
)
}
# Report
nr.cl.ref <- CodeAndRoll2::nr.unique(reference_obj[[reference_ident]])
message("reference_ident ", reference_ident, " has ", nr.cl.ref, " categories")
# Visualize reference object
if (plot_reference) {
clUMAP(
obj = reference_obj, ident = reference_ident, label = FALSE,
suffix = reference_suffix, sub = reference_suffix,
w = w, h = h, ...
)
}
if (is.null(anchors)) {
message("Calculating anchors. Provide anchors in 'anchors' to speed up.")
tictoc::tic()
anchors <- Seurat::FindTransferAnchors(reference = reference_obj, query = query_obj)
if (save_anchors) xsave(obj = anchors)
tictoc::toc()
} else {
message("Anchors provided")
}
message("Transferring labels")
tictoc::tic()
transferred_clIDs <- Seurat::TransferData(
anchorset = anchors,
refdata = reference_obj@meta.data[, reference_ident],
)
tictoc::toc()
# Add New Labels to query object
query_obj <- Seurat::AddMetaData(
object = query_obj, metadata = transferred_clIDs[, predictions_col],
col.name = new_ident
)
# Add Labels assignment scores to query object
query_obj <- Seurat::AddMetaData(
object = query_obj, metadata = transferred_clIDs[, "prediction.score.max"],
col.name = predictions_score
)
qSeuViolin(
feature = ppp(new_ident, "score"), ident = new_ident,
sub = Seurat.utils:::.parseBasicObjStats(query_obj),
pt.size = 0.0, obj = query_obj
)
# Visualize combined object
clUMAP(
ident = new_ident, obj = query_obj, suffix = plot_suffix, label = FALSE,
w = w, h = h, ...
)
qUMAP(
feature = predictions_score, obj = query_obj, suffix = plot_suffix, label = FALSE,
w = w, h = h, ...
)
return(query_obj)
}
# _________________________________________________________________________________________________
#' @title Extract meta.data Column Names Matching a Pattern
#'
#' @param obj A dataframe from which to extract column names.
#' @param pattern A regular expression pattern to match column names against.
#'
#' @return A character vector of column names matching the pattern.
#'
#' @examples
#' # Assuming 'df' is a dataframe with column names "azi.one", "azi.two", "other"
#' extract_matching_columns(df, "^azi\\.")
.metaColnames <- function(obj = combined.obj, pattern, perl = TRUE, ...) {
colz <- grep(pattern, colnames(obj@meta.data), value = TRUE, perl = perl, ...)
dput(colz)
return(colz)
}
# _________________________________________________________________________________________________
#' @title Match and Translate Best Identity
#'
#' @description This function matches the best identity from `ident_to_rename` to `reference_ident` in an object,
#' in other words, it replaces original categories with the most frequent ones from the reference,
#' hence helps to filter out less important categories.
#'
#' @param obj The object to update. This object must have a `meta.data` attribute which is a data frame
#' containing columns named as `ident_to_rename` and `reference_ident`.
#' @param ident_to_rename A string. The name of the column in `obj@meta.data` that is used as the source of identities.
#' There is no default value for this parameter.
#' @param reference_ident A string. The name of the column in `obj@meta.data` that is used as the target of identities.
#' There is no default value for this parameter.
#' @param prefix A string to add to the new identity column name. Default is prefix = Reference.
#' @param suffix ...
#' @param new_ident_name A string. The name for the newly created identity column in `obj@meta.data`.
#' Default is a concatenation: kpp(prefix, ident_to_rename, "match.to", reference_ident) .
#' @param plot_suffix A string. The suffix to add to the final UMAP.
#' @param barplot_match Draw a barplot to show the % match between the original and new identities.
#' Default: TRUE
#' @param barplot_fractions Draw a barplot to show the % of each category in the original and new
#' identities, using `scBarplot.CellFractions()`. Default: TRUE
#' @param h Height for the saved image. Default: 12
#' @param w Width for the saved image. Default: 9
#' @param ... Additional parameters to be passed to `.replace_by_most_frequent_categories` function.
#'
#' @return An updated version of `obj` with an additional column in `obj@meta.data` named as `new_ident_name`
#' representing the new identity. The function also generates a UMAP plot based on this new identity.
#'
#' @seealso \code{\link[clUMAP]{clUMAP}}, \code{\link[kpp]{kpp}}, \code{\link[FixPlotName]{FixPlotName}},
#' \code{\link[.replace_by_most_frequent_categories]{.replace_by_most_frequent_categories}}
#'
#' @examples
#' \dontrun{
#' updated_obj <- matchBestIdentity(my_obj, "origin_identity", "target_identity")
#' }
#' @export
matchBestIdentity <- function(
obj, ident_to_rename,
reference_ident = GetOrderedClusteringRuns(obj)[1],
prefix = reference_ident,
suffix = gsub(prefix, "", x = reference_ident),
# to_suffix = "matched",
# to_suffix = FixPlotName(gsub(pattern = "[a-zA-Z_]", replacement = "", x = ident_to_rename)),
new_ident_name = kpp(prefix, ident_to_rename, "match.to", suffix),
plot_suffix = prefix,
barplot_match = TRUE,
barplot_fractions = TRUE,
rnd_colors = T,
w = 12, h = 9,
...) {
stopifnot("colname prefix undefined" = !is.null(prefix))
dictionary <- obj@meta.data[, c(ident_to_rename, reference_ident)]
translation <- .replace_by_most_frequent_categories(
df = dictionary, show_plot = barplot_match, suffix_barplot = ident_to_rename, ...
)
obj@meta.data[, new_ident_name] <- translation[, 1]
imessage("new ident name:", new_ident_name)
px <- clUMAP(ident = new_ident_name, obj = obj, suffix = plot_suffix, w = w, h = h, ...)
print(px)
if (barplot_fractions) {
COLZ <- getDiscretePaletteObj(ident.used = reference_ident, obj = obj, palette.used = "glasbey")
scBarplot.CellFractions(fill.by = reference_ident , group.by = new_ident_name, obj = obj
# , rnd_colors = rnd_colors
, custom_col_palette = COLZ)
}
return(obj)
}
# _________________________________________________________________________________________________
#' @title Find Best Match: Replace Categories by the Most Frequent Match
#'
#' @description Used for mapping identity columns across objects. This function replaces each
#' category in a query column of a data frame with the most frequently corresponding category in a
#' reference column. It calculates the assignment quality, reports it, and optionally plots it.
#' @param df A data frame containing the data.
#' @param query_col The name of the column in 'df' whose categories are to be replaced. By default,
#' the first column of 'df' is used.
#' @param ref_col The name of the column in 'df' used as reference for replacement. By default, the
#' second column of 'df' is used.
#' @param show_plot Logical, whether to plot assignment quality. Defaults to TRUE.
#' @param suffix_barplot Suffix for barplot.
#' @param ... Additional parameters passed to the qbarplot function.
#' @return A data frame with categories in 'query_col' replaced by the most frequent match from
#' 'ref_col'.
#'
#' @importFrom dplyr group_by summarise arrange filter
#' @examples
#' \dontrun{
#' .replace_by_most_frequent_categories(df = my_data)
#' (MXX <- as.tibble(structure(
#' c(
#' "Adjut", "Adjut", "Yearn", "Adjut", "Dwarf", "Adjut",
#' "Dwarf", "Adjut", "Dwarf", "Yearn", "Dwarf", "Dwarf", "Dwarf",
#' "Yearn", "Dwarf", "Dwarf", "Dwarf", "Zebra", "Yucca", "Plyer",
#' "Blaze", "Blaze", "Dazed", "Blaze", "Swept", "Bold", "Vixen",
#' "Bold", "Swept", "Dazed", "Mirth", "Witch", "Vixen", "Dazed",
#' "Swept", "Mirth", "Swept", "Vexed", "Query", "Yolk"
#' ),
#' .Dim = c(20L, 2L), .Dimnames =
#' list(NULL, c("RNA_snn_res.0.1.ordered", "RNA_snn_res.0.3.ordered"))
#' )))
#'
#' z <- .replace_by_most_frequent_categories(df = MXX)
#' head(cbind(MXX[, 1], z[, 1]))
#' }
.replace_by_most_frequent_categories <- function(
df, query_col = colnames(df)[1],
ref_col = colnames(df)[2],
show_plot = TRUE,
suffix_barplot = NULL,
ext = "png",
min.thr = 0.5,
...) {
# Convert to data frame if it is not
if (!is.data.frame(df)) {
df <- as.data.frame(df)
}
cat_query <- unique(df[[query_col]])
imessage(length(cat_query), "categories to rename in", query_col, ":", head(cat_query), "...")
cat_ref <- unique(df[[ref_col]])
imessage(length(cat_ref), "reference categories in", ref_col, ":", head(cat_ref), "...")
# Create a table of the most frequent reference values for each query category
replacement_table <- df |>
dplyr::group_by(!!sym(query_col), !!sym(ref_col)) |>
dplyr::summarise(n = n(), .groups = "drop") |>
dplyr::arrange(!!sym(query_col), desc(n)) |>
dplyr::filter(!duplicated(!!sym(query_col)))
replacement_table[[ref_col]] <- make.unique(replacement_table[[ref_col]])
# Calculate assignment quality
total_counts <- table(df[[query_col]])
quality <- replacement_table$n / total_counts[replacement_table[[query_col]]]
names(quality) <- paste0(names(quality), "->", replacement_table[[ref_col]])
# Report assignment quality
message("Assignment quality (proportion of total matches):")
# print(setNames(quality, replacement_table[[query_col]]))
# Plot assignment quality
if (show_plot) {
px <- ggExpress::qbarplot(quality,
label = percentage_formatter(quality, digitz = 1),
ext = ext,
suffix = suffix_barplot,
plotname = "Assignment Quality",
filename = make.names(kpp("Assignment Quality", suffix_barplot, ext)),
subtitle = paste(
"From", colnames(df)[1], "->", colnames(df)[2], "| median",
percentage_formatter(median(quality)), "\n",
sum(quality > min.thr), "clusters above 50% match"
),
hline = min.thr, filtercol = -1,
xlab = paste("Best query match to reference"),
ylab = "Proportion of Total Matches",
...
)
print(px)
}
# Replace the query values with the most frequent reference values
df[[query_col]] <- replacement_table[[ref_col]][match(df[[query_col]], replacement_table[[query_col]])]
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.