## Copyright (C) 2025 Matt Dufort, Mario Rosasco, Virginia Muir, and Benaroya Research Institute
##
## This file is part of the briDiscovr package
#' Prepare a dataset for DISCOV-R analysis
#'
#' Loads in the marker and .fcs information and prepares a \code{discovrExperiment}
#' object for analysis. Default parameters values are based on the original DISCOV-R
#' analysis published in Wiedeman et al 2020.
#'
#' @param markerInfoFile A character string indicating the path to a .csv file.
#' This file is expected to have columns named useToCluster", as well as the
#' names specified in the \code{markerCommonField} and \code{markerFcsField}
#' variables. Details of the \code{markerCommonField} and \code{markerFcsField}
#' arguments are provided below; the "useToCluster" column should have only TRUE
#' or FALSE values. Markers with a TRUE value in this column will be used for
#' clustering, whereas the others will not. An optional
#' \code{normalizationMethod} can be included to specify the type of
#' normalization to be used for each marker; options are "zScore", "none", and
#' "warpSet". In addition, "warpSet" can include a peak number specification as
#' a number immediately following (e.g. "warpSet2"). Normalization defaults to
#' "zScore" for each and all markers if not specified.
#' @param fcsInfoFile A character string indicating the path to a file containing columns named "subject",
#' "cellSubset", and "filename". The "filename" field must contain paths to the .fcs files that will be used in analysis.
#' @param parentPopulation A character sting indicating the name of the parent population subset. Must match
#' one of the values in the fcsInfoFile 'cellSubset' column.
#' @param markerCommonField (default: "fixed") A character string indicating the
#' name of a column containing common marker names for human use, like "CD45"
#' @param markerFcsField (default: "desc") A character string indicating the
#' name ofa column containing the marker names in the .fcs files,like "89Y_CD45"
#' @param arcsinhA (default: 0) A numeric indicating the value for 'a' in the
#' arcsinh data transformation equation. Should usually be 0.
#' @param arcsinhB (default: 0.2) A numeric indicating the value for 'b' in the arcsinh
#' data transformation equation. Should be 1/5 for cytof data, and 1/150 for flow data.
#' @param arcsinhC (default: 0) A numeric indicating the value for 'c' in the
#' arcsinh data transformation equation. Should usually be 0.
#' @param verbose (default: TRUE) A logical specifying whether to display processing messages
#' @param checkMemory (default: TRUE) A logical indicating whether to check how much system memory
#' is available before loading the dataset. If TRUE, this function will display a message and prevent
#' data loading when the files take up more than 80 percent of the available system memory.
#' @param downsampleVectorList (default: NULL) A list of vectors of row numbers
#' to use from files in \code{fcsInfoFile}. Generally generated by
#' \code{downsampleFcsList}. If an object is specified here, it is used to
#' downsample the files as they are read in. NOTE: the list must be in the same
#' order as the files in \code{fcsInfoFile}, so it is best to generate
#' this object directly from that file to ensure that they match. Possible
#' future update to allow named list with names matching a combination of
#' identifying fields for the FCS files.
#' @return An S3 object of class \code{discovrExperiment}
#'
#' @seealso \code{\link{discovrExperiment}}
#' @author Mario G Rosasco, Virginia Muir, Matt Dufort
#' @import flowCore
#' @import dplyr
#' @importFrom methods as
#' @importFrom stats setNames na.omit
#' @importFrom utils read.csv
#' @importFrom memuse Sys.meminfo mu.size
#' @export
setupDiscovrExperiment <- function(
markerInfoFile,
fcsInfoFile,
parentPopulation,
markerCommonField = "fixed",
markerFcsField = "desc",
arcsinhA = 0,
arcsinhB = 0.2,
arcsinhC = 0,
verbose = TRUE,
checkMemory = TRUE,
downsampleVectorList = NULL
){
# get marker info, and check for appropriate columns
markerInfo <- read.csv(markerInfoFile, na.strings = c("NA", ""), stringsAsFactors = FALSE)
if(!all(c(markerCommonField, markerFcsField, "useToCluster") %in% names(markerInfo))){
stop(
"The file set as 'markerInfoFile' must contain columns with names 'useToCluster' and the names ",
"specified by markerCommonField and markerFcsField Please check your markerInfo file and try again."
)
}
if(!all(is.logical(markerInfo$useToCluster))){
stop("The 'useToCluster' column in the 'markerInfo' file must contain only TRUE or FALSE values.")
}
# set colnames of marker info data for consistency
markerInfo <- dplyr::rename(markerInfo, commonMarkerName = !!markerCommonField, fcsMarkerName = !!markerFcsField)
# make sure common names are valid
markerInfo$commonMarkerName <- make.names(markerInfo$commonMarkerName)
# check for appropriate marker names
if(nrow(markerInfo) > length(unique(markerInfo$commonMarkerName))){
stop("Marker names in the ", markerCommonField, " column of the markerInfo file do not appear to be unique. Please correct this and try again.")
}
if(any(markerInfo$commonMarkerName == "")){
stop("Markers without assigned names were detected in the markerInfo file. Please correct the file and try again.")
}
# check normalizationMethod, and set to zScore if not present
markerInfo <- checkMarkerInfoNormalizationMethod(markerInfo)
# if input includes non-null downsampleVectorList, check that it is valid
if(!is.null(downsampleVectorList)) {
if(!is.list(downsampleVectorList) |
!all(sapply(downsampleVectorList, is.numeric) | (downsampleVectorList == "all")))
stop("The object included as 'downsampleVectorList' must be a list containing ",
"vectors of row names (generally the output of 'downsampleFcsList'). ",
"Please check that this object is a list, and that every object within ",
"it is a numeric vector.")
}
# extract clustering markers for convenience
clusteringMarkers <- markerInfo[markerInfo$useToCluster, "commonMarkerName", drop = TRUE]
if(verbose){message(paste("Found clustering markers:", paste(clusteringMarkers, collapse=", ")))}
# get list of FCS files with subject and cell subset information, run checks on this input and the FCS files
fcsInfo <- read.csv(fcsInfoFile, stringsAsFactors = FALSE)
gc() # run garbage collection before memory check, to clean up unused memory
checkFcsFiles(fcsInfo, checkMemory = checkMemory, verbose = verbose)
# if input includes non-null downsampleVectorList, check that it has one entry for each FCS files
if(!is.null(downsampleVectorList)) {
if(length(downsampleVectorList) != nrow(fcsInfo))
stop("The object included as 'downsampleVectorList' must have one element ",
"for each FCS file listed in the FCS info file. The object passed as ",
"'downsampleVectorList' contains ", length(downsampleVectorList),
" elements, while the FCS info file lists ", nrow(fcsInfo), " files.")
}
# check to ensure marker names and cell subsets are non-overlapping. Creates issues downstream if this happens.
overlapNames <- fcsInfo$cellSubset[fcsInfo$cellSubset %in% markerInfo$commonMarkerName]
if (length(overlapNames) > 0){
stop(
"Cell subset names and marker names must not overlap. The following names were detected in both sets of names: ",
paste0(overlapNames, collapse = ", ")
)
}
# check that the parent population is one of the subsets in the .fcs info
if(!parentPopulation %in% unique(fcsInfo$cellSubset)){
stop(
"The requested parent population '",
parentPopulation,
"' was not found in the 'cellSubset' column of the .fcs info file."
)
}
if(verbose){message("All .fcs files passed basic QC checks. Proceeding...")}
###########################################################
# Section 2.c.i-iii from original SOP load and format input
###########################################################
## Read in FCS files and associate with cell subsets and subjects
# this code uses a single object that is modified rather than copied to avoid duplicating large chunks of data
# if this is not the most efficient way to handle memory here, then we should change it
# intent is to clean up garbage as we go and limit the max memory requirement
fcsListBySubjectCellSubset <- list()
for (currSubject in unique(fcsInfo$subject)){
if(verbose){message(paste0("Assigning per-subset data for sample: ", currSubject, "..."))}
fcsListBySubjectCellSubset[[currSubject]] <-
buildFcsList(
fcsInfo %>% dplyr::filter(subject == currSubject), indexField = "cellSubset", truncate_max_range = FALSE)
# downsample each flowFrame within the FCSList for each subject
if(!is.null(downsampleVectorList)) {
currDownsampleVectorList <-
downsampleVectorList[which(fcsInfo$subject == currSubject)]
for(currFcsNum in seq_len(length(fcsListBySubjectCellSubset[[currSubject]])))
if(!identical(currDownsampleVectorList[[currFcsNum]], "all")) # skip downsampling if "all" rows specified
fcsListBySubjectCellSubset[[currSubject]][[currFcsNum]] <-
fcsListBySubjectCellSubset[[currSubject]][[currFcsNum]][currDownsampleVectorList[[currFcsNum]],]
}
}
if(verbose){message("Cleaning and selecting markers from .fcs files according to markerInfo file...")}
for (currSubject in names(fcsListBySubjectCellSubset)){
fcsListBySubjectCellSubset[[currSubject]] <-
lapply(fcsListBySubjectCellSubset[[currSubject]], processFcsList, markerInfo = markerInfo)
}
if(verbose){message("Merging data (parent and child populations)...")}
cellSubsetNames <- c(parentPopulation, setdiff(unique(fcsInfo$cellSubset), parentPopulation))
# store the cell subset row counts for each subject's data (for use later in building mergedExpr)
cellSubsetCountsBySubject <-
lapply(fcsListBySubjectCellSubset,
\(x) sapply(x[na.omit(match(cellSubsetNames, names(x)))], nrow, USE.NAMES = TRUE))
# create object to store data merged by subject
fcsListBySubjectMerged <- list()
for(currSubject in names(fcsListBySubjectCellSubset)){
# start with parentPopulation
fcsListBySubjectMerged[[currSubject]] <- fcsListBySubjectCellSubset[[currSubject]][[parentPopulation]]
# merge values into parent population, in same order
exprs(fcsListBySubjectMerged[[currSubject]]) <-
rbind(
exprs(fcsListBySubjectMerged[[currSubject]]),
do.call(
rbind,
lapply(
fcsListBySubjectCellSubset[[currSubject]][
na.omit(
match(setdiff(cellSubsetNames, parentPopulation),
names(fcsListBySubjectCellSubset[[currSubject]]))
)],
exprs)))
}
# remove previous large data object and do a garbage cleanup
rm(fcsListBySubjectCellSubset); gc()
# Transform the data
asinhTfmData <- function(fcs){
# Arcsinh transform remaining columns
tl <- transformList(
flowCore::colnames(fcs),
arcsinhTransform(a = arcsinhA, b = arcsinhB, c = arcsinhC),
transformationId = "asinh"
)
fcs <- flowCore::transform(fcs, tl)
}
# transform and convert object to a flowSet for easier use downstream
if(verbose){
message("Transforming data using arcsinh(a=", arcsinhA, ", b=", arcsinhB, ", c=", arcsinhC, ")...")
}
fcsListBySubjectMerged <- as(lapply(fcsListBySubjectMerged, asinhTfmData), "flowSet")
# Extract expression data and label Tmr+ events
mergedExprStructureOnly <-
setNames(
data.frame(matrix(data = NA_real_, ncol = ncol(exprs(fcsListBySubjectMerged[[1]])), nrow = 0)),
flowCore::colnames(fcsListBySubjectMerged))
mergedExprStructureOnly$samp <- character()
mergedExprStructureOnly$cellSubset <- list()
# mergedExprStructureOnly$cellSubset <- character() # potential future change: make cellSubset a character vector
mergedExpr <- list()
for(currSubjectNum in seq_len(length(fcsListBySubjectMerged))){
mergedExpr[[currSubjectNum]] <- as.data.frame(flowCore::exprs(fcsListBySubjectMerged[[currSubjectNum]]))
mergedExpr[[currSubjectNum]]$samp <- as.character(names(cellSubsetCountsBySubject)[currSubjectNum])
# potential future change: make cellSubset a character vector; remove the as.list() from following lines
mergedExpr[[currSubjectNum]]$cellSubset <-
as.list(
rep(names(cellSubsetCountsBySubject[[currSubjectNum]]), times = cellSubsetCountsBySubject[[currSubjectNum]]))
}
mergedExpr <- bind_rows(mergedExprStructureOnly, mergedExpr)
rm(fcsListBySubjectMerged); gc()
# building a discovr experiment S3 object
experiment <- structure(list(), class = "discovrExperiment")
experiment$markerInfoFile <- markerInfoFile
experiment$markerInfo <- markerInfo
experiment$fcsInfoFile <- fcsInfoFile
experiment$fcsInfo <- fcsInfo
experiment$parentPopulation <- parentPopulation
experiment$mergedExpr <- mergedExpr
experiment$clusteringMarkers <- clusteringMarkers
experiment$status <- "initialized"
return(experiment)
}
#' Perform phenograph clustering for a DISCOV-R experiment
#'
#' @param experiment A discovrExperiment created using \code{setupDiscovrExperiment()}
#' @param method A character string indicating the clustering method to use.
#' Currently only 'phenograph' is supported as a clustering method.
#' @param k An integer specifying the maximum number of nearest neighbors to use. Applies only if method = 'phenograph' (default: 30)
#' @param seed A number to be used as the seed for pseudorandom number generation (default: 12345)
#' @param verbose A boolean specifying whether to display processing messages (default: TRUE)
#' @return An S3 object of class \code{discovrExperiment}
#'
#' @seealso \code{\link{setupDiscovrExperiment}} \code{\link{discovrExperiment}}
#' @author Mario G Rosasco, Virginia Muir
#' @import dplyr
#' @importFrom flowCore exprs
#' @importFrom igraph graph_from_data_frame cluster_louvain membership modularity
#' @export
clusterDiscovrExperiment <- function(
experiment,
method = "phenograph",
k = 30,
seed = 12345,
verbose = TRUE
){
if(!is.discovrExperiment(experiment)){
stop(
"The object passed to this function is not a valid DISCOV-R experiment object.",
"Please create your experiment using the 'setupDiscovrExperiment' function and try again."
)
}
# check for subjects with too few events to cluster
if(method == "phenograph"){
if(verbose){message("Checking for subjects with too few events to cluster...")}
subjectCounts <- getSubjectCounts(experiment)
if(any(subjectCounts$nEvents < k + 2)){
stop(
"PhenoGraph clustering can only proceed with subjects that have at ",
"least k + 2 (", k + 2, ") total events. Please check or remove the ",
"following subjects, or modify the clustering parameters, ",
"and try again:\n",
paste(subjectCounts$sample[subjectCounts$nEvents < k + 2], collapse = ", ")
)
}
}
set.seed(seed)
#########################################################################
# Section 2.d.i from original SOP - Phenograph clustering - takes a while
#########################################################################
# Clustering markers object is defined in the first set-up chunk. Tweak which markers are included up there.
# Set up Phenograph function to use kd tree
findNeighbors <- function(data, k){
nearest <- RANN::nn2(data, data, k, treetype = "kd", searchtype = "standard")
return(nearest[[1]])
}
runRpheno <- function(data, k = 30){
nDigits <- 3 # set number of digits for time display
if(is.data.frame(data))
data <- as.matrix(data)
if(!is.matrix(data))
stop("Wrong input data, should be a data frame or matrix!")
if(k<1){
stop("k must be a positive integer!")
}else if (nrow(data) < k + 2){
stop("k must be smaller than the total number of points!")
}
if(verbose){
message("Run Rphenograph starts:","\n",
" -Input data of ", nrow(data)," rows and ", ncol(data), " columns","\n",
" -k is set to ", k)
message(" Finding nearest neighbors...")
}
t1 <- system.time(neighborMatrix <- findNeighbors(data, k=k+1)[,-1])
if(verbose){
message(
"DONE ~", round(t1[3], nDigits), "s\n",
"Compute jaccard coefficient between nearest-neighbor sets..."
)
}
t2 <- system.time(links <- jaccard_coeff(neighborMatrix))
if(verbose){
message(
"DONE ~", round(t2[3], nDigits), "s\n",
"Build undirected graph from the weighted links..."
)
}
links <- links[links[,1]>0, ]
relations <- as.data.frame(links)
colnames(relations)<- c("from","to","weight")
t3 <- system.time(g <- igraph::graph_from_data_frame(relations, directed=FALSE))
if(verbose){
message(
"DONE ~", round(t3[3], nDigits), "s\n",
"Run louvain clustering on the graph ..."
)
}
t4 <- system.time(community <- igraph::cluster_louvain(g))
if(verbose){
message("DONE ~", round(t4[3], nDigits), "s\n")
}
if(verbose){
message("Run Rphenograph DONE, took a total of ", round(sum(c(t1[3],t2[3],t3[3],t4[3])), nDigits), "s.")
message(" Return a community class\n -Modularity value: ", igraph::modularity(community),"\n")
message(" -Number of clusters: ", length(unique(igraph::membership(community))))
}
return(community)
}
# Cluster data in experiment
if (method == "phenograph") {
# initialize RPclust
experiment$mergedExpr$RPclust <- NA
# assign row IDs for merge
experiment$mergedExpr$rowIdx <- seq_len(nrow(experiment$mergedExpr))
# cluster events within each subject
allSubjects <- unique(experiment$mergedExpr$samp)
for(currSubj in allSubjects){
if(verbose){
message("\nClustering sample ", match(currSubj, allSubjects), " of ", length(allSubjects), "...")
}
rPhenoVect <- as.numeric(igraph::membership(runRpheno(
data = experiment$mergedExpr[
experiment$mergedExpr$samp == currSubj,
experiment$clusteringMarkers
],
k = k
)))
# add cluster IDs
experiment$mergedExpr[experiment$mergedExpr$samp == currSubj, "RPclust"] <- rPhenoVect
}
# remove row IDs so they don't interfere downstream
experiment$mergedExpr <- subset(experiment$mergedExpr, select = -get("rowIdx"))
# check to make sure that all events were assigned
if(any(is.na(experiment$mergedExpr$RPclust))){
stop("Not all events were assigned to a cluster. Please contact the package developer for assistance.")
}
} else {
stop("Clustering method '", method, "' is not currently supported. Stopping...")
}
###################################################################
# Sections 2.e.i from original SOP - Summarize and save outputs
###################################################################
# Calculate mean expression value of each marker for each phenograph cluster in each subject
clusterMeans <- experiment$mergedExpr %>%
dplyr::select(-cellSubset) %>%
distinct() %>% # remove duplicated rows; mergedExpr has both parent and gated
dplyr::group_by(samp, RPclust) %>%
dplyr::summarise_all(mean) %>%
dplyr::mutate(RPclust = as.character(RPclust))
##################################
# Calculate total mean expression for each subject
parentMeans <- experiment$mergedExpr %>%
dplyr::select(-cellSubset, -RPclust) %>%
distinct() %>% # remove duplicated rows; mergedExpr has both parent and gated
dplyr::group_by(samp) %>%
dplyr::summarise_all(mean) %>%
dplyr::mutate(RPclust = "Total_Parent")
# Count cells of each subpopulation (eg Tmr) in each phenograph cluster (from each sample)
clusterRarePopCts <-
experiment$mergedExpr %>%
dplyr::select(samp, cellSubset, RPclust) %>%
dplyr::group_by(samp, RPclust) %>%
dplyr::summarise(Total = n(), .groups = "drop_last") %>%
as.data.frame()
uniqueSubsets <- unique(experiment$mergedExpr$cellSubset)
for(currCellSubset in uniqueSubsets){
if(verbose){message("Counting cluster events for ", currCellSubset)}
additionalMatrix <- experiment$mergedExpr %>%
dplyr::select(samp, cellSubset, RPclust) %>%
group_by(samp, RPclust) %>%
summarise(number = sum(cellSubset == currCellSubset)) %>%
as.data.frame()
clusterRarePopCts <- cbind(clusterRarePopCts, additionalMatrix$number)
}
for(i in seq_len(ncol(clusterRarePopCts)-3)){
colnames(clusterRarePopCts)[i+3] <- (uniqueSubsets)[i]
}
aggregateCounts <- clusterRarePopCts %>%
dplyr::select(-RPclust) %>%
group_by(samp) %>%
summarise_all(sum) %>%
rename_at(vars(-samp), function(name) paste0(name,"_tot"))
clusterRarePopCts <- left_join(clusterRarePopCts, aggregateCounts)
for(i in seq_len(length(uniqueSubsets)+1)){
clusterRarePopCts <- cbind(
clusterRarePopCts,
clusterRarePopCts[,i+2]/clusterRarePopCts[,i+2+length(uniqueSubsets)+1]*100
)
}
for(i in seq_len(length(uniqueSubsets))){
colnames(clusterRarePopCts)[i+3+(length(uniqueSubsets)+1)*2] <- (paste0("pct_", (uniqueSubsets)[i], "_in_clust"))
}
colnames(clusterRarePopCts)[3+(length(uniqueSubsets)+1)*2] <- "pct_Total_in_clust"
# update experiment data
experiment$status <- "clustered"
experiment$clusterMeans <- bind_rows(clusterMeans, parentMeans)
experiment$clusterMethod <- method
experiment$clusterRarePopCts <- clusterRarePopCts
return(experiment)
}
#' Normalize marker expression within a DISCOV-R experiment
#'
#' @param experiment A discovrExperiment created using
#' \code{setupDiscovrExperiment()} and clustered using
#' \code{clusterDiscovrExperiment()}
#' @param normalizationInfo (default: NULL) Optional object containing
#' information on how to normalize. If left NULL, the \code{experiment} object
#' will be checked for normalization info in
#' \code{experiment$markerInfo$normalizationMethod}. If that is not present,
#' or for any markers without a method specified, normalization will use
#' the default method, currently "zScore". Acceptable non-NULL values include a
#' data frame with columns columns "commonMarkerName" and "normalizationMethod",
#' a named vector with elements specifying normalization method and names
#' referring to each marker in \code{experiment$mergedExpr}, or a single
#' character value specifying the normalization method to be applied to all
#' markers. Options for normalization are "zScore", "none", and "warpSet". In
#' addition, "warpSet" can include a maximum peak number specification as a
#' number immediately following (e.g. "warpSet2") - see documentation of
#' \code{flowStats::warpSet} for details. Any markers lacking a specification
#' will be normalized using the default method (currently "zScore").
#' @param defaultNormalizationMethod (default: "zScore") A character string, the
#' normalization method to use for markers without another method specified.
#' Options are the same as in \code{normalizationInfo} above.
#' @param seed (default: 12345) Numeric, the random number seed for warpSet
#' normalization, passed to \code{normalizeWarpSetMergedExpr}
#' @param verbose (default: TRUE) A logical specifying whether to display processing messages
#' @return An S3 object of class \code{discovrExperiment}, with additional
#' elements named "mergedExprNormalizedScaled" and
#' "clusterMeansNormalizedScaled". If the normalization methods were not taken
#' from \code{experiment$markerInfo$normalizationMethod}, the normalization
#' method for each marker will be stored there.
#'
#' @seealso \code{\link{discovrExperiment}}
#' @author Matt Dufort
#' @import flowCore
#' @import dplyr
#' @importFrom stats setNames na.omit
#' @export
normalizeDiscovrExperiment <- function(
experiment,
normalizationInfo = NULL,
defaultNormalizationMethod = "zScore",
seed = 12345,
verbose = TRUE
){
if(!is.discovrExperiment(experiment)){
stop(
"The object passed to 'normalizeDiscovrExperiment' is not a valid DISCOV-R experiment object. ",
"Please create your experiment using the 'setupDiscovrExperiment' function ",
"and perform the initial clustering using the 'clusterDiscovrExperiment' function. "
)
}
if(experiment$status %in% c("normalized", "metaclustered")){
message(
"The experiment has a status of ", experiment$status, ", which indicates that normalization has already been run.\n",
"Any existing normalized expression values stored in this experiment object will be overwritten.")
} else if(experiment$status != "clustered"){
stop(
"The experiment must have status 'clustered', 'normalized', or 'metaclustered' in order to be ready for normalization. ",
"The current experiment has a status of ", experiment$status, ". ",
"Please make sure to create your experiment using the 'setupDiscovrExperiment' function ",
"and perform the initial clustering using the 'clusterDiscovrExperiment' function. "
)
}
# check experiment$mergedExpr for validity (shouldn't be necessary, but including to head off some edge cases)
if(!is.data.frame(experiment$mergedExpr))
stop("'mergedExpr' element within the experiment object is not a data.frame; please check inputs to 'normalizeScaleMergedExpr'")
if(!("samp" %in% colnames(experiment$mergedExpr))) {
stop(
paste("Normalization of marker expression cannot be performed if 'mergedExpr' element",
"within the experiment object does not contain column 'samp' specifying sample identity for each event"))
}
# potential change: add a check on marker names in mergedExpr and normalizationInfo
# check defaultNormalizationMethod against accepted values
if(
!((defaultNormalizationMethod %in% c("zScore", "none", "warpSet")) |
str_detect(defaultNormalizationMethod, "^warpSet[0-9]+$")))
stop("Value for 'defaultNormalizationMethod' must be 'zScore', 'none', or 'warpset[#]'.")
# check normalizationInfo, and set to defaultNormalizationMethod for any markers missing info
# output of this chunk should be a data frame with "commonMarkerName" and "normalizationMethod" columns
# "commonMarkerName" values should match all marker columns in mergedExpr (which should be all those before "samp")
# "normalizationMethod" values should include only the options I've specified
if (is.null(normalizationInfo)){
# use experiment$markerInfo if it includes normalizationMethod
if (!is.null(experiment$markerInfo$normalizationMethod)) {
if(verbose){
message(
paste0(
"Found column 'normalizationMethod' in markerInfo;\n",
"using specified normalization method for each marker, ",
"and ", defaultNormalizationMethod, " for any not specified.\n"))}
normalizationInfo <- experiment$markerInfo
} else {
normalizationInfo <- defaultNormalizationMethod
}
}
if (is.vector(normalizationInfo)) {
if ((length(normalizationInfo) == 1)) {
normalizationInfo <-
setNames(rep(normalizationInfo, times = length(experiment$markerInfo$commonMarkerName)),
experiment$markerInfo$commonMarkerName)
}
# add zScore for any markers missing from normalizationInfo vector
normalizationInfo <-
c(normalizationInfo,
setNames(
rep(defaultNormalizationMethod,
times = length(setdiff(experiment$markerInfo$commonMarkerName, names(normalizationInfo)))),
setdiff(experiment$markerInfo$commonMarkerName, names(normalizationInfo)))
)
normalizationInfo <- data.frame(
commonMarkerName = names(normalizationInfo),
normalizationMethod = unname(normalizationInfo))
}
if (!is.data.frame(normalizationInfo))
stop("Format of 'normalizationInfo' not recognized. It should be a data.frame or character vector.")
# fill gaps in normalizationInfo with defaultNormalizationMethod
markersMissing <- setdiff(experiment$markerInfo$commonMarkerName, normalizationInfo$commonMarkerName)
if (length(markersMissing > 0))
normalizationInfo <-
bind_rows(normalizationInfo,
data.frame(commonMarkerName = markersMissing,
normalizationMethod = defaultNormalizationMethod))
if (is.null(normalizationInfo$normalizationMethod)){
normalizationInfo$normalizationMethod <- defaultNormalizationMethod
}
normalizationInfo$normalizationMethod[is.na(normalizationInfo$normalizationMethod)] <- defaultNormalizationMethod
# check values in normalizationInfo now that it's assembled
normalizationInfo <- checkMarkerInfoNormalizationMethod(normalizationInfo)
markersToNormalize <- intersect(normalizationInfo$commonMarkerName, colnames(experiment$mergedExpr))
# check mergedExpr and normalizationInfo
if (any(!(setdiff(colnames(experiment$mergedExpr), c("samp", "cellSubset", "RPclust")) %in% normalizationInfo$commonMarkerName)))
stop("All markers names in 'mergedExpr' must have normalization specified in 'normalizationInfo'")
if (!all(sapply(experiment$mergedExpr[, markersToNormalize], is.numeric)))
stop("Only numeric values are allow in 'mergedExpr' marker expression levels.")
mergedExprNormalizedScaled <- experiment$mergedExpr
# apply z-score normalization to each sample, for all markers with z-score normalization specified
for (samp.tmp in unique(experiment$mergedExpr$samp)) {
mergedExprNormalizedScaled[
mergedExprNormalizedScaled$samp == samp.tmp,
match(
normalizationInfo$commonMarkerName[
(normalizationInfo$commonMarkerName %in% markersToNormalize) &
(normalizationInfo$normalizationMethod %in% "zScore")],
colnames(mergedExprNormalizedScaled))] <-
scale(
mergedExprNormalizedScaled[
mergedExprNormalizedScaled$samp == samp.tmp,
match(
normalizationInfo$commonMarkerName[
(normalizationInfo$commonMarkerName %in% markersToNormalize) &
(normalizationInfo$normalizationMethod %in% "zScore")],
colnames(mergedExprNormalizedScaled))])
}
# apply warpSet normalization to each marker, iterating over variations with different numbers of peaks specified
if(any(str_detect(normalizationInfo$normalizationMethod[normalizationInfo$commonMarkerName %in% markersToNormalize], "warpSet"))){
if(verbose)
message("Starting warpSet normalization on select markers")
for(warpSetMethod in
unique(grep("^warpSet[0-9]*$",
normalizationInfo$normalizationMethod[normalizationInfo$commonMarkerName %in% markersToNormalize],
value = TRUE))){
maxPeakNr <-
if(str_detect(warpSetMethod, "(?<=warpSet)[0-9]+")){
as.numeric(str_extract(warpSetMethod, "(?<=warpSet)[0-9]+"))
} else NULL
mergedExprNormalizedScaled[
,
c(
normalizationInfo$commonMarkerName[
(normalizationInfo$commonMarkerName %in% markersToNormalize) &
(normalizationInfo$normalizationMethod == warpSetMethod)], "samp")] <-
normalizeWarpSetMergedExpr(
mergedExprNormalizedScaled[
,
c(
normalizationInfo$commonMarkerName[
(normalizationInfo$commonMarkerName %in% markersToNormalize) &
(normalizationInfo$normalizationMethod == warpSetMethod)], "samp")],
peakNr = maxPeakNr, groupCol = "samp", seed = seed)
}
}
# for normalizationMethod "none": no change applied
# scale each marker to have comparable weight in metaclustering
mergedExprNormalizedScaled[, markersToNormalize] <-
scale(mergedExprNormalizedScaled[, markersToNormalize])
# skip calculation of marker means by cluster if experiment not clustered yet (only really used when running UMAP prior to clustering)
if("RPclust" %in% colnames(mergedExprNormalizedScaled)){
# Calculate mean expression value (scaled and normalized) of each marker for each phenograph cluster in each subject
clusterMeansNormalizedScaled <- mergedExprNormalizedScaled %>%
dplyr::select(-cellSubset) %>%
distinct() %>% # remove duplicated rows; mergedExprNormalizedScaled has both parent and gated
dplyr::group_by(samp, RPclust) %>%
dplyr::summarise_all(mean) %>%
dplyr::mutate(RPclust = as.character(RPclust))
# Calculate total mean expression (scaled and normalized) for each subject
parentMeansNormalizedScaled <- mergedExprNormalizedScaled %>%
dplyr::select(-cellSubset, -RPclust) %>%
distinct() %>% # remove duplicated rows; mergedExprNormalizedScaled has both parent and gated
dplyr::group_by(samp) %>%
dplyr::summarise_all(mean) %>%
dplyr::mutate(RPclust = "Total_Parent")
}
# update experiment data
experiment$status <- "normalized"
experiment$markersNormalized <- markersToNormalize
experiment$mergedExprNormalizedScaled <- mergedExprNormalizedScaled
if("RPclust" %in% colnames(mergedExprNormalizedScaled)){
experiment$clusterMeansNormalizedScaled <- bind_rows(clusterMeansNormalizedScaled, parentMeansNormalizedScaled)
}
experiment$markerInfo$normalizationMethod[experiment$markerInfo$commonMarkerName %in% markersToNormalize] <-
normalizationInfo$normalizationMethod[
match(experiment$markerInfo$commonMarkerName[experiment$markerInfo$commonMarkerName %in% markersToNormalize],
normalizationInfo$commonMarkerName)]
experiment$markerInfo$normalizationMethod[!(experiment$markerInfo$commonMarkerName %in% markersToNormalize)] <-
"not normalized"
return(experiment)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.