R/clusteringPhase.R

Defines functions normalizeDiscovrExperiment clusterDiscovrExperiment setupDiscovrExperiment

Documented in clusterDiscovrExperiment normalizeDiscovrExperiment setupDiscovrExperiment

## Copyright (C) 2025  Matt Dufort, Mario Rosasco, Virginia Muir, and Benaroya Research Institute
##
## This file is part of the briDiscovr package

#' Prepare a dataset for DISCOV-R analysis
#'
#' Loads in the marker and .fcs information and prepares a \code{discovrExperiment}
#' object for analysis. Default parameters values are based on the original DISCOV-R
#' analysis published in Wiedeman et al 2020.
#'
#' @param markerInfoFile A character string indicating the path to a .csv file.
#' This file is expected to have columns named useToCluster", as well as the
#' names specified in the \code{markerCommonField} and \code{markerFcsField}
#' variables. Details of the \code{markerCommonField} and \code{markerFcsField}
#' arguments are provided below; the "useToCluster" column should have only TRUE
#' or FALSE values. Markers with a TRUE value in this column will be used for
#' clustering, whereas the others will not. An optional
#' \code{normalizationMethod} can be included to specify the type of
#' normalization to be used for each marker; options are "zScore", "none", and
#' "warpSet". In addition, "warpSet" can include a peak number specification as
#' a number immediately following (e.g. "warpSet2"). Normalization defaults to
#' "zScore" for each and all markers if not specified.
#' @param fcsInfoFile A character string indicating the path to a file containing columns named "subject",
#' "cellSubset", and "filename". The "filename" field must contain paths to the .fcs files that will be used in analysis.
#' @param parentPopulation A character sting indicating the name of the parent population subset. Must match
#' one of the values in the fcsInfoFile 'cellSubset' column.
#' @param markerCommonField (default: "fixed") A character string indicating the
#' name of a column containing common marker names for human use, like "CD45"
#' @param markerFcsField (default: "desc") A character string indicating the
#' name ofa column containing the marker names in the .fcs files,like "89Y_CD45"
#' @param arcsinhA (default: 0) A numeric indicating the value for 'a' in the
#' arcsinh data transformation equation. Should usually be 0.
#' @param arcsinhB (default: 0.2) A numeric indicating the value for 'b' in the arcsinh
#' data transformation equation. Should be 1/5 for cytof data, and 1/150 for flow data.
#' @param arcsinhC (default: 0) A numeric indicating the value for 'c' in the
#' arcsinh data transformation equation. Should usually be 0.
#' @param verbose (default: TRUE) A logical specifying whether to display processing messages
#' @param checkMemory (default: TRUE) A logical indicating whether to check how much system memory
#' is available before loading the dataset. If TRUE, this function will display a message and prevent
#' data loading when the files take up more than 80 percent of the available system memory.
#' @param downsampleVectorList (default: NULL) A list of vectors of row numbers
#'  to use from files in \code{fcsInfoFile}. Generally generated by 
#' \code{downsampleFcsList}. If an object is specified here, it is used to 
#' downsample the files as they are read in. NOTE: the list must be in the same
#' order as the files in \code{fcsInfoFile}, so it is best to generate
#' this object directly from that file to ensure that they match. Possible
#' future update to allow named list with names matching a combination of
#' identifying fields for the FCS files.
#' @return An S3 object of class \code{discovrExperiment}
#'
#' @seealso \code{\link{discovrExperiment}}
#' @author Mario G Rosasco, Virginia Muir, Matt Dufort
#' @import flowCore
#' @import dplyr
#' @importFrom methods as
#' @importFrom stats setNames na.omit
#' @importFrom utils read.csv
#' @importFrom memuse Sys.meminfo mu.size
#' @export
setupDiscovrExperiment <- function(
  markerInfoFile,
  fcsInfoFile,
  parentPopulation,
  markerCommonField = "fixed",
  markerFcsField = "desc",
  arcsinhA = 0,
  arcsinhB = 0.2,
  arcsinhC = 0,
  verbose = TRUE,
  checkMemory = TRUE,
  downsampleVectorList = NULL
){
  # get marker info, and check for appropriate columns
  markerInfo <- read.csv(markerInfoFile, na.strings = c("NA", ""), stringsAsFactors = FALSE)
  if(!all(c(markerCommonField, markerFcsField, "useToCluster") %in% names(markerInfo))){
    stop(
      "The file set as 'markerInfoFile' must contain columns with names 'useToCluster' and the names ",
      "specified by markerCommonField and markerFcsField Please check your markerInfo file and try again."
    )
  }
  if(!all(is.logical(markerInfo$useToCluster))){
    stop("The 'useToCluster' column in the 'markerInfo' file must contain only TRUE or FALSE values.")
  }
  # set colnames of marker info data for consistency
  markerInfo <- dplyr::rename(markerInfo, commonMarkerName = !!markerCommonField, fcsMarkerName = !!markerFcsField)
  # make sure common names are valid
  markerInfo$commonMarkerName <- make.names(markerInfo$commonMarkerName)

  # check for appropriate marker names
  if(nrow(markerInfo) > length(unique(markerInfo$commonMarkerName))){
    stop("Marker names in the ", markerCommonField, " column of the markerInfo file do not appear to be unique. Please correct this and try again.")
  }
  if(any(markerInfo$commonMarkerName == "")){
    stop("Markers without assigned names were detected in the markerInfo file. Please correct the file and try again.")
  }

  # check normalizationMethod, and set to zScore if not present
  markerInfo <- checkMarkerInfoNormalizationMethod(markerInfo)

  # if input includes non-null downsampleVectorList, check that it is valid
  if(!is.null(downsampleVectorList)) {
    if(!is.list(downsampleVectorList) | 
       !all(sapply(downsampleVectorList, is.numeric) | (downsampleVectorList == "all")))
      stop("The object included as 'downsampleVectorList' must be a list containing ",
           "vectors of row names (generally the output of 'downsampleFcsList'). ",
           "Please check that this object is a list, and that every object within ",
           "it is a numeric vector.")
  }
  
  # extract clustering markers for convenience
  clusteringMarkers <- markerInfo[markerInfo$useToCluster, "commonMarkerName", drop = TRUE]
  if(verbose){message(paste("Found clustering markers:", paste(clusteringMarkers, collapse=", ")))}

  # get list of FCS files with subject and cell subset information, run checks on this input and the FCS files
  fcsInfo <- read.csv(fcsInfoFile, stringsAsFactors = FALSE)
  gc() # run garbage collection before memory check, to clean up unused memory
  checkFcsFiles(fcsInfo, checkMemory = checkMemory, verbose = verbose)
  
  # if input includes non-null downsampleVectorList, check that it has one entry for each FCS files
  if(!is.null(downsampleVectorList)) {
    if(length(downsampleVectorList) != nrow(fcsInfo))
      stop("The object included as 'downsampleVectorList' must have one element ",
           "for each FCS file listed in the FCS info file. The object passed as ",
           "'downsampleVectorList' contains ", length(downsampleVectorList),
           " elements, while the FCS info file lists ", nrow(fcsInfo), " files.")
  }
  
  # check to ensure marker names and cell subsets are non-overlapping. Creates issues downstream if this happens.
  overlapNames <- fcsInfo$cellSubset[fcsInfo$cellSubset %in% markerInfo$commonMarkerName]
  if (length(overlapNames) > 0){
    stop(
      "Cell subset names and marker names must not overlap. The following names were detected in both sets of names: ",
      paste0(overlapNames, collapse = ", ")
    )
  }

  # check that the parent population is one of the subsets in the .fcs info
  if(!parentPopulation %in% unique(fcsInfo$cellSubset)){
    stop(
      "The requested parent population '",
      parentPopulation,
      "' was not found in the 'cellSubset' column of the .fcs info file."
    )
  }

  if(verbose){message("All .fcs files passed basic QC checks. Proceeding...")}

  ###########################################################
  # Section 2.c.i-iii from original SOP load and format input
  ###########################################################

  ## Read in FCS files and associate with cell subsets and subjects
  # this code uses a single object that is modified rather than copied to avoid duplicating large chunks of data
  # if this is not the most efficient way to handle memory here, then we should change it
  # intent is to clean up garbage as we go and limit the max memory requirement
  fcsListBySubjectCellSubset <- list()
  for (currSubject in unique(fcsInfo$subject)){
    if(verbose){message(paste0("Assigning per-subset data for sample: ", currSubject, "..."))}
    fcsListBySubjectCellSubset[[currSubject]] <-
      buildFcsList(
        fcsInfo %>% dplyr::filter(subject == currSubject), indexField = "cellSubset", truncate_max_range = FALSE)
    # downsample each flowFrame within the FCSList for each subject
    if(!is.null(downsampleVectorList)) {
      currDownsampleVectorList <-
        downsampleVectorList[which(fcsInfo$subject == currSubject)]
      for(currFcsNum in seq_len(length(fcsListBySubjectCellSubset[[currSubject]])))
        if(!identical(currDownsampleVectorList[[currFcsNum]], "all")) # skip downsampling if "all" rows specified
          fcsListBySubjectCellSubset[[currSubject]][[currFcsNum]] <-
            fcsListBySubjectCellSubset[[currSubject]][[currFcsNum]][currDownsampleVectorList[[currFcsNum]],]
    }
  }

  if(verbose){message("Cleaning and selecting markers from .fcs files according to markerInfo file...")}

  for (currSubject in names(fcsListBySubjectCellSubset)){
    fcsListBySubjectCellSubset[[currSubject]] <-
      lapply(fcsListBySubjectCellSubset[[currSubject]], processFcsList, markerInfo = markerInfo)
  }

  if(verbose){message("Merging data (parent and child populations)...")}
  cellSubsetNames <- c(parentPopulation, setdiff(unique(fcsInfo$cellSubset), parentPopulation))
  # store the cell subset row counts for each subject's data (for use later in building mergedExpr)
  cellSubsetCountsBySubject <-
    lapply(fcsListBySubjectCellSubset,
           \(x) sapply(x[na.omit(match(cellSubsetNames, names(x)))], nrow, USE.NAMES = TRUE))
  
  # create object to store data merged by subject
  fcsListBySubjectMerged <- list()
  for(currSubject in names(fcsListBySubjectCellSubset)){
    # start with parentPopulation
    fcsListBySubjectMerged[[currSubject]] <- fcsListBySubjectCellSubset[[currSubject]][[parentPopulation]]
    # merge values into parent population, in same order
    exprs(fcsListBySubjectMerged[[currSubject]]) <-
      rbind(
        exprs(fcsListBySubjectMerged[[currSubject]]),
        do.call(
          rbind,
          lapply(
            fcsListBySubjectCellSubset[[currSubject]][
              na.omit(
                match(setdiff(cellSubsetNames, parentPopulation),
                      names(fcsListBySubjectCellSubset[[currSubject]]))
              )],
            exprs)))
  }
  # remove previous large data object and do a garbage cleanup
  rm(fcsListBySubjectCellSubset); gc()

  # Transform the data
  asinhTfmData <- function(fcs){
    # Arcsinh transform remaining columns
    tl <- transformList(
      flowCore::colnames(fcs),
      arcsinhTransform(a = arcsinhA, b = arcsinhB, c = arcsinhC),
      transformationId = "asinh"
    )
    fcs <- flowCore::transform(fcs, tl)
  }

  # transform and convert object to a flowSet for easier use downstream
  if(verbose){
    message("Transforming data using arcsinh(a=", arcsinhA, ", b=", arcsinhB, ", c=", arcsinhC, ")...")
  }
  fcsListBySubjectMerged <- as(lapply(fcsListBySubjectMerged, asinhTfmData), "flowSet")
  
  # Extract expression data and label Tmr+ events
  mergedExprStructureOnly <-
    setNames(
      data.frame(matrix(data = NA_real_, ncol = ncol(exprs(fcsListBySubjectMerged[[1]])), nrow = 0)),
      flowCore::colnames(fcsListBySubjectMerged))
  mergedExprStructureOnly$samp <- character()
  mergedExprStructureOnly$cellSubset <- list()
  # mergedExprStructureOnly$cellSubset <- character() # potential future change: make cellSubset a character vector
  
  mergedExpr <- list()
  for(currSubjectNum in seq_len(length(fcsListBySubjectMerged))){
    mergedExpr[[currSubjectNum]] <- as.data.frame(flowCore::exprs(fcsListBySubjectMerged[[currSubjectNum]]))
    mergedExpr[[currSubjectNum]]$samp <- as.character(names(cellSubsetCountsBySubject)[currSubjectNum])
    # potential future change: make cellSubset a character vector; remove the as.list() from following lines
    mergedExpr[[currSubjectNum]]$cellSubset <-
      as.list(
        rep(names(cellSubsetCountsBySubject[[currSubjectNum]]), times = cellSubsetCountsBySubject[[currSubjectNum]]))
  }
  mergedExpr <- bind_rows(mergedExprStructureOnly, mergedExpr)
  rm(fcsListBySubjectMerged); gc()

  # building a discovr experiment S3 object
  experiment <- structure(list(), class = "discovrExperiment")
  experiment$markerInfoFile     <- markerInfoFile
  experiment$markerInfo         <- markerInfo
  experiment$fcsInfoFile        <- fcsInfoFile
  experiment$fcsInfo            <- fcsInfo
  experiment$parentPopulation   <- parentPopulation
  experiment$mergedExpr         <- mergedExpr
  experiment$clusteringMarkers  <- clusteringMarkers
  experiment$status             <- "initialized"
  
  return(experiment)
}

#' Perform phenograph clustering for a DISCOV-R experiment
#'
#' @param experiment A discovrExperiment created using \code{setupDiscovrExperiment()}
#' @param method A character string indicating the clustering method to use.
#' Currently only 'phenograph' is supported as a clustering method.
#' @param k An integer specifying the maximum number of nearest neighbors to use. Applies only if method = 'phenograph' (default: 30)
#' @param seed A number to be used as the seed for pseudorandom number generation (default: 12345)
#' @param verbose A boolean specifying whether to display processing messages (default: TRUE)
#' @return An S3 object of class \code{discovrExperiment}
#'
#' @seealso \code{\link{setupDiscovrExperiment}} \code{\link{discovrExperiment}}
#' @author Mario G Rosasco, Virginia Muir
#' @import dplyr
#' @importFrom flowCore exprs
#' @importFrom igraph graph_from_data_frame cluster_louvain membership modularity
#' @export
clusterDiscovrExperiment <- function(
  experiment,
  method = "phenograph",
  k = 30,
  seed = 12345,
  verbose = TRUE
){
  if(!is.discovrExperiment(experiment)){
    stop(
      "The object passed to this function is not a valid DISCOV-R experiment object.",
      "Please create your experiment using the 'setupDiscovrExperiment' function and try again."
    )
  }
  
  # check for subjects with too few events to cluster
  if(method == "phenograph"){
    if(verbose){message("Checking for subjects with too few events to cluster...")}
    subjectCounts <- getSubjectCounts(experiment)
    if(any(subjectCounts$nEvents < k + 2)){
      stop(
        "PhenoGraph clustering can only proceed with subjects that have at ",
        "least k + 2 (", k + 2, ") total events. Please check or remove the ",
        "following subjects, or modify the clustering parameters, ",
        "and try again:\n",
        paste(subjectCounts$sample[subjectCounts$nEvents < k + 2], collapse = ", ")
      )
    }
  }

  set.seed(seed)

  #########################################################################
  # Section 2.d.i from original SOP - Phenograph clustering - takes a while
  #########################################################################
  # Clustering markers object is defined in the first set-up chunk.  Tweak which markers are included up there.

  # Set up Phenograph function to use kd tree
  findNeighbors <- function(data, k){
    nearest <- RANN::nn2(data, data, k, treetype = "kd", searchtype = "standard")
    return(nearest[[1]])
  }

  runRpheno <- function(data, k = 30){
    nDigits <- 3 # set number of digits for time display
    if(is.data.frame(data))
      data <- as.matrix(data)

    if(!is.matrix(data))
      stop("Wrong input data, should be a data frame or matrix!")

    if(k<1){
      stop("k must be a positive integer!")
    }else if (nrow(data) < k + 2){
      stop("k must be smaller than the total number of points!")
    }

    if(verbose){
      message("Run Rphenograph starts:","\n",
              "  -Input data of ", nrow(data)," rows and ", ncol(data), " columns","\n",
              "  -k is set to ", k)
      message("  Finding nearest neighbors...")
    }

    t1 <- system.time(neighborMatrix <- findNeighbors(data, k=k+1)[,-1])

    if(verbose){
      message(
        "DONE ~", round(t1[3], nDigits), "s\n",
        "Compute jaccard coefficient between nearest-neighbor sets..."
      )
    }
    t2 <- system.time(links <- jaccard_coeff(neighborMatrix))

    if(verbose){
      message(
        "DONE ~", round(t2[3], nDigits), "s\n",
        "Build undirected graph from the weighted links..."
      )
    }

    links <- links[links[,1]>0, ]
    relations <- as.data.frame(links)
    colnames(relations)<- c("from","to","weight")
    t3 <- system.time(g <- igraph::graph_from_data_frame(relations, directed=FALSE))

    if(verbose){
      message(
        "DONE ~", round(t3[3], nDigits), "s\n",
        "Run louvain clustering on the graph ..."
      )
    }

    t4 <- system.time(community <- igraph::cluster_louvain(g))
    if(verbose){
      message("DONE ~", round(t4[3], nDigits), "s\n")
    }

    if(verbose){
      message("Run Rphenograph DONE, took a total of ", round(sum(c(t1[3],t2[3],t3[3],t4[3])), nDigits), "s.")
      message("  Return a community class\n  -Modularity value: ", igraph::modularity(community),"\n")
      message("  -Number of clusters: ", length(unique(igraph::membership(community))))

    }
    return(community)
  }

  # Cluster data in experiment
  if (method == "phenograph") {
    # initialize RPclust
    experiment$mergedExpr$RPclust <- NA
    # assign row IDs for merge
    experiment$mergedExpr$rowIdx <- seq_len(nrow(experiment$mergedExpr))
    # cluster events within each subject
    allSubjects <- unique(experiment$mergedExpr$samp)
    for(currSubj in allSubjects){
      if(verbose){
        message("\nClustering sample ", match(currSubj, allSubjects), " of ", length(allSubjects), "...")
      }
      rPhenoVect <- as.numeric(igraph::membership(runRpheno(
        data = experiment$mergedExpr[
          experiment$mergedExpr$samp == currSubj,
          experiment$clusteringMarkers
        ],
        k = k
      )))
      # add cluster IDs
      experiment$mergedExpr[experiment$mergedExpr$samp == currSubj, "RPclust"] <- rPhenoVect
    }
    # remove row IDs so they don't interfere downstream
    experiment$mergedExpr <- subset(experiment$mergedExpr, select = -get("rowIdx"))
    # check to make sure that all events were assigned
    if(any(is.na(experiment$mergedExpr$RPclust))){
      stop("Not all events were assigned to a cluster. Please contact the package developer for assistance.")
    }

  } else {
    stop("Clustering method '", method, "' is not currently supported. Stopping...")
  }
  
  ###################################################################
  # Sections 2.e.i from original SOP - Summarize and save outputs
  ###################################################################
  
  # Calculate mean expression value of each marker for each phenograph cluster in each subject
  clusterMeans <- experiment$mergedExpr %>%
    dplyr::select(-cellSubset) %>%
    distinct() %>% # remove duplicated rows; mergedExpr has both parent and gated
    dplyr::group_by(samp, RPclust) %>%
    dplyr::summarise_all(mean) %>%
    dplyr::mutate(RPclust = as.character(RPclust))
  

  ##################################
  # Calculate total mean expression for each subject
  parentMeans <- experiment$mergedExpr %>%
    dplyr::select(-cellSubset, -RPclust) %>%
    distinct() %>% # remove duplicated rows; mergedExpr has both parent and gated
    dplyr::group_by(samp) %>%
    dplyr::summarise_all(mean) %>%
    dplyr::mutate(RPclust = "Total_Parent")

  # Count cells of each subpopulation (eg Tmr) in each phenograph cluster (from each sample)
  clusterRarePopCts <-
    experiment$mergedExpr %>%
    dplyr::select(samp, cellSubset, RPclust) %>%
    dplyr::group_by(samp, RPclust) %>%
    dplyr::summarise(Total = n(), .groups = "drop_last") %>%
    as.data.frame()

  uniqueSubsets <- unique(experiment$mergedExpr$cellSubset)

  for(currCellSubset in uniqueSubsets){
    if(verbose){message("Counting cluster events for ", currCellSubset)}
    additionalMatrix <- experiment$mergedExpr %>%
      dplyr::select(samp, cellSubset, RPclust) %>%
      group_by(samp, RPclust) %>%
      summarise(number = sum(cellSubset == currCellSubset)) %>%
      as.data.frame()
    clusterRarePopCts <- cbind(clusterRarePopCts, additionalMatrix$number)
  }

  for(i in seq_len(ncol(clusterRarePopCts)-3)){
    colnames(clusterRarePopCts)[i+3] <- (uniqueSubsets)[i]
  }

  aggregateCounts <- clusterRarePopCts %>%
    dplyr::select(-RPclust) %>%
    group_by(samp) %>%
    summarise_all(sum) %>%
    rename_at(vars(-samp), function(name) paste0(name,"_tot"))

  clusterRarePopCts <- left_join(clusterRarePopCts, aggregateCounts)

  for(i in seq_len(length(uniqueSubsets)+1)){
    clusterRarePopCts <- cbind(
      clusterRarePopCts,
      clusterRarePopCts[,i+2]/clusterRarePopCts[,i+2+length(uniqueSubsets)+1]*100
    )
  }

  for(i in seq_len(length(uniqueSubsets))){
    colnames(clusterRarePopCts)[i+3+(length(uniqueSubsets)+1)*2] <- (paste0("pct_", (uniqueSubsets)[i], "_in_clust"))
  }
  colnames(clusterRarePopCts)[3+(length(uniqueSubsets)+1)*2] <- "pct_Total_in_clust"

  # update experiment data
  experiment$status                        <- "clustered"
  experiment$clusterMeans                  <- bind_rows(clusterMeans, parentMeans)
  experiment$clusterMethod                 <- method
  experiment$clusterRarePopCts             <- clusterRarePopCts

  return(experiment)
}

#' Normalize marker expression within a DISCOV-R experiment
#'
#' @param experiment A discovrExperiment created using
#' \code{setupDiscovrExperiment()} and clustered using
#' \code{clusterDiscovrExperiment()}
#' @param normalizationInfo (default: NULL) Optional object containing
#' information on how to normalize. If left NULL, the \code{experiment} object
#' will be checked for normalization info in
#' \code{experiment$markerInfo$normalizationMethod}. If that is not present,
#' or for any markers without a method specified, normalization will use
#' the default method, currently "zScore". Acceptable non-NULL values include a
#' data frame with columns columns "commonMarkerName" and "normalizationMethod",
#' a named vector with elements specifying normalization method and names
#' referring to each marker in \code{experiment$mergedExpr}, or a single
#' character value specifying the normalization method to be applied to all
#' markers. Options for normalization are "zScore", "none", and "warpSet". In
#' addition, "warpSet" can include a maximum peak number specification as a
#' number immediately following (e.g. "warpSet2") - see documentation of
#' \code{flowStats::warpSet} for details. Any markers lacking a specification
#' will be normalized using the default method (currently "zScore").
#' @param defaultNormalizationMethod (default: "zScore") A character string, the
#' normalization method to use for markers without another method specified.
#' Options are the same as in \code{normalizationInfo} above.
#' @param seed (default: 12345) Numeric, the random number seed for warpSet
#' normalization, passed to \code{normalizeWarpSetMergedExpr}
#' @param verbose (default: TRUE) A logical specifying whether to display processing messages
#' @return An S3 object of class \code{discovrExperiment}, with additional
#' elements named "mergedExprNormalizedScaled" and
#' "clusterMeansNormalizedScaled". If the normalization methods were not taken
#' from \code{experiment$markerInfo$normalizationMethod}, the normalization
#' method for each marker will be stored there.
#'
#' @seealso \code{\link{discovrExperiment}}
#' @author Matt Dufort
#' @import flowCore
#' @import dplyr
#' @importFrom stats setNames na.omit
#' @export
normalizeDiscovrExperiment <- function(
    experiment,
    normalizationInfo = NULL,
    defaultNormalizationMethod = "zScore",
    seed = 12345,
    verbose = TRUE
){
  if(!is.discovrExperiment(experiment)){
    stop(
      "The object passed to 'normalizeDiscovrExperiment' is not a valid DISCOV-R experiment object. ",
      "Please create your experiment using the 'setupDiscovrExperiment' function ",
      "and perform the initial clustering using the 'clusterDiscovrExperiment' function. "
    )
  }
  if(experiment$status %in% c("normalized", "metaclustered")){
    message(
      "The experiment has a status of ", experiment$status, ", which indicates that normalization has already been run.\n",
      "Any existing normalized expression values stored in this experiment object will be overwritten.")
  } else if(experiment$status != "clustered"){
    stop(
      "The experiment must have status 'clustered', 'normalized', or 'metaclustered' in order to be ready for normalization. ",
      "The current experiment has a status of ", experiment$status, ". ",
      "Please make sure to create your experiment using the 'setupDiscovrExperiment' function ",
      "and perform the initial clustering using the 'clusterDiscovrExperiment' function. "
    )
  }
  
  # check experiment$mergedExpr for validity (shouldn't be necessary, but including to head off some edge cases)
  if(!is.data.frame(experiment$mergedExpr))
    stop("'mergedExpr' element within the experiment object is not a data.frame; please check inputs to 'normalizeScaleMergedExpr'")
  if(!("samp" %in% colnames(experiment$mergedExpr))) {
    stop(
      paste("Normalization of marker expression cannot be performed if 'mergedExpr' element",
            "within the experiment object does not contain column 'samp' specifying sample identity for each event"))
  }
  # potential change: add a check on marker names in mergedExpr and normalizationInfo
  
  # check defaultNormalizationMethod against accepted values
  if(
    !((defaultNormalizationMethod %in% c("zScore", "none", "warpSet")) |
      str_detect(defaultNormalizationMethod, "^warpSet[0-9]+$")))
    stop("Value for 'defaultNormalizationMethod' must be 'zScore', 'none', or 'warpset[#]'.")
  
  # check normalizationInfo, and set to defaultNormalizationMethod for any markers missing info
  # output of this chunk should be a data frame with "commonMarkerName" and "normalizationMethod" columns
  # "commonMarkerName" values should match all marker columns in mergedExpr (which should be all those before "samp")
  # "normalizationMethod" values should include only the options I've specified
  if (is.null(normalizationInfo)){
    # use experiment$markerInfo if it includes normalizationMethod
    if (!is.null(experiment$markerInfo$normalizationMethod)) {
      if(verbose){
        message(
          paste0(
            "Found column 'normalizationMethod' in markerInfo;\n",
            "using specified normalization method for each marker, ",
            "and ", defaultNormalizationMethod, " for any not specified.\n"))}
      normalizationInfo <- experiment$markerInfo
    } else {
      normalizationInfo <- defaultNormalizationMethod
    }
  }
  if (is.vector(normalizationInfo)) {
    if ((length(normalizationInfo) == 1)) {
      normalizationInfo <-
        setNames(rep(normalizationInfo, times = length(experiment$markerInfo$commonMarkerName)),
                 experiment$markerInfo$commonMarkerName)
    }
    # add zScore for any markers missing from normalizationInfo vector
    normalizationInfo <-
      c(normalizationInfo,
        setNames(
          rep(defaultNormalizationMethod,
              times = length(setdiff(experiment$markerInfo$commonMarkerName, names(normalizationInfo)))),
          setdiff(experiment$markerInfo$commonMarkerName, names(normalizationInfo)))
      )
    normalizationInfo <- data.frame(
      commonMarkerName = names(normalizationInfo),
      normalizationMethod = unname(normalizationInfo))
  }
  if (!is.data.frame(normalizationInfo))
    stop("Format of 'normalizationInfo' not recognized. It should be a data.frame or character vector.")
  
  # fill gaps in normalizationInfo with defaultNormalizationMethod
  markersMissing <- setdiff(experiment$markerInfo$commonMarkerName, normalizationInfo$commonMarkerName)
  if (length(markersMissing > 0))
    normalizationInfo <-
    bind_rows(normalizationInfo,
              data.frame(commonMarkerName = markersMissing,
                         normalizationMethod = defaultNormalizationMethod))
  if (is.null(normalizationInfo$normalizationMethod)){
    normalizationInfo$normalizationMethod <- defaultNormalizationMethod
  }
  normalizationInfo$normalizationMethod[is.na(normalizationInfo$normalizationMethod)] <- defaultNormalizationMethod
  
  # check values in normalizationInfo now that it's assembled
  normalizationInfo <- checkMarkerInfoNormalizationMethod(normalizationInfo)
  
  markersToNormalize <- intersect(normalizationInfo$commonMarkerName, colnames(experiment$mergedExpr))
  
  # check mergedExpr and normalizationInfo
  if (any(!(setdiff(colnames(experiment$mergedExpr), c("samp", "cellSubset", "RPclust")) %in% normalizationInfo$commonMarkerName)))
    stop("All markers names in 'mergedExpr' must have normalization specified in 'normalizationInfo'")
  if (!all(sapply(experiment$mergedExpr[, markersToNormalize], is.numeric)))
    stop("Only numeric values are allow in 'mergedExpr' marker expression levels.")
  
  mergedExprNormalizedScaled <- experiment$mergedExpr
  
  # apply z-score normalization to each sample, for all markers with z-score normalization specified
  for (samp.tmp in unique(experiment$mergedExpr$samp)) {
    mergedExprNormalizedScaled[
      mergedExprNormalizedScaled$samp == samp.tmp,
      match(
        normalizationInfo$commonMarkerName[
          (normalizationInfo$commonMarkerName %in% markersToNormalize) &
          (normalizationInfo$normalizationMethod %in% "zScore")],
            colnames(mergedExprNormalizedScaled))] <-
      scale(
        mergedExprNormalizedScaled[
          mergedExprNormalizedScaled$samp == samp.tmp,
          match(
            normalizationInfo$commonMarkerName[
              (normalizationInfo$commonMarkerName %in% markersToNormalize) &
                (normalizationInfo$normalizationMethod %in% "zScore")],
            colnames(mergedExprNormalizedScaled))])
  }
  
  # apply warpSet normalization to each marker, iterating over variations with different numbers of peaks specified
  if(any(str_detect(normalizationInfo$normalizationMethod[normalizationInfo$commonMarkerName %in% markersToNormalize], "warpSet"))){
    if(verbose)
      message("Starting warpSet normalization on select markers")
    
    for(warpSetMethod in 
        unique(grep("^warpSet[0-9]*$",
                    normalizationInfo$normalizationMethod[normalizationInfo$commonMarkerName %in% markersToNormalize],
                    value = TRUE))){
      maxPeakNr <-
        if(str_detect(warpSetMethod, "(?<=warpSet)[0-9]+")){
          as.numeric(str_extract(warpSetMethod, "(?<=warpSet)[0-9]+"))
        } else NULL
      mergedExprNormalizedScaled[
        ,
        c(
          normalizationInfo$commonMarkerName[
            (normalizationInfo$commonMarkerName %in% markersToNormalize) &
              (normalizationInfo$normalizationMethod == warpSetMethod)], "samp")] <-
        normalizeWarpSetMergedExpr(
          mergedExprNormalizedScaled[
            ,
            c(
              normalizationInfo$commonMarkerName[
                (normalizationInfo$commonMarkerName %in% markersToNormalize) &
                  (normalizationInfo$normalizationMethod == warpSetMethod)], "samp")],
          peakNr = maxPeakNr, groupCol = "samp", seed = seed)
    }
  }
  
  # for normalizationMethod "none": no change applied
  
  # scale each marker to have comparable weight in metaclustering
  mergedExprNormalizedScaled[, markersToNormalize] <-
    scale(mergedExprNormalizedScaled[, markersToNormalize])
  
  # skip calculation of marker means by cluster if experiment not clustered yet (only really used when running UMAP prior to clustering)
  if("RPclust" %in% colnames(mergedExprNormalizedScaled)){
    # Calculate mean expression value (scaled and normalized) of each marker for each phenograph cluster in each subject
    clusterMeansNormalizedScaled <- mergedExprNormalizedScaled %>%
      dplyr::select(-cellSubset) %>%
      distinct() %>% # remove duplicated rows; mergedExprNormalizedScaled has both parent and gated
      dplyr::group_by(samp, RPclust) %>%
      dplyr::summarise_all(mean) %>%
      dplyr::mutate(RPclust = as.character(RPclust))
    
    # Calculate total mean expression (scaled and normalized) for each subject
    parentMeansNormalizedScaled <- mergedExprNormalizedScaled %>%
      dplyr::select(-cellSubset, -RPclust) %>%
      distinct() %>% # remove duplicated rows; mergedExprNormalizedScaled has both parent and gated
      dplyr::group_by(samp) %>%
      dplyr::summarise_all(mean) %>%
      dplyr::mutate(RPclust = "Total_Parent")
  }
    
  # update experiment data
  experiment$status                          <- "normalized"
  experiment$markersNormalized               <- markersToNormalize
  experiment$mergedExprNormalizedScaled      <- mergedExprNormalizedScaled
  if("RPclust" %in% colnames(mergedExprNormalizedScaled)){
    experiment$clusterMeansNormalizedScaled    <- bind_rows(clusterMeansNormalizedScaled, parentMeansNormalizedScaled)
  }
  experiment$markerInfo$normalizationMethod[experiment$markerInfo$commonMarkerName %in% markersToNormalize] <-
    normalizationInfo$normalizationMethod[
      match(experiment$markerInfo$commonMarkerName[experiment$markerInfo$commonMarkerName %in% markersToNormalize],
            normalizationInfo$commonMarkerName)]
  experiment$markerInfo$normalizationMethod[!(experiment$markerInfo$commonMarkerName %in% markersToNormalize)] <-
    "not normalized"
  
  return(experiment)
}
BenaroyaResearch/briDiscovr documentation built on July 16, 2025, 7:13 p.m.