R/clusteringPhase.R

Defines functions clusterDiscovrExperiment setupDiscovrExperiment

Documented in clusterDiscovrExperiment setupDiscovrExperiment

## Copyright (C) 2020  Mario Rosasco and Benaroya Research Institute
##
## This file is part of the briDiscovr package

#' Prepare a dataset for DISCOV-R analysis
#'
#' Loads in the marker and .fcs information and prepares a \code{discovrExperiment}
#' object for analysis. Default parameters values are based on the original DISCOV-R
#' analysis published in Wiedeman et al 2020.
#'
#' @param markerInfoFile A character string indicating the path to a .csv file. This file is expected to have columns
#' named useToCluster", as well as the names specified in the \code{markerCommonField} and \code{markerFcsField} variables.
#' Details of the \code{markerCommonField} and \code{markerFcsField} arguments are provided below; the "useToCluster"
#' column should have only TRUE or FALSE values. Markers with a TRUE value in this column will be used for clustering,
#' whereas the others will not.
#' @param fcsInfoFile A character string indicating the path to a file containing columns named "subject",
#' "cellSubset", and "filename". The "filename" field must contain paths to the .fcs files that will be used in analysis.
#' @param parentPopulation A character sting indicating the name of the parent population subset. Must match
#' one of the values in the fcsInfoFile 'cellSubset' column.
#' @param markerCommonField (default: "fixed") A character string indicating the
#' name of a column containing common marker names for human use, like "CD45"
#' @param markerFcsField (default: "desc") A character string indicating the
#' name ofa column containing the marker names in the .fcs files,like "89Y_CD45"
#' @param arcsinhA (default: 0) A numeric indicating the value for 'a' in the
#' arcsinh data transformation equation. Should usually be 0.
#' @param arcsinhB (default: 0.2) A numeric indicating the value for 'b' in the arcsinh
#' data transformation equation. Should be 1/5 for cytof data, and 1/150 for flow data.
#' @param arcsinhC (default: 0) A numeric indicating the value for 'c' in the
#' arcsinh data transformation equation. Should usually be 0.
#' @param verbose (default: TRUE) A logical specifying whether to display processing messages
#' @param checkMemory (default: TRUE) A logical indicating whether to check how much system memory
#' is available before loading the dataset. If TRUE, this function will display a message and prevent
#' data loading when the files take up more than 80 percent of the available system memory.
#' @param downsampleVectorList (default: NULL) A list of vectors of row numbers
#'  to use from files in \code{fcsInfoFile}. Generally generated by 
#' \code{downsampleFcsList}. If an object is specified here, it is used to 
#' downsample the files as they are read in. NOTE: the list must be in the same
#' order as the files in \code{fcsInfoFile}, so it is best to generate
#' this object directly from that file to ensure that they match. Possible
#' future update to allow named list with names matching a combination of
#' identifying fields for the FCS files.
#' @return An S3 object of class \code{discovrExperiment}
#'
#' @seealso \code{\link{discovrExperiment}}
#' @author Mario G Rosasco, Virginia Muir, Matt Dufort
#' @import flowCore
#' @import dplyr
#' @importFrom methods as
#' @importFrom stats setNames
#' @importFrom utils read.csv
#' @importFrom memuse Sys.meminfo mu.size
#' @export
setupDiscovrExperiment <- function(
  markerInfoFile,
  fcsInfoFile,
  parentPopulation,
  markerCommonField = "fixed",
  markerFcsField = "desc",
  arcsinhA = 0,
  arcsinhB = 0.2,
  arcsinhC = 0,
  verbose = TRUE,
  checkMemory = TRUE,
  downsampleVectorList = NULL
){
  # get marker info, and check for appropriate columns
  markerInfo <- read.csv(markerInfoFile, stringsAsFactors = FALSE)
  if(!all(c(markerCommonField, markerFcsField, "useToCluster") %in% names(markerInfo))){
    stop(
      "The file set as 'markerInfoFile' must contain columns with names 'useToCluster' and the names ",
      "specified by markerCommonField and markerFcsField Please check your markerInfo file and try again."
    )
  }
  if(!all(is.logical(markerInfo$useToCluster))){
    stop("The 'useToCluster' column in the 'markerInfo' file must contain only TRUE or FALSE values.")
  }
  # set colnames of marker info data for consistency
  markerInfo <- dplyr::rename(markerInfo, commonMarkerName = !!markerCommonField, fcsMarkerName = !!markerFcsField)
  # make sure common names are valid
  markerInfo$commonMarkerName <- make.names(markerInfo$commonMarkerName)

  # check for appropriate marker names
  if(nrow(markerInfo) > length(unique(markerInfo$commonMarkerName))){
    stop("Marker names in the ", markerCommonField, " column of the markerInfo file do not appear to be unique. Please correct this and try again.")
  }
  if(any(markerInfo$commonMarkerName == "")){
    stop("Markers without assigned names were detected in the markerInfo file. Please correct the file and try again.")
  }

  # if input includes non-null downsampleVectorList, check that it is valid
  if(!is.null(downsampleVectorList)) {
    if(!is.list(downsampleVectorList) | 
       !all(sapply(downsampleVectorList, is.numeric) | (downsampleVectorList == "all")))
      stop("The object included as 'downsampleVectorList' must be a list containing ",
           "vectors of row names (generally the output of 'downsampleFcsList'). ",
           "Please check that this object is a list, and that every object within ",
           "it is a numeric vector.")
  }
  
  # extract clustering markers for convenience
  clusteringMarkers <- markerInfo[markerInfo$useToCluster, "commonMarkerName", drop = TRUE]
  if(verbose){message(paste("Found clustering markers:", paste(clusteringMarkers, collapse=", ")))}

  # get list of FCS files with subject and cell subset information, run checks on this input and the FCS files
  fcsInfo <- read.csv(fcsInfoFile, stringsAsFactors = FALSE)
  checkFcsFiles(fcsInfo, checkMemory = checkMemory, verbose = verbose)
  
  # if input includes non-null downsampleVectorList, check that it has one entry for each FCS files
  if(!is.null(downsampleVectorList)) {
    if(length(downsampleVectorList) != nrow(fcsInfo))
      stop("The object included as 'downsampleVectorList' must have one element ",
           "for each FCS file listed in the FCS info file. The object passed as ",
           "'downsampleVectorList' contains ", length(downsampleVectorList),
           " elements, while the FCS info file lists ", nrow(fcsInfo), " files.")
  }
  
  # check to ensure marker names and cell subsets are non-overlapping. Creates issues downstream if this happens.
  overlapNames <- fcsInfo$cellSubset[fcsInfo$cellSubset %in% markerInfo$commonMarkerName]
  if (length(overlapNames) > 0){
    stop(
      "Cell subset names and marker names must not overlap. The following names were detected in both sets of names: ",
      paste0(overlapNames, collapse = ", ")
    )
  }

  # check that the parent population is one of the subsets in the .fcs info
  if(!parentPopulation %in% unique(fcsInfo$cellSubset)){
    stop(
      "The requested parent population '",
      parentPopulation,
      "' was not found in the 'cellSubset' column of the .fcs info file."
    )
  }

  if(verbose){message("All .fcs files passed basic QC checks. Proceeding...")}

  ###########################################################
  # Section 2.c.i-iii from original SOP load and format input
  ###########################################################

  # Read in FCS files and associate with cell subsets and subjects
  for (currCellSubset in unique(fcsInfo$cellSubset)){
    if(verbose){message(paste0("Assigning per-subject data for subset: ", currCellSubset, "..."))}
    currFcsList <-
      buildFcsList(fcsInfo %>% dplyr::filter(.data$cellSubset == currCellSubset), truncate_max_range = FALSE)
    if(!is.null(downsampleVectorList)) {
      currDownsampleVectorList <-
        downsampleVectorList[which(fcsInfo$cellSubset == currCellSubset)]
      for(currFcsNum in seq_len(length(currFcsList)))
        if(!identical(currDownsampleVectorList[[currFcsNum]], "all")) # skip downsampling if "all" rows specified
          currFcsList[[currFcsNum]] <-
            currFcsList[[currFcsNum]][currDownsampleVectorList[[currFcsNum]],]
    }
    assign(currCellSubset, currFcsList)
  }
  allSubjects <-
    buildFcsList(fcsInfo[fcsInfo$cellSubset == parentPopulation,], truncate_max_range = FALSE)
  if(!is.null(downsampleVectorList)) {
    currDownsampleVectorList <-
      downsampleVectorList[which(fcsInfo$cellSubset == parentPopulation)]
    for(currFcsNum in seq_len(length(allSubjects)))
      if(!identical(currDownsampleVectorList[[currFcsNum]], "all")) # skip downsampling if "all" rows specified
      allSubjects[[currFcsNum]] <-
        allSubjects[[currFcsNum]][currDownsampleVectorList[[currFcsNum]],]
  }
  
  # Process data
  processData <- function(fcs){
    # confirm that all markers to be used in clustering are mappable to fcs parameter names
    clusteringMarkerDesc <- markerInfo[markerInfo$useToCluster, "fcsMarkerName", drop = TRUE]
    missingMarkers <- clusteringMarkerDesc[!clusteringMarkerDesc %in% pData(parameters(fcs))$desc]
    
    # check for missingMarkers in name field of flowFrame parameters, where desc is empty
    missingMarkersToRename <- intersect(missingMarkers, pData(parameters(fcs))$name)
    missingMarkersToRename <-
      missingMarkersToRename[
        is.na(pData(parameters(fcs))$desc[match(missingMarkersToRename, pData(parameters(fcs))$name)])]
    if (length(missingMarkersToRename) > 0){
      pData(parameters(fcs))$desc[match(missingMarkers, pData(parameters(fcs))$name)] <-
        missingMarkersToRename
    }
    missingMarkers <- setdiff(missingMarkers, missingMarkersToRename)
    
    if(length(missingMarkers) > 0){
      stop("Could not find the following clustering markers in the .fcs data\n", paste0(missingMarkers, collapse = ", "))
    }

    # Tidy marker names
    pData(parameters(fcs))$desc <-
      markerInfo$commonMarkerName[match(pData(parameters(fcs))$desc, markerInfo$fcsMarkerName)]

    # This changes parameters(fcs)$name, featureNames(fcs), and colnames(fcs) - aka events colnames - all in one fell swoop.
    # note colnames has to be the one from flowCore
    flowCore::colnames(fcs) = make.names(pData(parameters(fcs))$desc)

    # Remove markers that aren't informative/shared between panels (i.e. duplicated NAs)
    fcs = fcs[,!(duplicated(flowCore::colnames(fcs)) | duplicated(flowCore::colnames(fcs), fromLast = TRUE))]
    fcs = fcs[, order(flowCore::colnames(fcs))]
  }

  if(verbose){message("Cleaning and selecting markers from .fcs files according to markerInfo file...")}

  for (currCellSubset in unique(fcsInfo$cellSubset)){
    tmpFile <- get(currCellSubset)
    tmpFile <- lapply(tmpFile, processData)
    assign(currCellSubset, tmpFile)
  }
  allSubjects <- lapply(allSubjects, processData)

  if(verbose){message("Merging data (parent and child populations)...")}
  allMerged = allSubjects
  for(currSubject in names(allSubjects)){
    exprs(allMerged[[currSubject]]) = exprs(allMerged[[currSubject]])[0,]
    for(cellSubset in unique(fcsInfo$cellSubset)){
      exprs(allMerged[[currSubject]]) = rbind(
        exprs(allMerged[[currSubject]]),
        if(!is.null(get(cellSubset)[[currSubject]])) exprs(get(cellSubset)[[currSubject]])
      )
    }
  }

  # Transform the data
  asinhTfmData <- function(fcs){
    # Arcsinh transform remaining columns
    tl <- transformList(
      flowCore::colnames(fcs),
      arcsinhTransform(a = arcsinhA, b = arcsinhB, c = arcsinhC),
      transformationId="asinh"
    )
    fcs = transform(fcs, tl)
  }

  if(verbose){
    message("Transforming data using arcsinh(a=", arcsinhA, ", b=", arcsinhB, ", c=", arcsinhC, ")...")
  }
  allDataTransformed <- lapply(allMerged, asinhTfmData)
  transformedFlowSet <- as(allDataTransformed, "flowSet")
  # remove old data to conserve memory
  rm(allDataTransformed)

  # Extract expression data and label Tmr+ events
  mergedExpr <- setNames(
    data.frame(matrix(ncol = ncol(exprs(transformedFlowSet[[1]]))+2, nrow = 0)),
    c(flowCore::colnames(transformedFlowSet), "samp", "cellSubset")
  )

  for(currSubject in names(allSubjects)){
    tmpExpr = as.data.frame(flowCore::exprs(transformedFlowSet[[currSubject]]))
    tmpExpr$samp = as.character(currSubject)
    temp_cellSubsets <- list()
    for(cellSubset in unique(fcsInfo$cellSubset)){
      temp_cellSubsets <- c(
        temp_cellSubsets,
        if(!is.null(get(cellSubset)[[currSubject]])){
          rep(paste0(cellSubset), nrow(flowCore::exprs(get(cellSubset)[[currSubject]])))
        }
      )
    }
    tmpExpr$cellSubset = temp_cellSubsets
    mergedExpr = rbind(mergedExpr, tmpExpr)
  }

  # building a discovr experiment S3 object
  exptInProgress <- structure(list(), class = "discovrExperiment")
  exptInProgress$markerInfoFile     <- markerInfoFile
  exptInProgress$markerInfo         <- markerInfo
  exptInProgress$fcsInfoFile        <- fcsInfoFile
  exptInProgress$fcsInfo            <- fcsInfo
  exptInProgress$parentPopulation   <- parentPopulation
  exptInProgress$mergedExpr         <- mergedExpr
  exptInProgress$clusteringMarkers  <- clusteringMarkers
  exptInProgress$status             <- "initialized"

  return(exptInProgress)
}

#' Perform phenograph clustering for a DISCOV-R experiment
#'
#' @param experiment A discovrExperiment created using \code{setupDiscovrExperiment()}
#' @param method A character string indicating the clustering method to use.
#' Currently only 'phenograph' is supported as a clustering method.
#' @param k An integer specifying the maximum number of nearest neighbors to use. Applies only if method = 'phenograph' (default: 30)
#' @param seed A number to be used as the seed for pseudorandom number generation (default: 12345)
#' @param verbose A boolean specifying whether to display processing messages (default: TRUE)
#' @return An S3 object of class \code{discovrExperiment}
#'
#' @seealso \code{\link{setupDiscovrExperiment}} \code{\link{discovrExperiment}}
#' @author Mario G Rosasco, Virginia Muir
#' @import dplyr
#' @importFrom rlang .data
#' @importFrom flowCore exprs
#' @importFrom igraph graph.data.frame cluster_louvain membership modularity
#' @export
clusterDiscovrExperiment <- function(
  experiment,
  method = "phenograph",
  k = 30,
  seed = 12345,
  verbose = TRUE
){
  if(!is.discovrExperiment(experiment)){
    stop(
      "The object passed to this function is not a valid DISCOV-R experiment object.",
      "Please create your experiment using the 'setupDiscovrExperiment' function and try again."
    )
  }
  
  # check for subjects with too few events to cluster
  if(method == "phenograph"){
    if(verbose){message("Checking for subjects with too few events to cluster...")}
    subjectCounts <- getSubjectCounts(experiment)
    if(any(subjectCounts$nEvents < k + 2)){
      stop(
        "PhenoGraph clustering can only proceed with subjects that have at ",
        "least k + 2 (", k + 2, ") total events. Please check or remove the ",
        "following subjects, or modify the clustering parameters, ",
        "and try again:\n",
        paste(subjectCounts$sample[subjectCounts$nEvents < k + 2], collapse = ", ")
      )
    }
  }

  set.seed(seed)

  #########################################################################
  # Section 2.d.i from original SOP - Phenograph clustering - takes a while
  #########################################################################
  # Clustering markers object is defined in the first set-up chunk.  Tweak which markers are included up there.

  # Set up Phenograph function to use kd tree
  findNeighbors <- function(data, k){
    nearest <- RANN::nn2(data, data, k, treetype = "kd", searchtype = "standard")
    return(nearest[[1]])
  }

  runRpheno <- function(data, k=30){
    nDigits <- 3 # set number of digits for time display
    if(is.data.frame(data))
      data <- as.matrix(data)

    if(!is.matrix(data))
      stop("Wrong input data, should be a data frame or matrix!")

    if(k<1){
      stop("k must be a positive integer!")
    }else if (nrow(data) < k + 2){
      stop("k must be smaller than the total number of points!")
    }

    if(verbose){
      message("Run Rphenograph starts:","\n",
              "  -Input data of ", nrow(data)," rows and ", ncol(data), " columns","\n",
              "  -k is set to ", k)
      message("  Finding nearest neighbors...")
    }

    t1 <- system.time(neighborMatrix <- findNeighbors(data, k=k+1)[,-1])

    if(verbose){
      message(
        "DONE ~", round(t1[3], nDigits), "s\n",
        "Compute jaccard coefficient between nearest-neighbor sets..."
      )
    }
    t2 <- system.time(links <- jaccard_coeff(neighborMatrix))

    if(verbose){
      message(
        "DONE ~", round(t2[3], nDigits), "s\n",
        "Build undirected graph from the weighted links..."
      )
    }

    links <- links[links[,1]>0, ]
    relations <- as.data.frame(links)
    colnames(relations)<- c("from","to","weight")
    t3 <- system.time(g <- igraph::graph.data.frame(relations, directed=FALSE))

    if(verbose){
      message(
        "DONE ~", round(t3[3], nDigits), "s\n",
        "Run louvain clustering on the graph ..."
      )
    }

    t4 <- system.time(community <- igraph::cluster_louvain(g))
    if(verbose){
      message("DONE ~", round(t4[3], nDigits), "s\n")
    }

    if(verbose){
      message("Run Rphenograph DONE, took a total of ", round(sum(c(t1[3],t2[3],t3[3],t4[3])), nDigits), "s.")
      message("  Return a community class\n  -Modularity value: ", igraph::modularity(community),"\n")
      message("  -Number of clusters: ", length(unique(igraph::membership(community))))

    }
    return(community)
  }

  # Cluster data in experiment
  if (method == "phenograph") {
    # initialize RPclust
    experiment$mergedExpr$RPclust <- NA
    # assign row IDs for merge
    experiment$mergedExpr$rowIdx <- seq_len(nrow(experiment$mergedExpr))
    # cluster events within each subject
    allSubjects <- unique(experiment$mergedExpr$samp)
    for(currSubj in allSubjects){
      if(verbose){
        message("\nClustering sample ", match(currSubj, allSubjects), " of ", length(allSubjects), "...")
      }
      rPhenoVect = as.numeric(igraph::membership(runRpheno(
        data = experiment$mergedExpr[
          experiment$mergedExpr$samp == currSubj,
          experiment$clusteringMarkers
        ],
        k = k
      )))
      # add cluster IDs
      experiment$mergedExpr[experiment$mergedExpr$samp == currSubj, "RPclust"] <- rPhenoVect
    }
    # remove row IDs so they don't interfere downstream
    experiment$mergedExpr <- subset(experiment$mergedExpr, select = -get("rowIdx"))
    # check to make sure that all events were assigned
    if(any(is.na(experiment$mergedExpr$RPclust))){
      stop("Not all events were assigned to a cluster. Please contact the package developer for assistance.")
    }

  } else {
    stop("Clustering method '", method, "' is not currently supported. Stopping...")
  }

  ###################################################################
  # Sections 2.e.i from original SOP - Summarize and save outputs
  ###################################################################
  # Calculate mean expression value of each marker for each phenograph cluster in each subject
  clusterMeans <- experiment$mergedExpr %>%
    dplyr::select(-.data$cellSubset) %>%
    unique() %>% # remove duplicated rows; mergedExpr has both parent and gated
    dplyr::group_by(.data$samp, .data$RPclust) %>%
    dplyr::summarise_all(mean) %>%
    dplyr::mutate(RPclust = as.character(.data$RPclust))

  ##################################
  # Calculate total mean expression for each subject
  parentMeans = experiment$mergedExpr %>%
    dplyr::select(-.data$cellSubset, -.data$RPclust) %>%
    unique() %>% # remove duplicated rows; mergedExpr has both parent and gated
    dplyr::group_by(.data$samp) %>%
    dplyr::summarise_all(mean) %>%
    dplyr::mutate(RPclust = "Total_Parent")

  experiment$clusterMeans <- bind_rows(clusterMeans, parentMeans)

  # Count cells of each subpopulation (eg Tmr) in each phenograph cluster (from each sample)
  clusterRarePopCts <-
    experiment$mergedExpr %>%
    dplyr::select(.data$samp, .data$cellSubset, .data$RPclust) %>%
    dplyr::group_by(.data$samp, .data$RPclust) %>%
    dplyr::summarise(Total=n(), .groups = "drop_last") %>%
    as.data.frame()

  uniqueSubsets <- unique(experiment$mergedExpr$cellSubset)

  for(currCellSubset in uniqueSubsets){
    if(verbose){message("Counting cluster events for ", currCellSubset)}
    additionalMatrix = experiment$mergedExpr %>%
      dplyr::select(.data$samp, .data$cellSubset, .data$RPclust) %>%
      group_by(.data$samp, .data$RPclust) %>%
      summarise(number = sum(.data$cellSubset == currCellSubset)) %>%
      as.data.frame()
    clusterRarePopCts <- cbind(clusterRarePopCts, additionalMatrix$number)
  }

  for(i in seq_len(ncol(clusterRarePopCts)-3)){
    colnames(clusterRarePopCts)[i+3] = (uniqueSubsets)[i]
  }

  aggregateCounts = clusterRarePopCts %>%
    dplyr::select(-.data$RPclust) %>%
    group_by(.data$samp) %>%
    summarise_all(sum) %>%
    rename_at(vars(-.data$samp),function(name) paste0(name,"_tot"))

  clusterRarePopCts = left_join(clusterRarePopCts, aggregateCounts)

  for(i in seq_len(length(uniqueSubsets)+1)){
    clusterRarePopCts <- cbind(
      clusterRarePopCts,
      clusterRarePopCts[,i+2]/clusterRarePopCts[,i+2+length(uniqueSubsets)+1]*100
    )
  }

  for(i in seq_len(length(uniqueSubsets))){
    colnames(clusterRarePopCts)[i+3+(length(uniqueSubsets)+1)*2]= (paste0("pct_", (uniqueSubsets)[i], "_in_clust"))
  }
  colnames(clusterRarePopCts)[3+(length(uniqueSubsets)+1)*2]= "pct_Total_in_clust"

  # update experiment data
  experiment$status             <- "clustered"
  experiment$clusterMethod      <- method
  experiment$clusterRarePopCts  <- clusterRarePopCts

  return(experiment)
}
BenaroyaResearch/briDiscovr documentation built on March 15, 2024, 12:31 a.m.