## Copyright (C) 2020 Mario Rosasco and Benaroya Research Institute
##
## This file is part of the briDiscovr package
#' Prepare a dataset for DISCOV-R analysis
#'
#' Loads in the marker and .fcs information and prepares a \code{discovrExperiment}
#' object for analysis. Default parameters values are based on the original DISCOV-R
#' analysis published in Wiedeman et al 2020.
#'
#' @param markerInfoFile A character string indicating the path to a .csv file. This file is expected to have columns
#' named useToCluster", as well as the names specified in the \code{markerCommonField} and \code{markerFcsField} variables.
#' Details of the \code{markerCommonField} and \code{markerFcsField} arguments are provided below; the "useToCluster"
#' column should have only TRUE or FALSE values. Markers with a TRUE value in this column will be used for clustering,
#' whereas the others will not.
#' @param fcsInfoFile A character string indicating the path to a file containing columns named "subject",
#' "cellSubset", and "filename". The "filename" field must contain paths to the .fcs files that will be used in analysis.
#' @param parentPopulation A character sting indicating the name of the parent population subset. Must match
#' one of the values in the fcsInfoFile 'cellSubset' column.
#' @param markerCommonField (default: "fixed") A character string indicating the
#' name of a column containing common marker names for human use, like "CD45"
#' @param markerFcsField (default: "desc") A character string indicating the
#' name ofa column containing the marker names in the .fcs files,like "89Y_CD45"
#' @param arcsinhA (default: 0) A numeric indicating the value for 'a' in the
#' arcsinh data transformation equation. Should usually be 0.
#' @param arcsinhB (default: 0.2) A numeric indicating the value for 'b' in the arcsinh
#' data transformation equation. Should be 1/5 for cytof data, and 1/150 for flow data.
#' @param arcsinhC (default: 0) A numeric indicating the value for 'c' in the
#' arcsinh data transformation equation. Should usually be 0.
#' @param verbose (default: TRUE) A logical specifying whether to display processing messages
#' @param checkMemory (default: TRUE) A logical indicating whether to check how much system memory
#' is available before loading the dataset. If TRUE, this function will display a message and prevent
#' data loading when the files take up more than 80 percent of the available system memory.
#' @param downsampleVectorList (default: NULL) A list of vectors of row numbers
#' to use from files in \code{fcsInfoFile}. Generally generated by
#' \code{downsampleFcsList}. If an object is specified here, it is used to
#' downsample the files as they are read in. NOTE: the list must be in the same
#' order as the files in \code{fcsInfoFile}, so it is best to generate
#' this object directly from that file to ensure that they match. Possible
#' future update to allow named list with names matching a combination of
#' identifying fields for the FCS files.
#' @return An S3 object of class \code{discovrExperiment}
#'
#' @seealso \code{\link{discovrExperiment}}
#' @author Mario G Rosasco, Virginia Muir, Matt Dufort
#' @import flowCore
#' @import dplyr
#' @importFrom methods as
#' @importFrom stats setNames
#' @importFrom utils read.csv
#' @importFrom memuse Sys.meminfo mu.size
#' @export
setupDiscovrExperiment <- function(
markerInfoFile,
fcsInfoFile,
parentPopulation,
markerCommonField = "fixed",
markerFcsField = "desc",
arcsinhA = 0,
arcsinhB = 0.2,
arcsinhC = 0,
verbose = TRUE,
checkMemory = TRUE,
downsampleVectorList = NULL
){
# get marker info, and check for appropriate columns
markerInfo <- read.csv(markerInfoFile, stringsAsFactors = FALSE)
if(!all(c(markerCommonField, markerFcsField, "useToCluster") %in% names(markerInfo))){
stop(
"The file set as 'markerInfoFile' must contain columns with names 'useToCluster' and the names ",
"specified by markerCommonField and markerFcsField Please check your markerInfo file and try again."
)
}
if(!all(is.logical(markerInfo$useToCluster))){
stop("The 'useToCluster' column in the 'markerInfo' file must contain only TRUE or FALSE values.")
}
# set colnames of marker info data for consistency
markerInfo <- dplyr::rename(markerInfo, commonMarkerName = !!markerCommonField, fcsMarkerName = !!markerFcsField)
# make sure common names are valid
markerInfo$commonMarkerName <- make.names(markerInfo$commonMarkerName)
# check for appropriate marker names
if(nrow(markerInfo) > length(unique(markerInfo$commonMarkerName))){
stop("Marker names in the ", markerCommonField, " column of the markerInfo file do not appear to be unique. Please correct this and try again.")
}
if(any(markerInfo$commonMarkerName == "")){
stop("Markers without assigned names were detected in the markerInfo file. Please correct the file and try again.")
}
# if input includes non-null downsampleVectorList, check that it is valid
if(!is.null(downsampleVectorList)) {
if(!is.list(downsampleVectorList) |
!all(sapply(downsampleVectorList, is.numeric) | (downsampleVectorList == "all")))
stop("The object included as 'downsampleVectorList' must be a list containing ",
"vectors of row names (generally the output of 'downsampleFcsList'). ",
"Please check that this object is a list, and that every object within ",
"it is a numeric vector.")
}
# extract clustering markers for convenience
clusteringMarkers <- markerInfo[markerInfo$useToCluster, "commonMarkerName", drop = TRUE]
if(verbose){message(paste("Found clustering markers:", paste(clusteringMarkers, collapse=", ")))}
# get list of FCS files with subject and cell subset information, run checks on this input and the FCS files
fcsInfo <- read.csv(fcsInfoFile, stringsAsFactors = FALSE)
checkFcsFiles(fcsInfo, checkMemory = checkMemory, verbose = verbose)
# if input includes non-null downsampleVectorList, check that it has one entry for each FCS files
if(!is.null(downsampleVectorList)) {
if(length(downsampleVectorList) != nrow(fcsInfo))
stop("The object included as 'downsampleVectorList' must have one element ",
"for each FCS file listed in the FCS info file. The object passed as ",
"'downsampleVectorList' contains ", length(downsampleVectorList),
" elements, while the FCS info file lists ", nrow(fcsInfo), " files.")
}
# check to ensure marker names and cell subsets are non-overlapping. Creates issues downstream if this happens.
overlapNames <- fcsInfo$cellSubset[fcsInfo$cellSubset %in% markerInfo$commonMarkerName]
if (length(overlapNames) > 0){
stop(
"Cell subset names and marker names must not overlap. The following names were detected in both sets of names: ",
paste0(overlapNames, collapse = ", ")
)
}
# check that the parent population is one of the subsets in the .fcs info
if(!parentPopulation %in% unique(fcsInfo$cellSubset)){
stop(
"The requested parent population '",
parentPopulation,
"' was not found in the 'cellSubset' column of the .fcs info file."
)
}
if(verbose){message("All .fcs files passed basic QC checks. Proceeding...")}
###########################################################
# Section 2.c.i-iii from original SOP load and format input
###########################################################
# Read in FCS files and associate with cell subsets and subjects
for (currCellSubset in unique(fcsInfo$cellSubset)){
if(verbose){message(paste0("Assigning per-subject data for subset: ", currCellSubset, "..."))}
currFcsList <-
buildFcsList(fcsInfo %>% dplyr::filter(.data$cellSubset == currCellSubset), truncate_max_range = FALSE)
if(!is.null(downsampleVectorList)) {
currDownsampleVectorList <-
downsampleVectorList[which(fcsInfo$cellSubset == currCellSubset)]
for(currFcsNum in seq_len(length(currFcsList)))
if(!identical(currDownsampleVectorList[[currFcsNum]], "all")) # skip downsampling if "all" rows specified
currFcsList[[currFcsNum]] <-
currFcsList[[currFcsNum]][currDownsampleVectorList[[currFcsNum]],]
}
assign(currCellSubset, currFcsList)
}
allSubjects <-
buildFcsList(fcsInfo[fcsInfo$cellSubset == parentPopulation,], truncate_max_range = FALSE)
if(!is.null(downsampleVectorList)) {
currDownsampleVectorList <-
downsampleVectorList[which(fcsInfo$cellSubset == parentPopulation)]
for(currFcsNum in seq_len(length(allSubjects)))
if(!identical(currDownsampleVectorList[[currFcsNum]], "all")) # skip downsampling if "all" rows specified
allSubjects[[currFcsNum]] <-
allSubjects[[currFcsNum]][currDownsampleVectorList[[currFcsNum]],]
}
# Process data
processData <- function(fcs){
# confirm that all markers to be used in clustering are mappable to fcs parameter names
clusteringMarkerDesc <- markerInfo[markerInfo$useToCluster, "fcsMarkerName", drop = TRUE]
missingMarkers <- clusteringMarkerDesc[!clusteringMarkerDesc %in% pData(parameters(fcs))$desc]
# check for missingMarkers in name field of flowFrame parameters, where desc is empty
missingMarkersToRename <- intersect(missingMarkers, pData(parameters(fcs))$name)
missingMarkersToRename <-
missingMarkersToRename[
is.na(pData(parameters(fcs))$desc[match(missingMarkersToRename, pData(parameters(fcs))$name)])]
if (length(missingMarkersToRename) > 0){
pData(parameters(fcs))$desc[match(missingMarkers, pData(parameters(fcs))$name)] <-
missingMarkersToRename
}
missingMarkers <- setdiff(missingMarkers, missingMarkersToRename)
if(length(missingMarkers) > 0){
stop("Could not find the following clustering markers in the .fcs data\n", paste0(missingMarkers, collapse = ", "))
}
# Tidy marker names
pData(parameters(fcs))$desc <-
markerInfo$commonMarkerName[match(pData(parameters(fcs))$desc, markerInfo$fcsMarkerName)]
# This changes parameters(fcs)$name, featureNames(fcs), and colnames(fcs) - aka events colnames - all in one fell swoop.
# note colnames has to be the one from flowCore
flowCore::colnames(fcs) = make.names(pData(parameters(fcs))$desc)
# Remove markers that aren't informative/shared between panels (i.e. duplicated NAs)
fcs = fcs[,!(duplicated(flowCore::colnames(fcs)) | duplicated(flowCore::colnames(fcs), fromLast = TRUE))]
fcs = fcs[, order(flowCore::colnames(fcs))]
}
if(verbose){message("Cleaning and selecting markers from .fcs files according to markerInfo file...")}
for (currCellSubset in unique(fcsInfo$cellSubset)){
tmpFile <- get(currCellSubset)
tmpFile <- lapply(tmpFile, processData)
assign(currCellSubset, tmpFile)
}
allSubjects <- lapply(allSubjects, processData)
if(verbose){message("Merging data (parent and child populations)...")}
allMerged = allSubjects
for(currSubject in names(allSubjects)){
exprs(allMerged[[currSubject]]) = exprs(allMerged[[currSubject]])[0,]
for(cellSubset in unique(fcsInfo$cellSubset)){
exprs(allMerged[[currSubject]]) = rbind(
exprs(allMerged[[currSubject]]),
if(!is.null(get(cellSubset)[[currSubject]])) exprs(get(cellSubset)[[currSubject]])
)
}
}
# Transform the data
asinhTfmData <- function(fcs){
# Arcsinh transform remaining columns
tl <- transformList(
flowCore::colnames(fcs),
arcsinhTransform(a = arcsinhA, b = arcsinhB, c = arcsinhC),
transformationId="asinh"
)
fcs = transform(fcs, tl)
}
if(verbose){
message("Transforming data using arcsinh(a=", arcsinhA, ", b=", arcsinhB, ", c=", arcsinhC, ")...")
}
allDataTransformed <- lapply(allMerged, asinhTfmData)
transformedFlowSet <- as(allDataTransformed, "flowSet")
# remove old data to conserve memory
rm(allDataTransformed)
# Extract expression data and label Tmr+ events
mergedExpr <- setNames(
data.frame(matrix(ncol = ncol(exprs(transformedFlowSet[[1]]))+2, nrow = 0)),
c(flowCore::colnames(transformedFlowSet), "samp", "cellSubset")
)
for(currSubject in names(allSubjects)){
tmpExpr = as.data.frame(flowCore::exprs(transformedFlowSet[[currSubject]]))
tmpExpr$samp = as.character(currSubject)
temp_cellSubsets <- list()
for(cellSubset in unique(fcsInfo$cellSubset)){
temp_cellSubsets <- c(
temp_cellSubsets,
if(!is.null(get(cellSubset)[[currSubject]])){
rep(paste0(cellSubset), nrow(flowCore::exprs(get(cellSubset)[[currSubject]])))
}
)
}
tmpExpr$cellSubset = temp_cellSubsets
mergedExpr = rbind(mergedExpr, tmpExpr)
}
# building a discovr experiment S3 object
exptInProgress <- structure(list(), class = "discovrExperiment")
exptInProgress$markerInfoFile <- markerInfoFile
exptInProgress$markerInfo <- markerInfo
exptInProgress$fcsInfoFile <- fcsInfoFile
exptInProgress$fcsInfo <- fcsInfo
exptInProgress$parentPopulation <- parentPopulation
exptInProgress$mergedExpr <- mergedExpr
exptInProgress$clusteringMarkers <- clusteringMarkers
exptInProgress$status <- "initialized"
return(exptInProgress)
}
#' Perform phenograph clustering for a DISCOV-R experiment
#'
#' @param experiment A discovrExperiment created using \code{setupDiscovrExperiment()}
#' @param method A character string indicating the clustering method to use.
#' Currently only 'phenograph' is supported as a clustering method.
#' @param k An integer specifying the maximum number of nearest neighbors to use. Applies only if method = 'phenograph' (default: 30)
#' @param seed A number to be used as the seed for pseudorandom number generation (default: 12345)
#' @param verbose A boolean specifying whether to display processing messages (default: TRUE)
#' @return An S3 object of class \code{discovrExperiment}
#'
#' @seealso \code{\link{setupDiscovrExperiment}} \code{\link{discovrExperiment}}
#' @author Mario G Rosasco, Virginia Muir
#' @import dplyr
#' @importFrom rlang .data
#' @importFrom flowCore exprs
#' @importFrom igraph graph.data.frame cluster_louvain membership modularity
#' @export
clusterDiscovrExperiment <- function(
experiment,
method = "phenograph",
k = 30,
seed = 12345,
verbose = TRUE
){
if(!is.discovrExperiment(experiment)){
stop(
"The object passed to this function is not a valid DISCOV-R experiment object.",
"Please create your experiment using the 'setupDiscovrExperiment' function and try again."
)
}
# check for subjects with too few events to cluster
if(method == "phenograph"){
if(verbose){message("Checking for subjects with too few events to cluster...")}
subjectCounts <- getSubjectCounts(experiment)
if(any(subjectCounts$nEvents < k + 2)){
stop(
"PhenoGraph clustering can only proceed with subjects that have at ",
"least k + 2 (", k + 2, ") total events. Please check or remove the ",
"following subjects, or modify the clustering parameters, ",
"and try again:\n",
paste(subjectCounts$sample[subjectCounts$nEvents < k + 2], collapse = ", ")
)
}
}
set.seed(seed)
#########################################################################
# Section 2.d.i from original SOP - Phenograph clustering - takes a while
#########################################################################
# Clustering markers object is defined in the first set-up chunk. Tweak which markers are included up there.
# Set up Phenograph function to use kd tree
findNeighbors <- function(data, k){
nearest <- RANN::nn2(data, data, k, treetype = "kd", searchtype = "standard")
return(nearest[[1]])
}
runRpheno <- function(data, k=30){
nDigits <- 3 # set number of digits for time display
if(is.data.frame(data))
data <- as.matrix(data)
if(!is.matrix(data))
stop("Wrong input data, should be a data frame or matrix!")
if(k<1){
stop("k must be a positive integer!")
}else if (nrow(data) < k + 2){
stop("k must be smaller than the total number of points!")
}
if(verbose){
message("Run Rphenograph starts:","\n",
" -Input data of ", nrow(data)," rows and ", ncol(data), " columns","\n",
" -k is set to ", k)
message(" Finding nearest neighbors...")
}
t1 <- system.time(neighborMatrix <- findNeighbors(data, k=k+1)[,-1])
if(verbose){
message(
"DONE ~", round(t1[3], nDigits), "s\n",
"Compute jaccard coefficient between nearest-neighbor sets..."
)
}
t2 <- system.time(links <- jaccard_coeff(neighborMatrix))
if(verbose){
message(
"DONE ~", round(t2[3], nDigits), "s\n",
"Build undirected graph from the weighted links..."
)
}
links <- links[links[,1]>0, ]
relations <- as.data.frame(links)
colnames(relations)<- c("from","to","weight")
t3 <- system.time(g <- igraph::graph.data.frame(relations, directed=FALSE))
if(verbose){
message(
"DONE ~", round(t3[3], nDigits), "s\n",
"Run louvain clustering on the graph ..."
)
}
t4 <- system.time(community <- igraph::cluster_louvain(g))
if(verbose){
message("DONE ~", round(t4[3], nDigits), "s\n")
}
if(verbose){
message("Run Rphenograph DONE, took a total of ", round(sum(c(t1[3],t2[3],t3[3],t4[3])), nDigits), "s.")
message(" Return a community class\n -Modularity value: ", igraph::modularity(community),"\n")
message(" -Number of clusters: ", length(unique(igraph::membership(community))))
}
return(community)
}
# Cluster data in experiment
if (method == "phenograph") {
# initialize RPclust
experiment$mergedExpr$RPclust <- NA
# assign row IDs for merge
experiment$mergedExpr$rowIdx <- seq_len(nrow(experiment$mergedExpr))
# cluster events within each subject
allSubjects <- unique(experiment$mergedExpr$samp)
for(currSubj in allSubjects){
if(verbose){
message("\nClustering sample ", match(currSubj, allSubjects), " of ", length(allSubjects), "...")
}
rPhenoVect = as.numeric(igraph::membership(runRpheno(
data = experiment$mergedExpr[
experiment$mergedExpr$samp == currSubj,
experiment$clusteringMarkers
],
k = k
)))
# add cluster IDs
experiment$mergedExpr[experiment$mergedExpr$samp == currSubj, "RPclust"] <- rPhenoVect
}
# remove row IDs so they don't interfere downstream
experiment$mergedExpr <- subset(experiment$mergedExpr, select = -get("rowIdx"))
# check to make sure that all events were assigned
if(any(is.na(experiment$mergedExpr$RPclust))){
stop("Not all events were assigned to a cluster. Please contact the package developer for assistance.")
}
} else {
stop("Clustering method '", method, "' is not currently supported. Stopping...")
}
###################################################################
# Sections 2.e.i from original SOP - Summarize and save outputs
###################################################################
# Calculate mean expression value of each marker for each phenograph cluster in each subject
clusterMeans <- experiment$mergedExpr %>%
dplyr::select(-.data$cellSubset) %>%
unique() %>% # remove duplicated rows; mergedExpr has both parent and gated
dplyr::group_by(.data$samp, .data$RPclust) %>%
dplyr::summarise_all(mean) %>%
dplyr::mutate(RPclust = as.character(.data$RPclust))
##################################
# Calculate total mean expression for each subject
parentMeans = experiment$mergedExpr %>%
dplyr::select(-.data$cellSubset, -.data$RPclust) %>%
unique() %>% # remove duplicated rows; mergedExpr has both parent and gated
dplyr::group_by(.data$samp) %>%
dplyr::summarise_all(mean) %>%
dplyr::mutate(RPclust = "Total_Parent")
experiment$clusterMeans <- bind_rows(clusterMeans, parentMeans)
# Count cells of each subpopulation (eg Tmr) in each phenograph cluster (from each sample)
clusterRarePopCts <-
experiment$mergedExpr %>%
dplyr::select(.data$samp, .data$cellSubset, .data$RPclust) %>%
dplyr::group_by(.data$samp, .data$RPclust) %>%
dplyr::summarise(Total=n(), .groups = "drop_last") %>%
as.data.frame()
uniqueSubsets <- unique(experiment$mergedExpr$cellSubset)
for(currCellSubset in uniqueSubsets){
if(verbose){message("Counting cluster events for ", currCellSubset)}
additionalMatrix = experiment$mergedExpr %>%
dplyr::select(.data$samp, .data$cellSubset, .data$RPclust) %>%
group_by(.data$samp, .data$RPclust) %>%
summarise(number = sum(.data$cellSubset == currCellSubset)) %>%
as.data.frame()
clusterRarePopCts <- cbind(clusterRarePopCts, additionalMatrix$number)
}
for(i in seq_len(ncol(clusterRarePopCts)-3)){
colnames(clusterRarePopCts)[i+3] = (uniqueSubsets)[i]
}
aggregateCounts = clusterRarePopCts %>%
dplyr::select(-.data$RPclust) %>%
group_by(.data$samp) %>%
summarise_all(sum) %>%
rename_at(vars(-.data$samp),function(name) paste0(name,"_tot"))
clusterRarePopCts = left_join(clusterRarePopCts, aggregateCounts)
for(i in seq_len(length(uniqueSubsets)+1)){
clusterRarePopCts <- cbind(
clusterRarePopCts,
clusterRarePopCts[,i+2]/clusterRarePopCts[,i+2+length(uniqueSubsets)+1]*100
)
}
for(i in seq_len(length(uniqueSubsets))){
colnames(clusterRarePopCts)[i+3+(length(uniqueSubsets)+1)*2]= (paste0("pct_", (uniqueSubsets)[i], "_in_clust"))
}
colnames(clusterRarePopCts)[3+(length(uniqueSubsets)+1)*2]= "pct_Total_in_clust"
# update experiment data
experiment$status <- "clustered"
experiment$clusterMethod <- method
experiment$clusterRarePopCts <- clusterRarePopCts
return(experiment)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.