R/evaluation.R

Defines functions emdEvaluation PlotOverviewCV testCV getDensities

Documented in emdEvaluation getDensities PlotOverviewCV testCV

#' getDensities
#'
#' Returns a dataframe with 4 columns, containing for every file and every
#' channel of interest the x/y values computed by density.
#'
#' @param files         Full paths of to the fcs files of the samples
#' @param channels      Names of the channels to compute the densities for
#' @param transformList Transformation list to pass to the flowCore
#'                      \code{transform} function
#' @param selection     List with indexation vector for every file.
#' @param quantileValues Quantiles to extract as well
#' @param ...           Extra parameters to pass to density
#'
#' @examples
#'
#' dir <- system.file("extdata", package = "CytoNorm")
#' files <- list.files(dir, pattern = "fcs$")
#'
#' ff <- flowCore::read.FCS(file.path(dir, files[1]))
#' channels <- grep("Di$", flowCore::colnames(ff), value = TRUE)
#' transformList <- flowCore::transformList(channels,
#'                                          cytofTransform)
#'
#' densities <- getDensities(files = file.path(dir, files),
#'                           channels = channels,
#'                           transformList = transformList,
#'                           quantileValues = c(0.25, 0.5, 0.75))
#'
#' densities$densities$Marker <- paste0(
#'   FlowSOM::GetMarkers(ff, densities$densities$Channel),
#'   " (", densities$densities$Channel, ")")
#'
#' library(ggplot2)
#' ggplot(dplyr::filter(densities$densities,
#'                      File == file.path(dir, files[1]))) +
#'   geom_line(aes(x = x, y = y)) +
#'   facet_wrap(~ Marker, scales = "free") +
#'   theme_minimal()
#'
#' ggplot(dplyr::filter(densities$quantiles,
#'                      Channel == FlowSOM::GetChannels(ff, "CD66"))) +
#'   geom_vline(aes(xintercept = `0.25`), col = "grey") +
#'   geom_vline(aes(xintercept = `0.5`), col = "grey", lwd = 1) +
#'   geom_vline(aes(xintercept = `0.75`), col = "grey") +
#'   geom_line(aes(x = x, y = y),
#'             data = dplyr::filter(densities$densities,
#'                      Channel == FlowSOM::GetChannels(ff, "CD66"))) +
#'   facet_wrap(~ File) +
#'   theme_minimal()
#'
#' @export
getDensities <- function(files,
                         channels,
                         quantileValues = 0.5,
                         transformList = NULL,
                         selection = NULL,
                         ...){

    densities <- data.frame(matrix(ncol = 4, nrow = 0,
                                   dimnames = list(NULL,
                                                   c("File",
                                                     "Channel",
                                                     "x",
                                                     "y"))))
    quantiles <- data.frame(matrix(ncol = 2+length(quantileValues),
                                   nrow = 0,
                                   dimnames = list(NULL,
                                                   c("File",
                                                     "Channel",
                                                     quantileValues))))

    for(file in files){

        o <- utils::capture.output(ff <- flowCore::read.FCS(file))
        if (!is.null(ff) && !is.null(transformList)) {
            ff <- flowCore::transform(ff, transformList)
        }

        if (!is.null(ff) & !is.null(selection)){
            ff <- ff[selection[[file]], ]
        }

        for (channel in channels) {
            dens <- stats::density(ff@exprs[ , channel], ...)
            densities <- rbind(densities,
                               data.frame(File = file,
                                          Channel = channel,
                                          x = dens$x,
                                          y = dens$y))

            quant <- stats::quantile(ff@exprs[, channel],
                                     quantileValues)
            quantiles <- rbind(quantiles,
                               cbind(data.frame(File = file,
                                          Channel = channel),
                                     matrix(quant,
                                            nrow = 1,
                                            dimnames = list(NULL,
                                                            quantileValues))))
        }
    }

    return(named.list(densities,
                      quantiles))
}

#' test_cv
#'
#' Function to inspect whether all control samples contain a similar percentage
#' of cells in all FlowSOM clusters
#'
#' @param fsom           FlowSOM list, as generated by prepareFlowSOM
#' @param cluster_values Vector with all amounts of clusters to test, can not be
#'                       smaller than 3.
#' @param plot           If TRUE, a plot of the CV values is generated
#' @param verbose        If TRUE, extra progress updates are printed
#' @param seed           Seed for reproducible results. Default = 1.
#'
#' @examples
#'
#' dir <- system.file("extdata", package = "CytoNorm")
#' files <- list.files(dir, pattern = "fcs$")
#' data <- data.frame(File = files,
#'                    Path = file.path(dir, files),
#'                    Type = stringr::str_match(files, "_([12]).fcs")[,2],
#'                    Batch = stringr::str_match(files, "PTLG[0-9]*")[,1],
#'                    stringsAsFactors = FALSE)
#' data$Type <- c("1" = "Train", "2" = "Validation")[data$Type]
#' train_data <- dplyr::filter(data, Type == "Train")
#'
#' ff <- flowCore::read.FCS(data$Path[1])
#' channels <- grep("Di$", flowCore::colnames(ff), value = TRUE)
#' transformList <- flowCore::transformList(channels,
#'                                          cytofTransform)
#'
#' fsom <- prepareFlowSOM(train_data$Path,
#'                        channels,
#'                        nCells = 10000, #1000000
#'                        FlowSOM.params = list(xdim = 15,
#'                                              ydim = 15,
#'                                              nClus = 25,
#'                                              scale = FALSE),
#'                        transformList = transformList,
#'                        seed = 1)
#'
#' cvs <- testCV(fsom,
#'               cluster_values = c(5,15,25,35,45)) # 3:50
#'
#' @export
testCV <- function(fsom,
                   cluster_values = 3:50,
                   plot = TRUE,
                   verbose = FALSE,
                   seed = 1) {

    nClus <- fsom$map$nNodes
    cluster_labels <- FlowSOM::GetClusters(fsom)
    
    if (any(cluster_values < 3)) {
      stop("Number of clusters to test cannot be smaller than 3.")
    }
    if (any(cluster_values > nClus)) {
      stop(paste0("Number of clusters to test cannot be higher than number of 
  FlowSOM nodes (", nClus, ")."))
    }
    
    if (length(unique(fsom$data[,"File"])) == 1){
      warning("Calculating the CV is not possible when only 1 file is used.")
    }
    
    # Determine metacluster labels
    meta_cl <- list()
    for(mc in cluster_values){
        if(verbose) message("Computing ", mc, " metaclusters")
        meta_cl[[as.character(mc)]] <-
            FlowSOM::metaClustering_consensus(fsom$map$codes,
                                              mc,
                                              seed = seed)
    }
    meta_cl[[as.character(nClus)]] <- seq_len(nClus)

    # Percentages assigned to each of the clusters per file
    pctgs <- list()
    for(mc in as.character(c(cluster_values, nClus))){
        counts <- matrix(0,
                         nrow = length(unique(fsom$data[,"File"])),
                         ncol = as.numeric(mc),
                         dimnames = list(unique(fsom$data[,"File"]),
                                         as.character(seq_len(as.numeric(mc)))))
        tmp <- table(fsom$data[,"File"],
                     meta_cl[[mc]][cluster_labels])
        counts[rownames(tmp), colnames(tmp)] <- tmp
        pctgs[[mc]] <- t(apply(counts, 1,
                               function(x){ 100 * x/sum(x) }))
    }

    # Coefficient of variation for each of the percentages
    cvs <- list()
    for(mc in as.character(c(cluster_values, nClus))){
        cvs[[mc]] <- apply(pctgs[[mc]],
                           2,
                           function(x){ stats::sd(x) / mean(x)})
    }
    res <- named.list(pctgs, cvs, meta_cl)
    if(plot){
        PlotOverviewCV(fsom, res)
    }

    return(res)
}

#' Plot overview of CV values for a flowSOM model
#'
#' @param fsom FlowSOM model
#' @param cv_res As generated by the testCV function
#' @param max_cv All values higher than this value will get the same color
#' @param show_cv All values higher than this one will be shown numerically
#'
#' dir <- system.file("extdata", package = "CytoNorm")
#' files <- list.files(dir, pattern = "fcs$")
#' data <- data.frame(File = files,
#'                    Path = file.path(dir, files),
#'                    Type = stringr::str_match(files, "_([12]).fcs")[,2],
#'                    Batch = stringr::str_match(files, "PTLG[0-9]*")[,1],
#'                    stringsAsFactors = FALSE)
#' data$Type <- c("1" = "Train", "2" = "Validation")[data$Type]
#' train_data <- dplyr::filter(data, Type == "Train")
#'
#' ff <- flowCore::read.FCS(data$Path[1])
#' channels <- grep("Di$", flowCore::colnames(ff), value = TRUE)
#' transformList <- flowCore::transformList(channels,
#'                                          cytofTransform)
#'
#' fsom <- prepareFlowSOM(train_data$Path,
#'                        channels,
#'                        nCells = 10000, #1000000
#'                        FlowSOM.params = list(xdim = 15,
#'                                              ydim = 15,
#'                                              nClus = 25,
#'                                              scale = FALSE),
#'                        transformList = transformList,
#'                        seed = 1)
#'
#' cvs <- testCV(fsom,
#'               cluster_values = c(5,15,25,35,45)) # 3:50
#'
#' PlotOverviewCV(fsom, cvs)
#' @export
PlotOverviewCV <- function(fsom, cv_res, max_cv = 2.5, show_cv = 1.5){
    cvs <- cv_res$cvs
    pctgs <- cv_res$pctgs
    nMetaClus <- length(levels(fsom$metaclustering))
    nClus <- fsom$map$nNodes

    cluster_values <- as.numeric(names(cvs))[-length(cvs)]
    width <- max(cluster_values)
    chosen <- which(cluster_values == nMetaClus)
    cv_matrix <- do.call(rbind,
                         lapply(cvs[as.character(cluster_values)],
                                function (x) {
                                    c(x,
                                      rep(NA,
                                          (width - length(x))))
                                }))
    cv_matrix <- rbind(cv_matrix,
                       matrix(c(cvs[[as.character(nClus)]],
                                rep(NA, (width - (nClus %% width)))),
                              ncol = width,
                              byrow = TRUE))
    rownames(cv_matrix)[length(cluster_values) + 1] <- "Original\nclustering"
    colnames(cv_matrix) <- NULL

    disp <- apply(cv_matrix, 2, function(x) as.character(round(x, 2)))
    disp[cv_matrix < show_cv] <- ""
    disp[is.na(cv_matrix)] <- ""

    p_cvs <- pheatmap::pheatmap(cv_matrix,
                                cluster_cols = FALSE,
                                cluster_rows = FALSE,
                                na_col = "white",
                                border_color = "white",
                                gaps_row = c(chosen-1, chosen,
                                             length(cluster_values)),
                                breaks = seq(0, max_cv, length.out = 100),
                                display_numbers = disp)
    
    scaling <- ifelse(nrow(pctgs[[as.character(nMetaClus)]]) == 1, "none", "column")
    p_pctgs_selected <- pheatmap::pheatmap(pctgs[[as.character(nMetaClus)]],
                                           cluster_cols = FALSE,
                                           cluster_rows = FALSE,
                                           na_col = "white",
                                           border_color = "white",
                                           display_numbers = round(pctgs[[as.character(nMetaClus)]],2),
                                           #number_color = "white",
                                           scale = scaling)

    gridExtra::grid.arrange(p_cvs[[4]], p_pctgs_selected[[4]])
}

#' emdEvaluation
#'
#' Evaluate how much files differ by computing the maximum Earth Movers Distance
#' for all markers and cellTypes.
#'
#' @param files     Full paths of to the fcs files of the control samples.
#' @param channels  Channels to evaluate (corresponding with the column
#'                  names of the flow frame)
#' @param transformList Transformation list to pass to the flowCore
#'                  \code{transform} function. Default NULL
#' @param prefix    Prefix present in the files, which will be removed to match
#'                  the manual list.
#' @param manual    A list which contains for every file a factor array. These
#'                  arrays contain a cell label for every cell in the files. All
#'                  arrays should have the same levels. Default = NULL, all
#'                  cells are evaluated together.
#' @param binSize   Binsize to approximate distribution. Default = 0.1.
#' @param minRange  Minimum to approximate distribution. Default = -100.
#' @param maxRange  Maximum to approximate distribution. Default = 100.
#' @param return_all If TRUE, distributions and pairwise distances are returned
#'                   as well. Default = FALSE.
#' @param manualThreshold Minimum number of cells to calculate the EMD. 
#'                  Default = 2 
#'
#' @return A matrix in which the rows represent the cell types, the columns
#' reprents the markers and the values represent the maximal earth movers
#' distances for the distributions between all files
#'
#' @examples
#'    # Describe file names
#'    dir <- system.file("extdata",package="CytoNorm")
#'    fileNames <- c("Gates_PTLG021_Unstim_Control_1.fcs",
#'                    "Gates_PTLG028_Unstim_Control_1.fcs")
#'    labels <- c("PTLG021","PTLG028")
#'    ff <- flowCore::read.FCS(file.path(dir,fileNames[1]))
#'    channelsToNormalize <- flowCore::colnames(ff)[c(10, 11, 14, 16:35, 37, 39:51)]
#'
#'    # Build transform list
#'    transformList <- flowCore::transformList(channelsToNormalize,
#'                                             cytofTransform)
#'    emdEvaluation(file.path(dir,fileNames),
#'                  transformList = transformList,
#'                  channelsToNormalize)
#'
#' @export
emdEvaluation <- function(files,
                          channels,
                          transformList = NULL,
                          prefix = "^Norm_",
                          manual = NULL,
                          binSize = 0.1,
                          minRange = -100,
                          maxRange = 100,
                          return_all = FALSE,
                          manualThreshold = 2){

    if (is.null(manual)) {
        cellTypes <- c("AllCells")
    } else {
        cellTypes <- c("AllCells", levels(manual[[1]]))
    }
    prettyColnames <- NULL
    distr <- list()
    for (file in files) {
        distr[[file]] <- list()

        ff <- flowCore::read.FCS(file)
        if (is.null(prettyColnames)){
            prettyColnames <- paste0(FlowSOM::GetMarkers(ff, channels = channels),
                                     " <", channels, ">")
        }
        if (!is.null(transformList)) {
            ff <- flowCore::transform(ff, transformList)
        }
        for (cellType in cellTypes) {
            if (is.null(manual) | cellType == "AllCells") {
                selection <- seq_len(flowCore::nrow(ff))
            } else {
                selection <- manual[[gsub(prefix, "", gsub(".*/",
                                                           "", file))]] == cellType
            }
            if (sum(selection) >= manualThreshold){
                distr[[file]][[cellType]] <-
                    apply(flowCore::exprs(ff)[selection, channels],
                          2,
                          function(x) {
                              graphics::hist(x,
                                             breaks = seq(minRange, maxRange, by = binSize),
                                             plot = FALSE)$counts
                          })
                colnames(distr[[file]][[cellType]]) <- prettyColnames
                distr[[file]][[cellType]] <- distr[[file]][[cellType]]/sum(selection)
            } else {
                distr[[file]][[cellType]] <- matrix(data = NA,
                                                    nrow = length(seq(minRange, maxRange, by = binSize))-1,
                                                    ncol = length(channels),
                                                    dimnames = list(NULL, prettyColnames))
            }


            any(distr[[file]][[cellType]] != 0)
        }
    }
    distances <- list()
    for (cellType in cellTypes) {
        distances[[cellType]] <- list()
        for (name in prettyColnames) {
            distances[[cellType]][[name]] <- matrix(NA, nrow = length(files),
                                                    ncol = length(files), dimnames = list(files,
                                                                                          files))
            for (i in seq_along(files)[-length(files)]) {
                file1 <- files[i]
                for (j in seq(i + 1, length(files))) {
                    file2 <- files[j]
                    distances[[cellType]][[name]][file1, file2] <-
                        emdist::emd2d(matrix(distr[[file1]][[cellType]][, name]),
                                      matrix(distr[[file2]][[cellType]][, name]))
                }
            }
        }
    }
    comparison <- matrix(NA, nrow = length(cellTypes), ncol = length(channels),
                         dimnames = list(cellTypes, prettyColnames))
    for (cellType in cellTypes) {
        for (name in prettyColnames) {
            comparison[cellType, name] <- max(distances[[cellType]][[name]],
                                              na.rm = TRUE)
        }
    }
    if (return_all) {
        return(named.list(distr, distances, comparison))
    } else {
        return(comparison)
    }
}


#' madEvaluation
#'
#' Evaluate whether you lose biological information by checking whether the
#' MAD stays similar before and after normalization.
#'
#' @param files     Full paths of to the fcs files of the control samples.
#' @param channels  Channels to evaluate (corresponding with the column
#'                  names of the flow frame)
#' @param transformList Transformation list to pass to the flowCore
#'                  \code{transform} function. Default NULL
#' @param prefix    Prefix present in the files, which will be removed to match
#'                  the manual list.
#' @param manual    A list which contains for every file a factor array. These
#'                  arrays contain a cell label for every cell in the files. All
#'                  arrays should have the same levels. Default = NULL, all
#'                  cells are evaluated together.
#' @param return_all If TRUE, individual MAD values are returned
#'                   as well. Default = FALSE.
#'
#' @return A matrix in which the rows represent the cell types, the columns
#' reprents the markers and the values represent the median MAD values for
#' the distributions of all files
#'
#' @examples
#'    # Describe file names
#'    dir <- system.file("extdata",package="CytoNorm")
#'    fileNames <- c("Gates_PTLG021_Unstim_Control_1.fcs",
#'                    "Gates_PTLG028_Unstim_Control_1.fcs")
#'    labels <- c("PTLG021","PTLG028")
#'    ff <- flowCore::read.FCS(file.path(dir,fileNames[1]))
#'    channelsToNormalize <- flowCore::colnames(ff)[c(10, 11, 14, 16:35, 37, 39:51)]
#'
#'    # Build transform list
#'    transformList <- flowCore::transformList(channelsToNormalize,
#'                                             cytofTransform)
#'    res <- madEvaluation(file.path(dir,fileNames),
#'                         transformList = transformList,
#'                         channelsToNormalize)
#'
#' @importFrom stats mad median
#' 
#' @export
madEvaluation <- function (files,
                           channels,
                           transformList = NULL,
                           prefix = "^Norm_",
                           manual = NULL,
                           return_all = FALSE)
{
    if (is.null(manual)) {
        cellTypes <- c("AllCells")
    } else {
        cellTypes <- c("AllCells", levels(manual[[1]]))
    }
    mad_values <- list()
    for (file in files) {
        mad_values[[file]] <- list()
        ff <- flowCore::read.FCS(file)
        if (!is.null(transformList))
            ff <- flowCore::transform(ff, transformList)
        for (cellType in cellTypes) {
            if (is.null(manual) | cellType == "AllCells") {
                selection <- seq_len(flowCore::nrow(ff))
            } else {
                selection <- manual[[gsub(prefix, "", gsub(".*/",
                                                             "", file))]] == cellType
            }
            if (sum(selection) > 1){
                mad_values[[file]][[cellType]] <- apply(flowCore::exprs(ff)[selection,
                                                                            channels], 2, function(x) {
                                                                                mad(x)
                                                                            })
            } else {
                mad_values[[file]][[cellType]] <- NA
            }
            any(mad_values[[file]][[cellType]] != 0)
        }
    }

    mad_per_celltype <- list()

    for (cellType in cellTypes){

        cell_matr <- matrix(NA, nrow = length(files),
                            ncol = length(channels), dimnames = list(files,
                                                                     channels))

        for (x in seq_along(mad_values)){

            cell_matr[x,] <-
                mad_values[[x]][[grep(cellType,names(mad_values[[x]]),
                                      fixed = TRUE)]]
        }

        mad_per_celltype[[cellType]] <- cell_matr
    }

    comparison <- t(sapply(mad_per_celltype, function(x){
        apply(x, 2, median, na.rm = FALSE)
    }))

    if (return_all) {
        return(named.list(mad_values, mad_per_celltype, comparison))
    }
    else {
        return(comparison)
    }
}
saeyslab/CytoNorm documentation built on Nov. 2, 2024, 12:39 p.m.