R/utils.R

Defines functions tidy_lmer_test_glmm cosine_similarity l2_normalize magnitude dot flowsom_consensus tof_join_metacluster tof_summarize_clusters tof_spade_downsampling rev_asinh get_extension tidytof_example_data

Documented in cosine_similarity dot get_extension l2_normalize magnitude rev_asinh tidytof_example_data

# reading data -----------------------------------------------------------------

## tidytof_example_data --------------------------------------------------------

#' Get paths to tidytof example data
#'
#' tidytof comes bundled with a number of sample .fcs files in its
#' inst/extdata directory. This function makes them easy to access.
#'
#' @param dataset_name Name of the dataset you want to access. If NULL,
#' the names of the datasets (each of which is from a different study)
#' will be listed.
#'
#' @return A character vector of file paths where the requested .fcs
#' files are located. If `dataset_name` is NULL, a character vector of
#' dataset names (that can be used as values for `dataset_name`) is
#' returned instead.
#'
#' @export
#'
#' @examples
#' tidytof_example_data()
#' tidytof_example_data(dataset_name = "phenograph")
#'
tidytof_example_data <-
    function(dataset_name = NULL) {
        if (is.null(dataset_name)) {
            dir(system.file("extdata", package = "tidytof"))
        } else {
            system.file("extdata", dataset_name, package = "tidytof", mustWork = TRUE)
        }
    }


## get_extension ---------------------------------------------------------------

#' Find the extension for a file
#'
#' @param filename A string representing the name of a file in its local directory
#'
#' @return The the file extension of `filename`
#'
#'
get_extension <- function(filename) {
    ex <- strsplit(basename(filename), split = "\\.")[[1]]
    return(ex[[length(ex)]])
}

# preprocessing/postprocessing -------------------------------------------------

#' Reverses arcsinh transformation with cofactor `scale_factor` and a
#' shift of `shift_factor`.
#'
#' @param x A numeric vector.
#'
#' @param shift_factor The scalar value `a` in the following equation used to
#' transform high-dimensional cytometry raw data ion counts using the hyperbolic arcsinh function:
#'    `new_x <- asinh(a + b * x)`.
#'
#' @param scale_factor The scalar value `b` in the following equation used to
#' transform high-dimensional cytometry raw data ion counts using the hyperbolic arcsinh function:
#'    `new_x <- asinh(a + b * x)`.
#'
#' @return A numeric vector after undergoing reverse
#' arcsinh transformation
#'
#' @export
#'
#' @examples
#' shift_factor <- 0
#' scale_factor <- 1 / 5
#'
#' input_value <- 20
#' asinh_value <- asinh(shift_factor + input_value * scale_factor)
#'
#' restored_value <- rev_asinh(asinh_value, shift_factor, scale_factor)
#'
rev_asinh <- function(x, shift_factor, scale_factor) {
    new_x <- (sinh(x) - shift_factor) / scale_factor
    return(new_x)
}

# downsampling -----------------------------------------------------------------

tof_spade_downsampling <-
    function(densities, cell_ids, target_num_cells, power = 1) {
        densities_sorted <- sort(densities)

        # a fast estimate of the expected number
        # of cells that will be sampled at each density threshold (starting
        # with the most inclusive threshold)
        cdf <- rev(cumsum(1.0 / rev(densities_sorted)))

        max_number_cells <- cdf[[1]]

        boundary <- target_num_cells / max_number_cells

        if (boundary > densities_sorted[1]) {
            targets <-
                (target_num_cells - (seq_along(densities_sorted))) / cdf
            boundary <- targets[which.min(targets - densities_sorted > 0)]
        }

        result_indices <-
            which((boundary / densities)^power > stats::runif(length(densities)))

        result <-
            cell_ids$..cell_id[result_indices]

        return(result)
    }

# metaclustering ---------------------------------------------------------------

tof_summarize_clusters <-
    function(
        tof_tibble,
        cluster_col,
        metacluster_cols = where(tof_is_numeric),
        group_cols,
        central_tendency_function = stats::median) {
        if (missing(group_cols)) {
            group_cols <- NULL
        }

        result <-
            tof_tibble |>
            tof_extract_central_tendency(
                cluster_col = {{ cluster_col }},
                group_cols = {{ group_cols }},
                marker_cols = {{ metacluster_cols }},
                central_tendency_function = central_tendency_function,
                format = "long"
            ) |>
            tidyr::pivot_wider(
                names_from = "channel",
                values_from = "values"
            )

        return(result)
    }

tof_join_metacluster <-
    function(
        tof_tibble,
        meta_tibble,
        cluster_col,
        metacluster_vector) {
        cluster_colname <- colnames(dplyr::select(tof_tibble, {{ cluster_col }}))

        cluster_dictionary <-
            meta_tibble |>
            dplyr::select({{ cluster_col }}) |>
            dplyr::mutate(.metacluster = metacluster_vector)

        result <-
            tof_tibble |>
            dplyr::select({{ cluster_col }}) |>
            dplyr::left_join(cluster_dictionary, by = cluster_colname) |>
            dplyr::select(-{{ cluster_col }})

        return(result)
    }

flowsom_consensus <-
    function(data_matrix, num_clusters) {
        tof_tibble <-
            t(data_matrix) |>
            dplyr::as_tibble()

        result <-
            tof_cluster_flowsom(
                tof_tibble = tof_tibble,
                som_xdim = num_clusters,
                som_ydim = 1,
                som_distance_function = "euclidean",
                perform_metaclustering = FALSE
            ) |>
            dplyr::pull(.data$flowsom_cluster)

        return(result)
    }

# mathematical operations ------------------------------------------------------

#' Find the dot product between two vectors.
#'
#' @param x A numeric vector.
#' @param y A numeric vector.
#'
#' @return The dot product between x and y.
#'
dot <- function(x, y) {
    return(as.numeric(t(x) %*% y))
}

#' Find the magnitude of a vector.
#'
#' @param x A numeric vector.
#'
#' @return A scalar value (the magnitude of x).
#'
magnitude <- function(x) {
    return(sqrt(dot(x, x)))
}

#' L2 normalize an input vector x to a length of 1
#'
#' @param x a numeric vector
#'
#' @return a vector of length length(x) with a magnitude of 1
#'
l2_normalize <- function(x) {
    return(x / magnitude(x))
}

#' Find the cosine similarity between two vectors
#'
#' @param x a numeric vector
#' @param y a numeric vector
#'
#' @return a scalar value representing the cosine similarity between x and y
#'
cosine_similarity <- function(x, y) {
    result <- dot(x, y) / (magnitude(x) * magnitude(y))
    return(result)
}

# differential discovery analysis ----------------------------------------------

#' @importFrom dplyr as_tibble
#'
tidy_lmer_test_glmm <- function(lmer_test) {
    result <-
        summary(lmer_test)$coefficients |>
        dplyr::as_tibble(rownames = "term") |>
        dplyr::rename(
            p.value = .data$`Pr(>|z|)`,
            estimate = .data$Estimate,
            statistic = .data$`z value`,
            std_error = .data$`Std. Error`
        )

    return(result)
}


#' @importFrom dplyr as_tibble
#'
tidy_lmer_test <- function(lmer_test) {
    result <-
        summary(lmer_test)$coefficients |>
        dplyr::as_tibble(rownames = "term") |>
        dplyr::rename(
            p.value = .data$`Pr(>|t|)`,
            estimate = .data$Estimate,
            statistic = .data$`t value`
        )

    return(result)
}


#' @importFrom rlang enquo
#' @importFrom tidyselect eval_select
#' @importFrom purrr map
#' @importFrom tidyr nest
#'
prepare_diffcyt_args <-
    function(
        tof_tibble,
        sample_col,
        cluster_col,
        marker_cols = where(tof_is_numeric),
        fixed_effect_cols,
        random_effect_cols,
        diffcyt_method = c("glmm", "edgeR", "voom"),
        include_observation_level_random_effects = FALSE) {
        # initialize formula
        my_formula <- NULL

        # extract sample column name as a string
        sample_colname <-
            tidyselect::eval_select(
                expr = rlang::enquo(sample_col),
                data = tof_tibble
            ) |>
            names()

        # extract cluster column name as a string
        cluster_colname <-
            tidyselect::eval_select(
                expr = rlang::enquo(cluster_col),
                data = tof_tibble
            ) |>
            names()

        # extract fixed effect columns as a character vector
        # will return an empty character vector if the argument is missing
        fixed_effect_colnames <-
            tidyselect::eval_select(
                expr = rlang::enquo(fixed_effect_cols),
                data = tof_tibble
            ) |>
            names()

        # extract random effect columns as a character vector
        # will return an empty character vector if the argument is missing
        random_effect_colnames <-
            tidyselect::eval_select(
                expr = rlang::enquo(random_effect_cols),
                data = tof_tibble
            ) |>
            names()

        # find all marker names by process of elimination
        marker_names <-
            colnames(tof_tibble)[
                !(
                    colnames(tof_tibble) %in%
                        c(cluster_colname, sample_colname, fixed_effect_colnames, random_effect_colnames)
                )
            ]

        if (length(marker_names) < 2) {
            stop("At least 2 markers must be selected (due to diffcyt's underlying implementation).")
        }

        # create diffcyt experiment_info
        experiment_info <-
            tof_tibble |>
            dplyr::select({{ sample_col }}, {{ fixed_effect_cols }}, {{ random_effect_cols }}) |>
            dplyr::distinct() |>
            dplyr::rename(sample_id = {{ sample_col }}) |>
            dplyr::arrange(.data$sample_id)

        # create diffcyt marker_info
        marker_info <-
            dplyr::tibble(marker_name = marker_names) |>
            dplyr::mutate(marker_class = "state")

        # create formula or design matrix depending on which method is being used
        if (diffcyt_method %in% c("glmm", "lmm") & include_observation_level_random_effects) {
            random_effect_colnames <-
                c("sample_id", random_effect_colnames)
        }

        if (diffcyt_method %in% c("glmm", "lmm")) {
            # if using glmms, create formula

            if (length(random_effect_colnames) == 0) {
                # if there are no random effects
                my_formula <-
                    diffcyt::createFormula(
                        experiment_info = experiment_info,
                        cols_fixed = fixed_effect_colnames
                    )
            } else {
                # if there are random effects
                my_formula <-
                    diffcyt::createFormula(
                        experiment_info = experiment_info,
                        cols_fixed = fixed_effect_colnames,
                        cols_random = random_effect_colnames
                    )
            }
        }

        # create design matrix
        my_design <-
            diffcyt::createDesignMatrix(
                experiment_info = experiment_info,
                cols_design = fixed_effect_colnames
            )

        ## make contrast matrix
        contrast_names <- colnames(my_design)

        # tibble of contrast matrices to test the null hypothesis that any given
        # fixed-effect coefficient is 0
        contrast_matrix_tibble <-
            dplyr::tibble(
                contrast_names = contrast_names,
                contrast_matrices =
                    purrr::map(
                        .x = seq_along(contrast_names),
                        .f = ~
                            make_binary_vector(length = length(contrast_names), indices = .x) |>
                                diffcyt::createContrast()
                    )
            ) |>
            dplyr::filter(contrast_names != "(Intercept)")

        # test against the null hypothesis that all fixed-effect coefficients are 0
        initial_contrast <-
            diffcyt::createContrast(
                make_binary_vector(length = length(contrast_names), indices = -1)
            )

        contrast_matrix_tibble <-
            dplyr::bind_rows(
                dplyr::tibble(contrast_names = "omnibus", contrast_matrices = list(initial_contrast)),
                contrast_matrix_tibble
            )

        # configure data into the format diffcyt likes
        data_list <-
            tof_tibble |>
            dplyr::group_by({{ sample_col }}) |>
            tidyr::nest() |>
            dplyr::arrange({{ sample_col }}) |>
            dplyr::pull(data)

        cols_to_include <-
            colnames(data_list[[1]]) %in%
            marker_info$marker_name

        data_diff <-
            diffcyt::prepareData(
                d_input = data_list,
                experiment_info = as.data.frame(experiment_info),
                marker_info = as.data.frame(marker_info),
                cols_to_include = cols_to_include
            )

        # add clusters to diffcyt object
        temp <-
            data_diff |>
            SummarizedExperiment::rowData()

        temp[, "cluster_id"] <-
            tof_tibble |>
            dplyr::pull({{ cluster_col }}) |>
            as.factor()

        SummarizedExperiment::rowData(data_diff) <- temp

        # fix type-related issues in the exprs component of the SummarizedExperiment
        data_exprs <-
            data_diff |>
            SummarizedExperiment::assays()
        data_exprs <- data_exprs[["exprs"]]

        data_colnames <- colnames(data_exprs)

        data_exprs <-
            data_exprs |>
            apply(MARGIN = 2, FUN = as.numeric)

        colnames(data_exprs) <- data_colnames

        SummarizedExperiment::assays(data_diff)[["exprs"]] <- data_exprs

        # return result
        diffcyt_args <-
            list(
                sample_colname = sample_colname,
                cluster_colname = cluster_colname,
                fixed_effect_colnames = fixed_effect_colnames,
                random_effect_colnames = random_effect_colnames,
                marker_names = marker_names,
                experiment_info = experiment_info,
                marker_info = marker_info,
                my_formula = my_formula,
                my_design = my_design,
                contrast_matrix_tibble = contrast_matrix_tibble,
                data_diff = data_diff
            )

        return(diffcyt_args)
    }

#'
#' @importFrom stats glm
#'
fit_da_model <- function(data, formula, has_random_effects = TRUE) {
    # check to see if lme4 is installed
    rlang::check_installed(pkg = "lme4")

    if (!requireNamespace(package = "lme4")) {
        stop("tof_daa_glmm requires the lme4 package to be installed")
    }

    if (is.null(formula)) {
        total_cells <- NULL
    }

    if (has_random_effects) {
        model_fit <-
            lme4::glmer(formula, data, family = "binomial", weights = total_cells)
    } else {
        model_fit <-
            stats::glm(formula, data, family = "binomial", weights = total_cells)
    }

    return(model_fit)
}


#'
#' @importFrom stats glm
#'
fit_de_model <-
    function(data, formula, has_random_effects = TRUE) {
        # check to see if lmerTest is installed
        rlang::check_installed(pkg = "lmerTest")

        if (!requireNamespace(package = "lmerTest")) {
            stop("tof_dea_lmm requires the lmerTest package to be installed")
        }

        if (has_random_effects) {
            model_fit <-
                lmerTest::lmer(formula, data)
        } else {
            model_fit <-
                stats::glm(formula, data, family = "gaussian")
        }
    }



tof_ttest <-
    function(enough_samples, x, y, paired = FALSE) {
        if (enough_samples) {
            return(t.test(x = x, y = y, paired = paired))
        } else {
            return(list(statistic = NA_real_, parameter = NA_real_, p.value = NA_real_))
        }
    }


# feature_extraction -----------------------------------------------------------

#' Find the earth-mover's distance between two numeric vectors
#'
#' @param vec_1 A numeric vector.
#'
#' @param vec_2 A numeric vector.
#'
#' @param num_bins An integer number of bins to use when performing kernel
#' density estimation on the two vectors. Defaults to 100.
#'
#' @return A double (of length 1) representing the EMD between the two vectors.
#'
#'
tof_find_emd <- function(vec_1, vec_2, num_bins = 100) {
    # check to see if emdist package is installed
    rlang::check_installed(
        pkg = "emdist",
        reason = "to compute the Earth-mover's distance."
    )

    # check that both vec_1 and vec_2 are numeric
    if (!is.numeric(vec_1) | !is.numeric(vec_2)) {
        return(NA_real_)
    }

    if (length(vec_1) < 10 | length(vec_2) < 10) {
        return(NA_real_)
    }

    # set up bins
    set_max <- ceiling(max(vec_1, vec_2))
    set_min <- floor(min(vec_1, vec_2))
    bins <- seq(set_max, set_min, length.out = num_bins)

    # find histograms
    density_1 <-
        graphics::hist(vec_1, breaks = bins, plot = FALSE)$density |>
        as.matrix()

    density_2 <-
        graphics::hist(vec_2, breaks = bins, plot = FALSE)$density |>
        as.matrix()

    # return final result
    em_dist <- emdist::emd2d(density_1, density_2)
    return(em_dist)
}

#' Find the Jensen-Shannon Divergence (JSD) between two numeric vectors
#'
#' @param vec_1 A numeric vector.
#'
#' @param vec_2 A numeric vector.
#'
#' @param num_bins An integer number of bins to use when binning
#' across the two vectors' combined range. Defaults to 100.
#'
#' @return A double (of length 1) representing the JSD between the two vectors.
#'
#'
#'
tof_find_jsd <- function(vec_1, vec_2, num_bins = 100) {
    # check to see if philentropy package is installed
    rlang::check_installed(
        pkg = "philentropy",
        reason = "to compute the Jensen-Shannon Divergence."
    )

    # check that both vectors are numeric
    if (!is.numeric(vec_1) | !is.numeric(vec_2)) {
        return(NA_real_)
    }

    # check that both vectors have at least 10 cells
    if (length(vec_1) < 10 | length(vec_2) < 10) {
        return(NA_real_)
    }

    # set up bins
    set_max <- ceiling(max(vec_1, vec_2))
    set_min <- floor(min(vec_1, vec_2))
    bins <- seq(set_max, set_min, length.out = num_bins)

    # find histograms
    p1 <-
        graphics::hist(vec_1, breaks = bins, plot = FALSE)$counts |>
        as.numeric()

    p1 <- p1 / sum(p1)

    p2 <-
        graphics::hist(vec_2, breaks = bins, plot = FALSE)$counts |>
        as.numeric()

    p2 <- p2 / sum(p2)

    # return final result
    js_dist <-
        purrr::quietly(philentropy::JSD)(rbind(p1, p2))$result |>
        as.numeric()

    return(js_dist)
}


#' @importFrom dplyr pull
pull_unless_null <- function(tib, uq_colname) {
    if (is.null(tib)) {
        return(NULL)
    } else {
        return(dplyr::pull(tib, {{ uq_colname }}))
    }
}


# local density estimation -----------------------------------------------------


#' Estimate the local densities for all cells in a high-dimensional cytometry dataset.
#'
#' This function is a wrapper around tidytof's tof_*_density() function family.
#' It performs local density estimation on high-dimensional cytometry data using a user-specified
#' method (of 3 choices) and each method's corresponding input parameters.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param distance_cols Unquoted names of the columns in `tof_tibble` to use in
#' calculating cell-to-cell distances during the local density estimation for
#' each cell. Defaults to all numeric columns in `tof_tibble`.
#'
#' @param distance_function A string indicating which distance function to use
#' for calculating cell-to-cell distances during local density estimation. Options
#' include "euclidean" (the default) and "cosine".
#'
#' @param normalize A boolean value indicating if the vector of local density
#' estimates should be normalized to values between 0 and 1. Defaults to TRUE.
#'
#' @param ... Additional arguments to pass to the `tof_*_density()` function family
#' member corresponding to the chosen `method`.
#'
#' @param augment A boolean value indicating if the output should column-bind the
#' local density estimates of each cell as a new column in `tof_tibble` (TRUE; the default) or if
#' a single-column tibble including only the local density estimates should be returned (FALSE).
#'
#' @param method  A string indicating which local density estimation method should be used.
#' Valid values include "mean_distance", "sum_distance", and "spade".
#'
#' @return A `tof_tbl` or `tibble` If augment = FALSE, it will have a single column encoding
#' the local density estimates for each cell in `tof_tibble`. If augment = TRUE, it will have
#' ncol(tof_tibble) + 1 columns: each of the (unaltered) columns in `tof_tibble`
#' plus an additional column encoding the local density estimates.
#'
#' @family local density estimation functions
#'
#' @export
#'
#' @examples
#' sim_data <-
#'     dplyr::tibble(
#'         cd45 = rnorm(n = 1000),
#'         cd38 = rnorm(n = 1000),
#'         cd34 = rnorm(n = 1000),
#'         cd19 = rnorm(n = 1000)
#'     )
#'
#' # perform the density estimation
#' tof_estimate_density(tof_tibble = sim_data, method = "spade")
#'
#' # perform the density estimation with a smaller search radius around
#' # each cell
#' tof_estimate_density(
#'     tof_tibble = sim_data,
#'     alpha_multiplier = 2,
#'     method = "spade"
#' )
#'
tof_estimate_density <-
    function(
        tof_tibble,
        distance_cols = where(tof_is_numeric),
        distance_function = c("euclidean", "cosine", "l2", "ip"),
        normalize = TRUE,
        ...,
        augment = TRUE,
        method = c("mean_distance", "sum_distance", "spade")) {
        # check arguments
        method <-
            rlang::arg_match(method, values = c("mean_distance", "sum_distance", "spade"))
        distance_function <- rlang::arg_match(distance_function)

        if (method %in% c("mean_distance", "sum_distance")) {
            densities <-
                tof_knn_density(
                    tof_tibble = tof_tibble,
                    distance_cols = {{ distance_cols }},
                    distance_function = distance_function,
                    estimation_method = method,
                    normalize = normalize,
                    ...
                )
        } else if (method == "spade") {
            densities <-
                tof_spade_density(
                    tof_tibble = tof_tibble,
                    distance_cols = {{ distance_cols }},
                    distance_function = distance_function,
                    normalize = normalize,
                    ...
                )
        }

        if (augment) {
            result <-
                dplyr::bind_cols(tof_tibble, densities)
        } else {
            result <- densities
        }

        return(result)
    }

#' Estimate cells' local densities as done in Spanning-tree Progression Analysis of Density-normalized Events (SPADE)
#'
#' This function uses the algorithm described in
#' \href{https://pubmed.ncbi.nlm.nih.gov/21964415/}{Qiu et al., (2011)} to estimate
#' the local density of each cell in a `tof_tbl` or `tibble` containing high-dimensional cytometry data.
#' Briefly, this algorithm involves counting the number of neighboring cells
#' within  a sphere of radius alpha surrounding each cell. Here, we do so using
#' the \code{\link[RANN]{nn2}} function.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param distance_cols Unquoted names of the columns in `tof_tibble` to use in
#' calculating cell-to-cell distances during the local density estimation for
#' each cell. Defaults to all numeric columns in `tof_tibble`.
#'
#' @param distance_function A string indicating which distance function to use
#' for calculating cell-to-cell distances during local density estimation. Options
#' include "euclidean" (the default) and "cosine".
#'
#' @param num_alpha_cells An integer indicating how many cells from `tof_tibble`
#' should be randomly sampled from `tof_tibble` in order to estimate `alpha`, the
#' radius of the sphere constructed around each cell during local density
#' estimation. Alpha is calculated by taking the median nearest-neighbor distance
#' from the `num_alpha_cells` randomly-sampled cells and multiplying it by
#' `alpha_multiplier`. Defaults to 2000.
#'
#' @param alpha_multiplier An numeric value indicating the multiplier that should be used
#' when calculating `alpha`, the radius of the sphere constructed around each
#' cell during local density estimation. Alpha is calculated by taking
#' the median nearest-neighbor distance from the `num_alpha_cells` cells
#' randomly-sampled from `tof_tibble` and multiplying it by
#' `alpha_multiplier`. Defaults to 5.
#'
#' @param max_neighbors An integer indicating the maximum number of neighbors
#' that can be counted within the sphere surrounding any given cell. Implemented
#' to reduce the density estimation procedure's speed and memory requirements.
#' Defaults to 1\% of the number of rows in `tof_tibble`.
#'
#' @param normalize A boolean value indicating if the vector of local density
#' estimates should be normalized to values between 0 and 1. Defaults to TRUE.
#'
#' @param ... Additional optional arguments to pass to \code{\link{tof_find_knn}}.
#'
#' @return A tibble with a single column named ".spade_density" containing the
#' local density estimates for each input cell in `tof_tibble`.
#'
#' @family local density estimation functions
#'
#' @importFrom dplyr select
#' @importFrom dplyr tibble
#'
#' @importFrom rlang arg_match
#'
#' @export
#'
#' @examples
#' sim_data <-
#'     dplyr::tibble(
#'         cd45 = rnorm(n = 1000),
#'         cd38 = rnorm(n = 1000),
#'         cd34 = rnorm(n = 1000),
#'         cd19 = rnorm(n = 1000)
#'     )
#'
#' # perform the density estimation
#' tof_spade_density(tof_tibble = sim_data)
#'
#' # perform the density estimation using cosine distance
#' tof_spade_density(
#'     tof_tibble = sim_data,
#'     distance_function = "cosine",
#'     alpha_multiplier = 2
#' )
#'
#' # perform the density estimation with a smaller search radius around
#' # each cell
#' tof_spade_density(
#'     tof_tibble = sim_data,
#'     alpha_multiplier = 2
#' )
#'
tof_spade_density <-
    function(
        tof_tibble,
        distance_cols = where(tof_is_numeric),
        distance_function = c("euclidean", "cosine", "l2", "ip"),
        num_alpha_cells = 2000L,
        alpha_multiplier = 5,
        max_neighbors = round(0.01 * nrow(tof_tibble)),
        normalize = TRUE,
        ...) {
        distance_function <- rlang::arg_match(distance_function)

        # estimate alpha -----------------------------------------------------------

        # find median nearest-neighbor distances for num_alpha_cells sampled cells
        alpha_knns <-
            tof_tibble |>
            tof_downsample_constant(num_cells = num_alpha_cells) |>
            dplyr::select({{ distance_cols }}) |>
            tof_find_knn(
                k = 1L,
                distance_function = distance_function,
                ...
            )
        alpha_median <- median(alpha_knns$neighbor_distances)
        alpha <- alpha_multiplier * alpha_median

        # estimate local densities -------------------------------------------------

        # compute number of neighbors within a sphere of radius alpha for each
        # input cell in tof_tibble
        spade_neighbors <-
            tof_tibble |>
            dplyr::select({{ distance_cols }}) |>
            tof_find_knn(
                k = max_neighbors,
                distance_function = distance_function
            )

        # calculate densities equal to the number of neighbors
        # (up to max_neighbors) that each cell has
        num_neighbors_within_radius <-
            (spade_neighbors$neighbor_distances <= alpha) |>
            rowSums()
        densities <- num_neighbors_within_radius

        if (normalize) {
            # normalize densities
            densities <-
                (densities - min(densities)) /
                    ((max(densities) - min(densities)))
        }

        # return result
        result <-
            dplyr::tibble(.spade_density = densities)

        return(result)
    }


#' Estimate cells' local densities using K-nearest-neighbor density estimation
#'
#' This function uses the distances between a cell and each of its K nearest
#' neighbors to estimate local density of each cell in a
#' `tof_tbl` or `tibble` containing high-dimensional cytometry data.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param distance_cols Unquoted names of the columns in `tof_tibble` to use in
#' calculating cell-to-cell distances during the local density estimation for
#' each cell. Defaults to all numeric columns in `tof_tibble`.
#'
#' @param num_neighbors An integer indicating the number of nearest neighbors
#' to use in estimating the local density of each cell. Defaults to the minimum
#' of 15 and the number of rows in `tof_tibble`.
#'
#' @param distance_function A string indicating which distance function to use
#' for calculating cell-to-cell distances during local density estimation. Options
#' include "euclidean" (the default) and "cosine".
#'
#' @param estimation_method A string indicating how the relative density for each cell should be
#' calculated from the distances between it and each of its k nearest neighbors. Options are
#' "mean_distance" (the default; estimates the relative density for a cell's neighborhood by
#' taking the negative average of the distances to its nearest neighbors) and "sum_distance"
#' (estimates the relative density for a cell's neighborhood by taking the negative sum of the
#' distances to its nearest neighbors).
#'
#' @param normalize A boolean value indicating if the vector of local density
#' estimates should be normalized to values between 0 and 1. Defaults to TRUE.
#'
#' @param ... Additional optional arguments to pass to
#' \code{\link{tof_find_knn}}.
#'
#' @return A tibble with a single column named ".knn_density" containing the
#' local density estimates for each input cell in `tof_tibble`.
#'
#' @family local density estimation functions
#'
#'
tof_knn_density <-
    function(
        tof_tibble,
        distance_cols = where(tof_is_numeric),
        num_neighbors = min(15L, nrow(tof_tibble)),
        distance_function = c("euclidean", "cosine", "l2", "ip"),
        estimation_method = c("mean_distance", "sum_distance"),
        normalize = TRUE,
        ...) {
        # check arguments
        estimation_method <-
            rlang::arg_match(estimation_method, values = c("mean_distance", "sum_distance"))
        distance_function <-
            rlang::arg_match(distance_function)

        # compute knn information
        knn_result <-
            tof_tibble |>
            dplyr::select({{ distance_cols }}) |>
            tof_find_knn(
                k = num_neighbors,
                distance_function = distance_function,
                ...
            )

        # find densities using one of 2 methods
        if (estimation_method == "sum_distance") {
            densities <- -base::rowSums(abs(knn_result$neighbor_distances))
        } else if (estimation_method == "mean_distance") {
            densities <- -base::rowMeans(abs(knn_result$neighbor_distances))
        } else {
            stop("Not a valid estimation_method.")
        }

        if (normalize) {
            # normalize densities
            densities <-
                (densities - min(densities)) /
                    ((max(densities) - min(densities)))
        }

        result <-
            dplyr::tibble(.knn_density = densities)

        # a tibble with N (number of cells) rows with the ith
        # row representing the KNN-estimated density of the ith cell.
        return(result)
    }


# miscellaneous ----------------------------------------------------------------

#' Find if a vector is numeric
#'
#' This function takes an input vector `.vec` and checks if it is either an
#' integer or a double (i.e. is the type of vector that might encode high-dimensional cytometry
#' measurements).
#'
#' @param .vec A vector.
#'
#' @return A boolean value indicating if .vec is of type integer or double.
#'
#' @importFrom purrr is_integer
#' @importFrom purrr is_double
#'
#'
tof_is_numeric <- function(.vec) {
    return(purrr::is_integer(.vec) | purrr::is_double(.vec))
}


#' Find the k-nearest neighbors of each cell in a high-dimensional cytometry dataset.
#'
#' @param .data A `tof_tibble` or `tibble` in which each row represents a cell
#' and each column represents a high-dimensional cytometry measurement.
#'
#' @param k An integer indicating the number of nearest neighbors to return for
#' each cell.
#'
#' @param distance_function A string indicating which distance function to use
#' for the nearest-neighbor calculation. Options include "euclidean"
#' (the default) and "cosine" distances.
#'
#' @param .query A set of cells to be queried against .data (i.e. a set of cells
#' for which to find nearest neighbors within .data). Defaults to .data itself,
#' i.e. finding nearest neighbors for all cells in .data.
#'
#' @param ... Optional additional arguments to pass to \code{\link[RcppHNSW]{hnsw_knn}}
#'
#' @return A list with two elements: "neighbor_ids" and "neighbor_distances,"
#' both of which are n by k matrices (in which n is the number of cells in the
#' input `.data`. The [i,j]-th entry of "neighbor_ids" represents the row index
#' for the j-th nearest neighbor of the cell in the i-th row of `.data`.
#' The [i,j]-th entry of "neighbor_distances" represents the distance between
#' those two cells according to `distance_function`.
#'
#' @importFrom RcppHNSW hnsw_build
#' @importFrom RcppHNSW hnsw_search
#'
#' @importFrom rlang arg_match
#'
#' @export
#'
#' @examples
#' sim_data <-
#'     dplyr::tibble(
#'         cd45 = rnorm(n = 1000),
#'         cd38 = rnorm(n = 1000),
#'         cd34 = rnorm(n = 1000),
#'         cd19 = rnorm(n = 1000)
#'     )
#'
#' # Find the 10 nearest neighbors of each cell in the dataset
#' tof_find_knn(
#'     .data = sim_data,
#'     k = 10,
#'     distance_function = "euclidean"
#' )
#'
#' # Find the 10 approximate nearest neighbors
#' tof_find_knn(
#'     .data = sim_data,
#'     k = 10,
#'     distance_function = "euclidean",
#' )
#'
tof_find_knn <-
    function(
        .data,
        k = min(10, nrow(.data)),
        distance_function = c("euclidean", "cosine", "l2", "ip"),
        .query,
        ...) {
        # check distance function
        distance_function <- rlang::arg_match(distance_function)

        # compute result
        hnsw_tree <-
            RcppHNSW::hnsw_build(
                X = as.matrix(.data),
                distance = distance_function,
                ...
            )

        if (missing(.query)) {
            query <- .data
        } else {
            query <- .query
        }

        nn_result <-
            RcppHNSW::hnsw_search(
                X = as.matrix(query),
                ann = hnsw_tree,
                k = k + 1,
                ...
            )

        names(nn_result) <- c("neighbor_ids", "neighbor_distances")

        # remove the first-closest neighbor (column), which is always the point itself
        if (missing(.query)) {
            # remove the first-closest neighbor (column), which is always the point itself
            nn_result$neighbor_ids <- nn_result$neighbor_ids[, 2:(k + 1)]
            nn_result$neighbor_distances <- nn_result$neighbor_distances[, 2:(k + 1)]
        } else {
            # unless the query is a different set of cells than the dataset being
            # searched, in which case discard the (unneeded) extra neighbor.
            nn_result$neighbor_ids <- nn_result$neighbor_ids[, seq_len(k)]
            nn_result$neighbor_distances <- nn_result$neighbor_distances[, seq_len(k)]
        }

        # return result
        return(nn_result)
    }



#' Title
#'
#' @param tof_tibble A tibble or tof_tbl.
#' @param knn_cols Unquoted column names indicating which columns in tof_tibble
#' should be used for the KNN calculation.
#' @param num_neighbors An integer number of neighbors to find for each cell (
#' not including itself).
#' @param distance_function A string indicating which distance function to use
#' for the nearest-neighbor calculation. Options include "euclidean"
#' (the default) and "cosine" distances.
#' @param graph_type A string indicating if the graph's edges should have weights
#' ("weighted"; the default) or not ("unweighted").
#' @param ... Optional additional arguments to pass to \code{\link[tidytof]{tof_find_knn}}
#'
#' @return A \code{\link[tidygraph]{tbl_graph}}.
#'
#' @export
#'
#' @examples
#' NULL
#'
tof_make_knn_graph <-
    function(
        tof_tibble, # single-cell data
        knn_cols, # unquoted column names of columns to use in the knn calculation
        num_neighbors, # number of knn's
        distance_function = c("euclidean", "cosine"),
        graph_type = c("weighted", "unweighted"), # weighted or unweighted graph
        ... # additional arguments to tof_find_knn
        ) {
        distance_function <- rlang::arg_match(distance_function)
        graph_type <- rlang::arg_match(graph_type)

        knn_data <-
            tof_tibble |>
            # select only knn_cols
            dplyr::select({{ knn_cols }}) |>
            tof_find_knn(
                k = num_neighbors,
                distance_function = distance_function,
                ...
            )

        # extract knn_ids and put them into long format
        knn_ids <-
            knn_data |>
            purrr::pluck("neighbor_ids")
        colnames(knn_ids) <- seq_len(ncol(knn_ids))

        knn_ids <-
            knn_ids |>
            dplyr::as_tibble() |>
            dplyr::mutate(from = seq(from = 1, to = nrow(knn_ids), by = 1)) |>
            tidyr::pivot_longer(
                cols = -"from",
                names_to = "neighbor_index",
                values_to = "to"
            )

        if (graph_type == "weighted") {
            # extract knn distances and put them into long format
            knn_dists <-
                knn_data |>
                purrr::pluck("neighbor_distances")
            colnames(knn_dists) <- seq_len(ncol(knn_dists))

            knn_dists <-
                knn_dists |>
                dplyr::as_tibble() |>
                dplyr::mutate(from = seq(from = 1, to = nrow(knn_dists), by = 1)) |>
                tidyr::pivot_longer(
                    cols = -"from",
                    names_to = "neighbor_index",
                    values_to = "distance"
                )

            # join knn distances with knn ids for final edge tibble
            edge_tibble <-
                knn_ids |>
                dplyr::left_join(knn_dists, by = (c("from", "neighbor_index")))

            if (distance_function == "euclidean") {
                edge_tibble <-
                    edge_tibble |>
                    dplyr::mutate(weight = 1 / (1 + .data$distance))
            } else {
                edge_tibble <-
                    edge_tibble |>
                    dplyr::mutate(weight = 1 - .data$distance)
            }
        } else {
            edge_tibble <-
                knn_ids
        }

        # create the knn_graph as a tidygraph object
        knn_graph <-
            tidygraph::tbl_graph(
                nodes = tof_tibble,
                edges = edge_tibble,
                directed = FALSE
            )

        return(knn_graph)
    }


make_binary_vector <- function(length, indices) {
    result <- rep.int(0, times = length)
    result[indices] <- 1

    return(result)
}

deframe <- function(x) {
    value <- x[[2L]]
    name <- x[[1L]]
    result <- setNames(value, nm = name)
    return(result)
}

tof_rescale <-
    function(.vec) {
        vec_max <- max(.vec)
        vec_min <- min(.vec)
        vec_diff <- vec_max - vec_min

        result <- (.vec - vec_min) / vec_diff
        return(result)
    }


#' Make a heatmap summarizing group marker expression patterns in high-dimensional cytometry data
#'
#' This function makes a heatmap of group-to-group marker expression patterns
#' in single-cell data. Markers are plotted along the horizontal (x-) axis of
#' the heatmap and groups are plotted along the vertical (y-) axis of the
#' heatmap.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param y_col An unquoted column name indicating which column in `tof_tibble`
#' stores the ids for the group to which each cell belongs.
#'
#' @param marker_cols Unquoted column names indicating which column in `tof_tibble`
#' should be interpreted as markers to be plotted along the x-axis of the heatmap.
#' Supports tidyselect helpers.
#'
#' @param central_tendency_function A function to use for computing the
#' measure of central tendency that will be aggregated from each cluster in
#' cluster_col. Defaults to the median.
#'
#' @param scale_markerwise A boolean value indicating if the heatmap should
#' rescale the columns of the heatmap such that the maximum value for each
#' marker is 1 and the minimum value is 0. Defaults to FALSE.
#'
#' @param scale_ywise A boolean value indicating if the heatmap should
#' rescale the rows of the heatmap such that the maximum value for each
#' group is 1 and the minimum value is 0. Defaults to FALSE.
#'
#' @param line_width A numeric value indicating how thick the lines separating
#' the tiles of the heatmap should be. Defaults to 0.25.
#'
#' @param theme A ggplot2 theme to apply to the heatmap.
#' Defaults to \code{\link[ggplot2]{theme_minimal}}
#'
#' @param cluster_markers A boolean value indicating if the heatmap should
#' order its columns (i.e. markers) using hierarchical clustering. Defaults to
#' TRUE.
#'
#' @param cluster_groups A boolean value indicating if the heatmap should
#' order its rows (i.e. groups) using hierarchical clustering. Defaults to
#' TRUE.
#'
#' @return A ggplot object.
#'
#' @importFrom dplyr as_tibble
#' @importFrom dplyr bind_cols
#' @importFrom dplyr mutate
#' @importFrom dplyr pull
#' @importFrom dplyr select
#'
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 element_text
#' @importFrom ggplot2 geom_tile
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 scale_fill_viridis_c
#' @importFrom ggplot2 theme_minimal
#'
#' @importFrom purrr pluck
#'
#' @importFrom stats dist
#' @importFrom stats hclust
#'
#' @importFrom tidyr pivot_longer
#'
tof_plot_heatmap <-
    function(
        tof_tibble,
        y_col,
        marker_cols = where(tof_is_numeric),
        central_tendency_function = stats::median,
        scale_markerwise = FALSE,
        scale_ywise = FALSE,
        cluster_markers = TRUE,
        cluster_groups = TRUE,
        line_width = 0.25,
        theme = ggplot2::theme_minimal()) {
        # compute summary statistics for each group ------------------------------
        group_tibble <-
            tof_tibble |>
            dplyr::select(
                {{ y_col }},
                {{ marker_cols }}
            ) |>
            # compute one summary statistic for each group across all knn_cols
            tof_summarize_clusters(
                cluster_col = {{ y_col }},
                metacluster_cols = c({{ marker_cols }}),
                central_tendency_function = central_tendency_function
            )

        group_matrix <-
            group_tibble |>
            dplyr::select({{ marker_cols }}) |>
            as.matrix()

        marker_names <- colnames(group_matrix)

        if (scale_markerwise) {
            group_matrix <-
                apply(X = group_matrix, MARGIN = 2, FUN = tof_rescale)

            # if there are NaN values, tell the user
            if (any(is.nan(group_matrix))) {
                stop("NaN values resulted from marker-wise scaling.
                Consider setting scale_markerwise to FALSE.")
            }
        }

        if (scale_ywise) {
            group_matrix <-
                apply(X = group_matrix, MARGIN = 1, FUN = tof_rescale) |>
                t()

            # if there are NaN values, tell the user
            if (any(is.nan(group_matrix))) {
                stop("NaN values resulted from group-wise scaling.
                Consider setting scale_* to FALSE.")
            }
        }

        colnames(group_matrix) <- marker_names

        if (cluster_markers) {
            marker_order <-
                group_matrix |>
                t() |>
                stats::dist(method = "euclidean") |>
                stats::hclust() |>
                purrr::pluck("order")
            marker_order <- marker_names[marker_order]
        } else {
            marker_order <- marker_names
        }

        if (cluster_groups) {
            group_order <-
                stats::dist(x = group_matrix, method = "euclidean") |>
                stats::hclust() |>
                purrr::pluck("order")
            group_order <- dplyr::pull(group_tibble, {{ y_col }})[group_order]
        } else {
            group_order <- dplyr::pull(group_tibble, {{ y_col }})
        }

        group_tibble_long <-
            group_matrix |>
            dplyr::as_tibble() |>
            dplyr::bind_cols(dplyr::select(group_tibble, {{ y_col }})) |>
            tidyr::pivot_longer(
                cols = {{ marker_cols }},
                names_to = "marker",
                values_to = "expression"
            ) |>
            dplyr::mutate(
                "{{y_col}}" := factor({{ y_col }}, levels = group_order),
                marker = factor(.data$marker, levels = marker_order)
            )

        heatmap <-
            group_tibble_long |>
            ggplot2::ggplot(
                ggplot2::aes(x = .data$marker, y = {{ y_col }}, fill = .data$expression)
            ) +
            ggplot2::geom_tile(color = "black", linewidth = line_width) +
            ggplot2::scale_fill_viridis_c()

        # rotate x axis labels
        theme <-
            theme +
            ggplot2::theme(
                axis.text.x =
                    ggplot2::element_text(angle = 90, hjust = 1, vjust = 0.5)
            )

        return(heatmap + theme)
    }


flatten_model_predictions <- function(model_prediction, lambdas) {
    num_lambdas <- length(lambdas)
    prediction_list <- list()
    for (i in seq_len(num_lambdas)) {
        lambda <- lambdas[[i]]
        prediction <- model_prediction[, , i]

        prediction_list[[i]] <- dplyr::as_tibble(prediction)
    }

    result <- dplyr::tibble(
        lambda = lambdas,
        predictions = prediction_list
    )

    return(result)
}


#' @importFrom survival survfit
tof_model_plot_survival_curves <- function(tof_model, new_x) {
    cox_model <- tof_model$model
    lambda <- tof_model$penalty

    if (missing(new_x)) {
        new_x <-
            tof_setup_glmnet_xy(
                feature_tibble = tof_get_model_training_data(tof_model),
                recipe = tof_model$recipe,
                outcome_cols = dplyr::any_of(tof_model$outcome_colnames),
                model_type = tof_model$model_type
            )$x
    }

    survfit_result <-
        survival::survfit(
            cox_model,
            s = lambda,
            x = tof_get_model_x(tof_model),
            y = tof_get_model_y(tof_model),
            newx = new_x
        )

    times <-
        dplyr::tibble(
            time = survfit_result$time,
            .timepoint_index = seq_along(survfit_result$time)
        )

    survival_curves <-
        survfit_result$surv |>
        dplyr::as_tibble() |>
        dplyr::mutate(.timepoint_index = seq_len(nrow(survfit_result$surv))) |>
        tidyr::pivot_longer(
            cols = -".timepoint_index",
            names_to = "row_index",
            values_to = "probability"
        ) |>
        dplyr::left_join(times, by = ".timepoint_index") |>
        dplyr::select(-".timepoint_index") |>
        tidyr::nest(survival_curve = -"row_index")

    return(survival_curves)
}

#' @importFrom survival survfit
tof_plot_survival_curves <-
    function(cox_model, lambda, recipe, train_x, train_y, new_x) {
        survfit_result <-
            survival::survfit(
                cox_model,
                s = lambda,
                x = train_x,
                y = train_y,
                newx = new_x
            )

        times <-
            dplyr::tibble(
                time = survfit_result$time,
                .timepoint_index = seq_along(survfit_result$time)
            )

        survival_curves <-
            survfit_result$surv |>
            dplyr::as_tibble() |>
            dplyr::mutate(.timepoint_index = seq_len(nrow(survfit_result$surv))) |>
            tidyr::pivot_longer(
                cols = -".timepoint_index",
                names_to = "row_index",
                values_to = "probability"
            ) |>
            dplyr::left_join(times, by = ".timepoint_index") |>
            dplyr::select(-".timepoint_index") |>
            tidyr::nest(survival_curve = -"row_index")

        return(survival_curves)
    }

#' Compute a Kaplan-Meier curve from sample-level survival data
#'
#' @param survival_curves A tibble from which the Kaplan-Meier curve will be
#' computed. Each row must represent an observation and must have two
#' columns named "time_to_event" and "event".
#'
#' @return A tibble with 3 columns: time_to_event, survival_probability, and
#' is_censored (whether or not an event was censored at that timepoint).
#'
#' @importFrom dplyr add_row
#' @importFrom dplyr arrange
#' @importFrom dplyr lag
#' @importFrom dplyr mutate
#' @importFrom dplyr n
#' @importFrom dplyr select
#' @importFrom dplyr tibble
#'
tof_compute_km_curve <- function(survival_curves) {
    if (!is.factor(survival_curves$event)) {
        survival_curves$event <- factor(survival_curves$event, levels = c(0, 1))
    }

    censor_level <- levels(survival_curves$event)[1]
    death_level <- levels(survival_curves$event)[2]

    km_curve <-
        survival_curves |>
        dplyr::select("time_to_event", "event") |>
        dplyr::arrange(.data$time_to_event) |>
        dplyr::mutate(
            num_current_deaths = as.character(.data$event) == death_level,
            num_deaths = cumsum(.data$num_current_deaths),
            num_current_censored = as.character(.data$event) == censor_level,
            num_censored = cumsum(.data$num_current_censored)
        ) |>
        dplyr::add_row(
            dplyr::tibble(
                time_to_event = 0,
                num_current_deaths = 0,
                num_deaths = 0,
                num_current_censored = 0,
                num_censored = 0
            ),
            .before = 1L
        ) |>
        dplyr::mutate(
            num_at_risk = dplyr::n() - (.data$num_deaths + .data$num_censored + 1),
            num_at_risk = dplyr::lag(.data$num_at_risk, default = dplyr::n() - 1),
            multiplier = (.data$num_at_risk - .data$num_current_deaths) / .data$num_at_risk,
            survival_probability = 1,
            survival_probability = cumprod(.data$multiplier),
            is_censored = (.data$num_current_censored == 1)
        ) |>
        dplyr::select(
            "time_to_event",
            "survival_probability",
            "is_censored"
        )

    return(km_curve)
}
keyes-timothy/tidytof documentation built on May 7, 2024, 12:33 p.m.