barcodetrackR: Functions for Analyzing Cellular Barcoding Data

Documented in clonal_contribution

#' @importFrom rlang %||%
#' @importFrom magrittr %>%
#'
#' @title Clonal contribution plot
#'
#' @description Bar or line plot of percentage contribution of the top clones from a selected sample or all clones across samples matching the specified filter within the SummarizedExperiment object. Usually used for tracking a cell lineage's top clones over time.
#'
#' @param your_SE A Summarized Experiment object.
#' @param SAMPLENAME_choice The identifying SAMPLENAME from which to obtain the top "n_clones" clones to color. If NULL and clone_sequences is NULL, all clones will be shown as gray.
#' @param filter_by Name of metadata column to filter by e.g. Lineage
#' @param filter_selection The value of the filter column to display e.g. "T" (within Lineage)
#' @param plot_over The column of metadata that you want to be the x-axis of the plot. e.g. Month. For numeric metadata, the x-axis will be ordered in ascending fashion. For categorical metadata, the sample order will be followed.
#' @param plot_over_display_choices Choice(s) from the column designated in plot_over that will be used for plotting. Defaults to all.
#' @param clone_sequences The identifying rownames within your_SE for which to plot. SAMPLENAME_choice should be set to NULL or not specified if clone_sequences is specified.
#' @param n_clones Numeric. Number of top clones from SAMPLENAME_choice that should be assigned a unique color.
#' @param graph_type Choice of "bar" or "line" for how to display the clonal contribution data
#' @param keep_numeric If plot_over is numeric, whether to space the x-axis appropriately according to the numerical values.
#' @param plot_non_selected Plot clones NOT found within the top clones in SAMPLENAME_choice or the specified clones passed to clone_sequences. These clones are colored gray. If both SAMPLENAME_choice and clone_sequences are NULL, this argument must be set to TRUE. Otherwise, there will be no data to show.
#' @param linesize Numeric. Thickness of the lines.
#' @param text_size Numeric. Size of text in plot.
#' @param your_title Title string for your plot.
#' @param y_limit Numeric. What the max value of the y scale should be for the "proportions" assay.
#' @param return_table Logical. If set to TRUE, the function will return a dataframe with each sequence that is selected and its percentage contribution to each selected sample rather than a plot.
#'
#' @importFrom stats setNames
#'
#' @return Displays a stacked area line or bar plot (made by ggplot2) of the samples' top clones. Or, if return_table is set to TRUE, returns a dataframe of the percentage abundances in each sample.
#'
#' @examples
#' data(wu_subset)
#' clonal_contribution(
#'     your_SE = wu_subset, graph_type = "bar",
#'     SAMPLENAME_choice = "ZJ31_20m_T",
#'     filter_by = "celltype", filter_selection = "T",
#'     plot_over = "months", n_clones = 10
#' )
#' @export

clonal_contribution <- function(your_SE,
    SAMPLENAME_choice = NULL,
    filter_by,
    filter_selection,
    plot_over,
    plot_over_display_choices = NULL,
    clone_sequences = NULL,
    n_clones = 10,
    graph_type = "bar",
    keep_numeric = TRUE,
    plot_non_selected = TRUE,
    linesize = 0.2,
    text_size = 15,
    your_title = "",
    y_limit = NULL,
    return_table = FALSE) {


    # Error checking
    coldata_names <- colnames(SummarizedExperiment::colData(your_SE))
    if (any(!c(plot_over) %in% coldata_names)) {
        stop("plot_over must match a column name in colData(your_SE)")
    }
    if (any(!c(filter_by) %in% coldata_names)) {
        stop("filter_by must match a column name in colData(your_SE)")
    }
    if (!filter_selection %in% unique(SummarizedExperiment::colData(your_SE)[[filter_by]])) {
        stop("filter_selection must be an element in the colData column specified with filter_by")
    }

    if (sum(is.null(SAMPLENAME_choice), is.null(clone_sequences)) == 0) {
        stop("please specify only ONE of SAMPLENAME_choice or clone_sequences.")
    }

    if (sum(is.null(SAMPLENAME_choice), is.null(clone_sequences)) == 2 & plot_non_selected == FALSE) {
        stop("If neither SAMPLENAME_choice nor clone_sequences are specified, plot_non_selected must be TRUE. Otherwise, there is no data to plot.")
    }

    # get appropriate rows to use in plotting
    if (!is.null(SAMPLENAME_choice)) {
        selected_sequences <- get_top_clones(your_SE = your_SE, SAMPLENAME_choice = SAMPLENAME_choice, n_clones = n_clones)
    } else if (!is.null(clone_sequences)) {
        selected_sequences <- clone_sequences
    } else {
        selected_sequences <- c("Filler text")
    }

    # select those samples which to plot_over and to filter_by
    temp_subset <- your_SE[, your_SE[[filter_by]] == filter_selection]
    if (is.numeric(SummarizedExperiment::colData(temp_subset)[[plot_over]])) {
        plot_over_display_choices <- plot_over_display_choices %||% sort(unique(SummarizedExperiment::colData(temp_subset)[[plot_over]]))
        plot_over_display_choices <- as.numeric(as.character(plot_over_display_choices))
    } else if (is.factor(SummarizedExperiment::colData(temp_subset)[[plot_over]])) {
        plot_over_display_choices <- plot_over_display_choices %||% levels(SummarizedExperiment::colData(temp_subset)[[plot_over]])
    } else {
        plot_over_display_choices <- plot_over_display_choices %||% unique(SummarizedExperiment::colData(temp_subset)[[plot_over]])
    }
    temp_subset <- temp_subset[, temp_subset[[plot_over]] %in% plot_over_display_choices]
    temp_subset_coldata <- SummarizedExperiment::colData(temp_subset) %>%
        as.data.frame() %>%
        dplyr::mutate_if(is.factor, as.character)

    # fetch proportions
    your_data <- SummarizedExperiment::assays(temp_subset)[["proportions"]]
    your_data <- your_data[rowSums(your_data) > 0, , drop = FALSE]

    # If there are duplicate samples within the chosen plot_over argument for the x-axis, then combine but warn the user
    if (length(temp_subset_coldata[[plot_over]]) != length(unique(temp_subset_coldata[[plot_over]]))) {
        duplicated_plot_over <- temp_subset_coldata[[plot_over]][duplicated(temp_subset_coldata[[plot_over]])]
        cat("Duplicate samples with the same value of the plot_over variable:", plot_over, "\n")
        for (i in seq_along(duplicated_plot_over)) {
            duplicated_samplenames <- temp_subset_coldata[["SAMPLENAME"]][temp_subset_coldata[[plot_over]] == duplicated_plot_over[i]]
            cat(plot_over, "value =", duplicated_plot_over[i], "; Duplicate sample names =", duplicated_samplenames, "\n")
            your_data[, duplicated_samplenames[1]] <- rowMeans(your_data[, duplicated_samplenames])
            your_data[, duplicated_samplenames[-1]] <- NULL
            temp_subset_coldata <- temp_subset_coldata[temp_subset_coldata[["SAMPLENAME"]] %in% duplicated_samplenames[-1] == FALSE, ]
        }
        cat("Barcode proportions have been averaged for duplicate samples.\nTo see the duplicate samples as separate replicates, create a categorical column of metadata with your desired plot_over values but _repX appended to the replicate samples. \n")
    }

    # turn sample_name into the plot_over equivalents
    plotting_data <- tibble::rownames_to_column(your_data, var = "sequence") %>%
        tidyr::pivot_longer(cols = -sequence, names_to = "sample_name", values_to = "value") %>%
        dplyr::mutate(fill_label = ifelse(sequence %in% selected_sequences, sequence, "other")) %>%
        dplyr::mutate(fill_label = factor(.data$fill_label, levels = c(selected_sequences, "other"))) %>%
        dplyr::left_join(temp_subset_coldata %>% dplyr::rename(sample_name = .data$SAMPLENAME), by = "sample_name") %>%
        dplyr::mutate(sample_name = !!as.name(plot_over))
    if (is.numeric(temp_subset_coldata[[plot_over]]) & keep_numeric) {
        plotting_data <- dplyr::mutate(plotting_data, sample_name = factor(.data$sample_name, levels = plot_over_display_choices))
        plotting_data <- dplyr::mutate(plotting_data, sample_name = as.numeric(as.character(.data$sample_name)))
    } else {
        plotting_data <- dplyr::mutate(plotting_data, sample_name = factor(.data$sample_name, levels = plot_over_display_choices))
    }

    # set up appropriate levels in plotting the selected elements and specify the colors
    if (plot_non_selected) {
        non_selected_sequences <- plotting_data %>%
            dplyr::arrange(desc(.data$value)) %>%
            dplyr::pull(sequence) %>%
            unique() %>%
            .[!(. %in% selected_sequences)]
        plotting_data$sequence <- factor(plotting_data$sequence, levels = rev(c(selected_sequences, non_selected_sequences)))
        color_vector <- setNames(c(scales::hue_pal()(length(selected_sequences)), "grey"), c(selected_sequences, "other"))
    } else {
        plotting_data <- dplyr::filter(plotting_data, sequence %in% selected_sequences)
        plotting_data$sequence <- factor(plotting_data$sequence, levels = rev(c(selected_sequences)))
        color_vector <- setNames(c(scales::hue_pal()(length(selected_sequences))), selected_sequences)
    }

    if (return_table) {
        return(plotting_data[, seq_len(3)])
    }

    if (graph_type == "bar") {
        g <- ggplot2::ggplot(plotting_data, ggplot2::aes(x = .data$sample_name, y = .data$value, group = .data$sequence, fill = .data$fill_label)) +
            ggplot2::geom_col(colour = "black", size = linesize) +
            ggplot2::scale_y_continuous(name = "proportions", labels = function(x) {
                paste0(x * 100, "%")
            }, expand = c(0.01, 0)) +
            ggplot2::scale_fill_manual("selected_sequences", values = color_vector) +
            ggplot2::theme_classic() +
            ggplot2::ggtitle(your_title) +
            ggplot2::theme(
                text = ggplot2::element_text(size = text_size),
                plot.title = ggplot2::element_text(hjust = 0.5), panel.grid = ggplot2::element_blank()
            )
    } else if (graph_type == "line") {
        g <- ggplot2::ggplot(plotting_data, ggplot2::aes(x = .data$sample_name, y = .data$value, group = sequence, fill = .data$fill_label)) +
            ggplot2::geom_area(position = "stack", colour = "black", size = linesize) +
            ggplot2::scale_y_continuous(name = "proportions", labels = function(x) {
                paste0(x * 100, "%")
            }, expand = c(0.01, 0)) +
            ggplot2::scale_fill_manual("selected_sequences", values = color_vector) +
            ggplot2::theme_classic() +
            ggplot2::ggtitle(your_title) +
            ggplot2::theme(text = ggplot2::element_text(size = text_size))
    }

    if (!is.null(y_limit)) {
        g <- g + ggplot2::coord_cartesian(ylim = c(0, y_limit))
    }


    if (is.numeric(temp_subset_coldata[[plot_over]]) & keep_numeric) {
        g + ggplot2::scale_x_continuous(paste0(plot_over), breaks = plot_over_display_choices, labels = plot_over_display_choices) +
            ggplot2::theme(legend.position = "none")
    } else {
        g + ggplot2::scale_x_discrete(paste0(plot_over), breaks = plot_over_display_choices, labels = plot_over_display_choices) +
            ggplot2::theme(legend.position = "none")
    }
}