R/common_dplyr.R

Defines functions call_consensus cor_to_call_rank collapse_to_cluster call_to_metadata cor_to_call

Documented in call_consensus call_to_metadata collapse_to_cluster cor_to_call cor_to_call_rank

#' get best calls for each cluster
#'
#' @param cor_mat input similarity matrix
#' @param metadata input metadata with tsne or umap coordinates and cluster ids
#' @param cluster_col metadata column, can be cluster or cellid
#' @param collapse_to_cluster if a column name is provided, takes the most
#' frequent call of entire cluster to color in plot
#' @param threshold minimum correlation coefficent cutoff for calling clusters
#' @param rename_prefix prefix to add to type and r column names
#' @param carry_r whether to include threshold in unassigned names
#' @return dataframe of cluster, new ident, and r info
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref
#' )
#'
#' cor_to_call(res)
#' @export
cor_to_call <- function(
    cor_mat,
    metadata = NULL,
    cluster_col = "cluster",
    collapse_to_cluster = FALSE,
    threshold = 0,
    rename_prefix = NULL,
    carry_r = FALSE) {
    correlation_matrix <- cor_mat
    if (threshold == "auto") {
        threshold <- round(0.75 * max(correlation_matrix), 2)
        message("using threshold of ", threshold)
    }
    correlation_matrix[is.na(correlation_matrix)] <- 0
    df_temp <-
        tibble::as_tibble(correlation_matrix, rownames = cluster_col)
    df_temp <- tidyr::gather(
        df_temp,
        key = !!dplyr::sym("type"),
        value = !!dplyr::sym("r"), -!!cluster_col
    )

    if (carry_r) {
        df_temp[["type"]][df_temp$r < threshold] <-
            paste0("r<", threshold, ", unassigned")
    } else {
        df_temp[["type"]][df_temp$r < threshold] <- "unassigned"
    }

    df_temp <-
        dplyr::top_n(dplyr::group_by_at(df_temp, 1), 1, !!dplyr::sym("r"))
    if (nrow(df_temp) != nrow(correlation_matrix)) {
        clash <- dplyr::summarize(dplyr::group_by_at(df_temp, 1), n = n())
        clash <- dplyr::filter(clash, n > 1)
        clash <- dplyr::pull(clash, 1)
        df_temp[lapply(
            df_temp[, 1],
            FUN = function(x) {
                x %in% clash
            }
        )[[1]], 2] <-
            paste0(df_temp[["type"]][lapply(
                df_temp[, 1],
                FUN = function(x) {
                    x %in% clash
                }
            )[[1]]], "-CLASH!")
        df_temp2 <- df_temp
        df_temp_full <-
            dplyr::distinct_at(df_temp,
                vars(-!!dplyr::sym("type")),
                .keep_all = TRUE
            )
    } else {
        df_temp_full <- df_temp
    }

    if (collapse_to_cluster != FALSE) {
        if (!(cluster_col %in% colnames(metadata))) {
            metadata <- tibble::as_tibble(metadata, rownames = "rn")
        }
        df_temp_full <-
            collapse_to_cluster(
                df_temp_full,
                metadata = metadata,
                cluster_col = cluster_col,
                threshold = threshold
            )
    }

    if (!is.null(rename_prefix)) {
        if (collapse_to_cluster) {
            eval(parse(
                text = paste0(
                    "df_temp_full <- dplyr::rename(df_temp_full, ",
                    paste0(rename_prefix, "_type"),
                    " = type, ",
                    paste0(rename_prefix, "_sum"),
                    " = sum, ",
                    paste0(rename_prefix, "_n"),
                    " = n)"
                )
            ))
        } else {
            eval(parse(
                text = paste0(
                    "df_temp_full <- dplyr::rename(df_temp_full, ",
                    paste0(rename_prefix, "_type"),
                    " = type, ",
                    paste0(rename_prefix, "_r"),
                    " = r)"
                )
            ))
        }
    }
    df_temp_full
}

#' Insert called ident results into metadata
#'
#' @param res dataframe of idents, such as output of cor_to_call
#' @param metadata input metadata with tsne or umap coordinates and cluster ids
#' @param cluster_col metadata column, can be cluster or cellid
#' @param per_cell whether the res dataframe is listed per cell
#' @param rename_prefix prefix to add to type and r column names
#' @return new metadata with added columns
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref
#' )
#'
#' res2 <- cor_to_call(res, cluster_col = "classified")
#'
#' call_to_metadata(
#'     res = res2,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     rename_prefix = "assigned"
#' )
#' @export
call_to_metadata <- function(
    res,
    metadata,
    cluster_col,
    per_cell = FALSE,
    rename_prefix = NULL) {
    temp_col_id <- get_unique_column(metadata, "rn")

    df_temp <- res
    if (!is.null(rename_prefix)) {
        eval(parse(
            text = paste0(
                "df_temp <- dplyr::rename(df_temp, ",
                paste0(rename_prefix, "_type"),
                " = type, ",
                paste0(rename_prefix, "_r"),
                " = r)"
            )
        ))
    }

    if (per_cell == FALSE) {
        if (!(cluster_col %in% colnames(metadata))) {
            stop("cluster_col is not a column of metadata",
                call. = FALSE
            )
        }

        if (!(cluster_col %in% colnames(res))) {
            stop("cluster_col is not a column ",
                "of called cell type dataframe",
                call. = FALSE
            )
        }

        if (!(all(unique(df_temp[[cluster_col]]) %in%
            unique(metadata[[cluster_col]])))) {
            stop("cluster_col from clustify step and",
                "joining to metadata step are not the same",
                call. = FALSE
            )
        }

        df_temp_full <-
            suppressWarnings(
                dplyr::left_join(
                    tibble::rownames_to_column(
                        metadata,
                        temp_col_id
                    ),
                    df_temp,
                    by = cluster_col,
                    suffix = c("", ".clustify")
                )
            )
        if (tibble::has_rownames(df_temp_full)) {
            df_temp_full <- tibble::remove_rownames(df_temp_full)
        }
        df_temp_full <- tibble::column_to_rownames(
            df_temp_full,
            temp_col_id
        )
    } else {
        colnames(df_temp)[1] <- cluster_col
        names(cluster_col) <- temp_col_id

        df_temp_full <-
            suppressWarnings(
                dplyr::left_join(
                    tibble::rownames_to_column(
                        metadata,
                        temp_col_id
                    ),
                    df_temp,
                    by = cluster_col,
                    suffix = c("", ".clustify")
                )
            )
        if (tibble::has_rownames(df_temp_full)) {
            df_temp_full <- tibble::remove_rownames(df_temp_full)
        }
        df_temp_full <-
            tibble::column_to_rownames(
                df_temp_full,
                temp_col_id
            )
    }
    df_temp_full
}

#' From per-cell calls, take highest freq call in each cluster
#'
#' @param res dataframe of idents, such as output of cor_to_call
#' @param metadata input metadata with tsne or umap coordinates and cluster ids
#' @param cluster_col metadata column for cluster
#' @param threshold minimum correlation coefficent cutoff for calling clusters
#' @return new metadata with added columns
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref,
#'     per_cell = TRUE
#' )
#'
#' res2 <- cor_to_call(res)
#'
#' collapse_to_cluster(
#'     res2,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     threshold = 0
#' )
#' @export
collapse_to_cluster <- function(
    res,
    metadata,
    cluster_col,
    threshold = 0) {
    res_temp <- res
    colnames(res_temp)[1] <- "rn"
    df_temp_full <- as.data.frame(res_temp)
    df_temp_full <-
        dplyr::mutate(df_temp_full,
            cluster = metadata[[cluster_col]]
        )
    df_temp_full2 <-
        dplyr::group_by(
            df_temp_full,
            !!dplyr::sym("type"),
            !!dplyr::sym("cluster")
        )
    df_temp_full2 <-
        dplyr::summarize(df_temp_full2,
            sum = sum(!!dplyr::sym("r")),
            n = n()
        )
    df_temp_full2 <-
        dplyr::group_by(
            df_temp_full2,
            !!dplyr::sym("cluster")
        )
    df_temp_full2 <-
        dplyr::arrange(
            df_temp_full2,
            desc(n),
            desc(sum)
        )
    df_temp_full2 <-
        dplyr::filter(
            df_temp_full2,
            !!dplyr::sym("type") != paste0(
                "r<",
                threshold,
                ", unassigned"
            )
        )
    df_temp_full2 <- dplyr::slice(df_temp_full2, 1)
    df_temp_full2 <-
        dplyr::rename(
            df_temp_full2,
            !!cluster_col := cluster
        )
    dplyr::select(
        df_temp_full2, 2, 1,
        tidyr::everything()
    )
}

#' get ranked calls for each cluster
#'
#' @param cor_mat input similarity matrix
#' @param metadata input metadata with tsne or umap coordinates
#' and cluster ids
#' @param cluster_col metadata column, can be cluster or cellid
#' @param collapse_to_cluster if a column name is provided, takes the most
#' frequent call of entire cluster to color in plot
#' @param threshold minimum correlation coefficent cutoff for calling clusters
#' @param rename_prefix prefix to add to type and r column names
#' @param top_n the number of ranks to keep, the rest will be set to 100
#' @return dataframe of cluster, new ident, and r info
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref
#' )
#'
#' cor_to_call_rank(res, threshold = "auto")
#' @export
cor_to_call_rank <- function(
    cor_mat,
    metadata = NULL,
    cluster_col = "cluster",
    collapse_to_cluster = FALSE,
    threshold = 0,
    rename_prefix = NULL,
    top_n = NULL) {
    correlation_matrix <- cor_mat
    if (threshold == "auto") {
        threshold <- round(0.75 * max(correlation_matrix), 2)
        message("using threshold of ", threshold)
    }
    df_temp <- tibble::as_tibble(correlation_matrix,
        rownames = cluster_col
    )
    df_temp <-
        tidyr::gather(
            df_temp,
            key = !!dplyr::sym("type"),
            value = !!dplyr::sym("r"), -!!cluster_col
        )
    df_temp <-
        dplyr::mutate(dplyr::group_by_at(df_temp, 1),
            rank = dplyr::dense_rank(desc(!!dplyr::sym("r")))
        )
    df_temp[["rank"]][df_temp$r < threshold] <- 100
    if (!(is.null(top_n))) {
        df_temp <- dplyr::filter(df_temp, rank <= top_n)
    }
    df_temp_full <- df_temp
    if (!is.null(rename_prefix)) {
        eval(parse(
            text = paste0(
                "df_temp_full <- dplyr::rename(df_temp_full, ",
                paste0(rename_prefix, "_type"),
                " = type, ",
                paste0(rename_prefix, "_r"),
                " = r)"
            )
        ))
    }
    df_temp_full
}

#' get concensus calls for a list of cor calls
#'
#' @param list_of_res list of call dataframes from cor_to_call_rank
#' @return dataframe of cluster, new ident, and mean rank
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref
#' )
#'
#' res2 <- cor_to_call_rank(res, threshold = "auto")
#' res3 <- cor_to_call_rank(res)
#' call_consensus(list(res2, res3))
#' @export
call_consensus <- function(list_of_res) {
    res <- do.call("rbind", list_of_res)
    df_temp <- dplyr::group_by_at(res, c(1, 2))
    df_temp <- dplyr::summarize_at(df_temp, 2, mean)
    df_temp <- dplyr::top_n(df_temp, -1)
    df_temp <- dplyr::group_by_at(df_temp, c(1, 3))
    df_temp <-
        dplyr::summarize_at(df_temp, 1, function(x) {
            stringr::str_c(x, collapse = "__")
        })
    df_temp <- dplyr::select(df_temp, c(1, 3, 2))
}
rnabioco/clustifyr documentation built on Sept. 2, 2024, 11:12 p.m.