R/compare_genelist.R

Defines functions compare_lists get_vargenes matrixize_markers binarize_expr

Documented in binarize_expr compare_lists get_vargenes matrixize_markers

#' Binarize scRNAseq data
#'
#' @param mat single-cell expression matrix
#' @param n number of top expressing genes to keep
#' @param cut cut off to set to 0
#' @return matrix of 1s and 0s
#' @examples
#' pbmc_avg <- average_clusters(
#'     mat = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified"
#' )
#'
#' mat <- binarize_expr(pbmc_avg)
#' mat[1:3, 1:3]
#' @export
binarize_expr <- function(mat,
    n = 1000,
    cut = 0) {
    expr_mat <- mat
    if (n < nrow(expr_mat)) {
        expr_df <- as.matrix(expr_mat)
        filterout <- t(matrixStats::colRanks(-expr_df,
            ties.method = "average"
        )) > n
        expr_df[filterout] <- 0
        expr_df[expr_df > cut] <- 1
        expr_df[expr_df < cut] <- 0
        as.matrix(expr_df)
    } else {
        expr_mat[expr_mat > cut] <- 1
        expr_mat[expr_mat < cut] <- 0
        expr_mat
    }
}

#' Convert candidate genes list into matrix
#'
#' @param marker_df dataframe of candidate genes, must contain
#'   "gene" and "cluster" columns, or a matrix of gene names to
#'   convert to ranked
#' @param ranked unranked gene list feeds into hyperp, the ranked
#' gene list feeds into regular corr_coef
#' @param n number of genes to use
#' @param step_weight ranked genes are tranformed into pseudo
#'  expression by descending weight
#' @param background_weight ranked genes are tranformed into pseudo
#'  expression with added weight
#' @param unique whether to use only unique markers to 1 cluster
#' @param metadata vector or dataframe of cluster names, should
#' have column named cluster
#' @param cluster_col column for cluster names to replace original
#'  cluster, if metadata is dataframe
#' @param remove_rp do not include rps, rpl, rp1-9 in markers
#' @return matrix of unranked gene marker names, or matrix of
#'  ranked expression
#' @examples
#' matrixize_markers(pbmc_markers)
#' @export
matrixize_markers <- function(marker_df,
    ranked = FALSE,
    n = NULL,
    step_weight = 1,
    background_weight = 0,
    unique = FALSE,
    metadata = NULL,
    cluster_col = "classified",
    remove_rp = FALSE) {
    # takes marker in dataframe form
    # equal number of marker genes per known cluster
    marker_df <- dplyr::as_tibble(marker_df)

    # if "feature" and "group" are present,
    # assume the dataframe is output from presto
    if (("feature" %in% colnames(marker_df)) & ("group" %in% colnames(marker_df))) {
        marker_df <- marker_df %>%
            dplyr::rename(
                gene = "feature",
                cluster = "group"
            ) %>%
            dplyr::group_by(cluster) %>%
            dplyr::arrange(padj, .by_group = TRUE) %>%
            ungroup()
    }

    # if "gene" not present in column names,
    # assume df is a matrix to be converted to ranked
    if (!("gene" %in% colnames(marker_df))) {
        marker_df <-
            data.frame(lapply(marker_df, as.character),
                stringsAsFactors = FALSE
            )
        marker_df <-
            tidyr::gather(marker_df,
                factor_key = TRUE,
                key = "cluster",
                value = "gene"
            )
    }

    if (remove_rp) {
        marker_df <-
            dplyr::filter(
                marker_df,
                !(stringr::str_detect(
                    gene,
                    "^RP[0-9,L,S]|^Rp[0-9,l,s]"
                ))
            )
    }

    if (unique) {
        nonunique <- dplyr::group_by(marker_df, gene)
        nonunique <- dplyr::summarize(nonunique, n = dplyr::n())
        nonunique <- dplyr::filter(nonunique, n > 1)
        marker_df <- dplyr::anti_join(marker_df, nonunique, by = "gene")
    }

    cut_temp <- dplyr::group_by(marker_df, cluster)
    cut_temp <- dplyr::summarize(cut_temp, n = n())
    cut_num <- min(cut_temp$n)

    if (!is.null(n)) {
        if (n < cut_num) {
            cut_num <- n
        }
    }

    marker_temp <- dplyr::select(marker_df, gene, cluster)
    marker_temp <- dplyr::group_by(marker_temp, cluster)
    marker_temp <- dplyr::slice(marker_temp, seq_len(cut_num))
    if (ranked) {
        marker_temp <-
            dplyr::mutate(
                marker_temp,
                n = seq(
                    step_weight * cut_num,
                    by = -step_weight,
                    length.out = cut_num
                ) + background_weight
            )
        marker_temp2 <-
            tidyr::spread(marker_temp, key = "cluster", value = n)
        marker_temp2 <-
            as.data.frame(replace(marker_temp2, is.na(marker_temp2), 0))
        rownames(marker_temp2) <- marker_temp2$gene
        marker_temp2 <- dplyr::select(marker_temp2, -gene)
    } else {
        marker_temp <- dplyr::mutate(marker_temp, n = seq_len(cut_num))
        marker_temp2 <-
            tidyr::spread(marker_temp, key = "cluster", value = "gene")
        marker_temp2 <- as.data.frame(dplyr::select(marker_temp2, -n))
    }

    # if metadata is vector, adopt names in vector;
    # if metadata is a metadata dataframe, pulls names from cluster_col column
    if (!is.null(metadata)) {
        if (typeof(metadata) != "character") {
            metadata <-
                suppressWarnings(dplyr::left_join(
                    tibble::tibble(cluster = colnames(marker_temp2)),
                    unique(
                        tibble::tibble(
                            cluster = metadata$cluster,
                            classified = metadata[[cluster_col]]
                        )
                    ),
                    by = "cluster"
                ))
            metadata <- metadata[[cluster_col]]
        }
        colnames(marker_temp2) <- metadata
    }

    marker_temp2
}

#' Generate variable gene list from marker matrix
#'
#' @description Variable gene list is required for `clustify` main function.
#' This function parses variables genes from a matrix input.
#'
#' @param marker_mat matrix or dataframe of candidate genes for each cluster
#' @return vector of marker gene names
#' @examples
#' get_vargenes(cbmc_m)
#' @export
get_vargenes <- function(marker_mat) {
    if (rownames(marker_mat)[1] != "1") {
        unique(rownames(marker_mat))
    } else {
        unique(unlist(marker_mat, use.names = FALSE))
    }
}

#' Calculate adjusted p-values for hypergeometric test of gene lists
#' or jaccard index
#'
#' @param bin_mat binarized single-cell expression matrix,
#'   feed in by_cluster mat, if desired
#' @param marker_mat matrix or dataframe of candidate genes for each cluster
#' @param n number of genes in the genome
#' @param metric adjusted p-value for hypergeometric test, or jaccard index
#' @param output_high if true (by default to fit with rest of package),
#' -log10 transform p-value
#' @param details_out whether to also output shared gene list from jaccard
#' @return matrix of numeric values, clusters from expr_mat as row names,
#'  cell types from marker_mat as column names
#' @examples
#' pbmc_mm <- matrixize_markers(pbmc_markers)
#'
#' pbmc_avg <- average_clusters(
#'     pbmc_matrix_small,
#'     pbmc_meta,
#'     cluster_col = "classified"
#' )
#'
#' pbmc_avgb <- binarize_expr(pbmc_avg)
#'
#' compare_lists(
#'     pbmc_avgb,
#'     pbmc_mm,
#'     metric = "spearman"
#' )
#' @export
compare_lists <- function(bin_mat,
    marker_mat,
    n = 30000,
    metric = "hyper",
    output_high = TRUE,
    details_out = FALSE) {
    # check if matrix is binarized
    if (is.list(marker_mat)) {
        message("list of markers instead of matrix, only supports jaccard")
    } 
    if ((length(unique(bin_mat[, 1])) > 2) & (metric != "gsea")) {
          warning("non-binarized data, running spearman instead")
          metric <- "spearman"
    }

    if (details_out) {
        spe <- lapply(
            colnames(bin_mat),
            function(x) {
                per_col <- lapply(
                    names(marker_mat),
                    function(y) {
                        marker_list <- unlist(marker_mat[[y]],
                                              use.names = FALSE
                        )
                        bin_temp <- bin_mat[, x][bin_mat[, x] == 1]
                        list_top <- names(bin_temp)
                        
                        genes <- paste(intersect(list_top, marker_list), collapse = ",")
                        genes
                    }
                )
                do.call(cbind, per_col)
            }
        )
    }
    
    if (metric == "hyper") {
        out <- lapply(
            colnames(bin_mat),
            function(x) {
                per_col <- lapply(
                    colnames(marker_mat),
                    function(y) {
                        marker_list <- unlist(marker_mat[, y],
                            use.names = FALSE
                        )
                        bin_temp <- bin_mat[, x][bin_mat[, x] == 1]
                        list_top <- names(bin_temp)

                        t <- length(intersect(list_top, marker_list))
                        a <- max(length(list_top), length(marker_list))
                        b <- min(length(list_top), length(marker_list))
                        sum(dhyper(seq(t, b), a, n - a, b))
                    }
                )
                do.call(cbind, as.list(p.adjust(per_col)))
            }
        )
        if (any(vapply(out, is.na, FUN.VALUE = logical(length(out[[1]]))))) {
            error("NaN produced, possibly due to wrong n")
        }
    } else if (metric == "jaccard") {
        if (is.list(marker_mat)) {
          out <- lapply(
            colnames(bin_mat),
            function(x) {
              per_col <- lapply(
                names(marker_mat),
                function(y) {
                  marker_list <- unlist(marker_mat[[y]],
                                        use.names = FALSE
                  )
                  bin_temp <- bin_mat[, x][bin_mat[, x] == 1]
                  list_top <- names(bin_temp)
                  
                  I <- length(intersect(list_top, marker_list))
                  I / (length(list_top) + length(marker_list) - I)
                }
              )
              do.call(cbind, per_col)
            }
          )
        } else {
          out <- lapply(
            colnames(bin_mat),
            function(x) {
              per_col <- lapply(
                colnames(marker_mat),
                function(y) {
                  marker_list <- unlist(marker_mat[, y],
                                        use.names = FALSE
                  )
                  bin_temp <- bin_mat[, x][bin_mat[, x] == 1]
                  list_top <- names(bin_temp)
                  
                  I <- length(intersect(list_top, marker_list))
                  I / (length(list_top) + length(marker_list) - I)
                }
              )
              do.call(cbind, per_col)
            }
          )
        }
    } else if (metric == "spearman") {
        out <- lapply(
            colnames(bin_mat),
            function(x) {
                per_col <- lapply(
                    colnames(marker_mat),
                    function(y) {
                        marker_list <- unlist(marker_mat[, y],
                            use.names = FALSE
                        )
                        v1 <- marker_list[marker_list %in%
                            names(as.matrix(bin_mat)[, x])]

                        bin_temp <- as.matrix(bin_mat)[, x]
                        bin_temp <- bin_temp[order(bin_temp,
                            decreasing = TRUE
                        )]
                        list_top <- names(bin_temp)
                        v2 <- list_top[list_top %in% v1]
                        v1 <- v1
                        v2 <- v2
                        sum(vapply(seq_along(v1), function(i) {
                            abs(i - (
                                which(v2 == v1[i])
                            ))
                        }, FUN.VALUE = numeric(1)))
                    }
                )
                do.call(cbind, per_col)
            }
        )
    } else if (metric == "gsea") {
        out <- lapply(
            colnames(marker_mat),
            function(y) {
                marker_list <- list()
                marker_list[[1]] <- marker_mat[, y]
                names(marker_list) <- y
                v1 <- marker_list
                run_gsea(bin_mat, v1, n_perm = 1000, per_cell = TRUE)
            }
        )
        res <- do.call(cbind, out)
        n <- ncol(res)
        res2 <- res[, 3 * seq_len((ncol(res) / 3)) - 1, drop = FALSE]
        rownames(res2) <- rownames(res)
        colnames(res2) <- colnames(marker_mat)
        res <- res2
        if (output_high) {
            res <- -log10(res)
        }
    } else {
        stop("unrecognized metric", call. = FALSE)
    }

    if (metric != "gsea") {
        if (!is.list(marker_mat)) {
          res <- do.call(rbind, out)
          rownames(res) <- colnames(bin_mat)
          colnames(res) <- colnames(marker_mat)
        } else {
          res <- do.call(rbind, out)
          rownames(res) <- colnames(bin_mat)
          colnames(res) <- names(marker_mat)
        }
        
    }

    if (output_high) {
        if (metric == "hyper") {
            res <- -log10(res)
        } else if (metric == "spearman") {
            res <- -res + max(res)
        }
    }

    if (details_out) {
        spe <- do.call(rbind, spe)
        rownames(spe) <- colnames(bin_mat)
        colnames(spe) <- names(marker_mat)
        list(res = res,
             details = spe)
    } else {
        res
    }
}
NCBI-Hackathons/clustifyR documentation built on April 25, 2024, 4:22 a.m.