R/common_dplyr.R

Defines functions call_consensus cor_to_call_rank collapse_to_cluster call_to_metadata cor_to_call

Documented in call_consensus call_to_metadata collapse_to_cluster cor_to_call cor_to_call_rank

#' get best calls for each cluster
#'
#' @param cor_mat input similarity matrix
#' @param metadata input metadata with tsne or umap coordinates and cluster ids
#' @param cluster_col metadata column, can be cluster or cellid
#' @param collapse_to_cluster if a column name is provided, takes the most
#' frequent call of entire cluster to color in plot
#' @param threshold minimum correlation coefficent cutoff for calling clusters
#' @param rename_prefix prefix to add to type and r column names
#' @param carry_r whether to include threshold in unassigned names
#' @return dataframe of cluster, new ident, and r info
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref
#' )
#'
#' cor_to_call(res)
#' @export
cor_to_call <- function(
  cor_mat,
  metadata = NULL,
  cluster_col = "cluster",
  collapse_to_cluster = FALSE,
  threshold = 0,
  rename_prefix = NULL,
  carry_r = FALSE
) {
  correlation_matrix <- cor_mat
  if (threshold == "auto") {
    threshold <- round(0.75 * max(correlation_matrix), 2)
    message("using threshold of ", threshold)
  }
  correlation_matrix[is.na(correlation_matrix)] <- 0
  df_temp <-
    tibble::as_tibble(correlation_matrix, rownames = cluster_col)
  df_temp <- tidyr::gather(
    df_temp,
    key = !!dplyr::sym("type"),
    value = !!dplyr::sym("r"),
    -!!cluster_col
  )

  if (carry_r) {
    df_temp[["type"]][df_temp$r < threshold] <-
      paste0("r<", threshold, ", unassigned")
  } else {
    df_temp[["type"]][df_temp$r < threshold] <- "unassigned"
  }

  df_temp <-
    dplyr::top_n(dplyr::group_by_at(df_temp, 1), 1, !!dplyr::sym("r"))
  if (nrow(df_temp) != nrow(correlation_matrix)) {
    clash <- dplyr::summarize(dplyr::group_by_at(df_temp, 1), n = n())
    clash <- dplyr::filter(clash, n > 1)
    clash <- dplyr::pull(clash, 1)
    df_temp[
      lapply(
        df_temp[, 1],
        FUN = function(x) {
          x %in% clash
        }
      )[[1]],
      2
    ] <-
      paste0(
        df_temp[["type"]][lapply(
          df_temp[, 1],
          FUN = function(x) {
            x %in% clash
          }
        )[[1]]],
        "-CLASH!"
      )
    df_temp2 <- df_temp
    df_temp_full <-
      dplyr::distinct_at(df_temp, vars(-!!dplyr::sym("type")), .keep_all = TRUE)
  } else {
    df_temp_full <- df_temp
  }

  if (collapse_to_cluster != FALSE) {
    if (!(cluster_col %in% colnames(metadata))) {
      metadata <- tibble::as_tibble(metadata, rownames = "rn")
    }
    df_temp_full <-
      collapse_to_cluster(
        df_temp_full,
        metadata = metadata,
        cluster_col = cluster_col,
        threshold = threshold
      )
  }

  if (!is.null(rename_prefix)) {
    if (collapse_to_cluster) {
      eval(parse(
        text = paste0(
          "df_temp_full <- dplyr::rename(df_temp_full, ",
          paste0(rename_prefix, "_type"),
          " = type, ",
          paste0(rename_prefix, "_sum"),
          " = sum, ",
          paste0(rename_prefix, "_n"),
          " = n)"
        )
      ))
    } else {
      eval(parse(
        text = paste0(
          "df_temp_full <- dplyr::rename(df_temp_full, ",
          paste0(rename_prefix, "_type"),
          " = type, ",
          paste0(rename_prefix, "_r"),
          " = r)"
        )
      ))
    }
  }
  df_temp_full
}

#' Insert called ident results into metadata
#'
#' @param res dataframe of idents, such as output of cor_to_call
#' @param metadata input metadata with tsne or umap coordinates and cluster ids
#' @param cluster_col metadata column, can be cluster or cellid
#' @param per_cell whether the res dataframe is listed per cell
#' @param rename_prefix prefix to add to type and r column names
#' @return new metadata with added columns
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref
#' )
#'
#' res2 <- cor_to_call(res, cluster_col = "classified")
#'
#' call_to_metadata(
#'     res = res2,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     rename_prefix = "assigned"
#' )
#' @export
call_to_metadata <- function(
  res,
  metadata,
  cluster_col,
  per_cell = FALSE,
  rename_prefix = NULL
) {
  temp_col_id <- get_unique_column(metadata, "rn")

  df_temp <- res
  if (!is.null(rename_prefix)) {
    eval(parse(
      text = paste0(
        "df_temp <- dplyr::rename(df_temp, ",
        paste0(rename_prefix, "_type"),
        " = type, ",
        paste0(rename_prefix, "_r"),
        " = r)"
      )
    ))
  }

  if (per_cell == FALSE) {
    if (!(cluster_col %in% colnames(metadata))) {
      stop("cluster_col is not a column of metadata", call. = FALSE)
    }

    if (!(cluster_col %in% colnames(res))) {
      stop(
        "cluster_col is not a column ",
        "of called cell type dataframe",
        call. = FALSE
      )
    }

    if (
      !(all(
        unique(df_temp[[cluster_col]]) %in%
          unique(metadata[[cluster_col]])
      ))
    ) {
      stop(
        "cluster_col from clustify step and",
        "joining to metadata step are not the same",
        call. = FALSE
      )
    }

    df_temp_full <-
      suppressWarnings(
        dplyr::left_join(
          tibble::rownames_to_column(
            metadata,
            temp_col_id
          ),
          df_temp,
          by = cluster_col,
          suffix = c("", ".clustify")
        )
      )
    if (tibble::has_rownames(df_temp_full)) {
      df_temp_full <- tibble::remove_rownames(df_temp_full)
    }
    df_temp_full <- tibble::column_to_rownames(
      df_temp_full,
      temp_col_id
    )
  } else {
    colnames(df_temp)[1] <- cluster_col
    names(cluster_col) <- temp_col_id

    df_temp_full <-
      suppressWarnings(
        dplyr::left_join(
          tibble::rownames_to_column(
            metadata,
            temp_col_id
          ),
          df_temp,
          by = cluster_col,
          suffix = c("", ".clustify")
        )
      )
    if (tibble::has_rownames(df_temp_full)) {
      df_temp_full <- tibble::remove_rownames(df_temp_full)
    }
    df_temp_full <-
      tibble::column_to_rownames(
        df_temp_full,
        temp_col_id
      )
  }
  df_temp_full
}

#' From per-cell calls, take highest freq call in each cluster
#'
#' @param res dataframe of idents, such as output of cor_to_call
#' @param metadata input metadata with tsne or umap coordinates and cluster ids
#' @param cluster_col metadata column for cluster
#' @param threshold minimum correlation coefficent cutoff for calling clusters
#' @return new metadata with added columns
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref,
#'     per_cell = TRUE
#' )
#'
#' res2 <- cor_to_call(res)
#'
#' collapse_to_cluster(
#'     res2,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     threshold = 0
#' )
#' @export
collapse_to_cluster <- function(
  res,
  metadata,
  cluster_col,
  threshold = 0
) {
  res_temp <- res
  colnames(res_temp)[1] <- "rn"
  df_temp_full <- as.data.frame(res_temp)
  df_temp_full <-
    dplyr::mutate(df_temp_full, cluster = metadata[[cluster_col]])
  df_temp_full2 <-
    dplyr::group_by(
      df_temp_full,
      !!dplyr::sym("type"),
      !!dplyr::sym("cluster")
    )
  df_temp_full2 <-
    dplyr::summarize(df_temp_full2, sum = sum(!!dplyr::sym("r")), n = n())
  df_temp_full2 <-
    dplyr::group_by(
      df_temp_full2,
      !!dplyr::sym("cluster")
    )
  df_temp_full2 <-
    dplyr::arrange(
      df_temp_full2,
      desc(n),
      desc(sum)
    )
  df_temp_full2 <-
    dplyr::filter(
      df_temp_full2,
      !!dplyr::sym("type") !=
        paste0(
          "r<",
          threshold,
          ", unassigned"
        )
    )
  df_temp_full2 <- dplyr::slice(df_temp_full2, 1)
  df_temp_full2 <-
    dplyr::rename(
      df_temp_full2,
      !!cluster_col := cluster
    )
  dplyr::select(
    df_temp_full2,
    2,
    1,
    tidyr::everything()
  )
}

#' get ranked calls for each cluster
#'
#' @param cor_mat input similarity matrix
#' @param metadata input metadata with tsne or umap coordinates
#' and cluster ids
#' @param cluster_col metadata column, can be cluster or cellid
#' @param collapse_to_cluster if a column name is provided, takes the most
#' frequent call of entire cluster to color in plot
#' @param threshold minimum correlation coefficent cutoff for calling clusters
#' @param rename_prefix prefix to add to type and r column names
#' @param top_n the number of ranks to keep, the rest will be set to 100
#' @return dataframe of cluster, new ident, and r info
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref
#' )
#'
#' cor_to_call_rank(res, threshold = "auto")
#' @export
cor_to_call_rank <- function(
  cor_mat,
  metadata = NULL,
  cluster_col = "cluster",
  collapse_to_cluster = FALSE,
  threshold = 0,
  rename_prefix = NULL,
  top_n = NULL
) {
  correlation_matrix <- cor_mat
  if (threshold == "auto") {
    threshold <- round(0.75 * max(correlation_matrix), 2)
    message("using threshold of ", threshold)
  }
  df_temp <- tibble::as_tibble(correlation_matrix, rownames = cluster_col)
  df_temp <-
    tidyr::gather(
      df_temp,
      key = !!dplyr::sym("type"),
      value = !!dplyr::sym("r"),
      -!!cluster_col
    )
  df_temp <-
    dplyr::mutate(
      dplyr::group_by_at(df_temp, 1),
      rank = dplyr::dense_rank(desc(!!dplyr::sym("r")))
    )
  df_temp[["rank"]][df_temp$r < threshold] <- 100
  if (!(is.null(top_n))) {
    df_temp <- dplyr::filter(df_temp, rank <= top_n)
  }
  df_temp_full <- df_temp
  if (!is.null(rename_prefix)) {
    eval(parse(
      text = paste0(
        "df_temp_full <- dplyr::rename(df_temp_full, ",
        paste0(rename_prefix, "_type"),
        " = type, ",
        paste0(rename_prefix, "_r"),
        " = r)"
      )
    ))
  }
  df_temp_full
}

#' get concensus calls for a list of cor calls
#'
#' @param list_of_res list of call dataframes from cor_to_call_rank
#' @return dataframe of cluster, new ident, and mean rank
#' @examples
#' res <- clustify(
#'     input = pbmc_matrix_small,
#'     metadata = pbmc_meta,
#'     cluster_col = "classified",
#'     ref_mat = cbmc_ref
#' )
#'
#' res2 <- cor_to_call_rank(res, threshold = "auto")
#' res3 <- cor_to_call_rank(res)
#' call_consensus(list(res2, res3))
#' @export
call_consensus <- function(list_of_res) {
  res <- do.call("rbind", list_of_res)
  df_temp <- dplyr::group_by_at(res, c(1, 2))
  df_temp <- dplyr::summarize_at(df_temp, 2, mean)
  df_temp <- dplyr::top_n(df_temp, -1)
  df_temp <- dplyr::group_by_at(df_temp, c(1, 3))
  df_temp <-
    dplyr::summarize_at(df_temp, 1, function(x) {
      stringr::str_c(x, collapse = "__")
    })
  df_temp <- dplyr::select(df_temp, c(1, 3, 2))
}
NCBI-Hackathons/clustifyR documentation built on July 4, 2025, 10 a.m.