R/countKmer.R

Defines functions kmerPlot calculateCounts countKmer

Documented in calculateCounts countKmer kmerPlot

#' Find k-mers and its counts
#'
#' Calculate counts of k-mers in the query nucleotide sequence
#'
#' @param study_table A tibble consisting of antigen receptor
#' sequencing imported by the LymphoSeq2 function [readImmunoSeq()].
#' "repertoire_id" and "junction" are required columns.
#' @param k The length of k-mers to find.
#' @param separate A boolean value.
#'  * `TRUE` (the default):  separate the counts of each k-mer by repertoire_id.
#'  * `FALSE` : show cumulative counts instead.
#' @return A tibble with the k-mer and its counts. The counts can be cumulative
#' counts of the entire study_table or counts for each repertoire_id.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing",
#'  package = "LymphoSeq2")
#' study_table <- LymphoSeq2::readImmunoSeq(path = file_path, threads = 1)
#' study_table <- LymphoSeq2::topSeqs(study_table, top = 100)
#' kmer_table <- LymphoSeq2::countKmer(
#'   study_table = study_table, k = 5,
#'   separate = TRUE
#' )
#'
#' @export
countKmer <- function(study_table, k, separate = TRUE) {
  if (separate) {
    kmer_counts <- study_table |>
      dplyr::group_by(repertoire_id) |>
      dplyr::group_split() |>
      purrr::map(~ calculateCounts(.x, k)) |>
      dplyr::bind_rows() |>
      tidyr::pivot_wider(
        id_cols = Kmer, names_from = repertoire_id,
        values_from = Count
      )
    return(kmer_counts)
  } else {
    kmer_counts <- calculateCounts(study_table, k)
    return(kmer_counts)
  }
}
#' Calculate k-mer counts
#' @inheritParams countKmer
calculateCounts <- function(study_table, k) {
  seq <- dplyr::pull(study_table, "junction") |>
    stats::na.omit()
  rep_id <- study_table |>
    dplyr::pull(repertoire_id) |>
    unique()
  kmer_counts <- purrr::map(
    seq,
    function(x) Biostrings::oligonucleotideFrequency(Biostrings::DNAString(x),
                                                     k)
  )
  kmer_counts <- purrr::map(
    kmer_counts,
    function(x) data.frame(Kmer = names(x), Count = unname(x))
  ) |>
    dplyr::bind_rows() |>
    dplyr::group_by(Kmer) |>
    dplyr::summarise(Count = sum(Count)) |>
    dplyr::ungroup() |>
    dplyr::mutate(repertoire_id = rep_id)
  return(kmer_counts)
}

#' Plot kmer distributions
#'
#' Plot k-mer distributions by repertoire id
#'
#' @param kmer_table A tibble of k-mer counts generated by the LymphoSeq2
#' function countKmer where the separate parameter is set to TRUE.
#' @param top The number of top k-mer to show
#' @return A stacked bar chart showing k-mer distributions by repertoire_id
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing",
#'  package = "LymphoSeq2")
#' study_table <- LymphoSeq2::readImmunoSeq(path = file_path, threads = 1)
#' study_table <- LymphoSeq2::topSeqs(study_table, top = 100)
#' kmer_table <- LymphoSeq2::countKmer(study_table = study_table, k = 5,
#'  separate = TRUE)
#' kmer_distributions <- LymphoSeq2::kmerPlot(kmer_table, top = 10)
#' @export
kmerPlot <- function(kmer_table, top = 10) {
  kmer_table <- kmer_table |>
    tidyr::pivot_longer(!Kmer,
      names_to = "repertoire_id",
      values_to = "count"
    )
  rep_num <- length(unique(kmer_table$repertoire_id))
  kmer_table <- kmer_table |>
    dplyr::group_by(Kmer) |>
    dplyr::mutate(total = sum(count)) |>
    dplyr::arrange(desc(total)) |>
    head(top * rep_num)
  bar <- ggplot2::ggplot(kmer_table, ggplot2::aes(
    fill = repertoire_id, y = count,
    x = Kmer
  )) +
    ggplot2::geom_bar(position = "stack", stat = "identity") +
    ggplot2::theme(axis.text.x = ggplot2::element_text(
      angle = 90, vjust = 0.5,
      hjust = 1
    ))
  return(bar)
}
shashidhar22/LymphoSeq2 documentation built on Jan. 16, 2024, 4:29 a.m.