R/geneFreq.R

Defines functions geneFreq

Documented in geneFreq

#' Gene frequencies
#' 
#' Creates a data frame of VDJ gene counts and frequencies.
#' 
#' @param productive_nt A tibble of productive sequences 
#' generated by the LymphoSeq function productiveSeq where the parameter 
#' aggregate is set to "nucleotide".
#' @param locus A character vector indicating which VDJ genes to include in the 
#' output.  Available options include "VDJ", "DJ", "VJ", "DJ", "V", "D", or "J".
#' @param family A Boolean value indicating whether or not family names instead 
#' of gene names are used.  If TRUE, then family names are used and if FALSE, 
#' gene names are used.
#' @return Returns a data frame with the repertoire_id names, VDJ gene name, duplicate_count, and 
#' \% frequency of the V, D, or J genes (each gene frequency should add to 
#' 100\% for each repertoire_id).
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq2")
#' 
#' stable <- readImmunoSeq(path = file_path)
#' 
#' ntable <- productiveSeq(study_table = stable, aggregate = "nucleotide")
#' 
#' geneFreq(ntable, locus = "VDJ", family = FALSE)
#' 
#' @export
#' @import tidyverse dtplyr
geneFreq <- function(productive_nt, locus = "V|D|J", family = FALSE) {
    if (family){
        gene_names <- productive_nt %>% 
                      dplyr::select(repertoire_id, duplicate_count, v_family, j_family, d_family) %>%
                      tidyr::pivot_longer(cols = contains("family"), 
                                          values_to="gene_name", 
                                          names_to="gene_type") %>%
                      dplyr::filter(stringr::str_detect(gene_type, base::paste("[", locus, base::tolower(locus), "]", sep=""))) %>%
                        dtplyr::lazy_dt()
        gene_names <- gene_names %>%
                      dplyr::group_by(repertoire_id, gene_name) %>% 
                      dplyr::summarize(duplicate_count = sum(duplicate_count), 
                                       gene_type = dplyr::first(gene_type)) %>%
                      dplyr::ungroup() %>%
                      dplyr::group_by(repertoire_id, gene_type) %>% 
                      dplyr::mutate(gene_frequency = duplicate_count/sum(duplicate_count)) %>%
                      dplyr::ungroup() %>%
                      dplyr::as_tibble()
    } else {
        gene_names <- productive_nt %>% 
                      dplyr::select(repertoire_id, duplicate_count, v_call, j_call, d_call) %>%
                      tidyr::pivot_longer(cols = contains("call"), 
                                          values_to="gene_name", 
                                          names_to="gene_type") %>%
                      dplyr::filter(stringr::str_detect(gene_type, base::paste("[", locus, base::tolower(locus), "]", sep=""))) %>%
                      dtplyr::lazy_dt()
        gene_names <- gene_names %>%
                      dplyr::group_by(repertoire_id, gene_name) %>% 
                      dplyr::summarize(duplicate_count = sum(duplicate_count),
                                       gene_type = dplyr::first(gene_type)) %>%
                      dplyr::ungroup() %>%
                      dplyr::group_by(repertoire_id, gene_type) %>%
                      dplyr::mutate(gene_frequency = duplicate_count/sum(duplicate_count)) %>%
                      dplyr::ungroup() %>%
                      dplyr::as_tibble()
    }
    return(gene_names)
}
elulu3/LymphoSeqTest documentation built on Aug. 27, 2022, 5:47 a.m.