R/allele_genotype.R

Defines functions extractASCTable recentAlleleClusters

Documented in extractASCTable recentAlleleClusters

# ------------------------------------------------------------------------------
# Allele clusters functions
# The functions in this scripts are for generating the allele clusters from a given reference set

#' @include piglet.R
#' @include utils.R
NULL


## TODO: decide if a class is needed

# ------------------------------------------------------------------------------
#' zenodoArchive
#' @docType class
#' @return Object of \code{R6Class} for modelling an zenodoArchive for ASC cluster files
#' @format \code{R6Class} object.
#'
#' @examples
#' \donttest{
#'   zenodo_archive <- zenodoArchive$new(
#'      doi = "10.5281/zenodo.7401189"
#'   )
#'
#'   # view available version ins the archive
#'   archive_versions <- zenodo_archive$get_versions()
#'
#'   # Getting the available files in the latest zenodo archive version
#'   files <- zenodo_archive$get_version_files()
#'
#'   # downloading the first file from the latest archive version
#'   zenodo_archive$download_zenodo_files()
#' }
#' @export
zenodoArchive <- R6::R6Class(
  "zenodoArchive",
  list(
    #' @field doi zenodoArchive doi, NULL is not supplied
    doi = NULL,
    #' @field all_versions zenodoArchive if to return all versions, `true` when not specified
    all_versions = "true",
    #' @field sort zenodoArchive how to sort the records, `mostrecent` when not specified
    sort = "mostrecent",
    #' @field page zenodoArchive which page to pull in query, `1` when not specified
    page = 1,
    #' @field size zenodoArchive how many records per page,  `20` when not specified
    size = 20,
    #' @field zenodoVersions zenodoArchive doi available version, a storing variable.
    zenodoVersions = NULL,
    #' @field zenodoQuery zenodoArchive doi version query, a storing variable.
    zenodoQuery = NULL,
    #' @field download_file zenodoArchive doi downloads files, a storing variable.
    download_file = NULL,
    #' @field download_url zenodoArchive doi downloads urls, a storing variable.
    download_url = NULL,
    
    
    #' @description initializes the zenodoArchive
    #' @param doi A zenodo doi. To retrieve all records supply a concept doi (a generic doi common to all versions).
    #' @param page Which page to query. Default is 1
    #' @param size How many records per page. Default is 20
    #' @param all_versions If to return all concept doi versions. If `true` returns all, if `false` returns the latest. Default is `ture`
    #' @param sort Which sorting to apply on the records. Default is `mostrecent`. Possible sortings "bestmatch", "mostrecent", "-mostrecent" (ascending), "version", "-version" (ascending).
    initialize = function(doi,
                          page = 1,
                          size = 20,
                          all_versions = "true",
                          #c("true","false"),
                          sort = "mostrecent") {
      # c("bestmatch","mostrecent","-mostrecent","version","-version")
      stopifnot(is.character(doi), length(doi) == 1)
      stopifnot(is.character(all_versions),
                length(all_versions) == 1,
                any(all_versions %in% c("true", "false")))
      stopifnot(is.character(sort),
                length(sort) == 1,
                any(
                  sort %in% c(
                    "bestmatch",
                    "mostrecent",
                    "-mostrecent",
                    "version",
                    "-version"
                  )
                ))
      stopifnot(is.numeric(page), length(page) == 1)
      stopifnot(is.numeric(size), length(size) == 1)
      self$doi <- doi
      self$all_versions <- all_versions
      self$sort <- sort
      self$size <- size
      self$page <- page
      self$zenodoVersions <- NULL
      self$zenodoQuery <- NULL
      self$download_file <- NULL
      self$download_url <- NULL
    },
    #' @description cleans the doi record for query
    #' @param doi  The zenodo archive doi
    #' @return the clean doi
    clean_doi = function(doi = self$doi) {
      gsub("10.5281/zenodo.", "", doi, fixed = T)
    },
    #' @description Query the zenodo archive according to the initial parameters.
    #' @param ...   Excepts the self created by `initialize`
    #' @return a list with the query values.
    zenodo_query = function(...) {
      url <-
        sprintf(
          'https://zenodo.org/api/records/?q=conceptrecid:%s&all_versions=%s&sort=%s&page=%s&size=%s',
          self$clean_doi(),
          self$all_versions,
          self$sort,
          self$page,
          self$size
        )
      version_query <- jsonlite::read_json(url)
      
      if (length(version_query$hits$hits) == 0) {
        # try querying the doi. If succeeded doi supplied is not conceptrecid, if fails then doi is incorect
        url <-
          sprintf(
            'https://zenodo.org/api/records/?q=%s&all_versions=%s&sort=%s&page=%s&size=%s',
            self$clean_doi(),
            self$all_versions,
            self$sort,
            self$page,
            self$size
          )
        
        version_query <- jsonlite::read_json(url)
        
        stopifnot("doi supplied does not match any zenodo record" = length(version_query$hits$hits) !=
                    0)
        
        message(
          sprintf(
            "doi supplied is not an 'all versions doi'\nfor viewing all of the archive records change the doi to:%s",
            version_query$hits$hits[[1]]$conceptdoi
          )
        )
      }
      self$zenodoQuery <- version_query
      self$zenodoQuery
    },
    #' @description Extract all concept doi available versions.
    #' @param ...   Excepts the self created by `initialize`
    #' @return a data.frame of the available versions.
    get_versions = function(...) {
      req <-
        if (is.null(self$zenodoQuery))
          self$zenodo_query()
      else
        self$zenodoQuery
      self$zenodoVersions <-
        do.call(rbind,
                sapply(req$hits$hits, function(x)
                  data.frame(
                    doi = x$metadata$doi,
                    version = x$metadata$version,
                    date = x$metadata$publication_date
                  ), simplify = F))
      self$zenodoVersions
    },
    #' @description get the chosen doi archive version available files
    #' @param version which archive version files to get. Default to latest. To see all available version use `get_versions`
    #' @return a list of the available files in the archive version.
    get_version_files = function(version = "latest") {
      versions <-
        if (is.null(self$zenodoVersions))
          self$get_versions()
      else
        self$zenodoVersions
      
      doi_version <-
        if (version == "latest")
          versions$doi[rev(order(versions$date))][1]
      else
        versions$doi[self$zenodoVersions$version == version]
      
      query <-
        jsonlite::fromJSON(paste0(
          "https://zenodo.org/api/records/",
          self$clean_doi(doi_version)
        ))
      self$download_url <- query$files$links$self
      self$download_file <- basename(query$files$key)
      self$download_file
    },
    #' @description get the chosen doi archive version available files
    #' @param file If supplied, downloads the specific file from the archive.
    #' @param path The output folder for saving the archive files. Default is to a temporary directory.
    #' @param version which archive version files to get. Default to latest. To see all available version use `get_versions`
    #' @param get_file_path Logical (FALSE by default). Do you want to return the path for the file downloaded.
    #' @param all_files Logical (FALSE by default). Do you want to download all files in the archive.
    #' @return If `get_file_path` is TRUE, the function returns the path to the archive file
    #' @param quite     Logical (FALSE by default). Do you want to suppress informative messages
    download_zenodo_files = function(file = NULL,
                                     path = tempdir(),
                                     version = "latest",
                                     all_files = F,
                                     get_file_path = F,
                                     quite = F) {
      if (is.null(self$download_url)) {
        invisible(self$get_version_files(version))
        url <- self$download_url
      } else{
        url <- self$download_url
      }
      if (!is.null(file)) {
        if (is.na(match(file, self$download_file))) {
          message(paste0("Input file is not found in chosen version "))
          if (!is.null(self$download_file))
            message(paste0(
              "The available file are: ",
              paste0(self$download_file, collapse = ",")
            ))
          stop()
        }
        
        url <- grep(file, url, fixed = T, value = T)
      } else{
        file <- self$download_file
      }
      if (all_files) {
        lapply(1:length(file), function(i)
          download.file(url = url[1], file.path(path, file[i])), quite = quite)
        
        if (get_file_path)
          file.path(path, file)
      } else{
        download.file(url[1],  file.path(path, file[1]), quite = quite)
        if (get_file_path)
          file.path(path, file[1])
      }
      
      
    }
  )
)

#' Retrieving allele similarity clusters Zenodo archive
#' 
#' A wrapper function for `zenodoArchive`, download the most recent allele similarity clusters and thresholds from the zenodo archive.
#' The clusters and thresholds are based on \url{https://yaarilab.github.io/IGHV_reference_book/}
#' At the moment only available for human IGHV reference set.
#'
#' @param doi       The doi for the archive to download. Default is the IGHV set.
#' @param path      The output folder for saving the archive files. Default is to a temporary directory.
#' @param get_file  Logical (FALSE by default). Do you want to return the path for the file downloaded.
#' @param quite     Logical (FALSE by default). Do you want to suppress informative messages
#'
#' @return
#'
#' If get_file is TRUE, the function returns the path to the archive file
#'
#' @examples
#' 
#' \donttest{
#' recentAlleleClusters(doi="10.5281/zenodo.7401189")
#' }
#'
#' @export
recentAlleleClusters <-
  function(doi = "10.5281/zenodo.7401189",
           path,
           get_file = FALSE,
           quite = FALSE) {
    if (missing(path)) {
      path <- tempdir()
      if (!quite)
        message(paste0("Files will be downloaded to tmp directory: ", path))
    }
    
    zenodo_archive <- zenodoArchive$new(doi = doi)
    output <-
      zenodo_archive$download_zenodo_files(path = path,
                                           quite = quite,
                                           get_file_path = get_file)
    
    if (get_file)
      return(output)
  }

#' Extracts the allele cluster table from the archive file.
#'
#' @param archive_file            A path to the asc archive file. Default is null. (see details)
#'
#' @details
#'
#' For downloading the latest archive file with the updated allele cluster table, use the function \code{recentAlleleClusters}.
#'
#' @return
#'
#' Returns the allele cluster table.
#'
#' The table columns:
#' `new_allele` - the ASC given allele name
#' `func_group` - the ASC cluster number
#' `imgt_allele` - the original IUIS/IMGT allele name
#' `thresh` - the allele threshold for ASC-based genotype inference
#' `amplicon_length` - is the original length of the reference set.
#' 
#' @examples
#' 
#' \donttest{
#' asc_archive <- recentAlleleClusters(doi="10.5281/zenodo.7429773", get_file = TRUE)
#'
#' allele_cluster_table <- extractASCTable(archive_file = asc_archive)
#' }
#' 
#'
#' @export

extractASCTable <- function(archive_file = NULL) {
  . <- NULL
  
  if (!is.null(archive_file)) {
    master <- as.character(unzip(archive_file, list = TRUE)$Name)
    allele_cluster_table <-
      read.delim(unz(
        archive_file,
        grep("asc_alleles_table.tsv", master, value = T)
      ))
  }
  
  setDT(allele_cluster_table)
  allele_cluster_table <- allele_cluster_table[, .("imgt_allele" = unlist(strsplit(get("imgt_allele"),"/"))), by = setdiff(names(allele_cluster_table), "imgt_allele")]
  
  return(allele_cluster_table)
}

#' Converts IGHV germline set to ASC germline set.
#'
#' @param allele_cluster_table    The allele cluster table.
#' @param germline                An IGHV germline set with matching names to the "imgt_allele" column in the allele_cluster_table.
#'
#' @return
#'
#' Returns the IGHV germline set with the ASC allele names.
#'
#' @examples
#' 
#' # preferably obtain the latest ASC cluster table
#' # asc_archive <- recentAlleleClusters(doi="10.5281/zenodo.7429773", get_file = TRUE)
#'
#' # allele_cluster_table <- extractASCTable(archive_file = asc_archive)
#'
#' data(HVGERM)
#'
#' # example allele similarity cluster table
#' data(allele_cluster_table)
#'
#' asc_germline <- germlineASC(allele_cluster_table, germline = HVGERM)
#'
#' 
#'
#' @export
germlineASC <- function(allele_cluster_table, germline) {
  . <- NULL
  
  setDT(allele_cluster_table)
  if (any(grepl("/", allele_cluster_table$imgt_allele))) {
    allele_cluster_table <- allele_cluster_table[, .("imgt_allele" = unlist(strsplit(get("imgt_allele"),"/"))), by = setdiff(names(allele_cluster_table), "imgt_allele")]
  }
  
  germline_asc <- germline[allele_cluster_table$imgt_allele]
  
  asc_alleles <-
    setNames(allele_cluster_table$new_allele,
             allele_cluster_table$imgt_allele)
  
  names(germline_asc) <- asc_alleles[names(germline_asc)]
  
  germline_asc <- germline_asc[!duplicated(germline_asc)]
  
  return(germline_asc)
}


# ------------------------------------------------------------------------------

#' Assign allele similarity clusters
#' 
#' \code{assignAlleleClusters} uses the allele clusters annotation to change the preliminary allele
#' assignments to the new annotations before inferring a genotype.
#'
#' @param data                  data.frame in AIRR format, containing V allele calls from a single subject and the sample IMGT-gapped V(D)J sequences under seq.
#' @param v_call                name of the V allele call column. Default is `v_call`
#' @param alleleClusterTable    A data.frame of the allele clusters new annotations relative to the original reference set. See details.
#'
#' @return
#' A modified input \code{data.frame} with the new assigned
#'
#'@examples
#'
#'
#' # preferably obtain the latest ASC cluster table
#' # asc_archive <- recentAlleleClusters(doi="10.5281/zenodo.7429773", get_file = TRUE)
#'
#' # allele_cluster_table <- extractASCTable(archive_file = asc_archive)
#'
#' # example allele similarity cluster table
#' data(allele_cluster_table)
#'
#' # loading TIgGER AIRR-seq b cell data
#' data <- tigger::AIRRDb
#'
#' asc_data <- assignAlleleClusters(data, allele_cluster_table)
#'
#'
#' @export
assignAlleleClusters <-
  function(data, alleleClusterTable, v_call = "v_call") {
    . <- NULL
    
    setDT(alleleClusterTable)
    if (any(grepl("/", alleleClusterTable$imgt_allele))) {
      alleleClusterTable <- alleleClusterTable[, .("imgt_allele" = unlist(strsplit(get("imgt_allele"),"/"))), by = setdiff(names(alleleClusterTable), "imgt_allele")]
    }
    
    # set the dictionary
    germline_set <-
      setNames(alleleClusterTable$new_allele,
               alleleClusterTable$imgt_allele)
    
    # switch the assignments
    data[[v_call]] <- sapply(data[[v_call]], function(x) {
      calls <- unlist(strsplit(x, ","))
      calls <- germline_set[calls]
      calls <- calls[!duplicated(calls)]
      paste0(calls, collapse = ",")
    }, USE.NAMES = F)
    
    return(data)
  }

# ------------------------------------------------------------------------------

#' Allele based genotype inference
#' 
#' \code{inferGenotypeAllele} infer an individual's genotype based on the allele-base method.
#' The method utilize the allele specific threshold to determine the presence of an allele in the genotype.
#' More specifically, based on the allele frequency, repertoire depth, and the specific allele threshold, a confidence level (Z score) is calculated 
#' for the presence of the allele in the genotype. The user can select the confidence level for the genotype inference.
#'
#' @param data                     data.frame in AIRR format, containing allele calls from a single subject and the sample IMGT-gapped V(D)J sequences under seq.
#' @param allele_threshold_table   A data.frame of the alleles and their thresholds. 
#' @param call                     name of the V,D, or J allele call column, i.e v_call, d_call, j_call. Default is `v_call`
#' @param asc_annotation           Logical (FALSE by default). Are the allele calls annotated with the allele similarity clusters.
#' @param translate_to_asc         For V allele calls, collapse identical allele for the genotype inference. Default is FALSE.
#' @param single_assignment        if TRUE, the method only considers sequence with single assignment for the genotype inference.
#' @param germline_db              named vector of sequences containing the germline sequences named in V allele calls and the alleleClusterTable. Only required if find_unmutated is TRUE.
#' @param find_unmutated           if TRUE, use germline_db to find which samples are unmutated. Not needed if V allele calls only represent unmutated samples.
#' @param seq                      name of the column in data with the aligned, IMGT-numbered, V(D)J nucleotide sequence. Default is sequence_alignment.
#' @param default_allele_threshold The default allele threshold for the genotype inference, in case the allele threshold is not in the `allele_threshold_table`. Default is 1e-04.
#' @param quiet                    Logical (TRUE by default). Do you want to suppress informative messages
#' @return
#' A a data.frame with the inferred V genotype. The table contains the following columns:
#' 
#' - allele: The alleles in the `allele_threshold_table`.
#' - counts: The number of reads for each alleles.
#' - depth: The total number of reads in the genotype (Sum of counts).
#' - threshold: The population driven allele thresholds for genotype presence.
#' - z_score: The confidence level for the presence of the allele in the genotype.
#' - asc_allele: If `translate_to_asc` is true, the asc allele value from allele_threshold_table.
#'
#' @details
#'
#' In naive repertoires, allele calls where more than one assignment is assigned is rare. Hence, in case the data represents the naive repertoire of a subject
#' it is recommended to use the `find_unmutated=TRUE` option, to remove mutated sequences. For non-naive population, the allele calls in cases of multiple assignment
#' are treated as belonging to all groups.
#'
#' @seealso
#'
#' \link{inferAlleleClusters} will infer the allele clusters based on a supplied V reference set and set the default allele threshold of 1e-04.
#' See \link{recentAlleleClusters} to obtain the latest version of the IGHV allele clusters and the naive population based allele threshold.
#'
#'@examples
#'
#'
#' # loading TIgGER AIRR-seq b cell data
#' data <- tigger::AIRRDb
#'
#' # allele threshold table
#' data(allele_threshold_table)
#'
#' data(HVGERM)
#'
#' # inferring the genotype
#' genotype <- inferGenotypeAllele(
#' data = data,
#' allele_threshold_table = allele_threshold_table,
#' germline_db = HVGERM, find_unmutated=TRUE)
#' 
#' # filter alleles with z_score >= 0 
#' 
#' head(genotype[genotype$z_score >= 0,])
#'
#' @export
# Parts are adapted from tigger::inferGenotype
inferGenotypeAllele <-
  function(data,
           allele_threshold_table=NULL,
           call = "v_call",
           asc_annotation = FALSE,
           single_assignment = FALSE,
           translate_to_asc = FALSE,
           germline_db = NA,
           find_unmutated = FALSE,
           seq = "sequence_alignment",
           default_allele_threshold = 1e-04,
           quiet = TRUE) {
    . = NULL
    .. = NULL
    
    # only clean the allele calls if they don't start with IG/TR
    if (any(!grepl("IG|TR", data[[call]]))) {
      allele_calls <- clean_allele_calls(data[[call]])
    }else{
      allele_calls <- data[[call]]
    }
    
    unique_calls <- unique(unlist(allele_calls))
    allele_calls <- sapply(allele_calls, function(x) paste0(x, collapse = ","))
    segment <- unique(substr(unique_calls,4,4))
    ## check that the allele calls are in the supplied allele_threshold_table
    if(is.null(allele_threshold_table)){
      ## load the default allele threshold table
      data(allele_threshold_table, envir = environment())
    }
    
    if(!"tag" %in% names(allele_threshold_table)){
      allele_threshold_table[,"tag":=substr(get("allele"),4,4)]
    }
    
    asc_col <- "allele"
    if(asc_annotation){
      asc_col <- "asc_allele"
    }
    
    allele_threshold_table <- allele_threshold_table[get("tag") %in% segment]
    match <- unique_calls %in% allele_threshold_table[[asc_col]]
    if (!all(match)) {
      for(allele in unique_calls[!match]) {
        if(!quiet) warning(paste0("The allele call ", allele, " is not in the allele threshold table. Using the default threshold: ", default_allele_threshold))
        allele_threshold_table <- rbind(allele_threshold_table, 
                                    data.frame(
                                      "tag" = substr(allele,4,4),
                                      "allele" = allele,
                                      "asc_allele" = allele,
                                      "threshold" = default_allele_threshold
                                      ))
      }
    }
    
    allele_threshold_table[,"genotyped_allele":=get(asc_col)]
    
    base_count <- 1/length(unique(allele_threshold_table[["genotyped_allele"]]))
    
    ## check unmutated
    if (find_unmutated) {
      if (is.na(germline_db[1])) {
        stop("germline_db needed if find_unmutated is TRUE")
      }
      allele_calls <-
        tigger::findUnmutatedCalls(allele_calls, as.character(data[[seq]]),
                                   germline_db)
      if (length(allele_calls) == 0) {
        stop("No unmutated sequences found! Set 'find_unmutated' to 'FALSE'.")
      }
    }
    
    genotype_dt <- data.table::data.table(
      "genotyped_allele" = allele_calls)
    
    if(single_assignment){
      genotype_dt <- genotype_dt[!grepl(",",get("genotyped_allele")),]    
    }
    
    genotype_dt[,"multiple":=1/(stringi::stri_count_fixed(get("genotyped_allele"), ",")+1)]
    genotype_dt[,"nrow":=1:.N]
    genotype_dt <- genotype_dt[, .("genotyped_allele" = unlist(strsplit(get("genotyped_allele"), ","))), by = mget(c("multiple", "nrow"))]
    genotype_dt[,"single_assignments":=sum(get("multiple")==1), by = .(get("genotyped_allele"))]
    genotype_dt[,"multiple_assignments_wheight":=ifelse(get("single_assignments") == 0,
                                                        base_count,
                                                        get("single_assignments") / length(unique(allele_threshold_table$allele))),
                by = mget(c("genotyped_allele", "nrow"))]
    genotype_dt[get("multiple")==1, "multiple_assignments_wheight":=1]
    genotype_dt[,"fraction":=get("multiple")*get("multiple_assignments_wheight")]
    genotype_dt <- genotype_dt[,.("count" = sum(get("fraction"))), by = mget(c("genotyped_allele"))]
    genotype_dt <- merge(allele_threshold_table, genotype_dt, by = c("genotyped_allele"), all.x = T, all.y=F)
    genotype_dt[,"gene" := alakazam::getGene(
      get("genotyped_allele"),
      first = F,
      collapse = T,
      strip_d = F
    )]
    ## if translate_to_asc, then collapse similar alleles by asc.
    final_columns <- c("gene","genotyped_allele", "count", "depth", "threshold", "z_score")
    if(translate_to_asc){
      genotype_dt <- genotype_dt[,.(
        "gene" = paste0(unique(get("gene")),collapse = "/"),
        "genotyped_allele" = paste0(get("genotyped_allele"),collapse = "/"),
        "count" = sum(get("count"), na.rm = T),
        "threshold" = min(get("threshold"))
      ), by =.(get("asc_allele"))]
      final_columns <- c("gene","genotyped_allele", "count", "depth", "threshold", "z_score")
    }
    ## add base counts
    genotype_dt[is.na(get("count")), "count" := base_count]
    genotype_dt[, "depth" := sum(get("count"))]
    
    ## get the z_score
    genotype_dt[, "z_score" := z_score(get("count"), get("depth"), get("threshold"))]
    genotype_dt <- genotype_dt[, .SD, .SDcols = final_columns]
    names(genotype_dt)[2] <- "allele"
    return(genotype_dt)
}

# ------------------------------------------------------------------------------

#' Allele similarity cluster based genotype inference Testing function
#' 
#' \code{inferGenotypeAllele_asc} infer an individual's genotype based on the allele-base method.
#' The method utilize the allele specific threshold to determine the presence of an allele in the genotype.
#' More specifically, the absolute frequency of each allele is calculated and checked against the threshold.
#'
#' @param data                     data.frame in AIRR format, containing V allele calls from a single subject and the sample IMGT-gapped V(D)J sequences under seq.
#' @param alleleClusterTable       A data.frame of the allele similarity clusters thresholds.
#' @param v_call                   name of the V allele call column. Default is `v_call`
#' @param single_assignment        if TRUE, the method only considers sequence with single assignment for the genotype inference.
#' @param germline_db              named vector of sequences containing the germline sequences named in V allele calls and the alleleClusterTable. Only required if find_unmutated is TRUE.
#' @param find_unmutated           if TRUE, use germline_db to find which samples are unmutated. Not needed if V allele calls only represent unmutated samples.
#' @param seq                      name of the column in data with the aligned, IMGT-numbered, V(D)J nucleotide sequence. Default is sequence_alignment.
#' @param confidence_level         The confidence level on which to filter the inferred genotype alleles. Default is NULL, meaning filtering only based on allele threshold.
#' @param default_allele_threshold The default allele threshold for the genotype inference, in case the allele threshold is not in the `alleleClusterTable`. Default is 1e-04.
#'
#' @return
#' A a data.frame with the inferred V genotype. The table contains the following columns:
#' 	|gene            | alleles             | imgt_alleles          | counts              | absolute_fraction     | absolute_threshold               | genotyped_alleles    | genotype_imgt_alleles|
#'  |----------------|---------------------|-----------------------|---------------------|-----------------------|----------------------------------|----------------------|----------------------|
#'  | allele cluster | the present alleles | the imgt nomenclature | the number of reads | the absolute fraction | the population driven allele     | the alleles which    | the imgt nomenclature|
#'  |                | in the repertoire   | of the alleles        | for each alleles    | of the alleles        | thresholds for genotype presence | entered the genotype | of the alleles 		  |
#'
#' @details
#'
#' In naive repertoires, allele calls where more than one assignment is assigned is rare. Hence, in case the data represents the naive repertoire of a subject
#' it is recommended to use the `find_unmutated=TRUE` option, to remove mutated sequences. For non-naive population, the allele calls in cases of multiple assignment
#' are treated as belonging to all groups.
#'
#' @seealso
#'
#' \link{inferAlleleClusters} will infer the allele clusters based on a supplied V reference set and set the default allele threshold of 1e-04.
#' See \link{recentAlleleClusters} to obtain the latest version of the IGHV allele clusters and the naive population based allele threshold.
#'
#'@examples
#'
#'
#' # loading TIgGER AIRR-seq b cell data
#' data <- tigger::AIRRDb
#'
#' # preferably obtain the latest ASC cluster table
#' # asc_archive <- recentAlleleClusters(doi="10.5281/zenodo.7429773", get_file = TRUE)
#'
#' # allele_cluster_table <- extractASCTable(archive_file = asc_archive)
#'
#' # example allele similarity cluster table
#' data(allele_cluster_table)
#'
#' data(HVGERM)
#'
#' # reforming the germline set
#' asc_germline <- germlineASC(allele_cluster_table, germline = HVGERM)
#'
#' # assigning the ASC alleles
#' asc_data <- assignAlleleClusters(data, allele_cluster_table)
#'
#' # inferring the genotype
#' asc_genotype <- inferGenotypeAllele_asc(
#' data = asc_data,
#' alleleClusterTable = allele_cluster_table,
#' germline_db = asc_germline, find_unmutated=TRUE)
#' 
#' @export
inferGenotypeAllele_asc <- function(data,
         alleleClusterTable,
         v_call = "v_call",
         single_assignment = FALSE,
         germline_db = NA,
         find_unmutated = FALSE,
         seq = "sequence_alignment",
         confidence_level = NULL,
         default_allele_threshold = 1e-04) {
  . = NULL
  
  allele_calls = clean_allele_calls(data[[v_call]])
  
  ## check that the allele calls are in the supplied alleleClusterTable
  unique_calls <- unique(unlist(allele_calls))
  match <- unique_calls %in% alleleClusterTable$new_allele
  if (!all(match)) {
    
    for(allele in unique_calls[!match]) {
      warning(paste0("The allele call ", allele, " is not in the alleleClusterTable. Using the default threshold: ", default_allele_threshold))
      
      alleleClusterTable <- rbind(alleleClusterTable, 
                                  data.frame(new_allele = allele,
                                             func_group = strsplit(allele,'[*]')[[1]][1], 
                                             imgt_allele = allele,
                                             thresh = default_allele_threshold))
    }
    
  }
  
  ## check unmutated
  if (find_unmutated) {
    if (is.na(germline_db[1])) {
      stop("germline_db needed if find_unmutated is TRUE")
    }
    allele_calls <-
      tigger::findUnmutatedCalls(allele_calls, as.character(data[[seq]]),
                                 germline_db)
    if (length(allele_calls) == 0) {
      stop("No unmutated sequences found! Set 'find_unmutated' to 'FALSE'.")
    }
  }
  
  geno_V <- data.table::data.table(v_call = allele_calls)
  
  geno_V <-
    geno_V[, "gene" := alakazam::getGene(
      get("v_call"),
      first = F,
      collapse = T,
      strip_d = F
    )]
  
  # removing multiple gene assignments
  geno_V <- geno_V[!grepl(",", get("gene"))]
  
  # clean the allele class
  geno_V[, "v_allele" :=
           gsub(
             "(IG[HKL][VDJADEGMC]|TR[ABDG])[A-Z0-9\\(\\)]+[-/\\w]*[*]",
             "",
             get("v_call"),
             perl = T
           )]
  
  ## get single assignments
  if (single_assignment) {
    geno_V <- geno_V[!(grepl(",", get("v_allele")))]
    geno_V <- geno_V[!is.na(get("v_allele"))]
    geno_V[, "n_row_sub" := .N]
    geno_V[, "frac" := 1]
    geno_V_fraction <-
      geno_V[, .("absolute_fraction" = round(sum(get("frac")) / unique(get("n_row_sub")), 8),
                 "count" = sum(get("frac"))), by = list(get("gene"), get("v_allele"))]
  } else{
    ### distribute the multiple assignments for non naive sequences
    geno_V <- geno_V[!is.na(get("v_allele"))]
    geno_V[, "n_row_sub" := .N]
    geno_V[, "frac" := 1]
    geno_V <-
      geno_V[, .("count" = sum(get("frac"))), by = mget(c("gene", "v_allele", "n_row_sub"))]
    
    n_row_sub <- unique(geno_V$n_row_sub)
    
    geno_V_fraction <- c()
    #### code from TIgGER inferGentoype
    for (g in unique(geno_V$gene)) {
      ac <- geno_V[get("gene") == g, get("v_allele")]
      t_ac <-
        setNames(geno_V[get("gene") == g, get("count")], geno_V[get("gene") == g, get("v_allele")]) # table of allele calls
      potentials <-
        unique(unlist(strsplit(names(t_ac), ","))) # potential alleles
      
      regexpotentials <-
        paste(gsub("\\*", "\\\\*", potentials), "$", sep = "")
      regexpotentials <-
        paste(regexpotentials, gsub("\\$", ",", regexpotentials), sep =
                "|")
      tmat <-
        sapply(regexpotentials, function(x)
          grepl(x, names(t_ac), fixed = FALSE))
      
      if (length(potentials) == 1 | length(t_ac) == 1) {
        seqs_expl <-
          t(as.data.frame(apply(t(as.matrix(tmat)), 2, function(x)
            x *
              t_ac)))
        rownames(seqs_expl) <- names(t_ac)[1]
      } else{
        seqs_expl <- as.data.frame(apply(tmat, 2, function(x)
          x *
            t_ac))
      }
      #       seqs_expl = as.data.frame(apply(tmat, 2, function(x) x*t_ac))
      colnames(seqs_expl) <- potentials
      # Add low (fake) counts
      sapply(colnames(seqs_expl), function(x) {
        if (sum(rownames(seqs_expl) %in% paste(x)) == 0) {
          seqs_expl <<- rbind(seqs_expl, rep(0, ncol(seqs_expl)))
          
          rownames(seqs_expl)[nrow(seqs_expl)] <<- paste(x)
          seqs_expl[rownames(seqs_expl) %in% paste(x), paste(x)] <<-
            0.01
          
        }
      })
      
      # Build ratio dependent allele count distribution of multi assigned reads
      seqs_expl_single <-
        seqs_expl[grep(',', rownames(seqs_expl), invert = T),]
      
      seqs_expl_multi <-
        seqs_expl[grep(',', rownames(seqs_expl), invert = F),]
      if (is.null(nrow(seqs_expl_multi))) {
        seqs_expl_multi <- t(as.data.frame(seqs_expl_multi))
        rownames(seqs_expl_multi) <-
          grep(',',
               rownames(seqs_expl),
               invert = F,
               value = T)
      }
      
      if (!is.null(nrow(seqs_expl_single))  &&
          nrow(seqs_expl_single) != 0 &&
          nrow(seqs_expl_single) != nrow(seqs_expl)) {
        if (nrow(seqs_expl_multi) > 1) {
          seqs_expl_multi <-
            seqs_expl_multi[order(nchar(row.names(seqs_expl_multi))),]
        }
        sapply(1:nrow(seqs_expl_multi), function(x) {
          genes <- unlist(strsplit(row.names(seqs_expl_multi)[x], ','))
          
          counts <-
            seqs_expl_single[rownames(seqs_expl_single) %in% genes, genes]
          counts <- colSums(counts)
          counts_to_distribute <- seqs_expl_multi[x, genes]
          
          new_counts <-
            counts + ((counts_to_distribute * counts) / sum(counts))
          for (i in 1:length(new_counts)) {
            gene_tmp <- names(new_counts)[i]
            seqs_expl_single[rownames(seqs_expl_single) %in% gene_tmp, gene_tmp] <<-
              new_counts[i]
          }
        })
      }
      
      seqs_expl <-
        if (is.null(nrow(seqs_expl_single)) ||
            nrow(seqs_expl_single) == 0) {
          seqs_expl
        } else{
          seqs_expl_single
        }
      seqs_expl <- round(seqs_expl)
      if (sum(rowSums(seqs_expl) == 0) != 0) {
        seqs_expl <- seqs_expl[rowSums(seqs_expl) != 0, ]
      }
      
      allele_tot <-
        sort(apply(seqs_expl, 2, sum), decreasing = TRUE)
      
      gene_table <-
        data.table::data.table(
          "gene" = g,
          "v_allele" = names(allele_tot),
          "count" = allele_tot,
          "n_row_sub" = n_row_sub
        )
      gene_table <- gene_table[get("count") != 0]
      
      geno_V_fraction <- dplyr::bind_rows(geno_V_fraction, gene_table)
    }
    ############
    geno_V_fraction <-
      geno_V_fraction[, .("absolute_fraction" = round(get("count") / unique(get("n_row_sub")), 8),
                          "count" = get("count")), by = mget(c("gene", "v_allele"))]
    
  }
  
  ## add original allele and cut off
  geno_V_fraction[, "v_call" := paste0(get("gene"), "*", get("v_allele"))]
  
  if (any(duplicated(alleleClusterTable$new_allele))) {
    alleleClusterTable <- setDT(alleleClusterTable)
    
    alleleClusterTable <-
      alleleClusterTable[, .("imgt_allele" = paste0(sort(unlist(unique(mget(
        c("imgt_allele")
      )))), collapse = "/")), by = mget(names(alleleClusterTable)[names(alleleClusterTable) !=
                                                                    "imgt_allele"])]
  }
  
  alleles_clusters <-
    setNames(alleleClusterTable$imgt_allele,
             alleleClusterTable$new_allele)
  geno_V_fraction[, "v_call_or" := alleles_clusters[get("v_call")]]
  
  
  na_id <- which(is.na(geno_V_fraction$v_call_or))
  if (length(na_id) != 0)
    geno_V_fraction$v_call_or[na_id] <-
    sapply(na_id, function(i) {
      new_allele <- geno_V_fraction$v_call[i]
      closest <- strsplit(geno_V_fraction$v_call[i], "_")[[1]][1]
      or_allele <- alleles_clusters[closest]
      paste0(or_allele, gsub(closest, "", new_allele, fixed = T))
    })
  
  allele_cluster_threshold <-
    setNames(as.numeric(alleleClusterTable$thresh), alleleClusterTable$new_allele)
  geno_V_fraction <-
    geno_V_fraction[, "absolute_thresh" := allele_cluster_threshold[get("v_call")]]
  
  z_score <- function(Nai, N, Tai) (Nai - Tai*N) / sqrt(Tai*N*(1-Tai))
  
  geno_V_fraction <-
    geno_V_fraction[, "genotype_confidence" := z_score(Nai=get("count"),N=n_row_sub,Tai=get("absolute_thresh"))]
  
  na_id <- which(is.na(geno_V_fraction$absolute_thresh))
  if (length(na_id) != 0)
    geno_V_fraction$absolute_thresh[na_id] <-
    sapply(na_id, function(i) {
      new_allele <- geno_V_fraction$v_call[i]
      closest <- strsplit(geno_V_fraction$v_call[i], "_")[[1]][1]
      allele_cluster_threshold[closest]
    })
  
  ## check if allele is above thresh.
  
  geno_V_fraction[, "above_thresh" := get("absolute_fraction") >= get("absolute_thresh")]
  
  if(!is.null(confidence_level)) {
    geno_V_fraction <- geno_V_fraction[, "above_confidence" := get("genotype_confidence") >= confidence_level]
  }else{
    geno_V_fraction <- geno_V_fraction[, "above_confidence" := TRUE]
  }
  
  sortBy <- c('gene', 'absolute_fraction')
  sortType <- c(1, -1)
  data.table::setorderv(geno_V_fraction, sortBy, sortType)
  genoV <-
    geno_V_fraction[, .(
      "alleles" = paste0(get("v_allele"), collapse = ","),
      "imgt_alleles" = paste0(get("v_call_or"), collapse = ","),
      "counts" = paste0(get("count"), collapse = ","),
      "absolute_fraction" = paste0(round(get(
        "absolute_fraction"
      ), 7), collapse = ","),
      "absolute_threshold" = paste0(formatC(get(
        "absolute_thresh"
      ), format = "f"), collapse = ","),
      "genotype_confidence" = paste0(formatC(get(
        "genotype_confidence"
      ), format = "f"), collapse = ","),
      "genotyped_alleles" = paste0(get("v_allele")[get("above_thresh") &
                                                     get("above_confidence")], collapse = ","),
      "genotyped_imgt_alleles" = paste0(get("v_call_or")[get("above_thresh") &
                                                           get("above_confidence")], collapse = ",")
    ), by = mget(c("gene"))]
  
  
  # geno <- as.data.frame(genotype, stringsAsFactors = FALSE)
  # if (find_unmutated == TRUE) {
  #   seqs <- genotypeFasta(geno, germline_db)
  #   dist_mat <- seqs %>% sapply(function(x) sapply((getMutatedPositions(seqs,
  #                                                                       x)), length)) %>% as.matrix
  #   rownames(dist_mat) <- colnames(dist_mat)
  #   for (i in 1:nrow(dist_mat)) {
  #     dist_mat[i, i] = NA
  #   }
  #   same <- which(dist_mat == 0, arr.ind = TRUE)
  #   if (nrow(same) > 0) {
  #     for (r in 1:nrow(same)) {
  #       inds <- as.vector(same[r, ])
  #       geno[getGene(rownames(dist_mat)[inds][1]), ]$note <- paste(rownames(dist_mat)[inds],
  #                                                                  collapse = " and ") %>% paste("Cannot distinguish",
  #                                                                                                .)
  #     }
  #   }
  # }
  # rownames(geno) <- NULL
  
  return(genoV)
}

Try the piglet package in your browser

Any scripts or data that you put into this service are public.

piglet documentation built on April 12, 2025, 1:27 a.m.