R/parsingFunctions.R

Defines functions readKirCalls readHlaAlignments readHlaCalls

Documented in readHlaAlignments readHlaCalls readKirCalls

#' Read HLA allele calls
#'
#' \code{readHlaCalls} read HLA allele calls from file
#'
#' Input file has to be a tsv formatted table with a header. First column should
#' contain sample IDs, further columns hold HLA allele numbers. See
#' \code{system.file("extdata", "MiDAS_tut_HLA.txt", package = "midasHLA")} file
#' for an example.
#'
#' \code{resolution} parameter can be used to reduce HLA allele numbers. If
#' reduction is not needed \code{resolution} can be set to 8. \code{resolution}
#' parameter can take the following values: 2, 4, 6, 8. For more details
#' about HLA allele numbers resolution see
#' \url{http://hla.alleles.org/nomenclature/naming.html}.
#'
#' @inheritParams reduceAlleleResolution
#' @inheritParams utils::read.table
#' @param file Path to input file.
#'
#' @return HLA calls data frame. First column hold sample IDs, further columns 
#'   hold HLA allele numbers.
#'
#' @examples
#' file <- system.file("extdata", "MiDAS_tut_HLA.txt", package = "midasHLA")
#' hla_calls <- readHlaCalls(file)
#'
#' @importFrom assertthat assert_that is.readable see_if
#' @importFrom stats na.omit
#' @importFrom stringi stri_split_fixed
#' @importFrom utils read.table
#' @export
readHlaCalls <- function(file,
                         resolution = 4,
                         na.strings = c("Not typed", "-", "NA")) {
  assert_that(is.readable(file),
              is.count(resolution),
              is.character(na.strings)
  )
  hla_calls <- read.table(file,
                          header = TRUE,
                          sep = "\t",
                          stringsAsFactors = FALSE,
                          na.strings = na.strings
  )
  assert_that(checkHlaCallsFormat(hla_calls))

  # set colnames based on allele numbers
  gene_names <- vapply(X = 2:ncol(hla_calls),
                       FUN = function(i) {
                         names <- stri_split_fixed(hla_calls[, i], "*")
                         names <- vapply(X = names,
                                FUN = function(x) x[1],
                                FUN.VALUE = character(length = 1)
                         )
                         names <- unique(na.omit(names))
                         assert_that(
                           see_if(length(names) <= 1,
                                  msg = "Gene names in columns are not identical"
                           ),
                           see_if(length(names) != 0,
                                  msg = "One of the columns contains only NA"
                           )
                         )
                         return(names)
                       },
                       FUN.VALUE = character(length = 1)
  )
  ord <- order(gene_names)
  gene_names <- gene_names[ord]
  gene_names <- toupper(gene_names)
  gene_names_id <- unlist(lapply(table(gene_names), seq_len))
  gene_names <- paste(gene_names, gene_names_id, sep = "_")
  hla_calls <- hla_calls[, c(1, ord + 1)]
  colnames(hla_calls) <- c("ID", gene_names)

  hla_calls <- reduceHlaCalls(hla_calls, resolution = resolution)

  return(hla_calls)
}

#' Read HLA allele alignments
#'
#' \code{readHlaAlignments} read HLA allele alignments from file.
#'
#' HLA allele alignment file should follow EBI database format, for details
#' see
#' \url{ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/alignments/README.md}.
#'
#' All protein alignment files from the EBI database are shipped with the package.
#' They can be easily accessed using \code{gene} parameter. If \code{gene} is
#' set to \code{NULL}, \code{file} parameter is used instead and alignment is
#' read from the provided file. In EBI database alignments for DRB1, DRB3, DRB4
#' and DRB5 genes are provided as a single file, here they are separated.
#' Additionally, were possible sequences for alleles not present in the alignments have been 
#' inferred based on higher resolution alleles. To this end we have reduced 
#' alleles to 6 and 4 digit resolution and took consensus sequence to represent
#' missing alleles. Positions for which there was no full agreement were marked
#' as unknown.
#' 
#' 
#'
#' @inheritParams readHlaCalls
#' @param gene Character vector of length one specifying the name of a gene for
#'   which alignment is required. See details for further explanations.
#' @param trim Logical indicating if alignment should be trimmed to start codon
#'   of the mature protein.
#' @param unkchar Character to be used to represent positions with unknown
#'   sequence.
#'
#' @return Matrix containing HLA allele alignments.
#'
#'   Rownames correspond to allele numbers and columns to positions in the
#'   alignment. Sequences following the termination codon are marked as empty
#'   character (\code{""}). Unknown sequences are marked with a character of
#'   choice, by default \code{""}. Stop codons are represented by a hash (X).
#'   Insertion and deletions are marked with period (.).
#'
#' @examples
#' hla_alignments <- readHlaAlignments(gene = "A")
#'
#' @importFrom assertthat assert_that is.count is.readable is.string see_if
#' @importFrom stringi stri_flatten stri_split_regex stri_sub
#' @importFrom stringi stri_subset_fixed stri_read_lines stri_detect_regex
#' @export
readHlaAlignments <- function(file,
                              gene = NULL,
                              trim = FALSE,
                              unkchar = "") {
  assert_that(
    isTRUEorFALSE(trim),
    is.string(unkchar)
  )

  if (is.null(gene)) {
    assert_that(is.readable(file))
    aln_raw <- stri_read_lines(file)
    aln <- stri_split_regex(aln_raw, "\\s+")

    # extract lines containing alignments and omit empty alignment lines
    nonempty_lines <- vapply(aln, length, integer(length = 1)) >= 2
    aln <- aln[nonempty_lines]
    allele_numbers <- vapply(aln, `[`, character(length = 1), 2)
    allele_lines <- checkAlleleFormat(allele_numbers)

    assert_that(
      see_if(any(allele_lines),
             msg = "could not find alleles numbers in the alignment file"
      )
    )
    allele_numbers <- allele_numbers[allele_lines]
    aln <- aln[allele_lines]
    assert_that(
      see_if(all(stri_detect_regex(unlist(aln), "^[A-Z0-9:*.-]*$")),
                               msg = "alignments lines contain non standard characters"
      )
    )

    tmp_aln_env <- new.env(size = 5000)
    for (i in seq_along(allele_numbers)) {
      assign(
        x = allele_numbers[i],
        value = append(
          x = get0(
            x = allele_numbers[i],
            envir = tmp_aln_env,
            ifnotfound = character(length = 0)
          ),
          values = aln[[i]][-c(1, 2)] # discard empty element and allele number
        ),
        envir = tmp_aln_env
      )
    }
    aln_list <- as.list(tmp_aln_env)[unique(allele_numbers)] # convert to list and sort

    ref_seq <- stri_flatten(aln_list[[1]])
    seq_along_ref <- seq(1, nchar(ref_seq), 1)
    ref_seq <- stri_sub(ref_seq,
                        seq_along_ref,
                        seq_along_ref
    )
    ref_len <- length(ref_seq)
    aln <- do.call(rbind,
                   lapply(aln_list,
                          function(a) {
                            a <- stri_flatten(a)
                            a <- stri_sub(a,
                                          seq_along_ref,
                                          seq_along_ref
                            )
                            i <- a == "-"
                            a[i] <- ref_seq[i]

                            return(a)
                          }
                   )
    )

    # find AA positions numbers
    aln_raw <- aln_raw[nonempty_lines]
    raw_first_codon_idx <- nchar(stri_subset_fixed(aln_raw, "Prot")[1])
    raw_alignment_line <- stri_sub(aln_raw[allele_lines][1],
                                   1,
                                   raw_first_codon_idx
    )
    raw_alignment_seq <- stri_split_regex(raw_alignment_line, "\\s+")
    raw_alignment_seq <- unlist(raw_alignment_seq)[-c(1, 2)]
    first_codon_idx <- nchar(stri_flatten(raw_alignment_seq))
    assert_that(
      see_if(is.count(first_codon_idx),
             msg = "start codon is not marked properly in the input file"
      )
    )
    if (first_codon_idx > 1) {
      aln_colnames <- c(seq(1 - first_codon_idx, -1, 1),
                        seq(1, ncol(aln) + 1 - first_codon_idx, 1)
                      )
    } else {
      aln_colnames <- seq(1, ncol(aln) + 1 - first_codon_idx, 1)
    }
    colnames(aln) <- aln_colnames

    # discard aa '5 to start codon of mature protein
    if (trim) {
      aln <- aln[, first_codon_idx:ncol(aln)]
    }
  } else {
    assert_that(is.string(gene))
    gene <- toupper(gene)
    file <- paste0(system.file("extdata", package = "midasHLA"),
                   "/",
                   gene,
                   "_prot.Rdata"
    )
    available_genes <- list.files(
      path = system.file("extdata", package = "midasHLA"),
      pattern = "_prot.Rdata$",
      full.names = TRUE
    )
    assert_that(
      see_if(file %in% available_genes,
             msg = sprintf("alignment for %s is not available", gene)
      )
    )

    cached_aln_obj <- readRDS(file) # list(readHlaAlignments(file, trim = FALSE, unkchar = "*"), first_codon_idx)
    aln <- cached_aln_obj[[1]]

    # discard aa '5 to start codon of mature protein
    if (trim) {
      first_codon_idx <- cached_aln_obj[[2]]
      aln <- aln[, first_codon_idx:ncol(aln)]
    }
  }

  # substitute unkchar
  aln[aln == "*"] <- unkchar

  return(aln)
}

#' Read KIR calls
#'
#' \code{readKirCalls} read KIR calls from file.
#'
#' Input file has to be a tsv formatted table. First column should be named
#' "ID" and contain samples IDs, further columns should hold KIR genes presence
#' / absence indicators. See
#' \code{system.file("extdata", "MiDAS_tut_KIR", package = "midasHLA")} for an
#' example.
#'
#' @inheritParams utils::read.table
#' @param file Path to input file.
#'
#' @return Data frame containing KIR gene's counts. First column hold samples 
#'   IDs, further columns hold KIR genes presence / absence indicators.
#'
#' @examples
#' file <- system.file("extdata", "MiDAS_tut_KIR.txt", package = "midasHLA")
#' readKirCalls(file)
#'
#' @importFrom assertthat assert_that is.readable see_if
#' @importFrom dplyr left_join select
#' @importFrom stats na.omit setNames
#' @export
readKirCalls <- function(file,
                         na.strings = c("", "NA", "uninterpretable")) {
  assert_that(
    is.readable(file),
    is.character(na.strings)
  )

  kir_calls <- read.table(
    file = file,
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE,
    na.strings = na.strings
  )
  checkKirCallsFormat(kir_calls)

  return(kir_calls)
}
Genentech/midasHLA documentation built on Feb. 12, 2024, 9:38 a.m.