R/translateGenotypes.R

Defines functions translateGenotypes

Documented in translateGenotypes

#' Translate genotypes
#'
#' @param input A data.frame or a (relative) path to a file of alleles to be
#' translated. Fixed columns are name of the laboratory, which should match
#' those from \code{ref_tbl} and sample name (this is for your viewing pleasure
#' only). The varying columns represent genotypes. Make sure columns match names
#' from the \code{ref_tbl}. Each locus should be written in two columns and
#' names should have a suffix of _1 and _2.
#' @param ref_tbl A data.frame which holds the translation table.
#' Structure is fixed and the columns are \code{lab_from}, \code{locus},
#' \code{allele_from}, \code{allele_ref} and \code{delta}. First two columns
#' are self explanatory. Columns that start with \code{allele_} have actual
#' allele values in laboratory of question and the reference (right now the
#' reference is laboratory from Slovenia). The last column is the amount
#' an allele should be shifted relative to the reference.
#' A reference could also be a relative or absolute path to an xlsx file with
#' columns as described above. Note that names are case sensitive.
#' @param long Logical. If \code{TRUE}, the result will be returned as a long
#' table instead of wide. If \code{FALSE} (default), wide table will be
#' provided.
#' @param output Character. Relative or absolute path to the file the data
#' should be written to. The result will have tab separated columns and no row
#' names. Default is \code{NA}.
#' @param ... Parameters passed to \link[readxl]{read_excel}.
#'
#' @details The shift (\code{delta}) can be possibly calculated for some loci.
#' If alleles for a certain locus are missing and there's only a delta
#' parameter, this means that all so far examined alleles behave properly and
#' can be offset safely. For those that also have mapping table in
#' \code{allele_from} and \code{allele_ref}, the mapping is done 1:1.
#'
#' @importFrom readxl read_excel
#' @importFrom tidyr gather spread
#' @importFrom utils write.table
#'
#' @export

translateGenotypes <- function(input, ref_tbl, long = FALSE, output = NA, ...) {
  # If input is not an already formatted table, import it assuming it's an
  # xlsx file. User needs to pass in the sheet name using the ... argument.
  if (any(class(input) %in% "character")) {
    stopifnot(file.exists(input))
    xy <- read_excel(path = input, ...)
  }

  # If input is data.frame, make sure it has all the appropriate columns.
  if (any(class(input) %in% "data.frame")) {
    stopifnot(all(c("lab_from", "sample") %in% names(input)))
  }

  # Output should have all loci from the reference table. Find loci that are
  # in reference, but not xy (uncommon.loci), and append those as NA to xy
  # before translation starts.
  ref <- ref_tbl[ref_tbl$lab_from %in% unique(input$lab_from), ]

  input.loci <- colnames(input)
  input.loci <- colnames(input)[!(input.loci %in% c("lab_from", "sample"))]
  input.loci <- unique(gsub("_[1|2]", "", input.loci))

  ref.loci <- unique(ref$locus)
  uncommon.loci <- ref.loci[!(ref.loci %in% input.loci)]
  if (length(uncommon.loci) != 0) {
    uncommon.loci <- paste(rep(uncommon.loci, each = 2),
                           c("1", "2"),
                           sep = "_")

    for (i in uncommon.loci) {
      input[, i] <- NA
    }
  }

  # Make sure column order matches the order of loci in reference (ref).
  ref.order <- paste(rep(unique(ref$locus), each = 2),
                     c("1", "2"),
                     sep = "_")
  ref.order <- c("lab_from", "sample", ref.order)

  if (!all(ref.order %in% colnames(input))) {
    stop("Reference and input columns do not match. Investigate.")
  }

  input <- input[, ref.order]

  # The algorithm works on a long format, so we reflow accordingly.
  xy <- suppressWarnings(
    gather(input, key = locus, value = allele, -lab_from, -sample)
  )

  trnslt <- "translated_allele"
  xy[, trnslt] <- NA

  # Algorithm for translating values:
  for (i in 1:nrow(xy)) {
    # Show indicator only when there's many rows to be translated.
    if (nrow(xy) > 1000 && (i %% round(0.1 * nrow(xy), -3)) == 0) print(sprintf("Processing %s/%s", i, nrow(xy)))

    # For each line (lab, locus, allele) of data, find corresponding allele
    # in translation table.
    roll.i <- xy[i, ]

    # Extract pretty locus name (loc2_1 is now loc2).
    roll.locus <- strsplit(roll.i$locus, "_")[[1]][[1]]

    roll.lab <- roll.i$lab_from
    ref <- ref_tbl[ref_tbl$locus == roll.locus & ref_tbl$lab_from == roll.lab, ]

    if (nrow(ref) == 0) {
      stop(sprintf("No entries in translation table for this combination of locus-lab (%s-%s)",
                   roll.locus, roll.lab))
    }

    # If locus is well behaved, it has one delta (offset) value used for
    # translating. If that's the case, ref will only have one row.
    if (!is.na(ref$delta) && nrow(ref) == 1) {
      # If offset (delta) is provided, calculate new allele based on offset.
      xy[i, trnslt] <- as.character(as.numeric(roll.i$allele) + as.numeric(ref$delta))
    } else {
      # If direct translation value available, use that instead. But first,
      # tease it out using the correct allele name.
      ref <- ref[ref$allele_from == roll.i$allele, ]

      # The reference table may have a locus entry, but no data for it. In this case,
      # we just leave entry as NA.
      if (all(is.na(ref[, c("allele_from", "allele_ref", "delta")]))) {
        xy[i, trnslt] <- NA
        warning(sprintf("No information provided to translate %s from %s. Consider
                        removing this locus or setting its delta value to 0 in
                        the reference table.",
                        roll.locus, roll.lab))
        next
      }

      if (nrow(ref) >= 2) {
        stop(sprintf("Reference from lab %s (locus %s) have more than one translation.
    Check translation table.", roll.lab, roll.locus))
      }

      if (nrow(ref) == 0) {
        warning(sprintf("%s has no translation value (locus: %s, lab: %s).",
                        roll.i$allele, roll.locus, roll.lab))
        next
      }

      if (nrow(ref) == 1) {
        xy[i, trnslt] <- ref$allele_ref
      }
    }
  }

  # Keep in long format if so requested.
  if (long == TRUE) {
    return(xy)
  }

  # Remove the original allele value in order to properly reflow data into
  # wide format.
  xy$allele <- NULL
  xy <- spread(xy, key = locus, value = trnslt)

  # Export if file (path)name provided.
  if (!is.na(output)) {
    write.table(xy, file = output, quote = FALSE, row.names = FALSE,
                col.names = TRUE, sep = "\t")

    message(sprintf("Output written to file %s", output))
  }

  return(list(translated = xy,
              original = input)
  )
}
romunov/transgt documentation built on Aug. 1, 2020, 7:28 p.m.