R/labelHaplotypes.R
In strataG: Summaries and Population Structure Analyses of Genetic Data

Documented in labelHaplotypes labelHaplotypes.character labelHaplotypes.default labelHaplotypes.gtypes labelHaplotypes.list

#' @title Find and label haplotypes
#' @description Identify and group sequences that share the same haplotype.
#'
#' @param x sequences in a \code{character matrix}, \code{list},
#'   or \code{\link{DNAbin}} object, or a haploid \linkS4class{gtypes} object
#'   with sequences.
#' @param prefix a character string giving prefix to be applied to numbered
#'   haplotypes. If NULL, haplotypes will be labeled with the first label
#'   from original sequences.
#' @param use.indels logical. Use indels when comparing sequences?
#' @param ... arguments to be passed to \code{labelHaplotypes.default}.
#'
#' @details If any sequences contain ambiguous bases (N's) they are first
#'   removed. Then haplotypes are assigned based on the remaining
#'   sequences. The sequences with N's that were removed are then assigned to
#'   the new haplotypes if it can be done unambiguously (they match only one
#'   haplotype with 0 differences once the N's have been removed). If this
#'   can't be done they are assigned NAs and listed in the
#'   \code{unassigned} element.
#'
#' @return
#'  For \code{character}, \code{list}, or \code{DNAbin}, a list with the following elements:
#'  \describe{
#'    \item{haps}{named vector (\code{DNAbin}) or list of named vectors
#'      (\code{multidina}) of haplotypes for each sequence in \code{x}.}
#'    \item{hap.seqs}{\code{DNAbin} or \code{multidna} object containing
#'      sequences for each haplotype.}
#'    \item{unassigned}{\code{data.frame} listing closest matching haplotypes
#'      for unassignable sequences with N's and the minimum number of
#'      substitutions between the two. Will be \code{NULL} if no sequences
#'      remain unassigned.}
#'  }
#'
#'  For \code{gtypes}, a new \code{gtypes} object with unassigned individuals
#'  stored in the @@other slot in an element named \code{'haps.unassigned'} (see
#'  \code{\link{getOther}}).
#'
#' @author Eric Archer \email{eric.archer@@noaa.gov}
#' 
#' @seealso \code{\link{expandHaplotypes}}
#'
#' @examples
#' # create 5 example short haplotypes
#' haps <- c(
#'   H1 = "ggctagct",
#'   H2 = "agttagct",
#'   H3 = "agctggct",
#'   H4 = "agctggct",
#'   H5 = "ggttagct"
#' )
#
#' # draw and label 100 samples
#' sample.seqs <- sample(names(haps), 100, rep = TRUE)
#' ids <- paste(sample.seqs, 1:length(sample.seqs), sep = "_")
#' sample.seqs <- lapply(sample.seqs, function(x) strsplit(haps[x], "")[[1]])
#' names(sample.seqs) <- ids
#'
#' # add 1-2 random ambiguities
#' with.error <- sample(1:length(sample.seqs), 10)
#' for(i in with.error) {
#'   num.errors <- sample(1:2, 1)
#'   sites <- sample(1:length(sample.seqs[[i]]), num.errors)
#'   sample.seqs[[i]][sites] <- "n"
#' }
#'
#' hap.assign <- labelHaplotypes(sample.seqs, prefix = "Hap.")
#' hap.assign
#'
#' @name labelHaplotypes
#' @export
#'
labelHaplotypes <- function(x, prefix = NULL, use.indels = TRUE) {
  UseMethod("labelHaplotypes")
}


#' @rdname labelHaplotypes
#' @export
#'
labelHaplotypes.default  <- function(x, prefix = NULL, use.indels = TRUE) {
  if(!inherits(x, "DNAbin")) stop("'x' must be a DNAbin object.")
  x <- as.matrix(x)
  
  # return same data if only one sequence exists
  if(nrow(x) == 1) {
    haps <- rownames(x)
    names(haps) <- haps
    return(list(haps = haps, hap.seqs = x, unassigned = NULL))
  }
  
  # throw error if any sequence names are duplicated
  if(any(duplicated(rownames(x)))) {
    warning(
      "'x' cannot have duplicate sequence names. NULL returned.",
      call. = FALSE, 
      immediate. = TRUE
    )
    return(NULL)
  }
  
  # find sequences without ambiguities  
  good.bases <- c("a", "c", "g", "t", "-")
  has.amb <- apply(as.character(x), 1, function(bases) {
    !all(tolower(bases) %in% good.bases)
  })
  if(sum(!has.amb) <= 1) {
    warning(
      "There are fewer than two sequences without ambiguities (N's). Can't assign haplotypes. NULL returned.",
      call. = FALSE, 
      immediate. = TRUE
    )
    return(NULL)
  }
  
  # get pairwise distances and set all non-0 distances to 1
  x.no.amb <- x[!has.amb, , drop = FALSE]
  hap.dist <- ape::dist.dna(x.no.amb, model = "N", pairwise.deletion = TRUE)
  if(use.indels) {
    hap.dist <- hap.dist + ape::dist.dna(x.no.amb, model = "indelblock")
  }
  hap.dist <- as.matrix(hap.dist)
  hap.dist[hap.dist > 0] <- 1
  
  # create haplotype code out of 0s and 1s
  hap.code <- hap.dist %>% 
    apply(1, paste, collapse = "") %>% 
    factor() %>% 
    as.numeric() %>% 
    stats::setNames(rownames(hap.dist))
  
  # rename haplotypes
  hap.labels <- if(!is.null(prefix)) {
    # use prefix+number if prefix given
    # sort based on frequency first
    hap.order <- as.numeric(names(sort(table(hap.code), decreasing = TRUE)))
    hap.nums <- swfscMisc::zero.pad(1:length(hap.order))
    names(hap.order) <- paste(prefix, hap.nums, sep = "")
    names(sort(hap.order))
  } else {
    # if no prefix, use first sequence name for each haplotype
    names(hap.code)[!duplicated(hap.code)]
  }
  hap.code <- hap.labels[hap.code]
  names(hap.code) <- rownames(hap.dist)
  
  # get sequences for each haplotype
  unique.codes <- hap.code[!duplicated(hap.code)]
  hap.seqs <- x[names(unique.codes), , drop = FALSE]
  rownames(hap.seqs) <- unique.codes
  hap.seqs <- hap.seqs[order(rownames(hap.seqs)), , drop = FALSE] %>% 
    as.matrix() %>% 
    as.character()
  
  # get distance of all sequences with n's to other sequences (possible matching sequences)
  x.has.amb <- as.character(x)[has.amb, , drop = FALSE]
  unk.dist <- sapply(rownames(x.has.amb), function(i) {
    # get ambiguity sites in this sequence
    amb.sites <- which(!tolower(x.has.amb[i, ]) %in% good.bases)
    if(length(amb.sites) == 0) return(NULL)
    # create matrix of ambiguity sites and remove sequences with ambiguities
    possible.sites <- unique(hap.seqs[, amb.sites, drop = FALSE])
    # create sequences to test
    if(nrow(possible.sites) == 0) return(NULL)
    test.seqs <- do.call(rbind, lapply(1:nrow(possible.sites), function(j) {
      seq.j <- x.has.amb[i, ]
      seq.j[amb.sites] <- possible.sites[j, ]
      seq.j
    }))
    test.names <- paste("test.seq.", 1:nrow(test.seqs), sep = "")
    rownames(test.seqs) <- test.names
    # get distances between test sequences and sequences without n's
    new.mat <- ape::as.DNAbin(rbind(hap.seqs, test.seqs))
    hap.dist <- ape::dist.dna(
      new.mat, 
      model = "N", 
      pairwise.deletion = TRUE, 
      as.matrix = TRUE
    )
    if(use.indels) hap.dist <- hap.dist + 
      ape::dist.dna(
        new.mat, 
        model = "indelblock", 
        as.matrix = TRUE
      )
    hap.dist[test.names, !colnames(hap.dist) %in% test.names, drop = FALSE]
  }, simplify = FALSE, USE.NAMES = TRUE)
  
  min.dist <- NULL
  hap.assign <- if(length(unk.dist) > 0) {
    # get minimum distance of unknowns to other sequences
    min.dist <- do.call(rbind, sapply(
      unk.dist, 
      function(mat) apply(mat, 2, min), 
      simplify = FALSE
    ))
    # assign sequences if only one of the minimum sequences is 0
    apply(min.dist, 1, function(counts) {
      num.matches <- sum(counts == 0)
      if(num.matches == 1) colnames(min.dist)[counts == 0] else NA
    })
  } else NULL
  
  # compile vector of haplotype assignments
  hap.vec <- c(hap.code, hap.assign)[rownames(x)]
  
  # create data.frame of unassigned sequences
  unassigned.df <- if(any(is.na(hap.vec))) {
    mat <- min.dist[names(hap.vec)[is.na(hap.vec)], , drop = FALSE]
    df <- do.call(rbind, lapply(1:nrow(mat), function(r) {
      min.dist <- min(mat[r, ])
      i <- which(mat[r, ] == min.dist)
      haps <- paste(colnames(mat)[i], collapse = ", ")
      data.frame(haplotype = haps, min.substitutions = min.dist)
    }))
    rownames(df) <- rownames(mat)
    df
  } else NULL
  
  list(
    haps = hap.vec, 
    hap.seqs = ape::as.DNAbin(hap.seqs), 
    unassigned = unassigned.df
  )
}

#' @rdname labelHaplotypes
#' @export
#'
labelHaplotypes.list <- function(x, ...) labelHaplotypes(ape::as.DNAbin(x),...)


#' @rdname labelHaplotypes
#' @export
#'
labelHaplotypes.character <- function(x, ...) labelHaplotypes(ape::as.DNAbin(x), ...)


#' @rdname labelHaplotypes
#' @export
#'
labelHaplotypes.gtypes <- function(x, ...) {
  # check that sequences are present
  if(getPloidy(x) > 1 | is.null(getSequences(x))) {
    stop("'x' is not haploid or does not have any sequences")
  }

  # label haplotypes for each gene
  new.haps <- purrr::map(getSequences(x), labelHaplotypes, ...)
  has.errors <- sapply(new.haps, is.null)
  if(sum(has.errors) > 0) {
    has.errors <- paste(names(new.haps)[has.errors], collapse = ", ")
    stop("haplotypes could not be assigned for: ", has.errors)
  }

  # create haplotype data.frame
  hap.df <- as.data.frame(x)
  for(gene in names(new.haps)) {
    old.haps <- hap.df[, gene]
    hap.df[, gene] <- new.haps[[gene]]$haps[old.haps]
  }

  # collect sequences
  hap.seqs <- lapply(new.haps, function(x) x$hap.seqs)
  names(hap.seqs) <- names(new.haps)

  # collect unassigned
  unassigned <- lapply(new.haps, function(x) x$unassigned)

  # create new gtypes
  df2gtypes(
    hap.df, 
    ploidy = 1, 
    id.col = 1, 
    strata.col = 2, 
    loc.col = 3,
    sequences = hap.seqs, 
    description = getDescription(x), 
    schemes = getSchemes(x),
    other = c(getOther(x), list(haps.unassigned = unassigned))
  )
}

Any scripts or data that you put into this service are public.

strataG documentation built on Feb. 28, 2020, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

strataG
Summaries and Population Structure Analyses of Genetic Data

R/labelHaplotypes.R
In strataG: Summaries and Population Structure Analyses of Genetic Data

Defines functions labelHaplotypes labelHaplotypes.default labelHaplotypes.list labelHaplotypes.character labelHaplotypes.gtypes

Documented in labelHaplotypes labelHaplotypes.character labelHaplotypes.default labelHaplotypes.gtypes labelHaplotypes.list

Try the strataG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

strataG Summaries and Population Structure Analyses of Genetic Data

R/labelHaplotypes.R In strataG: Summaries and Population Structure Analyses of Genetic Data

Defines functions labelHaplotypes labelHaplotypes.default labelHaplotypes.list labelHaplotypes.character labelHaplotypes.gtypes

Documented in labelHaplotypes labelHaplotypes.character labelHaplotypes.default labelHaplotypes.gtypes labelHaplotypes.list

Try the strataG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

strataG
Summaries and Population Structure Analyses of Genetic Data

R/labelHaplotypes.R
In strataG: Summaries and Population Structure Analyses of Genetic Data