R/data_subset.r

Defines functions split_gen make_haplotypes_genlight make_haplotypes_genind recode_polyploids informloci missingno popsub clonecorrect

Documented in clonecorrect informloci missingno popsub recode_polyploids

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#
# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate
# students at Oregon State University; Jonah C. Brooks, undergraduate student at
# Oregon State University; and Dr. Nik Grünwald, an employee of USDA-ARS.
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for educational, research and non-profit purposes, without fee,
# and without a written agreement is hereby granted, provided that the statement
# above is incorporated into the material, giving appropriate attribution to the
# authors.
#
# Permission to incorporate this software into commercial products may be
# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for
# Commercialization and Corporate Development.
#
# The software program and documentation are supplied "as is", without any
# accompanying services from the USDA or the University. USDA ARS or the
# University do not warrant that the operation of the program will be
# uninterrupted or error-free. The end-user understands that the program was
# developed for research purposes and is advised not to rely exclusively on the
# program for any reason.
#
# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY
# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE
# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#==============================================================================#
#' Remove potential bias caused by cloned genotypes in genind or genclone
#' object.
#'
#' This function removes any duplicated multilocus genotypes from any specified
#' population strata.
#'
#' @param pop a \code{\link[adegenet:genind-class]{genind}}, \code{\linkS4class{genclone}}, or
#'   \code{\linkS4class{snpclone}} object
#'
#' @param strata a hierarchical formula or numeric vector. This will define the
#'   columns of the data frame in the strata slot to use.
#'
#' @param combine \code{logical}. When set to TRUE, the strata will be combined
#'   to create a new population for the clone-corrected genind or genclone
#'   object.
#'
#' @param keep \code{integer}. When \code{combine} is set to \code{FALSE}, you
#'   can use this flag to choose the levels of your population strata. For
#'   example: if your clone correction strata is set to "Pop", "Subpop", and
#'   "Year", and you want to analyze your populations with respect to year, you
#'   can set \code{keep = c(1,3)}.
#'
#' @return a clone corrected \code{\linkS4class{genclone}},
#'   \code{\linkS4class{snpclone}}, or \code{\link[adegenet:genind-class]{genind}} object.
#'
#' @details This function will clone correct based on the stratification
#'   provided. To clone correct indiscriminately of population structure, set
#'   \code{strata = NA}.
#'
#'
#' @export
#' @author Zhian N. Kamvar
#' @examples
#' # LOAD A. euteiches data set
#' data(Aeut)
#'
#' # Redefine it as a genclone object
#' Aeut <- as.genclone(Aeut)
#' strata(Aeut) <- other(Aeut)$population_hierarchy[-1]
#'
#' # Check the number of multilocus genotypes
#' mlg(Aeut)
#' popNames(Aeut)
#'
#' # Clone correct at the population level.
#' Aeut.pop <- clonecorrect(Aeut, strata =  ~Pop)
#' mlg(Aeut.pop)
#' popNames(Aeut.pop)
#'
#' \dontrun{
#' # Clone correct at the subpopulation level with respect to population and
#' # combine.
#' Aeut.subpop <- clonecorrect(Aeut, strata = ~Pop/Subpop, combine=TRUE)
#' mlg(Aeut.subpop)
#' popNames(Aeut.subpop)
#'
#' # Do the same, but set to the population level.
#' Aeut.subpop2 <- clonecorrect(Aeut, strata = ~Pop/Subpop, keep=1)
#' mlg(Aeut.subpop2)
#' popNames(Aeut.subpop2)
#'
#' # LOAD H3N2 dataset
#' data(H3N2)
#'
#' strata(H3N2) <- other(H3N2)$x
#'
#' # Extract only the individuals located in China
#' country <- clonecorrect(H3N2, strata =  ~country)
#'
#' # How many isolates did we have from China before clone correction?
#' sum(strata(H3N2, ~country) == "China") # 155
#'
#' # How many unique isolates from China after clone correction?
#' sum(strata(country, ~country) == "China") # 79
#'
#' # Something a little more complicated. (This could take a few minutes on
#' # slower computers)
#'
#' # setting the hierarchy to be Country > Year > Month
#' c.y.m <- clonecorrect(H3N2, strata =  ~year/month/country)
#'
#' # How many isolates in the original data set?
#' nInd(H3N2) # 1903
#'
#' # How many after we clone corrected for country, year, and month?
#' nInd(c.y.m) # 1190
#' }
#==============================================================================#

clonecorrect <- function(pop, strata = 1, combine = FALSE, keep = 1) {
  if (!is.genind(pop) & !is(pop, "snpclone")) {
    stop(deparse(substitute(pop)), " is not a genind or snpclone object.\n")
  }
  if (is.null(strata(pop))) {
    msg <- paste0(
      "Strata is not set for ",
      deparse(substitute(pop)),
      ". Clone correct will be performed without population",
      " information."
    )
    warning(msg)
    strata <- NA
  }
  if (is.language(strata)) {
    strataformula <- strata
    strata <- all.vars(strata)
  }
  if (is.genind(pop)) {
    popcall <- match.call()
  }

  if (is.na(strata[1])) {
    return(pop[.clonecorrector(pop), ])
  }

  if (is.numeric(strata)) {
    strata <- names(strata(pop))[strata]
    strataformula <- as.formula(paste0("~", paste(strata, collapse = "/")))
  }
  if (!all(strata %in% names(strata(pop)))) {
    stop(hier_incompatible_warning(strata, strata(pop)))
  }
  setPop(pop) <- strataformula
  # Corrects the individual names of the object. This is fo the fact that the
  # clone corrector relies on the unique individual names for it to work.
  if (all(indNames(pop) == "")) {
    indNames(pop) <- as.character(1:nInd(pop))
  }

  cpop <- nPop(pop)

  # Steps for correction:
  # Subset by population factor.
  # Run subset population by the .clonecorrector
  # Profit!
  corWrecked <- function(x, pop) {
    subbed <- popsub(pop, x) # population to be...corrected.
    subbed <- subbed[.clonecorrector(subbed), ]
    # Return the indices based off of the individual names.
    return(which(indNames(pop) %in% indNames(subbed)))
  }

  ccpop <- unlist(lapply(1:cpop, corWrecked, pop))
  pop <- pop[ccpop, ]

  if (!combine) {
    # When the combine flag is not true, the default is to keep the first level
    # of the strata. The keep flag is a numeric vector corresponding to the
    # strata flag indicating which levels the user wants to keep.
    strata <- strata[keep]
    newformula <- as.formula(paste0("~", paste(strata, collapse = "/")))
    setPop(pop) <- newformula
  }
  if (is.genind(pop)) {
    pop@call <- popcall
  }

  return(pop)
}


#==============================================================================#
#' Subset data by population
#'
#' Create a new dataset with specified populations or exclude specified
#' populations from the dataset.
#'
#' @param gid a \code{\link[adegenet:genind-class]{genind}}, \code{\linkS4class{genclone}},
#'   \code{\link[adegenet:genlight-class]{genlight}}, or \code{\linkS4class{snpclone}} object.
#'
#' @param sublist a \code{vector} of population names or indexes that the user
#' wishes to keep. Default to \code{"ALL"}.
#'
#' @param exclude a \code{vector} of population names or indexes that the user
#' wishes to discard. Default to \code{NULL}.
#'
#' @param blacklist DEPRECATED, use exclude.
#'
#' @param mat a \code{matrix} object produced by \code{\link{mlg.table}} to be
#' subsetted. If this is present, the subsetted matrix will be returned instead
#' of the genind object
#'
#' @param drop \code{logical}. If \code{TRUE}, unvarying alleles will be dropped
#' from the population.
#'
#' @return A \code{genind} object or a matrix.
#' @author Zhian N. Kamvar
#' @examples
#' # Load the dataset microbov.
#' data(microbov)
#'
#' # List the population names.
#' popNames(microbov)
#'
#' # Analyze only the populations with exactly 50 individuals
#' mic.50 <- popsub(microbov, sublist=c(1:6, 11:15), exclude=c(3,4,13,14))
#'
#' \dontrun{
#' # Analyze the first 10 populations, except for "Bazadais"
#' mic.10 <- popsub(microbov, sublist=1:10, exclude="Bazadais")
#'
#' # Take out the two smallest populations
#' micbig <- popsub(microbov, exclude=c("NDama", "Montbeliard"))
#'
#' # Analyze the two largest populations
#' miclrg <- popsub(microbov, sublist=c("BlondeAquitaine", "Charolais"))
#' }
#' @export
#==============================================================================#

popsub <- function(
  gid,
  sublist = "ALL",
  exclude = NULL,
  blacklist = NULL,
  mat = NULL,
  drop = TRUE
) {
  if (!is.genind(gid) & !is(gid, "genlight")) {
    stop("popsub requires a genind or genlight object\n")
  }
  if (!is.null(blacklist)) {
    warning(
      option_deprecated(
        match.call(),
        "blacklist",
        "exclude",
        "2.8.7.",
        "Please use `exclude` in the future"
      ),
      immediate. = TRUE
    )
    exclude <- blacklist
  }
  if (is.null(pop(gid))) {
    if (!is.na(sublist[1]) && sublist[1] != "ALL") {
      warning("No population structure. Subsetting not taking place.")
    }
    return(gid)
  }
  orig_list <- sublist
  popnames <- popNames(gid)
  if (toupper(sublist[1]) == "ALL") {
    if (is.null(exclude)) {
      return(gid)
    } else {
      # filling the sublist with all of the population names.
      sublist <- popnames
    }
  }

  # Checking if there are names for the population names.
  # If there are none, it will give them names.
  if (is.null(names(popnames))) {
    if (length(popnames) == nPop(gid)) {
      names(popnames) <- popNames(gid)
    } else {
      stop("Population names do not match population factors.")
    }
  }

  # Treating anything present in exclude.
  if (!is.null(exclude)) {
    # If both the sublist and exclude are numeric or character.
    if (
      is.numeric(sublist) &&
        is.numeric(exclude) ||
        identical(class(sublist), class(exclude))
    ) {
      sublist <- sublist[!sublist %in% exclude]
    } else if (is.numeric(sublist) && inherits(exclude, "character")) {
      # if the sublist is numeric and exclude is a character. eg s=1:10, b="USA"
      sublist <- sublist[sublist %in% which(!popnames %in% exclude)]
    } else {
      # no sublist specified. Ideal situation
      if (all(popnames %in% sublist)) {
        sublist <- sublist[-exclude]
      } else {
        # weird situation where the user will specify a certain sublist, yet
        # index the exclude numerically. Interpreted as an index of
        # populations in the whole data set as opposed to the sublist.
        warning(
          "exclude is numeric. Interpreting exclude as the index of the population in the total data set."
        )
        sublist <- sublist[!sublist %in% popnames[exclude]]
      }
    }
  }
  if (!is.null(mat)) {
    mat <- mat[sublist, , drop = FALSE]
    return(mat[, which(colSums(mat) > 0), drop = FALSE])
  } else {
    # subsetting the population.
    if (is.numeric(sublist)) {
      sublist <- popnames[sublist]
    } else {
      sublist <- popnames[popnames %in% sublist]
    }
    sublist <- pop(gid) %in% sublist
    if (!any(sublist)) {
      if (!is.numeric(orig_list) & !any(popNames(gid) %in% orig_list)) {
        stop(unmatched_pops_warning(popNames(gid), orig_list))
      } else {
        nothing_warn <- paste(
          "Nothing present in the sublist.\n",
          "Perhaps the sublist and exclude arguments have",
          "duplicate entries?\n",
          "Subset not taking place."
        )
        warning(nothing_warn)
        return(gid)
      }
    }
    gid <- gid[sublist, drop = drop]
    if (is.genind(gid)) {
      gid@call <- match.call()
    }
    return(gid)
  }
}

#==============================================================================#
#' Treat missing data
#'
#' missingno gives the user four options to deal with missing data: remove loci,
#' remove samples, replace with zeroes, or replace with average allele counts.
#'
#'@param pop a \code{\linkS4class{genclone}} or \code{\link[adegenet:genind-class]{genind}}
#'  object.
#'
#'@param type a \code{character} string: can be "ignore", "zero", "mean",
#'  "loci", or "geno" (see \code{Details} for definitions).
#'
#'@param cutoff \code{numeric}. A number from 0 to 1 indicating the allowable
#'  rate of missing data in either genotypes or loci. This will be ignored for
#'  \code{type} values of \code{"mean"} or \code{"zero"}.
#'
#'@param quiet if \code{TRUE}, it will print to the screen the action performed.
#'
#'@param freq defaults to \code{FALSE}. This option is passed on to the
#'  \code{\link[adegenet]{tab}} function. If \code{TRUE}, the matrix in the
#'  genind object will be replaced by a numeric matrix (as opposed to integer).
#'  THIS IS NOT RECOMMENDED. USE THE FUNCTION \code{\link[adegenet]{tab}}
#'  instead.
#'
#'@details These methods provide a way to deal with systematic missing data and
#'  to give a wrapper for \code{adegenet}'s \code{ \link[adegenet]{tab}} function.
#'  ALL OF THESE ARE TO BE USED WITH CAUTION.
#'
#'  Using this function with polyploid data (where missing data is coded as "0")
#'  may give spurious results.
#'
#'  \subsection{Treatment types}{
#'  \itemize{
#'  \item{\code{"ignore"} - does not remove or replace missing data.}
#'  \item{\code{"loci"} - removes all loci containing missing data in the entire
#'  data set. }
#'  \item{\code{"genotype"} - removes any genotypes/isolates/individuals with
#'  missing data.}
#'  \item{\code{"mean"} - replaces all NA's with the mean of the alleles for the
#'  entire data set.}
#'  \item{\code{"zero"} or \code{"0"} - replaces all NA's with "0". Introduces
#'  more diversity.}
#'  }
#'  }
#'@return a \code{\linkS4class{genclone}} or \code{\link[adegenet:genind-class]{genind}} object.
#'
#'@note \emph{"wild missingno appeared!"}
#'
#'@seealso \code{\link[adegenet]{tab}}, \code{\link{poppr}}, \code{\link{poppr.amova}},
#'  \code{\link{nei.dist}}, \code{\link{aboot}}
#'
#'@export
#'@author Zhian N. Kamvar
#' @examples
#'
#' data(nancycats)
#'
#' nancy.locina <- missingno(nancycats, type = "loci")
#'
#' ## Found 617 missing values.
#' ## 2 loci contained missing values greater than 5%.
#' ## Removing 2 loci : fca8 fca45
#'
#' nancy.genona <- missingno(nancycats, type = "geno")
#'
#' ## Found 617 missing values.
#' ## 38 genotypes contained missing values greater than 5%.
#' ## Removing 38 genotypes : N215 N216 N188 N189 N190 N191 N192 N302 N304 N310
#' ## N195 N197 N198 N199 N200 N201 N206 N182 N184 N186 N298 N299 N300 N301 N303
#' ## N282 N283 N288 N291 N292 N293 N294 N295 N296 N297 N281 N289 N290
#'
#' # Replacing all NA with "0" (see tab in the adegenet package).
#' nancy.0 <- missingno(nancycats, type = "0")
#'
#' ## Replaced 617 missing values
#'
#' # Replacing all NA with the mean of each column (see tab in the
#' # adegenet package).
#' nancy.mean <- missingno(nancycats, type = "mean")
#'
#' ## Replaced 617 missing values
#==============================================================================#
# ###''
# --,,`
# ##,`+
# #.-`.
# --','-.
# `+&#-+'
# .`','##
# &`''#*'
# `&-'-'#
# '`##.-,
missingno <- function(
  pop,
  type = "loci",
  cutoff = 0.05,
  quiet = FALSE,
  freq = FALSE
) {
  if (is(pop, "genlight")) {
    warning("missingno cannot be applied to genlight objects at this time.")
    return(pop)
  }
  if (sum(is.na(pop@tab)) > 0) {
    # removes any loci (columns) with missing values.
    MISSINGOPTS <- c("loci", "genotypes", "mean", "zero", "0", "ignore", "asis")
    freq_warning <- paste(
      "Objects of class 'genind' must have integers in the",
      "genotype matrix. Setting freq = TRUE will force the matrix to be numeric.",
      "please see help('tab') for alternatives."
    )
    type <- match.arg(tolower(type), MISSINGOPTS)
    if (type %in% c("ignore", "asis")) {
      return(pop)
    }
    navals <- percent_missing(pop, type = type, cutoff = cutoff)
    if (type == "loci") {
      if (quiet != TRUE) {
        if (length(navals) == ncol(tab(pop))) {
          message(
            "\n No loci with missing values above ",
            paste0(cutoff * 100, "%"),
            " found.\n"
          )
        } else {
          remloc <- locNames(pop)[!cumsum(nAll(pop)) %in% navals]
          rem <- sum(is.na(tab(pop)))
          missing_messenger(
            remloc,
            type = c("locus", "loci"),
            nremoved = rem,
            cutoff = cutoff
          )
        }
      }
      pop <- pop[, navals]
    } else if (type == "genotypes") {
      if (quiet != TRUE) {
        if (length(navals) == nInd(pop)) {
          message(
            "\n No genotypes with missing values above ",
            paste0(cutoff * 100, "%"),
            " found.\n"
          )
        } else {
          remgeno <- indNames(pop)[-navals]
          rem <- sum(is.na(tab(pop)))
          missing_messenger(
            remgeno,
            type = c("genotype", "genotypes"),
            nremoved = rem,
            cutoff = cutoff
          )
        }
      }
      pop <- pop[navals, ]
    } else if (type == "mean") {
      if (!quiet) {
        message("\n Replaced ", sum(is.na(tab(pop))), " missing values.")
      }
      pop@tab <- tab(pop, freq = freq, NA.method = "mean", quiet = quiet)
      if (freq) {
        warning(freq_warning)
      }
    } else if (type %in% c("zero", "0")) {
      # changes all NA's to 0. NOT RECOMMENDED. INTRODUCES MORE DIVERSITY.
      if (!quiet) {
        message("\n Replaced ", sum(is.na(tab(pop))), " missing values.")
      }
      pop@tab <- tab(pop, freq = freq, NA.method = "zero", quiet = quiet)

      if (freq) {
        warning(freq_warning)
      }
    } else {
      message("\nInvalid type.\n")
    }
  } else {
    if (quiet == FALSE) {
      message("\n No missing values detected.\n")
    }
  }
  return(pop)
}

# ==============================================================================#
#' Remove all non-phylogentically informative loci
#'
#' This function will facilitate in removing phylogenetically uninformative loci
#' from a \code{\linkS4class{genclone}} or \code{\link[adegenet:genind-class]{genind}} object.
#' The user has the ability to define what uninformative means by setting a
#' cutoff value for either percentage of differentiating genotypes or minor
#' allele frequency.
#'
#' @param pop a \code{\linkS4class{genclone}} or \code{\link[adegenet:genind-class]{genind}}
#'   object.
#'
#' @param cutoff \code{numeric}. A number from 0 to 1 defining the minimum
#'   number of differentiating samples.
#'
#' @param MAF \code{numeric}. A number from 0 to 1 defining the minimum minor
#'   allele frequency. This is passed as the \code{thresh} parameter of
#'   \code{\link[adegenet]{isPoly}}.
#'
#' @param quiet \code{logical}. When \code{quiet = TRUE} (default), messages
#'   indicating the loci removed will be printed to screen. When \code{quiet =
#'   FALSE}, nothing will be printed to screen.
#'
#' @return A \code{genind} object with user-defined informative loci.
#'
#' @details This function will remove uninformative loci using a traditional MAF
#'   cutoff (using \code{\link[adegenet]{isPoly}} from \pkg{adegenet}) as well
#'   as analyzing the number of observed genotypes in a locus. This is important
#'   for clonal organisms that can have fixed heterozygous sites not detected by
#'   MAF methods.
#'
#' @note This will have a few side effects that affect certain analyses. First,
#'   the number of multilocus genotypes might be reduced due to the reduced
#'   number of markers (if you are only using a genind object). Second, if you
#'   plan on using this data for analysis of the index of association, be sure
#'   to use the standardized version (rbarD) that corrects for the number of
#'   observed loci.
#'
#' @author Zhian N. Kamvar
#' @examples
#' # We will use a dummy data set to demonstrate how this detects uninformative
#' # loci using both MAF and a cutoff.
#'
#' genos <- c("A/A", "A/B", "A/C", "B/B", "B/C", "C/C")
#'
#' v <- sample(genos, 100, replace = TRUE)
#' w <- c(rep(genos[2], 99), genos[3]) # found by cutoff
#' x <- c(rep(genos[1], 98), genos[3], genos[2]) # found by MAF
#' y <- c(rep(genos[1], 99), genos[2]) # found by both
#' z <- sample(genos, 100, replace = TRUE)
#' dat <- df2genind(data.frame(v = v, w = w, x = x, y = y, z = z), sep = "/")
#'
#' informloci(dat)
#'
#' \dontrun{
#' # Ignore MAF
#' informloci(dat, MAF = 0)
#'
#' # Ignore cutoff
#' informloci(dat, cutoff = 0)
#'
#' # Real data
#' data(H3N2)
#' informloci(H3N2)
#' }
#' @export
#==============================================================================#
#' @importFrom pegas as.loci
informloci <- function(pop, cutoff = 2 / nInd(pop), MAF = 0.01, quiet = FALSE) {
  if (!is.genind(pop)) {
    stop("This function only works on genind objects.")
  }
  noisy <- !isTRUE(quiet)
  MLG <- mlg(pop, quiet = TRUE)
  if (MLG < 3) {
    if (noisy) {
      message("Not enough multilocus genotypes to be meaningful.\n")
    }
    return(pop)
  }
  cutoff <- ifelse(cutoff > 0.5, 1 - cutoff, cutoff)
  MAF <- ifelse(MAF > 0.5, 1 - MAF, MAF)
  min_ind <- round(cutoff * nInd(pop))
  if (noisy) {
    ind <- if (min_ind == 1) "sample" else "samples"
    message("cutoff value: ", cutoff * 100, " % ( ", min_ind, " ", ind, " ).")
    message("MAF         : ", MAF)
  }
  if (pop@type == "PA") {
    # cutoff applies to too many or too few typed individuals in AFLP cases.
    glocivals <- apply(tab(pop), 2, sum) %in% min_ind:(nInd(pop) - min_ind)
  } else {
    genloc <- as.loci(pop)
    the_loci <- attr(genloc, "locicol")
    glocivals <- apply(genloc[the_loci], 2, test_table, min_ind, nInd(pop))
  }

  alocivals <- isPoly(pop, "locus", thres = MAF)
  locivals <- alocivals & glocivals

  if (!isTRUE(quiet)) {
    if (all(locivals == TRUE)) {
      msg <- paste("\nAll sites polymorphic")
    } else if (sum(locivals) < 2) {
      msg <- paste0(
        "\nFewer than 2 loci found informative.",
        "\nPerhaps you should choose a ",
        "lower cutoff value?\nReturning with no changes."
      )
      locivals <- rep(TRUE, nLoc(pop))
    } else {
      msg <- uninformative_loci_message(
        pop,
        glocivals,
        alocivals,
        locivals,
        min_ind,
        MAF
      )
    }
    message(msg)
  }

  if (pop@type == "PA") {
    return(pop[, locivals])
  }
  return(pop[, loc = locNames(pop)[locivals]])
}
#==============================================================================#
#' Recode polyploid microsatellite data for use in frequency based statistics.
#'
#' As the genind object requires ploidy to be consistent across loci, a
#' workaround to importing polyploid data was to code missing alleles as "0"
#' (for microsatellite data sets). The advantage of this is that users would be
#' able to calculate Bruvo's distance, the index of association, and genotypic
#' diversity statistics. The tradeoff was the fact that this broke all other
#' analyses as they relied on allele frequencies and the missing alleles are
#' treated as extra alleles. This function removes those alleles and returns a
#' \code{\linkS4class{genclone}} or \code{\link[adegenet:genind-class]{genind}} object where
#' allele frequencies are coded based on the number of alleles observed at a
#' single locus per individual. See the examples for more details.
#'
#' @param poly a \code{\linkS4class{genclone}}, \code{\link[adegenet:genind-class]{genind}}, or
#'   \code{\link[adegenet:genpop-class]{genpop}} object that has a ploidy of > 2
#' @param newploidy for genind or genclone objects: if \code{FALSE} (default),
#'   the user-defined ploidy will stay constant. if \code{TRUE}, the ploidy for
#'   each sample will be determined by the maximum ploidy observed for each
#'   genotype.
#'
#' @param addzero add zeroes onto genind or genclone objects with uneven ploidy?
#' if \code{TRUE}, objects with uneven ploidies will have zeroes appended to all
#' loci to allow conversion to genpop objects. Defaults to \code{FALSE}.
#'
#' @return a \code{\linkS4class{genclone}}, \code{\link[adegenet:genind-class]{genind}}, or
#'   \code{\link[adegenet:genpop-class]{genpop}} object.
#'
#' @details The genind object has two caveats that make it difficult to work
#'   with polyploid data sets:
#'   \enumerate{
#'     \item ploidy must be constant throughout the data set
#'     \item missing data is treated as "all-or-none"
#'   } In an ideal world, polyploid genotypes would be just as unambiguous as
#'   diploid or haploid genotypes. Unfortunately, the world we live in is far
#'   from ideal and a genotype of AB in a tetraploid organism could be AAAB,
#'   AABB, or ABBB. In order to get polyploid data in to \pkg{adegenet} or
#'   \pkg{poppr}, we must code all loci to have the same number of allelic
#'   states as the ploidy or largest observed heterozygote (if ploidy is
#'   unknown). The way to do this is to insert zeroes to pad the alleles. So, to
#'   import two genotypes of:
#'
#' \tabular{rrrr}{
#' NA \tab 20 \tab 23 \tab 24\cr
#' 20 \tab 24 \tab 26 \tab 43
#' }
#' they should be coded as:
#' \tabular{rrrr}{
#'  0 \tab 20 \tab 23 \tab 24\cr
#' 20 \tab 24 \tab 26 \tab 43
#' }
#' This zero is treated as an extra allele and is represented in the genind object as so:
#' \tabular{rrrrrr}{
#' \strong{0} \tab \strong{20} \tab \strong{23} \tab \strong{24} \tab \strong{26} \tab \strong{43}\cr
#' 1 \tab 1 \tab 1 \tab 1 \tab 0 \tab 0\cr
#' 0 \tab 1 \tab 0 \tab 1 \tab 1 \tab 1
#' }
#'
#'   This function remedies this problem by removing the zero column.
#'   The above table would become:
#'
#' \tabular{rrrrr}{
#' \strong{20} \tab \strong{23} \tab \strong{24} \tab \strong{26} \tab \strong{43}\cr
#' 1 \tab 1 \tab 1 \tab 0 \tab 0\cr
#' 1 \tab 0 \tab 1 \tab 1 \tab 1
#' }
#'
#' With this, the user is able to calculate frequency based statistics on the
#' data set.
#'
#' @note This is an approximation, and a bad one at that. \pkg{Poppr} was not
#' originally intended for polyploids, but with the inclusion of Bruvo's
#' distance, it only made sense to attempt something beyond single use.
#'
#' @author Zhian N. Kamvar
#' @export
#' @examples
#' data(Pinf)
#' iPinf <- recode_polyploids(Pinf)
#'
#' # Note that the difference between the number of alleles.
#' nAll(Pinf)
#' nAll(iPinf)
#'
#' \dontrun{
#' library("ape")
#'
#' # Removing missing data.
#' setPop(Pinf) <- ~Country
#'
#' # Calculating Rogers' distance.
#' rog <- rogers.dist(genind2genpop(Pinf))
#' irog <- rogers.dist(recode_polyploids(genind2genpop(Pinf)))
#'
#' # We will now plot neighbor joining trees. Note the decreased distance in the
#' # original data.
#' plot(nj(rog), type = "unrooted")
#' add.scale.bar(lcol = "red", length = 0.02)
#' plot(nj(irog), type = "unrooted")
#' add.scale.bar(lcol = "red", length = 0.02)
#' }
#==============================================================================#
recode_polyploids <- function(poly, newploidy = FALSE, addzero = FALSE) {
  if (!is.genind(poly) && !is.genpop(poly)) {
    stop("input must be a genind object.")
  } else if (!test_zeroes(poly) && addzero == FALSE) {
    warning("Input is not a polyploid data set, returning original.")
    return(poly)
  }
  if (addzero && is.genind(poly)) {
    if (test_zeroes(poly)) {
      warning(
        "addzero = TRUE, but data already has zeroes. Returning original."
      )
      return(poly)
    }
    polydf <- genind2df(poly, sep = "/", usepop = FALSE)
    maxploid <- max(ploidy(poly))
    polydf <- generate_bruvo_mat(polydf, maxploid, sep = "/")
    res <- df2genind(
      polydf,
      sep = "/",
      ploidy = maxploid,
      pop = pop(poly),
      strata = strata(poly),
      hierarchy = poly@hierarchy
    )
    other(res) <- other(poly)
    if (is.genclone(poly)) {
      res <- as.genclone(res, mlg = poly@mlg)
    }
    return(res)
  }
  MAT <- tab(poly)
  fac <- locFac(poly)

  non_zero_cols_list <- lapply(alleles(poly), function(x) as.numeric(x) > 0)
  non_zero_cols_vector <- unlist(non_zero_cols_list, use.names = FALSE)

  poly@loc.fac <- fac[non_zero_cols_vector]
  poly@loc.n.all <- stats::setNames(
    tabulate(locFac(poly), nbins = nLoc(poly)),
    locNames(poly)
  )
  poly@tab <- MAT[, non_zero_cols_vector, drop = FALSE]
  alleles(poly) <- mapply(
    "[",
    alleles(poly),
    non_zero_cols_list,
    SIMPLIFY = FALSE
  )

  if (newploidy && is.genind(poly)) {
    ploc <- seploc(poly, res.type = "matrix")
    ploid_mat <- vapply(
      ploc,
      FUN = apply,
      FUN.VALUE = integer(nInd(poly)),
      MARGIN = 1,
      sum
    )
    ploidy(poly) <- apply(ploid_mat, MARGIN = 1, max, na.rm = TRUE)
  }
  return(poly)
}


make_haplotypes_genind <- function(gid) {
  ploidy <- max(ploidy(gid))
  if (ploidy == 1) {
    warning("This procedure does not work on haploid data. Returning data.")
    return(gid)
  }
  if (is.null(strata(gid))) {
    warning("No strata found... creating one from the population.")
    strata(gid) <- data.frame(pop = pop(gid))
  }
  addStrata(gid) <- data.frame(Individual = indNames(gid))
  df <- strata(gid)
  df <- df[rep(seq(nrow(df)), each = ploidy), , drop = FALSE]
  newdf <- separate_haplotypes(gid)
  if (ploidy > 2) {
    is_typed <- apply(newdf, 1, function(i) sum(is.na(i))) < nLoc(gid)
    newdf <- newdf[is_typed, , drop = FALSE]
    df <- df[is_typed, , drop = FALSE]
  }
  newgid <- df2genind(newdf, ploidy = 1, strata = df)
  setPop(newgid) <- ~Individual
  return(newgid)
}

make_haplotypes_genlight <- function(gid) {
  ploidy <- max(ploidy(gid))
  if (ploidy == 1) {
    warning("This procedure does not work on haploid data. Returning data.")
    return(gid)
  }
  if (is.null(strata(gid))) {
    warning("No strata found... creating one from the population.")
    strata(gid) <- data.frame(pop = pop(gid))
  }
  if (is.null(indNames(gid))) {
    indNames(gid) <- .genlab("ind", nInd(gid))
  }
  addStrata(gid) <- data.frame(Individual = indNames(gid))
  df <- strata(gid)
  df <- df[rep(seq(nrow(df)), each = ploidy), , drop = FALSE]
  newgid <- lapply(gid@gen, function(x, p) lapply(seq(p), split_gen, x), ploidy)
  newgid <- new(
    "genlight",
    unlist(newgid, recursive = FALSE, use.names = FALSE)
  )
  ploidy(newgid) <- 1L
  alleles(newgid) <- alleles(gid)
  strata(newgid) <- df
  setPop(newgid) <- ~Individual
  return(newgid)
}

split_gen <- function(i, gen) {
  no_snps <- is.null(gen@snp[i][[1L]])
  gen@snp <- if (no_snps) {
    list(as.raw(rep(0L, length(gen@snp[[1L]]))))
  } else {
    gen@snp[i]
  }
  ploidy(gen) <- 1L
  gen
}

Try the poppr package in your browser

Any scripts or data that you put into this service are public.

poppr documentation built on Aug. 24, 2025, 1:09 a.m.