Nothing
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#
# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate
# students at Oregon State University; Jonah C. Brooks, undergraduate student at
# Oregon State University; and Dr. Nik Grünwald, an employee of USDA-ARS.
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for educational, research and non-profit purposes, without fee,
# and without a written agreement is hereby granted, provided that the statement
# above is incorporated into the material, giving appropriate attribution to the
# authors.
#
# Permission to incorporate this software into commercial products may be
# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for
# Commercialization and Corporate Development.
#
# The software program and documentation are supplied "as is", without any
# accompanying services from the USDA or the University. USDA ARS or the
# University do not warrant that the operation of the program will be
# uninterrupted or error-free. The end-user understands that the program was
# developed for research purposes and is advised not to rely exclusively on the
# program for any reason.
#
# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY
# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE
# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#==============================================================================#
#' Remove potential bias caused by cloned genotypes in genind or genclone
#' object.
#'
#' This function removes any duplicated multilocus genotypes from any specified
#' population strata.
#'
#' @param pop a \code{\linkS4class{genind}}, \code{\linkS4class{genclone}}, or
#' \code{\linkS4class{snpclone}} object
#'
#' @param strata a hierarchical formula or numeric vector. This will define the
#' columns of the data frame in the strata slot to use.
#'
#' @param combine \code{logical}. When set to TRUE, the strata will be combined
#' to create a new population for the clone-corrected genind or genclone
#' object.
#'
#' @param keep \code{integer}. When \code{combine} is set to \code{FALSE}, you
#' can use this flag to choose the levels of your population strata. For
#' example: if your clone correction strata is set to "Pop", "Subpop", and
#' "Year", and you want to analyze your populations with respect to year, you
#' can set \code{keep = c(1,3)}.
#'
#' @return a clone corrected \code{\linkS4class{genclone}},
#' \code{\linkS4class{snpclone}}, or \code{\linkS4class{genind}} object.
#'
#' @details This function will clone correct based on the stratification
#' provided. To clone correct indiscriminately of population structure, set
#' \code{strata = NA}.
#'
#'
#' @export
#' @author Zhian N. Kamvar
#' @examples
#' # LOAD A. euteiches data set
#' data(Aeut)
#'
#' # Redefine it as a genclone object
#' Aeut <- as.genclone(Aeut)
#' strata(Aeut) <- other(Aeut)$population_hierarchy[-1]
#'
#' # Check the number of multilocus genotypes
#' mlg(Aeut)
#' popNames(Aeut)
#'
#' # Clone correct at the population level.
#' Aeut.pop <- clonecorrect(Aeut, strata = ~Pop)
#' mlg(Aeut.pop)
#' popNames(Aeut.pop)
#'
#' \dontrun{
#' # Clone correct at the subpopulation level with respect to population and
#' # combine.
#' Aeut.subpop <- clonecorrect(Aeut, strata = ~Pop/Subpop, combine=TRUE)
#' mlg(Aeut.subpop)
#' popNames(Aeut.subpop)
#'
#' # Do the same, but set to the population level.
#' Aeut.subpop2 <- clonecorrect(Aeut, strata = ~Pop/Subpop, keep=1)
#' mlg(Aeut.subpop2)
#' popNames(Aeut.subpop2)
#'
#' # LOAD H3N2 dataset
#' data(H3N2)
#'
#' strata(H3N2) <- other(H3N2)$x
#'
#' # Extract only the individuals located in China
#' country <- clonecorrect(H3N2, strata = ~country)
#'
#' # How many isolates did we have from China before clone correction?
#' sum(strata(H3N2, ~country) == "China") # 155
#'
#' # How many unique isolates from China after clone correction?
#' sum(strata(country, ~country) == "China") # 79
#'
#' # Something a little more complicated. (This could take a few minutes on
#' # slower computers)
#'
#' # setting the hierarchy to be Country > Year > Month
#' c.y.m <- clonecorrect(H3N2, strata = ~year/month/country)
#'
#' # How many isolates in the original data set?
#' nInd(H3N2) # 1903
#'
#' # How many after we clone corrected for country, year, and month?
#' nInd(c.y.m) # 1190
#' }
#==============================================================================#
clonecorrect <- function(pop, strata = 1, combine = FALSE, keep = 1){
if (!is.genind(pop) & !is(pop, "snpclone")){
stop(deparse(substitute(pop)), " is not a genind or snpclone object.\n")
}
if (is.null(strata(pop))){
msg <- paste0("Strata is not set for ", deparse(substitute(pop)),
". Clone correct will be performed without population",
" information.")
warning(msg)
strata <- NA
}
if (is.language(strata)){
strataformula <- strata
strata <- all.vars(strata)
}
if (is.genind(pop)) popcall <- match.call()
if (is.na(strata[1])){
return(pop[.clonecorrector(pop), ])
}
if (is.numeric(strata)){
strata <- names(strata(pop))[strata]
strataformula <- as.formula(paste0("~", paste(strata, collapse = "/")))
}
if (!all(strata %in% names(strata(pop)))){
stop(hier_incompatible_warning(strata, strata(pop)))
}
setPop(pop) <- strataformula
# Corrects the individual names of the object. This is fo the fact that the
# clone corrector relies on the unique individual names for it to work.
if (all(indNames(pop) == "")){
indNames(pop) <- as.character(1:nInd(pop))
}
cpop <- nPop(pop)
# Steps for correction:
# Subset by population factor.
# Run subset population by the .clonecorrector
# Profit!
corWrecked <- function(x, pop){
subbed <- popsub(pop, x) # population to be...corrected.
subbed <- subbed[.clonecorrector(subbed), ]
# Return the indices based off of the individual names.
return(which(indNames(pop) %in% indNames(subbed)))
}
ccpop <- unlist(lapply(1:cpop, corWrecked, pop))
pop <- pop[ccpop, ]
if (!combine){
# When the combine flag is not true, the default is to keep the first level
# of the strata. The keep flag is a numeric vector corresponding to the
# strata flag indicating which levels the user wants to keep.
strata <- strata[keep]
newformula <- as.formula(paste0("~", paste(strata, collapse = "/")))
setPop(pop) <- newformula
}
if (is.genind(pop)){
pop@call <- popcall
}
return(pop)
}
#==============================================================================#
#' Subset data by population
#'
#' Create a new dataset with specified populations or exclude specified
#' populations from the dataset.
#'
#' @param gid a \code{\linkS4class{genind}}, \code{\linkS4class{genclone}},
#' \code{\linkS4class{genlight}}, or \code{\linkS4class{snpclone}} object.
#'
#' @param sublist a \code{vector} of population names or indexes that the user
#' wishes to keep. Default to \code{"ALL"}.
#'
#' @param exclude a \code{vector} of population names or indexes that the user
#' wishes to discard. Default to \code{NULL}.
#'
#' @param blacklist DEPRECATED, use exclude.
#'
#' @param mat a \code{matrix} object produced by \code{\link{mlg.table}} to be
#' subsetted. If this is present, the subsetted matrix will be returned instead
#' of the genind object
#'
#' @param drop \code{logical}. If \code{TRUE}, unvarying alleles will be dropped
#' from the population.
#'
#' @return A \code{genind} object or a matrix.
#' @author Zhian N. Kamvar
#' @examples
#' # Load the dataset microbov.
#' data(microbov)
#'
#' # List the population names.
#' popNames(microbov)
#'
#' # Analyze only the populations with exactly 50 individuals
#' mic.50 <- popsub(microbov, sublist=c(1:6, 11:15), exclude=c(3,4,13,14))
#'
#' \dontrun{
#' # Analyze the first 10 populations, except for "Bazadais"
#' mic.10 <- popsub(microbov, sublist=1:10, exclude="Bazadais")
#'
#' # Take out the two smallest populations
#' micbig <- popsub(microbov, exclude=c("NDama", "Montbeliard"))
#'
#' # Analyze the two largest populations
#' miclrg <- popsub(microbov, sublist=c("BlondeAquitaine", "Charolais"))
#' }
#' @export
#==============================================================================#
popsub <- function(gid, sublist="ALL", exclude=NULL, blacklist=NULL, mat=NULL, drop=TRUE){
if (!is.genind(gid) & !is(gid, "genlight")){
stop("popsub requires a genind or genlight object\n")
}
if (!is.null(blacklist)) {
warning(
option_deprecated(
match.call(),
"blacklist",
"exclude",
"2.8.7.",
"Please use `exclude` in the future"
),
immediate. = TRUE
)
exclude <- blacklist
}
if (is.null(pop(gid))){
if (!is.na(sublist[1]) && sublist[1] != "ALL")
warning("No population structure. Subsetting not taking place.")
return(gid)
}
orig_list <- sublist
popnames <- popNames(gid)
if (toupper(sublist[1]) == "ALL"){
if (is.null(exclude)){
return(gid)
} else {
# filling the sublist with all of the population names.
sublist <- popnames
}
}
# Checking if there are names for the population names.
# If there are none, it will give them names.
if (is.null(names(popnames))){
if (length(popnames) == nPop(gid)){
names(popnames) <- popNames(gid)
} else {
stop("Population names do not match population factors.")
}
}
# Treating anything present in exclude.
if (!is.null(exclude)){
# If both the sublist and exclude are numeric or character.
if (is.numeric(sublist) & is.numeric(exclude) | class(sublist) == class(exclude)){
sublist <- sublist[!sublist %in% exclude]
} else if (is.numeric(sublist) & class(exclude) == "character"){
# if the sublist is numeric and exclude is a character. eg s=1:10, b="USA"
sublist <- sublist[sublist %in% which(!popnames %in% exclude)]
} else {
# no sublist specified. Ideal situation
if(all(popnames %in% sublist)){
sublist <- sublist[-exclude]
} else {
# weird situation where the user will specify a certain sublist, yet
# index the exclude numerically. Interpreted as an index of
# populations in the whole data set as opposed to the sublist.
warning("exclude is numeric. Interpreting exclude as the index of the population in the total data set.")
sublist <- sublist[!sublist %in% popnames[exclude]]
}
}
}
if (!is.null(mat)){
mat <- mat[sublist, , drop=FALSE]
return(mat[, which(colSums(mat) > 0), drop=FALSE])
} else {
# subsetting the population.
if (is.numeric(sublist)){
sublist <- popnames[sublist]
} else {
sublist <- popnames[popnames %in% sublist]
}
sublist <- pop(gid) %in% sublist
if (!any(sublist)){
if (!is.numeric(orig_list) & !any(popNames(gid) %in% orig_list)){
stop(unmatched_pops_warning(popNames(gid), orig_list))
} else {
nothing_warn <- paste("Nothing present in the sublist.\n",
"Perhaps the sublist and exclude arguments have",
"duplicate entries?\n",
"Subset not taking place.")
warning(nothing_warn)
return(gid)
}
}
gid <- gid[sublist, drop = drop]
if (is.genind(gid)){
gid@call <- match.call()
}
return(gid)
}
}
#==============================================================================#
#' Treat missing data
#'
#' missingno gives the user four options to deal with missing data: remove loci,
#' remove samples, replace with zeroes, or replace with average allele counts.
#'
#'@param pop a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}}
#' object.
#'
#'@param type a \code{character} string: can be "ignore", "zero", "mean",
#' "loci", or "geno" (see \code{Details} for definitions).
#'
#'@param cutoff \code{numeric}. A number from 0 to 1 indicating the allowable
#' rate of missing data in either genotypes or loci. This will be ignored for
#' \code{type} values of \code{"mean"} or \code{"zero"}.
#'
#'@param quiet if \code{TRUE}, it will print to the screen the action performed.
#'
#'@param freq defaults to \code{FALSE}. This option is passed on to the
#' \code{\link[adegenet]{tab}} function. If \code{TRUE}, the matrix in the
#' genind object will be replaced by a numeric matrix (as opposed to integer).
#' THIS IS NOT RECOMMENDED. USE THE FUNCTION \code{\link[adegenet]{tab}}
#' instead.
#'
#'@details These methods provide a way to deal with systematic missing data and
#' to give a wrapper for \code{adegenet}'s \code{ \link{tab}} function.
#' ALL OF THESE ARE TO BE USED WITH CAUTION.
#'
#' Using this function with polyploid data (where missing data is coded as "0")
#' may give spurious results.
#'
#' \subsection{Treatment types}{
#' \itemize{
#' \item{\code{"ignore"} - does not remove or replace missing data.}
#' \item{\code{"loci"} - removes all loci containing missing data in the entire
#' data set. }
#' \item{\code{"genotype"} - removes any genotypes/isolates/individuals with
#' missing data.}
#' \item{\code{"mean"} - replaces all NA's with the mean of the alleles for the
#' entire data set.}
#' \item{\code{"zero"} or \code{"0"} - replaces all NA's with "0". Introduces
#' more diversity.}
#' }
#' }
#'@return a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object.
#'
#'@note \emph{"wild missingno appeared!"}
#'
#'@seealso \code{\link[adegenet]{tab}}, \code{\link{poppr}}, \code{\link{poppr.amova}},
#' \code{\link{nei.dist}}, \code{\link{aboot}}
#'
#'@export
#'@author Zhian N. Kamvar
#' @examples
#'
#' data(nancycats)
#'
#' nancy.locina <- missingno(nancycats, type = "loci")
#'
#' ## Found 617 missing values.
#' ## 2 loci contained missing values greater than 5%.
#' ## Removing 2 loci : fca8 fca45
#'
#' nancy.genona <- missingno(nancycats, type = "geno")
#'
#' ## Found 617 missing values.
#' ## 38 genotypes contained missing values greater than 5%.
#' ## Removing 38 genotypes : N215 N216 N188 N189 N190 N191 N192 N302 N304 N310
#' ## N195 N197 N198 N199 N200 N201 N206 N182 N184 N186 N298 N299 N300 N301 N303
#' ## N282 N283 N288 N291 N292 N293 N294 N295 N296 N297 N281 N289 N290
#'
#' # Replacing all NA with "0" (see tab in the adegenet package).
#' nancy.0 <- missingno(nancycats, type = "0")
#'
#' ## Replaced 617 missing values
#'
#' # Replacing all NA with the mean of each column (see tab in the
#' # adegenet package).
#' nancy.mean <- missingno(nancycats, type = "mean")
#'
#' ## Replaced 617 missing values
#==============================================================================#
# ###''
# --,,`
# ##,`+
# #.-`.
# --','-.
# `+&#-+'
# .`','##
# &`''#*'
# `&-'-'#
# '`##.-,
missingno <- function(pop, type = "loci", cutoff = 0.05, quiet=FALSE, freq = FALSE){
if (is(pop, "genlight")){
warning("missingno cannot be applied to genlight objects at this time.")
return(pop)
}
if(sum(is.na(pop@tab)) > 0){
# removes any loci (columns) with missing values.
MISSINGOPTS <- c("loci", "genotypes", "mean", "zero", "0", "ignore", "asis")
freq_warning <- paste("Objects of class 'genind' must have integers in the",
"genotype matrix. Setting freq = TRUE will force the matrix to be numeric.",
"please see help('tab') for alternatives.")
type <- match.arg(tolower(type), MISSINGOPTS)
if (type %in% c("ignore", "asis")){
return(pop)
}
navals <- percent_missing(pop, type = type, cutoff = cutoff)
if (type == "loci"){
if(quiet != TRUE){
if (length(navals) == ncol(tab(pop))){
message("\n No loci with missing values above ",
paste0(cutoff*100,"%")," found.\n")
} else {
remloc <- locNames(pop)[!cumsum(nAll(pop)) %in% navals]
rem <- sum(is.na(tab(pop)))
missing_messenger(remloc, type = c("locus", "loci"), nremoved = rem,
cutoff = cutoff)
}
}
pop <- pop[, navals]
} else if (type == "genotypes"){
if(quiet != TRUE){
if (length(navals) == nInd(pop)){
message("\n No genotypes with missing values above ",
paste0(cutoff*100, "%")," found.\n")
} else {
remgeno <- indNames(pop)[-navals]
rem <- sum(is.na(tab(pop)))
missing_messenger(remgeno, type = c("genotype", "genotypes"),
nremoved = rem, cutoff = cutoff)
}
}
pop <- pop[navals, ]
} else if (type == "mean"){
if (!quiet){
message("\n Replaced ", sum(is.na(tab(pop)))," missing values.")
}
pop@tab <- tab(pop, freq = freq, NA.method = "mean", quiet=quiet)
if (freq){
warning(freq_warning)
}
}
# changes all NA's to 0. NOT RECOMMENDED. INTRODUCES MORE DIVERSITY.
else if (type %in% c("zero","0")){
if (!quiet){
message("\n Replaced ", sum(is.na(tab(pop)))," missing values.")
}
pop@tab <- tab(pop, freq = freq, NA.method = "zero", quiet=quiet)
if (freq){
warning(freq_warning)
}
} else {
message("\nInvalid type.\n")
}
} else {
if(quiet == FALSE){
message("\n No missing values detected.\n")
}
}
return(pop)
}
#==============================================================================#
#' Remove all non-phylogentically informative loci
#'
#' This function will facilitate in removing phylogenetically uninformative loci
#' from a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object.
#' The user has the ability to define what uninformative means by setting a
#' cutoff value for either percentage of differentiating genotypes or minor
#' allele frequency.
#'
#' @param pop a \code{\linkS4class{genclone}} or \code{\linkS4class{genind}}
#' object.
#'
#' @param cutoff \code{numeric}. A number from 0 to 1 defining the minimum
#' number of differentiating samples.
#'
#' @param MAF \code{numeric}. A number from 0 to 1 defining the minimum minor
#' allele frequency. This is passed as the \code{thresh} parameter of
#' \code{\link[adegenet]{isPoly}}.
#'
#' @param quiet \code{logical}. When \code{quiet = TRUE} (default), messages
#' indicating the loci removed will be printed to screen. When \code{quiet =
#' FALSE}, nothing will be printed to screen.
#'
#' @return A \code{genind} object with user-defined informative loci.
#'
#' @details This function will remove uninformative loci using a traditional MAF
#' cutoff (using \code{\link[adegenet]{isPoly}} from \pkg{adegenet}) as well
#' as analyzing the number of observed genotypes in a locus. This is important
#' for clonal organisms that can have fixed heterozygous sites not detected by
#' MAF methods.
#'
#' @note This will have a few side effects that affect certain analyses. First,
#' the number of multilocus genotypes might be reduced due to the reduced
#' number of markers (if you are only using a genind object). Second, if you
#' plan on using this data for analysis of the index of association, be sure
#' to use the standardized version (rbarD) that corrects for the number of
#' observed loci.
#'
#' @author Zhian N. Kamvar
#' @examples
#' # We will use a dummy data set to demonstrate how this detects uninformative
#' # loci using both MAF and a cutoff.
#'
#' genos <- c("A/A", "A/B", "A/C", "B/B", "B/C", "C/C")
#'
#' v <- sample(genos, 100, replace = TRUE)
#' w <- c(rep(genos[2], 99), genos[3]) # found by cutoff
#' x <- c(rep(genos[1], 98), genos[3], genos[2]) # found by MAF
#' y <- c(rep(genos[1], 99), genos[2]) # found by both
#' z <- sample(genos, 100, replace = TRUE)
#' dat <- df2genind(data.frame(v = v, w = w, x = x, y = y, z = z), sep = "/")
#'
#' informloci(dat)
#'
#' \dontrun{
#' # Ignore MAF
#' informloci(dat, MAF = 0)
#'
#' # Ignore cutoff
#' informloci(dat, cutoff = 0)
#'
#' # Real data
#' data(H3N2)
#' informloci(H3N2)
#'
#' }
#' @export
#==============================================================================#
#' @importFrom pegas as.loci
informloci <- function(pop, cutoff = 2/nInd(pop), MAF = 0.01, quiet = FALSE){
if (!is.genind(pop)){
stop("This function only works on genind objects.")
}
MLG <- mlg(pop, quiet = TRUE)
if (MLG < 3){
if(!isTRUE(quiet)){
cat("Not enough multilocus genotypes to be meaningful.\n")
}
return(pop)
}
cutoff <- ifelse(cutoff > 0.5, 1 - cutoff, cutoff)
MAF <- ifelse(MAF > 0.5, 1 - MAF, MAF)
min_ind = round(cutoff * nInd(pop))
if (!isTRUE(quiet)){
ind <- ifelse(min_ind == 1, "sample", "samples")
message("cutoff value: ", cutoff*100, " % ( ",min_ind, " ", ind," ).")
message("MAF : ", MAF)
}
if (pop@type == "PA"){
# cutoff applies to too many or too few typed individuals in AFLP cases.
glocivals <- apply(tab(pop), 2, sum) %in% min_ind:(nInd(pop) - min_ind)
} else {
genloc <- as.loci(pop)
the_loci <- attr(genloc, "locicol")
glocivals <- apply(genloc[the_loci], 2, test_table, min_ind, nInd(pop))
}
alocivals <- isPoly(pop, "locus", thres = MAF)
locivals <- alocivals & glocivals
if (all(locivals == TRUE)){
msg <- paste("\nAll sites polymorphic")
} else if (sum(locivals) < 2){
msg <- paste0("\nFewer than 2 loci found informative.",
"\nPerhaps you should choose a ",
"lower cutoff value?\nReturning with no changes.")
locivals <- rep(TRUE, nLoc(pop))
} else {
msg <- uninformative_loci_message(pop, glocivals, alocivals, locivals,
min_ind, ind, MAF)
# glocsum <- sum(!glocivals)
# alocsum <- sum(!alocivals)
# locsum <- sum(!locivals)
# fmsg <- paste("Found", locsum, "uninformative",
# ifelse(locsum != 1, "loci", "locus"), "\n",
# "============================")
# gmsg <- paste(glocsum,
# ifelse(glocsum != 1, "loci", "locus"), "found with",
# "a cutoff of", min_ind, ind,
# ifelse(glocsum == 0, "", ":\n"),
# paste(locNames(pop)[!glocivals], collapse = ", "))
# amsg <- paste(alocsum,
# ifelse(alocsum != 1, "loci", "locus"),
# "found with MAF <", signif(MAF, 3),
# ifelse(alocsum == 0, "", ":\n"),
# paste(locNames(pop)[!alocivals], collapse = ", "))
# msg <- paste("\n", fmsg, "\n", gmsg, "\n", amsg)
}
if (!isTRUE(quiet)){
message(msg)
}
if (pop@type == "PA"){
return(pop[, locivals])
}
return(pop[, loc = locNames(pop)[locivals]])
}
#==============================================================================#
#' Recode polyploid microsatellite data for use in frequency based statistics.
#'
#' As the genind object requires ploidy to be consistent across loci, a
#' workaround to importing polyploid data was to code missing alleles as "0"
#' (for microsatellite data sets). The advantage of this is that users would be
#' able to calculate Bruvo's distance, the index of association, and genotypic
#' diversity statistics. The tradeoff was the fact that this broke all other
#' analyses as they relied on allele frequencies and the missing alleles are
#' treated as extra alleles. This function removes those alleles and returns a
#' \code{\linkS4class{genclone}} or \code{\linkS4class{genind}} object where
#' allele frequencies are coded based on the number of alleles observed at a
#' single locus per individual. See the examples for more details.
#'
#' @param poly a \code{\linkS4class{genclone}}, \code{\linkS4class{genind}}, or
#' \code{\linkS4class{genpop}} object that has a ploidy of > 2
#' @param newploidy for genind or genclone objects: if \code{FALSE} (default),
#' the user-defined ploidy will stay constant. if \code{TRUE}, the ploidy for
#' each sample will be determined by the maximum ploidy observed for each
#' genotype.
#'
#' @param addzero add zeroes onto genind or genclone objects with uneven ploidy?
#' if \code{TRUE}, objects with uneven ploidies will have zeroes appended to all
#' loci to allow conversion to genpop objects. Defaults to \code{FALSE}.
#'
#' @return a \code{\linkS4class{genclone}}, \code{\linkS4class{genind}}, or
#' \code{\linkS4class{genpop}} object.
#'
#' @details The genind object has two caveats that make it difficult to work
#' with polyploid data sets:
#' \enumerate{
#' \item ploidy must be constant throughout the data set
#' \item missing data is treated as "all-or-none"
#' } In an ideal world, polyploid genotypes would be just as unambiguous as
#' diploid or haploid genotypes. Unfortunately, the world we live in is far
#' from ideal and a genotype of AB in a tetraploid organism could be AAAB,
#' AABB, or ABBB. In order to get polyploid data in to \pkg{adegenet} or
#' \pkg{poppr}, we must code all loci to have the same number of allelic
#' states as the ploidy or largest observed heterozygote (if ploidy is
#' unknown). The way to do this is to insert zeroes to pad the alleles. So, to
#' import two genotypes of:
#'
#' \tabular{rrrr}{
#' NA \tab 20 \tab 23 \tab 24\cr
#' 20 \tab 24 \tab 26 \tab 43
#' }
#' they should be coded as:
#' \tabular{rrrr}{
#' 0 \tab 20 \tab 23 \tab 24\cr
#' 20 \tab 24 \tab 26 \tab 43
#' }
#' This zero is treated as an extra allele and is represented in the genind object as so:
#' \tabular{rrrrrr}{
#' \strong{0} \tab \strong{20} \tab \strong{23} \tab \strong{24} \tab \strong{26} \tab \strong{43}\cr
#' 1 \tab 1 \tab 1 \tab 1 \tab 0 \tab 0\cr
#' 0 \tab 1 \tab 0 \tab 1 \tab 1 \tab 1
#' }
#'
#' This function remedies this problem by removing the zero column.
#' The above table would become:
#'
#' \tabular{rrrrr}{
#' \strong{20} \tab \strong{23} \tab \strong{24} \tab \strong{26} \tab \strong{43}\cr
#' 1 \tab 1 \tab 1 \tab 0 \tab 0\cr
#' 1 \tab 0 \tab 1 \tab 1 \tab 1
#' }
#'
#' With this, the user is able to calculate frequency based statistics on the
#' data set.
#'
#' @note This is an approximation, and a bad one at that. \pkg{Poppr} was not
#' originally intended for polyploids, but with the inclusion of Bruvo's
#' distance, it only made sense to attempt something beyond single use.
#'
#' @author Zhian N. Kamvar
#' @export
#' @examples
#' data(Pinf)
#' iPinf <- recode_polyploids(Pinf)
#'
#' # Note that the difference between the number of alleles.
#' nAll(Pinf)
#' nAll(iPinf)
#'
#' \dontrun{
#' library("ape")
#'
#' # Removing missing data.
#' setPop(Pinf) <- ~Country
#'
#' # Calculating Rogers' distance.
#' rog <- rogers.dist(genind2genpop(Pinf))
#' irog <- rogers.dist(recode_polyploids(genind2genpop(Pinf)))
#'
#' # We will now plot neighbor joining trees. Note the decreased distance in the
#' # original data.
#' plot(nj(rog), type = "unrooted")
#' add.scale.bar(lcol = "red", length = 0.02)
#' plot(nj(irog), type = "unrooted")
#' add.scale.bar(lcol = "red", length = 0.02)
#' }
#==============================================================================#
recode_polyploids <- function(poly, newploidy = FALSE, addzero = FALSE){
if (!is.genind(poly) && !is.genpop(poly)){
stop("input must be a genind object.")
} else if (!test_zeroes(poly) && addzero == FALSE){
warning("Input is not a polyploid data set, returning original.")
return(poly)
}
if (addzero && is.genind(poly)) {
if (test_zeroes(poly)) {
warning("addzero = TRUE, but data already has zeroes. Returning original.")
return(poly)
}
polydf <- genind2df(poly, sep = "/", usepop = FALSE)
maxploid <- max(ploidy(poly))
polydf <- generate_bruvo_mat(polydf, maxploid, sep = "/")
res <- df2genind(polydf, sep = "/", ploidy = maxploid,
pop = pop(poly), strata = strata(poly),
hierarchy = poly@hierarchy)
other(res) <- other(poly)
if (is.genclone(poly)) {
res <- as.genclone(res, mlg = poly@mlg)
}
return(res)
}
MAT <- tab(poly)
fac <- locFac(poly)
non_zero_cols_list <- lapply(alleles(poly), function(x) as.numeric(x) > 0)
non_zero_cols_vector <- unlist(non_zero_cols_list, use.names = FALSE)
poly@loc.fac <- fac[non_zero_cols_vector]
poly@loc.n.all <- stats::setNames(tabulate(locFac(poly), nbins = nLoc(poly)),
locNames(poly))
poly@tab <- MAT[, non_zero_cols_vector, drop = FALSE]
alleles(poly) <- mapply("[", alleles(poly), non_zero_cols_list,
SIMPLIFY = FALSE)
if (newploidy && is.genind(poly)){
ploc <- seploc(poly, res.type = "matrix")
ploid_mat <- vapply(ploc, FUN = apply, FUN.VALUE = integer(nInd(poly)),
MARGIN = 1, sum)
ploidy(poly) <- apply(ploid_mat, MARGIN = 1, max, na.rm = TRUE)
}
return(poly)
}
make_haplotypes_genind <- function(gid) {
ploidy <- max(ploidy(gid))
if (ploidy == 1) {
warning("This procedure does not work on haploid data. Returning data.")
return(gid)
}
if (is.null(strata(gid))) {
warning("No strata found... creating one from the population.")
strata(gid) <- data.frame(pop = pop(gid))
}
addStrata(gid) <- data.frame(Individual = indNames(gid))
df <- strata(gid)
df <- df[rep(seq(nrow(df)), each = ploidy), , drop = FALSE]
newdf <- separate_haplotypes(gid)
if (ploidy > 2) {
is_typed <- apply(newdf, 1, function(i) sum(is.na(i))) < nLoc(gid)
newdf <- newdf[is_typed, , drop = FALSE]
df <- df[is_typed, , drop = FALSE]
}
newgid <- df2genind(newdf, ploidy = 1, strata = df)
setPop(newgid) <- ~Individual
return(newgid)
}
make_haplotypes_genlight <- function(gid) {
ploidy <- max(ploidy(gid))
if (ploidy == 1) {
warning("This procedure does not work on haploid data. Returning data.")
return(gid)
}
if (is.null(strata(gid))) {
warning("No strata found... creating one from the population.")
strata(gid) <- data.frame(pop = pop(gid))
}
if (is.null(indNames(gid))) {
indNames(gid) <- .genlab("ind", nInd(gid))
}
addStrata(gid) <- data.frame(Individual = indNames(gid))
df <- strata(gid)
df <- df[rep(seq(nrow(df)), each = ploidy), , drop = FALSE]
newgid <- lapply(gid@gen, function(x, p) lapply(seq(p), split_gen, x), ploidy)
newgid <- new("genlight", unlist(newgid, recursive = FALSE, use.names = FALSE))
ploidy(newgid) <- 1L
alleles(newgid) <- alleles(gid)
strata(newgid) <- df
setPop(newgid) <- ~Individual
return(newgid)
}
split_gen <- function(i, gen) {
no_snps <- is.null(gen@snp[i][[1L]])
gen@snp <- if (no_snps) list(as.raw(rep(0L, length(gen@snp[[1L]])))) else gen@snp[i]
ploidy(gen) <- 1L
gen
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.