#'Subset snpRdata objects
#'
#'Subsets snpRdata objects by specific snps, samples, facets, subfacets, etc.
#'Statistics will need to be recalculated after subsetting to avoid confusion.
#'The bracket operators can be alternatively, following the same syntax.
#'
#'Sample and snp facets to subset over can be provided. Filtering by facet
#'categories is performed by naming the facet as an argument name, then
#'providing the levels to keep or remove to that argument. See examples. Facets
#'designated as described in \code{\link{Facets_in_snpR}}.
#'
#'@param x snpRdata object.
#'@param .snps numeric, default \code{1:nrow(x)}. Row numbers corresponding to
#' SNPs to keep. Negative subscripts allowed.
#'@param .samps numeric, default \code{1:ncol(x)}. Column numbers corresponding
#' to samples to keep. Negative subscripts allowed.
#'@param i numeric, default \code{1:nrow(x)}. Row numbers corresponding to SNPs
#' to keep. Negative subscripts allowed.
#'@param j numeric, default \code{1:nrow(x)}. Row numbers corresponding to SNPs
#' to keep. Negative subscripts allowed.
#'@param ... Facet subsetting may be specified by providing the facet as an
#' argument and then providing the levels to keep or remove. Setting pop =
#' "ASP", for example, would keep only samples in the "ASP" level of the pop
#' facet, and setting pop.fam = "ASP.A" would keep only samples the ASP pop and
#' the A fam. Negative subscripts are allowed: pop = -c("ASP", "PAL") would
#' remove samples in the ASP or PAL pops. Subsetting by multiple facets is
#' supported, although negative and positive subscripts cannot be mixed across
#' sample or SNP facets. They may be mixed between the two.
#'@param .facets Character, default NULL. Sample facet to select by. Alternative
#' to direct specification as in \code{pop = "ASP"}.
#'@param .subfacets Character, default NULL. Sample facet level matching a level
#' of the provided \code{.facets}. Alternative to direct specification as in
#' \code{pop = "ASP"}.
#'@param .snp.facets Character, default NULL. SNP facet to select by.
#' Alternative to direct specification as in \code{chr = "groupIX"}.
#'@param .snp.subfacets Character, default NULL. SNP facet level matching a
#' level of the provided \code{.snp.facets}. Alternative to direct
#' specification as in \code{chr = "groupIX"}.
#'@param drop logical, default FALSE. Deprecated.
#'
#'@name subset_snpRdata
#'@aliases subset_snpR_data [
#'
#' @examples
#' # Keep only individuals in the ASP and PAL populations
#' # and on the LGIX or LGIV chromosome.
#' subset_snpR_data(stickSNPs, pop = c("ASP", "PAL"),
#' chr = c("groupIX", "groupIV"))
#'
#' # keep individuals/SNPs in the first 10 columns/rows.
#' subset_snpR_data(stickSNPs, 1:10, 1:10)
#'
#' # negative subscripts: remove individuals in family B
#' subset_snpR_data(stickSNPs, fam = -"B")
#'
#' # negative subscripts: remove individuals in pop PAL AND fam B or C,
#' # and keep only SNPs on LGIV
#' subset_snpR_data(stickSNPs, pop.fam = -c("PAL.B", "PAL.C"),
#' chr = "groupIV")
#'
#' # using the bracket operator, same as example 1
#' stickSNPs[pop = c("ASP", "PAL"), chr = c("groupIX", "groupIV")]
#'
#' # bracket operator, excluding first ten SNPs and only keeping pop = PAL,
#' # fam = B
#' stickSNPs[-c(1:10), pop.fam = c("PAL.B")]
#'
#' # using the .facet etc arguments, useful when the facet is stored as an
#' # object
#' target_facet <- "pop"
#' target_subfacet <- "ASP"
#' subset_snpR_data(stickSNPs,
#' .facets = target_facet,
#' .subfacets = target_subfacet)
NULL
#' @export
#' @describeIn subset_snpRdata subset_snpR_data
subset_snpR_data <- function(x, .snps = 1:nsnps(x), .samps = 1:nsamps(x), ..., .facets = NULL, .subfacets = NULL, .snp.facets = NULL, .snp.subfacets = NULL){
#============extract facet info===============
.argnames <- match.call()
.argnames <- as.list(.argnames)
.argnames <- .argnames[-1]
.is.facet <- which(!names(.argnames) %in% c("x", ".snps", ".samps", ".facets", ".subfacets", ".snp.facets", ".snp.subfacets"))
if(length(.is.facet) > 0){
if(any(!is.null(.facets), !is.null(.subfacets),
!is.null(.snp.facets), !is.null(.snp.subfacets))){
stop("The '.facets', '.subfacets', '.snp.facets', or '.snp.subfacets' arguments cannot be used with arguments other than '.snps' or '.samps' (such as pop = 'ASP'). If you wish to subset using the 'facet = subfacet' approach, please do not use these arguments.\n")
}
facets <- names(.argnames)[.is.facet]
# eval the facets
## check for negatives, fix, and note
.has.negatives <- grep("-", .argnames[.is.facet])
.argnames[.is.facet][.has.negatives] <- lapply(.argnames[.is.facet][.has.negatives], function(x){
x <- gsub("-", "", x)
if(any(length(x) > 2)){stop("Negative subscripts cannot be provided inside c() calls. Please place negative subscripts outside c() calls.\n")}
return(x[2])
} )
.single.part <- !grepl("^c\\(", .argnames[.is.facet][.has.negatives])
.argnames[.is.facet][.has.negatives][!.single.part] <- lapply(.argnames[.is.facet][.has.negatives][!.single.part], str2lang)
## eval
for(.candidate_facets in 1:length(.is.facet)){
if(any(names(.argnames[.is.facet][.candidate_facets]) %in% c("facet", "subfacet", "facets", "subfacets", "snp.facet", "snp.subfacet", "snp.subfacets", "snp.facets"))){
stop("Facets and subfacets are now desginated directly using, for example, pop = c('my.pop1', 'my.pop2'), not using the 'facet', 'subfacet', etc arguments.\n")
}
# try to eval text in two ways: once for a basic string, once for code the evaluates to a string
.res1 <- try(list(eval(parse(text = .argnames[.is.facet][.candidate_facets]))), silent = TRUE)
if(methods::is(.res1, "try-error")){
.argnames[.is.facet][.candidate_facets] <- list(eval(parse(text = paste0("c(\"", .argnames[.is.facet][.candidate_facets], "\")"))))
}
else{
.argnames[.is.facet][.candidate_facets] <- .res1
}
}
}
else if(any(!is.null(.facets), !is.null(.subfacets),
!is.null(.snp.facets), !is.null(.snp.subfacets))){
return(.subset_snpR_data(x,
snps = .snps,
samps = .samps,
facets = .facets,
subfacets = .subfacets,
snp.facets = .snp.facets,
snp.subfacets = .snp.subfacets))
}
else{
facets <- NULL
}
#===========sanity checks====================
if(!is.snpRdata(x)){
stop("x must be a snpRdata object.\n")
}
msg <- character(0)
# check facet types
if(!is.null(facets)){
if(any(facets %in% c("facet", "subfacet", "facets", "subfacets", "snp.facet", "snp.subfacet", "snp.subfacets", "snp.facets"))){
stop("Facets and subfacets are now desginated directly using, for example, pop = c('my.pop1', 'my.pop2'), not using the 'facet', 'subfacet', etc arguments. If you wish to subset using the facet = 'pop', subfacet = 'ASP'-style approach, please use the '.facets', '.subfacets', '.snp.facets', or '.snp.subfacets' arguments.\n")
}
facets <- .check.snpR.facet.request(x, facets, "none", TRUE)
if(any(c("sample", "complex") %in% facets[[2]]) & !identical(.samps, 1:nsamps(x))){
msg <- c(msg, "Sample level facets cannot be provided alongside a vector of samples to retain/remove.\n")
}
if(any(c("snp", "complex") %in% facets[[2]]) %in% facets[[2]] & !identical(.snps, 1:nsnps(x))){
msg <- c(msg, "SNP level facets cannot be provided alongside a vector of SNPs to retain/remove.\n")
}
if("complex" %in% facets[[2]]){
msg <- c(msg, "Complex (sample + SNP) facets cannot be used for subsetting. Please split into SNP and sample components and rerun.\n")
}
# if(length(facets[[1]]) > 1){
# valid <- logical(length(facets[[1]]))
# for(i in 1:length(facets[[1]])){
# valid[i] <- ifelse(length(grep(facets[[1]][i], facets[[1]][-i])) == 0, TRUE, FALSE)
# }
#
# if(sum(valid) < length(valid)){
# msg <- c(msg, paste0("Some facets are listed more than once amongst arguments: ", paste0(facets[[1]][which(!valid)], collapse = ", "),
# ". Each facet may only be listed once. Joint facets (such as pop.fam) will return only cases true in both, seperate facets will return cases true in either.\n"))
# }
# }
}
# .snps and .samps
if(is.logical(.snps)){
if(length(.snps) != nsnps(x)){
if(length(.snps) > nsnps(x)){
warning("More .snps/j requested than present in data")
}
if(length(.snps) %% nsnps != 0){
warning("Length of .snps/i is not a multiple of number of SNPs in x.\n")
}
.snps <- rep(.snps, length.out = nsnps(x))
}
.snps <- which(.snps)
}
if(is.logical(.samps)){
if(length(.samps) != nsamps(x)){
if(length(.samps) > nsamps(x)){
warning("More .samps/j requested than present in data")
}
if(length(.samps) %% nsamps != 0){
warning("Length of .samps/i is not a multiple of number of samples in x.\n")
}
.samps <- rep(.samps, length.out = nsamps(x))
}
.samps <- which(.samps)
}
if(!is.numeric(.snps) & !is.integer(.snps)){
msg <- c(msg, ".snps/i must be a numeric vector.\n")
}
else{
if(!all(.snps == as.integer(.snps))){
msg <- c(msg, ".snps/i must only contain integers.\n")
}
if(max(.snps) > nrow(x) | min(.snps) == 0){
msg <- c(msg, "All requested snps must be within 1:nsnps(x).\n")
}
}
if(!is.numeric(.samps) & !is.integer(.samps)){
msg <- c(msg, ".samps/j must be a numeric vector.\n")
}
else{
if(!all(.samps == as.integer(.samps))){
msg <- c(msg, ".samps/j must only contain integers.\n")
}
if(max(.samps) > ncol(x) | min(.samps) == 0){
msg <- c(msg, "All requested samples must be within 1:nsnps(x).\n")
}
}
if(length(msg) > 0){
stop(msg)
}
#===========subset===========================
if(!is.null(facets)){
# check to see if there are mixed negative and positive subsets
if(any(facets[[2]] == "snp")){
if(!all(which(facets[[2]] == "snp") %in% .has.negatives) & !all(!which(facets[[2]] == "snp") %in% .has.negatives)){
stop("Cannot mix negative and positive subscripts in SNP subfacets.\n")
}
}
if(any(facets[[2]] == "sample")){
if(!all(which(facets[[2]] == "sample") %in% .has.negatives) & !all(!which(facets[[2]] == "sample") %in% .has.negatives)){
stop("Cannot mix negative and positive subscripts in sample subfacets.\n")
}
}
# check for flipped facet orders
ord.flipped <- which(facets[[1]] != names(.argnames[.is.facet]))
## if any flipped, need to flip subfacets as well
if(any(ord.flipped)){
for(i in 1:length(ord.flipped)){
split.ord.facet <- unlist(.split.facet(facets[[1]][i]))
split.raw.facet <- unlist(.split.facet(names(.argnames[.is.facet])[i]))
re_ord <- match(split.raw.facet, split.ord.facet)
for(j in 1:length(.argnames[.is.facet][[i]])){
split.lev <- unlist(.split.facet(.argnames[.is.facet][[i]][j]))
.argnames[.is.facet][[i]][j] <- paste0(split.lev[re_ord], collapse = ".")
}
}
}
}
#====================snps====================
if("snp" %in% facets[[2]]){
if(any(which(facets[[2]] == "snp") %in% .has.negatives)){
.snps <- rep(TRUE, nsnps(x))
}
else{
.snps <- logical(nsnps(x))
}
snp.facets <- facets[[1]][which(facets[[2]] == "snp")]
for(i in 1:length(snp.facets)){
snp.subfacets <- .argnames[[.is.facet[which(facets[[2]] == "snp")][i]]]
for(j in 1:length(snp.subfacets)){
new.matches <- .fetch.snp.meta.matching.task.list(x, c(NA, NA, snp.facets[i], snp.subfacets[j]))
if(length(new.matches) == 0){
stop(paste0("No snp found matching: ", snp.facets[i], " -- ", snp.subfacets[j], "\n"))
}
if(which(facets[[2]] == "snp")[i] %in% .has.negatives){
.snps[new.matches] <- FALSE
}
else{
.snps[new.matches] <- TRUE
}
}
}
.snps <- which(.snps)
}
if("sample" %in% facets[[2]]){
if(any(which(facets[[2]] == "sample") %in% .has.negatives)){
.samps <- rep(TRUE, nsamps(x))
}
else{
.samps <- logical(nsamps(x))
}
samp.facets <- facets[[1]][which(facets[[2]] == "sample")]
for(i in 1:length(samp.facets)){
samp.subfacets <- .argnames[[.is.facet[which(facets[[2]] == "sample")][i]]]
for(j in 1:length(samp.subfacets)){
new.matches <- .fetch.sample.meta.matching.task.list(x, c(samp.facets[i], samp.subfacets[j], NA, NA))
if(length(new.matches) == 0){
stop(paste0("No sample found matching: ", samp.facets[i], " -- ", samp.subfacets[j], "\n"))
}
if(which(facets[[2]] == "sample")[i] %in% .has.negatives){
.samps[new.matches] <- FALSE
}
else{
.samps[new.matches] <- TRUE
}
}
}
.samps <- which(.samps)
}
#===============return================
# check that we still have data
if(length(.snps) == 0){
stop("No SNPs remain after subsetting!\n")
}
if(length(.samps) == 0){
stop("No samples remain after subsetting!\n")
}
nsampm <- sample.meta(x)[.samps,]
nsnpm <- snp.meta(x)[.snps,]
r <- try(x@filters, silent = TRUE)
if(methods::is(r, "try-error")){
return(import.snpR.data(genotypes(x)[.snps, .samps, drop = FALSE], snp.meta = nsnpm,
sample.meta = nsampm, mDat = x@mDat,
.skip_filters = TRUE))
}
else{
return(import.snpR.data(genotypes(x)[.snps, .samps, drop = FALSE], snp.meta = nsnpm,
sample.meta = nsampm, mDat = x@mDat, .pass_filters = x@filters,
.skip_filters = TRUE))
}
}
# Old subset version with different facet specification. For internal use.
.subset_snpR_data <- function(x, snps = 1:nrow(x), samps = 1:ncol(x), facets = NULL, subfacets = NULL, snp.facets = NULL, snp.subfacets = NULL){
#=========sanity checks========
if(!is.snpRdata(x)){
stop("x must be a snpRdata object.\n")
}
msg <- character(0)
if(max(snps) > nrow(x)){
msg <- c(msg, "All requested snps must be within 1:nrow(x).\n")
}
if(max(samps) > ncol(x)){
msg <- c(msg, "All requested samps must be within 1:ncol(x).\n")
}
if(length(msg) > 0){
stop(msg)
}
#=========subfunctions=========
fix.for.one.snp <- function(x){
if(nrow(x) == 1){
# fix geno tables
x@geno.tables <- lapply(x@geno.tables, FUN = function(y){
a <- matrix(y, nrow = 1)
colnames(a) <- names(y)
return(a)}
)
}
return(x)
}
#=========run subset===========
# if subfacets or snp.subfacets were selected, figure out which samples and loci to keep
if(!(is.null(snp.facets[1])) & !(is.null(snp.subfacets[1])) | !(is.null(facets[1])) & !(is.null(subfacets[1]))){
# if snp.subfacets are requested
if(!(is.null(snp.facets[1])) & !(is.null(snp.subfacets[1]))){
if(!(any(snp.subfacets == ".base"))){
t.snp.meta <- x@snp.meta
# check for and get info on complex facets
complex.snp.facets <- snp.facets[grep("(?<!^)\\.", snp.facets, perl = T)]
if(length(complex.snp.facets) > 0){
for(i in 1:length(complex.snp.facets)){
tfacets <- unlist(.split.facet(complex.snp.facets[i]))
tcols <- t.snp.meta[colnames(t.snp.meta) %in% tfacets]
t.snp.meta <- cbind(t.snp.meta, .paste.by.facet(tcols, match(colnames(tcols), tfacets)))
}
colnames(t.snp.meta)[(ncol(t.snp.meta) - length(complex.snp.facets) + 1):ncol(t.snp.meta)] <- complex.snp.facets
}
# get the snps to keep
t.snp.meta <- t.snp.meta[,colnames(t.snp.meta) %in% snp.facets]
fsnps <- which(as.logical(rowSums(matrix(as.matrix(t.snp.meta) %in% snp.subfacets, nrow(x@snp.meta))))) # here's the snps to keep, those where at least one subfacet level is present in the provided snp.subfacets.
snps <- snps[snps %in% fsnps]
}
}
# if sample subfacets are requested
if(!(is.null(facets[1])) & !(is.null(subfacets[1]))){
if(!any(subfacets == ".base")){
t.samp.meta <- x@sample.meta
# check for and get info on complex facets
complex.samp.facets <- facets[grep("(?<!^)\\.", facets, perl = T)]
if(length(complex.samp.facets) > 0){
for(i in 1:length(complex.samp.facets)){
tfacets <- unlist(.split.facet(complex.samp.facets[i]))
tcols <- t.samp.meta[colnames(t.samp.meta) %in% tfacets]
t.samp.meta <- cbind(t.samp.meta, .paste.by.facet(tcols, match(colnames(tcols), tfacets)))
}
colnames(t.samp.meta)[(ncol(t.samp.meta) - length(complex.samp.facets) + 1):ncol(t.samp.meta)] <- complex.samp.facets
}
# get the samples to keep
t.samp.meta <- t.samp.meta[,colnames(t.samp.meta) %in% facets]
fsamps <- which(as.logical(rowSums(matrix(as.matrix(t.samp.meta) %in% subfacets, nrow(x@sample.meta))))) # here's the samples to keep, those where at least one subfacet level is present in the provided sample subfacets.
samps <- samps[samps %in% fsamps]
}
}
}
# sort snps and samps according to sample/snp id
snps <- snps[order(x@snp.meta$.snp.id[snps])]
samps <- samps[order(x@sample.meta$.sample.id[samps])]
# check that we still have data
if(length(snps) == 0){
stop("No SNPs remain after subsetting!\n")
}
if(length(samps) == 0){
stop("No samples remain after subsetting!\n")
}
# subset
if(!identical(samps, 1:ncol(x))){
dat <- genotypes(x)[snps, samps]
if(length(samps) == 1){
dat <- as.data.frame(dat, stringsAsFactors = F)
}
dat <- import.snpR.data(dat, snp.meta = snp.meta(x)[snps,,drop = FALSE], sample.meta = sample.meta(x)[samps,,drop = FALSE], mDat = x@mDat)
if(any(x@facets != ".base")){
dat <- .add.facets.snpR.data(dat, x@facets[-which(x@facets == ".base")])
}
if(length(x@sn$sn) != 0){
sn <- x@sn$sn[,-c(1:(ncol(x@snp.meta) - 1))]
sn <- sn[snps, samps]
sn <- cbind(dat@snp.meta[,-ncol(dat@snp.meta)], sn)
dat@sn <- list(type = x@sn$type, sn = sn)
}
warning("Since samples were subset, any stats will need to be recalculated.\n")
return(dat)
}
else{
if(!is.null(x@sn$sn)){
sn <- x@sn$sn[,-c(1:(ncol(x@snp.meta) - 1))]
sn <- sn[snps,]
sn <- cbind(snp.meta(x)[snps,-ncol(x@snp.meta)], sn)
sn <- list(type = x@sn$type, sn = sn)
}
else{
sn <- list()
}
# change snps and samps to snp and sample IDs
keep.rows <- snps
snps <- snp.meta(x)$.snp.id[snps]
x <- snpRdata(.Data = genotypes(x)[which(snp.meta(x)$.snp.id %in% snps),, drop = FALSE],
sample.meta = sample.meta(x),
snp.meta = snp.meta(x)[which(snp.meta(x)$.snp.id %in% snps),, drop = FALSE],
facet.meta = x@facet.meta[x@facet.meta$.snp.id %in% snps,, drop = FALSE],
mDat = x@mDat,
snp.form = x@snp.form,
ploidy = .ploidy(x),
bi_allelic = .is.bi_allelic(x),
filters = x@filters,
data.type = ifelse("data.type" %in% methods::slotNames(x), x@data.type, "genotypic"),
geno.tables = list(gs = x@geno.tables$gs[x@facet.meta$.snp.id %in% snps,, drop = FALSE],
as = x@geno.tables$as[x@facet.meta$.snp.id %in% snps,, drop = FALSE],
wm = x@geno.tables$wm[x@facet.meta$.snp.id %in% snps,, drop = FALSE]),
# ac = x@ac[x@facet.meta$.snp.id %in% snps,],
facets = x@facets,
facet.type = x@facet.type,
stats = x@stats[x@stats$.snp.id %in% snps,, drop = FALSE],
pairwise.stats = x@pairwise.stats[x@pairwise.stats$.snp.id %in% snps,, drop = FALSE],
sn = sn,
names = x@names,
row.names = x@row.names[keep.rows])
x <- fix.for.one.snp(x)
warning("Any window stats will need to be recalculated.\n")
return(x)
}
}
#'Filter SNPs in snpRdata objects.
#'
#'\code{filter_snps} filters snpRdata objects to remove SNPs or individuals
#'which fail to pass user defined thresholds for several statistics. Since this
#'function removes all calculated statistics, etc. from the snpRdata object,
#'this should usually be the first step in an analysis. See details for filters.
#'
#'
#'Possible filters: \itemize{ \item{maf, minor allele frequency: }{removes SNPs
#'where the minor allele frequency is too low. Can look for mafs below
#'#'provided either globally or search each population individually.}
#'\item{hf_hets, high observed heterozygosity: }{removes SNPs where the observed
#'heterozygosity is too high.} \item{min_ind, minimum individuals: }{removes
#'SNPs that were genotyped in too few individuals.} \item{min_loci, minimum
#'loci: }{removes individuals sequenced at too few loci.} \item{non_poly,
#'non-polymorphic SNPs: }{removes SNPs that are not polymorphic (not true
#'SNPs).} \item{bi_al, non-biallelic SNPs: }{removes SNPs that have more than
#'two observed alleles. This is mostly an internal argument, since the various
#'snpRdata import options use it automatically to prevent downstream errors in
#'other snpR functions. } \item{remove_garbage: }{Quickly removes any loci or
#'samples that are jointly poorly genotyped prior to other filtering. This can
#'be useful if some individuals or loci are of poor enough quality that they
#'will bias other filters.} \item{LD_prune_sigma: }{Sliding-window based Linkage
#'Disequilibrium filtering to remove non-independant loci}.}
#'
#'@section Filter order:
#'
#'Note that filtering out poorly sequenced individuals creates a possible
#'conflict with the loci filters, since after individuals are removed, some loci
#'may no longer pass filters. For example, if a portion of individuals in one
#'population all carry the only instances of a rare minor allele that still
#'passes the maf threshold, removing those individuals may cause the loci to no
#'longer be polymorphic in the sample.
#'
#'To counter this, the \code{re_run} argument can be used to pass the data
#'through a second filtering step after individuals are removed. By default, the
#'"partial" re-run option is used, which re-runs only the non-polymorphic filter
#'(if it was originally set). The "full" option re-runs all set filters. Note
#'that re-running any of these filters may cause individuals to fail the
#'individual filter after loci removal, and so subsequent tertiary re-running of
#'the individual filters, followed by the loci filters, and so on, could be
#'justified. This is not done automatically here, however, if the
#'\code{inds_first} option is selected to filter poorly sequenced individuals
#'prior to applying loci filters, any re-run option will re-run individuals
#'filters after the loci filters have been applied.
#'
#'Filter order (snps vs samples first) can be controlled with the \code{inds_fist}
#'argument. If individuals are filtered first, poorly sequenced individuals
#'will be re-filtered following locus removal if re-running is requested.
#'
#'@section LD pruning:
#'
#'LD pruning can be conducted using any of the methods described in
#'\code{\link{calc_pairwise_ld}} using the \code{LD_prune_} argument family.
#'Loci are removed greedily: starting from the first locus pair with elevated
#'LD, the more poorly sequenced loci is removed. If each locus is equally well
#'sequenced, the second loci (by position) is removed. If any elevated locus
#'pairs remain, the next pair is addressed the same way until no pairs remain.
#'
#'@section Facet-based filters (within-group filtering):
#'
#'Via the \code{maf_facets} argument, this function can filter by minor allele
#'frequencies in either \emph{all} samples or \emph{each level of a supplied
#'sample specific facet and the entire dataset}. In the latter case, any SNPs
#'that pass the maf filter in \emph{any} facet level are considered to pass the
#'filter. The latter should be used in instances where population sizes are very
#'different, there are \emph{many} populations, and/or allele frequencies are
#'very different between populations and thus common alleles of interest in one
#'population might be otherwise filtered out.
#'
#'The \code{hwe_facets} argument is the inverse of this: loci will be removed if they
#'fail the provided hwe filter in any facet level. In both cases, Facets should
#'be provided as described in \code{\link{Facets_in_snpR}}.
#'
#'LD filtering can likewise be conducted within facets. SNP-level facets dictate
#'the levels within which LD values are calculated (chromosome, etc.),
#'and loci will be removed if they display elevated LD within sample-level facets.
#'
#'
#'@param x snpRdata object.
#'@param maf numeric between 0 and 1 or FALSE, default FALSE. Minimum acceptable
#' minor allele frequency. Either maf or mac filtering are allowed, not both.
#'@param mac integer, between 0 and 1 minus the total number of individuals.
#' Loci with less \emph{or equal to} this many minor alleles will be removed.
#' Either maf or mac filtering are allowed, not both. \code{mac = 1} is
#' therefore a singleton filter.
#'@param mgc integer, between 0 and the total number of individuals/2. Loci
#' where the minor allele is present in less \emph{or equal to} this many
#' individuals will be removed. Either mac or mgc filtering are allowed, not
#' both. \code{mgc = 1} is analagous to a singleton filter, but will also
#' remove loci with one homozygous minor individual.
#'@param hf_hets numeric between 0 and 1 or FALSE, default FALSE. Maximum
#' acceptable heterozygote frequency.
#'@param hwe numeric between 0 and 1 or FALSE, default FALSE. Minimum acceptable
#' HWE p-value.
#'@param hwe_excess_side character, default "both". Options:
#' \itemize{\item{heterozygote:} only loci with a heterozygote excess are
#' removed.
#' \item{homozygote:} only loci with a homozygote excess are removed.
#' \item{both:} loci with either a heterozygote or homozygote excess
#' are removed.}.
#'@param fwe_method character, default "none". Option to use for Family-Wise
#' Error rate correction for HWE filtering. If requested, only SNPs with
#' p-values below the alpha provided to the \code{hwe} argument \emph{after FWE
#' correction} will be removed. See \code{\link[stats]{p.adjust}} for
#' information on method options.
#'@param singletons logical, default FALSE. Depricated, use \code{mac = 1} to
#' remove singletons. If TRUE, removes singletons (loci where there is only a
#' single minor allele). If population sizes are reasonably high, this is more
#' or less redundant if \code{maf} is also set.
#'@param min_ind numeric between 0 and 1 or FALSE, default FALSE. Minimum
#' proportion of individuals in which a loci must be sequenced.
#'@param min_loci numeric between 0 and 1 or FALSE, default FALSE. Minimum
#'proportion of SNPs at which an individual must be genotyped.
#'@param inds_first logical, default FALSE. If TRUE, individuals will be
#' filtered out for missing data if that option is selected prior to loci being
#' filtered. Otherwise loci are filtered first.
#'@param remove_garbage numeric between 0 and 1 or FALSE, default FALSE. Optionally
#' do a filter to remove very poorly sequenced individuals and loci
#' jointly before applying other filters. This can be used to reduce biases
#' caused by very bad loci/individuals prior to full filtering. This number
#' should be lower than either the \code{min_ind} or \code{min_loci} parameters
#' if those arguments are used and should generally be quite permissive to
#' remove only truly bad loci or individuals.
#'@param re_run character or FALSE, default "partial". When individuals are
#' removed via min_ind, it is possible that some SNPs that initially passed
#' filtering steps will now violate some filters. SNP filters can be re-run
#' automatically via several methods: \itemize{ \item{partial: } Re-filters for
#' non-polymorphic loci (non_poly) only, if that filter was requested
#' initially. \item{full: } Re-runs the full filtering scheme (save for
#' min_loci).} Note: if \code{inds_first = TRUE}, all re-run options other than
#' FALSE will re-run the individual filter for missing data again after loci
#' filtering.
#'@param maf_facets character or NULL, default NULL. Defines a sample facet over
#' which the minor allele frequency can be checked. SNPs will only fail the maf
#' filter if they fail in every level of every provided facet.
#'@param hwe_facets character or NULL, default NULL. Defines a sample facet over
#' which the hwe filter can be checked. SNPs will fail the hwe filter if they
#' fail in any level of any provided facet.
#'@param non_poly logical, default TRUE. If TRUE, non-polymorphic loci will be
#' removed.
#'@param bi_al logical, default TRUE. If TRUE, loci with more than two alleles
#' will be removed. Note that this is mostly an internal argument and should
#' rarely be used directly, since import.snpR.data and other snpRdata object
#' creation functions all pass SNPs through this filter because many snpR
#' functions will fail to work if there are more than two alleles at a locus.
#'@param LD_prune_sigma numeric or FALSE, default FALSE. If LD pruning, the
#' window size in kb across which to consider LD loci pairs. Windows will be
#' equal to two times \code{LD_prune_sigma}.
#'@param LD_prune_step numeric, default equal to \code{LD_prune_sigma}. Step
#' size between LD windows, in kb. The default ensures that each SNP is in
#' exactly two windows, allowing for smoother filtering.
#'@param LD_prune_r numeric, default FALSE. The LD filter cutt-off value. Pairs
#' above this value will be greedily pruned. Must be between 0 and 1 if LD
#' prunning is conducted.
#'@param LD_prune_facet character, default NULL. Defines a facet over which the
#' minor allele frequency can be checked. Windows will be calculated within any
#' SNP parts of the facet, LD values will be calculated only with levels of any
#' sample parts. For example 'chr.pop' would calculate LD values across
#' windows, then filter SNPs above \code{LD_prune_r} in any populations.
#'@param LD_prune_par numeric, default FALSE. If numeric, LD calculation will be
#' conducted in parallel using \code{LD_prune_par} threads.
#'@param LD_prune_method character, default "CLD". The method to use for LD
#' calculations and prunning. The options are:
#' \itemize{\item{CLD}\item{Dprime}\item{rsq}}. See
#' \code{\link{calc_pairwise_ld}} for details.
#'@param LD_prune_use_ME logical, default FALSE. If TRUE, uses
#' Minimization-Expectation to do LD calculations for all
#' \code{LD_prune_method} options other than "CLD". This can be very slow. See
#' \code{\link{calc_pairwise_ld}} for details.
#'@param LD_prune_ME_sigma numeric, default 0.0001. The cutt-off value for difference
#' in haplotype frequencies to use when conducting ME. See
#' \code{\link{calc_pairwise_ld}} for details.
#'@param verbose Logical, default TRUE. If TRUE, some progress updates and
#' filtering notes will be printed to the console.
#'
#'@return A data.frame in the same format as the input, with SNPs and
#' individuals not passing the filters removed.
#'
#'@export
#'@author William Hemstrom
#'
#' @examples
#' # Filter with a minor allele frequency of 0.05, maximum heterozygote
#' # frequency of 0.55, 50% minimum individuals, and at least 75% of loci
#' # sequenced per individual.
#' filter_snps(stickSNPs, maf = 0.05, hf_hets = 0.55,
#' min_ind = 0.5, min_loci = 0.75)
#'
#' # The same filters, but with minor allele frequency considered per-population
#' # and a full re-run of loci filters after individual removal.
#' filter_snps(stickSNPs, maf = 0.05, hf_hets = 0.55, min_ind = 0.5,
#' min_loci = 0.75, re_run = "full", maf_facets = "pop")
#'
filter_snps <- function(x, maf = FALSE,
mac = 0,
mgc = 0,
hf_hets = FALSE,
hwe = FALSE, fwe_method = "none",
hwe_excess_side = "both",
singletons = FALSE,
min_ind = FALSE,
min_loci = FALSE,
inds_first = FALSE,
remove_garbage = FALSE,
re_run = "partial",
maf_facets = NULL,
hwe_facets = NULL,
non_poly = TRUE,
bi_al = TRUE,
LD_prune_sigma = FALSE,
LD_prune_step = LD_prune_sigma,
LD_prune_r = FALSE,
LD_prune_method = "CLD",
LD_prune_facet = NULL,
LD_prune_par = FALSE,
LD_prune_use_ME = FALSE,
LD_prune_ME_sigma = 0.0001,
verbose = TRUE){
#==============do sanity checks====================
if(singletons){
warning("The singletons argument is depriceated. Please use mac = 1 instead!")
if(mac == 0){
mac <- 1
}
}
if(maf){
if(!is.numeric(maf)){
stop("maf must be a numeric value.")
}
if(length(maf) != 1){
stop("maf must be a numeric vector of length 1.")
}
if(mac != 0){
stop("mac and maf cannot both be set.\n")
}
if(maf < 0 | maf >= .5){
stop("maf must be greater than or equal to 0 and less than .5")
}
if(!is.null(maf_facets) & !isFALSE(maf_facets)){
if(length(maf_facets) == 1 & maf_facets[1] == ".base"){
maf_facets <- NULL
}
else{
maf_facets <- .check.snpR.facet.request(x, maf_facets, "none")
# add any needed facets...
miss.facets <- maf_facets[which(!(maf_facets %in% x@facets))]
if(length(miss.facets) != 0){
if(verbose){cat("Adding missing facets...\n")}
# need to fix any multivariate facets (those with a .)
x <- .add.facets.snpR.data(x, miss.facets)
}
# check for bad facets to remove (those that don't just consider samples)
if(any(!x@facet.type[x@facets %in% maf_facets] %in% c("sample", ".base"))){
vio.facets <- x@facets[match(maf_facets, x@facets)]
vio.facets <- vio.facets[which(x@facet.type[x@facets %in% maf_facets] != "sample")]
warning(paste0("Facets over which to maf.filter must be sample specific facets, not snp specific facets! Removing non-sample facets: \n", paste0(vio.facets, collapse = " "), ".\n"))
maf_facets <- maf_facets[-which(maf_facets %in% vio.facets)]
if(length(maf_facets) == 0){maf_facets <- NULL}
}
}
}
}
if(mac){
if(mgc){
stop("mac and mgc cannot both be set.")
}
if(!is.numeric(mac)){
stop("mac must be a numeric value.")
}
if(length(mac) != 1){
stop("mac must be a numeric vector of length 1.")
}
if(mac >= nsamps(x) | mac < 0){
stop("mac must be greater than or equal to zero and less than the number of samples (a maf of 0.5 in fully sequenced loci).")
}
if(mac != as.integer(mac)){
stop("mac must be an integer (but can be a numeric object).")
}
}
if(mgc){
if(!is.numeric(mgc)){
stop("mgc must be a numeric value.")
}
if(length(mgc) != 1){
stop("mgc must be a numeric vector of length 1.")
}
if(mgc >= nsamps(x)/2 | mgc < 0){
stop("mgc must be greater than or equal to zero and less than half the number of samples (a maf of 0.5 in fully sequenced loci).")
}
if(mgc != as.integer(mgc)){
stop("mgc must be an integer (but can be a numeric object).")
}
}
if(hwe){
if(!is.numeric(hwe)){
stop("hwe must be a numeric value.")
}
if(length(hwe) != 1){
stop("hwe must be a numeric vector of length 1.")
}
if(hwe <= 0 | hwe >= 1){
stop("hwe must be a value between 0 and 1.")
}
good_hwe_sides <- c("both", "homozygote", "heterozygote")
if(!hwe_excess_side %in% good_hwe_sides){
stop("hwe_excess_sides must be one of: ", paste0(good_hwe_sides, collapse = ", "))
}
if(!is.null(hwe_facets) & !isFALSE(hwe_facets)){
if(length(hwe_facets) == 1 & hwe_facets[1] == ".base"){
hwe_facets <- NULL
}
else{
hwe_facets <- .check.snpR.facet.request(x, hwe_facets)
}
}
}
if(hf_hets){
if(!is.numeric(hf_hets)){
stop("hf_hets must be a numeric value.")
}
if(length(hf_hets) != 1){
stop("hf_hets must be a numeric vector of length 1.")
}
if(hf_hets <= 0 | hf_hets >= 1){
stop("hf_hets must be a value between 0 and 1.")
}
}
if(min_ind){
if(!is.numeric(min_ind)){
stop("min_ind must be a numeric value.")
}
if(length(min_ind) != 1){
stop("min_ind must be a numeric vector of length 1.")
}
if(min_ind > 1 | min_ind < 0){
stop("min_ind is the minimum proportion of sequenced individuals, and so must be between 0 and 1.\n")
}
}
if(min_loci){
if(!is.numeric(min_loci) | (min_loci <= 0 | min_loci >= 1) | length(min_loci) != 1){
stop("min_loci must be a numeric value between but not equal to 0 and 1.")
}
}
if(is.numeric(LD_prune_sigma)){
good.methods <- c("CLD", "Dprime", "rsq")
if(!LD_prune_method %in% good.methods){
stop("LD_prune_method not recognized. Options:", paste0(good.methods, collapse = ", "), "\n")
}
if(!is.numeric(LD_prune_r)){
stop("LD_prune_r must be a numeric value between 0 and 1\n")
}
if(LD_prune_r > 1 | LD_prune_r <= 0){
stop("LD_prune_r must be a numeric value between 0 and 1\n")
}
if(!is.numeric(LD_prune_step)){
stop("LD_prune_step must be a numeric value (step length in kb).\n")
}
if(LD_prune_step >= 2*LD_prune_step){
warning("Non-overlapping windows may result in odd pruning at window edges--did you mean to have a higher sigma or lower step size?\n")
}
LD_prune_step <- eval(LD_prune_step) # force eval now
LD_prune_facet <- .check.snpR.facet.request(x, LD_prune_facet, remove.type = "none", fill_with_base = TRUE, return_base_when_empty = TRUE)
if(length(LD_prune_facet) != 1){
stop("Only one LD_prune_facet can be provided.\n")
}
LDf_snp <- .check.snpR.facet.request(x, LD_prune_facet, "sample")
LDf_samp <- .check.snpR.facet.request(x, LD_prune_facet, "snp")
if(!isFALSE(LD_prune_par)){
LD_prune_par <- .par_checker(LD_prune_par)
}
# add any needed facets...
miss.facets <- LDf_samp[which(!(LDf_samp %in% x@facets))]
if(length(miss.facets) != 0){
if(verbose){cat("Adding missing LD facets...\n")}
# need to fix any multivariate facets (those with a .)
x <- .add.facets.snpR.data(x, miss.facets)
}
}
if(!isFALSE(remove_garbage)){
if(!is.numeric(remove_garbage)){
stop("remove_garbage must be a numeric value.")
}
if(length(remove_garbage) != 1){
stop("remove_garbage must be a numeric vector of length 1.")
}
if(remove_garbage > 1 | remove_garbage < 0){
stop("remove_garbage is a minimum proportion of sequenced individuals/loci, and so must be between 0 and 1.\n")
}
if(is.numeric(min_loci)){
if(min_loci <= remove_garbage){
stop("The min_loci threshold should be higher than the garbage removal threshold if supplied.\n")
}
}
if(is.numeric(min_ind)){
if(min_ind <= remove_garbage){
stop("The min_ind threshold should be higher than the garbage removal threshold if supplied.\n")
}
}
}
if(re_run != FALSE){
if(re_run != "partial" & re_run != "full"){
if(verbose){cat("re_run must be set to partial or full if not FALSE.\n")}
}
}
#==============set up, get values used later, clean up data a bit, define subfunctions==========
if(verbose){cat("Initializing...\n")}
#get headers
headers <- x@snp.meta
snp_form <- x@snp.form
mDat <- x@mDat
#function to filter by loci, to be called before and after min ind filtering (if that is requested.)
filt_by_loci <- function(){
# Store filter status in vio.snps. Those that are violating a filter will be marked TRUE, remove these.
#==========================run filters: bi_allelic/non_poly========================
vio.snps <- logical(nrow(x)) #vector to track status
amat <- x@geno.tables$as[x@facet.meta$facet == ".base",,drop = FALSE]
gmat <- x@geno.tables$gs[x@facet.meta$facet == ".base",,drop = FALSE]
wmat <- x@geno.tables$wm[x@facet.meta$facet == ".base",,drop = FALSE]
# non-biallelic and non-polymorphic loci
if(bi_al | non_poly){
bimat <- ifelse(amat, TRUE, FALSE)
if(bi_al){
if(verbose){cat("Filtering non-biallelic loci...\n")}
bi <- ifelse(rowSums(bimat) > 2, T, F) # if false, should keep the allele
if(verbose){cat(paste0("\t", sum(bi), " bad loci\n"))}
vio.snps[which(bi)] <- T
x <- .update_filters(x, "bi-allelic", NA, NA)
}
if(non_poly){
if(verbose){cat("Filtering non_polymorphic loci...\n")}
np <- ifelse(rowSums(bimat) < 2, T, F) # if false, should keep the allele
if(verbose){cat(paste0("\t", sum(np), " bad loci\n"))}
vio.snps[which(np)] <- T
x <- .update_filters(x, "non-polymorphic", NA, NA)
}
}
#========min inds=======
if(min_ind){
if(verbose){cat("Filtering loci sequenced in few individuals...\n")}
mi <- wmat[,colnames(wmat) == mDat]
mi <- (nrow(x@sample.meta) - mi)/nrow(x@sample.meta) < min_ind
vio.snps[which(mi)] <- T
if(verbose){cat(paste0("\t", sum(mi), " bad loci\n"))}
x <- .update_filters(x, "poorly sequenced loci--min_ind", min_ind, ".base")
}
#========minor allele frequency, both total and by pop. Should only run if bi_al = TRUE.=========
if(maf){
#if not filtering with multiple pops
if(is.null(maf_facets) | isFALSE(maf_facets)){
if(verbose){cat("Filtering low minor allele frequencies, no pops...\n")}
# check to see if we need to calculate mafs:
if(any(colnames(x@stats) == "maf")){ # check that mafs have been calculated, the all facet must exist
if(any(is.na(x@stats$maf[x@stats$facet == ".base"]))){ # check that mafs have been calculated for the all facet
mafs <- 1 - matrixStats::rowMaxs(amat)/rowSums(amat)
}
else{
mafs <- x@stats$maf[x@stats$facet == ".base"]
}
}
else{
mafs <- 1 - matrixStats::rowMaxs(amat)/rowSums(amat)
}
mafs <- mafs < maf #less than required, set to true and reject.
mafs[is.na(mafs)] <- TRUE
if(verbose){cat(paste0("\t", sum(mafs), " bad loci\n"))}
vio.snps[which(mafs)] <- T
}
else{
if(verbose){cat("Filtering low minor allele frequencies by facet.\n")}
# pmafs <- logical(nrow(x))
# see if we need to calculate mafs
if(any(colnames(x@stats) == "maf")){ # mafs have been calculated
# get mafs for any uncalculated facets
# if mafs have been calculated, but not for of the requested any facets...
if(!any(x@stats$facet %in% maf_facets)){
x <- calc_maf(x, facets = maf_facets)
}
#if mafs have been calculated for some but not all of our facets
else if(any(is.na(x@stats$maf[x@stats$facet %in% maf_facets]))){
run.facets <- unique(x@stats$facet[which(is.na(x@stats$maf[x@stats$facet %in% maf_facets]))])
x <- calc_maf(x, facets = run.facets)
}
}
else{
x <- calc_maf(x, facets = maf_facets)
}
# grab mafs
mafs <- x@stats[x@stats$facet %in% maf_facets,]
mafs$maf <- mafs$maf < maf
# now, figure out in how many subfacets the maf is too low. If all, the loci violates the filter
cmafs <- reshape2::dcast(mafs[,c(".snp.id", "maf")], fun.aggregate = sum,
formula = ... ~ "maf", value.var = "maf")
# add in the overall maf, since differential fixation would otherwise be removed.
if(any(is.na(x@stats$maf[x@stats$facet == ".base"]))){ # check that mafs have been calculated for the all facet
a.mafs <- 1 - matrixStats::rowMaxs(amat)/rowSums(amat)
}
else{
a.mafs <- x@stats$maf[x@stats$facet == ".base"]
}
a.mafs <- a.mafs < maf
cmafs$maf <- cmafs$maf + a.mafs
# check vio and report
maf.vio <- which(cmafs$maf == (1 + length(unique(mafs$subfacet))))
if(verbose){cat(paste0("\t", length(maf.vio), " bad loci\n"))}
vio.snps[maf.vio] <- T
}
# update note-keeping
x <- .update_filters(x, "Minor Allele Frequency--maf", maf,
ifelse(is.null(maf_facets) | isFALSE(maf_facets), ".base", maf_facets))
}
#========mac/mgc========================================
if(mac | mgc){
if(mgc){
# in this case, this is the count of *individuals with the minor allele*
if(verbose){cat("Filtering low minor genotype counts.\n")}
hs <- which(substr(colnames(gmat), 1, snp_form/2) != substr(colnames(gmat), (snp_form/2) + 1, snp_form))
het_c <- matrixStats::rowSums2(gmat[,hs,drop = FALSE])
min_g_c <- matrixStats::rowSums2(gmat[,-hs,drop = FALSE]) - matrixStats::rowMaxs(gmat[,-hs,drop = FALSE])
tgac <- het_c + min_g_c
mgc.vio <- which(tgac <= mgc)
if(verbose){cat(paste0("\t", length(mgc.vio), " bad loci\n"))}
vio.snps[mgc.vio] <- T
x <- .update_filters(x, "minor genotype count--mgc", mgc, ".base")
}
else{
if(verbose){cat("Filtering low minor genotype counts.\n")}
# singletons exist wherever the total allele count - the maf count is 1.
mac.vio <- which(matrixStats::rowSums2(amat) - matrixStats::rowMaxs(amat) <= mac)
if(verbose){cat(paste0("\t", length(mac.vio), " bad loci\n"))}
vio.snps[mac.vio] <- T
x <- .update_filters(x, "minor allele count--mac", mac, ".base")
}
}
#========hf_hets. Should only run if bi_al = TRUE.==========
if(hf_hets){
if(verbose){cat("Filtering high frequency heterozygote loci...\n")}
# get heterozygote frequency
hs <- which(substr(colnames(gmat), 1, snp_form/2) != substr(colnames(gmat), (snp_form/2) + 1, snp_form))
het_f <- rowSums(gmat[,hs])/rowSums(gmat)
# check violation
het_f <- het_f > hf_hets #if false, heterozygote frequency is lower than cut-off, keep locus
if(verbose){cat(paste0("\t", sum(het_f), " bad loci\n"))}
vio.snps[which(het_f)] <- T
x <- .update_filters(x, "high-frequency heterozygotes--hf_hets", hf_hets, ".base")
}
#========hwe violation======================================
if(hwe){
if(verbose){cat("Filtering loci out of hwe...\n")}
# no facets, easy
if(is.null(hwe_facets) | isFALSE(hwe_facets)){
if(!.check_calced_stats(x, ".base", "hwe")$.base){
.make_it_quiet(x <- calc_hwe(x))
}
if(hwe_excess_side != "both"){
calced <- .check_calced_stats(x,".base", c("fis"))$.base
if(!calced){
.make_it_quiet(x <- calc_fis(x))
}
}
phwe <- x@stats$pHWE[x@stats$facet == ".base"]
if(fwe_method != "none"){
phwe <- stats::p.adjust(phwe, method = fwe_method[1])
}
phwe <- phwe <= hwe
# check sides
# consider sides
if(hwe_excess_side == "both"){
side <- 1
}
else if(hwe_excess_side == "heterozygote"){
side <- ifelse(x@stats$fis[x@stats$facet == ".base"] <= 0, 1, 0)
}
else{
side <- ifelse(x@stats$fis[x@stats$facet == ".base"] >= 0, 1, 0)
}
if(any(is.na(side))){
side[is.na(side)] <- 1
}
# recheck low.p
phwe <- phwe * side
phwe <- which(phwe == 1)
if(verbose){cat("\t", length(phwe), " bad loci\n")}
vio.snps[phwe] <- T
}
# facets, slightly more complicated
else{
# run again to ensure that the correct fwe method and case are used
.make_it_quiet(x <- calc_hwe(x, hwe_facets, fwe_method = fwe_method[1], fwe_case = "by_facet"))
if(hwe_excess_side != "both"){
calced <- .check_calced_stats(x, hwe_facets, c("fis"))
if(any(!unlist(calced))){
.make_it_quiet(x <- calc_fis(x, names(calced)[!unlist(calced)]))
}
}
# get the per-facet hwe stats, check against threshold, then condense by snp.
phwe <- get.snpR.stats(x, hwe_facets)
if(fwe_method != "none"){
phwe$low.p <- ifelse(phwe[,paste0("pHWE_byfacet_", fwe_method[1])] <= hwe, 1, 0)
}
else{
phwe$low.p <- ifelse(phwe$pHWE <= hwe, 1, 0)
}
# consider sides
if(hwe_excess_side == "both"){
phwe$side <- 1
}
else if(hwe_excess_side == "heterozygote"){
phwe$side <- ifelse(phwe$fis <= 0, 1, 0)
}
else{
phwe$side <- ifelse(phwe$fis >= 0, 1, 0)
}
if(any(is.na(phwe$side))){
phwe$side[is.na(phwe$side)] <- 1
}
# recheck low.p
phwe$low.p <- phwe$low.p * phwe$side
bad.loci <- tapply(phwe$low.p, phwe[,".snp.id"], sum, na.rm = TRUE)
bad.loci <- reshape2::melt(bad.loci)
bad.loci <- stats::na.omit(bad.loci)
bad.loci <- bad.loci[which(bad.loci$value > 0),]
bad.loci <- which(snp.meta(x)$.snp.id %in% bad.loci$Var1)
if(verbose){cat("\t", length(bad.loci), " bad loci\n")}
vio.snps[bad.loci] <- TRUE
}
x <- .update_filters(x, "Hardy-Weinburg Proportions--hwe",
paste0(hwe, ", excess side = ", hwe_excess_side),
ifelse(is.null(hwe_facets) | isFALSE(hwe_facets), ".base", hwe_facets))
}
#==========remove violating loci==================
if(any(vio.snps)){
if(sum(vio.snps) == nrow(x)){
stop("No loci passed filters.\n")
}
x <- x[-which(vio.snps),]
}
#=========do LD filtering--this happens after previous filters for compute time=========
if(is.numeric(LD_prune_sigma)){
if(verbose){cat("Filtering loci with high LD...\n")}
# get LD values
LD <- calc_pairwise_ld(x, facets = LD_prune_facet, par = LD_prune_par,
window_sigma = LD_prune_sigma,
window_gaussian = FALSE,
window_step = LD_prune_step,
window_triple_sigma = FALSE,
verbose = FALSE, CLD = ifelse(LD_prune_method == "CLD", "only", FALSE),
use.ME = LD_prune_use_ME,
sigma = LD_prune_ME_sigma,
.prox_only = TRUE)
if(nrow(LD) == 0){
warning("No valid LD loci pairs detected--often this means that window sizes are too small to have multiple SNPs in any single window.\n")
}
else{
# choose loci to remove greedily:
# 1) find loci with values above the cuttoff
# 2) for each pair, remove the locus with less genotypes (or the randomly otherwise)
# 3) repeat until nothing above the cuttoff
tar_col <- which(colnames(LD) == LD_prune_method)
LD <- LD[which(LD[[tar_col]] > LD_prune_r),]
LD <- unique(LD)
vio.snps <- numeric()
# add on genotyping counts
if(nrow(LD) > 0){
base_lev_rows <- which(x@facet.meta$facet == ".base" & x@facet.meta$subfacet == ".base")
LD <- merge(LD, as.data.table(cbind(.snp.id = x@facet.meta[base_lev_rows, ".snp.id"], .count = matrixStats::rowSums2(x@geno.tables$gs[base_lev_rows,]))),
by.x = "s1_.snp.id",
by.y = ".snp.id",
all.x = TRUE)
LD <- merge(LD, as.data.table(cbind(.snp.id = x@facet.meta[base_lev_rows, ".snp.id"], .count = matrixStats::rowSums2(x@geno.tables$gs[base_lev_rows,]))),
by.x = "s2_.snp.id",
by.y = ".snp.id",
all.x = TRUE)
}
# loop through rejects, removing all offending pairs untill we are done
while(nrow(LD) > 0){
trm <- ifelse(LD$.count.x[1] < LD$.count.y[1], LD$s1_.snp.id[1], LD$s2_.snp.id[1])
vio.snps <- c(vio.snps, trm)
rmr <- which(LD$s1_.snp.id == trm | LD$s2_.snp.id == trm)
LD <- LD[-rmr,,drop = FALSE]
}
# report and subset
if(verbose){cat(paste0("\t", length(vio.snps), " bad loci\n"))}
vio.snps <- match(vio.snps, snp.meta(x)$.snp.id)
if(length(vio.snps) == nrow(x)){
stop("No loci passed filters.\n")
}
x <- x[-vio.snps,]
x <- .update_filters(x, "LD pruning",
paste0("r=", LD_prune_r,
" window_sigma=", LD_prune_sigma,
" window_step=", LD_prune_step,
" LD_method=", LD_prune_method,
ifelse(LD_prune_use_ME, paste0(" ME_sigma=", LD_prune_ME_sigma), "")),
LD_prune_facet)
}
}
return(x)
}
#funciton to filter by individuals.
min_loci_filt <- function(){
if(verbose){cat("Filtering out individuals sequenced in few kept loci...\n")}
mcounts <- matrixStats::colSums2(ifelse(x != mDat, 1, 0))
rejects <- which(mcounts/nrow(x) < min_loci)
if(length(rejects) > 0){
if(length(rejects) == ncol(x)){
stop("No individuals passed filters.\n")
}
old.facets <- x@facets
x <- x[,-rejects]
if(verbose){cat("Re-calculating and adding facets.\n")}
if(any(old.facets != ".base")){
x <- .add.facets.snpR.data(x, old.facets[-which(old.facets == ".base")])
}
}
x <- .update_filters(x, "poorly sequenced individuals--min_loci", min_loci, NA)
return(list(x = x, rejects = rejects))
}
#==========================call the functions as requested.==================
if(is.numeric(remove_garbage)){
if(verbose){cat("Removing garbage individuals/loci.\n")}
mcounts <- matrixStats::colSums2(ifelse(x != mDat, 1, 0))
rejects <- which(mcounts/nrow(x) < remove_garbage)
mi <- x@geno.tables$wm[,colnames(x@geno.tables$wm) == mDat] # for backwards compat
mi <- which((nrow(x@sample.meta) - mi)/nrow(x@sample.meta) < remove_garbage)
if(length(rejects) > 0){
if(length(mi) > 0){
.make_it_quiet(x <- x[-mi,-rejects])
}
else{
.make_it_quiet(x <- x[,-rejects])
}
}
else if(length(mi) > 0){
.make_it_quiet(x <- x[-mi,])
}
if(verbose){cat("\tRemoved", length(mi), "bad loci.\n\tRemoved", length(rejects), "bad individuals.\n")}
if(nrow(x) == 0){
stop("No SNPs passed garbage removal.\n")
}
if(ncol(x) == 0){
stop("No loci passed garbage removal.\n")
}
rm(mi, mcounts, rejects)
x <- .update_filters(x, "poorly sequenced individuals and loci jointly--remove_garbage",
remove_garbage, NA)
}
if(!inds_first){
if(any(c(non_poly, bi_al, maf, hf_hets, min_ind) != FALSE)){
if(verbose){cat("Filtering loci. Starting loci:", nrow(x), "\n")}
# run the filter
x <- filt_by_loci()
if(nrow(x) == 0 | is.null(nrow(x))){
stop("No loci remain after filters.")
}
if(verbose){cat("\tEnding loci:", nrow(x), "\n")}
}
}
# run the minimum sequenced loci filter
if(min_loci){
if(verbose){cat("Filtering individuals. Starting individuals:", ncol(x), "\n")}
x <- min_loci_filt()
if(length(x$rejects) == 0){
if(verbose){cat("No individuals removed.\n")}
x <- x$x
}
else{
x <- x$x
if(verbose){cat("\tEnding individuals:", ncol(x), "\n")}
}
}
if(inds_first){
if(any(c(non_poly, bi_al, maf, hf_hets, min_ind) != FALSE)){
if(verbose){cat("Filtering loci. Starting loci:", nrow(x), "\n")}
# run the filter
x <- filt_by_loci()
if(nrow(x) == 0 | is.null(nrow(x))){
stop("No loci remain after filters.")
}
if(verbose){cat("\tEnding loci:", nrow(x), "\n")}
}
}
if(re_run != FALSE){
if(!inds_first){
if(verbose){cat("Re-filtering loci...\n")}
if(re_run == "partial"){
maf <- FALSE
hf_hets <- FALSE
min_ind <- FALSE
bi_al <- FALSE
hwe <- FALSE
mac <- 0
singletons <- FALSE
LD_prune_sigma <- FALSE
}
if(any(c(non_poly, bi_al, maf, hf_hets, min_ind) != FALSE)){
x <- filt_by_loci() # re-filter loci to make sure that we don't have any surprise non-polys etc.
if(verbose){cat("\tFinal loci count:", nrow(x), "\n")}
}
else{
if(verbose){cat("\tNo variables to re-fitler.\n")}
}
}
else{
if(verbose){cat("Re-Filtering individuals. Starting individuals:", ncol(x), "\n")}
x <- min_loci_filt()
if(length(x$rejects) == 0){
if(verbose){cat("No individuals removed.\n")}
x <- x$x
}
else{
x <- x$x
if(verbose){cat("\tEnding individuals:", ncol(x), "\n")}
}
}
}
# return results
return(x)
}
#'Re-format SNP data.
#'
#'\code{format_snps} re-formats SNP data into a range of different possible
#'formats for use in snpR functions and elsewhere.
#'
#'While this function can accept a few non-snpRdata input formats, it will
#'reformat to a snpRdata object internally. As such, it takes a facets argument
#'that works identically to elsewhere in the package, as described in
#'\code{\link{Facets_in_snpR}}. This argument is only used for output formats
#'where facets are important, such as the genepop format.
#'
#'While this function can be used as an alternative to
#'\code{\link{import.snpR.data}} when the output argument is set to "snpRdata",
#'this is more complicated and not recommended. Instead, it is simpler
#'\code{\link{import.snpR.data}} or one of the wrappers in
#'\code{\link{snpR_import_wrappers}}. The option is kept for backwards
#'compatibility and internal use.
#'
#'If non-snpRdata is supplied, SNP and sample metadata may be provided. SNP
#'metadata may either be provided in the first few columns of x, the number of
#'which is designated by the input_meta_columns argument, or in a data.frame
#'given as via the snp.meta argument. Sample metadata may be provided in a
#'data.frame via the sample.meta argument.
#'
#'Output format options: \itemize{ \item{ac: }{allele count format, allele
#'counts tabulated for all samples or within populations.} \item{genepop:
#'}{genepop format, genotypes stored as four numeric characters (e.g. "0101",
#'"0204"), transposed, and formatted for genepop. Rownames are individual IDs in
#'genepop format, colnames are SNP ids, matching the first metadata column in
#'input.} \item{structure: }{STRUCTURE format, two lines per individual: allele
#'calls stored as single character numeric (e.g. "1", "2"). Allele calls per
#'individual stored on two subsequent lines.} \item{0000: }{numeric genotype tab
#'format, genotypes stored as four numeric characters (e.g. "0101", "0204").}
#'\item{hapmap: }{Migrate-n hapmap, allele counts tabulated within populations,
#'in migrate-n hapmap format. Since this migrate-n implementation is iffy, this
#'probably shouldn't be used much.} \item{NN: }{character genotype tab format,
#'genotypes stored as actual base calls (e.g. "AA", "CT").} \item{pa: }{allele
#'presence/absence format, presence or absence of each possible allele at each
#'possible genotype noted. Interpolation possible, with missing data substituted
#'with allele frequency in all samples or each population.} \item{rafm: }{RAFM
#'format, two allele calls at each locus stored in subsequent columns, e.g.
#'locus1.1 locus1.2.} \item{faststructure: }{fastSTRUCTURE format, identical to
#'STRUCTURE format save with the addition of filler columns proceeding data such
#'that exactly 6 columns proceed data. These columns can be filled with metadata
#'if desired.} \item{dadi: }{dadi format SNP data format, requires two columns
#'named "ref" and "anc" with the flanking bases around the SNP, e.g. "ACT" where
#'the middle location is the A/C snp.} \item{plink: }{PLINK! binary input
#'format, requires columns named "position" and one matching the name designated
#'with the 'chr' argument, and may contain a column named "cM", "cm", or
#'"morgans", containing linkage group/chr, snp ID, position in bp, and distance
#'in cM in order to create .bim extended map file.} \item{sn: }{Single character
#'numeric format. Each genotype will be listed as 0, 1, or 2, corresponding to
#'0, 1, or 2 minor alleles. Can be interpolated to remove missing data with the
#''interpolate' argument.} \item{sequoia: }{sequoia format. Each genotype is
#'converted to 0/1/2/ or -9 (for missing values). Requires columns ID, Sex,
#'BirthYear (or instead of BirthYear - BYmin and BYmax), optional column
#'Yearlast in sample metadata for running Sequoia. For more information see
#'sequoia documentation.} \item{fasta:} {fasta sequence format.} \item{vcf:}
#'{Variant Call Format, a standard format for SNPs and other genomic variants.
#'Genotypes are coded as 0/0, 0/1, 1/1, or ./. (for missing values), with a
#'healthy serving of additional metadata but very little sample metadata.}
#'\item{genalex: }{GenAlEx format. If an outfile is requested, the data will be
#'sorted according to any provided facets and written as an '.xlsx' object.}
#'\item{snpRdata: }{a snpRdata object.} }
#'
#'Note that for the "sn" format, the data can be interpolated to fill missing
#'data points, which is useful for PCA, genomic prediction, tSNE, and other
#'methods. To do so, specify interpolate = "af" to insert the expected number of
#'minor alleles given SNP allele frequency or "bernoulli" to do binomial draws
#'to determine the number of minor alleles at each missing data point, where the
#'probability of drawing a minor allele is equal to the minor allele frequency.
#'The expected number of minor alleles based on the later method is equal to the
#'interpolated value from the former, but the later allows for multiple runs to
#'determine the impact of stochastic draws and is generally preferred and
#'required for some downstream analysis. It is therefore the default. As a
#'slower but more accurate alternative to "af" interpolation, "iPCA" may be
#'selected. This an iterative PCA approach to interpolate based on SNP/SNP
#'covariance via \code{\link[missMDA]{imputePCA}}. If the ncp argument is not
#'defined, the number of components used for interpolation will be estimated
#'using \code{\link[missMDA]{estim_ncpPCA}}. In this case, this method is much
#'slower than the other methods, especially for large datasets. Setting an ncp
#'of 2-5 generally results in reasonable interpolations without the time
#'constraint.
#'
#'Note also that for the plink format, a .bed binary file can be generated. If
#'the "plink" option is selected and an outfile is designated, R will generate a
#'".sh" shell file with the same name given in the outfile argument. Running
#'this file will create a plink.bed file.
#'
#'Input formats: \itemize{ \item{NULL or snpRdata: }{snpRdata object, the
#'default.} \item{NN: }{SNP genotypes stored as actual base calls (e.g. "AA",
#'"CT").} \item{0000: }{SNP genotypes stored as four numeric characters (e.g.
#'"0101", "0204").} \item{snp_tab: }{SNP genotypes stored with genotypes in each
#'cell, but only a single nucleotide noted if homozygote and two nucleotides
#'seperated by a space if heterozygote (e.g. "T", "T G").} \item{sn: }{SNP
#'genotypes stored with genotypes in each cell as 0 (homozyogous allele 1), 1
#'(heterozygous), or 2 (homozyogus allele 2).} \item{ms: }{.ms file, as output
#'from the simulation program ms.}}
#'
#'
#'
#'@param x snpRdata object or data.frame. Input data, in any of the above listed
#' input formats.
#'@param output Character, default "snpRdata". The desired output format, see
#' description for details.
#'@param facets Character or NULL, default NULL. Facets over which to break up
#' data for some output formats, such as within one file for \code{genepop} or
#' across multiple files for \code{vcf}, following the format described in
#' \code{\link{Facets_in_snpR}}.
#'@param n_samp Integer or numeric vector, default NA. For structure or RAFM
#' outputs. How many random loci should be selected? Can either be an integer
#' or a numeric vector of loci to use.
#'@param interpolate Character or FALSE, default "bernoulli". If transforming to
#' "sn" or "pa" format, notes the interpolation method to be used to fill
#' missing data. Options are "bernoulli", "af", "iPCA", or FALSE. See details.
#'@param outfile character vector, default FALSE. If a file path is provided, a
#' copy of the output will be saved to that location. For some output styles,
#' such as genepop, additional lines will be added to the output to allow them
#' to be immediately run on commonly used programs.
#'@param ped data.frame default NULL. Optional argument for the "plink" output
#' format. A six column data frame containing Family ID, Individual ID,
#' Paternal ID, Maternal ID, Sex, and Phenotype and one row per sample. If
#' provided, outputs will contain information contained in ped. See plink
#' documentation for more details.
#'@param input_format Character, default NULL. Format of x, by default a
#' snpRdata object. See description for details.
#'@param input_meta_columns Numeric, default NULL. If x is not a snpRdata
#' object, optionally specifies the number of metadata columns preceding
#' genotypes in x. See details for more information.
#'@param input_mDat Character, default "NN". If x is not a snpRdata object, the
#' coding for missing \emph{genotypes} in x (typically "NN" or "0000").
#'@param sample.meta data.frame, default NULL. If x is not a snpRdata object,
#' optionally specifies a data.frame containing meta data for each sample. See
#' details for more information.
#'@param snp.meta data.frame, default NULL. If x is not a snpRdata object,
#' optionally specifies a data.frame containing meta data for each SNP. See
#' details for more information.
#'@param chr.length numeric, default NULL. Chromosome lengths, for ms input
#' files. Note that a single value assumes that each chromosome is of equal
#' length whereas a vector of values gives the length for each chromosome in
#' order.
#'@param ncp numeric or NULL, default 2. Number of components to consider for
#' iPCA sn format interpolations of missing data. If null, the optimum number
#' will be estimated, with the maximum specified by ncp.max. This can be very
#' slow.
#'@param ncp.max numeric, default 5. Maximum number of components to check for
#' when determining the optimum number of components to use when interpolating
#' sn data using the iPCA approach.
#'@param chr character, default "chr". Name of column containing chromosome
#' information, for VCF or plink! output.
#'@param position character, default "position". Name of column containing
#' position information, for VCF output.
#'@param phenotype character, default "phenotype". Optional name of column
#' containing phenotype information, for plink! output.
#'@param plink_recode_numeric Logical, default FALSE. If FALSE, all chrs/scaffs
#' will be renamed to numbers. This may be useful in some cases. If this is
#' FALSE, chromosome names will be checked for leading numbers and replaced
#' with a corresponding letter (0 becomes A, 1 becomes B, and so on).
#'@param verbose Logical, default FALSE. If TRUE, some progress updates will be
#' reported.
#'
#'@return A data.frame or snpRdata object with data in the correct format. May
#' also write a file to the specified path.
#'
#'@export
#'
#'@author William Hemstrom
#'@author Melissa Jones
#'
#' @examples
#' \dontrun{
#' #import data to a snpRdata object
#' ## get sample meta data
#' sample_meta <-
#' data.frame(pop = substr(colnames(stickRAW)[-c(1:3)], 1, 3),
#' fam = rep(c("A", "B", "C", "D"),
#' length = ncol(stickRAW) - 3),
#' stringsAsFactors = FALSE)
#' format_snps(stickRAW, input_format = "0000",
#' input_meta_columns = 3,
#' input_mDat = "0000", sample.meta = sample_meta)
#'
#' #allele count, separated by the pop facet.
#' format_snps(stickSNPs, "ac", facets = "pop")
#'
#' #genepop:
#' format_snps(stickSNPs, "genepop")
#'
#' #STRUCTURE, subsetting out 100 random alleles:
#' format_snps(stickSNPs, "structure", n_samp = 100)
#'
#' #STRUCTURE, subseting out the first 100 alleles:
#' format_snps(stickSNPs, "structure", n_samp = 1:100)
#'
#' #fastSTRUCTURE
#' format_snps(stickSNPs, "faststructure")
#'
#' #numeric:
#' format_snps(stickSNPs, "0000")
#'
#' #hapmap for migrate-n:
#' format_snps(stickSNPs, "hapmap", facets = "pop")
#'
#' #character:
#' format_snps(stickSNPs, "NN")
#'
#' #presence/absence, SNP data:
#' format_snps(stickSNPs, "pa")
#'
#' #RAFM, taking only 100 random snps and seperating by pop
#' format_snps(stickSNPs, "rafm", facets = "pop", n_samp = 100)
#'
#' #dadi
#' ## add ref and anc snp meta data columns to stickSNPs
#' dat <- as.data.frame(stickSNPs)
#' snp.meta(dat) <- cbind(ref = "ATA", anc = "ACT", snp.meta(stickSNPs))
#' format_snps(dat, "dadi", facets = "pop")
#'
#' #PLINK! format, not run to avoid file creation
#' format_snps(stickSNPs, "plink", outfile = "plink_out", chr = "chr")
#'
#'
#' #PLINK! format with provided ped
#' ped <- data.frame(fam = c(rep(1, 210), rep("FAM2", 210)), ind = 1:420,
#' mat = 1:420, pat = 1:420,
#' sex = sample(1:2, 420, replace = TRUE),
#' pheno = sample(1:2, 420, replace = TRUE))
#' format_snps(stickSNPs, "plink", outfile = "plink_out",
#' ped = ped, chr = "chr")
#' #note that a column in the sample metadata containing phenotypic information
#' #can be provided to the "phenotype" argument if wished.
#'
#' #Sequoia format
#' b <- sample.meta(stickSNPs)
#' b$ID <- 1:nrow(b)
#' b$Sex <- rep(c("F", "M", "U"), length.out=nrow(b))
#' b$BirthYear <- round(runif(n = nrow(b), 1,1))
#' a <- stickSNPs
#' b$ID <- paste0(sample.meta(a)$pop, sample.meta(a)$fam, sample.meta(a)$.sample.id)
#' sample.meta(a) <- b
#' format_snps(x = a, output = "sequoia")
#' #note: if using the birth year windows BYmin and BYmax and or Yearlast,
#' ensure that the column names are not stored as BY.min, BY.max, Year.last for
#' snpR.
#'
#' # VCF format
#' test <- format_snps(stickSNPs, "vcf", chr = "chr")
#'
#' # GenAlEx format (write to file to generate a facet-sorted xlsx file)
#' test <- format_snps(stickSNPs, "genalex")
#' }
format_snps <- function(x, output = "snpRdata", facets = NULL, n_samp = NA,
interpolate = "bernoulli", outfile = FALSE,
ped = NULL, input_format = NULL,
input_meta_columns = NULL, input_mDat = NULL,
sample.meta = NULL, snp.meta = NULL, chr.length = NULL,
ncp = 2, ncp.max = 5, chr = "chr", position = "position",
phenotype = "phenotype", plink_recode_numeric = FALSE,
verbose = FALSE){
POS <- NULL
if(!isTRUE(verbose)){
cat <- function(...){}
}
#======================sanity checks================
if(!is.null(input_format)){
if(tolower(input_format) == "snprdata"){input_format <- NULL}
}
# check that a useable output format is given. keming
output <- tolower(output) # convert to lower case.
if(output == "nn"){output <- "NN"}
pos_outs <- c("ac", "genepop", "structure", "0000", "hapmap", "NN", "pa",
"rafm", "faststructure", "dadi", "plink", "sn", "snprdata",
"colony","adegenet", "fasta", "lea", "sequoia", "vcf", "genalex")
if(!(output %in% pos_outs)){
stop("Unaccepted output format specified. Check documentation for options.\n")
}
# check that provided snpRdata objects are in the correct format
if(is.null(input_format)){
if(!is.snpRdata(x)){
stop("If x is not a snpRdata object, provide input data format.\n")
}
}
else if(is.snpRdata(x)){
imput_format <- NULL
}
# bi-allelic check
if(is.snpRdata(x)){
if(!.is.bi_allelic(x)){
pos_nbi_outs <- c("genepop", "0000", "pa")
if(!output %in% pos_nbi_outs){
stop(paste0("The output format you selected is not currently supported for non-biallelic data. Currently supported formats are:",
paste0(pos_nbi_outs, collapse = ", "), ".\n"))
}
}
}
# do checks, print info
if(is.null(input_format) & !is.null(facets[1])){
facet.types <- x@facet.type[match(facets, x@facets)]
snp.facets <- which(facet.types == "snp")
both.facets <- which(facet.types == "both")
sample.facets <- which(facet.types == "sample")
}
else{
both.facets <- character()
snp.facets <- both.facets
sample.facets <- both.facets
facet.types <- both.facets
}
if(output == "ac"){
cat("Converting to allele count format.\n")
}
else if(output == "genepop"){
cat("Converting to genepop format.\n")
if(length(c(both.facets, snp.facets)) != 0){
warning("Removing invalid facet types (snp or snp and sample specific).\n")
facets <- facets[-c(snp.facets, both.facets)]
}
}
else if(output == "structure" | output == "faststructure"){
if(output == "structure"){cat("Converting to STRUCTURE format.\n")}
else{cat("Converting to fastSTRUCTURE format.\n")}
if(length(n_samp) > 1){
if(is.integer(n_samp)){
cat("Number of designated sub-samples to take:", length(n_samp), "\n")
}
else{
stop("Number of sub-samples to take must be a positive integer vector.\n")
}
}
else if (is.numeric(n_samp)){
if(floor(n_samp) == n_samp & n_samp > 0){
cat("Number of designated sub-samples to take:", n_samp, "\n")
}
else{
stop("Number of sub-samples to take must be a positive integer vector.\n")
}
}
else if (!is.na(n_samp)){
stop("Number of sub-samples to take must be a positive integer vector.\n")
}
if(length(c(both.facets, snp.facets)) != 0){
warning("Removing invalid facet types (snp or snp and sample specific).\n")
facets <- facets[-c(snp.facets, both.facets)]
}
if(output == "faststructure"){
if(length(facets) > 5){
stop("Too many facets selected for fastSTRUCTURE. Limit 5.\n")
}
}
else{
if(length(facets) > 1){
stop("Only one facet permitted for structure format. Complex, sample only facets are allowed.\n")
}
}
}
else if(output == "0000"){
cat("Converting to numeric 2 character format.\n")
}
else if(output == "hapmap"){
cat("Converting to migrate-N hapmap format.\n")
}
else if(output == "NN"){
cat("Converting to NN format.\n")
}
else if(output == "pa"){
cat("Converting to allele presence/absense format.\n")
if(length(c(both.facets, snp.facets)) != 0){
warning("Removing invalid facet types (snp or snp and sample specific).\n")
facets <- facets[-c(snp.facets, both.facets)]
}
}
else if(output == "rafm"){
cat("Converting to RAFM format.\n")
if(length(c(both.facets, snp.facets)) != 0){
warning("Removing invalid facet types (snp or snp and sample specific).\n")
facets <- facets[-c(snp.facets, both.facets)]
}
}
else if(output == "dadi"){
if(is.null(input_format)){
if(!("ref" %in% colnames(x@snp.meta)) | !("anc" %in% colnames(x@snp.meta))){
stop("Reference and ancestor/outgroup flanking bases required in snp metadata columns named 'ref' and 'anc', respecitvely")
}
}
else {
if(!("ref" %in% colnames(x)) | !("anc" %in% colnames(x))){
stop("Reference and ancestor/outgroup flanking bases required in columns named 'ref' and 'anc', respecitvely")
}
}
cat("Converting to dadi format...\n")
if(length(c(both.facets, snp.facets)) != 0){
warning("Removing invalid facet types (snp or snp and sample specific).\n")
facets <- facets[-c(snp.facets, both.facets)]
}
}
else if(output == "plink"){
.check.installed("genio")
if(is.null(chr)){
stop("The chr argument must be provided for plink output, indicating which column of the data contains chromosome information.\n")
}
if(is.null(input_format)){
if(!all(c("position") %in% colnames(x@snp.meta)) | !any(chr %in% colnames(x@snp.meta))){
stop("A column named position and one matching the argument 'chr' containing position in bp and chr/linkage group/scaffold must be present in snp metadata!")
}
}
else if(!all(c("position") %in% colnames(x)) | !any(chr %in% colnames(x))){
stop("A column named position and one matching the argument 'chr' containing position in bp and chr/linkage group/scaffold must be present in x!")
}
if(!is.null(ped)){
if(!is.data.frame(ped)){
stop("ped must be a six column data.frame containg Family ID, Individual ID, Paternal ID, Maternal ID, Sex, and Phenotype and one row per sample. See plink documentation.\n")
}
if(ncol(ped) != 6 | nrow(ped) != ncol(x)){
stop("ped must be a six column data.frame containg Family ID, Individual ID, Paternal ID, Maternal ID, Sex, and Phenotype and one row per sample. See plink documentation.\n")
}
}
cat("Converting to PLINK! binary format.\n\tThis relies on the genio package--please cite this for publication.\n")
if(length(c(both.facets, snp.facets)) != 0){
warning("Removing invalid facet types (snp or snp and sample specific).\n")
facets <- facets[-c(snp.facets, both.facets)]
}
}
else if(output == "sn"){
cat("Converting to single character numeric format.\n")
if(length(c(both.facets, snp.facets)) != 0){
warning("Removing invalid facet types (snp or snp and sample specific).\n")
facets <- facets[-c(snp.facets, both.facets)]
}
}
else if(output == "snprdata"){
if(is.null(input_format)){
stop("Data already in snpRdata object.\n")
}
if(input_format != "ms"){
if(all(is.null(snp.meta), is.null(input_meta_columns)) | is.null(input_mDat)){
stop("sample meta, snp meta, and input missing data format must be provided for conversion to snpRdata object.")
}
else if(is.null(snp.meta) & !is.null(input_meta_columns)){
cat("Using input metadata columns as snp meta.\n")
}
}
else if(is.null(input_mDat)){
input_mDat <- -1
}
cat("Converting to snpRdata object.\n")
}
else if(output == "colony"){
cat("Converting to colony format.\n")
}
else if(output == "adegenet"){
cat("Converting to adegenet genind object. SNP metadata will be discarded.\n")
}
else if(output == "fasta"){
cat("Converting to psuedo-fasta file. All snps will be treated as a single sequence.\nHeterozygotes will be randomly called as either the major or minor allele.\n")
}
else if(output == "lea"){
cat("Converting to LEA geno format. All metadata will be discarded.\n")
}
else if(output == "sequoia"){
cat("Converting to Sequoia format.\n")
}
else if(output == "vcf"){
if(!position %in% colnames(snp.meta(x))){
stop(paste0("No column named matching position argument (", position, ") in SNP metadata. This is required for VCF output.\n"))
}
if(!chr %in% colnames(snp.meta(x))){
stop(paste0("No column named matching chr argument (", chr, ") in SNP metadata. This is required for VCF output.\n"))
}
cat("Converting to VCF format.\n")
}
else if(output == "genalex"){
cat("Converting to genalex format.")
.check.installed("openxlsx")
}
else{
stop("Please specify output format.")
}
if(interpolate == "iPCA"){
.check.installed("missMDA")
}
#======================put data into snpRdata object if not in that format to start with================
if(!is.null(input_format)){
cat("Converting data to snpRdata, NN format.\n")
if(input_format == "ms"){
if(!is.null(snp.meta) | !is.null(input_meta_columns)){
input_meta_columns <- NULL
snp.meta <- NULL
warning("For ms inputs, provided snp.meta will be ignored and will be pulled from
input ms instead.\n")
}
if(is.null(sample.meta)){
stop("For ms input, please provide sample metadata.\n")
}
if(!is.numeric(chr.length)){
stop("For ms input, please provide either a single or a vector of chromosome lengths.\n")
}
if(!is.character(x)){
stop("For ms input, please provide a file path to the 'x' argument.\n")
}
else if(length(x) != 1){
stop("For ms input, please provide a file path to the 'x' argument.\n")
}
else if(!file.exists(x)){
stop("File provided to 'x' not found.\n")
}
cat("Parsing ms file...")
x <- .process_ms(x, chr.length)
snp.meta <- x$meta
x <- x$x
x <- .convert_2_to_1_column(x)
cat(" done.\n")
input_format <- "sn"
}
if(!is.null(input_meta_columns)){
headers <- x[,c(1:input_meta_columns)]
x <- x[,-c(1:input_meta_columns)]
}
else{
if(is.null(snp.meta)){
headers <- data.frame(snpID = 1:nrow(x))
}
else{
headers <- snp.meta
}
}
if(is.null(sample.meta)){
sample.meta <- data.frame(ID = 1:ncol(x))
}
#====================checks===================
if(is.null(input_mDat)){
stop("Input missing data encoding required.\n")
}
if(input_format == "NN"){
cat("Input format: NN\n")
if(nchar(input_mDat) != 2){
stop("Missing data format must be two character.\n")
}
else{
cat("Missing data format: ", input_mDat, "\n")
}
}
else if(input_format == "0000"){
cat("Input format: 0000\n")
if(nchar(input_mDat) != 4){
stop("Missing data format must be four characters.\n")
}
else{
cat("Missing data format: ", input_mDat, "\n")
}
}
else if(input_format == "snp_tab"){
cat("Input format: snp_tab.\n")
if(nchar(input_mDat) != 2){
stop("Missing data format must be two characters.\n")
}
else{
cat("Missing data format: ", input_mDat, "\n")
}
}
else if(input_format == "sn"){
cat("Iput format: sn\n")
if(input_mDat %in% c(0:2)){
stop("Missing data format must be other than 0, 1, or 2.\n")
}
else{
cat("Missing data format: ", input_mDat, "\n")
}
}
else{
stop("Unsupported input format.")
}
#====================convert to snpRdata intermediate=============
# 0000 put into snpRdata
if(input_format == "0000"){
#vectorize and replace
xv <- as.matrix(x)
xv <- gsub("01", "A", xv)
xv <- gsub("02", "C", xv)
xv <- gsub("03", "G", xv)
xv <- gsub("04", "T", xv)
xv <- gsub(substr(input_mDat, 1, 2), "N", xv)
x <- as.data.frame(xv, stringsAsFactors = F)
rm(xv)
input_mDat <- "NN"
#rebind to matrix and remake data.
if(output == "NN"){
rdata <- cbind(sample.meta, x) #all done if just converting to NN
}
else{
x <- import.snpR.data(x, sample.meta = sample.meta, snp.meta = headers, mDat = "NN")
}
}
# convert snp_tab to snpRdata
if(input_format == "snp_tab"){
xv <- as.matrix(x)
ptf <- nchar(xv) == 1
xv[ptf] <- paste0(xv[ptf], xv[ptf]) #double up homozygotes
remove(ptf)
xv <- gsub(" ", "", xv) #combine heterozygotes.
xv[xv == input_mDat] <- "NN" #replace with correct missing data
x <- as.data.frame(xv, stringsAsFactors = F)
if(output == "NN"){
rdata <- cbind(headers, x)
}
else{
x <- import.snpR.data(x, sample.meta = sample.meta, snp.meta = headers, mDat = "NN")
}
}
#convert "sn" format to NN
if(input_format == "sn"){
xv <- as.matrix(x)
xv[xv == 0] <- "AA"
xv[xv == 1] <- "AC"
xv[xv == 2] <- "CC"
xv[xv == input_mDat] <- "NN"
x <- as.data.frame(xv, stringsAsFactors = F)
if(output == "NN"){
rdata <- as.data.frame(x) #all done if just converting to NN
}
else{
x <- import.snpR.data(x, sample.meta = sample.meta, snp.meta = headers, mDat = "NN")
}
}
if(input_format == "NN"){
x <- import.snpR.data(x, sample.meta = sample.meta, snp.meta = headers, mDat = "NN")
}
if(!is.null(facets)){
x <- .add.facets.snpR.data(x, facets)
}
# if this is the desired output, we're done.
if(output == "snprdata"){
if(outfile != FALSE){
saveRDS(x, outfile)
}
return(x)
}
}
#======================prepare outfile==================================================================
if(outfile != FALSE){
if(is.character(outfile) & length(outfile) == 1){
cat("Printing results to:", outfile, "\n")
}
else{
stop("Outfile must be a character vector of length 1.\n")
}
}
#======================do conversions===================================================================
# add missing facets
if(length(facets) == 0){facets <- NULL}
if(!is.null(facets[1])){
if(!all(facets %in% x@facets)){
cat("Adding missing facets.\n")
new.facets <- facets[which(!(facets %in% x@facets))]
x <- .add.facets.snpR.data(x, new.facets)
}
}
# set facets to ".base" if not requested
if(is.null(facets[1])){
facets <- ".base"
}
# convert to allele count, migrate-n, or dadi format. Migrate-n should ALWAYS have multiple pops (why else would you use it?)
if(output == "ac" | output == "hapmap" | output == "dadi"){
if(output == "hapmap"){cat("WARNING: Data does not have header or pop spacer rows.\n")}
#=========basic ac constructor function, to be on each facet==========
# x: stats slot of a snpR object, filtered to the desired facets
# maj: major allele identities across all facets at the relevent snps
# min: minor allele identities across all facets at the relevent snps
# mis.al: missing allele coding
get.ac <- function(x, maj, min, mis.al){
# initialize:
if(is.null(nrow(x))){
temp.x <- matrix(x, ncol = length(x))
colnames(temp.x) <- names(x)
x <- temp.x
}
out <- data.frame(n_total = numeric(length(maj)),
n_alleles = numeric(length(maj)),
ni1 = numeric(length(maj)),
ni2 = numeric(length(maj)))
# get the column from as matching the target allele.
maj.col.match <- match(maj, colnames(x))
out$ni1 <- t(x)[maj.col.match + seq(0, length(x) - ncol(x), by = ncol(x))]
# ni1 is the rowsums minus this
out$ni2 <- rowSums(x) - out$ni1
out$n_total <- rowSums(x)
out$n_alleles <- rowSums(ifelse(out[,3:4] != 0, 1, 0))
out[is.na(out)] <- 0
return(out)
}
#=========apply to requested facets=======
# get missing maf info
mafs_to_calc <- .check_calced_stats(x, unique(c(".base", facets)), "maf")
if(any(!unlist(mafs_to_calc))){
x <- calc_maf(x, names(mafs_to_calc[which(!unlist(mafs_to_calc))]))
}
# grab the major and minors for each snp in the analysis.
.snp.id <- facet <- subfacet <- NULL
x@stats <- dplyr::arrange(x@stats, .snp.id, facet, subfacet)
maj <- x@stats$major[x@stats$facet == ".base"]
min <- x@stats$minor[x@stats$facet == ".base"]
maj <- rep(maj, each = length(unique(x@facet.meta[x@facet.meta$facet %in% facets,]$subfacet)))
min <- rep(min, each = length(unique(x@facet.meta[x@facet.meta$facet %in% facets,]$subfacet)))
# get the ac file
ac.dat <- get.ac(x@geno.tables$as[x@facet.meta$facet %in% facets,],
maj = maj,
min = min,
substr(x@mDat, 1, nchar(x@mDat)/2))
if(output == "dadi"){
rdata <- cbind(x@stats[x@stats$facet %in% facets, c("facet", "subfacet", ".snp.id")], ac.dat)
ni1 <- reshape2::dcast(rdata, facet + .snp.id ~ subfacet, value.var = c("ni1"))
ni2 <- reshape2::dcast(rdata, facet + .snp.id ~ subfacet, value.var = c("ni2"))
rdata <- cbind(ref = x@snp.meta$ref, # since everything is sorted by .snp.id, this will match.
anc = x@snp.meta$anc,
Allele1 = x@stats[x@stats$facet == ".base", "major"],
ni1[order(ni1$.snp.id), 3:ncol(ni1), drop = FALSE],
Allele2 = x@stats[x@stats$facet == ".base", "minor"],
ni2[order(ni2$.snp.id), 3:ncol(ni2), drop = FALSE],
x@snp.meta[,which(!(colnames(x@snp.meta) %in% c("ref", "anc", ".snp.id")))])
rdata <- as.data.frame(rdata)
colnames(rdata)[c(3,3 + length(3:ncol(ni1)) + 1)] <- c("Allele1", "Allele2")
if(length(facets) == 1 & facets[1] == ".base"){
colnames(rdata)[c(4,6)] <- ".base"
}
}
else{
rdata <- cbind(x@facet.meta[x@facet.meta$facet %in% facets,],
ac.dat)
if(output == "hapmap"){
rdata <- dplyr::arrange(rdata, facet, subfacet, .snp.id)
}
}
}
if(output == "NN"){
rdata <- cbind(x@snp.meta, as.data.frame(x))
colnames(rdata)[1:ncol(x@snp.meta)] <- colnames(x@snp.meta)
}
##convert to genepop or numeric format (v)
if (output == "genepop" | output == "0000"){
if(!.is.bi_allelic(x)){
if(output == "genepop"){
rdata <- as.data.frame(t(genotypes(x)))
row.names(rdata) <- paste0(row.names(rdata), " ,") #adding space and comma to row names, as required.
}
else{
rdata <- genotypes(x)
rdata <- cbind(x@snp.meta, rdata)
colnames(rdata)[1:ncol(x@snp.meta)] <- colnames(x@snp.meta)
}
}
else{
# keming
#vectorize and replace
xv <- as.matrix(x)
xv <- gsub("A", "01", xv)
xv <- gsub("C", "02", xv)
xv <- gsub("G", "03", xv)
xv <- gsub("T", "04", xv)
xv <- gsub(substr(x@mDat, 1, 2), "0000", xv)
# keming
if(output == "genepop"){ #convert to genepop
rdata <- as.data.frame(t(xv), stringsAsFactors = F) #remove extra columns and transpose data
row.names(rdata) <- paste0(row.names(rdata), " ,") #adding space and comma to row names, as required.
}
# else if(output == "baps"){}
else {#prepare numeric output, otherwise same format
rdata <- as.data.frame(xv, stringsAsFactors = F)
rdata <- cbind(x@snp.meta, rdata)
colnames(rdata)[1:ncol(x@snp.meta)] <- colnames(x@snp.meta)
}
}
}
##convert to structure, fastStructure or RAFM format (v)
if (output == "structure" | output == "rafm" | output == "faststructure" | output == "genalex"){
if(length(facets) > 1){
stop("Only one facet at a time permitted for structure/rafm/faststructure/genalex!\n")
}
facets <- .check.snpR.facet.request(x, facets)
#subset if requested
if(all(!is.na(n_samp))){
cat("Subsetting ")
if(length(n_samp) > 1){
cat("designated SNPs.\n")
x <- subset_snpR_data(x, n_samp)
}
else{
cat(n_samp, " random SNPs.\n")
x <- subset_snpR_data(x, sample(1:nrow(x)))
}
}
# transpose, since these are sample based, then split into two matrices
xv <- t(as.matrix(x))
xv1 <- substr(xv, 1, 1)
xv2 <- substr(xv, 2, 2)
# replace
xv1[xv1 == "A"] <- 1
xv1[xv1 == "C"] <- 2
xv1[xv1 == "G"] <- 3
xv1[xv1 == "T"] <- 4
xv1[xv1 == substr(x@mDat, 1, nchar(x@mDat)/2)] <- 0
xv2[xv2 == "A"] <- 1
xv2[xv2 == "C"] <- 2
xv2[xv2 == "G"] <- 3
xv2[xv2 == "T"] <- 4
xv2[xv2 == substr(x@mDat, 1, nchar(x@mDat)/2)] <- 0
#create output matrix and bind the data to it, structure format
if(output == "structure" | output == "faststructure"){
outm <- matrix(NA, nrow = 2*(ncol(x)), ncol = nrow(x))
#fill
outm[seq(1,nrow(outm),2),] <- xv1
outm[seq(2,nrow(outm),2),] <- xv2
#add sample names
snames <- character(nrow(outm))
snames[seq(1,nrow(outm),2)] <- x@names
snames[seq(2,nrow(outm),2)] <- x@names
# fix missing to -9
outm[outm == 0] <- -9
if(output == "structure"){ # bind sample and metadata for structure
#prep pop numbers
facets <- unlist(.split.facet(facets))
struc.meta <- as.numeric(as.factor(
.paste.by.facet(sample.meta(x)[rep(1:nsamps(x), each = 2),], colnames(sample.meta(x)) %in% facets)))
if(length(struc.meta) > 0){
rdata <- cbind(ind = snames,
struc.meta,
as.data.frame(outm, stringsAsFactors = F))
}
else{
rdata <- cbind(ind = snames,
as.data.frame(outm, stringsAsFactors = F))
}
colnames(rdata)[2] <- paste(facets, collapse = ".")
}
else{ #add a bunch of filler columns for faststructure and change missing data to -9
if(length(facets) == 1 & facets[1] == ".base"){
rdata <- cbind(ind = snames,
matrix("filler", nrow(outm), 5),
as.data.frame(outm, stringsAsFactors = F))
colnames(rdata)[2:6] <- paste0("filler", 1:5)
}
else{
n.valid.facets <- sum(colnames(x@sample.meta) %in% facets)
rdata <- cbind(ind = snames,
x@sample.meta[,colnames(x@sample.meta) %in% facets],
matrix("filler", nrow(outm), 5 - n.valid.facets),
as.data.frame(outm, stringsAsFactors = F))
colnames(rdata)[(1 + n.valid.facets):6] <- paste0("filler", 1:(5 - n.valid.facets))
}
}
}
else if(output == "genalex"){
rdata <- matrix(NA, nrow = nsamps(x), ncol = nsnps(x)*2)
rdata[,seq(1, ncol(rdata), by = 2)] <- xv1
rdata[,seq(2, ncol(rdata), by = 2)] <- xv2
rdata <- matrix(as.numeric(rdata), nrow(rdata), ncol(rdata))
}
#create output matrix and bind to it, RAFM format.
else{
outm <- matrix(NA, ncol(x), nrow(x)*2)
#fill
outm[,seq(1,ncol(outm),2)] <- xv1
outm[,seq(2,ncol(outm),2)] <- xv2
#replance missings with NA
outm[outm == 0] <- NA
#add column names
colnames(outm) <- paste0("locus", sort(rep(1:nrow(x),2)), rep(c(".1", ".2"), ncol(outm)/2))
#add subpop numbers, if given
rdata <- cbind(x@sample.meta[,colnames(x@sample.meta) %in% facets],
as.data.frame(outm, stringsAsFactors = F))
if(any(colnames(x@sample.meta) %in% facets)){
colnames(rdata)[1:sum(colnames(x@sample.meta) %in% facets)] <- colnames(x@sample.meta)[colnames(x@sample.meta) %in% facets]
}
}
}
#presence/absence format
if(output == "pa"){
xv <- as.matrix(x)
#function to produce vectorized presence absence (as much as possible, not vectorized for NAs)
pa_alleles <- function(xv, snp_form, mDat, nsamp, nloci){
nsamp <- nrow(xv)
nloci <- ncol(xv)
#get all possible genotypes
gs <- unique(as.vector(xv))
#which genotype is the missing data?
mpos <- which(gs == mDat)
#what are the possible alleles at all loci?
as <- unique(c(substr(gs,1,snp_form/2), substr(gs, (snp_form/2 + 1), snp_form*2)))
as <- sort(as[as != substr(mDat, 1, snp_form/2)]) #that aren't N?
#make the table
cat("Creating presence/absence table...\n")
#convert genotype vector to allele vector
xva1 <- substr(xv, 1, snp_form/2)
xva2 <- substr(xv, (snp_form/2+1),snp_form)
#initialize
amat <- matrix(0, nsamp, length(as)*nloci)
if(any(grepl("_", as))){stop("Detected '_' in genotypes. Alleles cannot be coded with underscores for pa conversion.\n")}
colnames(amat) <- paste0(sort(rep(1:nrow(x),length(as))), "_", as) #initialize all of the columns, with locus number followed by allele. Will remove anything empty after assignment.
#fill in
for(i in 1:length(as)){
pr1 <- grep(as[i], xva1) # unique rows which have the allele in either position one or position two.
pr2 <- grep(as[i], xva2)
amat[,grep(paste0("_", as[i]),colnames(amat))][pr1] <- amat[,grep(paste0("_", as[i]),colnames(amat))][pr1] + 1 #set the allele as present in the correct rows. This works because we look only at the G amat columns first, then put set only the individual IDs with a G to one. I think.
amat[,grep(paste0("_", as[i]),colnames(amat))][pr2] <- amat[,grep(paste0("_", as[i]),colnames(amat))][pr2] + 1
}
#remove alleles not seen at loci.
amat <- amat[,which(colSums(amat) != 0)]
#get allele counts per loci:
cat("Filling in missing data with NAs.\n")
###########
#fill in missing data with NAs.
if(.is.bi_allelic(x)){ # fully vectorized
xmc <- which(x == mDat) #which samples had missing data?
adj <- floor(xmc / nsamp) #how many loci over do I need to adjust xmc, since in amat each locus occupies two columns?
adj[xmc%%nsamp == 0] <- adj[xmc%%nsamp == 0] - 1 #shift over anything that got screwed up by being in the last sample
xmc <- xmc + (nsamp*adj) #adjust xmc for extra columns.
if(any(amat[xmc] != 0) | any(amat[xmc + nsamp] != 0)){
stop("Missing data values were not properly identified for replacement with NAs. This usually happens when SNP data is not completely bi-allelic. Try filtering out non-biallelic and non-polymorphic SNPs using filter_snps.\n")
}
amat[xmc] <- NA #make the first allele NA
amat[xmc + nsamp] <- NA #make the second allele (another column over) NA.
}
else{ # need to loop through loci :(
loc_key <- gsub("_.+", "", colnames(amat))
loc_key <- table(loc_key)
loc_key <- loc_key[order(as.numeric(names(loc_key)))]
loc_key <- cumsum(loc_key)
loc_key <- c(0, loc_key)
xmc <- which(as.matrix(x) == x@mDat, arr.ind = TRUE) #which samples had missing data?
for(j in 1:(length(loc_key) - 1)){
tl <- xmc[which(xmc[,1] == j),]
if(sum(amat[tl[,2], (loc_key[j] + 1):loc_key[j + 1]]) != 0){stop("Failed to correctly fill NAs.\n")}
amat[tl[,2], loc_key[j]:loc_key[j + 1]] <- NA
}
}
return(amat)
}
amat <- pa_alleles(t(xv), x@snp.form, x@mDat)
# interpolate?
if(interpolate == "bernoulli"){
amat <- t(.interpolate_sn(t(amat), "bernoulli"))
}
else if(interpolate == "af"){
amat <- t(.interpolate_sn(t(amat), "af"))
}
else if(interpolate == "iPCA"){
amat <- t(.interpolate_sn(t(amat), "iPCA", ncp = ncp, ncp.max = ncp.max))
}
# if(interp_miss){
# #average number observed in columns
# cat("Interpolating missing data...\n")
# afs <- colMeans(amat, TRUE)
# temp <- which(is.na(amat))/nrow(amat)
# fill_element <- floor(temp) + 1 #get the column for each missing data point
# fill_element[which(temp %% 1 == 0)] <- fill_element[which(temp %% 1 == 0)] - 1 #correct for anything in the last row
# amat[which(is.na(amat))] <- afs[fill_element] #fill with the appropriate allele frequency.
# }
# else{cat("Finished. Warning: Missing data counts are also stored!\n")}
amat <- cbind(samp = as.character(colnames(x)), as.data.frame(amat, stringsAsFactors = F))
rdata <- amat
}
#PLINK format
if(output == "plink"){
#===============make a bed file using genio==========
# remove any snps with no data
sn <- as.matrix(format_snps(x, "sn", interpolate = FALSE)[,-c(1:(ncol(snp.meta(x)) - 1))])
bads <- which(rowSums(is.na(sn)) == ncol(sn))
if(length(bads) > 0){
sn <- sn[-bads,]
warning("Removed some loci without any called genotypes.\n")
}
# check for bad chr names
if(!plink_recode_numeric){
chrs <- summarize_facets(x, chr)$chr
lead <- unlist(lapply(chrs, substr, stop = 1, start = 1))
bad_chrs <- which(lead %in% 0:9)
if(length(bad_chrs) > 0){
lead[bad_chrs] <- LETTERS[1:10][match(lead[bad_chrs], 0:9)]
n.chrs <- snp.meta(x)[,chr]
if(length(bads) > 0){
n.chrs <- n.chrs[-bads]
}
substr(n.chrs[which(n.chrs %in% chrs[bad_chrs])], 1, 1) <- lead[match(n.chrs[which(n.chrs %in% chrs[bad_chrs])], chrs)]
warning("Found numbers at the start of chr/scaffold names, which plink does not allow. Replaced with letters (0:9 = A:J).\n")
}
else{
n.chrs <- snp.meta(x)[,chr]
if(length(bads) > 0){
n.chrs <- n.chrs[-bads]
}
}
}
else{
n.chrs <- snp.meta(x)[,chr]
if(length(bads) > 0){
n.chrs <- n.chrs[-bads]
}
}
# sort by chr then position
if(length(bads) > 0){
n.ord <- order(n.chrs, snp.meta(x)$position[-bads])
}
else{
n.ord <- order(n.chrs, snp.meta(x)$position)
}
n.chrs <- n.chrs[n.ord]
sn <- sn[n.ord,]
if(isFALSE(outfile)){
genio::write_bed("plink_out.bed", sn, verbose = FALSE)
cat("Wrote bed file to plink_out.bed\n")
}
genio::write_bed(paste0(outfile, ".bed"), sn, verbose = FALSE)
#===============make a ped file=================
# make an empty set of ped header columns if not provided
lower.sample.cols <- tolower(colnames(x@sample.meta))
if(is.null(ped)){
if(any(lower.sample.cols == "fam")){
fam <- x@sample.meta[,which(lower.sample.cols == "fam")]
}
else{
fam <- rep("NA", ncol(x))
}
if(any(lower.sample.cols == "patid")){
PatID <- x@sample.meta[,which(lower.sample.cols == "patid")]
}
else{
PatID <- rep("NA", ncol(x))
}
if(any(lower.sample.cols == "sex")){
Sex <- x@sample.meta[,which(lower.sample.cols == "sex")]
}
else{
Sex <- rep("NA", ncol(x))
}
if(any(colnames(x@sample.meta) == phenotype)){
cat("Using phenotype column as phenotype.")
Phenotype <- x@sample.meta[,which(colnames(x@sample.meta) == phenotype)]
if(length(unique(stats::na.omit(Phenotype))) == 2){
uf <- factor(Phenotype)
cat("Two phenotypes detected, options encoded as: \n\t",
levels(uf)[1], " = 0\n\t",
levels(uf)[2], " = 1\n\t",
"\n\tNA as -9.\n")
uf <- as.numeric(uf)
uf[is.na(uf)] <- -9
Phenotype <- uf
}
else{
warning("Plink does not typically support more than two phenotypes. Phenotypes left as-is, with missing data as NA (plink requires -9).\n")
}
}
else{
cat("No phenotype column in sample metadata detected, filling with missing.\n")
Phenotype <- rep("-9", ncol(x))
}
if(any(lower.sample.cols == "matid")){
MatID <- x@sample.meta[,which(lower.sample.cols == "matid")]
}
else{
MatID <- rep("NA", ncol(x))
}
ped <- data.frame(fam = fam,
ind = colnames(x),
PatID = PatID,
MatID = MatID,
Sex = Sex,
Phenotype = Phenotype)
}
else{
if(any(colnames(ped) == "Phenotype")){
Phenotype <- ped$Phenotype
if(length(unique(stats::na.omit(Phenotype))) == 2){
uf <- factor(Phenotype)
cat("Two phenotypes detected, options encoded as: \n\t",
levels(uf)[1], " = 0\n\t",
levels(uf)[2], " = 1\n\t",
"\n\tNA as -9.\n")
uf <- as.numeric(uf)
uf[is.na(uf)] <- -9
Phenotype <- uf
ped$Phenotype <- Phenotype
}
else{
warning("Plink does not typically support more than two phenotypes. Phenotypes left as-is, with missing data as NA (plink requires -9).\n")
}
}
}
# save .fam
fam <- ped
# change missing data value and add a space between alleles.
x.np <- as.vector(t(x))
x.np[x.np == x@mDat] <- "00"
x.np <- gsub("(.)(.)", "\\1 \\2", x.np)
# rebind
ped <- cbind(ped, matrix(x.np, nrow(ped), nrow(x)), stringsAsFactors = F)
#===============make an extended map file=================
a.names <- get.snpR.stats(x, stats = "maf")$single[,c("major", "minor")]
# with morgans
if(any(colnames(x@snp.meta) %in% c("cM", "cm", "morgans"))){
bim <- data.frame(chr = x@snp.meta[,chr],
rs = x@snp.meta$.snp.id,
cM = x@snp.meta[,which(colnames(x@snp.meta) %in% c("cM", "cm", "morgans"))][,1],
bp = x@snp.meta$position,
a1 = a.names[,1],
a2 = a.names[,2])
}
# without morgans
else{
bim <- data.frame(chr = x@snp.meta[,chr],
rs = x@snp.meta$.snp.id,
cM = 0,
bp = x@snp.meta$position,
a1 = a.names[,1],
a2 = a.names[,2])
}
# remove bads
if(length(bads) > 0){
bim <- bim[-bads,]
}
# re-order
bim <- bim[n.ord,]
# re-name
bim$chr <- n.chrs
# recode chr
if(plink_recode_numeric){
bim$chr <- as.numeric(as.factor(bim$chr))
}
# grab normal map file
map <- bim[,1:4]
# name output
rdata <- list(ped = ped, map = map, bim = bim, fam = fam)
}
# colony format for 01234
if(output == "colony"){
#gsub the values
rdata <- gsub(pattern = "N", replacement = "0", x = as.matrix(x))
rdata <- gsub(pattern = "A", replacement = "1", x = rdata)
rdata <- gsub(pattern = "C", replacement = "2", x = rdata)
rdata <- gsub(pattern = "G", replacement = "3", x = rdata)
rdata <- gsub(pattern = "T", replacement = "4", x = rdata)
rdata <- gsub(pattern = "^([0-4])([0-4])", replacement = "\\1 \\2", rdata)
# transpose
rdata <- t(rdata)
rdata <- cbind(colnames(x), as.data.frame(rdata))
}
# single-character numeric format
if(output == "sn" | output == "lea" | output == "sequoia"){
# grab major/minor info via calc_maf, unless already calculated
if(!any(colnames(x@stats) == "major")){
x <- calc_maf(x)
}
else if(any(is.na(x@stats$major[x@stats$facet == ".base"]))){
x <- calc_maf(x)
}
min <- x@stats$minor[x@stats$facet == ".base"]
maj <- x@stats$major[x@stats$facet == ".base"]
# check to see if each allele is the minor, assign a one if so
a1 <- substr(unlist(t(genotypes(x))), 1, 1)
a2 <- substr(unlist(t(genotypes(x))), 2, 2)
a1 <- a1 == rep(min, each = ncol(x))
a2 <- a2 == rep(min, each = ncol(x))
rdata <- t(a1 + a2)
rdata[as.matrix(genotypes(x)) == x@mDat] <- NA
# sn
if(output == "sn"){
# grab out metadata
meta <- x@snp.meta
# if interpolating and there are any bad loci with zero called genotypes, remove them
if(interpolate != FALSE){
bad.loci <- ifelse(is.na(rdata), 0, 1)
bad.loci <- which(rowSums(bad.loci) == 0)
if(length(bad.loci) > 0){
rdata <- rdata[-bad.loci,]
meta <- meta[-bad.loci,]
warning("Some loci had no called genotypes and were removed: ", paste0(bad.loci, collapse = ", "), "\n")
}
}
# interpolate?
if(interpolate == "bernoulli"){
rdata <- .interpolate_sn(rdata, "bernoulli")
}
else if(interpolate == "af"){
rdata <- .interpolate_sn(rdata, "af")
}
else if(interpolate == "iPCA"){
rdata <- .interpolate_sn(rdata, "iPCA", ncp = ncp, ncp.max = ncp.max)
}
# bind and save
rdata <- cbind(meta[,-which(colnames(meta) == ".snp.id")], as.data.frame(rdata))
}
# sequoia
else if(output == "sequoia"){
rdata[is.na(rdata)] <- -9
rdata <- t(rdata)
rdata <- matrix(as.numeric(rdata), ncol = ncol(rdata))
colnames(rdata) <- 1:ncol(rdata)
fix.sex.sequoia <- function(y){
#format sex for sequoia (1 = F,2 = M,3 = U,4 = H, NA)
sexes <- c("F", "M", "U", "H", "1", "2", "3", "4")
sex <- sample.meta(x)$Sex
sex[which(!sex %in% sexes)] <- 3
sex[which(sex == "F")] <- 1
sex[which(sex == "M")] <- 2
sex[which(sex == "U")] <- 3
sex[which(is.na(sex) == TRUE)] <- 3
sex[which(sex == "H")] <- 4
return(sex)
}
#for lifehistory data input needs id, sex, year born
if(!all(c("Sex", "BirthYear", "ID", "BYmin", "BYmax", "Yearlast") %in% colnames(x@sample.meta))){
warning("Required columns Sex, ID, Yearlast, BirthYear, BYmax, and BYmin not found in sample metadata.\n")
}
ID <- x@sample.meta$ID
sex <- fix.sex.sequoia(x)
#ACTUALLY MAKE THE TABLE
lhtable <- data.frame(ID=ID,
Sex = as.numeric(sex),
BirthYear = sample.meta(x)$BirthYear,
BY.min = sample.meta(x)$BYmin,
BY.max = sample.meta(x)$BYmax,
Year.last = sample.meta(x)$Yearlast,
stringsAsFactors = F)
rownames(rdata) <- ID
rdata = list(dat=rdata, lh=lhtable)
}
# lea
else{
rdata <- 2 - rdata
rdata[is.na(rdata)] <- 9
}
}
# adegenet
if(output == "adegenet"){
pop.col <- which(colnames(x@sample.meta) == "pop")
if(ncol(x@sample.meta) > 1){
strata <- x@sample.meta[,-ncol(x@sample.meta)]
if(!is.data.frame(strata)){
strata <- as.data.frame(strata, stringsAsFactors = F)
colnames(strata) <- colnames(x@sample.meta)[-ncol(x@sample.meta)]
row.names(strata) <- colnames(x)
}
else{
strata <- NULL
}
}
if(length(pop.col) > 0){
rdata <- adegenet::df2genind(t(as.data.frame(x, stringsAsFactors = F)), ncode = 1,
NA.char = substr(x@mDat, 1, nchar(x@mDat)/2),
strata = strata,
loc.names = x@snp.meta$.snp.id,
pop = x@sample.meta$pop)
}
else{
rdata <- adegenet::df2genind(t(as.data.frame(x, stringsAsFactors = F)), ncode = 1,
NA.char = substr(x@mDat, 1, nchar(x@mDat)/2),
strata = strata,
loc.names = x@snp.meta$.snp.id)
}
}
# fasta
if(output == "fasta"){
# first, need to randomly draw either a major or minor allele for the heterozygotes
sn <- format_snps(x, "sn", interpolate = FALSE)
sn <- sn[,-(1:(ncol(x@snp.meta) - 1))]
sn <- as.matrix(sn)
hets <- which(sn == 1)
nhets <- length(hets)
draws <- stats::rbinom(nhets, 1, .5)
draws[draws == 1] <- 2
sn[hets] <- draws
# assign major or minor back
maf.check <- .check_calced_stats(x, ".base", stats = "maf")
if(!unlist(maf.check)){
x <- calc_maf(x)
}
s <- .get.snpR.stats(x)
# majors
sn[is.na(sn)] <- "N"
majs <- which(sn == 0)
rmajs <- majs %% nrow(sn)
rmajs[rmajs == 0] <- nrow(sn)
rmajs <- s$major[rmajs]
sn[majs] <- rmajs
# minors
mins <- which(sn == 2)
rmins <- mins %% nrow(sn)
rmins[rmins == 0] <- nrow(sn)
rmins <- s$minor[rmins]
sn[mins] <- rmins
# paste together by columns
rdata <- do.call(paste0, as.data.frame(t(sn), stringsAsFactors = F)) # faster than the tidyr version!
names(rdata) <- colnames(sn)
}
# VCF
if(output == "vcf"){
pos <- `#CHROM` <- NULL
maxes <- tapply(snp.meta(x)[,position], snp.meta(x)[,chr], max)
chr_opts <- unique(snp.meta(x)[,chr])
maxes <- maxes[match(chr_opts, names(maxes))]
chr_filt_lines <- paste0("##contig=<ID=", chr_opts, ",length=", maxes,">")
# meta
r <- try(x@filters, silent = TRUE)
if(methods::is(r, "try-error")){
.make_it_quiet(vcf_meta <- c("##fileformat=VCFv4.0",
paste0("##fileDate=", gsub("-", "", Sys.Date())),
"##source=snpR",
'##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
'##INFO=<ID=AC,Number=1,Type=Integer,Description="Allele Count">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
chr_filt_lines))
}
else{
.make_it_quiet(vcf_meta <- c("##fileformat=VCFv4.0",
paste0("##fileDate=", gsub("-", "", Sys.Date())),
"##source=snpR",
'##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
'##INFO=<ID=AC,Number=1,Type=Integer,Description="Allele Count">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
paste0("##snpR filter_snps=", filters(x)),
chr_filt_lines))
}
# meta columns
if(all(c("REF", "ALT") %in% colnames(snp.meta(x)))){
cat("Pulling REF and ALT from snp.metadata. If this was defined elsewhere than in a source .vcf file, this may cause issues--please remove these columns from the snp.metadata and re-run in this case.\n")
data_meta <- data.frame(CHROM = snp.meta(x)[,chr],
POS = snp.meta(x)[,position],
ID = paste0("SNP_", 1:nrow(x)),
REF = snp.meta(x)$REF,
ALT = snp.meta(x)$ALT,
QUAL = ".",
FILTER = "PASS",
INFO = paste0("NS=", .get.snpR.stats(x)$maj.count + .get.snpR.stats(x)$min.count,
";AC=", .get.snpR.stats(x)$min.count),
FORMAT = "GT")
}
else{
data_meta <- data.frame(CHROM = snp.meta(x)[,chr],
POS = snp.meta(x)[,position],
ID = paste0("SNP_", 1:nrow(x)),
REF = .get.snpR.stats(x)$major,
ALT = .get.snpR.stats(x)$minor,
QUAL = ".",
FILTER = "PASS",
INFO = paste0("NS=", .get.snpR.stats(x)$maj.count + .get.snpR.stats(x)$min.count,
";AC=", .get.snpR.stats(x)$min.count),
FORMAT = "GT")
}
colnames(data_meta)[1] <- "#CHROM"
malt <- which(data_meta$ALT == "N")
mref <- which(data_meta$REF == "N")
if(length(malt) > 0){
data_meta$ALT[malt] <- "."
}
if(length(mref) > 0){
data_meta$REF[mref] <- "."
}
# genotypes, use sn format intermediate
rdata <- format_snps(x, "sn", interpolate = F)[,-c(1:(ncol(snp.meta(x)) - 1))]
rdata <- as.matrix(rdata)
rdata[rdata == 0] <- "0/0"
rdata[rdata == 1] <- "0/1"
rdata[rdata == 2] <- "1/1"
rdata[is.na(rdata)] <- "./."
rdata <- as.data.frame(rdata)
rdata <- list(meta = vcf_meta, genotypes = rdata, data_meta = data_meta)
}
#======================return the final product, printing an outfile if requested.=============
if(outfile != FALSE){
osp <- options("scipen")$scipen
options(scipen = 999)
cat("Writing output file...\n")
if(any(facets == ".base")){
facets <- facets[-which(facets == ".base")]
}
if(output == "genepop"){ # if(output %in% c("genepop", "baps"))
cat("\tPreparing genepop file...\n")
# get list of snps
llist <- paste0("SNP", "_", 1:ncol(rdata), ",")
llist[length(llist)] <- paste0("SNP_", ncol(rdata))
# write output
base::cat(paste0(unlist(.split.facet(outfile))[1], "_genepop\n"), file = outfile)
base::cat(llist, "\nPOP\n", file = outfile, append = T)
# write the tables, splitting by pop if requested:
if(length(facets) > 0){
cat("\tWriting genepop file seperated by populations. First provided facet is treated as pop.\t")
# sort by pop
write.facets <- sort(unlist(.split.facet(facets)))
facet.cols <- match(write.facets, colnames(x@sample.meta))
pop <- .paste.by.facet(sample.meta(x), facet.cols)
pop <- gsub(" ", ".", pop)
rdata$pop <- pop
pop <- sort(unique(rdata$pop)) # for later
# sort and remove pop column
rdata$rnames <- rownames(rdata)
rdata <- dplyr::arrange(rdata, pop)
pop.rows <- rdata$pop
rownames(rdata) <- rdata$rnames
rdata <- rdata[,-which(colnames(rdata) %in% c("pop", "rnames"))]
#second loop prints results.
for (i in 1:(length(pop))){
cat(pop[i], "\t")
data.table::fwrite(rdata[pop.rows == pop[i],], outfile, quote = F, sep = "\t", col.names = F, row.names = T, append = T)
if(i != length(pop)){
base::cat("POP\n", file = outfile, append = T)
}
}
cat("\t Done.\n")
}
else{
data.table::fwrite(rdata, outfile, quote = F, sep = "\t", col.names = F, row.names = T, append = T)
}
}
else if(output == "ac"){
#write the raw output
data.table::fwrite(rdata, outfile, quote = FALSE, col.names = T, sep = "\t", row.names = F)
#write a bayescan object if pop list was provided.
if(length(facets) > 0){
outfile <- paste0(outfile, ".bayes")
cat("\tWriting bayescan file seperated by populations. First provided facet is treated as pop.\t")
pop <- x@sample.meta[,colnames(x@sample.meta) == facets[1]]
u.pops <- unique(pop)
trdat <- rdata[rdata$facet == facets[1], c("subfacet", ".snp.id", "n_total", "n_alleles", "ni1", "ni2")]
#write the header
base::cat("[loci]=", nrow(x), "\n\n[populations]=", length(u.pops), "\n\n", file = outfile, sep = "")
#write the data for each population.
for(i in 1:length(u.pops)){
base::cat("[pop]=", i, "\n", file = outfile, append = T, sep = "") #write header
tdat <- trdat[trdat$subfacet == u.pops[i],]
wdat <- cbind(snp = tdat$.snp.id, tdat[,3:6])
data.table::fwrite(wdat,
outfile, col.names = F, row.names = F, quote = F, sep = "\t",
append = T) # write the data for this population.
base::cat("\n", file = outfile, append = T) # add a line break
}
}
}
else if(output == "structure" | output == "faststructure"){
data.table::fwrite(rdata, outfile, quote = FALSE, col.names = F, sep = "\t", row.names = F)
}
else if(output == "plink"){
data.table::fwrite(map, paste0(outfile, ".map"), quote = F, col.names = F, sep = "\t", row.names = F)
data.table::fwrite(fam, paste0(outfile, ".fam"), quote = F, col.names = F, sep = "\t", row.names = F)
data.table::fwrite(ped, paste0(outfile, ".ped"), quote = F, col.names = F, sep = "\t", row.names = F)
data.table::fwrite(bim, paste0(outfile, ".bim"), quote = F, col.names = F, sep = "\t", row.names = F)
}
else if(output == "adegenet"){
saveRDS(rdata, paste0(outfile, ".RDS"))
}
else if(output == "fasta"){
writeobj <- c(rbind(paste0(">", names(rdata)), rdata))
writeLines(writeobj, outfile)
}
else if(output == "lea"){
utils::write.table(rdata, outfile, quote = FALSE, col.names = F, sep = "", row.names = F)
}
else if(output == "colony"){
data.table::fwrite(rdata, outfile, quote = FALSE, col.names = F, sep = " ", row.names = F)
}
else if(output == "sequoia"){
data.table::fwrite(rdata$dat, paste0("genos_", outfile), quote = FALSE, col.names = F, sep = "\t", row.names = F)
data.table::fwrite(rdata$lh, paste0("lh_", outfile), quote = FALSE, col.names = T, sep = "\t", row.names = F)
}
else if(output == "vcf"){
if(length(facets) > 0){
facets <- .check.snpR.facet.request(x, facets, "none")
cat("\tWriting VCF file for each facet level.\t")
# write per facet
for(i in 1:length(facets)){
# cat(snpR:::.console_hline())
# cat(i, "\n\n")
samp.facet <- .check.snpR.facet.request(x, facets[i])
if(samp.facet %in% x@facets){
x <- .add.facets.snpR.data(x, .check.snpR.facet.request(x, facets[i]))
}
# write per facet level
task.list <- .get.task.list(x, facets[i])
for(j in 1:nrow(task.list)){
# cat(j, "\n")
# determine matching data
matches <- list(.fetch.snp.meta.matching.task.list(x, task.list[j,]),
.fetch.sample.meta.matching.task.list(x, task.list[j,]))
# outfile name
ofn <- task.list[j,c(2,4)]
if(any(ofn == ".base")) ofn <- ofn[-which(ofn == ".base")]
if(length(ofn) == 0){
ofn <- outfile
}
else{
ofn <- paste0(tools::file_path_sans_ext(outfile), "_", paste0(ofn, collapse = "_"), ".vcf")
}
trm <- rdata$meta
contig_matches <- grep("contig", trm)
missing_contigs <- trm[contig_matches]
missing_contigs <- gsub("^##contig=<ID=", "", unlist(missing_contigs))
missing_contigs <- gsub(",length=.+$", "", missing_contigs)
missing_contigs <- which(!missing_contigs %in% snp.meta(x)[matches[[1]],chr])
if(length(missing_contigs) > 0){
trm <- trm[-contig_matches[missing_contigs]]
}
writeLines(trm, ofn)
data.table::fwrite(dplyr::arrange(cbind(rdata$data_meta[matches[[1]],],
rdata$genotypes[matches[[1]], matches[[2]]]),
`#CHROM`, POS),
ofn,
append = T,
quote = F,
sep = "\t",
row.names = F,
col.names = T)
}
}
}
else{
writeLines(rdata$meta, outfile)
data.table::fwrite(dplyr::arrange(cbind(rdata$data_meta,
rdata$genotypes),
`#CHROM`, POS),
outfile,
append = T,
quote = F,
sep = "\t",
row.names = F,
col.names = T)
}
}
else if(output == "genalex"){
wb <- openxlsx::createWorkbook(creator = "snpR")
openxlsx::addWorksheet(wb, sheetName = "Data")
if(length(facets) > 0){
# figure out samples in each facet
levs <- .paste.by.facet(sample.meta(x), unlist(.split.facet(facets)))
ulevs <- unique(levs)
matches <- vector("list", length(ulevs))
names(matches) <- ulevs
for(i in 1:length(ulevs)){
matches[[i]] <- .fetch.sample.meta.matching.task.list(x, c(facets, ulevs[i], ".base", ".base"))
}
# make the header
header <- c(nsnps(x), nsamps(x), length(matches), unlist(lapply(matches, length)))
names(header) <- NULL
openxlsx::writeData(wb, "Data", matrix(header, 1), colNames = FALSE)
openxlsx::writeData(wb, "Data", matrix(c("snpR exported data", "", "", ulevs), 1), startRow = 2, colNames = FALSE)
# make the column titles
col_titles <- c("SAMPLE", "POP", c(rbind(paste0("SNP", 1:nsnps(x)), "")))
openxlsx::writeData(wb, "Data", matrix(col_titles, nrow = 1), startRow = 3, colNames = FALSE)
# write the data
prog <- 4
for(i in 1:length(matches)){
part <- (prog-3):(prog-4 + length(matches[[i]]))
openxlsx::writeData(wb, "Data", cbind(paste0("Sample", part), levs[matches[[i]]]), colNames = FALSE, startRow = prog)
openxlsx::writeData(wb, "Data", rdata[matches[[i]],], colNames = FALSE, startRow = prog, startCol = 3)
prog <- prog + length(matches[[i]])
}
}
else{
# header
header <- c(nsnps(x), nsamps(x), 1)
# make the header
openxlsx::writeData(wb, "Data", matrix(header, 1), colNames = FALSE)
openxlsx::writeData(wb, "Data", matrix(c("snpR exported data", "", "", "base"), 1), startRow = 2, colNames = FALSE)
# make the column titles
col_titles <- c("SAMPLE", "POP", c(rbind(paste0("SNP", 1:nsnps(x)), "")))
openxlsx::writeData(wb, "Data", matrix(col_titles, nrow = 1), startRow = 3, colNames = FALSE)
# data
openxlsx::writeData(wb, "Data", cbind(paste0("Sample", 1:nsamps(x)), "base"), colNames = FALSE, startRow = 4)
openxlsx::writeData(wb, "Data", rdata, colNames = FALSE, startRow = 4, startCol = 3)
}
openxlsx::saveWorkbook(wb, file = outfile, overwrite = TRUE)
}
else{
data.table::fwrite(rdata, outfile, quote = FALSE, col.names = T, sep = "\t", row.names = F)
}
options(scipen = osp)
return(TRUE)
}
#return results
else{
return(rdata)
}
}
#' Checks for duplicated samples in snpRdata.
#'
#' Searches through a snpR dataset and, for every designated sample, determines
#' the proportion of identical genotypes in every other sample. This function
#' \emph{is not overwrite safe}.
#'
#' If an id column is specified, y should contain sample IDs matching those
#' contained in that column. If not, y should contain sample indices instead.
#' The proportion of identical genotypes between matching samples and all other
#' samples are calculated. By default, every sample will be checked.
#'
#' @param x snpRdata object
#' @param y numeric or character, default 1:ncol(x). Designates the sample
#' indices or IDs in x for which duplicates will be checked.
#' @param id.col character, default NULL. Designates a column in the sample
#' metadata which contains sample IDs. If provided, y is assumed to contain
#' sample IDs uniquely matching those in the the sample ID column.
#' @param verbose logical, default FALSE. If TRUE, prints detailed progress
#' report.
#'
#' @return A list containing: \itemize{ \item{best_matches: } Data.frame listing
#' the best match for each sample noted in y and the percentage of genotypes
#' identical between the two samples. \item{data: } A list containing the
#' match proportion between each sample y and every sample in x, named for the
#' samples y. }
#'
#' @author William Hemstrom
#' @export
#' @examples
#' \dontrun{
#' # check for duplicates with sample 1
#' check_duplicates(stickSNPs, 1)
#'
#' # check duplicates using the .samp.id column as sample IDs
#' check_duplicates(stickSNPs, 1, id.col = ".sample.id")
#' }
check_duplicates <- function(x, y = 1:ncol(x), id.col = NULL, verbose = FALSE){
#============sanity checks============
msg <- character()
if(!is.null(id.col)){
if(length(id.col) != 1){
msg <- "Only one ID column may be provided."
}
if(!id.col %in% colnames(x@sample.meta)){
msg <- c(msg,
"ID column not found in sample metadata.")
}
else{
if(identical(y, 1:ncol(x))){
y <- x@sample.meta[,id.col]
}
if(length(unique(x@sample.meta[,id.col])) != length(x@sample.meta[,id.col])){
msg <- c(msg,
"Each entry in the ID column must be unique.")
}
bad.y <- which(!y %in% x@sample.meta[,id.col])
if(length(bad.y) > 0){
msg <- c(msg,
paste0("Some samples in y not found in ID column: ", paste0(y[bad.y], collapse = ", "), "."))
}
}
}
else{
if(!is.numeric(y)){
msg <- c(msg,
"y must be numeric if no sample ID column provided.")
}
else{
if(any(y > ncol(x))){
msg <- c(msg,
"All y values must be less than or equal to the number of samples in x.")
}
}
}
if(length(msg) > 0){
stop(paste0(msg, collapse = "\n"))
}
#============run the duplicate check==========
# initialize
out <- vector("list", length(y))
names(out) <- y
out.best <- data.frame(sample = y, best_match = character(length(y)),
percentage = numeric(length(y)),
comparisons = numeric(length(y)), stringsAsFactors = F)
# do each comparison
for(i in 1:length(y)){
if(verbose){
cat("Working on sample:", y[i], "\n")
}
# pick out this value
if(!is.null(id.col)){
t.samp.id <- which(x@sample.meta[,id.col] == y[i])
}
else{
t.samp.id <- y[i]
}
if(length(t.samp.id) == 0){
out.best$matches[i] <- "bad.ID"
if(verbose){cat("\tBad ID, skipping.\n")}
next()
}
#figure out which values are "NN"
t.samp <- genotypes(x)[,t.samp.id]
miss <- t.samp == "NN"
# finish initializing
out[[i]]$hits <- numeric(ncol(x))
out[[i]]$comparisons <- numeric(ncol(x))
if(!is.null(id.col)){
names(out[[i]]$hits) <- x@sample.meta[,id.col]
}
else{
names(out[[i]]$hits) <- 1:ncol(x)
}
names(out[[i]]$comparisons) <- names(out[[i]]$hits)
# compare to every other sample. Because we don't count "NN" comparisons, we need to explicitly loop.
for(j in (1:ncol(x))){
# skip if a self comparison
if(j == t.samp.id){
out[[i]]$hits[j] <- 0
out[[i]]$comparisons[j] <- 0
next()
}
# compare only loci non "NN" in both samples
c.samp <- genotypes(x)[,j]
c.miss <- c.samp == "NN"
u.miss <- which(c.miss | miss)
# check the proportion of identical genotypes
valid.comps <- length(c.samp[-u.miss])
out[[i]]$hits[j] <- sum(t.samp[-u.miss] == c.samp[-u.miss])/valid.comps
out[[i]]$comparisons[j] <- valid.comps
}
# figure out and save data on the "best", or most identical, hit.
best <- which.max(out[[i]]$hits)
out[[i]]$best <- out[[i]]$hits[best]
names(out[[i]]$best) <- names(out[[i]]$hits)[best]
# save to output summary.
out.best$best_match[i] <- paste0(names(out[[i]]$best), collapse = ", ")
out.best$percentage[i] <- out[[i]]$best
out.best$comparisons[i] <- out[[i]]$comparisons[best]
if(verbose){
cat("\tBest hit(s):", out.best$best_match[i], "\n")
cat("\tPercentage match(es):", out.best$percentage[i], "\n")
}
}
# return
return(list(best_matches = out.best, data = out))
}
#' Fetch the allele frequencies for all SNPs for each level of each requested
#' facet.
#'
#' Fetch allele frequencies for all SNPs for each level of all the requested
#' facets. Major and minor allele frequencies will be interleaved, with the major
#' allele first for each locus. Note that this particular function is not
#' overwrite-safe.
#'
#' @param x snpRdata object
#' @param facets character, default NULL. The facets for which to calculate
#' allele frequencies. See \code{\link{Facets_in_snpR}} for details.
#'
#' @return A named, nested list containing allele frequency matrices for each
#' facet level for all requested facets.
#'
#' @author William Hemstrom
#' @export
tabulate_allele_frequency_matrix <- function(x, facets = NULL){
..ord <- NULL
#==================prep and sanity check==================
msg <- character()
# check facets and get maf
facets <- .check.snpR.facet.request(x, facets, "none", T)
facet_types <- facets[[2]]
facets <- facets[[1]]
needed.sample.facets <- .check.snpR.facet.request(x, facets)
if(any(facet_types == "snp")){
needed.sample.facets <- c(needed.sample.facets, ".base")
needed.sample.facets <- unique(needed.sample.facets)
}
## add any missing maf data
missing_mafs <- .check_calced_stats(x, needed.sample.facets, "maf")
if(any(!unlist(missing_mafs))){
x <- calc_maf(x, names(missing_mafs)[which(!unlist(missing_mafs))])
}
am <- .get.snpR.stats(x, needed.sample.facets)
# grab column names
maj_min <- .get.snpR.stats(x)
maj_min <- paste0(rep(unique(maj_min$.snp.id), each = 2), "_",
c(maj_min$major, maj_min$minor)[rep(1:nrow(maj_min), each = 2) + (0:1) * nrow(maj_min)])
#==================run============
# intialize output
## need to do calcs for each sample level facet only
sample_facet_freqs <- vector("list", length(needed.sample.facets))
names(sample_facet_freqs) <- needed.sample.facets
## output
out <- vector("list", length = length(facets))
names(out) <- facets
# run for each facet
for(i in 1:length(sample_facet_freqs)){
# melt the allele frequencies down into a matrix:
tam <- data.table::dcast(as.data.table(am[which(am$facet == needed.sample.facets[i]),]), subfacet ~ .snp.id, value.var = "maf")
pops <- tam[,1]
tam <- tam[,-1]
amb <- 1 - tam
# interleave major and minor frequencies and save
ord <- rep(1:ncol(tam), each = 2) + (0:1) * ncol(tam)
amc <- .fix..call(cbind(amb, tam)[,..ord])
colnames(amc) <- maj_min
amc <- as.data.frame(amc)
rownames(amc) <- unlist(pops)
sample_facet_freqs[[i]] <- as.matrix(amc)
}
#==================break up for facets=========
for(i in 1:length(facets)){
# split up by snp levels if requested
if(facet_types[i] == "complex" | facet_types[i] == "snp"){
if(facet_types[i] == "snp"){
t.samp.facet <- ".base"
}
else{
t.samp.facet <- .check.snpR.facet.request(x, facets[i], "snp")
}
t.snp.facet <- .check.snpR.facet.request(x, facets[i], "sample")
snp.parts <- .get.task.list(x, t.snp.facet)[,-c(1:2)]
tout <- vector("list", length = nrow(snp.parts))
names(tout) <- snp.parts[,2]
for(j in 1:nrow(snp.parts)){
# these are the snpIDs that are a part of this facet level
include_snps <- unique(am$.snp.id[which(am[,snp.parts[j,1]] == snp.parts[j,2])])
# grab just those snps
tout[[j]] <- sample_facet_freqs[[which(needed.sample.facets == t.samp.facet)[1]]] # add everything
tout[[j]] <- tout[[j]][,which(as.numeric(gsub("_.+", "", colnames(tout[[j]]))) %in% include_snps), drop = F] # subset the requested snps only
}
# assign back, nesting with the snp facet name
out[[i]] <- tout
}
else{
if(is.null(.check.snpR.facet.request(x, facets[i]))){
facets[i] <- ".base"
}
out[[i]] <- list(.base = sample_facet_freqs[[which(needed.sample.facets == facets[i])]])
}
}
#=================return============
x <- .update_calced_stats(x, facets, "allele_frequency_matrix")
return(.merge.snpR.stats(x, stats = out, type = "allele_frequency_matrices"))
}
#' Filter down to one SNPs every \emph{n} bases.
#'
#' Selects one SNPs every \emph{n} bases. This can be
#' used to filter for SNPs in tight LD, such as when only one SNP per RAD-tag is
#' desired.
#'
#' Note that this approach takes the first SNP every \emph{n} bases, and so is
#' non-random. \code{\link{filter_snps}} can be used beforehand to ensure that
#' the selected SNPs are above a desired quality threshold to ensure that poor
#' SNPs are not selected over loci with more robust genotyping data.
#'
#' It is strongly recommended to provide a SNP facet to the facet argument. This
#' will define "chromosomes" for the purpose of selecting SNPs. If not set, snpR
#' will assume that \emph{all SNPs are on the same chromosome}, which may
#' produce undesired results.
#'
#' @param x snpRdata object
#' @param facet character, default NULL. SNP facet specifying chromosomes or
#' scaffolds. SNP positions will be independently considered depending on the
#' facet level.
#' @param n Integer. Specifies the minimum distance between selected SNPs.
#'
#' @return A snpRdata object containing the selected SNPs.
#'
#' @author William Hemstrom
#' @export
#'
#' @examples
#' # put large gaps inbetween the example data
#' gapped <- gap_snps(stickSNPs, "chr", 100000)
#'
#' # filter first to grab only very good SNPs
#' vgood <- filter_snps(stickSNPs, min_ind = .8)
#' gapped <- gap_snps(vgood, "chr", 100000)
gap_snps <- function(x, facet = NULL, n){
#==========sanity checks=========
if(!is.snpRdata(x)){
stop("x must be a snpRdata object.\n")
}
msg <- character()
ffacet <- .check.snpR.facet.request(x, facet, remove.type = "none", return.type = TRUE)
if(any(ffacet[[2]] %in% c("sample", "complex"))){
msg <- c(msg, "Non SNP facets are not permitted.\n")
}
facet <- .check.snpR.facet.request(x, facet, "sample")
if(length(facet) != 1){
msg <- c(msg, "Only one facet may be provided to gap_snps. Note that complex
SNP facets may be used (e.g. chr.scaffold).")
}
if(!"position" %in% colnames(snp.meta(x))){
msg <- c(msg, "There must be a 'position' column in the SNP metadata to use gap_snps!")
}
if(facet[1] == ".base"){
warning("Gapping SNPs by the base facet is not recommended. Consider using a SNP facet instead.\n")
}
if(".tfacet.ext" %in% colnames(snp.meta(x))){
msg <- c(msg, "A column named '.tfacet.ext' cannot be in the SNP metadata.\n")
}
if(length(msg) > 0){
stop(msg)
}
#==============subfunction==============
retained <- function(pos, n){
fsnp <- which.min(pos)
cs <- pos[fsnp]
# use a while loop to grab the snps
# should be a fairly quick loop, since it loops through each gap, not each
# snp. Might still be a bit slow with really large data, can fix later if
# I get requests...
while(cs[length(cs)] + n <= max(pos)){
cs <- c(cs,
min(pos[pos >= cs[length(cs)] + n]))
}
ret <- match(cs, pos)
return(ret)
}
#==============find the SNPs to retain=====================
# get a task list, then run for each task
task.list <- .get.task.list(x, facet)
keep.snps <- numeric()
# no par option for now, should usually be quick. Can add if requested.
for(i in 1:nrow(task.list)){
tsm <- snp.meta(x)[.fetch.snp.meta.matching.task.list(x, task.list[i,]),]
ret <- retained(tsm$position, n)
keep.snps <- c(keep.snps, tsm$.snp.id[ret])
}
#==============subset and return========
# grab the snp indexes to keep
ids <- sort(match(keep.snps, snp.meta(x)$.snp.id))
cat("Selected ", length(ids), "out of ", nrow(x), "SNPs.\n")
# subset
return(x[ids,])
}
#' Gather citations for methods used with a snpRdata object
#'
#' snpR will automatically track the methods used for calculations on a
#' snpRdata object. Using \code{\link{citations}} on that object will provide
#' details on the the methods used, and can optionally write a .bib bibtex
#' formatted library containing citations for these methods.
#'
#' Printed outputs contain the statistic calculated, the in-line citation for
#' the method used, a bibtex key which corresponds to the .bib library written
#' if the outbib argument is used, and a quick note giving any details.
#'
#' @param x snpRdata object
#' @param outbib character, default FALSE. An optional file path to which a .bib
#' bibtex library containing all of the citations for the used methods will be
#' written.
#' @param return_bib logical, default FALSE. If TRUE, returns a list containing
#' the bib entries and their details. The bib entries are formatted according
#' to \code{\link[rbibutils]{readBib}}.
#'
#' @author William Hemstrom
#' @returns If return_bib is TRUE, a list containing four parts:
#' \itemize{\item{keys: } A vector of bibtex keys for each method.
#' \item{stats: } A vector of the stats used. \item{details: } A vector of
#' details for each method. \item{bib: } A \code{bibentry} for each citation,
#' see \code{\link[rbibutils]{readBib}}.}
#'
#' @export
#' @examples
#' # calculate pi
#' x <- calc_pi(stickSNPs)
#'
#' # fetch citations
#' citations(x)
#'
#' # fetch citations and write bibtex .bib to citations.bib
#' ## not run
#' \dontrun{
#' citations(x, outbib = "citations.bib")
#' }
citations <- function(x, outbib = FALSE, return_bib = FALSE){
#==========sanity checks=======
if(!is.snpRdata(x)){
stop("x must be a snpRdata object.\n")
}
.check.installed("rbibutils")
#==========grab bib============
bib.file <- system.file("extdata", "snpR_citations.bib", package = "snpR")
bib <- rbibutils::readBib(bib.file)
#==========filter bib==========
keys <- as.character(unlist(purrr::map(x@citations, "key")))
bib <- bib[keys]
#==========shout at the user=====
deets <- unlist(purrr::map(x@citations, "details"))
stats <- names(x@citations)
cat("Citations for methods used thus far: \n")
for(i in 1:length(keys)){
cat("==============================================\n")
cat("Statistic: ", stats[i], "\n\n")
cat("Citation: ", .quick_grab_cite(bib[keys[i]]), "\n\n")
cat("Bibtex key: ", keys[i], "\n\n")
cat("Details: ", deets[i], "\n")
}
cat("==============================================\n\n")
#==========print bib=============
if(!isFALSE(outbib)){
if(file.exists(outbib)){
current_bib <- rbibutils::readBib(outbib)
not_in_current <- which(!keys %in% names(current_bib))
if(length(not_in_current) > 0){
#==========filter bib==========
bib <- bib[keys[not_in_current]]
current_bib <- c(current_bib, bib)
rbibutils::writeBib(current_bib, outbib)
}
}
else{
rbibutils::writeBib(bib, outbib)
cat(".bib file can be found at: ", outbib, "\n")
}
}
if(!isFALSE(return_bib)){
return(list(keys = keys, stats = stats, details = deets, bib = bib))
}
}
#' Report on filters used on a snpRdata object.
#'
#' snpR will automatically track the methods used on a
#' snpRdata object. Using \code{\link{filters}} on that object will provide
#' details on the the filters used.
#'
#' Printed outputs contain the filters used in the order they were applied
#' alongside the filtering stringency and any facets applied over, if
#' applicable. Note that some output formats from \code{format_snps}, like the
#' \code{vcf} format, will notes on the filters used as well.
#'
#' @param x snpRdata object
#'
#' @author William Hemstrom
#'
#' @return Prints easily human readable results to the console and also returns
#' a more machine readable string.
#'
#' @export
#' @examples
#' # filter the data
#' x <- filter_snps(stickSNPs,
#' min_ind = 0.75,
#' min_loci = 0.75,
#' maf = 0.1,
#' maf_facets = "pop")
#'
#' # fetch the filters used
#' filters(x)
filters <- function(x){
r <- try(x@filters, silent = TRUE)
if(methods::is(r, "try-error")){
cat("This is an old `snpRdata` object that did not track filters.\n")
}
else{
line <- character(0)
for(i in 1:nrow(r)){
cat("Filter:", r[[1]][i], "\n")
line_part <- unlist(strsplit(r[[1]][i], "--"))
line_part <- line_part[length(line_part)]
if(!is.na(r[[2]][i])){
cat("\tStringency:", r[[2]][i], "\n")
line_part <- c(line_part, paste0("=", r[[2]][i]))
}
if(!is.na(r[[3]][i])){
cat("\tFacet:", r[[3]][i], "\n")
line_part <- c(line_part, paste0(",facet=", r[[3]][i]))
}
cat("\n")
line <- c(line, paste0(line_part, collapse = ""))
}
}
return(paste0(line, collapse = ";"))
}
#' Summarize possible snpRdata object facet options
#'
#' List either all of the possible SNP and sample facets (if called with no
#' facets) or all of the categories for each requested facet.
#'
#' If called with sample facets, returns a table of counts of each category
#' otherwise returns a vector of categories.
#'
#' @param x snpRdata object
#' @param facets character. Categorical metadata variables by which to break up
#' analysis. See \code{\link{Facets_in_snpR}} for more details. If NULL, the
#' possible SNP and sample facets will be listed. If facets are instead
#' provided, the \emph{categories} for each facet will instead be listed.
#'
#' @export
#' @author William Hemstrom
#' @return A named list containing either the possible SNP and sample facets or
#' the categories for all of the requested facets.
#' @examples
#' # list available facets
#' summarize_facets(stickSNPs)
#'
#' # return details for a few facets
#' summarize_facets(stickSNPs, c("pop", "chr.pop", "fam.pop"))
summarize_facets <- function(x, facets = NULL){
if(!is.snpRdata(x)){
stop("x must be a snpRdata object.\n")
}
facets <- .check.snpR.facet.request(x, facets, remove.type = "none", fill_with_base = FALSE, return_base_when_empty = FALSE, return.type = TRUE)
base_facets <- which(facets[[1]] == ".base")
if(length(base_facets) > 0){
facets[[1]] <- facets[[1]][-base_facets]
facets[[2]] <- facets[[2]][-base_facets]
}
# If called without facets, list availabilities
if(length(facets[[1]]) == 0){
message("Returning list of facets. For more details, try asking for information about a specific facet (such as 'pop' or 'pop.chr')!")
return(list(SNP = colnames(snp.meta(x)),
sample = colnames(sample.meta(x))))
}
# otherwise return options for each facet
else{
out <- vector("list", length(facets[[1]]))
names(out) <- facets[[1]]
for(i in 1:length(out)){
if(facets[[2]][i] == "snp"){
out[[i]] <- unique(.get.task.list(x, facets[[1]][i])[,4])
}
else if(facets[[2]][i] == "sample"){
out[[i]] <- .paste.by.facet(sample.meta(x), unlist(.split.facet(facets[[1]][i])))
out[[i]] <- table(out[[i]])
}
else if(facets[[2]][i] == "complex"){
split_facet <- unlist(.split.facet(facets[[1]][i]))
samp_part <- .check.snpR.facet.request(x, split_facet, fill_with_base = FALSE, return_base_when_empty = FALSE)
snp_part <- .check.snpR.facet.request(x, split_facet, remove.type = "sample", fill_with_base = FALSE, return_base_when_empty = FALSE)
samp_part <- sample.meta(x)[,samp_part, drop = FALSE]
samp_part <- lapply(samp_part, unique)
snp_part <- snp.meta(x)[,snp_part, drop = FALSE]
snp_part <- lapply(snp_part, unique)
opts <- expand.grid(c(samp_part, snp_part))
out[[i]] <- unique(.paste.by.facet(opts, split_facet))
}
}
}
return(out)
}
#' Merge two snpRdata objects
#'
#' Merge two snpRdata objects using sample and SNP metadata. Functions
#' much like base R's \code{\link[base]{merge}} function, but
#' the 'by' and 'all' options can be specified at the SNP and sample level.
#'
#' While this function can be used essentially identically to how one might
#' use base R's \code{\link[base]{merge}} function, there are a few differences
#' to note.
#'
#' First, samples that are genotyped at identical loci
#' in both data sets can be handled several ways, controlled by the
#' \code{resolve_conflicts} argument: \itemize{\item{warning:} Return a harsh
#' warning and a data frame with more information on genotypes at identical
#' samples/SNPs are different between \code{x} and \code{y}.
#' \item{error: } The default, return an error when conflicts are detected.
#' \item{x} Use genotypes from \code{x} to resolve conflicts.
#' \item{y} Use genotypes from \code{y} to resolve conflicts.
#' \item{random} Randomly sample (non-missing) genotypes from \code{x}
#' and \code{y} to resolve conflicts.}
#' Note that called genotypes are always taken over un-called genotypes when
#' there are merge conflicts, and missing data in one but not the other data set
#' will not trigger an error or a warning if those options are selected.
#'
#' Secondly, the \code{by} and \code{all} arugment families from
#' \code{\link[base]{merge}} are extended to refer to either samples or SNPs,
#' such that all samples can be maintained but not all SNPs, for example.
#'
#' Lastly, all of the \code{all} family of arguments default to \code{TRUE}
#' instead of \code{FALSE}, since purely overlapping genotypes/SNPs is unlikely
#' to be desired. \code{FALSE} values provided to any specific \code{all}
#' argument will sill override \code{all = TRUE}, as in
#' \code{\link[base]{merge}}.
#'
#' At present, \code{\link{merge_snpRdata}} is not maximally efficient in that
#' it will remove all tabulated statistics and re-tabulate all internal
#' summaries. Improvements are in development.
#'
#' @param x,y \code{snpRdata} objects to merge
#' @param by.sample,by.sample.x,by.sample.y Columns of sample metadata by which
#' to merge across samples--function identically to the \code{by}, \code{by.x},
#' and \code{by.y} arguments to \code{\link[base]{merge}}, see documentation
#' there for details.
#' @param by.snp,by.snp.x,by.snp.y Columns of SNP metadata by which to merge
#' across SNPs--function idetically to the \code{by}, \code{by.x}, and
#' \code{by.y} arguments to \code{\link[base]{merge}}, see documentation there
#' for details.
#' @param all logical, default TRUE. If TRUE, all samples and SNPs will be
#' maintained in the output \code{snpRdata} object, with missing data matching
#' the missing data format of \code{x} added where genotypes are not in
#' either \code{x} or \code{y}.
#' @param all.x.snps,all.y.snps logical, default \code{all}. Keep SNPs in the data
#' even if they are only present in \code{x} or \code{y}, respectively.
#' @param all.x.samples,all.y.samples logical, default \code{all}. Keep samples in the
#' data even if they are only present in \code{x} or \code{y}, respectively.
#' @param resolve_conflicts character, default 'error'. Controls how
#' conflicting genotypic information in \code{x} and \code{y} is handled. See
#' 'Details' for options and explanation.
#'
#' @author William Hemstrom
#'
#' @return A merged \code{snpRdata} object.
#'
#' @export
#'
#' @examples
#' # create data to merge in
#' y <- data.frame(s1 = c("GG", "NN"),
#' s2 = c("GG", "TG"),
#' s3 = c("GG", "TT"),
#' s4 = c("GA", "TT"),
#' s5 = c("GG", "GT"),
#' s6 = c("NN", "GG"))
#'
#' snp.y <- data.frame(chr = c("groupVI", "test_chr"),
#' position = c(212436, 10))
#'
#' samp.y <- data.frame(pop = c("ASP", "ASP", "ASP", "test1", "test2", "test3"),
#' ID = c(1, 2, 3, "A1", "A2", "A3"),
#' fam = c("A", "B", "C", "T", "T", "T"))
#' y <- import.snpR.data(y, snp.y, samp.y)
#'
#' x <- stickSNPs
#' sample.meta(x)$ID <- 1:ncol(x)
#'
#' \dontrun{
#' # Not run, will error due to conflicts
#' z <- merge_snpRdata(x, y)
#'
#' # Not run, will return a warning and report mismatches
#' z <- merge_snpRdata(x, y, resolve_conflicts = "warning")
#' }
#'
#' # take a random genotype in the case of conflicts
#' z <- merge_snpRdata(x, y, resolve_conflicts = "random")
#' z
#'
merge_snpRdata <- function(x, y, by.sample = intersect(names(sample.meta(x)), names(sample.meta(y))),
by.sample.x = by.sample, by.sample.y = by.sample,
by.snp = intersect(names(snp.meta(x)), names(snp.meta(y))),
by.snp.x = by.snp, by.snp.y = by.snp,
all = TRUE, all.x.snps = all, all.y.snps = all,
all.x.samples = all, all.y.samples = all,
resolve_conflicts = "error"){
.sample.id.from.x <- .sample.id.from.y <- .snp.id.from.x <- .snp.id.from.y <- .source.ob <- NULL
.sample.id <- .snp.id <- NULL
#========sanity checks==============
if(!is.snpRdata(x)){
stop("x must be a snpRdata object.\n")
}
if(!is.snpRdata(y)){
stop("y must be a snpRdata object.\n")
}
if(x@snp.form != y@snp.form){
stop("x and y have snps in a different format.\n")
}
if(x@mDat != y@mDat){
if(x@mDat %in% colnames(y@geno.tables$wm)){
stop("x and y have missing data in a different format. The missing data format from x ", x@mDat, " is also present in the genotypes of y, so merging cannot procceed.\n")
}
warning("x and y have missing data in a different format. The missing data format from x ", x@mDat, " will be used.\n")
gy <- genotypes(y)
gy[gy == y@mDat] <- x@mDat
y <- import.snpR.data(gy, snp.meta(y), sample.meta(y), x@mDat)
}
if(".sample.id" %in% by.sample){
by.sample <- by.sample[-which(by.sample == ".sample.id")]
}
if(".snp.id" %in% by.snp){
by.snp <- by.snp[-which(by.snp == ".snp.id")]
}
good.resolves <- c("warning", "error", "random", "x", "y")
if(!resolve_conflicts %in% good.resolves){
stop(paste0("'", resolve_conflicts, "' not a recognized option for the 'resolve_conflicts' argument.\n"))
}
#========match samples and SNPs using the usual merge tool, which also handles all of the 'all' and 'by' options naturally=============
sm1 <- sample.meta(x)
sm2 <- sample.meta(y)
# handle the case where there are no matching sample metadata columns.
if(length(by.sample.x) == 0 | length(by.sample.y) == 0){
if(all.x.samples & all.y.samples){
sm.m <- data.table::rbindlist(list(x = sm1, y = sm2), fill = TRUE, idcol = ".source.ob")
sm.m[,.sample.id.from.x := ifelse(.source.ob == "x", .sample.id, NA)]
sm.m[,.sample.id.from.y := ifelse(.source.ob == "y", .sample.id, NA)]
sm.m$.source.ob <- NULL
sm.m <- as.data.frame(sm.m)
}
else{
stop("No matching sample metadata columns by which to merge.\n")
}
}
else{
sm.m <- merge(sm1, sm2, by.x = by.sample.x, by.y = by.sample.y,
all.x = all.x.samples, all.y = all.y.samples, suffixes = c(".from.x", ".from.y"), sort = FALSE)
}
sm.id.cols <- grep("^\\.sample\\.id", colnames(sm.m))
snm1 <- snp.meta(x)
snm2 <- snp.meta(y)
# handle the case where there are no matching snp metadata columns.
if(length(by.snp.x) == 0 | length(by.snp.y) == 0){
if(all.x.snps & all.y.snps){
snm.m <- data.table::rbindlist(list(x = snm1, y = snm2), fill = TRUE, idcol = ".source.ob")
snm.m[,.snp.id.from.x := ifelse(.source.ob == "x", .snp.id, NA)]
snm.m[,.snp.id.from.y := ifelse(.source.ob == "y", .snp.id, NA)]
snm.m$.source.ob <- NULL
snm.m <- as.data.frame(snm.m)
}
else{
stop("No matching snp metadata columns by which to merge.\n")
}
}
else{
snm.m <- merge(snm1, snm2, by.x = by.snp.x, by.y = by.snp.y,
all.x = all.x.snps, all.y = all.y.snps, suffixes = c(".from.x", ".from.y"), sort = FALSE)
}
snm.id.cols <- grep("^\\.snp\\.id", colnames(snm.m))
genotypes.m <- matrix(x@mDat, nrow = nrow(snm.m), ncol = nrow(sm.m))
gt <- list(gs = matrix(nrow = nrow(snm.m), ncol = length(unique(c(colnames(x@geno.tables$gs), colnames(y@geno.tables$gs))))),
as = matrix(nrow = nrow(snm.m), ncol = length(unique(c(colnames(x@geno.tables$as), colnames(y@geno.tables$as))))),
wm = matrix(nrow = nrow(snm.m), ncol = 1))
#=======merge genotypes by indices in merged metadata--genotypes present in both============
if(nrow(snm.m) == 0){
stop("No loci remain after merging.\n")
}
if(nrow(sm.m) == 0){
stop("No samples remain after merging.\n")
}
idents.snp <- which(!is.na(snm.m$.snp.id.from.x) & !is.na(snm.m$.snp.id.from.y))
idents.samp <- which(!is.na(sm.m$.sample.id.from.x) & !is.na(sm.m$.sample.id.from.y))
if(length(idents.snp) > 0 & length(idents.samp) > 0){
g.matches.x <- genotypes(x)[match(snm.m$.snp.id.from.x[idents.snp], snp.meta(x)$.snp.id),
match(sm.m$.sample.id.from.x[idents.samp], sample.meta(x)$.sample.id)]
g.matches.y <- genotypes(y)[match(snm.m$.snp.id.from.y[idents.snp], snp.meta(y)$.snp.id),
match(sm.m$.sample.id.from.y[idents.samp], sample.meta(y)$.sample.id)]
if(all(c(dim(g.matches.y), dim(g.matches.x))) > 0){
# set matches to x or y if requested, filling missing genotypes when possible from other set
if(resolve_conflicts == "y"){
g.matches.m <- g.matches.y
g.matches.m[g.matches.m == x@mDat] <- g.matches.x[g.matches.m == x@mDat]
}
else if(resolve_conflicts == "x"){
g.matches.m <- g.matches.x
g.matches.m[g.matches.m == x@mDat] <- g.matches.y[g.matches.m == x@mDat]
}
else{
if(resolve_conflicts == "error"){
stop("Some genotypes at identical loci sequenced in samples in both 'x' and 'y' are not identical. For more information on conflicts, merge_snpRdata() may instead be run with the 'resolve_conflicts' argument set to 'warning'\n")
}
# handle mismatches
## first sub-in any missing genotypes, since these will be seen as implicit matches in favor of the one without missing data.
g.matches.m <- g.matches.x
g.matches.m[g.matches.m == x@mDat] <- g.matches.y[g.matches.m == x@mDat]
g.matches.y[g.matches.y == x@mDat] <- g.matches.m[g.matches.y == x@mDat]
matching_idents <- g.matches.m == g.matches.y # should be no mDats here, since those will now match.
# throw an error if requested
if(resolve_conflicts == "warning"){
if(any(!matching_idents)){
warning("Error: Genotypic mismatches in identical samples/snps. Returning matrix of mismatches.")
matching_idents <- cbind(snp.meta(x)[match(snm.m$.snp.id.from.x[idents.snp], snp.meta(x)$.snp.id),], matching_idents)
colnames(matching_idents)[(ncol(snp.meta(x)) + 1): ncol(matching_idents)] <- paste0("Obj_x_sample_", match(sm.m$.sample.id.from.x[idents.samp], sample.meta(x)$.sample.id))
return(matching_idents)
}
}
# grab a random x or y genotype if requested
else if(resolve_conflicts == "random"){
take.y <- stats::rbinom(sum(!matching_idents), 1, .5)
take.y <- as.logical(take.y)
g.matches.m[!matching_idents][take.y] <- g.matches.y[!matching_idents][take.y]
}
}
genotypes.m[idents.snp, idents.samp] <- as.matrix(g.matches.m)
}
}
#=======merge genotypes by indices in merged metadata--genotypes not present in both============
genotypes.m <- data.table::as.data.table(genotypes.m)
# samples in only one:
x.only <- which(is.na(sm.m$.sample.id.from.y) & !is.na(sm.m$.sample.id.from.x))
y.only <- which(is.na(sm.m$.sample.id.from.x) & !is.na(sm.m$.sample.id.from.y))
if(length(x.only) > 0){
set(genotypes.m, which(!is.na(snm.m$.snp.id.from.x)), x.only,
data.table::as.data.table(genotypes(x)[match(snm.m$.snp.id.from.x[which(!is.na(snm.m$.snp.id.from.x))], snp.meta(x)$.snp.id),
match(sm.m$.sample.id.from.x[x.only], sample.meta(x)$.sample.id)]))
}
if(length(y.only) > 0){
set(genotypes.m, which(!is.na(snm.m$.snp.id.from.y)), y.only,
data.table::as.data.table(genotypes(y)[match(snm.m$.snp.id.from.y[which(!is.na(snm.m$.snp.id.from.y))], snp.meta(y)$.snp.id),
match(sm.m$.sample.id.from.y[y.only], sample.meta(y)$.sample.id)]))
}
# snps in only one
x.only <- which(is.na(snm.m$.snp.id.from.y) & !is.na(snm.m$.snp.id.from.x))
y.only <- which(is.na(snm.m$.snp.id.from.x) & !is.na(snm.m$.snp.id.from.y))
if(length(x.only) > 0){
set(genotypes.m, x.only, which(!is.na(sm.m$.sample.id.from.x)),
data.table::as.data.table(genotypes(x)[match(snm.m$.snp.id.from.x[x.only], snp.meta(x)$.snp.id),
match(sm.m$.sample.id.from.x[which(!is.na(sm.m$.sample.id.from.x))], sample.meta(x)$.sample.id)]))
}
if(length(y.only) > 0){
set(genotypes.m, y.only, which(!is.na(sm.m$.sample.id.from.y)),
data.table::as.data.table(genotypes(y)[match(snm.m$.snp.id.from.y[y.only], snp.meta(y)$.snp.id),
match(sm.m$.sample.id.from.y[which(!is.na(sm.m$.sample.id.from.y))], sample.meta(y)$.sample.id)]))
}
# fix any lingering "."'s
bad.snm.cols <- grep("\\.", colnames(snm.m))
if(length(bad.snm.cols) > 0){
colnames(snm.m)[bad.snm.cols] <- gsub("\\.", "_", colnames(snm.m)[bad.snm.cols])
}
bad.sm.cols <- grep("\\.", colnames(sm.m))
if(length(bad.sm.cols) > 0){
colnames(sm.m)[bad.sm.cols] <- gsub("\\.", "_", colnames(sm.m)[bad.sm.cols])
}
return(import.snpR.data(genotypes = genotypes.m,
snp.meta = snm.m[,-grep("_snp_id", colnames(snm.m))],
sample.meta = sm.m[,-grep("_sample_id", colnames(sm.m))],
mDat = x@mDat))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.