#' Calculate Weir and Cockerham's FST from genotypes
#'
#' Function takes a long format genotype data table and calculates
#' Weir & Cockerhams FST, i.e. theta (Weir & Cockerham, 1984). Can
#' also conduct permutation testing and calculate bootstrap
#' confidence intervals.
#'
#' @param dat Data table: A long format data table of biallelic genotypes,
#' coded as '/' separated alleles ('0/0', '0/1', '1/1'), or counts
#' of the Alt alleles (0, 1, 2, repsectively).
#' Four columns are required:
#' \enumerate{
#' \item The population ID (see param \code{popCol}).
#' \item The sampled individual ID (see param \code{sampCol}).
#' \item The locus ID (see param \code{locusCol}).
#' \item The genotype (see param \code{genoCol}).
#' }
#'
#' @param popCol Character: The column name with the population information.
#' Default = \code{'POP'}.
#'
#' @param sampCol Character: The column name with the sampled individual information.
#' Default = \code{'SAMPLE'}.
#'
#' @param locusCol Character: The column name with the locus information.
#' Default = \code{'LOCUS'}.
#'
#' @param genoCol Character: The column name with the genotype information.
#' Default = \code{'GT'}.
#'
#' @param num.cores Integer: Number of cores. Default is 1.
#'
#' @param iters Integer: The number of permutations or bootstrap replicates.
#' Default = 100.
#'
#' @param permTest Logical: Should a permutation test be conducted?
#' Default = \code{FALSE}.
#'
#' @param bootCI Logical: Should bootstrap confidence intervals be estimated?
#' Default = \code{FALSE}.
#'
#' @param doPairs Logical: Should pairwise FST be calculated? Default = \code{FALSE},
#' which calculates the among population FST.
#'
#' @param doDist Logical: Should a a distance matrix of FST be returned?
#' Default = \code{FALSE}. Only applied when \code{doPairs==TRUE}.
#'
#' @param perLocus Logical: Should the per locus FST be returned?
#' Default = \code{FALSE}.
#'
#' @details Permutation tests involve random shuffling of individuals
#' among populations, recalculating FST, and testing the hypothesis
#' that the permuted FST > observed FST. The p-value represents the
#' proportion of iterations that were "TRUE" to this expression. \cr \cr
#' Bootstrapping randomly samples the loci with replacement
#' to estimate the distribution of FST values around the multilocus
#' FST. Confidence intervals are calculated as the 2.5 and
#' 97.5 percentiles of the obtained bootstrapped FST values.
#'
#' @return Returns a list, the contents vary depending on argument choices. \cr
#' When \code{doPairs==FALSE}, the analysis is among populations.
#' \enumerate{
#' \item \code{$multilocus.mean}: the multilocus FST.
#' \item \code{$multilocus.perms}: a vector of permuted FST values.
#' \item \code{$multilocus.pval}: the p-value from permutation tests.
#' \item \code{$multilocus.ci}: a vector of bootstrap confidence intervals.
#' \item \code{$perlocus}: a data table of the FST for each locus.
#' }
#' When \code{doPairs==TRUE}, the analysis is between population pairs.
#' \enumerate{
#' \item \code{$multilocus.mean}: a data table of multilocus FST.
#' \item \code{$multilocus.ci}: a data table of bootstrap confidence intervals.
#' \item \code{$multilocus.dist}: a distance matrix of multilocus FST values.
#' \item \code{$perlocus}: a data table of the FST for each locus.
#' }
#'
#' @examples
#' data(data_4pops)
#'
#' # Among popuation FST, with permutation tests
#' # and bootstrap confidence intervals.
#' fst_among <- fstWC_genos(data_4pops
#' , permTest=TRUE
#' , bootCI=TRUE
#' , iters=50)
#'
#' # Pairwise FST, with per locus estimates
#' # and a distance matrix
#' fst_pairs <- fstWC_genos(data_4pops
#' , doPairs=TRUE
#' , doDist=TRUE
#' , perLocus=TRUE)
#'
#' @export
fstWC_genos <- function(dat, popCol='POP', sampCol='SAMPLE'
, locusCol='LOCUS', genoCol='GT', num.cores=1
, permTest=FALSE, bootCI=FALSE, iters=100
, doPairs=FALSE, doDist=FALSE, perLocus=FALSE){
# BEGIN ...........
# --------------------------------------------+
# Assertions and environment
# --------------------------------------------+
# Check the class of dat
if(!'data.table' %in% class(dat)){ stop("Argument dat isn't a data table.")}
# Check that all columns are specified correctly
if(sum(c(popCol, sampCol, locusCol, genoCol) %in% colnames(dat)) != 4){
stop('Arguments popCol, sampCol, locusCol, and genoCol must be
columns in dat. See ?genos2freqs.')
}
# Rename columns
colnames(dat)[
match(c(sampCol, popCol, locusCol, genoCol), colnames(dat))
] <- c('SAMPLE', 'POP', 'LOCUS', 'GT')
# Get genotype class
genoClass <- class(dat$GT)
# Convert to integer
if(genoClass=='character'){
dat[, GT:=genoscore_converter(GT)]
} else if(genoClass=='numeric'){
dat[, GT:=as.integer(GT)]
}
# --------------------------------------------+
# Register parallel cluster if requested
# --------------------------------------------+
if(num.cores>1){
my.cluster <- makeCluster(num.cores)
registerDoParallel(my.cluster)
}
# --------------------------------------------+
# Code: Set up key parameters
# --------------------------------------------+
# Get the unique population and sample combinations
popsampDt <- unique(dat[, c('POP', 'SAMPLE')])
# Create a genotype matrix
genoMat <- DT2Mat_genos(dat, sampCol='SAMPLE', locusCol='LOCUS', genoCol='GT')
# Match genoMat rows to populations in popsampDt to
# create a population ID
pop_id <- popsampDt$POP[match(rownames(genoMat), popsampDt$SAMPLE)]
# Create an empty list to store values in
fstList <- list()
# --------------------------------------------+
# Code: FST among all populations
# --------------------------------------------+
if(doPairs==FALSE){
# START AMONG
# Observed variance components
obsVarcomps <- fstWC_varcomps(dat=genoMat, input_type='genos', pop_id=pop_id, num.cores=num.cores)
# The observed FST
fstList$multilocus.mean <- sum(obsVarcomps$NUMER) / sum(obsVarcomps$DENOM)
if(permTest==TRUE){
fstList$multilocus.perms <- fstWC_perm(genoMat=genoMat, pop_id=pop_id, perms=iters, num.cores=num.cores)
# p-value for permutations
fstList$multilocus.pval <- sum(fstList$multilocus.perms > fstList$multilocus.mean) / iters
}
if(bootCI==TRUE){
fstList$multilocus.boots <- fstWC_boot(dat=genoMat, input_type='genos', pop_id=pop_id, boots=iters, num.cores=num.cores)
# Percentiles
fstList$multilocus.ci <- quantile(fstList$multilocus.boots, c(0.025, 0.975))
}
if(perLocus==TRUE){
obsVarcomps[, FST:=NUMER/DENOM]
fstList$perlocus <- obsVarcomps[, c('LOCUS', 'FST')]
}
# Kill cluster if run in parallel
if(num.cores>1){
gc()
stopCluster(my.cluster)
}
# Output
return(fstList)
# END AMONG
}
# --------------------------------------------+
# Code: FST between population pairs
# --------------------------------------------+
if(doPairs==TRUE){
# START PAIRWISE
# Unique populations
uniq_pops <- unique(pop_id)
# Population pair combinations
pairCombos <- combn(uniq_pops, 2)
# Get the pairwise variance components
cat('Performing pairwise FST calculations', '\n')
pairCalcs <- apply(pairCombos, 2, function(pops){
cat('....', pops, '\n')
# Create a list to return
popList <- list()
# Get the population ID indices
pop1 <- which(pop_id==pops[1])
pop2 <- which(pop_id==pops[2])
# Pair name
pair_id <- paste(pops[1], pops[2], sep='/')
# Subset the genotypes and ID
genoSub <- genoMat[c(pop1, pop2),]
pop_id_sub <- pop_id[c(pop1, pop2)]
# Calculate the variance components
cat('........ Variance components', '\n')
popsVarcomps <- fstWC_varcomps(dat=genoSub, input_type='genos', pop_id=pop_id_sub, num.cores=num.cores)
popsVarcomps$POP1 <- pops[1]
popsVarcomps$POP2 <- pops[2]
popsVarcomps$PAIR <- pair_id
# Multilocus FST
popList$multilocus.mean <- data.table(
popsVarcomps[1, c('POP1', 'POP2', 'PAIR')]
, FST=sum(popsVarcomps$NUMER)/sum(popsVarcomps$DENOM))
# Permutation tests
if(permTest==TRUE){
cat('........ Permutation tests', '\n')
boot_fst <- fstWC_perm(genoMat=genoSub, pop_id=pop_id_sub, perms=iters, num.cores=num.cores)
boot_pval <- sum(boot_fst > popList$multilocus.mean) / iters
popList$multilocus.perms <- data.table(PAIR=pair_id, POP1=pops[1], POP2=pops[2], PERM=1:iters, FST=boot_fst)
popList$multilocus.pval <- data.table(PAIR=pair_id, POP1=pops[1], POP2=pops[2], PVAL=boot_pval)
}
# Bootstrap confidence intervals
if(bootCI==TRUE){
cat('........ Bootstrap confidence intervals', '\n')
boot_fst <- fstWC_boot(dat=genoSub, pop_id=pop_id_sub, boots=iters, num.cores=num.cores)
popList$multilocus.boots <- data.table(PAIR=pair_id, POP1=pops[1], POP2=pops[2], BOOT=1:iters, FST=boot_fst)
boot_ci <- quantile(boot_fst, c(0.025, 0.975))
popList$multilocus.ci <- data.table(PAIR=pair_id, POP1=pops[1], POP2=pops[2], CI025=boot_ci[1], CI975=boot_ci[2])
}
# Calculate per Locus FST
if(perLocus==TRUE){
cat('........ Per locus FST', '\n')
popsVarcomps[, FST:=NUMER/DENOM]
popList$perlocus <- popsVarcomps[, c('PAIR', 'POP1', 'POP2', 'LOCUS', 'FST')]
}
# Return all the calculations for this population pair
return(popList)
})
# Combine multilocus FST pairwise data
fstList$multilocus.mean <- do.call('rbind', lapply(pairCalcs, function(pops){ pops$multilocus.mean }))
# If permutation test conducted, combine pairwise data
if(doPairs==TRUE){
fstList$multilocus.perms <- do.call('rbind', lapply(pairCalcs, function(pops){ pops$multilocus.perms }))
fstList$multilocus.pvals <- do.call('rbind', lapply(pairCalcs, function(pops){ pops$multilocus.pval }))
}
# If bootstrapped CI calculated, combine pairwise data
if(bootCI==TRUE){
fstList$multilocus.boots <- do.call('rbind', lapply(pairCalcs, function(pops){ pops$multilocus.boots }))
fstList$multilocus.ci <- do.call('rbind', lapply(pairCalcs, function(pops){ pops$multilocus.ci }))
}
# If a distance matrix is required, make it.
if(doDist==TRUE){
distMat <- matrix(0, nrow=length(uniq_pops), ncol=length(uniq_pops), dimnames=list(uniq_pops, uniq_pops))
for(j in 1:ncol(pairCombos)){
pop1 <- pairCombos[1,j]
pop2 <- pairCombos[2,j]
distMat[pop2, pop1] <- fstList$multilocus.mean[POP1==pop1 & POP2==pop2]$FST
fstList$multilocus.dist <- as.dist(distMat, diag=TRUE)
}
}
# If per locus FST estimates calculated, combine pairwise data
if(perLocus==TRUE){
fstList$perlocus <- do.call('rbind', lapply(pairCalcs, function(pops){ pops$perlocus }))
}
# Kill cluster if run in parallel
if(num.cores>1){
gc()
stopCluster(my.cluster)
}
# Output
return(fstList)
# END PAIRWISE
}
# ............ END
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.