Nothing
#' Fst Computation
#'
#' This function calculates pairwise Fst values along with confidence intervals and p-values between populations according to the method proposed by Wright(1949) and updated by Weir and Cockerham (1984)
#'
#' @param geno a data frame containing allele frequency data generated from stamppConvert, or a genlight object containing genotype data, individual IDs, population IDs and ploidy levels
#' @param nboots number of bootstraps to perform across loci to generate confidence intervals and p-values
#' @param percent the percentile to calculate the confidence interval around
#' @param nclusters number of proccesor treads or cores to use during calculations.
#' @details If possible, using multiple processing threads or cores is recommended to assist in calculating Fst values over a large number of bootstraps.
#' @return An object list with the components:
#' \code{Fsts}{a matrix of pairwise Fst values between populations}
#' \code{Pvalues}{a matrix of p-values for each of the pairwise Fst values containined in the 'Fsts' matrix}
#' \code{Bootstraps}{a dataframe of each Fst value generated during Bootstrapping and the associated confidence intervals}
#' If nboots<2, no bootstrapping is performed and therefore only a matrix of Fst values is returned.
#' @examples
#' # import genotype data and convert to allele frequecies
#' data(potato.mini, package="StAMPP")
#' potato.freq <- stamppConvert(potato.mini, "r")
#' # Calculate pairwise Fst values between each population
#' potato.fst <- stamppFst(potato.freq, 100, 95, 1)
#' @author Luke Pembleton <luke.pembleton at agriculture.vic.gov.au>
#' @references Wright S (1949) The Genetical Structure of Populations. Annals of Human Genetics 15, 323-354. <doi:10.1111/j.1469-1809.1949.tb02451.x>
#' Weir BS, Cockerham CC (1984) Estimating F Statistics for the ANalysis of Population Structure. Evolution 38, 1358-1370. <doi:10.2307/2408641>
#' @import adegenet parallel doParallel foreach
#' @export
stamppFst <- function(geno, nboots=100, percent=95, nclusters=1){
if(class(geno)=="genlight"){ #if genotype object is a genlight object convert to a data.frame
geno2 <- geno
geno <- as.matrix(geno2) #extract genotype data from genlight object
sample <- row.names(geno) #individual names
pop.names <- pop(geno2) #population names
ploidy <- ploidy(geno2) #ploidy level
geno=geno*(1/ploidy) #convert genotype data (number of allele A) to precentage allele frequency
geno[is.na(geno)]=NaN
format <- vector(length=length(geno[,1]))
format[1:length(geno[,1])]="genlight"
pops <- unique(pop.names) #population names
pop.num <- vector(length=length(pops)) #create vector of population ID numbers
for (i in 1:length(geno[,1])){
pop.num[i]=which(pop.names[i]==pops) #assign population ID numbers to individuals
}
genoLHS <- as.data.frame(cbind(sample, pop.names, pop.num, ploidy, format), stringsAsFactors=FALSE)
geno <- cbind(genoLHS, geno) #combine genotype data with labels to form stampp geno file
geno[,2]=as.character(pop.names)
geno[,4]=as.numeric(as.character(geno[,4]))
row.names(geno)=NULL
}
geno <- geno[,-5] #remove format information
cl <- makeCluster(nclusters)
registerDoParallel(cl) #establish clusters for multithreaded computing
percent <- ((100-percent)/100)/2
d=1
res <- numeric(length=nboots)
totalind <- nrow(geno) #number of individuals
nloc <- ncol(geno)-4 #number of loci/markers
pops <- unique(geno[,2]) #population IDs
npops <- length(pops) #number of populations
simple.geno <- geno
p.a <- ph.a <- oh <-ninds <- matrix(NA, ncol=nloc, nrow=npops, dimnames=list(pops)) #matrix to store counts of allele A, level of hetrozygousity and number of individuals without missing data per loci
for (i in 1:npops){ #loop for each population
refpop <- pops[i]
pop.geno <- subset(simple.geno, simple.geno[,2]==refpop)
if(nloc>1){ #if there is 2 or more loci
ninds[i,] <- colSums((pop.geno[,5:(4+nloc)]!="NaN")*(pop.geno[,4]/2), na.rm=TRUE) #number of diploid eqiv genomes at each loci without missing data
p.a[i,] <- colMeans(pop.geno[5:(4+nloc)], na.rm=TRUE) #total number of alleles in population at each loci multipled by number of diploid eqiv genomes
oh[i,] <- colSums((((pop.geno[,5:(4+nloc)])*(pop.geno[,5:(4+nloc)]<=0.5)) + ((pop.geno[5:(4+nloc)]-0.5)*(pop.geno[5:(4+nloc)]>0.5 & pop.geno[5:(4+nloc)]!=1)))*(pop.geno[,4]/2), na.rm=TRUE) #level of hetrozygousity in population multipled by number of diploid eqiv genomes
oh[i,] <- oh[i,]*2
}else{ #if the genotype dataset only has 1 locus
ninds[i,] <- sum((pop.geno[,5:(4+nloc)]!="NaN")*(pop.geno[,4]/2), na.rm=TRUE) #number of diploid eqiv genomes at each loci without missing data
p.a[i,] <- colMeans(pop.geno[5:(4+nloc)], na.rm=TRUE) #total number of alleles in population at each loci multipled by number of diploid eqiv genomes
oh[i,] <- sum((((pop.geno[,5:(4+nloc)])*(pop.geno[,5:(4+nloc)]<=0.5)) + ((pop.geno[5:(4+nloc)]-0.5)*(pop.geno[5:(4+nloc)]>0.5 & pop.geno[5:(4+nloc)]!=1)))*(pop.geno[,4]/2), na.rm=TRUE) #level of hetrozygousity in population multipled by number of diploid eqiv genomes
oh[i,] <- oh[i,]*2
}
}
p <- p.a #frequency of allele A based on the number of diploid eqiv genomes at each loci without missing data
oh <- oh/ninds #frequency of hetrozygous genotypes at each loci based on the number of diploid eqiv genomes without missing data
index2 <- index1 <- NULL
step <- 2
step2 <- 1
for(i in step:npops){
index1 <- c(index1, i:npops)
index2 <- c(index2, rep(step2, length(step:npops)))
step=step+1
step2=step2+1
}
#### Fst calculation ####
r=2
ninds.dup.1 <- ninds[index1,]
ninds.dup.2 <- ninds[index2,]
p1 <- p[index1,]
p2 <- p[index2,]
oh1 <- oh[index1,]
oh2 <- oh[index2,]
n.bar <- (ninds.dup.1+ninds.dup.2)/r
nc <- (r*n.bar)-(((ninds.dup.1^2)+(ninds.dup.2^2)) / (r*n.bar))
p.bar <- ((ninds.dup.1*p1)/(r*n.bar)) + ((ninds.dup.2*p2)/(r*n.bar))
s.square <- ((ninds.dup.1*((p1-p.bar)^2))/n.bar) + ((ninds.dup.2*((p2-p.bar)^2))/n.bar)
h.bar <- ((ninds.dup.1*oh1)/(r*n.bar)) + ((ninds.dup.2*oh2)/(r*n.bar))
a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) ))
b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar))
c <- (1/2)*h.bar
index.inf <- which(!is.finite(a) | !is.finite(b) | !is.finite(c)) #index of infinite values
a[index.inf]=NA #remove infinite values
b[index.inf]=NA #remove infinite values
c[index.inf]=NA #remove infinite values
if(nloc>1){ #if there is more than 1 locus in the genotype dataset
if(npops>2){ #if there are greater than 2 populations, ie. greater than 1 pairwise comparision
fst <- rowSums(a, na.rm=TRUE) / (rowSums(a, na.rm=TRUE) + rowSums(b, na.rm=TRUE) + rowSums(c, na.rm=TRUE)) #Fst results
}else{ #if there is only 2 populations, ie. 1 pairwise comparision
fst <- sum(a, na.rm=TRUE) / (sum(a, na.rm=TRUE) + sum(b, na.rm=TRUE) + sum(c, na.rm=TRUE)) #Fst results
}
}else{ # if there is only 1 locus in the genotype dataset
fst <- a/(a+b+c) #Fst results
}
fstmat <- matrix(NA, nrow=npops, ncol=npops, dimnames=list(pops, pops))
fstmat[lower.tri(fstmat)]=fst #assign Fst values to matrix of pairwise comparisions
#### end of Fst calculation ####
################################
#### Bootstrapped Fst calc ####
if(nboots>1){
boots <- matrix(NA, ncol=(nboots+4), nrow=((npops*npops)-npops)/2) #matrix to store Fst results from bootstrapping
pvalues <- matrix(NA, ncol=npops, nrow=npops, dimnames=list(pops, pops)) #matrix to store p-values of pairwise Fsts
name.index <- matrix(NA, ncol=2, nrow=((npops*npops)-npops)/2)
line=0
for(i in 1:(npops-1)){
for(y in (i+1):npops){
line=line+1
name.index[line,c(1,2)]=c(i,y) #all pairwise population names
}
}
boot.names1 <- pops[name.index[,1]]
boot.names2 <- pops[name.index[,2]]
boot.names <- cbind.data.frame(boot.names1, boot.names2, stringsAsFactors=FALSE)
popnames <- array(unique(geno[,2]))
p.master <- p
oh.master <- oh
ninds.master <- ninds
res <- foreach(d = 1:nboots, .combine=cbind) %dopar% { #loop for the specified number of bootstraps
boot.index <- sample((1:nloc), nloc, replace=TRUE) #bootstrap loci names
p <- p.master[,boot.index] #frequency of allele A based on bootstrapped loci
oh <- oh.master[,boot.index] #frequecny of heterozygousity based on bootstrapped loci
ninds <- ninds.master[,boot.index] #number of inds without missing data based on bootstrapped loci
r=2
ninds.dup.1 <- ninds[index1,]
ninds.dup.2 <- ninds[index2,]
p1 <- p[index1,]
p2 <- p[index2,]
oh1 <- oh[index1,]
oh2 <- oh[index2,]
n.bar <- (ninds.dup.1+ninds.dup.2)/r
nc <- (r*n.bar)-(((ninds.dup.1^2)+(ninds.dup.2^2)) / (r*n.bar))
p.bar <- ((ninds.dup.1*p1)/(r*n.bar)) + ((ninds.dup.2*p2)/(r*n.bar))
s.square <- ((ninds.dup.1*((p1-p.bar)^2))/n.bar) + ((ninds.dup.2*((p2-p.bar)^2))/n.bar)
h.bar <- ((ninds.dup.1*oh1)/(r*n.bar)) + ((ninds.dup.2*oh2)/(r*n.bar))
a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) ))
b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar))
c <- (1/2)*h.bar
index.inf <- which(!is.finite(a) | !is.finite(b) | !is.finite(c)) #index of infinite values
a[index.inf]=NA #remove infinite values
b[index.inf]=NA #remove infinite values
c[index.inf]=NA #remove infinite values
if(nloc>1){ #if there is more than 1 locus in the genotype dataset
if(npops>2){ #if there are greater than 2 populations, ie. greater than 1 pairwise comparision
rowSums(a, na.rm=TRUE) / (rowSums(a, na.rm=TRUE) + rowSums(b, na.rm=TRUE) + rowSums(c, na.rm=TRUE)) #bootstrapped Fst results
}else{#if there is only 2 populations, ie. 1 pairwise comparision
sum(a, na.rm=TRUE) / (sum(a, na.rm=TRUE) + sum(b, na.rm=TRUE) + sum(c, na.rm=TRUE)) #bootstrapped Fst results
}
}else{ #if there is only 1 locus in the genotype dataset
a/(a+b+c) #bootstrapped Fst results
}
}
boots[,(1:nboots)]=res #add bootstrapped results to matrix
boots <- cbind(boot.names, boots) #add pairwise population names to the matrix of bootstrap results
bootrows <- length(boots[,1]) #number of pairwise comparisions
order.boots <- boots[,3:(2+nboots)]
order.boots <- data.matrix(order.boots)
order.boots <- t(apply(order.boots, 1, sort, na.last=T)) #sort bootstrapped Fst values from smallest to largest
lowerper <- order.boots[,(ceiling(percent*nboots))] #identify the Fst values on the lower CI boundry
upperper <- order.boots[,(floor((1-percent)*nboots))] #identify the Fst values on the upper CI boundry
pval <- (rowSums(order.boots<=0, na.rm=TRUE))/nboots #calculate p-values
boots[,3:(2+nboots)]=order.boots
boots[,(3+nboots):(5+nboots)]=cbind(lowerper, upperper, pval)
line=0
for(i in 1:(npops-1)){
for(y in (i+1):npops){
#store the calculated p-value results in a matrix format
line=line+1
pvalues[y,i]=boots[line,(nboots+5)]
fstcol <- match(boots[line,1],pops)
fstrow <- match(boots[line,2],pops)
boots[line, (6+nboots)]=fstmat[fstrow, fstcol]
}
}
row.names(pvalues)=pops #label the rows of the p-value matrix with the population names
colnames(pvalues)=pops #label the columns of the p-value matrix with the population names
bootscol <- cbind("Population1", "Population2", t(c(1:nboots)), "Lower bound CI limit", "Upper bound CI limit", "p-value", "Fst")
colnames(boots) <- bootscol #label the columns of the matrix of bootstrap results
results <- list(Fsts=fstmat, Pvalues=pvalues, Bootstraps=boots)
}else{
results <- fstmat
}
stopCluster(cl)
return(results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.