Nothing
#' OutFLANK: An Fst outlier approach by Mike Whitlock and Katie Lotterhos,
#' University of British Columbia.
#'
#' This function is the original implementation of Outflank by Whitlock and
#' Lotterhos. dartR simply provides a convenient wrapper around their functions
#' and an easier install being an r package (for information please refer to
#' their github repository)
#'
#' This method looks for Fst outliers from a list of Fst's for different loci.
#' It assumes that each locus has been genotyped in all populations with
#' approximately equal coverage.
#'
#'OutFLANK estimates the distribution of Fst based on a trimmed sample of Fst's.
#'It assumes that the majority of loci in the center of the distribution are
#'neutral and infers the shape of the distribution of neutral Fst using a
#'trimmed set of loci. Loci with the highest and lowest Fst's are trimmed from
#'the data set before this inference, and the distribution of Fst df/(mean Fst)
#' is assumed to'follow a chi-square distribution. Based on this inferred
#' distribution, each locus is given a q-value based on its quantile in the
#' inferred null'distribution.
#'
#'The main procedure is called OutFLANK -- see comments in that function
#'immediately below for input and output formats. The other functions here are
#'necessary and must be uploaded, but are not necessarily needed by the user
#'directly.
#'
#'Steps:
#'
#'@param FstDataFrame A data frame that includes a row for each locus, with
#'columns as follows:
#'\itemize{
#' \item $LocusName: a character string that uniquely names
#' each locus.
#' \item $FST: Fst calculated for this locus. (Kept here to
#' report the unbased Fst of the results)
#' \item $T1: The numerator of the estimator for Fst
#' (necessary, with $T2, to calculate mean Fst)
#' \item $T2: The denominator of the estimator of Fst
#' \item $FSTNoCorr: Fst calculated for this locus without
#' sample size correction. (Used to find outliers)
#' \item $T1NoCorr: The numerator of the estimator for Fst
#' without sample size correction (necessary, with $T2, to
#' calculate mean Fst)
#' \item $T2NoCorr: The denominator of the estimator of Fst
#' without sample size correction
#' \item $He: The heterozygosity of the locus (used to screen
#' out low heterozygosity loci that have a different distribution)
#' }
#'
#' @param LeftTrimFraction The proportion of loci that are trimmed from the
#' lower end of the range of Fst before the likelihood funciton is applied
#' [default 0.05].
#' @param RightTrimFraction The proportion of loci that are trimmed from the
#' upper end of the range of Fst before the likelihood funciton is applied
#' [default 0.05].
#' @param Hmin The minimum heterozygosity required before including calculations
#' from a locus [default 0.1].
#' @param NumberOfSamples The number of spatial locations included in the data
#' set.
#' @param qthreshold The desired false discovery rate threshold for calculating
#' q-values [default 0.05].
#' @return
#'
#' The function returns a list with seven elements:
#' \itemize{
#' \item FSTbar: the mean FST inferred from loci not marked as outliers
#' \item FSTNoCorrbar: the mean FST (not corrected for sample size -gives an
#' upwardly biased estimate of FST)
#' \item dfInferred: the inferred number of degrees of freedom for the
#' chi-square distribution of neutral FST
#' \item numberLowFstOutliers: Number of loci flagged as having a significantly
#' low FST (not reliable)
#' \item numberHighFstOutliers: Number of loci identified as having
#' significantly high FST
#' \item results: a data frame with a row for each locus. This data frame
#' includes all the original columns in the
#' data set, and six new ones:
#' \itemize{
#' \item $indexOrder (the original order of the input data set),
#' \item $GoodH (Boolean variable which is TRUE if the expected
#' heterozygosity is greater than the Hemin set by input),
#' \item $OutlierFlag (TRUE if the method identifies the locus as
#' an outlier, FALSE otherwise), and
#' \item $q (the q-value for the test of neutrality for the locus)
#' \item $pvalues (the p-value for the test of neutrality for the
#' locus)
#' \item $pvaluesRightTail the one-sided (right tail) p-value for
#' a locus
#' }
#' }
#' @export
#' @author Bernd Gruber (bugs? Post to
#' \url{https://groups.google.com/d/forum/dartr}); original implementation of
#' Whitlock & Lotterhos
utils.outflank <- function(FstDataFrame,
LeftTrimFraction = 0.05,
RightTrimFraction = 0.05,
Hmin = 0.1,
NumberOfSamples,
qthreshold = 0.05) {
#Setting up necessary columns in dataframe
Fstdata <- outputDFStarterNoCorr(FstDataFrame, Hmin)
# making working dataframe with real Fst (no NAs), storing NAs to add back
# later
# This also removes loci with He values lower than Hmin from the working data
# frame
nonkeepers <- which((is.na(Fstdata$FSTNoCorr)) | (Fstdata$He < Hmin))
if (length(nonkeepers) > 0) {
workingDataFrame <- Fstdata[-nonkeepers, ]
} else{
workingDataFrame <- Fstdata
}
storedDataFrameNA <- Fstdata[nonkeepers, ]
#Finding upper and lower bounds for trimming (eliminating NAs, but not
# negative FSTs)
sortedDataFrame <-
workingDataFrame[order(workingDataFrame$FSTNoCorr), ]
NLociTotal <- length(sortedDataFrame$FSTNoCorr)
SmallestKeeper <- ceiling(NLociTotal * LeftTrimFraction)
LargestKeeper <- floor(NLociTotal * (1 - RightTrimFraction))
LowTrimPoint <- sortedDataFrame$FSTNoCorr[[SmallestKeeper]]
HighTrimPoint <- sortedDataFrame$FSTNoCorr[[LargestKeeper]]
if (LowTrimPoint < 0) {
writeLines(
error(
"ERROR: The smallest FST in the trimmed set must be > 0. Please use a larger LeftTrimFraction."
)
)
return()
}
if (HighTrimPoint >= 1) {
writeLines(
error(
"ERROR: The largest FST in the trimmed set must be < 1. Please use a larger RightTrimFraction."
)
)
return()
}
# finding dfInferred and Fstbar iteratively
putativeNeutralListTemp <- ifelse(workingDataFrame$FSTNoCorr > 0, TRUE, FALSE)
oldOutlierFlag <- rep(FALSE, NLociTotal)
# Note: All negative FST loci are maked as putative outliers, which will
#need to be tested with the coalescent model later. In the
# meantime, they are removed so as to not confuse the likelihood function.
keepGoing <-TRUE
count <-0
# writeLines(paste(mean(workingDataFrame$FSTNoCorr[putativeNeutralListTemp])))
while (keepGoing) {
count <-count + 1
if (count > 19) {
keepGoing <-FALSE
writeLines(important("Exceeded iteration maximum.")) ###Try with
#increased maximum value for count two lines above.
}
FstbarNoCorrTemp <-fstBarCalculatorNoCorr(workingDataFrame[putativeNeutralListTemp,])
dfInferredTemp <-EffectiveNumberSamplesMLE(
workingDataFrame$FSTNoCorr[putativeNeutralListTemp],
FstbarNoCorrTemp,
NumberOfSamples,
LowTrimPoint,
HighTrimPoint
)
workingDataFrame <-pOutlierFinderChiSqNoCorr(workingDataFrame,
FstbarNoCorrTemp,
dfInferredTemp,
qthreshold)
#### mark all negative FSTs as outliers if lowest nonneg FST is outlier
#(because negative Fst estimates can't be evaluated through
#### the chi-square approach on their own)
if (any(workingDataFrame$OutlierFlag[workingDataFrame$FSTNoCorr < LowTrimPoint]))
workingDataFrame$OutlierFlag[workingDataFrame$FSTNoCorr < 0] <-TRUE
#### Any loci previously marked as $OutlierFlag=TRUE remain so, even if
#the new iteration doesn''t flag them as outliers
#### workingDataFrame$OutlierFlag=!as.logical((!workingDataFrame$OutlierFlag)*(!oldOutlierFlag))
# Resetting neutral list, and checking whether the outlier list has
#stabilized
putativeNeutralListTemp <-ifelse((!workingDataFrame$OutlierFlag), TRUE, FALSE)
if (sum(putativeNeutralListTemp) == 0) {
writeLines(error("No loci in neutral list...\n"))
return(error("FAIL"))
}
if (identical(oldOutlierFlag, workingDataFrame$OutlierFlag))
keepGoing <-FALSE
###### if all in trimmed get IDed as outlier - return to user with
#warning
if (all(workingDataFrame$OutlierFlag[workingDataFrame$FSTNoCorr < LowTrimPoint])) {
writeLines(
error(
"All loci with Fst below the lower (lefthand) trim point were marked as outliers. Re-run with larger LeftTrimFraction or smaller qthreshold."
)
)
return(0)
}
if (all(workingDataFrame$OutlierFlag[workingDataFrame$FSTNoCorr > HighTrimPoint])) {
writeLines(
error(
"All loci with Fst above the upper (righthand) trim point were marked as outliers. Re-run with smaller RightTrimFraction or smaller qthreshold."
)
)
return(0)
}
oldOutlierFlag <-workingDataFrame$OutlierFlag
# writeLines(paste(as.character(count),' ',as.character(sum(putativeNeutralListTemp))))
}
if (count > 19)
writeLines(important("Loop iteration limit exceeded."))
numberLowFstOutliers <-sum(workingDataFrame$OutlierFlag[(workingDataFrame$FSTNoCorr < LowTrimPoint)])
numberHighFstOutliers <-sum(workingDataFrame$OutlierFlag[(workingDataFrame$FSTNoCorr > HighTrimPoint)])
FSTbar <-fstBarCalculator(workingDataFrame[putativeNeutralListTemp,])
# merge NA list back to working list, and sort by original order\t
resultsDataFrame <-rbind(workingDataFrame, storedDataFrameNA)
resultsDataFrame <-resultsDataFrame[order(resultsDataFrame$indexOrder),]
# return new dataframe
list(
FSTbar = FSTbar,
FSTNoCorrbar = FstbarNoCorrTemp,
dfInferred = dfInferredTemp,
numberLowFstOutliers = numberLowFstOutliers,
numberHighFstOutliers = numberHighFstOutliers,
results = resultsDataFrame
)
}
outputDFStarterNoCorr <- function(FstDataFrame,
Hmin = 0.1) {
# This will take a given dataframe with $LocusName, $FST,$He, $T1, $T2, etc.
#and initialize $indexOrder,$GoodH,$OutlierFlag (to 0),
# and $q (to 1).
# Hmin is the smallest allowable He for which a locus should be included in
#the initial calculations. By default this requires that a
# locus have heterozygosity equal to 10% or more.
len <-length(FstDataFrame$FSTNoCorr)
indexOrder <-seq(1, len)
GoodH <-ifelse(FstDataFrame$He < Hmin, "lowH", "goodH")
OutlierFlag <-ifelse(is.na(FstDataFrame$FSTNoCorr), NA, FALSE)
qvalues <-rep(NA, len)
pvalues <-rep(NA, len)
pvaluesRightTail <-rep(NA, len)
cbind(FstDataFrame,
indexOrder,
GoodH,
qvalues,
pvalues,
pvaluesRightTail,
OutlierFlag)
}
pOutlierFinderChiSqNoCorr <- function(DataList,
Fstbar,
dfInferred,
qthreshold = 0.05) {
# Finds outliers based on chi-squared distribution Takes given values of
#dfInferred and Fstbar, and returns a list of p-values and
# q-values for all loci based on chi-square. Assumes that the DataList
#input has a column called $FSTNoCorr and that empty columns
# exist for $qvalues and $OutlierFlag
# Divide DataList into 3 lists: DataListGood has $FST>0; DataListNeg has
#cases where $FST <=0; and DataListNA has cases where $FST is
# NA. DataListNeg is necessary to keep separate here because these cases
#do not have meaningful results with the chi-square aprach;
# however, they do carry information.
DataListGood <-DataList[which(DataList$FSTNoCorr > 0),]
DataListNonPosFst <-DataList[which(DataList$FSTNoCorr <= 0),]
DataListNA <-DataList[which(is.na(DataList$FSTNoCorr)),]
pList <-pTwoSidedFromChiSq(DataListGood$FSTNoCorr * (dfInferred) / Fstbar, dfInferred)
pListRightTail <-1 - pchisq(DataListGood$FSTNoCorr * (dfInferred) /
Fstbar, dfInferred)
# Note: Change made 13 June 2014; q-values now only calcualted on
#right-tail one-sided p-values
qtemp <-qvalue::qvalue(pListRightTail,
fdr.level = qthreshold,
pi0.method = "bootstrap")
# Note: Using the bootstrap method here seems OK, but if this causes
#problems remove the pi0.method='bootstrap' in the previous line
# to revert to the default.
DataListGood$pvalues <-pList
DataListGood$pvaluesRightTail <-pListRightTail
DataListGood$qvalues <-qtemp$qvalues
DataListGood$OutlierFlag <-qtemp$significant
rbind(DataListGood, DataListNonPosFst, DataListNA)
}
pTwoSidedFromChiSq <- function(x,
df) {
# Takes a value x, finds the two-sided p-value for comparison to a
#chi-square distribution with df degrees of freedom.
pOneSided <-pchisq(x, df)
ifelse(pOneSided > 0.5, (1 - pOneSided) * 2, pOneSided * 2)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.