Compare assigner's
Fst calculations (using Weir and Cockerham, 1984)
against alternatives. To make differences and bottlenecks really
stand out we will use a simulated dataset and an empirical RADseq dataset
both with more than 10K markers.
rm(list = ls())
You might be interested in this package called here or
use setwd("/YOUR/NEW/PATH/HERE/")
, to change the working directory.
Install the R packages below
# install required packages install.packages(c("diveRsity", "hierfstat", "StAMPP")) if (!require("devtools")) install.packages("devtools") devtools::install_github('ericarcher/strataG', build_vignettes = TRUE) if(!require("BiocManager")) install.packages("BiocManager") BiocManager::install(version = "3.12") BiocManager::install("SNPRelate") if (!require("ggcorrplot")) install.packages("ggcorrplot") devtools::install_github("thierrygosselin/assigner", build_vignettes = TRUE)
Versions of packages and software:
# Check packages versions packageVersion('assigner') # 0.5.8 packageVersion('diveRsity') # 1.9.90 packageVersion('hierfstat') # 0.5.7 packageVersion('stAMPP') # 1.6.1 packageVersion('strataG') # 2.4.910 packageVersion('SNPRelate') # 1.24.0
Optional software:
We're going to use the unfiltered vcf sunflower dataset from Owens et al. 2016, kindly provided by Gregory Owens. This dataset as missing data.
Owens GL, Baute GJ, Rieseberg LH. Revisiting a classic case of introgression: hybridization and gene flow in Californian sunflowers. Abbott RJ, Barton NH, Good JM, editors. Molecular Ecology. 2016;25: 2630–2643. doi:10.1111/mec.13569
Download the files:
After running this vignette once, I saw something strange in the results. To explore this further, this second grind of testing add 2 additional datasets: data_assigner_sim_01 and data_assigner_sim_02. The documentation explains how these simulated data were produced. They differ mainly in migration rate which resulted in contrastingly different Fst, admixture/membership probability.
We use radiator::genomic_converter to produce the required files or R objects:
library(assigner) # radiator is installed automatically with assigner sunflower.filtered <- radiator::genomic_converter( data = "sunflower.vcf", strata = "sunflower.strata.tsv", output = c( "genepop", # for GenoDive and diveRsity "gtypes", # for strataG "genlight", # for StAMPP "hierfstat", # for Hierfstat "snprelate" # for SNPRelate ) ) %>% dplyr::glimpse()
VCF stats:
radiator::genomic_converter
will only keep polymorphic markers
in common between the strata.The sunflower.filtered
object is a list with several objects inside:
names(sunflower.filtered) #> [1] "hierfstat" "gtypes" "genlight" "snprelate" "tidy.data" # the tidy dataset is used by assigner
The function also produced a folder with several files, including:
To learn more about the output folders and files: radiator::genomic_converter
Note on timing
I could use fancy timing packages (microbenchmark, bench), but for simplicity we're just going to use system.time
.
data_assigner_sim_01 %<>% dplyr::mutate(POP_ID = factor(x = POP_ID, levels = c("POP_1", "POP_2", "POP_3", "POP_4", "POP_5"))) %>% dplyr::arrange(POP_ID) sim.data1 <- radiator::genomic_converter( data = data_assigner_sim_01, output = c( "genepop", # for GenoDive and diveRsity "gtypes", # for strataG "genlight", # for StAMPP "hierfstat", # for Hierfstat "snprelate" # for SNPRelate ), filename = "sim01", path.folder = "fst_sim01", internal = TRUE, verbose = FALSE ) sim.data1$tidy.data %<>% dplyr::arrange(POP_ID)
data_assigner_sim_02 %<>% dplyr::mutate(POP_ID = factor(x = POP_ID, levels = c("POP_1", "POP_2", "POP_3", "POP_4", "POP_5"))) %>% dplyr::arrange(POP_ID) sim.data2 <- radiator::genomic_converter( data = data_assigner_sim_02, output = c( "genepop", # for GenoDive and diveRsity "gtypes", # for strataG "genlight", # for StAMPP "hierfstat", # for Hierfstat "snprelate" # for SNPRelate ), filename = "sim02", path.folder = "fst_sim02", internal = TRUE, verbose = FALSE ) sim.data2$tidy.data %<>% dplyr::arrange(POP_ID)
Let's start with the Gold Standard, Jérôme Goudet's hierfstat package.
# ETA ~ 30 min on my MBP, so yes, you have time for coffee... pops <- as.character(unique(sunflower.filtered$tidy.data$POP_ID))# pop string system.time( hierfstat.fst <- hierfstat::pairwise.WCfst( dat = sunflower.filtered$hierfstat, diploid = TRUE ) %>% # The result need some work # We transform the matrix into a useful tibble: magrittr::set_colnames(x = ., value = pops) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = TRUE, na.diag = TRUE, relative = FALSE ) %>% magrittr::set_colnames(x = ., value = c("POP1", "POP2", "HIERFSTAT")) %>% dplyr::arrange(POP1, POP2) ) #> 1233.880sec ~20 min hierfstat.fst
pops <- as.character(unique(sim.data1$tidy.data$POP_ID))# pop string hierfstat.sim01.fst <- hierfstat::pairwise.WCfst( dat = sim.data1$hierfstat, diploid = TRUE ) %>% # The result need some work # We transform the matrix into a useful tibble: magrittr::set_colnames(x = ., value = pops) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = TRUE, na.diag = TRUE, relative = FALSE ) %>% magrittr::set_colnames(x = ., value = c("POP1", "POP2", "HIERFSTAT_SIM01")) %>% dplyr::arrange(POP1, POP2) pops <- as.character(unique(sim.data2$tidy.data$POP_ID))# pop string hierfstat.sim02.fst <- hierfstat::pairwise.WCfst( dat = sim.data2$hierfstat, diploid = TRUE ) %>% # The result need some work # We transform the matrix into a useful tibble: magrittr::set_colnames(x = ., value = pops) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = TRUE, na.diag = TRUE, relative = FALSE ) %>% magrittr::set_colnames(x = ., value = c("POP1", "POP2", "HIERFSTAT_SIM02")) %>% dplyr::arrange(POP1, POP2)
This is Eric Archer's strataG package.
system.time( stratag.fst <- strataG::popStructTest( g = sunflower.filtered$gtypes, stats = "fst", type = "both", quietly = TRUE, max.cores = parallel::detectCores() - 1, nrep = 0, keep.null = FALSE, write.output = FALSE) %$% pairwise$result %>% dplyr::select(POP1 = strata.1, POP2 = strata.2, STRATAG = Fst) ) stratag.fst #>657.377sec ~ 11 min
stratag.sim01.fst <- strataG::popStructTest( g = sim.data1$gtypes, stats = "fst", type = "both", quietly = TRUE, max.cores = parallel::detectCores() - 1, nrep = 0, keep.null = FALSE, write.output = FALSE) %$% pairwise$result %>% dplyr::select(POP1 = strata.1, POP2 = strata.2, STRATAG_SIM01 = Fst) stratag.sim02.fst <- strataG::popStructTest( g = sim.data2$gtypes, stats = "fst", type = "both", quietly = TRUE, max.cores = parallel::detectCores() - 1, nrep = 0, keep.null = FALSE, write.output = FALSE) %$% pairwise$result %>% dplyr::select(POP1 = strata.1, POP2 = strata.2, STRATAG_SIM02 = Fst)
genepop
file.pops <- as.character(unique(sunflower.filtered$tidy.data$POP_ID)) # pop string system.time( diversity.fst <- diveRsity::diffCalc( infile = list.files( path = ".", pattern = "_genepop.gen", full.names = TRUE, recursive = TRUE ), outfile = "sunflower_diversity_output", fst = TRUE, pairwise = TRUE, para = TRUE ) %$% pairwise %$% Fst %>% # The result need some work # We transform the matrix into a useful tibble: magrittr::set_colnames(x = ., value = pops) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = FALSE, # FALSE because their's only the lower diag na.diag = TRUE, relative = FALSE ) %>% # we switch the POP id column here to match the others magrittr::set_colnames(x = ., value = c("POP2", "POP1", "DIVERSITY")) ) #> 42.711 sec Wow! this is very fast, plus other distances are automatically generated! diversity.fst
pops <- as.character(unique(sim.data1$tidy.data$POP_ID)) # pop string diversity.sim01.fst <- diveRsity::diffCalc( infile = list.files( path = ".", pattern = "sim01_genepop.gen", full.names = TRUE, recursive = TRUE ), outfile = "sim_data1_diversity_output", fst = TRUE, pairwise = TRUE, para = TRUE ) %$% pairwise %$% Fst %>% # The result need some work # We transform the matrix into a useful tibble: magrittr::set_colnames(x = ., value = pops) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = FALSE, # FALSE because their's only the lower diag na.diag = TRUE, relative = FALSE ) %>% # we switch the POP id column here to match the others magrittr::set_colnames(x = ., value = c("POP2", "POP1", "DIVERSITY_SIM01")) pops <- as.character(unique(sim.data2$tidy.data$POP_ID)) # pop string diversity.sim02.fst <- diveRsity::diffCalc( infile = list.files( path = ".", pattern = "sim02_genepop.gen", full.names = TRUE, recursive = TRUE ), outfile = "sim_data2_diversity_output", fst = TRUE, pairwise = TRUE, para = TRUE ) %$% pairwise %$% Fst %>% # The result need some work # We transform the matrix into a useful tibble: magrittr::set_colnames(x = ., value = pops) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = FALSE, # FALSE because their's only the lower diag na.diag = TRUE, relative = FALSE ) %>% # we switch the POP id column here to match the others magrittr::set_colnames(x = ., value = c("POP2", "POP1", "DIVERSITY_SIM02"))
We need the strata
strata <- radiator::read_strata(strata = "sunflower.strata.tsv") %$% strata
Showing off the global Fst only:
system.time( snprelate.fst <- SNPRelate::snpgdsFst( gdsobj = sunflower.filtered$snprelate, population = strata$STRATA, # factors required sample.id = strata$INDIVIDUALS, snp.id = NULL, method = "W&C84", remove.monosnp = TRUE, maf = NaN, missing.rate = NaN, autosome.only = FALSE, with.id = FALSE, verbose = TRUE ) ) #> 0.029sec wo! names(snprelate.fst) #> snprelate.fst$Fst: weighted Fst estimate #> snprelate.fst$MeanFst: the average of Fst estimates across SNPs #> snprelate.fst$FstSNP: a vector of Fst for each SNP
Build a function to conduct pairwise Fst with SNPRelate:
pairwise_fst_snprelate <- function(pop.pairwise, data, strata) { strata.temp <- dplyr::filter(.data = strata, STRATA %in% pop.pairwise) %>% dplyr::mutate(STRATA = droplevels(STRATA)) fst <- SNPRelate::snpgdsFst( gdsobj = data, population = strata.temp$STRATA, # factors required sample.id = strata.temp$INDIVIDUALS, snp.id = NULL, method = "W&C84", remove.monosnp = TRUE, maf = NaN, missing.rate = NaN, autosome.only = FALSE, with.id = FALSE, verbose = TRUE ) #prepare the results into a tibble: fst <- tibble::tibble( POP1 = pop.pairwise[1], POP2 = pop.pairwise[2], SNPRELATE_MEAN = fst$MeanFst, SNPRELATE_WEIGHTED = fst$Fst ) return(fst) }
To run we need all combination of populations:
pop.pairwise <- utils::combn(as.character(unique(strata$STRATA)), 2, simplify = FALSE)
Run the pairwise implementation:
system.time( snprelate.fst <- purrr::map_dfr( .x = pop.pairwise, .f = pairwise_fst_snprelate, data = sunflower.filtered$snprelate, strata = strata ) ) #> 0.245 sec that's the fastest so far!
strata <- radiator::generate_strata(data = sim.data1$tidy.data) %>% dplyr::rename(STRATA = POP_ID) pop.pairwise <- utils::combn(as.character(unique(strata$STRATA)), 2, simplify = FALSE) snprelate.sim01.fst <- purrr::map_dfr( .x = pop.pairwise, .f = pairwise_fst_snprelate, data = sim.data1$snprelate, strata = strata ) %>% dplyr::rename(SNPRELATE_MEAN_SIM01 = SNPRELATE_MEAN, SNPRELATE_WEIGHTED_SIM01 = SNPRELATE_WEIGHTED) strata <- radiator::generate_strata(data = sim.data2$tidy.data) %>% dplyr::rename(STRATA = POP_ID) pop.pairwise <- utils::combn(as.character(unique(strata$STRATA)), 2, simplify = FALSE) snprelate.sim02.fst <- purrr::map_dfr( .x = pop.pairwise, .f = pairwise_fst_snprelate, data = sim.data2$snprelate, strata = strata ) %>% dplyr::rename(SNPRELATE_MEAN_SIM02 = SNPRELATE_MEAN, SNPRELATE_WEIGHTED_SIM02 = SNPRELATE_WEIGHTED)
genepop
fileAnalysis -> Pairwise Differentiation -> F_st
, choose 1 permutations
.GenoDive
is coded in Objective-C (~ < 3 sec on my MBP)# Import in R # Add the proper columns and rows names pops <- as.character(unique(sunflower.filtered$tidy.data$POP_ID)) # pop string genodive.fst <- readr::read_tsv( file = "genodive.fst.tsv", col_names = pops ) %>% as.matrix(.) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = TRUE, # FALSE because their's only the lower diag na.diag = TRUE, relative = FALSE ) %>% magrittr::set_colnames(x = ., value = c("POP1", "POP2", "GENODIVE")) genodive.fst
Obviously, here the timing gets doesn't reflect the whole process:
pops <- as.character(unique(sim.data1$tidy.data$POP_ID)) # pop string genodive.sim01.fst <- readr::read_tsv( file = list.files( path = ".", pattern = "genodive.sim01.fst.tsv", full.names = TRUE, recursive = TRUE ), col_names = pops ) %>% as.matrix(.) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = TRUE, # FALSE because their's only the lower diag na.diag = TRUE, relative = FALSE ) %>% magrittr::set_colnames(x = ., value = c("POP1", "POP2", "GENODIVE_SIM01")) pops <- as.character(unique(sim.data2$tidy.data$POP_ID)) # pop string genodive.sim02.fst <- readr::read_tsv( file = list.files( path = ".", pattern = "genodive.sim02.fst.tsv", full.names = TRUE, recursive = TRUE ), col_names = pops ) %>% as.matrix(.) %>% magrittr::set_rownames(x = ., value = pops) %>% radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = TRUE, # FALSE because their's only the lower diag na.diag = TRUE, relative = FALSE ) %>% magrittr::set_colnames(x = ., value = c("POP1", "POP2", "GENODIVE_SIM02"))
Luke Pembleton's StAMPP
package use genlight
object as input:
system.time( stampp.fst <- StAMPP::stamppFst( geno = sunflower.filtered$genlight, nboots = 1, percent = 95, nclusters = parallel::detectCores() - 1 ) %>% # The distance matrix need some love # if we want to compare it with the others: radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = FALSE, na.diag = TRUE, relative = FALSE ) %>% # we switch the POP id column here to match the others magrittr::set_colnames(x = ., value = c("POP2", "POP1", "STAMPP")) %>% dplyr::arrange(POP1, POP2) ) #> 39.281 sec!
stampp.sim01.fst <- StAMPP::stamppFst( geno = sim.data1$genlight, nboots = 1, percent = 95, nclusters = parallel::detectCores() - 1 ) %>% # The distance matrix need some love # if we want to compare it with the others: radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = FALSE, na.diag = TRUE, relative = FALSE ) %>% # we switch the POP id column here to match the others magrittr::set_colnames(x = ., value = c("POP2", "POP1", "STAMPP_SIM01")) %>% dplyr::arrange(POP1, POP2) stampp.sim02.fst <- StAMPP::stamppFst( geno = sim.data2$genlight, nboots = 1, percent = 95, nclusters = parallel::detectCores() - 1 ) %>% # The distance matrix need some love # if we want to compare it with the others: radiator::distance2tibble( x = ., remove.diag = TRUE, remove.lower = FALSE, na.diag = TRUE, relative = FALSE ) %>% # we switch the POP id column here to match the others magrittr::set_colnames(x = ., value = c("POP2", "POP1", "STAMPP_SIM02")) %>% dplyr::arrange(POP1, POP2)
I'm not particularly fond of VCFtools for Fst. Too much manipulation of files. For the sake of the exercise I'm going to compare it quickly with 2 populations for the empirical dataset only.
pop1 <- dplyr::filter(strata, STRATA == "G100") %>% dplyr::select(INDIVIDUALS) %>% readr::write_tsv(x = ., path = "pop1.txt", col_names = FALSE) pop2 <- dplyr::filter(strata, STRATA == "G102") %>% dplyr::select(INDIVIDUALS) %>% readr::write_tsv(x = ., path = "pop2.txt", col_names = FALSE)
the command used: ```{bash, eval = FALSE} vcftools --vcf sunflower.vcf --weir-fst-pop pop1.txt --weir-fst-pop pop2.txt --out pop1_vs_pop2
PLINK users, note that the `--fst` option in [PLINK v.1.9](https://www.cog-genomics.org/plink/1.9/basic_stats#fst) is actually a port of VCFtools.. For me, comparison with VCFtools stops here, too painfull to conduct more pairwise, getting the ouput ready for R, etc. This is prone to human error and too much work to automate for no benefit (below you'll understand how those VCFtools results compares with the other packages). ## Simulated datasets *not conducted* # assigner ## Empirical dataset ```r assigner.fst <- assigner::fst_WC84( data = sunflower.filtered$tidy.data, pairwise = TRUE, parallel.core = parallel::detectCores() - 1, verbose = TRUE ) %$% pairwise.fst %>% dplyr::rename(ASSIGNER = FST) %>% dplyr::select(-N_MARKERS) #> 49 sec assigner.fst
Learn more about assigner::fst_WC84
with the dedicated vignette.
assigner.sim01.fst <- assigner::fst_WC84( data = sim.data1$tidy.data, pairwise = TRUE, parallel.core = parallel::detectCores() - 1, verbose = TRUE ) %$% pairwise.fst %>% dplyr::rename(ASSIGNER_SIM01 = FST) %>% dplyr::select(-N_MARKERS) assigner.sim02.fst <- assigner::fst_WC84( data = sim.data2$tidy.data, pairwise = TRUE, parallel.core = parallel::detectCores() - 1, verbose = TRUE ) %$% pairwise.fst %>% dplyr::rename(ASSIGNER_SIM02 = FST) %>% dplyr::select(-N_MARKERS)
fst.summary <- suppressWarnings( assigner.fst %>% dplyr::full_join(diversity.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(genodive.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(hierfstat.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(stampp.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(stratag.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(snprelate.fst, by = c("POP1", "POP2")) )
If you haven't run the codes above:
fst.summary <- readr::read_tsv(file = "fst.summary.tsv", col_types = "ccnnnnnnnn") fst.summary
If we highlight the first line, pops G100 and G102:
dplyr::filter(fst.summary, dplyr::row_number() == 1L)
Gold Standard: HIERFSTAT 0.12
There's obviously differences when we look at the pairwise comparisons separately. The difference between the approaches taken in the packages/software probably depends on numerous things:
Let's look at it globally with a correlation matrix:
fst.corr <- round(cor(x = dplyr::select(fst.summary, -POP1, -POP2)), 6) fst.corr
I prefer to look at it in a tibble...
fst.corr.tibble <- radiator::distance2tibble( x = fst.corr, na.diag = TRUE, relative = FALSE, pop.levels = c("HIERFSTAT", "STRATAG", "ASSIGNER", "DIVERSITY", "STAMPP", "GENODIVE", "SNPRELATE_MEAN", "SNPRELATE_WEIGHTED")) %>% magrittr::set_colnames(x = ., value = c("ID1", "ID2", "CORRELATION")) %>% dplyr::arrange(dplyr::desc(CORRELATION)) fst.corr.tibble
Very interesting results!
Let's plot the results:
plot.fst <- fst.summary %>% tidyr::unite(data = ., col = PAIRS, POP1, POP2, sep = "-") %>% tidyr::gather(data = ., key = SOFTWARE, value = FST, -PAIRS) %>% dplyr::mutate( SOFTWARE = factor( x = SOFTWARE, levels = c("HIERFSTAT", "ASSIGNER", "STAMPP", "STRATAG", "DIVERSITY", "GENODIVE", "SNPRELATE_WEIGHTED", "SNPRELATE_MEAN"), ordered = TRUE)) pairs.levels <- dplyr::group_by(plot.fst, PAIRS) %>% dplyr::summarise(FST = mean(FST, na.rm = TRUE)) %>% dplyr::arrange(FST) %$% PAIRS under.the.sea.palette <- c("#1C2344", "#163D7D", "#118386", "#89D6D6", "#6E783D", "#FC564F", "#901546", "#FFD521")#, "#010101") plot.fst %<>% dplyr::mutate(PAIRS = factor(x = PAIRS, levels = pairs.levels, ordered = TRUE)) %>% ggplot2::ggplot(data = ., ggplot2::aes(x = FST, y = PAIRS, colour = SOFTWARE)) + ggplot2::geom_jitter(alpha = 0.8) + ggplot2::scale_colour_manual(values = under.the.sea.palette)
Generate a summary tibble:
fst.summary.sim01<- suppressWarnings( assigner.sim01.fst %>% dplyr::full_join(diversity.sim01.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(genodive.sim01.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(hierfstat.sim01.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(stampp.sim01.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(stratag.sim01.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(snprelate.sim01.fst, by = c("POP1", "POP2")) %>% dplyr::mutate_if(.tbl = ., .predicate = is.numeric, .funs = round, 4) )
fst.summary.sim01 <- readr::read_tsv(file = "fst.summary.sim01.tsv", col_types = "ccnnnnnnnn") fst.summary.sim01
Generate the correlation tibble:
fst.corr.tibble.sim01 <- round(cor(x = dplyr::select(fst.summary.sim01, -POP1, -POP2)), 6) %>% radiator::distance2tibble( x = ., na.diag = TRUE, relative = FALSE, pop.levels = c("HIERFSTAT_SIM01", "STRATAG_SIM01", "ASSIGNER_SIM01", "DIVERSITY_SIM01", "STAMPP_SIM01", "GENODIVE_SIM01", "SNPRELATE_MEAN_SIM01", "SNPRELATE_WEIGHTED_SIM01")) %>% magrittr::set_colnames(x = ., value = c("ID1", "ID2", "CORRELATION_SIM01")) %>% dplyr::arrange(dplyr::desc(CORRELATION_SIM01)) fst.corr.tibble.sim01
Generate a summary tibble:
fst.summary.sim02<- suppressWarnings( assigner.sim02.fst %>% dplyr::full_join(diversity.sim02.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(genodive.sim02.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(hierfstat.sim02.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(stampp.sim02.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(stratag.sim02.fst, by = c("POP1", "POP2")) %>% dplyr::full_join(snprelate.sim02.fst, by = c("POP1", "POP2")) %>% dplyr::mutate_if(.tbl = ., .predicate = is.numeric, .funs = round, 4) )
fst.summary.sim02 <- readr::read_tsv(file = "fst.summary.sim02.tsv", col_types = "ccnnnnnnnn") fst.summary.sim02
Generate the correlation tibble:
fst.corr.tibble.sim02 <- round(cor(x = dplyr::select(fst.summary.sim02, -POP1, -POP2)), 6) %>% radiator::distance2tibble( x = ., na.diag = TRUE, relative = FALSE, pop.levels = c("HIERFSTAT_SIM02", "STRATAG_SIM02", "ASSIGNER_SIM02", "DIVERSITY_SIM02", "STAMPP_SIM02", "GENODIVE_SIM02", "SNPRELATE_MEAN_SIM02", "SNPRELATE_WEIGHTED_SIM02")) %>% magrittr::set_colnames(x = ., value = c("ID1", "ID2", "CORRELATION_SIM02")) %>% dplyr::arrange(dplyr::desc(CORRELATION_SIM02)) fst.corr.tibble.sim02
The table timing was produced using the empirical dataset.
summary.table <- readr::read_tsv(file = "fst_comparisons_packages_features.tsv", col_types = readr::cols(.default = readr::col_character())) knitr::kable(summary.table) %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
Note:
You could go and look at the 36 pairwise comparisons, but the end of the story is very simple:
MeanFst
.SNPRelate
MeanFst as a systematic downward bias and the weighted estimate is
usually a little higher.assigner
and hierfstat
have equal values down to a very large number of decimals!assigner
and hierfstat
show consistent behaviors in the presence of
missing data.assigner
is different than the others: it transform -
Fst to 0.In the absence of missing data, because overall Fst values are similar, in the end, it's worth looking at:
Please let me know if I did something wrong or forgot to mention an interesting feature of your favorite package.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.