Intro

Developements of the fpga packages were heavily inspired by gstudio. While gstudio provides all of the functions and much more use friendliness, we were faced with the challenge to analyse several thousands of indivudals from hunderts of populations and gstudio was to slow. This vignette severs as a comparison between fpga and gstudio.

Data

We use the arapat data from gstudio:

library(abind)
library(fpga)
library(gstudio)


# We have 
# - 1000 Individuals
# - with 3 loci
# - 200 possible allels
# - in 10 populations

set.seed(111)
dat0 <- data.frame(
  pop = factor(as.character(rep(1:9, each = 100))), 
  loc1 = do.call(c, lapply(split(sample(c(1:100, NA), 900 * 2, TRUE), rep(1:900, each = 2)), locus)), 
  loc2 = do.call(c, lapply(split(sample(c(200:250, NA), 900 * 2, TRUE), rep(1:900, each = 2)), locus)), 
  loc3 = do.call(c, lapply(split(sample(c(1:200), 900 * 2, TRUE), rep(1:900, each = 2)), locus))
)

## With NA
dat0 <- data.frame(
  pop = factor(as.character(rep(1:3, each = 10))), 
  loc1 = do.call(c, lapply(split(sample(c(1:10, NA), 30 * 2, TRUE), rep(1:30, each = 2)), locus)), 
#  loc2 = do.call(c, lapply(split(sample(c(200:205, NA), 30 * 2, TRUE), rep(1:30, each = 2)), locus)), 
  loc3 = do.call(c, lapply(split(sample(c(1:4), 30 * 2, TRUE), rep(1:30, each = 2)), locus))
)

# No NA
dat0 <- data.frame(
  pop = factor(as.character(rep(1:3, each = 10))), 
  loc1 = do.call(c, lapply(split(sample(c(1:10), 30 * 2, TRUE), rep(1:30, each = 2)), locus)), 
#  loc2 = do.call(c, lapply(split(sample(c(200:205, NA), 30 * 2, TRUE), rep(1:30, each = 2)), locus)), 
  loc3 = do.call(c, lapply(split(sample(c(1:4), 30 * 2, TRUE), rep(1:30, each = 2)), locus))
)

head(dat0)

dat2 <- gstudio2fpga(dat0)
identical(dat1, dat2)
dat1
dat2
dat00 <- fpga2gstudio(dat1)
identical(dat0, dat00)


# data for fpga
x <- lapply(column_class(dat0, class = "locus"), function(i) apply(alleles(dat0[[i]]), 2, as.integer))
dat1 <- abind(list(sapply(x, function(y) y[, 1]), 
                  sapply(x, function(y) y[, 2])), along = 3)

Genetic Metrics

Frequencies

gstudio::frequencies(dat0)
freq(dat1)

mean(frequencies(dat0)$Frequency) == mean(freq(dat1)$freq)
sum(frequencies(dat0)$Frequency) 
sum(freq(dat1)$freq)

Number of alleles:

system.time(na0 <- A(dat0))
system.time(na1 <- n_alls(dat1))

na0$A == na1

Observed heterozygosity:

system.time(ho0 <- Ho(dat0))
system.time(ho1 <- het_obs(dat1))
ho0$Ho == ho1

Expected heterozygosity:

system.time(he0 <- He(dat0))
system.time(he1 <- het_exp(dat1))
he0$He == he1

Effective number of allels:

system.time(na0 <- Ae(dat0))
system.time(na1 <- n_eff_alls(dat1))

na0$Ae == na1

Adding a stratum

All of the above function can be stratified by population, e.g.,

system.time(na0 <- genetic_diversity(dat0, stratum = "pop", mode = "A"))
system.time(na1 <- n_alls(dat1, dat0[, 1]))
sum(na0$A) == sum(na1)

Genetic structure

gst

dest

Pairwise genetic structure

GST



jmsigner/fpga documentation built on May 19, 2019, 1:56 p.m.