bnpsd.R
In bnpsd: Simulate Genotypes from the BN-PSD Admixture Model

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
# this package
library(bnpsd)
# for nice colors
library(RColorBrewer)
# for visualizing coancestry matrix with plot_popkin, and other handy functions
library(popkin)

# dimensions of data/model
# number of individuals (NOTE this is 10x less than in publication!)
n_ind <- 100
# number of intermediate subpops
k_subpops <- 10

# define population structure
# subpopulation FST vector, up to a scalar
inbr_subpops <- 1 : k_subpops
# desired bias coefficient
bias_coeff <- 0.5
# desired FST for the admixed individuals
Fst <- 0.1
# admixture proportions from 1D geography
obj <- admix_prop_1d_linear(
    n_ind = n_ind,
    k_subpops = k_subpops,
    bias_coeff = bias_coeff,
    coanc_subpops = inbr_subpops,
    fst = Fst
)
admix_proportions <- obj$admix_proportions
# rescaled inbreeding vector for intermediate subpopulations
inbr_subpops <- obj$coanc_subpops

# get pop structure parameters of the admixed individuals
coancestry <- coanc_admix(admix_proportions, inbr_subpops)

# verify that we got the desired FST!
fst_admix(admix_proportions, inbr_subpops)

# the mean of the diagonal of the coancestry matrix also equals FST
inbr <- diag(coancestry)
fst(inbr) # `fst` is a function in the popkin package

# verify that we got the desired `bias_coeff` too!
mean(coancestry) / Fst

## ---- fig.width = 4, fig.height = 2, fig.align = 'center'---------------------
# visualize the per-subpopulation inbreeding coefficients (FSTs)
# shrink default margins
par(mar = c(4, 4, 0, 0) + 0.2)
# colors for independent subpopulations
col_subpops <- brewer.pal(k_subpops, "Paired")

barplot(inbr_subpops, col = col_subpops, names.arg = 1 : k_subpops, ylim = c(0, 1),
	xlab = 'Subpopulation', ylab = 'Inbreeding coeff.')

## ---- fig.width = 6, fig.height = 1.5, fig.align = 'center'-------------------
# visualize the admixture proportions
# shrink default margins
par(mar = c(1, 4, 0, 0) + 0.2)
barplot(
    t(admix_proportions),
    col = col_subpops,
    border = NA,
    space = 0,
    ylab = 'Admixture prop.'
)
mtext('Individuals', 1)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
# Visualize the coancestry matrix using "popkin"!
plot_popkin(
    coancestry,
    # zero inner margin (plus padding) because we have no labels
    mar = 0,
    leg_title = 'Coancestry'
)

## -----------------------------------------------------------------------------
# number of loci in simulation (NOTE this is 30x less than in publication!)
m_loci <- 10000
# draw all random Allele Freqs (AFs) and genotypes
# reuse the previous inbr_subpops, admix_proportions
out <- draw_all_admix(
    admix_proportions = admix_proportions,
    inbr_subpops = inbr_subpops,
    m_loci = m_loci,
    # NOTE by default p_subpops and p_ind are not returned, but here we will ask for them
    want_p_subpops = TRUE,
    # NOTE: asking for `p_ind` increases memory usage substantially,
    # so don't ask for it unless you're sure you want it!
    want_p_ind = TRUE
)
# genotypes
X <- out$X
# ancestral AFs
p_anc <- out$p_anc
# intermediate independent subpopulation AFs
p_subpops <- out$p_subpops
# individual-specific AFs
p_ind <- out$p_ind

## ---- fig.width = 4, fig.height = 2, fig.align = 'center'---------------------
# inspect distribution of ancestral AFs (~ Uniform(0.01, 0.5))
# shrink default margins for these figures
par(mar = c(4, 4, 0, 0) + 0.2)
hist(p_anc, xlab = 'Ancestral AF', main = '', xlim = c(0, 1))

# distribution of intermediate population AFs
# (all subpopulations combined)
# (will be more dispersed toward 0 and 1 than ancestral AFs)
hist(p_subpops, xlab = 'Intermediate Subpopulation AF', main = '', xlim = c(0, 1))

# distribution of individual-specific AFs (admixed individuals)
# (admixture reduces differentiation, so these resemble ancestral AFs a bit more)
hist(p_ind, xlab = 'Individual-specific AF', main = '', xlim = c(0, 1))

# genotype distribution of admixed individuals
barplot(table(X), xlab = 'Genotypes', ylab = 'Frequency', col = 'white')

## ---- fig.width = 6, fig.height = 2.8, fig.align = 'center'-------------------
# for best estimates, group individuals into subpopulations using the geography
# this averages more individuals in estimating the minimum kinship
subpops <- ceiling( ( 1 : n_ind ) / n_ind * k_subpops )
table(subpops) # got k_subpops = 10 with 100 individuals each
# now estimate kinship using popkin
kinship_estimate <- popkin(X, subpops)
# replace diagonal with inbreeding coeffs. to match coancestry matrix
coancestry_estimate <- inbr_diag(kinship_estimate)

# Visualize the coancestry matrix using "popkin"!
plot_popkin(
    list( coancestry, coancestry_estimate ),
    titles = c('Truth', 'Estimate'),
    # second value is top margin, for panel titles
    mar = c(0, 2.5),
    leg_title = 'Coancestry'
)

## -----------------------------------------------------------------------------
coancestry_subpops_estimate <- popkin_af( p_subpops )
coancestry_ind_estimate <- popkin_af( p_ind )

## ---- fig.width = 6, fig.height = 2.8, fig.align = 'center'-------------------
plot_popkin(
    list( diag( inbr_subpops ), coancestry_subpops_estimate ),
    titles = c('Truth', 'Estimate'),
    mar = c(0, 2.5),
    leg_title = 'Coancestry'
)

## ---- fig.width = 6, fig.height = 2.8, fig.align = 'center'-------------------
plot_popkin(
    list( coancestry, coancestry_ind_estimate ),
    titles = c('Truth', 'Estimate'),
    mar = c(0, 2.5),
    leg_title = 'Coancestry'
)

## -----------------------------------------------------------------------------
# reuse the previous m_loci, inbr_subpops, admix_proportions
# draw ancestral AFs
p_anc <- draw_p_anc(m_loci)
# draw intermediate independent subpopulations AFs
p_subpops <- draw_p_subpops(p_anc, inbr_subpops)
# draw individual-specific AFs
p_ind <- make_p_ind_admix(p_subpops, admix_proportions)
# draw genotypes
X <- draw_genotypes_admix(p_ind)

## ---- fig.width = 4, fig.height = 2, fig.align = 'center'---------------------
# this increases the proportion of rare alleles
p_anc_beta <- draw_p_anc(m_loci, beta = 0.1)
# shrink default margins for these figures
par(mar = c(4, 4, 0, 0) + 0.2)
hist(p_anc_beta, xlab = 'Ancestral AF', main = '', xlim = c(0, 1))

## ---- fig.width = 4, fig.height = 2, fig.align = 'center'---------------------
# this increases the proportion of rare alleles
p_anc_beta <- rbeta(m_loci, shape1 = 0.1, shape2 = 1)
# shrink default margins for these figures
par(mar = c(4, 4, 0, 0) + 0.2)
hist(p_anc_beta, xlab = 'Ancestral AF', main = '', xlim = c(0, 1))

## -----------------------------------------------------------------------------
# draw genotypes for one individual from the ancestral population
# use "cbind" to turn the vector p_anc into a column matrix
# ("draw_genotypes_admix" expects a matrix)
X_anc <- draw_genotypes_admix( cbind( p_anc ) )
# returns a column matrix:
dim(X_anc)

# draw genotypes from intermediate populations
# draws one individual per intermediate population
X_subpops <- draw_genotypes_admix(p_subpops)

## -----------------------------------------------------------------------------
# data dimensions
# number of individuals
n_ind <- 100
# number of intermediate subpops
k_subpops <- 10

# define population structure
# subpopulation FST vector, up to a scalar
inbr_subpops <- 1 : k_subpops
# desired bias coefficient
bias_coeff <- 0.5
# desired FST for the admixed individuals
Fst <- 0.1

# admixture proportions from *circular* 1D geography
obj <- admix_prop_1d_circular(
    n_ind = n_ind,
    k_subpops = k_subpops,
    bias_coeff = bias_coeff,
    coanc_subpops = inbr_subpops,
    fst = Fst
)
admix_proportions <- obj$admix_proportions
inbr_subpops <- obj$coanc_subpops

# get pop structure parameters of the admixed individuals
coancestry <- coanc_admix(admix_proportions, inbr_subpops)

# verify that we got the desired FST!
fst_admix(admix_proportions, inbr_subpops)

# verify that we got the desired bias_coeff too!
mean(coancestry) / Fst

## ---- fig.width = 4, fig.height = 1.2, fig.align = 'center'-------------------
# visualize the per-subpopulation inbreeding coefficients (FSTs)
# tweak margins/etc
par(mar = c(2.5, 2.5, 0.3, 0) + 0.2, lab = c(2, 1, 7), mgp = c(1.5, 0.5, 0))
# colors for independent subpopulations
col_subpops <- brewer.pal(k_subpops, "Paired")
barplot(inbr_subpops, col = col_subpops, names.arg = colnames(admix_proportions), xlab = 'Subpopulation', ylab = 'Inbr')

## ---- fig.width = 4, fig.height = 1, fig.align = 'center'---------------------
# visualize the admixture proportions
# tweak margins/etc
par(mar = c(1, 4, 0.4, 0) + 0.2, lab = c(2, 2, 7))
barplot(
    t(admix_proportions),
    col = col_subpops,
    border = NA,
    space = 0,
    ylab = 'Admix prop'
)
mtext('Individuals', 1)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
# Visualize the coancestry matrix using "popkin"!
plot_popkin(
    coancestry,
    leg_n = 3,
    mar = c(0, 0.4),
    leg_title = 'Coancestry'
)

## -----------------------------------------------------------------------------
# define population structure
# we'll have k_subpops = 3, each subpopulation with these sizes:
n1 <- 100
n2 <- 50
n3 <- 20

# here's the labels (for simplicity, list all individuals of S1 first, then S2, then S3)
labs <- c(
    rep.int('S1', n1),
    rep.int('S2', n2),
    rep.int('S3', n3)
)
# data dimensions infered from labs:

# number of individuals "n_ind"
length(labs)

# number of subpopulations "k_subpops"
k_subpops <- length(unique(labs))
k_subpops

# desired admixture matrix
admix_proportions <- admix_prop_indep_subpops(labs)

# got a numeric matrix with a single 1 value per row
# (denoting the sole subpopulation from which each individual draws its ancestry)
head(admix_proportions, 2)

# construct the intermediate subpopulation FST vector
# the desired final FST
Fst <- 0.2
# subpopulation FST vector, unnormalized so far
inbr_subpops <- 1 : k_subpops
# normalized to have the desired FST
# NOTE fst is a function in the `popkin` package
inbr_subpops <- inbr_subpops / fst(inbr_subpops) * Fst
# verify FST for the intermediate subpopulations
fst(inbr_subpops)

# get coancestry of the admixed individuals
coancestry <- coanc_admix(admix_proportions, inbr_subpops)
# before getting FST for individuals, weigh then inversely proportional to subpop sizes
weights <- weights_subpops(labs) # function from `popkin` package
# verify FST for individuals (same as for intermediate subpops for this pop structure)
fst_admix(admix_proportions, inbr_subpops, weights)

## ---- fig.width = 4, fig.height = 1.2, fig.align = 'center'-------------------
# visualize the per-subpopulation inbreeding coefficients (FSTs)
# tweak margins/etc
par(mar = c(2.5, 2.5, 0, 0) + 0.2, lab = c(2, 1, 7), mgp = c(1.5, 0.5, 0))
# colors for independent subpopulations
col_subpops <- brewer.pal(k_subpops, "Paired")
barplot(inbr_subpops, col = col_subpops, names.arg = colnames(admix_proportions), xlab = 'Subpopulation', ylab = 'Inbr')

## ---- fig.width = 4, fig.height = 1, fig.align = 'center'---------------------
# visualize the admixture proportions
# tweak margins/etc
par(mar = c(1, 4, 0.4, 0) + 0.2, lab = c(2, 2, 7))
barplot(
    t(admix_proportions),
    col = col_subpops,
    border = NA,
    space = 0,
    ylab = 'Admix prop'
)
mtext('Individuals', 1)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
# Visualize the coancestry matrix using "popkin"!
plot_popkin(
    coancestry,
    leg_n = 3,
    mar = c(0, 0.4),
    leg_title = 'Coancestry'
)

## -----------------------------------------------------------------------------
tree_str <- '(S1:0.1,(S2:0.1,(S3:0.1,(S4:0.1,S5:0.1)N3:0.1)N2:0.1)N1:0.1)T;'

## ---- fig.width = 4, fig.height = 3.5, fig.align = 'center'-------------------
# base package for phylogenetics
library(ape)
# parses tree, creates object of class "phylo"
tree_subpops <- read.tree( text = tree_str )
# plot tree
par( mar = c(3,0,0,0) + 0.2 )
plot( tree_subpops, show.node.label = TRUE )
# add axis and label
axisPhylo( backward = FALSE )
mtext( 'Coancestry', side = 1, line = 2 )

## -----------------------------------------------------------------------------
p_subpops_tree <- draw_p_subpops_tree(
    p_anc = p_anc,
    tree_subpops = tree_subpops
)

## ---- fig.width = 6, fig.height = 2.8, fig.align = 'center'-------------------
# estimate coancestry
coancestry_subpops_tree_estimate <- popkin_af( p_subpops_tree )
# expected coancestry according to model
coancestry_subpops_tree <- coanc_tree( tree_subpops )
# plot
plot_popkin(
    list( coancestry_subpops_tree, coancestry_subpops_tree_estimate ),
    titles = c('Truth', 'Estimate'),
    mar = c(2, 2.5),
    leg_title = 'Coancestry',
    names = TRUE
)

## -----------------------------------------------------------------------------
1 - (1 - 0.1)^4

## -----------------------------------------------------------------------------
# number of admixed individuals
n_ind <- 100
# number of subpopulations, extracted from true coancestry
k_subpops <- nrow( coancestry_subpops_tree )

# define population structure
# for simplicity here we set spread parameter `sigma` directly
# admixture proportions from 1D geography
admix_proportions <- admix_prop_1d_linear(
    n_ind = n_ind,
    k_subpops = k_subpops,
    sigma = 0.5
)

# get pop structure parameters of the admixed individuals
coancestry <- coanc_admix(admix_proportions, coancestry_subpops_tree)

# calculate true FST of simulation
fst_admix(admix_proportions, coancestry_subpops_tree)

## ---- fig.width = 6, fig.height = 1.5, fig.align = 'center'-------------------
# visualize the admixture proportions
# colors for these independent subpopulations
col_subpops <- brewer.pal(k_subpops, "Paired")
# shrink default margins
par(mar = c(1, 4, 0, 0) + 0.2)
barplot(
    t(admix_proportions),
    col = col_subpops,
    border = NA,
    space = 0,
    ylab = 'Admixture prop.'
)
mtext('Individuals', 1)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
# Visualize the coancestry matrix using "popkin"!
plot_popkin(
    coancestry,
    # zero inner margin (plus padding) because we have no labels
    mar = 0,
    leg_title = 'Coancestry'
)

## -----------------------------------------------------------------------------
# number of loci in simulation (NOTE this is 30x less than in publication!)
m_loci <- 10000
# draw all random Allele Freqs (AFs) and genotypes
# reuse the previous inbr_subpops, admix_proportions
out <- draw_all_admix(
    admix_proportions = admix_proportions,
    tree_subpops = tree_subpops,
    m_loci = m_loci
)
# genotypes
X <- out$X

## ---- fig.width = 6, fig.height = 2.8, fig.align = 'center'-------------------
# estimate kinship using popkin
# here we didn't use subpopulation labels for simplicity
kinship_estimate <- popkin(X)
# replace diagonal with inbreeding coeffs. to match coancestry matrix
coancestry_estimate <- inbr_diag(kinship_estimate)

# Visualize the coancestry matrix using "popkin"!
plot_popkin(
    list( coancestry, coancestry_estimate ),
    titles = c('Truth', 'Estimate'),
    # second value is top margin, for panel titles
    mar = c(0, 2.5),
    leg_title = 'Coancestry'
)