inst/doc/popkin.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
library(popkin)
# rename for simplicity
X <- hgdp_subset
dim(X)

## -----------------------------------------------------------------------------
# shorten subpopulation labels
colnames(X)[colnames(X) == 'AFRICA'] <- 'AFR'
colnames(X)[colnames(X) == 'MIDDLE_EAST'] <- 'MDE'
colnames(X)[colnames(X) == 'EUROPE'] <- 'EUR'
colnames(X)[colnames(X) == 'CENTRAL_SOUTH_ASIA'] <- 'SAS'
colnames(X)[colnames(X) == 'EAST_ASIA'] <- 'EAS'
colnames(X)[colnames(X) == 'OCEANIA'] <- 'OCE'
colnames(X)[colnames(X) == 'AMERICA'] <- 'AMR'
# order roughly by distance from Africa
# (without crossing the Atlantic Ocean)
pop_order <- c('AFR', 'MDE', 'EUR', 'SAS', 'EAS', 'OCE', 'AMR')
# applies reordering
X <- X[, order(match(colnames(X), pop_order))]
# extract subpopulations vector
subpops <- colnames(X)

## -----------------------------------------------------------------------------
X[1:10,1:15]

## -----------------------------------------------------------------------------
kinship <- popkin(X, subpops)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
plot_popkin(
    kinship,
    labs = subpops,
    # shared bottom and left margin value, to make space for labels
    mar = 1
)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
plot_popkin(
    inbr_diag( kinship ),
    labs = subpops,
    mar = 1
)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
plot_popkin(
    inbr_diag(kinship),
    labs = subpops,
    labs_even = TRUE,
    labs_line = 1,
    labs_cex = 0.7,
    mar = 2
)

## -----------------------------------------------------------------------------
# get weights
w <- weights_subpops(subpops)
# compute FST!
# Note: don't use the output to inbr_diag(kinship) or FST will be wrong!
fst(kinship, w)

## ---- fig.width = 4, fig.height = 2, fig.align = 'center'---------------------
# vector of inbreeding coefficients
inbrs <- inbr(kinship)
# quick plot
# adjust margins
par(mar = c(4, 4, 0, 0.2) + 0.2)
# see their distribution
plot(density(inbrs), xlab = 'inbreeding coefficient', main = '')

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
# compute pairwise FST matrix from kinship matrix
pairwise_fst <- pwfst(kinship)
# fancy legend label
leg_title <- expression(paste('Pairwise ', F[ST]))
# NOTE no need for inbr_diag() here!
plot_popkin(
    pairwise_fst,
    labs = subpops,
    labs_even = TRUE,
    labs_line = 1,
    labs_cex = 0.7,
    leg_title = leg_title,
    mar = c(2, 0.2)
)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
indexes_AFR <- subpops == 'AFR'
# AFR subset of the kinship matrix
kinship_AFR <- kinship[indexes_AFR, indexes_AFR]

# kinship matrix plot
plot_popkin(
    inbr_diag( kinship_AFR ),
    mar = 0
)

# estimate FST before rescaling (this value will be wrong, too high!)
fst( kinship_AFR )

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
# rescale kinship_AFR
# since subpops is missing, minimum kinship value is set to zero
# (no averaging between subpopulations)
kinship_AFR <- rescale_popkin( kinship_AFR )

# kinship matrix plot
plot_popkin(
    inbr_diag( kinship_AFR ),
    mar = c(0, 0.3)
)

# FST is now correct, relative to the MRCA of AFR individuals
fst( kinship_AFR )

## ---- fig.width = 6, fig.height = 2.8, fig.align = 'center'-------------------
# dummy labels to have lines in second panel
subpops_AFR <- subpops[ indexes_AFR ]
plot_popkin(
    # inbr_diag also works on a list of matrices
    inbr_diag( list( kinship, kinship_AFR ) ),
    # title of each panel
    titles = c('All', 'AFR only, rescaled'),
    # pass per-panel labels using a list
    labs = list( subpops, subpops_AFR ),
    # scalar options are shared across panels
    labs_even = TRUE,
    labs_line = 1,
    labs_cex = 0.5,
    # second value is top margin (to make space for titles)
    mar = c(2, 2)
)

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
# create second level of labels
# first copy first-level labels
blocks <- subpops
# first block is AFR
blocks[blocks == 'AFR'] <- 'B1'
# second block is West Eurasians, broadly defined
blocks[blocks == 'MDE'] <- 'B2'
blocks[blocks == 'EUR'] <- 'B2'
blocks[blocks == 'SAS'] <- 'B2'
# third block is East Eurasians, broadly defined
blocks[blocks == 'EAS'] <- 'B3'
blocks[blocks == 'OCE'] <- 'B3'
blocks[blocks == 'AMR'] <- 'B3'

# plotting with different options per level is more complicated...
plot_popkin(
	inbr_diag( kinship ),
	labs = cbind( subpops, blocks ), # ... labs is now a matrix with levels on columns
	labs_even = c(TRUE, FALSE),      # ... even spacing for first level only
	labs_line = c(1, 2),             # ... put second level further out
	labs_cex = c(0.7, 1),            # ... don't shrink second level
	labs_sep = c(FALSE, TRUE),       # ... draw lines inside heatmap for second level only
	ylab_adj = 0.65,                 # push up outer margin ylab "Individuals"
    mar = 3                          # increase margins again
	)

## ---- fig.width = 6, fig.height = 2.8, fig.align = 'center'-------------------
plot_popkin(
    inbr_diag(list(kinship, kinship_AFR)),
	titles = c('All', 'AFR only, rescaled'),
    # list of matrices
	labs = list( cbind(subpops, blocks), subpops_AFR ),
    # non-list: values are reused for both panels
	labs_even = c(TRUE, FALSE),
	labs_line = c(1, 2),
	# make label bigger in second panel (custom per-panel values)
    # list of vectors
	labs_cex = list(c(0.5, 0.7), 1),
	# add lines for first level of second panel (custom per-panel values)
    # list of vectors
	labs_sep = list(c(FALSE, TRUE), TRUE),
    mar = c(3, 2)
	)

## ---- fig.width = 6.5, fig.height = 2.8, fig.align = 'center'-----------------
plot_popkin(
    inbr_diag(list(kinship, kinship_AFR)),
	titles = c('All', 'AFR only, rescaled'),
	labs = list( cbind(subpops, blocks), subpops_AFR ),
	labs_even = c(TRUE, FALSE),
	labs_line = c(1, 2),
	labs_cex = list(c(0.5, 0.7), 1),
	labs_sep = list(c(FALSE, TRUE), TRUE),
    mar = c(3, 2),
    # this option adds a legend per panel
    leg_per_panel = TRUE
	)

## -----------------------------------------------------------------------------
# number of loci (rows)
m_loci <- nrow( X )
# number of subpopulations (columns)
k_subpops <- length( pop_order )
# initialize matrix
P <- matrix( NA, nrow = m_loci, ncol = k_subpops )
# copy names from data
colnames( P ) <- pop_order
rownames( P ) <- rownames( X )
# navigate subpopulations
for ( u in 1 : k_subpops ) {
    # subpopulation label name
    pop <- pop_order[ u ]
    # columns of interest
    js <- subpops == pop
    # now average genotypes into allele frequency estimates and store
    P[ , u ] <- rowMeans( X[ , js ], na.rm = TRUE ) / 2
}

## -----------------------------------------------------------------------------
coancestry <- popkin_af( P )

## ---- fig.width = 4.2, fig.height = 3, fig.align = 'center'-------------------
# coancestry matrix plot
# NOTE: inbr_diag() is not needed for coancestry!
plot_popkin(
    coancestry,
    mar = 3,
    names = TRUE,
    ylab = 'Subpopulations'
)

## -----------------------------------------------------------------------------
# some subpopulation sizes
n1 <- 10
n2 <- n1
n3 <- 20
# construct unadmixed individuals
Q_pop1 <- matrix( c( 1, 0 ), ncol = 2, nrow = n1, byrow = TRUE )
Q_pop2 <- Q_pop1[ , 2:1 ] # reverse columns
# random admixture proportions
Q_admix <- rev( sort( runif( n3 ) ) )
Q_admix <- cbind( Q_admix, 1 - Q_admix )
# combine data for all populations
# add some informative row/col names too
rownames( Q_pop1 ) <- paste0( 'S1-ind', 1 : n1 )
rownames( Q_pop2 ) <- paste0( 'S2-ind', 1 : n2 )
rownames( Q_admix ) <- paste0( 'S3-ind', 1 : n3 )
Q <- rbind( Q_pop1, Q_pop2, Q_admix )
colnames( Q ) <- c('A1', 'A2')
# create subpopulation labels for later
labs1 <- c( rep.int( 'S1', n1 ), rep.int( 'S2', n2 ), rep.int( 'S3', n3 ) )
labs2 <- c( rep.int( 'Unadmixed', n1+n2 ), rep.int( 'Admixed', n3 ) )

## ---- fig.width = 6.5, fig.height = 2, fig.align = 'center'-------------------
# adjust margins
par(mar = c(2, 3, 1, 0.2) + 0.2)
# actual plot
plot_admix( Q )

## ---- fig.width = 6.5, fig.height = 3, fig.align = 'center'-------------------
# adjust margins
par(mar = c(6, 3, 1, 0.2) + 0.2)
# actual plot
plot_admix( Q, names = TRUE, xlab_line = 5 )

## ---- fig.width = 6.5, fig.height = 2.2, fig.align = 'center'-----------------
# adjust margins
par(mar = c(2, 3, 1, 0.2) + 0.2)
# actual plot
plot_admix( Q, labs = labs1 )

## ---- fig.width = 6.5, fig.height = 2.3, fig.align = 'center'-----------------
# adjust margins
par(mar = c(3, 3, 1, 0.2) + 0.2)
# actual plot
plot_admix( Q, labs = cbind( labs1, labs2 ), labs_line = c(0, 1), xlab_line = 2 )

Try the popkin package in your browser

Any scripts or data that you put into this service are public.

popkin documentation built on Jan. 7, 2023, 1:26 a.m.