#' Conduct an inference of \emph{k} prior to DAPC.
#'
#' Takes a long-format data table of genotypes and assist in a preliminary
#' inference of \emph{k}, the effective number of populations.
#' Inference of \emph{k} is facilitated through examination of the PCA screeplot
#' and through testing K-means testing.
#'
#' @param dat Data table: A long data table, e.g. like that imported from
#' \code{vcf2DT}. Genotypes can be coded as '/' separated characters
#' (e.g. '0/0', '0/1', '1/1'), or integers as Alt allele counts (e.g. 0, 1, 2).
#' Must contain the following columns,
#' \enumerate{
#' \item The sampled individuals (see param \code{sampCol}).
#' \item The locus ID (see param \code{locusCol}).
#' \item The genotype column (see param \code{genoCol}).
#' }
#'
#' @param scaling Character: How should the data (loci) be scaled?
#' Set to \code{'covar'} to scale to mean = 0, but variance is not
#' adjusted, i.e. PCA on a covariance matrix. Set to \code{'corr'}
#' to scale to mean = 0 and variance = 1, i.e. PCA on a
#' correlation matrix. Set to \code{'patterson'} to use the
#' Patteron et al. (2006) normalisation. Set to \code{'none'} to
#' if you do not want to do any scaling before PCA.
#'
#' @param sampCol Character: The column name with the sampled individual information.
#' Default is \code{'SAMPLE'}.
#'
#' @param locusCol Character: The column name with the locus information.
#' Default is \code{'LOCUS'}.
#'
#' @param genoCol Character: The column name with the genotype information.
#' Default is \code{'GT'}.
#'
#' @param kTest Integer: A vector of the number of \emph(k) values to test.
#' Default is \code{1:10}.
#'
#' @param pTest Integer: A vector of the number of \emph(p) PC axes to
#' fit K-means with.
#'
#' @param screeMax Integer: The maximum number of PC axes to plot in the screeplot.
#'
#' @param plotLook Character: The look of the plot. Default = \code{'ggplot'}, the
#' typical gray background with gridlines produced by \code{ggplot2}. Alternatively,
#' when set to \code{'classic'}, produces a base R style plot.
#'
#' @return Returns a list:
#' \code{$tab} is a datatable of \emph{k} and \code{p} values examined and
#' associated BIC value. \code{$fit} contains the individual outputs from
#' \code{kmeans} for each combination of parameters fitted. \code{$plot} is
#' a ggplot object.
#'
#' @details DAPC was made popular in the population genetics/molecular ecology
#' community following Jombart et al.'s (2010) paper. The method uses a DA
#' to model the genetic differences among populations using PC axes of genotypes
#' as predictors.
#'
#' The choice of the number of PC axes to use as predictors of genetic
#' differences among populations should be determined using the \emph{k}-1 criterion
#' described in Thia (2022). This criterion is based on the findings of
#' Patterson et al. (2006) that only the leading \emph{k}-1 PC axes of a genotype
#' dataset capture biologically meaningful structure. Users can use the function
#' \code{genomalicious::dapc_infer} to examine eigenvalue screeplots and
#' perform K-means clustering with different parameters to infer the number of
#' biologically informative PC axes.
#'
#' Users should use examine both the screeplot of eigenvalues and the different
#' K-means plots produced. The screeplot typically exhibits a break in the scree
#' around the putative \emph{k}. Additionally, different parameterisations of
#' K-means clustering should also converge on a similar conclusion. Users may
#' also find it useful to visualise scatterplots, e.g., using \code{pca_genos}.
#'
#' This function can also be used to determine populations de novo if the user
#' has not a priori expectation of the number of populations and the designation
#' of individuals. See Miller et al. (2020) and Thia (2022) for distinction and
#' importance of a priori vs. de novo population designations. The function
#' returns all K-means solutions for all parameter combinations. After insepcting
#' the screeplot and the K-means solutions, the desired K-means fit can be
#' extracted from the returned object to obtain de novo population designations
#' for downstream analysis, e.g., with \code{genomalicious::dapc_fit}.
#'
#' @references
#' Jombart et al. (2010) BMC Genetics. DOI: 10.1186/1471-2156-11-94
#' Miller et al. (2020) Heredity. DOI: 10.1038/s41437-020-0348-2
#' Patterson et al. (2006) PLoS Genetics. DOI: 10.1371/journal.pgen.0020190
#' Thia (2022) Mol. Ecol. DOI: 10.1111/1755-0998.13706
#'
#' @examples
#' library(genomalicious)
#'
#' data(data_Genos)
#'
#' # Test 1 to 10 with 3, 10, 20, and 40 PC axes, plotting just the first 10
#' # eigenvalues from the PCA, with a ggplot flavour.
#' inferK <- dapc_infer(
#' data_Genos,
#' kTest=1:10L,
#' pTest=c(3,10,20,40),
#' screeMax=10L,
#' plotLook='ggplot'
#' )
#'
#' # Tabulated statistics
#' inferK$tab
#'
#' # The K-means clustering results for k=3 fitted with p=3 PC axes
#' inferK$fit$`k=3,p=3`
#'
#' # The plot
#' inferK$plot
#'
#' @export
dapc_infer <- function(
dat, scaling='covar', sampCol='SAMPLE', locusCol='LOCUS', genoCol='GT',
kTest=1:10L, pTest, screeMax=20L, plotLook='ggplot'){
# --------------------------------------------+
# Libraries and assertions
# --------------------------------------------+
for(lib in c('data.table', 'dplyr', 'ggpubr')){ require(lib, character.only = TRUE)}
# Check that scaling is specified
if(!scaling %in% c('covar', 'corr', 'patterson', 'none')){
stop('Argument `scaling`` is invalid. See: ?pca_genos')
}
# Get the class of the genotypes
gtClass <- class(dat[[genoCol]])
# Check that genotypes are characters or counts
if(!gtClass %in% c('character', 'numeric', 'integer')){
stop("Check that genotypes are coded as '/' separated characters or as
counts of the Alt allele. See: ?pca_genos")
}
# Convert characters of separated alleles to counts
if(gtClass=='character'){
dat[[genoCol]] <- genoscore_converter(dat[[genoCol]])
}
# Convert numeric allele counts to integers
if(gtClass=='numeric'){
dat[[genoCol]] <- as.integer(dat[[genoCol]])
}
# Set the plot theme by plotLook
if(plotLook=='ggplot'){
plotTheme <- theme_gray() + theme(
legend.position='top',
axis.ticks.length = unit(0.2, 'cm'))
} else if(plotLook=='classic'){
plotTheme <- theme_bw() + theme(
panel.grid.major=element_blank()
, panel.grid.minor=element_blank()
, text=element_text(colour='black')
, legend.position='top'
, axis.ticks.length=unit(0.2, 'cm'))
}
# --------------------------------------------+
# Code
# --------------------------------------------+
# Fit the PCA
pca <- pca_genos(dat, scaling=scaling, sampCol=sampCol, locusCol=locusCol, genoCol=genoCol)
# Plot the eigenvalues
gg.eig <- pca_plot(pca, type='scree', axisIndex = 1:screeMax, plotLook=plotLook)
# Construct a table of all K and p number of axes to test
compTab <- CJ(K=kTest, P=pTest)
# Perform the test
kTestList <- lapply(1:nrow(compTab), function(i){
k <- compTab$K[i]
p <- compTab$P[i]
fit <- kmeans(x=pca$x[, 1:p], centers=k, nstart=10, iter.max=10)
n <- nrow(pca$x)
WSS <- fit$tot.withinss
BIC <- n*log(WSS/n) + log(n) * k
list(
tab=data.table(K=k, P=p, P.PLOT=paste0("bolditalic(p)*bold('=",p,"')"), BIC=BIC),
fit=fit
)
})
# Table of statistics
kfitTab <- lapply(kTestList, function(X) X$tab) %>%
do.call('rbind', .)
# Levels for plotting
p.plot.levels <- kfitTab[, c('P','P.PLOT')] %>%
copy %>%
setorder(., P) %>%
unique() %>%
.[['P.PLOT']]
kfitTab[, P.PLOT:=factor(P.PLOT, levels=p.plot.levels)]
# List of results
kfitList <- lapply(kTestList, function(X) X$fit)
names(kfitList) <- compTab[, paste0('k=',K,',p=',P)]
# Plot the fits
gg.fit <- kfitTab %>% ggplot(., aes(x=K, y=BIC)) +
plotTheme +
geom_point(colour='grey20', size=2) +
geom_path(colour='grey20') +
facet_wrap(~P.PLOT, labeller=label_parsed, scale='free_y') +
scale_x_continuous(breaks = ~round(unique(pretty(.))))
# Final plot
gg.infer <- ggarrange(
gg.eig, gg.fit, nrow=2, ncol=1, heights=c(40,60)
)
# Return
list(
tab=kfitTab[, c('K','P','BIC')],
fit=kfitList,
plot=gg.infer
) %>% return()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.