R/dapc_infer.R

Defines functions dapc_infer

Documented in dapc_infer

#' Conduct an inference of \emph{k} prior to DAPC.
#'
#' Takes a long-format data table of genotypes and assist in a preliminary
#' inference of \emph{k}, the effective number of populations.
#' Inference of \emph{k} is facilitated through examination of the PCA screeplot
#' and through testing K-means testing.
#'
#' @param dat Data table: A long data table, e.g. like that imported from
#' \code{vcf2DT}. Genotypes can be coded as '/' separated characters
#' (e.g. '0/0', '0/1', '1/1'), or integers as Alt allele counts (e.g. 0, 1, 2).
#' Must contain the following columns,
#' \enumerate{
#'   \item The sampled individuals (see param \code{sampCol}).
#'   \item The locus ID (see param \code{locusCol}).
#'   \item The genotype column (see param \code{genoCol}).
#' }
#'
#' @param scaling Character: How should the data (loci) be scaled?
#' Set to \code{'covar'} to scale to mean = 0, but variance is not
#' adjusted, i.e. PCA on a covariance matrix. Set to \code{'corr'}
#' to scale to mean = 0 and variance = 1, i.e. PCA on a
#' correlation matrix. Set to \code{'patterson'} to use the
#' Patteron et al. (2006) normalisation. Set to \code{'none'} to
#' if you do not want to do any scaling before PCA.
#'
#' @param sampCol Character: The column name with the sampled individual information.
#' Default is \code{'SAMPLE'}.
#'
#' @param locusCol Character: The column name with the locus information.
#' Default is \code{'LOCUS'}.
#'
#' @param genoCol Character: The column name with the genotype information.
#' Default is \code{'GT'}.
#'
#' @param kTest Integer: A vector of the number of \emph(k) values to test.
#' Default is \code{1:10}.
#'
#' @param pTest Integer: A vector of the number of \emph(p) PC axes to
#' fit K-means with.
#'
#' @param screeMax Integer: The maximum number of PC axes to plot in the screeplot.
#'
#' @param plotLook Character: The look of the plot. Default = \code{'ggplot'}, the
#' typical gray background with gridlines produced by \code{ggplot2}. Alternatively,
#' when set to \code{'classic'}, produces a base R style plot.
#'
#' @return Returns a list:
#' \code{$tab} is a datatable of \emph{k} and \code{p} values examined and
#' associated BIC value. \code{$fit} contains the individual outputs from
#' \code{kmeans} for each combination of parameters fitted. \code{$plot} is
#' a ggplot object.
#'
#' @details DAPC was made popular in the population genetics/molecular ecology
#' community following Jombart et al.'s (2010) paper. The method uses a DA
#' to model the genetic differences among populations using PC axes of genotypes
#' as predictors.
#'
#' The choice of the number of PC axes to use as predictors of genetic
#' differences among populations should be determined using the \emph{k}-1 criterion
#' described in Thia (2022). This criterion is based on the findings of
#' Patterson et al. (2006) that only the leading \emph{k}-1 PC axes of a genotype
#' dataset capture biologically meaningful structure. Users can use the function
#' \code{genomalicious::dapc_infer} to examine eigenvalue screeplots and
#' perform K-means clustering with different parameters to infer the number of
#' biologically informative PC axes.
#'
#' Users should use examine both the screeplot of eigenvalues and the different
#' K-means plots produced. The screeplot typically exhibits a break in the scree
#' around the putative \emph{k}. Additionally, different parameterisations of
#' K-means clustering should also converge on a similar conclusion. Users may
#' also find it useful to visualise scatterplots, e.g., using \code{pca_genos}.
#'
#' This function can also be used to determine populations de novo if the user
#' has not a priori expectation of the number of populations and the designation
#' of individuals. See Miller et al. (2020) and Thia (2022) for distinction and
#' importance of a priori vs. de novo population designations. The function
#' returns all K-means solutions for all parameter combinations. After insepcting
#' the screeplot and the K-means solutions, the desired K-means fit can be
#' extracted from the returned object to obtain de novo population designations
#' for downstream analysis, e.g., with \code{genomalicious::dapc_fit}.
#'
#' @references
#' Jombart et al. (2010) BMC Genetics. DOI: 10.1186/1471-2156-11-94
#' Miller et al. (2020) Heredity. DOI: 10.1038/s41437-020-0348-2
#' Patterson et al. (2006) PLoS Genetics. DOI: 10.1371/journal.pgen.0020190
#' Thia (2022) Mol. Ecol. DOI: 10.1111/1755-0998.13706
#'
#' @examples
#' library(genomalicious)
#'
#' data(data_Genos)
#'
#' # Test 1 to 10 with 3, 10, 20, and 40 PC axes, plotting just the first 10
#' # eigenvalues from the PCA, with a ggplot flavour.
#' inferK <- dapc_infer(
#'    data_Genos,
#'    kTest=1:10L,
#'    pTest=c(3,10,20,40),
#'    screeMax=10L,
#'    plotLook='ggplot'
#' )
#'
#' # Tabulated statistics
#' inferK$tab
#'
#' # The K-means clustering results for k=3 fitted with p=3 PC axes
#' inferK$fit$`k=3,p=3`
#'
#' # The plot
#' inferK$plot
#'
#' @export
dapc_infer <- function(
    dat, scaling='covar', sampCol='SAMPLE', locusCol='LOCUS', genoCol='GT',
    kTest=1:10L, pTest, screeMax=20L, plotLook='ggplot'){
  # --------------------------------------------+
  # Libraries and assertions
  # --------------------------------------------+
  for(lib in c('data.table', 'dplyr', 'ggpubr')){ require(lib, character.only = TRUE)}

  # Check that scaling is specified
  if(!scaling %in% c('covar', 'corr', 'patterson', 'none')){
    stop('Argument `scaling`` is invalid. See: ?pca_genos')
  }

  # Get the class of the genotypes
  gtClass <- class(dat[[genoCol]])

  # Check that genotypes are characters or counts
  if(!gtClass %in% c('character', 'numeric', 'integer')){
    stop("Check that genotypes are coded as '/' separated characters or as
         counts of the Alt allele. See: ?pca_genos")
  }

  # Convert characters of separated alleles to counts
  if(gtClass=='character'){
    dat[[genoCol]] <- genoscore_converter(dat[[genoCol]])
  }

  # Convert numeric allele counts to integers
  if(gtClass=='numeric'){
    dat[[genoCol]] <- as.integer(dat[[genoCol]])
  }

  # Set the plot theme by plotLook
  if(plotLook=='ggplot'){
    plotTheme <- theme_gray() + theme(
      legend.position='top',
      axis.ticks.length = unit(0.2, 'cm'))
  } else if(plotLook=='classic'){
    plotTheme <- theme_bw() + theme(
      panel.grid.major=element_blank()
      , panel.grid.minor=element_blank()
      , text=element_text(colour='black')
      , legend.position='top'
      , axis.ticks.length=unit(0.2, 'cm'))
  }

  # --------------------------------------------+
  # Code
  # --------------------------------------------+

  # Fit the PCA
  pca <- pca_genos(dat, scaling=scaling, sampCol=sampCol, locusCol=locusCol, genoCol=genoCol)

  # Plot the eigenvalues
  gg.eig <- pca_plot(pca, type='scree', axisIndex = 1:screeMax, plotLook=plotLook)

  # Construct a table of all K and p number of axes to test
  compTab <- CJ(K=kTest, P=pTest)

  # Perform the test
  kTestList <- lapply(1:nrow(compTab), function(i){
    k <- compTab$K[i]
    p <- compTab$P[i]
    fit <- kmeans(x=pca$x[, 1:p], centers=k, nstart=10, iter.max=10)

    n <- nrow(pca$x)
    WSS <- fit$tot.withinss
    BIC <- n*log(WSS/n) + log(n) * k

    list(
      tab=data.table(K=k, P=p, P.PLOT=paste0("bolditalic(p)*bold('=",p,"')"), BIC=BIC),
      fit=fit
    )
  })

  # Table of statistics
  kfitTab <- lapply(kTestList, function(X) X$tab) %>%
    do.call('rbind', .)

  # Levels for plotting
  p.plot.levels <- kfitTab[, c('P','P.PLOT')] %>%
    copy %>%
    setorder(., P) %>%
    unique() %>%
    .[['P.PLOT']]

  kfitTab[, P.PLOT:=factor(P.PLOT, levels=p.plot.levels)]

  # List of results
  kfitList <- lapply(kTestList, function(X) X$fit)
  names(kfitList) <- compTab[, paste0('k=',K,',p=',P)]

  # Plot the fits
  gg.fit <- kfitTab %>% ggplot(., aes(x=K, y=BIC)) +
    plotTheme +
    geom_point(colour='grey20', size=2) +
    geom_path(colour='grey20') +
    facet_wrap(~P.PLOT, labeller=label_parsed, scale='free_y') +
    scale_x_continuous(breaks = ~round(unique(pretty(.))))

  # Final plot
  gg.infer <- ggarrange(
    gg.eig, gg.fit, nrow=2, ncol=1, heights=c(40,60)
  )

  # Return
  list(
    tab=kfitTab[, c('K','P','BIC')],
    fit=kfitList,
    plot=gg.infer
  ) %>% return()
}
j-a-thia/genomalicious documentation built on Oct. 19, 2024, 7:51 p.m.