R/snapclust.R
In adegenet: Exploratory Analysis of Genetic and Genomic Data

Documented in snapclust

#' Maximum-likelihood genetic clustering using EM algorithm
#'
#' This function implements the fast maximum-likelihood genetic clustering
#' approach described in Beugin et al (2018). The underlying model is very close
#' to the model implemented by STRUCTURE, but allows for much faster estimation
#' of genetic clusters thanks to the use of the Expectation-Maximization (EM)
#' algorithm. Optionally, the model can explicitely account for hybridization
#' and detect different types of hybrids (see \code{hybrids} and
#' \code{hybrid.coef} arguments). The method is fully documented in a dedicated
#' tutorial which can be accessed using \code{adegenetTutorial("snapclust")}.
#'
#' @details The method is described in Beugin et al (2018) A fast likelihood
#'   solution to the genetic clustering problem. Methods in Ecology and
#'   Evolution \doi{10.1111/2041-210X.12968}. A dedicated
#'   tutorial is available by typing \code{adegenetTutorial("snapclust")}.
#'
#' @seealso The function \code{\link{snapclust.choose.k}} to investigate the optimal
#'   value number of clusters 'k'. 
#' 
#' @author Thibaut Jombart \email{thibautjombart@@gmail.com} and Marie-Pauline
#' Beugin
#'
#' @export
#'
#' @rdname snapclust
#'
#' @param x a \linkS4class{genind} object
#'
#' @param k the number of clusters to look for
#'
#' @param pop.ini parameter indicating how the initial group membership should
#'     be found. If \code{NULL}, groups are chosen at random, and the algorithm
#'     will be run \code{n.start times}. If "kmeans", then the function
#'     \code{find.clusters} is used to define initial groups using the K-means
#'     algorithm. If "ward", then the function \code{find.clusters} is used to
#'     define initial groups using the Ward algorithm. Alternatively, a factor
#'     defining the initial cluster configuration can be provided.
#'
#' @param max.iter the maximum number of iteration of the EM algorithm
#'
#' @param n.start the number of times the EM algorithm is run, each time with
#' different random starting conditions
#'
#' @param n.start.kmeans the number of times the K-means algorithm is run to
#'     define the starting point of the ML-EM algorithm, each time with
#'     different random starting conditions
#'
#' @param hybrids a logical indicating if hybrids should be modelled
#'     explicitely; this is currently implemented for 2 groups only.
#'
#' @param dim.ini the number of PCA axes to retain in the dimension reduction
#'     step for \code{\link{find.clusters}}, if this method is used to define
#'     initial group memberships (see argument \code{pop.ini}).
#'
#' @param hybrid.coef a vector of hybridization coefficients, defining the
#'     proportion of hybrid gene pool coming from the first parental population;
#'     this is symmetrized around 0.5, so that e.g. c(0.25, 0.5) will be
#'     converted to c(0.25, 0.5, 0.75)
#'
#' @param parent.lab a vector of 2 character strings used to label the two
#'     parental populations; only used if hybrids are detected (see argument
#'     \code{hybrids})
#'
#' @param ... further arguments passed on to \code{\link{find.clusters}}
#'
#' @return
#'
#' The function \code{snapclust} returns a list with the following
#' components:
#' \itemize{
#'
#' \item \code{$group} a factor indicating the maximum-likelihood assignment of
#' individuals to groups; if identified, hybrids are labelled after
#' hybridization coefficients, e.g. 0.5_A - 0.5_B for F1, 0.75_A - 0.25_B for
#' backcross F1 / A, etc.
#'
#' \item \code{$ll}: the log-likelihood of the model
#'
#' \item \code{$proba}: a matrix of group membership probabilities, with
#' individuals in rows and groups in columns; each value correspond to the
#' probability that a given individual genotype was generated under a given
#' group, under Hardy-Weinberg hypotheses.
#'
#' \item \code{$converged} a logical indicating if the algorithm converged; if
#' FALSE, it is doubtful that the result is an actual Maximum Likelihood
#' estimate.
#'
#' \item \code{$n.iter} an integer indicating the number of iterations the EM
#' algorithm was run for.
#'
#' }
#'
#' @examples
#' \dontrun{
#' data(microbov)
#'
#' ## try function using k-means initialization
#' grp.ini <- find.clusters(microbov, n.clust=15, n.pca=150)
#'
#' ## run EM algo
#' res <- snapclust(microbov, 15, pop.ini = grp.ini$grp)
#' names(res)
#' res$converged
#' res$n.iter
#'
#' ## plot result
#' compoplot(res)
#'
#' ## flag potential hybrids
#' to.flag <- apply(res$proba,1,max)<.9
#' compoplot(res, subset=to.flag, show.lab=TRUE,
#'                  posi="bottomleft", bg="white")
#'
#'
#' ## Simulate hybrids F1
#' zebu <- microbov[pop="Zebu"]
#' salers <- microbov[pop="Salers"]
#' hyb <- hybridize(zebu, salers, n=30)
#' x <- repool(zebu, salers, hyb)
#'
#' ## method without hybrids
#' res.no.hyb <- snapclust(x, k=2, hybrids=FALSE)
#' compoplot(res.no.hyb, col.pal=spectral, n.col=2)
#'
#' ## method with hybrids
#' res.hyb <- snapclust(x, k=2, hybrids=TRUE)
#' compoplot(res.hyb, col.pal =
#'           hybridpal(col.pal = spectral), n.col = 2)
#'
#'
#' ## Simulate hybrids backcross (F1 / parental)
#' f1.zebu <- hybridize(hyb, zebu, 20, pop = "f1.zebu")
#' f1.salers <- hybridize(hyb, salers, 25, pop = "f1.salers")
#' y <- repool(x, f1.zebu, f1.salers)
#'
#' ## method without hybrids
#' res2.no.hyb <- snapclust(y, k = 2, hybrids = FALSE)
#' compoplot(res2.no.hyb, col.pal = hybridpal(), n.col = 2)
#'
#' ## method with hybrids F1 only
#' res2.hyb <- snapclust(y, k = 2, hybrids = TRUE)
#' compoplot(res2.hyb, col.pal = hybridpal(), n.col = 2)
#'
#' ## method with back-cross
#' res2.back <- snapclust(y, k = 2, hybrids = TRUE, hybrid.coef = c(.25,.5))
#' compoplot(res2.back, col.pal = hybridpal(), n.col = 2)
#'
#' }

snapclust <- function(x, k, pop.ini = "ward", max.iter = 100, n.start = 10,
                      n.start.kmeans = 50, hybrids = FALSE, dim.ini = 100,
                      hybrid.coef = NULL, parent.lab = c('A', 'B'), ...) {

  if (!is.genind(x)) {
    stop("x is not a valid genind object")
  }

  if (any(ploidy(x) > 2)) {
    stop("snapclust not currently implemented for ploidy > 2")
  }

  if (all(ploidy(x) == 1)) {
    .ll.genotype <- .ll.genotype.haploid
  } else if (all(ploidy(x) == 2)) {
    .ll.genotype <- .ll.genotype.diploid
  } else {
    stop("snapclust not currently implemented for varying ploidy")
  }

  ## This function uses the EM algorithm to find ML group assignment of a set
  ## of genotypes stored in a genind object into 'k' clusters. We need an
  ## initial cluster definition to start with. The rest of the algorithm
  ## consists of:

  ## i) compute the matrix of allele frequencies

  ## ii) compute the likelihood of each genotype for each group

  ## iii) assign genotypes to the group for which they have the highest
  ## likelihood

  ## iv) go back to i) until convergence


  ## Disable multiple starts if the initial condition is not random
  use.random.start <- is.null(pop.ini)
  if (!use.random.start) {
    n.start <- 1L
  }

  if (n.start < 1L) {
    stop(sprintf(
      "n.start is less than 1 (%d); using n.start=1", n.start))
  }

  if (hybrids && k > 2) {
    warning(sprintf(
      "forcing k=2 for hybrid mode (requested k is %d)", k))
    k <- 2
  }


  ## Handle hybrid coefficients; these values reflect the contribution of the
  ## first parental population to the allele frequencies of the hybrid
  ## group. For instance, a value of 0.75 indicates that 'a' contributes to
  ## 75%, and 'b' 25% of the allele frequencies of the hybrid - a typical
  ## backcross F1 / a.

  if (hybrids) {
    if (is.null(hybrid.coef)) {
      hybrid.coef <- 0.5
    }
    hybrid.coef <- .tidy.hybrid.coef(hybrid.coef)
  }


  ## Initialisation using 'find.clusters'
  if (!is.null(pop.ini)) {
    if (tolower(pop.ini)[1] %in% c("kmeans", "k-means")) {
      pop.ini <- find.clusters(x, n.clust = k, n.pca = dim.ini,
                               n.start = n.start.kmeans,
                               method = "kmeans", ...)$grp
    } else if (tolower(pop.ini)[1] %in% c("ward")) {
      pop.ini <- find.clusters(x, n.clust = k, n.pca = dim.ini,
                               method = "ward", ...)$grp
    }
  }

  ## There is one run of the EM algo for each of the n.start random initial
  ## conditions.
  ll <- -Inf # this will be the total loglike

  for (i in seq_len(n.start)) {

    ## Set initial conditions: if initial pop is NULL, we create a random
    ## group definition (each clusters have same probability)
    if (use.random.start) {
      pop.ini <- sample(seq_len(k), nInd(x), replace=TRUE)
    }

    ## process initial population, store levels
    pop.ini <- factor(pop.ini)
    lev.ini <- levels(pop.ini)[1:k] # k+1 would be hybrids

    ## ensure 'pop.ini' matches 'k'
    if (! (length(levels(pop.ini)) %in% c(k, k + length(hybrid.coef))) ) {
      stop("pop.ini does not have k clusters")
    }

    ## initialisation
    group <- factor(as.integer(pop.ini)) # set levels to 1:k (or k+1)
    genotypes <- tab(x)
    n.loc <- nLoc(x)
    counter <- 0L
    converged <- FALSE


    ## This is the actual EM algorithm

    while(!converged && counter<=max.iter) {

      ## get table of allele frequencies (columns) by population (rows);
      ## these are stored as 'pop.freq'; note that it will include extra
      ## rows for different types of hybrids too.

      if (hybrids) {
        pop(x) <- group
        id.parents <- .find.parents(x)
        x.parents <- x[id.parents]
        pop.freq <- tab(genind2genpop(x.parents, quiet=TRUE),
                        freq=TRUE)
        pop.freq <- rbind(pop.freq, # parents
                          .find.freq.hyb(pop.freq, hybrid.coef)) # hybrids
      } else {
        pop.freq <- tab(genind2genpop(x, pop=group, quiet=TRUE),
                        freq=TRUE)
      }

      ## ensures no allele frequency is exactly zero
      pop.freq <- .tidy.pop.freq(pop.freq, locFac(x))

      ## get likelihoods of genotypes in every pop
      ll.mat <- apply(genotypes, 1, .ll.genotype, pop.freq, n.loc)

      ## assign individuals to most likely cluster
      previous.group <- group
      group <- apply(ll.mat, 2, which.max)

      ## check convergence
      ## converged <- all(group == previous.group)
      old.ll <- .global.ll(previous.group, ll.mat)
      new.ll <- .global.ll(group, ll.mat)
      if (!is.finite(new.ll)) {
        ## stop(sprintf("log-likelihood at iteration %d is not finite (%f)",
        ##              counter, new.ll))
      }
      converged <- abs(old.ll - new.ll) < 1e-14
      counter <- counter + 1L

    }

    ## ## store the best run so far
    ## new.ll <- .global.ll(group, ll.mat)

    if (new.ll > ll || i == 1L) {
      ## store results
      ll <- new.ll
      out <- list(group = group, ll = ll)

      ## group membership probability
      rescaled.ll.mat <- .rescale.ll.mat(ll.mat)
      out$proba <- prop.table(t(exp(rescaled.ll.mat)), 1)

      out$converged <- converged
      out$n.iter <- counter
    }
  } # end of the for loop

  ## restore labels of groups
  out$group <- factor(out$group)
  if (hybrids) {
    if (!is.null(parent.lab)) {
      lev.ini <- parent.lab
    }
    hybrid.labels <- paste0(hybrid.coef, "_", lev.ini[1], "-",
                            1 - hybrid.coef, "_", lev.ini[2])
    lev.ini <- c(lev.ini, hybrid.labels)
  }
  levels(out$group) <- lev.ini
  colnames(out$proba) <- lev.ini


  ## compute the number of parameters; it is defined as the number of 'free'
  ## allele frequencies, multiplied by the number of groups

  out$n.param <- (ncol(genotypes) - n.loc) * length(lev.ini)

  class(out) <- c("snapclust", "list")
  return(out)
}











## Non-exported function which computes the log-likelihood of a genotype in
## every population. For now only works for diploid individuals. 'x' is a vector
## of allele counts; 'pop.freq' is a matrix of group allele frequencies, with
## groups in rows and alleles in columns.

## TODO: extend this to various ploidy levels, possibly optimizing procedures
## for haploids.

.ll.genotype.diploid <- function(x, pop.freq, n.loc){
  ## homozygote (diploid)
  ## p(AA) = f(A)^2 for each locus
  ## so that log(p(AA)) = 2 * log(f(A))
  ll.homoz.one.indiv <- function(f) {
    sum(log(f[x == 2L]), na.rm = TRUE) * 2
  }

  ll.homoz <- apply(pop.freq, 1, ll.homoz.one.indiv)

  ## heterozygote (diploid, expl with 2 loci)
  ## p(AB) = 2 * f(A) f(B)
  ## so that log(p(AB)) = log(f(A)) + log(f(B)) + log(2)
  ## if an individual is heterozygote for n.heter loci, the term
  ## log(2) will be added n.heter times

  ll.hetero.one.indiv <- function(f) {
    n.heter <- sum(x == 1L, na.rm = TRUE) / 2
    sum(log(f[x == 1L]), na.rm = TRUE) + n.heter * log(2)
  }

  ll.heteroz <- apply(pop.freq, 1, ll.hetero.one.indiv)

  return(ll.homoz + ll.heteroz)
}



.ll.genotype.haploid <- function(x, pop.freq, n.loc){

  ## p(A) = f(A) for each locus
  ll.one.indiv <- function(f) {
    sum(log(f[x == 1L]), na.rm = TRUE)
  }

  ll <- apply(pop.freq, 1, ll.one.indiv)

  return(ll)
}






## Non-exported function computing the total log-likelihood of the model given a vector of group
## assignments and a table of ll of genotypes in each group

.global.ll <- function(group, ll){
  sum(t(ll)[cbind(seq_along(group), as.integer(group))], na.rm=TRUE)
}





## Non-exported function making a tidy vector of weights for allele frequencies
## of parental populations. It ensures that given any input vector of weights
## 'w' defining the types of hybrids, the output has the following properties:

## - strictly on ]0,1[

## - symmetric around 0.5, e.g. c(.25, .5) gives c(.25, .5, .75)

## - sorted by decreasing values (i.e. hybrid types are sorted by decreasing
## proximity to the first parental population.

.tidy.hybrid.coef <- function(w) {
  w <- w[w > 0 & w < 1]
  w <- sort(unique(round(c(w, 1-w), 4)), decreasing = TRUE)
  w
}





## Non-exported function determining vectors of allele frequencies in hybrids
## from 2 parental populations. Different types of hybrids are determined by
## weights given to the allele frequencies of the parental populations. Only one
## such value is provided and taken to be the weight of the 1st parental
## population; the complementary frequency is derived for the second parental
## population.

## Parameters are:

## - x: matrix of allele frequencies for population 'a' (first row) and 'b'
## (second row), where allele are in columns.


## - w: a vector of weights for 'a' and 'b', each value determining a type of
## hybrid. For instance, 0.5 is for F1, 0.25 for backcrosses F1/parental, 0.125
## for 2nd backcross F1/parental, etc.

## The output is a matrix of allele frequencies with hybrid types in rows and
## alleles in columns.

.find.freq.hyb <- function(x, w) {
  out <- cbind(w, 1-w) %*% x
  rownames(out) <- w
  out
}




## Non-exported function trying to find the two parental populations in a genind
## object containing 'k' clusters. The parental populations are defined as the
## two most distant clusters. The other clusters are deemed to be various types
## of hybrids. The output is a vector of indices identifying the individuals
## from the parental populations.

.find.parents <- function(x) {
  ## matrix of pairwise distances between clusters, using Nei's distance
  D <- as.matrix(dist.genpop(genind2genpop(x, quiet = TRUE), method = 1))
  parents <- which(abs(max(D)-D) < 1e-14, TRUE)[1,]
  out <- which(as.integer(pop(x)) %in% parents)
  out
}





## Non-exported function enforcing a minimum allele frequency in a table of
## allele frequency. As we are not accounting for the uncertainty in allele
## frequencies, we need to allow for genotypes to be generated from a population
## which does not have the genotype's allele represented, even if this is at a
## low probability. The transformation is ad-hoc, and has the form:
##
## g(f_i) = (a + f_i /  \sum(a + f_i))

## where f_i is the i-th frequency in a given locus. However, this ensures that
## the output has two important properties:

## - it sums to 1
## - it contains no zero

## By default, we set 'a' to 0.01.

## Function inputs are:

## - 'pop.freq': matrix of allele frequencies, with groups in rows and alleles in
## columns

## - 'loc.fac': a factor indicating which alleles belong to which locus, as
## returned by 'locFac([a genind])'

.tidy.pop.freq <- function(pop.freq, loc.fac) {
  g <- function(f, a = .01) {
    (a + f) / sum(a + f)
  }

  out <- matrix(unlist(apply(pop.freq, 1, tapply, loc.fac, g),
                       use.names = FALSE),
                byrow=TRUE, nrow=nrow(pop.freq))
  dimnames(out) <- dimnames(pop.freq)
  return(out)
}



## This function rescales log-likelihood values prior to the computation of
## group membership probabilities.

## issue reported: prop.table(t(exp(ll.mat)), 1) can cause some numerical
## approximation problems; if numbers are large, exp(...) will return Inf
## and the group membership probabilities cannot be computed
##
## Solution: rather than use p_a = exp(ll_a) / (exp(ll_a) + exp(ll_b))
## we can use p_a = exp(ll_a - C) /  (exp(ll_a - C) + exp(ll_b - C))

## where 'C' is a sufficiently large constant so that exp(ll_i + C) is
## computable; naively we could use C = max(ll.mat), but the problem is this
## scaling can cause -Inf likelihoods too. In practice, we need to allow
## different scaling for each individual.

##out$proba <-
## prop.table(t(exp(ll.mat)), 1)


.rescale.ll.mat <- function(ll.mat) {
  ## we first compute ad-hoc minimum and maximum values of log-likelihood; these
  ## will be computer dependent; this is a quick fix, but better alternatives
  ## can be found.

  ## smallest ll such that exp(ll) is strictly > 0
  new_min <- (0:-1000)[max(which(exp(0:-1000) > 0))]

  ## largest ll such that exp(ll) is strictly less than +Inf
  new_max <- (1:1000)[max(which(exp(1:1000) < Inf))]

  counter <- 0


  ## find rescaling for a single individual;
  ## x: vector of ll values
  rescale.ll.indiv <- function(x) {
    ## set minimum to new_min
    x  <- x - min(x) + new_min

    ## set sum to the maximum
    if (sum(x) > new_max) {
      counter <<- counter + 1
      x <- x - min(x) # reset min to 0
      x <- new_min + (x / sum(x)) * new_max # range: new_min to new_max
    }
    return(x)
  }

  out <- apply(ll.mat, 2, rescale.ll.indiv)

  if (counter > 0) {
    msg <- paste("Large dataset syndrome:\n",
                 "for", counter, "individuals,",
                 "differences in log-likelihoods exceed computer precision;\n",
                 "group membership probabilities are approximated\n",
                 "(only trust clear-cut values)")
    message(msg)
  }

  return(out)
}