Nothing
#' Expectation-Maximization algorithm for global (genome-wide) admixture inference
#'
#' @param Geno
#' List of genotying matrices of size `M` (number of markers),
#' with each matrix of dimension `N` (number of individuals) x `L` (number of alleles)
#' whose elements consists of discrete or continuous allele dosages
#' @param K
#' Number of ancestral groups (positive integer superior or equal to 2)
#' @param P
#' Ploidy level (positive integer), to be used when read depth ratios are
#' specified in `Geno`, either as a single value to specify the same
#' ploidy for all individuals, or as a vector of size `N`
#' @param ParamToUpdate
#' Specify `both`, `Prop` or `Freq` to update both admixture proportions and
#' ancestral allele frequencies, only admixture proportions or only ancestral
#' allele frequencies, respectively
#' @param MaxIter
#' Maximum number of iterations
#' (positive integer greater than or equal to `MinIter`)
#' @param MinIter
#' Minimum number of iterations
#' (positive integer greater than or equal to 2 and smaller than or equal to `MaxIter`)
#' @param MinParamBound
#' Minimum value for admixture proportions and ancestral allele frequencies
#' (positive numeric value)
#' @param LogLikThresh
#' Algorithm convergence criterion (positive numeric value) consisting of a
#' log-likelihood difference value between two iterations
#' @param PropInit
#' Matrix of dimension `N` x `K` with initial admixture proportions
#' @param FreqInit
#' List of matrices of size `M` (number of markers), with each matrix being of
#' dimension `K` x `L` with initial allele frequencies in ancestral groups
#' @param NbThreads
#' Number of threads to be used (positive integer) with a default value of 0
#' setting automatically all threads available
#' @param Seed
#' Seed for reproducible inference (integer)
#' @param Verbose
#' A boolean describing if detailed information should be printed
#'
#' @return
#' A list of three items: a matrix of admixture proportions (`Prop`),
#' a list of matrices of allele frequencies in ancestral groups (`Freq`),
#' and a vector of log-likelihood values over iterations (`LogLik`)
#'
#' @description
#' This function performs global admixture inference from discrete or
#' continuous allele dosages
#'
#' @details
#' The function `AdmixGlobal()` performs global (genome-wide) admixture inference
#' from genotyping data (`Geno`) formatted as a list of matrices (one for each marker),
#' by specifying the number of ancestral groups (`K`).
#'
#' The inference is performed using an expectation-maximization (EM) algorithm whose
#' minimum (`MinIter`) and maximum (`Maxiter`) number of iterations can be fixed by
#' the user, as well as the log-likelihood convergence threshold (`LogLikThresh`).
#' A minimum value for admixture proportions ancestral allele frequencies
#' is set using `MinParamBound`, which should be above zero to prevent computational
#' issues. By default, all available threads/CPU cores are used but the number
#' can be chosen using `NbThreads`.
#'
#' If allele dosages are used, the ploidy level (`P`) does not have to be
#' specified as it is automatically calculated from the dosages. However, it
#' must be specified if the genotypic data correspond to ratios constrained
#' between 0 and 1 (e.g. obtained from allele read depths).
#'
#' By default, the EM algorithm is initialized with random parameter values but
#' admixture proportions and ancestral allele frequencies can be initialized
#' using `PropInit` and `FreqInit`, respectively.
#'
#' By default, both types of parameters are updated by the EM, but only one of
#' them can be updated while the other is fixed using `ParamToUpdate`. This is
#' particularly interesting when allele frequencies have been estimated on a
#' reference panel (e.g. stored into a list `FreqRef`)
#' and are used to estimated the admixture proportions
#' of a new panel of individuals. This scenario can be implemented by
#' initializing allele frequencies (`FreqInit=FreqRef`) and specifying
#' to update only admixture proportions (`ParamToUpdate=Prop`).
#'
#' @importFrom magrittr %>%
#' @importFrom magrittr set_names
#' @importFrom purrr map
#' @importFrom utils tail
#'
#' @seealso
#' * [SimulatePop()] to simulate a polyploid admixed population.
#' * [AdmixLocal()] to perform local admixture inference using the results from
#' the `AdmixGlobal()` function.
#' * [GlobalPlot()] to generate an admixture barplot using the results from
#' the `AdmixGlobal()` function.
#'
#' @export
#'
#' @examples
#' ## Simulate a polyploid admixed population
#' DataSim <- SimulatePop(K=3L, N=10L, P=6L, M=50L, C=5L, L=10L, Seed=123, NbThreads=1)
#'
#' ## Perform global admixture inference
#' ResGlobalAdmix <- AdmixGlobal(Geno=DataSim$Geno, K=3, Verbose=FALSE, NbThreads=1)
#'
#' ## Estimated admixture proportions
#' head(ResGlobalAdmix$Prop)
#'
#' ## Estimated allele frequencies at first marker
#' ResGlobalAdmix$Freq[[1]]
#'
#' ## Log-likelihood over iterations
#' head(ResGlobalAdmix$LogLik)
AdmixGlobal <- function(Geno, K, P=NULL, ParamToUpdate="both",
MaxIter=200L, MinIter=10L,
MinParamBound=1e-6, LogLikThresh=1e-3,
PropInit = NULL, FreqInit = NULL, NbThreads=0L,
Seed=123L, Verbose=TRUE){
## Initial checks
stopifnot("Geno must be a list of numeric matrices" =
is.list(Geno) && is.matrix(Geno[[1]]) && is.numeric(Geno[[1]]))
stopifnot("Geno must be a named list (marker names)" =
!is.null(names(Geno)))
stopifnot("Geno matrices must have column names (allele names at markers)" =
!is.null(colnames(Geno[[1]])))
stopifnot("Geno matrices must have row names (individual names)" =
!is.null(rownames(Geno[[1]])))
stopifnot("K must be an integer superior or equal to 2" =
K>=2L)
stopifnot("When informed, P must be a single positive integer or a vector of individual positive integers" =
is.null(P) || (all(P>=1L) && (length(P)==nrow(Geno[[1]]) || length(P)==1)))
stopifnot("ParamToUpdate must be chosen among: both, Prop or Freq" =
ParamToUpdate%in%c("both","Prop","Freq"))
stopifnot("When informed, PropInit must be a numeric matrix" =
is.null(PropInit) || (is.matrix(PropInit) && is.numeric(PropInit)))
stopifnot("When informed, PropInit must have the same number of rows as Geno matrices" =
is.null(PropInit) || nrow(PropInit)==nrow(Geno[[1]]))
stopifnot("When informed, PropInit must have K columns" =
is.null(PropInit) || ncol(PropInit)==K)
stopifnot("When informed, FreqInit matrices must be numeric matrices" =
is.null(FreqInit) || (is.matrix(FreqInit[[1]]) && is.numeric(FreqInit[[1]])))
stopifnot("When informed, FreqInit matrices must have the same number of columns as Geno matrices" =
is.null(FreqInit) || ncol(FreqInit[[1]])==ncol(Geno[[1]]))
stopifnot("When informed, FreqInit matrices must have K rows" =
is.null(FreqInit) || nrow(FreqInit[[1]])==K)
stopifnot("MaxIter must be greater than or equal to MinIter" =
MaxIter>=MinIter)
stopifnot("MinIter must be greater than or equal to 2" =
MinIter>=2)
stopifnot("LogLikThresh must be positive" =
LogLikThresh>0)
stopifnot("NbThreads should be a non-negative integer" =
NbThreads>=0)
stopifnot("Verbose should be a boolean" =
is.logical(Verbose))
## Set Seeds
set.seed(Seed)
## Quantity names
. <- NULL
Ind_names <- rownames(Geno[[1]]) %>% set_names(.,.)
Marker_names <- names(Geno) %>% set_names(.,.)
Group_names <- paste0("K",1:K) %>% set_names(.,.)
## Population quantities
N <- nrow(Geno[[1]])
M <- length(Geno)
Lm <- sapply(Geno,ncol)
## Set Geno according to ploidy
if(!is.null(P)){
if(length(P)==1){
.ScaleGenoMat(Geno,rep(P,N))
}else{
.ScaleGenoMat(Geno,P)
}
# Geno <- Geno %>% map(~.x*P/rowSums(.x))
}else{
P <- round(rowSums(Geno[[1]]))
}
## Additional checks
stopifnot("MinParamBound cannot be negative" =
MinParamBound>=0)
stopifnot("MinParamBound is too high relative to the maximum number of alleles and/or groups" =
MinParamBound<1/K && MinParamBound<1/max(Lm))
## Print information
if(Verbose){
cat("#####################################################\n",
"#####################################################\n",
"#### AdmixPoly ####\n",
"#### Inference of global admixture proportions ####\n",
"#####################################################\n",
"#####################################################\n\n",
"Number of markers = ", M,"\n",
"Number of individuals = ", N,"\n",sep = "")
cat("Ploidy levels in the population =",sort(unique(P)),"\n")
cat("Number of alleles across markers = [",min(Lm),",",max(Lm),"]\n\n",sep="")
cat("Settings:\n",
" - K = ",K,"\n",
" - ParamToUpdate = ",ParamToUpdate,"\n",
" - LogLikThresh = ",LogLikThresh,"\n",
" - MaxIter = ",MaxIter,"\n",
" - MinIter = ",MinIter,"\n",
" - MinParamBound = ",MinParamBound,"\n",
sep = "")
if(is.null(FreqInit)){
cat(" - FreqInit = NULL\n",sep = "")
}else{
cat(" - FreqInit = Specified by user\n",sep = "")
}
if(is.null(PropInit)){
cat(" - PropInit = NULL\n",sep = "")
}else{
cat(" - PropInit = Specified by user\n",sep = "")
}
if(NbThreads==0){
cat(" - NbThreads = all threads available\n", sep = "")
}else{
cat(" - NbThreads = ", NbThreads,"\n", sep = "")
}
}
## Initial allele frequencies
if(is.null(FreqInit)){
Freq <- .InitializeFreq(Geno, K, Group_names, Seed, NbThreads = NbThreads)
}else{
Freq <- FreqInit
}
## Initial admixture proportions
if(is.null(PropInit)){
Prop <- .rdirichlet_cpp(N,K,100,Seed) %>%
matrix(.,nrow = N,ncol = K,dimnames = list(Ind_names,Group_names))
}else{
Prop <- PropInit
}
## Initial Log-Likelihood
LogLik <- NULL
LogLikDiff <- Inf
LogLik_iter <- ComputeLogLik(Geno = Geno, Prop = Prop, Freq = Freq, NbThreads = NbThreads)
LogLik <- c(LogLik,LogLik_iter)
## Acceleration t0, t1 and t2 list
Acc_list <- list()
LogLikDiff_acc <- 0
## EM algorithm
if(Verbose) {
cat("\nEM algorithm:\n",sep = "")
# Print a neat header for the columns
# Column widths: Iter (6), LogLik (14), Diff (14), Status/Alpha (25)
cat(sprintf("%-6s %-14s %-14s %-25s\n", "Iter", "LogLik", "Diff", "Status"))
}
Iter <- 1
while((LogLikDiff>LogLikThresh & Iter>=MinIter & Iter<=MaxIter) | Iter<MinIter){
## Update parameters
Param_iter <- UpdateParam(Geno = Geno, Prop = Prop, Freq = Freq, ParamToUpdate = ParamToUpdate,
MinParamBound = MinParamBound, NbThreads = NbThreads)
Prop_iter <- Param_iter[[1]]
Freq_iter <- Param_iter[[2]]
## Store parameters for EM Acceleration
Acc_list <- append(list(unlist(Param_iter)),Acc_list)
# EM Acceleration
if(Iter%%2 == 1 & Iter > 1){
## Compute alpha
r = Acc_list[[2]] - Acc_list[[3]]
v = Acc_list[[1]] - Acc_list[[2]] - r
alpha = .ComputeAlpha(r = r,v = v)
## Accelerated updates
if(alpha < -1){
New_param <- Acc_list[[3]] - 2*alpha*r + alpha^2*v
## Prop
Prop_acc <- .FormatAccProp(New_param, N, K, MinParamBound)
## Freq
Freq_acc <- .FormatAccFreq(New_param, N, K, Lm, MinParamBound)
## Compute log-likelihood and reset parameters
LogLik_acc <- ComputeLogLik(Geno = Geno, Prop = Prop_acc, Freq = Freq_acc, NbThreads = NbThreads)
LogLikDiff_acc <- LogLik_acc - LogLik[length(LogLik)]
Acc_list <- list(c(Prop_acc,unlist(Freq_acc)))
}else{
Acc_list <- Acc_list[1]
}
}
## Compute log-likelihood and reset parameters
LogLik_iter <- ComputeLogLik(Geno = Geno, Prop = Prop_iter, Freq = Freq_iter, NbThreads = NbThreads)
LogLikDiff <- LogLik_iter - LogLik[length(LogLik)]
if(Iter%%2 == 1 & Iter > 1 & LogLikDiff_acc > LogLikDiff){
if(Verbose){
# Clean, column-aligned output for Accelerated step
status_msg <- sprintf("Accelerated (a=%.2f)", alpha)
cat(sprintf("%-6d %-14.4f %-14.6f %-25s\n", Iter, LogLik_acc, LogLikDiff_acc, status_msg))
}
Freq <- Freq_acc
Prop <- Prop_acc
LogLik <- c(LogLik,LogLik_acc)
}else{
if(Verbose) {
# Clean, column-aligned output for Standard step
cat(sprintf("%-6d %-14.4f %-14.6f %-25s\n", Iter, LogLik_iter, LogLikDiff, "Standard"))
}
Freq <- Freq_iter
Prop <- Prop_iter
LogLik <- c(LogLik,LogLik_iter)
}
## Miscellaneous
LogLikDiff_acc <- 0
Iter <- Iter + 1
## Termination checks
if(Verbose){
if(Iter>MaxIter){
cat("==> maximum number of iterations reached: ",Iter-1,"\n",sep="")
}else if(LogLikDiff<LogLikThresh){
cat("==> convergence criterion reached: Diff = ",LogLikDiff," < ",
LogLikThresh,"\n",sep="")
}
}
}
## Set row and column names of outputs
rownames(Prop) <- Ind_names
colnames(Prop) <- Group_names
names(Freq) <- Marker_names
for(m in 1:length(Freq)){
rownames(Freq[[m]]) <- Group_names
colnames(Freq[[m]]) <- colnames(Geno[[m]])
}
## Print information
if(Verbose) cat("\n","Algorithm stopped with LogLik = ",tail(LogLik,n=1),"\n\n",sep="")
## Results
res <- list(Prop = Prop, Freq = Freq, LogLik = LogLik)
## Set class
class(res) <- "AdmixGlobal"
## Return outputs
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.