##########################################################################
# Genotyping Uncertainty with Sequencing data and linkage MAPping
# Copyright 2017-2020 Timothy P. Bilton <timothy.bilton@agresearch.co.nz>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#########################################################################
#' BC, FS and IC method: Compute ordered linkage maps
#'
#' Method for inferring parental phase (e.g., ordered parental genotype pair (OPGP)) and
#' estimating recombination fractions in full-sib families.
#'
#' This function infers the parental phase (or ordered parental genotype pair (OPGP)) and
#' estimates adjacent recombination fractions using the hidden Markov model (HMM) approach as
#' described in \insertCite{bilton2018genetics1;textual}{GUSMap}.
#'
#' The optimization of the likelihood for the HMM is performed using either the Expectation-Maximumization (EM) algorithm
#' (\code{method="EM"}), direct numeric optimization via the \code{\link{optim}} function (\code{method="optim"}), or using
#' 20 iterations of the EM algorithm followed by optimization via direct numeric optimization (\code{method=NULL}).
#' The likelihood computations (and computation of derivatives if required) are scaled using
#' forward and backward recursion to avoid overflow issues and are performed in C. These computations
#' are also parallelization via the OpenMP package, where the argument \code{nThreads} specifies
#' how many threads to use. Be careful not to set \code{nThreads} to more than the number of threads available
#' on your computer (or bad things will happen). In addition, if the package is complied without OpenMP, then this
#' parallelization has no effect and the likelihood is computed in serial.
#'
#' If \code{mapped = TRUE}, then combined linkage groups must have been formed from the \code{\link{$addBIsnps}} function
#' first (and preferably ordered from the \code{\link{$orderLG}} function).
#'
#' @usage
#' BCobj$computeMap(chrom = NULL, init_r = 0.01, ep = 0.001, method = NULL, err = TRUE,
#' multiErr = FALSE, mapped = TRUE, nThreads = 1, inferOPGP = TRUE, rfthres = 0.1)
#' FSobj$computeMap(chrom = NULL, init_r = 0.01, ep = 0.001, method = NULL, sexSpec = FALSE, err = TRUE,
#' multiErr = FALSE, mapped = TRUE, nThreads = 1, inferOPGP = TRUE, rfthres = 0.1)
#' ICobj$computeMap(chrom = NULL, init_r = 0.01, ep = 0.001, method = "EM", err = TRUE,
#' multiErr = FALSE, mapped = TRUE, nThreads = 1, inferOPGP = TRUE, rfthres = 0.1)
#'
#' @param chrom A integer vector giving the indices of the chromosomes (or linkage groups) to be computed.
#' @param init_r A numeric value giving the initial values for the recombination fractions. Each
#' recombination fraction parameter is set to the same initial value.
#' @param ep A numeric value giving the initial value for the sequencing error parameter.
#' @param method A character string specifying whether optimization should be performed using
#' direct maximization (\code{optim}) or via the Expectation-Maximum (EM) algorithm (\code{EM}).
#' @param sexSpec Logical value. If \code{TRUE}, sex-specific recombination fractions are
#' are estimated.
#' @param err Locical value. If \code{TRUE}, the sequencing error parameter is estimated. Otherwise, the
#' sequenicng error parameter is fixed to the value of the \code{ep} argument.
#' @param multiErr Locical value. If \code{TRUE}, a separate sequencing error rate is used for each SNP, otherwise a common error rate is
#' used for each SNP.
#' @param mapped Locial value. If \code{TRUE}, the maps are computed using the marker order given
#' in the combined linkage groups. Otherwise, the maps are computed using the original marker order
#' given by the genomic assembly.
#' @param nThreads An integer value giving the number of threads to use in computing the likelihood in parallel.
#' @param inferOPGP Logical value. If \code{inferOPGP=TRUE} then the OPGPs will always be inferred, otherwise the OPGPs
#' will only be inferred if required.
#' @param rfthres Numeric value. Print output highlighting SNP pairs with high recombination fraction estimates.
#' Helpfully for identifying potentially probmatic SNPs.
#' @name $computeMap
#' @author Timothy P. Bilton and Chris Scott
#' @seealso \code{\link{BC}}, \code{\link{FS}}, \code{\link{IC}}
#' @references
#' \insertRef{bilton2018genetics1}{GUSMap}
#' @examples
#' #### Case 1: Compute linkage map from linkage groups
#' ## Simulate some sequencing data
#' set.seed(6745)
#' config <- list(list(sample(c(1,2,4), size=30, replace=TRUE)))
#' F1data <- simFS(0.01, config=config, meanDepth=10, nInd=50, epsilon=0.005)
#' ## Compute 2-point recombination fractions
#' F1data$rf_2pt(nClust=1)
#' ## create and order linkage groups
#' F1data$createLG()
#' F1data$addBIsnps()
#' F1data$orderLG(ndim=5)
#'
#' ## Compute the linkage map
#' F1data$computeMap()
#'
#' #### Case 2: Compute map using original assembly order
#' F1data$computeMap(mapped = FALSE)
#' @aliases $computeMap
rf_est_FS <- function(init_r=0.01, ep=0.001, ref, alt, OPGP, noFam=as.integer(1),
sexSpec=FALSE, seqErr=TRUE, multiErr=FALSE, trace=FALSE,
method = "optim", nThreads=1, thres=1000, ...){
## Do some checks
nInd <- lapply(ref,nrow) # number of individuals
nSnps <- ncol(ref[[1]]) # number of SNPs
## Check for depths that are too large
for(fam in 1:noFam){
badcalls <- which( (ref[[fam]]+alt[[fam]]) > thres)
if(length(badcalls) > 0){
d <- (ref[[fam]]+alt[[fam]])[badcalls]
ref[[fam]][badcalls] <- round(ref[[fam]][badcalls]/d * thres)
alt[[fam]][badcalls] <- round(alt[[fam]][badcalls]/d * thres)
ref[[fam]] <- matrix(as.integer(ref[[fam]]), nrow=nInd[[fam]], ncol=nSnps)
alt[[fam]] <- matrix(as.integer(alt[[fam]]), nrow=nInd[[fam]], ncol=nSnps)
}
}
if(method=="optim"){
# Arguments for the optim function
optim.arg <- list(...)
if(length(optim.arg) == 0)
optim.arg <- list(maxit = 20000, reltol=1e-15)
## Compute the K matrix for heterozygous genotypes
bcoef_mat <- Kab <- vector(mode="list", length=noFam)
for(fam in 1:noFam){
bcoef_mat[[fam]] <- choose(ref[[fam]]+alt[[fam]],ref[[fam]])
Kab[[fam]] <- bcoef_mat[[fam]]*(1/2)^(ref[[fam]]+alt[[fam]])
}
## If we want to estimate sex-specific r.f.'s
if(sexSpec){
# Work out the indices of the r.f. parameters of each sex
ps <- sort(unique(unlist(lapply(OPGP,function(x) which(x %in% 1:8)))))[-1] - 1
ms <- sort(unique(unlist(lapply(OPGP,function(x) which(x %in% c(1:4,9:12))))))[-1] - 1
npar <- c(length(ps),length(ms))
# Determine the initial values
if(length(init_r)==1)
para <- GUSbase::logit2(rep(init_r,sum(npar)))
else if(length(init_r) != sum(npar))
para <- GUSbase::logit2(rep(0.1,sum(npar)))
else
para <- init_r
# sequencing error
if(seqErr){
if(multiErr) {
if(length(ep) == 1) para <- c(para,GUSbase::logit(rep(ep, nSnps)))
else para <- c(para,GUSbase::logit(ep))
}
else para <- c(para,GUSbase::logit(ep))
}
## Are we estimating the error parameters?
seqErr=!is.null(ep)
## Find MLE
optim.MLE <- stats::optim(para, fn=score_fs_mp_ss_scaled_err, gr=score_extract,
method="BFGS", control=optim.arg,
ref=ref,alt=alt,bcoef_mat=bcoef_mat,Kab=Kab,
nInd=nInd,nSnps=nSnps,OPGP=OPGP,noFam=noFam,ps=ps,ms=ms,npar=npar,
seqErr=seqErr,extra=ep,nThreads=nThreads,multiErr=multiErr)
}
else{
# Determine the initial values
if(length(init_r)==1)
para <- GUSbase::logit2(rep(init_r,nSnps-1))
else if(length(init_r) != nSnps-1)
para <- GUSbase::logit2(rep(0.1,nSnps-1))
else
para <- init_r
# sequencing error
if(seqErr){
if(multiErr) {
if(length(ep) == 1) para <- c(para,GUSbase::logit(rep(ep, nSnps)))
else para <- c(para,GUSbase::logit(ep))
}
else para <- c(para,GUSbase::logit(ep))
}
## Find MLE
optim.MLE <- stats::optim(para, fn=score_fs_mp_scaled_err, gr=score_extract,
method="BFGS", control=optim.arg,
ref=ref,alt=alt,bcoef_mat=bcoef_mat,Kab=Kab,
nInd=nInd,nSnps=nSnps,OPGP=OPGP,noFam=noFam,
seqErr=seqErr,extra=ep,nThreads=nThreads,multiErr=multiErr)
}
# Print out the output from the optim procedure (if specified)
if(trace){
print(optim.MLE)
}
# Work out what to return for the recombination fractions
if(sexSpec){
if(npar[1] > 0) pindx = 1:npar[1]
else pindx = numeric(0)
if(npar[2] > 0) mindx = npar[1]+1:npar[2]
else mindx = numeric(0)
rfReturn = list(rf_p = rep(0,nSnps - 1), rf_m = rep(0,nSnps - 1))
rfReturn$rf_p[ps] = GUSbase::inv.logit2(optim.MLE$par[pindx])
rfReturn$rf_m[ms] = GUSbase::inv.logit2(optim.MLE$par[mindx])
#rfReturn <- list(rf_p=GUSbase::inv.logit2(optim.MLE$par[1:(nSnps-1)]),
# rf_m=GUSbase::inv.logit2(optim.MLE$par[1:(nSnps-1) + nSnps - 1]))
}
else
rfReturn <- list(rf=GUSbase::inv.logit2(optim.MLE$par[1:(nSnps-1)]))
# work out what to return for the sequencing errors
if(seqErr){
if(sexSpec){
if(multiErr) epReturn <- list(GUSbase::inv.logit(optim.MLE$par[sum(npar) + 1:nSnps]))
else epReturn <- list(GUSbase::inv.logit(optim.MLE$par[sum(npar) + 1]))
} else{
if(multiErr) epReturn <- list(GUSbase::inv.logit(optim.MLE$par[nSnps:(2*nSnps-1)]))
else epReturn <- list(GUSbase::inv.logit(optim.MLE$par[nSnps]))
}
}
else epReturn <- ep
# Check for convergence
if(trace & optim.MLE$convergence != 0)
warning(paste0('Optimization failed to converge properly with error ',optim.MLE$convergence,'\n smallest MLE estimate is: ', round(min(optim.MLE$par),6)))
# Return the results
return(c(rfReturn, ep=epReturn, loglik=-optim.MLE$value))
} else{ # EM algorithm approach
## Set up the parameter values
temp.arg <- list(...)
if(!is.null(temp.arg$maxit) && is.numeric(temp.arg$maxit) && length(temp.arg$maxit) == 1)
EM.arg = c(temp.arg$maxit)
else
EM.arg = c(50000)
if(!is.null(temp.arg$reltol) && is.numeric(temp.arg$reltol) && length(temp.arg$reltol) == 1){
EM.arg = c(EM.arg,temp.arg$reltol)
}
else
EM.arg = c(EM.arg,1e-15)
if(sexSpec){
ps <- sort(unique(unlist(lapply(OPGP,function(x) which(x %in% 1:8)))))[-1] - 1
ms <- sort(unique(unlist(lapply(OPGP,function(x) which(x %in% c(1:4,9:12))))))[-1] - 1
npar <- c(length(ps),length(ms))
ss_rf <- logical(2*(nSnps-1))
ss_rf[ps] <- TRUE
ss_rf[ms + nSnps-1] <- TRUE
# Determine the initial values
if(length(init_r)==1) init_r <- rep(init_r,sum(npar))
else if(length(init_r) != sum(npar)) init_r <- rep(0.01,sum(npar))
temp <- rep(0,2*nSnps-1)
temp[c(ps,ms + nSnps-1)] <- init_r
init_r = temp
} else {
ss_rf = 0;
# Determine the initial values
if(length(init_r)==1) init_r <- rep(init_r,2*(nSnps-1))
else if(length(init_r) != nSnps-1) init_r <- rep(0.01,2*(nSnps-1))
else init_r <- rep(init_r,2)
}
## convert the data into the right format:
OPGPmat = do.call(what = "rbind",OPGP)
ref_mat = do.call(what = "rbind",ref)
alt_mat = do.call(what = "rbind",alt)
if(multiErr){
if(length(ep) == 1) ep <- rep(ep, nSnps)
EMout <- .Call("EM_HMM_multierr", init_r, ep, ref_mat, alt_mat, OPGPmat,
noFam, unlist(nInd), nSnps, sexSpec, seqErr, EM.arg, as.integer(ss_rf), nThreads=nThreads)
}
else
EMout <- .Call("EM_HMM", init_r, ep, ref_mat, alt_mat, OPGPmat,
noFam, unlist(nInd), nSnps, sexSpec, seqErr, EM.arg, as.integer(ss_rf), nThreads=nThreads)
EMout[[3]] = EMout[[3]] + sum(log(choose(ref_mat+alt_mat,ref_mat)))
if(sexSpec){
return(list(rf_p=EMout[[1]][1:(nSnps-1)],rf_m=EMout[[1]][1:(nSnps-1) + nSnps-1],
ep=EMout[[2]],
loglik=EMout[[3]]))
}
else
return(list(rf=EMout[[1]][1:(nSnps-1)],
ep=EMout[[2]],
loglik=EMout[[3]]))
}
}
## recombination estimates for case where the phase is unkonwn.
## The r.f.'s are sex-specific and constrained to the range [0,1]
rf_est_FS_UP <- function(ref, alt, config, ep, method="optim", trace=F, nThreads=0, thres=1000, ...){
## Check imputs
if( any( ref<0 | !is.finite(ref)) || any(!(ref == round(ref))))
stop("The read count matrix for the reference allele is invalid")
if( any( alt<0 | !is.finite(alt)) || any(!(alt == round(alt))))
stop("The read count matrix for the alternate allele is invalid")
if( !is.vector(config) || !is.numeric(config) || any(!(config %in% 1:9)) )
stop("Invalid config vector. It must be a numeric vector with entries between 1 and 9.")
if( !is.logical(trace) || is.na(trace) )
trace = FALSE
if(!(method %in% c("EM","optim")))
stop("Specified optimization method is unknown. Please select one of 'EM' or 'optim'")
## Check for depths that are too large
badcalls <- which( (ref+alt) > thres)
if(length(badcalls) > 0){
d <- (ref+alt)[badcalls]
ref[badcalls] <- round(ref[badcalls]/d * thres)
alt[badcalls] <- round(alt[badcalls]/d * thres)
ref <- matrix(as.integer(ref), nrow=nrow(ref), ncol=ncol(ref))
alt <- matrix(as.integer(alt), nrow=nrow(alt), ncol=ncol(alt))
}
nInd <- nrow(ref) # number of individuals
nSnps <- ncol(ref) # number of SNPs
if(!is.integer(ref))
ref <- matrix(as.integer(ref), nrow=nInd, ncol=nSnps)
if(!is.integer(alt))
alt <- matrix(as.integer(alt), nrow=nInd, ncol=nSnps)
if(!is.integer(config))
config <- as.integer(config)
if(method == "optim"){
# Arguments for the optim function
optim.arg <- list(...)
if(length(optim.arg) == 0)
optim.arg <- list(maxit = 1000, reltol=1e-10)
# Work out the indices of the r.f. parameters of each sex
ps <- which(config %in% c(1,2,3))[-1] - 1
ms <- which(config %in% c(1,4,5))[-1] - 1
npar <- c(length(ps),length(ms))
## Compute the K matrix for heterozygous genotypes
bcoef_mat <- matrix(1, nrow=nrow(ref), ncol=ncol(ref)) #choose(ref+alt,ref)
Kab <- bcoef_mat*(1/2)^(ref+alt)
## Are we estimating the error parameters?
seqErr=!is.null(ep)
para <- GUSbase::logit(rep(0.5,sum(npar)))
# sequencing error
if(length(ep) != 1 & !is.null(ep))
para <- c(para,GUSbase::logit(0.01))
else if(!is.null(ep))
para <- c(para,GUSbase::logit(ep))
if(nSnps > 2){
## Find MLE
optim.MLE <- stats::optim(para,ll_fs_up_ss_scaled_err,method="BFGS",control=optim.arg,
ref=ref,alt=alt,bcoef_mat=bcoef_mat,Kab=Kab,
nInd=nInd,nSnps=nSnps,config=config,ps=ps,ms=ms,npar=npar,
seqErr=seqErr,nThreads=nThreads)
# Print out the output from the optim procedure (if specified)
if(trace){
print(optim.MLE)
}
# Check for convergence
if(optim.MLE$convergence != 0)
warning(paste0('Optimization failed to converge properly with error ',optim.MLE$convergence,'\n smallest MLE estimate is: ', round(min(optim.MLE$par),6)))
# Return the MLEs
return(list(rf_p=GUSbase::inv.logit(optim.MLE$par[1:npar[1]]),rf_m=GUSbase::inv.logit(optim.MLE$par[npar[1]+1:npar[2]]),
ep=GUSbase::inv.logit(optim.MLE$par[sum(npar)+1])))
}
else if(nSnps == 2){
## If both SNPs are informative, need to use the Nelder-Mead to distinguish between the two sexes.
if(all(config == 1)){
optim.MLE <- stats::optim(para,ll_fs_up_ss_scaled_err,method="Nelder-Mead",control=optim.arg,
ref=ref,alt=alt,bcoef_mat=bcoef_mat,Kab=Kab,
nInd=nInd,nSnps=nSnps,config=config,ps=ps,ms=ms,npar=npar,
seqErr=seqErr)
}
## Otherwise, proceed as normal
else{
optim.MLE <- stats::optim(para,ll_fs_up_ss_scaled_err,method="BFGS",control=optim.arg,
ref=ref,alt=alt,bcoef_mat=bcoef_mat,Kab=Kab,
nInd=nInd,nSnps=nSnps,config=config,ps=ps,ms=ms,npar=npar,
seqErr=seqErr)
}
# Print out the output from the optim procedure (if specified)
if(trace){
print(optim.MLE)
}
# Check for convergence
if(trace & optim.MLE$convergence != 0)
warning(paste0('Optimization failed to converge properly with error ',optim.MLE$convergence,'\n smallest MLE estimate is: ', round(min(optim.MLE$par),6)))
# Return the MLEs
return(list(rf_p=inv.logit(optim.MLE$par[npar[1]]),rf_m=inv.logit(optim.MLE$par[npar[1]+npar[2]]),
ep=inv.logit(optim.MLE$par[sum(npar)+1])))
}
}
else{ ## EM approach
## Set up the parameter values
temp.arg <- list(...)
if(!is.null(temp.arg$maxit) && is.numeric(temp.arg$maxit) && length(temp.arg$maxit) == 1)
EM.arg = c(temp.arg$maxit)
else
EM.arg = c(5000)
if(!is.null(temp.arg$reltol) && is.numeric(temp.arg$reltol) && length(temp.arg$reltol) == 1){
EM.arg = c(EM.arg,temp.arg$reltol)
}
else
EM.arg = c(EM.arg,1e-5)
## work out which rf can be estimated
ps <- which(config %in% c(1,2,3))[-1] - 1
ms <- which(config %in% c(1,4,5))[-1] - 1
npar <- c(length(ps),length(ms))
ss_rf <- logical(2*(nSnps-1))
ss_rf[ps] <- TRUE
ss_rf[ms + nSnps-1] <- TRUE
## Are we estimating the error parameters?
seqErr=!is.null(ep)
EMout <- .Call("EM_HMM_UP", rep(0.5,(nSnps-1)*2), ep, ref, alt, config,
as.integer(1), nInd, nSnps, seqErr, EM.arg, as.integer(ss_rf), nThreads=nThreads)
return(list(rf_p=EMout[[1]][ps],rf_m=EMout[[1]][nSnps-1+ms],
ep=EMout[[2]],
loglik=EMout[[3]]))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.