R/rf_2pt.R

Defines functions rf_est_FS_2pt rf_2pt_multi rf_2pt_single comb

Documented in comb

##########################################################################
# Genotyping Uncertainty with Sequencing data and linkage MAPping (GUSMap)
# Copyright 2017-2020 Timothy P. Bilton
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#########################################################################
#' BC, FS, and IC  method: Compute 2-point recombination fraction estimates and LOD scores
#' 
#' Method for estimating 2-point recombination fraction and associated LOD scores.
#' 
#' In this function, the recombination fraction for all pairs of SNPs is computed. Estimation 
#' is performed by optimimizing the hidden Markov model (HMM) likelihood given by 
#' \insertCite{bilton2018genetics1;textual}{GUSMap} for two markers, where the parental phase 
#' that is used is the one that maximizes the likelihood, given the segregation type that has been 
#' inferred. The \code{err} specifies whether sequencing errors should
#' also be estimated when computing the 2-point recombination fraction estimates. LOD scores associated 
#' with each pair of SNPs are also computed.
#' 
#' Parallelization is used to speed up the estimation process using the \code{\link[foreach]{foreach}}
#' function. The argument \code{nClust} specifies the number of cores to use in the parallelization. 
#' Note: It is important that the number of cores you specify is not more than what is available on 
#' your comupter (otherwise bad things can happen!).
#' 
#' Currently, only 2-point recombination fractions for a single population/family can be computed. However,
#' there are plans to extend this function to multiple families in the future.
#'  
#' Note: This function can take a while, espically when there are a large number of SNPs. 
#' 
#' @usage
#' BCobj$rf_2pt(nClust = 2, err = FALSE)
#' FSobj$rf_2pt(nClust = 2, err = FALSE)
#' ICobj$rf_2pt(nClust = 2, err = FALSE)
#' 
#' @param nClust An integer value for the number of cores to use in the parallelization of 
#' computing the 2-point recombination fraction estimates. 
#' @param err Locial value. If \code{TRUE}, a sequencing error parameter is also estimated as 
#' part of the 2-point recombination fraction estimates. If \code{FALSE}, then the sequencing 
#' error parameter is fixed at zero. 
#' 
#' @name $rf_2pt
#' @seealso \code{\link{BC}}, \code{\link{FS}}, \code{\link{IC}}
#' @author Timothy P. Bilton
#' @references 
#' \insertRef{bilton2018genetics1}{GUSMap}
#' @examples
#' ## simulate some sequencing data
#' set.seed(6745)
#' config <- list(list(sample(c(1,2,4), size=30, replace=TRUE)))
#' F1data <- simFS(0.01, config=config, meanDepth=10, nInd=50)
#' 
#' ## Compute 2-point recombination fractions
#' F1data$rf_2pt(nClust=1)
#' @aliases $rf_2pt


## function needed for foreach loop
comb <- function(...){
  mapply('rbind',...,SIMPLIFY=FALSE)
}

### Function for computing the pairwise RF in a single full-sib family.
rf_2pt_single <- function(ref, alt, config, config_infer, group, group_infer, nClust, nInd, err=FALSE){
  
  nSnps = as.integer(2)
  noFam = as.integer(1)
  init_ep = ifelse(err, 0.001, 0)
  ## if estimating sequencing error. Make sure that the initial value is nonzero
  if(err & init_ep < 0.0001)
    init_ep <- 0.0001
  
  if(length(c(unlist(group), unlist(group_infer))) == 0)
    stop("There are no SNPs in the data set.")
  
  ## Calcuate the number of SNPs
  indx_BI <- c(group$BI, group_infer$BI)
  nSnps_BI <- length(indx_BI)
  indx_MI <- c(group$MI, group_infer$SI)
  nSnps_MI <- length(indx_MI)
  indx_PI <- c(group$PI)
  nSnps_PI = length(indx_PI)
  
  ## set up the config vector
  if(!is.null(config_infer))
    config[which(!is.na(config_infer))] <- config_infer[which(!is.na(config_infer))]
  ## Check that there are no missing configs in the data
  if(any(is.na(config[c(indx_MI , indx_BI , indx_PI)])))
    stop("There are some missing segregation types in the data.")
  
  ## Set up the Clusters
  #cl <- parallel::makeCluster(nClust)
  doParallel::registerDoParallel(nClust)

  cat("\nComputing 2-point recombination fraction estimates ...\n")
  if(nSnps_PI > 0){
    cat("Paternal informative SNPs\n")
    ## Paternal informative SNPs
    rf.PI <- foreach::foreach(snp1 = iterators::iter(seq(length.out=nSnps_PI)), .combine=comb) %dopar% {
      rf <- replicate(2,numeric(nSnps_PI),simplify=F)
      for(snp2 in seq_len(snp1-1)){
        ind = indx_PI[c(snp1,snp2)]
        ref2 <- list(ref[,ind])
        alt2 <- list(alt[,ind])
        #bcoef_mat <- list(choose(ref2[[1]]+alt2[[1]],ref2[[1]]))
        #Kab       <- list(bcoef_mat[[1]]*(1/2)^(ref2[[1]]+alt2[[1]]))
        ## compute the rf's and LOD
        rf.est1 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(5,5) + 2*(config[ind]==3))), seqErr=err)
        rf.est2 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(5,6) + 2*(config[ind]==3))), seqErr=err)
        rf.ind1 <- switch(which.max(c(rf.est1$loglik,rf.est2$loglik)), TRUE, FALSE)
        if(rf.ind1){
          rf[[1]][snp2] <- rf.est1$rf
          rf[[2]][snp2] <- rf.est1$LOD
        }
        else{
          rf[[1]][snp2] <- rf.est2$rf
          rf[[2]][snp2] <- rf.est2$LOD
        }
      }
      return(rf)
    }
    for(i in 1:2){
      rf.PI[[i]][upper.tri(rf.PI[[i]])] <- t(rf.PI[[i]])[upper.tri(rf.PI[[i]])]
    }
  } else rf.PI = replicate(2, matrix(nrow=0, ncol=0))
  
  if(nSnps_MI > 0){
    cat("Maternal informative SNPs\n")
    ## Maternal informative SNPs
    rf.MI <- foreach::foreach(snp1 = iterators::iter(seq(length.out=nSnps_MI)), .combine=comb) %dopar% {
      rf <- replicate(2,numeric(nSnps_MI),simplify=F)
      for(snp2 in seq_len(snp1-1)){
        ind = indx_MI[c(snp1,snp2)]
        ref2 <- list(ref[,ind])
        alt2 <- list(alt[,ind])
        rf.est1 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(9,9) + 2*(config[ind]==5))), seqErr=err)
        rf.est2 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(9,10) + 2*(config[ind]==5))), seqErr=err)
        rf.ind1 <- switch(which.max(c(rf.est1$loglik,rf.est2$loglik)), TRUE, FALSE)
        if(rf.ind1){
          rf[[1]][snp2] <- rf.est1$rf
          rf[[2]][snp2] <- rf.est1$LOD
        }
        else{
          rf[[1]][snp2] <- rf.est2$rf
          rf[[2]][snp2] <- rf.est2$LOD
        }
      }
      return(rf)
    }  
    for(i in 1:2){
      rf.MI[[i]][upper.tri(rf.MI[[i]])] <- t(rf.MI[[i]])[upper.tri(rf.MI[[i]])]
    }
  } else rf.MI = replicate(2, matrix(nrow=0, ncol=0))
  
  if(nSnps_BI > 0){
    cat("Both informative SNPs\n")
    ### Both Informative
    rf.BI <- foreach::foreach(snp1 = iterators::iter(seq(length.out=nSnps_BI)), .combine=comb) %dopar% {
      rf <- replicate(2,numeric(nSnps_BI),simplify=F)
      for(snp2 in seq_len(snp1-1)){
        ind = indx_BI[c(snp1,snp2)]
        ref2 <- list(ref[,ind])
        alt2 <- list(alt[,ind])
        # phase 1
        temp1 <- rf_est_FS_2pt(0.1, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(1,1))), seqErr=err)
        temp2 <- rf_est_FS_2pt(0.4, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(1,1))), seqErr=err)
         rf.est1 <- switch(which.max(c(temp1$loglik,temp2$loglik)),temp1,temp2) 
        # phase 2
        temp1 <- rf_est_FS_2pt(0.1, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(1,2))), seqErr=err)
        temp2 <- rf_est_FS_2pt(0.4, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(1,2))), seqErr=err)
        rf.est2 <- switch(which.max(c(temp1$loglik,temp2$loglik)),temp1,temp2) 
        # phase 3
        temp1 <- rf_est_FS_2pt(0.1, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(1,4))), seqErr=err)
        temp2 <- rf_est_FS_2pt(0.4, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(1,4))), seqErr=err)
        rf.est4 <- switch(which.max(c(temp1$loglik,temp2$loglik)),temp1,temp2) 
        ## work out which is best
        rf.ind <- switch(which.max(c(rf.est1$loglik,rf.est2$loglik,rf.est4$loglik)), 1, 2, 3)
        if(rf.ind == 1){
          rf[[1]][snp2] <- rf.est1$rf
          rf[[2]][snp2] <- rf.est1$LOD
        }
        else if(rf.ind == 2){
          rf[[1]][snp2] <- rf.est2$rf
          rf[[2]][snp2] <- rf.est2$LOD
        }
        else {
          rf[[1]][snp2] <- rf.est4$rf
          rf[[2]][snp2] <- rf.est4$LOD
        }
       }
      return(rf)
    }
    for(i in 1:2){
      rf.BI[[i]][upper.tri(rf.BI[[i]])] <- t(rf.BI[[i]])[upper.tri(rf.BI[[i]])]
    }
  } else rf.BI = replicate(2, matrix(nrow=0, ncol=0))

  if(nSnps_PI > 0 & nSnps_BI > 0){  
    cat("Paternal informative vs Both informative\n")
    ## Paternal and Informative SNPs
    rf.PI.BI <- foreach::foreach(snp.ps = iterators::iter(seq(length.out=nSnps_PI)), .combine=comb) %dopar% {
      rf <- replicate(2,numeric(nSnps_BI),simplify=F)
      for(snp.bi in 1:nSnps_BI){
        ind <- c(indx_PI[snp.ps],indx_BI[snp.bi])
        ref2 <- list(ref[,ind])
        alt2 <- list(alt[,ind])
        ## compute the rf's and LOD
        rf.est1 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(5,1) + 2*c(config[ind[[1]]]==3,0))), seqErr=err)
        rf.est2 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(5,2) + 2*c(config[ind[[1]]]==3,0))), seqErr=err)
        rf.ind1 <- switch(which.max(c(rf.est1$loglik,rf.est2$loglik)), TRUE, FALSE)
        if(rf.ind1){
          rf[[1]][snp.bi] <- rf.est1$rf
          rf[[2]][snp.bi] <- rf.est1$LOD
        }
        else{
          rf[[1]][snp.bi] <- rf.est2$rf
          rf[[2]][snp.bi] <- rf.est2$LOD
        }
      }
      return(rf)
    }
  } else rf.PI.BI = replicate(2, matrix(nrow=nSnps_PI, ncol=nSnps_BI))
  
  if(nSnps_MI > 0 & nSnps_BI > 0){
    cat("Maternal informative vs Both informative\n")
    ## Maternal and Informative SNPs
    rf.MI.BI <- foreach::foreach(snp.mi = iterators::iter(seq(length.out=nSnps_MI)), .combine=comb) %dopar% {
      rf <- replicate(2,numeric(nSnps_BI),simplify=F)
      for(snp.bi in 1:nSnps_BI){
        ind <- c(indx_MI[snp.mi],indx_BI[snp.bi])
        ref2 <- list(ref[,ind])
        alt2 <- list(alt[,ind])
        ## compute the rf's and LOD
        rf.est1 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(9,1) + 2*c(config[ind[[1]]]==5,0))), seqErr=err)
        rf.est2 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(9,3) + 2*c(config[ind[[1]]]==5,0))), seqErr=err)
        rf.ind1 <- switch(which.max(c(rf.est1$loglik,rf.est2$loglik)), TRUE, FALSE)
        if(rf.ind1){
          rf[[1]][snp.bi] <- rf.est1$rf
          rf[[2]][snp.bi] <- rf.est1$LOD
        } else{
          rf[[1]][snp.bi] <- rf.est2$rf
          rf[[2]][snp.bi] <- rf.est2$LOD
        }
      }
      return(rf)
    }
  } else rf.MI.BI = replicate(2, matrix(nrow=nSnps_MI, ncol=nSnps_BI))
  
  ## For the non-informative computations
  ## Really done so that we can check that there is no miss identification of the group
  if(nSnps_MI > 0 & nSnps_PI > 0){
    cat("Maternal informative vs Paternal informative\n")
    rf.MI.PI <- foreach::foreach(snp.mi = iterators::iter(seq(length.out=nSnps_MI)), .combine=comb) %dopar% {
      rf <- replicate(2,numeric(nSnps_PI),simplify=F)
      for(snp.pi in 1:nSnps_PI){
        ind <- c(indx_MI[snp.mi],indx_PI[snp.pi])
        ref2 <- list(ref[,ind])
        alt2 <- list(alt[,ind])
        ## compute the rf's and LOD
        rf.est1 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(9,9) + 2*c(config[ind] %in% c(3,5)))), seqErr=err)
        rf.est2 <- rf_est_FS_2pt(0.2, ep=init_ep, ref=ref2,alt=alt2, OPGP=list(as.integer(c(9,10) + 2*c(config[ind] %in% c(3,5)))), seqErr=err)
        rf.ind1 <- switch(which.max(c(rf.est1$loglik,rf.est2$loglik)), TRUE, FALSE)
        if(rf.ind1){
          rf[[1]][snp.pi] <- rf.est1$rf
          rf[[2]][snp.pi] <- rf.est1$LOD
        } else{
          rf[[1]][snp.pi] <- rf.est2$rf
          rf[[2]][snp.pi] <- rf.est2$LOD
        }
      }
      return(rf)
    }
  } else rf.MI.PI = replicate(2, matrix(nrow=nSnps_MI, ncol=nSnps_PI))
  
  ## Build the rf and LOD matrices
  origOrder <- order(c(indx_BI,indx_PI,indx_MI))
  rf.mat <- rbind(cbind(rf.BI[[1]],t(rf.PI.BI[[1]]),t(rf.MI.BI[[1]])),
                  cbind(rf.PI.BI[[1]], rf.PI[[1]], t(rf.MI.PI[[1]])),
                  cbind(rf.MI.BI[[1]], rf.MI.PI[[1]], rf.MI[[1]]))[origOrder,origOrder]
  LOD.mat <- rbind(cbind(rf.BI[[2]],t(rf.PI.BI[[2]]),t(rf.MI.BI[[2]])),
                   cbind(rf.PI.BI[[2]], rf.PI[[2]], t(rf.MI.PI[[2]])),
                   cbind(rf.MI.BI[[2]], rf.MI.PI[[2]], rf.MI[[2]]))[origOrder,origOrder]
  LOD.mat[which(LOD.mat < 0)] <- 0  # some negative LOD scores from rounding
  return(list(rf = rf.mat,LOD = LOD.mat))
}



### Function for computing the pairwise RF in a multiple-sib family.
rf_2pt_multi <- function(ref, alt, config, group, nClust, noFam, init_r = 0.25){
  
  if(length(unlist(group)) == 0)
    stop("There are no SNPs in the data set.")
  
  ## Calcuate the number of SNPs
  indx_BI <- c(group$BI)
  nSnps_BI <- length(indx_BI)
  indx_MI <- c(group$MI)
  nSnps_MI <- length(indx_MI)
  indx_PI <- c(group$PI)
  nSnps_PI = length(indx_PI)
  
  ## Set up the Clusters
  cl <- parallel::makeCluster(nClust)
  doParallel::registerDoParallel(cl)
  
  cat("\nComputing 2-point recombination fraction estimates ...\n")
  cat("Paternal informative SNPs\n")
  ## Paternal informative SNPs
  rf.PI <- foreach::foreach(snp1 = iterators::iter(seq(length.out=nSnps_PI)), .combine=comb) %dopar% {
    rf <- replicate(2,numeric(nSnps_PI),simplify=F)
    for(snp2 in seq_len(snp1-1)){
      ind = indx_PI[c(snp1,snp2)]
      configFam <- matrix(unlist(lapply(config, function(z) z[ind])),ncol=2, nrow=noFam,byrow=T)
      wFam <- which(apply(configFam,1, function(z) all(z %in% c(2,3))))
      OPGP <- vector(mode="list", length=noFam)
      ## Work out the phase
      for(fam in wFam){
        if(all(configFam[fam,] %in% c(2,3))){
          OPGP1 <- c(5,5) + 2*(configFam[fam,]==3)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(5,6) + 2*(configFam[fam,]==3)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
        }
        else if(all(configFam[fam,]==1)){
          OPGP1 <- c(1,1)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                      OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1,2)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                      OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP3 <- c(1,4)
          rf.est3 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP3), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik,rf.est3$loglik)), OPGP1, OPGP2, OPGP3)
        }
        else if(any(configFam[fam,] == 1) & any(configFam[fam,] %in% c(2,3))){
          OPGP1 <- c(1,1) + 4*(configFam[fam,]==2) + 6*(configFam[fam,]==3)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1,2) + 4*(configFam[fam,]==2) + 6*(configFam[fam,]==3)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
        }
      }
      rf.est <- rf_est_FS(init_r=init_r,ref=lapply(ref[wFam], function(x) x[,ind]),
                                   alt=lapply(alt[wFam], function(x) x[,ind]),
                                   OPGP=OPGP[wFam], epsilon=NULL, noFam=length(wFam), nThreads=1)
      rf[[1]][snp2] <- rf.est$rf
      rf[[2]][snp2] <- rf.est$LOD
    }
    return(rf)
  }
  for(i in 1:2){
    rf.PI[[i]][upper.tri(rf.PI[[i]])] <- t(rf.PI[[i]])[upper.tri(rf.PI[[i]])]
  }
  
  cat("Maternal informative SNPs\n")
  ## Maternal informative SNPs
  rf.MI <- foreach::foreach(snp1 = iterators::iter(seq(length.out=nSnps_MI)), .combine=comb) %dopar% {
    rf <- replicate(2,numeric(nSnps_MI),simplify=F)
    for(snp2 in seq_len(snp1-1)){
      ind = indx_MI[c(snp1,snp2)]
      configFam <- matrix(unlist(lapply(config, function(z) z[ind])),ncol=2, nrow=noFam,byrow=T)
      wFam <- which(apply(configFam,1, function(z) all(z %in% c(4,5))))
      OPGP <- vector(mode="list", length=noFam)
      ## Work out the phase
      for(fam in wFam){
        if(all(configFam[fam,] %in% c(4,5))){
          OPGP1 <- c(9,9) + 2*(configFam[fam,]==5)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(9,10) + 2*(configFam[fam,]==5)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
        }
        else if(all(configFam[fam,]==1)){
          OPGP1 <- c(1,1)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1,2)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP3 <- c(1,4)
          rf.est3 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP3), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik,rf.est3$loglik)), OPGP1, OPGP2, OPGP3)
        }
        else if(any(configFam[fam,] == 1) & any(configFam[fam,] %in% c(4,5))){
          OPGP1 <- c(1,1) + 8*(configFam[fam,]==4) + 10*(configFam[fam,]==5)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1 + 8*(configFam[fam,1]==4) + 10*(configFam[fam,1]==5),3 + 7*(configFam[fam,2]==4) + 9*(configFam[fam,2]==5))
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
        }
      }
      rf.est <- rf_est_FS(init_r=init_r,ref=lapply(ref[wFam], function(x) x[,ind]),
                                   alt=lapply(alt[wFam], function(x) x[,ind]),
                                   OPGP=OPGP[wFam], epsilon=NULL, noFam=length(wFam), nThreads=1)
      rf[[1]][snp2] <- rf.est$rf
      rf[[2]][snp2] <- rf.est$LOD
    }
    return(rf)
  }
  for(i in 1:2){
    rf.MI[[i]][upper.tri(rf.MI[[i]])] <- t(rf.MI[[i]])[upper.tri(rf.MI[[i]])]
  }
  
  cat("Both informative SNPs\n")
  ### Both Informative
  rf.BI <- foreach::foreach(snp1 = iterators::iter(seq(length.out=nSnps_BI)), .combine=comb) %dopar% {
    rf <- replicate(2,numeric(nSnps_BI),simplify=F)
    for(snp2 in seq_len(snp1-1)){
      ind = indx_BI[c(snp1,snp2)]
      configFam <- matrix(unlist(lapply(config, function(z) z[ind])),ncol=2, nrow=noFam,byrow=T)
      wFam <- which(apply(configFam,1, function(z) (all(z %in% 1)) | (z[1] %in% c(2:5) &  z[2] == 1) | (z[2] %in% c(2:5) &  z[1] == 1)))
      OPGP <- vector(mode="list", length=noFam)
      ## Work out the phase
      for(fam in wFam){
        if(all(configFam[fam,]==1)){
          OPGP1 <- c(1,1)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1,2)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP3 <- c(1,4)
          rf.est3 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP3), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik,rf.est3$loglik)), OPGP1, OPGP2, OPGP3)
        }
        else if( any(configFam[fam,]==1) & any(configFam[fam,]%in%c(2,3)) ){
          OPGP1 <- c(1,1) + 4*(configFam[fam,]==2) + 6*(configFam[fam,]==3)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1,2) + 4*(configFam[fam,]==2) + 6*(configFam[fam,]==3)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
        }
        else if( any(configFam[fam,]==1) & any(configFam[fam,]%in%c(4,5)) ){
          OPGP1 <- c(1,1) + 8*(configFam[fam,]==4) + 10*(configFam[fam,]==5)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1 + 8*(configFam[fam,1]==4) + 10*(configFam[fam,1]==5),3 + 7*(configFam[fam,2]==4) + 9*(configFam[fam,2]==5))
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
        }
      }
      rf.est <- rf_est_FS(init_r=init_r,ref=lapply(ref[wFam], function(x) x[,ind]),
                                   alt=lapply(alt[wFam], function(x) x[,ind]),
                                   OPGP=OPGP[wFam], epsilon=NULL, noFam=length(wFam), nThreads=1)
      rf[[1]][snp2] <- rf.est$rf
      rf[[2]][snp2] <- rf.est$LOD
    }
    return(rf)
  }
  for(i in 1:2){
    rf.BI[[i]][upper.tri(rf.BI[[i]])] <- t(rf.BI[[i]])[upper.tri(rf.BI[[i]])]
  }
  
  cat("Paternal informative vs Both informative\n")
  ## Paternal and Informative SNPs
  rf.PI.BI <- foreach::foreach(snp.pi = iterators::iter(seq(length.out=nSnps_PI)), .combine=comb) %dopar% {
    rf <- replicate(2,numeric(nSnps_BI),simplify=F)
    for(snp.bi in 1:nSnps_BI){
      ind <- c(indx_PI[snp.pi],indx_BI[snp.bi])
      configFam <- matrix(unlist(lapply(config, function(z) z[ind])),ncol=2, nrow=noFam,byrow=T)
      wFam <- which(apply(configFam,1, function(z) z[1] %in% c(2,3) & z[2] == 1))
      OPGP <- vector(mode="list", length=noFam)
      ## Work out the phase
      for(fam in wFam){
        if(all(configFam[fam,]==1)){
          OPGP1 <- c(1,1)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1,2)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP3 <- c(1,4)
          rf.est3 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP3), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik,rf.est3$loglik)), OPGP1, OPGP2, OPGP3)
        }
        else if(any(configFam[fam,] == 1) & any(configFam[fam,] %in% c(2,3))){
          OPGP1 <- c(1,1) + 4*(configFam[fam,]==2) + 6*(configFam[fam,]==3)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1,2) + 4*(configFam[fam,]==2) + 6*(configFam[fam,]==3)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
        }
      }
      rf.est <- rf_est_FS(init_r=init_r,ref=lapply(ref[wFam], function(x) x[,ind]),
                                   alt=lapply(alt[wFam], function(x) x[,ind]),
                                   OPGP=OPGP[wFam], epsilon=NULL, noFam=length(wFam), nThreads=1)
      rf[[1]][snp.bi] <- rf.est$rf
      rf[[2]][snp.bi] <- rf.est$LOD
    }
    return(rf)
  }
  
  cat("Maternal informative vs Both informative\n")
  ## Maternal and Informative SNPs
  rf.MI.BI <- foreach::foreach(snp.mi = iterators::iter(seq(length.out=nSnps_MI)), .combine=comb) %dopar% {
    rf <- replicate(2,numeric(nSnps_BI),simplify=F)
    for(snp.bi in 1:nSnps_BI){
      ind <- c(indx_MI[snp.mi],indx_BI[snp.bi])
      configFam <- matrix(unlist(lapply(config, function(z) z[ind])),ncol=2, nrow=noFam,byrow=T)
      wFam <- which(apply(configFam,1, function(z) z[1] %in% c(4,5) & z[2] == 1))
      OPGP <- vector(mode="list", length=noFam)
      ## Work out the phase
      for(fam in wFam){
        if(all(configFam[fam,]==1)){
          OPGP1 <- c(1,1)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1,2)
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP3 <- c(1,4)
          rf.est3 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP3), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik,rf.est3$loglik)), OPGP1, OPGP2, OPGP3)
        }
        else if(any(configFam[fam,] == 1) & any(configFam[fam,] %in% c(4,5))){
          OPGP1 <- c(1,1) + 8*(configFam[fam,]==4) + 10*(configFam[fam,]==5)
          rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
          OPGP2 <- c(1 + 8*(configFam[fam,1]==4) + 10*(configFam[fam,1]==5),3 + 7*(configFam[fam,2]==4) + 9*(configFam[fam,2]==5))
          rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                        OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
          OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
        }
      }
      rf.est <- rf_est_FS(init_r=init_r,ref=lapply(ref[wFam], function(x) x[,ind]),
                                    alt=lapply(alt[wFam], function(x) x[,ind]),
                                    OPGP=OPGP[wFam], epsilon=NULL, noFam=length(wFam), nThreads=1)
      rf[[1]][snp.bi] <- rf.est$rf
      rf[[2]][snp.bi] <- rf.est$LOD
    }
    return(rf)
  }
  
  ## For the non-informative computations
  ## Really done so that we can check that there is no miss identification of the group
  cat("Maternal informative vs Paternal informative\n")
  rf.MI.PI <- foreach::foreach(snp.mi = iterators::iter(seq(length.out=nSnps_MI)), .combine=comb) %dopar% {
    rf <- replicate(2,numeric(nSnps_PI),simplify=F)
    for(snp.pi in 1:nSnps_PI){
      ind = c(indx_MI[snp.mi],indx_PI[snp.pi])
      configFam <- matrix(unlist(lapply(config, function(z) z[ind])),ncol=2, nrow=noFam,byrow=T)
      wFam <- which(apply(configFam,1, function(z) z[1] %in% c(4,5) & z[2] %in% c(2,3)))
      OPGP <- vector(mode="list", length=noFam)
      ## Work out the phase
      for(fam in wFam){
        OPGP1 <- c(5,5) + 2*(configFam[fam,] %in% c(3,5))
        rf.est1 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                      OPGP=list(OPGP1), epsilon=NULL, nThreads=1)
        OPGP2 <- c(5,6) + 2*(configFam[fam,] %in% c(3,5))
        rf.est2 <- rf_est_FS(init_r=init_r,ref=list(ref[[fam]][,ind]),alt=list(alt[[fam]][,ind]),
                                      OPGP=list(OPGP2), epsilon=NULL, nThreads=1)
        OPGP[[fam]] <- switch(which.min(c(rf.est1$loglik,rf.est2$loglik)), OPGP1, OPGP2)
      }
      rf.est <- rf_est_FS(init_r=init_r,ref=lapply(ref[wFam], function(x) x[,ind]),
                                   alt=lapply(alt[wFam], function(x) x[,ind]),
                                   OPGP=OPGP[wFam], epsilon=NULL, noFam=length(wFam), nThreads=1)
      rf[[1]][snp.pi] <- rf.est$rf
      rf[[2]][snp.pi] <- rf.est$LOD
    }
    return(rf)
  }
  
  #rf.MI.PI <- replicate(2, matrix(NA, nrow=nSnps_MI, ncol=nSnps_PI),simplify=FALSE)
  parallel::stopCluster(cl) 
  
  ## Build the rf and LOD matrices
  origOrder <- order(c(indx_BI,indx_PI,indx_MI))
  rf.mat <- rbind(cbind(rf.BI[[1]],t(rf.PI.BI[[1]]),t(rf.MI.BI[[1]])),
                  cbind(rf.PI.BI[[1]], rf.PI[[1]], t(rf.MI.PI[[1]])),
                  cbind(rf.MI.BI[[1]], rf.MI.PI[[1]], rf.MI[[1]]))[origOrder,origOrder]
  LOD.mat <- rbind(cbind(rf.BI[[2]],t(rf.PI.BI[[2]]),t(rf.MI.BI[[2]])),
                   cbind(rf.PI.BI[[2]], rf.PI[[2]], t(rf.MI.PI[[2]])),
                   cbind(rf.MI.BI[[2]], rf.MI.PI[[2]], rf.MI[[2]]))[origOrder,origOrder]
  return(list(rf = rf.mat,LOD = LOD.mat))
}




rf_est_FS_2pt <- function(init_r=0.01, ep=0.001, ref, alt, OPGP,
                          seqErr=T, trace=F, noFam=as.integer(1), method = "optim", nThreads = 1){
  
  nInd <- lapply(ref,nrow)  # number of individuals
  nSnps <- as.integer(2)   # number of SNPs
  
  if(method=="optim"){
    
    ## Compute the K matrix for heterozygous genotypes
    bcoef_mat <- Kab <- vector(mode="list", length=noFam)
    for(fam in 1:noFam){
      bcoef_mat[[fam]] <- choose(ref[[fam]]+alt[[fam]],ref[[fam]])
      Kab[[fam]] <- bcoef_mat[[fam]]*(1/2)^(ref[[fam]]+alt[[fam]])
    }
    
    if(seqErr)
      para <- c(GUSbase::logit2(init_r),GUSbase::logit(ep))
    else
      para <- GUSbase::logit2(init_r)

    ## Find MLE
    optim.MLE <- stats::optim(para, fn=score_fs_mp_scaled_err, gr = score_extract, 
                       method="BFGS", 
                       ref=ref,alt=alt,bcoef_mat=bcoef_mat,Kab=Kab,
                       nInd=nInd,nSnps=nSnps,OPGP=OPGP,noFam=noFam,
                       seqErr=seqErr,extra=ep,nThreads=nThreads)
    ep_hat <- ifelse(seqErr,GUSbase::inv.logit(optim.MLE$par[2]),ep)
    ## find the LOD score
    LOD <- -(optim.MLE$value - ll_fs_mp_scaled_err(1000,ref=ref,alt=alt,bcoef_mat=bcoef_mat,Kab=Kab,
                                                nInd=nInd,nSnps=nSnps,noFam=noFam,seqErr=F,
                                                OPGP=OPGP, extra=ep_hat, nThreads=nThreads))
    # return the results
    return(list(rf=GUSbase::inv.logit2(optim.MLE$par[1]), 
                ep=ep_hat, loglik=-optim.MLE$value, LOD=LOD))
  } else{ # EM algorithm approach
    ## Set up the parameter values
    EM.arg = c(500,1e-15)
    
    ## convert the data into the right format:
    OPGPmat = do.call(what = "rbind",OPGP)
    ref_mat = do.call(what = "rbind",ref)
    alt_mat = do.call(what = "rbind",alt)
    
    EMout <- .Call("EM_HMM", rep(init_r,2), ep, ref_mat, alt_mat, OPGPmat,
                   noFam, unlist(nInd), nSnps, FALSE, seqErr, EM.arg, as.integer(0), nThreads=nThreads)
    
    EMout[[3]] = EMout[[3]] + sum(log(choose(ref_mat+alt_mat,ref_mat)))
    ## find the LOD score
    LOD <- EMout[[3]] + ll_fs_mp_scaled_err(1000,ref=ref,alt=alt,bcoef_mat=bcoef_mat,Kab=Kab,
                                                    nInd=nInd,nSnps=nSnps,noFam=noFam,seqErr=F,
                                                    OPGP=OPGP, extra=EMout[[2]], nThreads=nThreads)
    return(list(rf=EMout[[1]][1], ep=EMout[[2]],
                loglik=EMout[[3]], LOD=LOD))
  }
}
tpbilton/GUSMap documentation built on Feb. 22, 2025, 12:27 p.m.