R/simulate.R

Defines functions simFS

Documented in simFS

##########################################################################
# Genotyping Uncertainty with Sequencing data and linkage MAPping
# Copyright 2017-2020 Timothy P. Bilton
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#########################################################################
#' Simulation sequencing data for full-sib families.
#' 
#' Simulate genotypes and sequencing depth for the progeny of a full-sib family
#' 
#' The \code{simFS} function simulates sequencing data for the progeny of a
#' full-sib family according to the simulation parameters assuming no
#' interference. The process for the simulation is as follows;
#' \itemize{
#' \item Inheritance vectors are simulated according to the true recombination
#' fractions, assuming no interference.
#' \item The inheritance vectors are converted to genotypes for the specified
#' OPGP vector (which is simulated based on the \code{config} vector).
#' \item Simulated read depths are generated by simulating realizations from
#' the read depth distribution. The alleles of the true genotype are then
#' sampled with equal probability and replacement until a sample size
#' corresponding to the read depth is obtained, where a sequencing error
#' (e.g. an reference allele called as alternate and vice versa) is simulated
#' to occur with probability \code{epsilon}. The mean read depth for each loci
#' is assumed to be equal.
#' }
#' The simulation can be performed with multiple families and/or multiple chromosomes.
#' 
#' Specifying the \code{config} argument in the correct is crucial and must be a nested list.
#' The number of elements of the list at the top level gives the number of families (which must be the 
#' same as the length of the \code{nInd} argument) and
#' the number elements of the list at the second level gives the number of chromosomes.
#' The elements at the bottom level of the list must be vectors of integers 1 to 9 where 1=both-informative (ABxAB),
#' 2=paternal-informative A (ABxAA), 3=paternal-informative B (ABxBB),
#' 4=maternal-informative (AAxAB), 5=maternal-informative (BBxAB), 6 =
#' uninformative (AAxAA), 7 = uninformative (AAxBB), 8 = uninformative (BBxAA)
#' and 9 = uninformative (BBxBB). The list must be set up such that the length of each 
#' chromosome whitin each family must be the same (see the examples for an ideal of how to set this up).
#' 
#' @param rVec_f,rVec_m Numeric vector of true paternal and maternal
#' recombination fractions (in the interval [0,0.5]). Currently, only a single value 
#' is allowed making the rf across all the loci the same.
#' @param epsilon Numeric value of the sequencing error rate.
#' @param config Nested list containing the config vector for each family and chromosome. 
#' See details on how to sepcify this correctly.
#' @param nInd Positive integer vector for the number of individuals in each family for the simulated data.
#' The length of the list gives the number of families in the simulated data set.
#' @param meanDepth Positive numeric value for the mean depth of the read depth
#' distribution.
#' @param thres Numeric value for the threshold value for which genotype calls with
#' a read depth less than the threshold are set to missing.
#' @param rd_dist Character value for the distribution for which the read depths are
#' simulated from. Currently, only negative binomial (\code{"Neg_Binom"}) and Poisson
#' (\code{"Pois"}) are implemented.
#' @param seed1 Numeric value. Random seed used for the simulation of the
#' parental phase (or OPGP).
#' @param seed2 Numeric value. Random seed used for the simulation of the data
#' sets.
#' @return An FS object containing the simulated data.
#' @author Timothy P. Bilton
#' @seealso \code{\link{FS}}
#' @examples
#' 
#' ## simulate a single full sib family with one chromosome 
#' config <- list(list(c(2,1,1,4,2,4,1,1,4,1,2,1)))
#' F1data <- simFS(0.001, config=config, nInd=50, meanDepth=5)
#' ## to look at the simulated data
#' F1data
#' 
#' ## Simulate mulitple families and chromosomes
#' config <- list(replicate(2, sample(c(1,2,4), size=10, replace=TRUE, prob=c(1,2,2)), simplify = FALSE),
#' replicate(2, sample(c(1,2,4), size=10, replace=TRUE, prob=c(1,2,2)), simplify = FALSE))
#' F1data <- simFS(0.001, config=config, nInd=c(50,45), meanDepth=5)
#' 
#' @export simFS

## Function for simulating sequencing data for a single full-sib family
simFS <- function(rVec_f, rVec_m=rVec_f, epsilon=0, config, nInd=100, meanDepth=5, thres=NULL,
                  rd_dist="Neg_Binom", seed1=1, seed2=1, MNIF=1){
  
  ## perform some checks for data input
  if(GUSbase::checkVector(nInd, type="pos_integer", minv=1))
    stop("Input for the number of individuals to simulate is invalid.")
  noFam = length(nInd)
  if( !is.numeric(rVec_f) || !is.numeric(rVec_m) || length(rVec_f) != 1 || length(rVec_m) != 1 || 
      any(rVec_f < 0) || any(rVec_m < 0) || any(rVec_f > 0.5) || any(rVec_m > 0.5) )
    stop("Recombination factions are required to be a numeric number between 0 and 0.5")
  if( GUSbase::checkVector(epsilon, type="pos_numeric", minv=0, maxv=1, equal=FALSE) || length(epsilon) != 1)
    stop("Sequencing error parameter is not single numeric number between 0 and 1")
  if(!is.list(config) || length(config) != noFam)
    stop("Segregation information needs to be a list equal to the nunber of chromosomes.")
  else{
    noChr <- unique(unlist(lapply(config, length)))
    if(length(noChr) != 1 && GUSbase::checkVector(noChr, minv=1))  
      stop("Number of chromosomes found in the 'config' list is not the same for each family")
    if(!all(unlist(lapply(config, is.list))))
      stop("'config' argument is not set-up correctly. Needs to be a nested list.")
    nSnps <- unique(lapply(config, function(x) unlist(lapply(x, length))))
    if(length(nSnps) != 1 || !is.list(nSnps))
      stop("The number of SNPs in each chromosome is not the same in each family for the 'config' argument.")
    else nSnps <- nSnps[[1]]  
    for(fam in 1:noFam){
      for(chr in 1:noChr){
        temp <- config[[fam]][[chr]]
        if(GUSbase::checkVector(temp, type="pos_integer", minv=1, maxv=9))
          stop(paste0("Segregation information for chromosome ",chr," in familiy ",fam," needs to be an integer vector with entires from 1 to 9"))
      }
    }
  }
  if( GUSbase::checkVector(meanDepth, type="pos_numeric", minv = 0) || length(meanDepth) != 1)
    stop("The mean of the read depth distribution is not a finitie positive number.")
  if( !is.null(thres) && (isValue(thres, type="pos_numeric", minv=0, equal=FALSE) || length(thres) != 1))
    stop("The read depth threshold value is not a finite numeric number")
  if( GUSbase::checkVector(seed1, type="pos_integer") || GUSbase::checkVector(seed2, type = "pos_integer"))
    stop("Seed values for the randomziation need to be positive numeric values")
  if(noFam > 1 && GUSbase::checkVector(MNIF, type="pos_integer", minv=1, maxv=noFam, equal=FALSE))
    stop(paste0("Minimum number of informative families must be between 1 and ",noFam))
  
  OPGP <- replicate(n = noChr, vector(mode = "list", length = noFam), simplify = F)
  genon <- ref <- alt <- vector(mode = "list", length=noFam)
  for(fam in 1:noFam){
    for(chr in 1:noChr){

      ## Create list of the recombination fraction and 1 minus the recombination fraction
      ## for each SNP
      rVec_f <- sapply(rep(rVec_f,length.out=nSnps[[chr]]-1), function(r) c(r,1-r),simplify=F)
      rVec_m <- sapply(rep(rVec_m,length.out=nSnps[[chr]]-1), function(r) c(r,1-r),simplify=F)
      
      ## simulate the parental haplotypes
      set.seed(seed1 + fam*3 + chr*39)
      parHap <- matrix(rep(paste0(rep("A",nSnps[[chr]])),4),nrow=4)
      parHap[cbind(sample(1:2,size=sum(config[[fam]][[chr]] %in% c(2,3)),replace=T),which(config[[fam]][[chr]]%in%c(2,3)))] <- "B"
      parHap[cbind(sample(3:4,size=sum(config[[fam]][[chr]] %in% c(4,5)),replace=T),which(config[[fam]][[chr]] %in% c(4,5)))] <- "B"
      parHap[cbind(c(rep(3,sum(config[[fam]][[chr]] %in% c(3,7,9))),rep(4,sum(config[[fam]][[chr]] %in% c(3,7,9)))),which(config[[fam]][[chr]] %in% c(3,7,9)))] <- "B"
      parHap[cbind(c(rep(1,sum(config[[fam]][[chr]] %in% c(5,8,9))),rep(2,sum(config[[fam]][[chr]] %in% c(5,8,9)))),which(config[[fam]][[chr]] %in% c(5,8,9)))] <- "B"
      parHap[cbind(c(sample(1:2,size=sum(config[[fam]][[chr]]==1),replace=T),sample(3:4,size=sum(config[[fam]][[chr]]==1),replace=T)),rep(which(config[[fam]][[chr]]==1),2))] <- "B"
      if(any(parHap[1:2,] == "B") && parHap[1,which(apply(parHap[1:2,],2,function(x) !(all(x=='A'))))[1]] == 'B')
        parHap[1:2,] <- parHap[2:1,]
      if(any(parHap[3:4,] == "B") && parHap[3,which(apply(parHap[3:4,],2,function(x) !(all(x=='A'))))[1]] == 'B')
        parHap[3:4,] <- parHap[4:3,]
      OPGP[[chr]][[fam]] <- as.integer(parHapToOPGP(parHap))
      
      #### Simulate the data sets
      set.seed(seed2 + fam*5 + chr*119)
      #### Simulate the true Meiosis for each individual at each SNP.
       mIndx <- matrix(c(sample(c(0,1),size=2*nInd[fam],replace=T)),ncol=1)
       for(i in 1:(nSnps[[chr]]-1)){
         newmIndx_f <- numeric(nInd[fam])
         newmIndx_m <- numeric(nInd[fam])
         for(j in 1:(nInd[fam])){
           newmIndx_f[j] <- sample(c(0,1),size=1,prob=c(rVec_f[[i]][(mIndx[j,i]==0)+1],rVec_f[[i]][(mIndx[j,i]==1)+1]))
           newmIndx_m[j] <- sample(c(0,1),size=1,prob=c(rVec_m[[i]][(mIndx[j+nInd[fam],i]==0)+1],rVec_m[[i]][(mIndx[j+nInd[fam],i]==1)+1]))
         }
         mIndx <- cbind(mIndx,c(newmIndx_f,newmIndx_m))
       }
       # Determine the true genotype calls
       geno <- rbind(sapply(1:nSnps[[chr]],function(x) parHap[mIndx[1:nInd[fam],x]+1,x]),sapply(1:nSnps[[chr]],function(x) parHap[mIndx[1:nInd[fam]+nInd[fam],x]+3,x]))
       geno <- sapply(1:nSnps[[chr]],function(y) {
         tempGeno <- geno[,y]
         sapply(1:nInd[fam], function(x) paste(sort(c(tempGeno[x],tempGeno[x+nInd[fam]])),collapse=""))
       })
       geno <- (geno=="AA")*2 + (geno=="AB")*1
       
       ### Now generate the sequencing data
       # 1: Simulate Depths
       depth <- matrix(0,nrow=nInd[fam], ncol=nSnps[[chr]])
       if(rd_dist=="NegBinom")
         depth[which(!is.na(geno))] <- stats::rnbinom(sum(!is.na(geno)),mu=meanDepth,size=2) 
       else   
         depth[which(!is.na(geno))] <- stats::rpois(sum(!is.na(geno)),meanDepth)
       # 2: simulate sequencing genotypes (with sequencing error rate of epsilon)
       aCounts <- matrix(stats::rbinom(nInd[fam]*nSnps[[chr]],depth,geno/2),ncol=nSnps[[chr]])
       bCounts <- depth - aCounts
       aCountsFinal <- matrix(stats::rbinom(nInd[fam]*nSnps[[chr]],aCounts,prob=1-epsilon),ncol=nSnps[[chr]]) + 
         matrix(stats::rbinom(nInd[fam]*nSnps[[chr]],bCounts,prob=epsilon),ncol=nSnps[[chr]])
       SEQgeno <- aCountsFinal/depth
       SEQgeno[which(SEQgeno^2-SEQgeno<0)] <- 0.5
       SEQgeno <- 2* SEQgeno  ## GBS genotype call
       genon[[fam]] <- cbind(genon[[fam]], SEQgeno)
       ref[[fam]]   <- cbind(ref[[fam]], aCountsFinal)
       alt[[fam]]   <- cbind(alt[[fam]], matrix(as.integer(depth-aCountsFinal), nrow=nInd, ncol=nSnps[[chr]]))
    }
  }
  chrom <- unlist(sapply(1:noChr, function(x) rep(x,nSnps[[x]]), simplify = F))
  pos  <-  unlist(sapply(1:noChr, function(x) 1:(nSnps[[x]]), simplify = F))
  
  temp <- lapply(1:noFam, function(x) matrix(unlist(lapply(config[[x]], unlist)), nrow=noFam, byrow=T))
  
  group <- summaryInfo <- list()
  if(noFam == 1){
    group$BI <- which(apply(temp[[1]],2,function(x) all(x==1)))
    group$PI <- which(apply(temp[[1]],2,function(x) all(x %in% c(2,3))))
    group$MI <- which(apply(temp[[1]],2,function(x) all(x %in% c(4,5))))
    
    nSnps <- as.integer(sum(nSnps))
    config <- list(as.vector(temp[[1]]))
    
    ## Create summary info
    temp <- ref[[1]] + alt [[1]]
    summaryInfo$data <- paste0(c(
      "Single Family Linkage analysis:\n\n",
      "Data Summary:\n",
      "Data file:\t", "Simulated dataset","\n",
      "Mean Depth:\t", round(mean(temp),4),"\n",
      "Mean Call Rate:\t",round(sum(temp!=0)/length(temp),4),"\n",
      "Number of ...\n",
      "  Progeny:\t",nInd,"\n",
      "  MI SNPs:\t",length(group$MI),"\n",
      "  PI SNPs:\t",length(group$PI),"\n",
      "  BI SNPs:\t",length(group$BI),"\n",
      "  Total SNPs:\t",length(unlist(group)),"\n\n"))
  }
  else{
    group.temp <- list()
    config <- lapply(config, unlist)
    # count the seg types for each SNP
    
    # MI
    count_MI <- apply(sapply(1:noFam,function(y) config[[y]] %in% c(4,5)),1,sum)*apply(sapply(1:noFam,function(y) !(config[[y]] %in% c(1,2,3))),1,all)
    group.temp$MI <- which(count_MI > (MNIF-1))
    # PI
    count_PI <- apply(sapply(1:noFam,function(y) config[[y]] %in% c(2,3)),1,sum)*apply(sapply(1:noFam,function(y) !(config[[y]] %in% c(1,4,5))),1,all)
    group.temp$PI <- which(count_PI > (MNIF-1))
    # BI
    count_BI <- apply(sapply(1:noFam,function(y) config[[y]] %in% c(1:5)),1,sum)*(
      apply(sapply(1:noFam,function(y) config[[y]] %in% c(1)),1,any) | apply(sapply(1:noFam,function(y) config[[y]] %in% c(2,3)),1,any) & apply(sapply(1:noFam,function(y) config[[y]] %in% c(4,5)),1,any))
    group.temp$BI <- which(count_BI > (MNIF-1))
    
    ## create the groups
    group$BI <- which(sort(unique(unlist(group.temp))) %in% group.temp$BI)
    group$MI <- which(sort(unique(unlist(group.temp))) %in% group.temp$MI)
    group$PI <- which(sort(unique(unlist(group.temp))) %in% group.temp$PI)
    ## work out which SNPs to keep
    indx <- logical(sum(nSnps))
    indx[unlist(group.temp, use.names = F)] <- TRUE
    ## Subset the data
    for(fam in 1:noFam){
      genon[[fam]]     <- genon[[fam]][,indx]
      ref[[fam]] <- ref[[fam]][,indx]
      alt[[fam]] <- alt[[fam]][,indx]
      config[[fam]] <- config[[fam]][indx]
    }
    chrom <- chrom[indx]
    pos <- pos[indx]
    nSnps <- sum(indx)
  }
  
  obj <- GUSbase::RA$new(
    list(genon = genon, ref = ref, alt = alt, chrom = chrom, pos = pos, noFam = as.integer(noFam),
         SNP_Names = NULL, indID = 1:sum(nInd), nSnps = nSnps, nInd = lapply(as.list(nInd), as.integer), 
         gform = "simFS", AFrq = NULL, infilename="Simulated dataset")
  )
  newObj <- FS$new(obj)
  
  newObj$.__enclos_env__$private$updatePrivate(
    list(group = group, config = config, noFam = noFam, summaryInfo=summaryInfo, config_orig=config, 
         simPara=list(OPGP=OPGP,nSnps=nSnps,config=config,meanDepth=meanDepth, nInd=nInd, seed1=seed1, seed2=seed2, ep=epsilon))
  )
  return(newObj)
}
tpbilton/GUSMap documentation built on Feb. 22, 2025, 12:27 p.m.