##########################################################################
# Genotyping Uncertainty with Sequencing data and linkage MAPping
# Copyright 2017-2020 Timothy P. Bilton
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#########################################################################
#' Simulation sequencing data for full-sib families.
#'
#' Simulate genotypes and sequencing depth for the progeny of a full-sib family
#'
#' The \code{simFS} function simulates sequencing data for the progeny of a
#' full-sib family according to the simulation parameters assuming no
#' interference. The process for the simulation is as follows;
#' \itemize{
#' \item Inheritance vectors are simulated according to the true recombination
#' fractions, assuming no interference.
#' \item The inheritance vectors are converted to genotypes for the specified
#' OPGP vector (which is simulated based on the \code{config} vector).
#' \item Simulated read depths are generated by simulating realizations from
#' the read depth distribution. The alleles of the true genotype are then
#' sampled with equal probability and replacement until a sample size
#' corresponding to the read depth is obtained, where a sequencing error
#' (e.g. an reference allele called as alternate and vice versa) is simulated
#' to occur with probability \code{epsilon}. The mean read depth for each loci
#' is assumed to be equal.
#' }
#' The simulation can be performed with multiple families and/or multiple chromosomes.
#'
#' Specifying the \code{config} argument in the correct is crucial and must be a nested list.
#' The number of elements of the list at the top level gives the number of families (which must be the
#' same as the length of the \code{nInd} argument) and
#' the number elements of the list at the second level gives the number of chromosomes.
#' The elements at the bottom level of the list must be vectors of integers 1 to 9 where 1=both-informative (ABxAB),
#' 2=paternal-informative A (ABxAA), 3=paternal-informative B (ABxBB),
#' 4=maternal-informative (AAxAB), 5=maternal-informative (BBxAB), 6 =
#' uninformative (AAxAA), 7 = uninformative (AAxBB), 8 = uninformative (BBxAA)
#' and 9 = uninformative (BBxBB). The list must be set up such that the length of each
#' chromosome whitin each family must be the same (see the examples for an ideal of how to set this up).
#'
#' @param rVec_f,rVec_m Numeric vector of true paternal and maternal
#' recombination fractions (in the interval [0,0.5]). Currently, only a single value
#' is allowed making the rf across all the loci the same.
#' @param epsilon Numeric value of the sequencing error rate.
#' @param config Nested list containing the config vector for each family and chromosome.
#' See details on how to sepcify this correctly.
#' @param nInd Positive integer vector for the number of individuals in each family for the simulated data.
#' The length of the list gives the number of families in the simulated data set.
#' @param meanDepth Positive numeric value for the mean depth of the read depth
#' distribution.
#' @param thres Numeric value for the threshold value for which genotype calls with
#' a read depth less than the threshold are set to missing.
#' @param rd_dist Character value for the distribution for which the read depths are
#' simulated from. Currently, only negative binomial (\code{"Neg_Binom"}) and Poisson
#' (\code{"Pois"}) are implemented.
#' @param seed1 Numeric value. Random seed used for the simulation of the
#' parental phase (or OPGP).
#' @param seed2 Numeric value. Random seed used for the simulation of the data
#' sets.
#' @return An FS object containing the simulated data.
#' @author Timothy P. Bilton
#' @seealso \code{\link{FS}}
#' @examples
#'
#' ## simulate a single full sib family with one chromosome
#' config <- list(list(c(2,1,1,4,2,4,1,1,4,1,2,1)))
#' F1data <- simFS(0.001, config=config, nInd=50, meanDepth=5)
#' ## to look at the simulated data
#' F1data
#'
#' ## Simulate mulitple families and chromosomes
#' config <- list(replicate(2, sample(c(1,2,4), size=10, replace=TRUE, prob=c(1,2,2)), simplify = FALSE),
#' replicate(2, sample(c(1,2,4), size=10, replace=TRUE, prob=c(1,2,2)), simplify = FALSE))
#' F1data <- simFS(0.001, config=config, nInd=c(50,45), meanDepth=5)
#'
#' @export simFS
## Function for simulating sequencing data for a single full-sib family
simFS <- function(rVec_f, rVec_m=rVec_f, epsilon=0, config, nInd=100, meanDepth=5, thres=NULL,
rd_dist="Neg_Binom", seed1=1, seed2=1, MNIF=1){
## perform some checks for data input
if(GUSbase::checkVector(nInd, type="pos_integer", minv=1))
stop("Input for the number of individuals to simulate is invalid.")
noFam = length(nInd)
if( !is.numeric(rVec_f) || !is.numeric(rVec_m) || length(rVec_f) != 1 || length(rVec_m) != 1 ||
any(rVec_f < 0) || any(rVec_m < 0) || any(rVec_f > 0.5) || any(rVec_m > 0.5) )
stop("Recombination factions are required to be a numeric number between 0 and 0.5")
if( GUSbase::checkVector(epsilon, type="pos_numeric", minv=0, maxv=1, equal=FALSE) || length(epsilon) != 1)
stop("Sequencing error parameter is not single numeric number between 0 and 1")
if(!is.list(config) || length(config) != noFam)
stop("Segregation information needs to be a list equal to the nunber of chromosomes.")
else{
noChr <- unique(unlist(lapply(config, length)))
if(length(noChr) != 1 && GUSbase::checkVector(noChr, minv=1))
stop("Number of chromosomes found in the 'config' list is not the same for each family")
if(!all(unlist(lapply(config, is.list))))
stop("'config' argument is not set-up correctly. Needs to be a nested list.")
nSnps <- unique(lapply(config, function(x) unlist(lapply(x, length))))
if(length(nSnps) != 1 || !is.list(nSnps))
stop("The number of SNPs in each chromosome is not the same in each family for the 'config' argument.")
else nSnps <- nSnps[[1]]
for(fam in 1:noFam){
for(chr in 1:noChr){
temp <- config[[fam]][[chr]]
if(GUSbase::checkVector(temp, type="pos_integer", minv=1, maxv=9))
stop(paste0("Segregation information for chromosome ",chr," in familiy ",fam," needs to be an integer vector with entires from 1 to 9"))
}
}
}
if( GUSbase::checkVector(meanDepth, type="pos_numeric", minv = 0) || length(meanDepth) != 1)
stop("The mean of the read depth distribution is not a finitie positive number.")
if( !is.null(thres) && (isValue(thres, type="pos_numeric", minv=0, equal=FALSE) || length(thres) != 1))
stop("The read depth threshold value is not a finite numeric number")
if( GUSbase::checkVector(seed1, type="pos_integer") || GUSbase::checkVector(seed2, type = "pos_integer"))
stop("Seed values for the randomziation need to be positive numeric values")
if(noFam > 1 && GUSbase::checkVector(MNIF, type="pos_integer", minv=1, maxv=noFam, equal=FALSE))
stop(paste0("Minimum number of informative families must be between 1 and ",noFam))
OPGP <- replicate(n = noChr, vector(mode = "list", length = noFam), simplify = F)
genon <- ref <- alt <- vector(mode = "list", length=noFam)
for(fam in 1:noFam){
for(chr in 1:noChr){
## Create list of the recombination fraction and 1 minus the recombination fraction
## for each SNP
rVec_f <- sapply(rep(rVec_f,length.out=nSnps[[chr]]-1), function(r) c(r,1-r),simplify=F)
rVec_m <- sapply(rep(rVec_m,length.out=nSnps[[chr]]-1), function(r) c(r,1-r),simplify=F)
## simulate the parental haplotypes
set.seed(seed1 + fam*3 + chr*39)
parHap <- matrix(rep(paste0(rep("A",nSnps[[chr]])),4),nrow=4)
parHap[cbind(sample(1:2,size=sum(config[[fam]][[chr]] %in% c(2,3)),replace=T),which(config[[fam]][[chr]]%in%c(2,3)))] <- "B"
parHap[cbind(sample(3:4,size=sum(config[[fam]][[chr]] %in% c(4,5)),replace=T),which(config[[fam]][[chr]] %in% c(4,5)))] <- "B"
parHap[cbind(c(rep(3,sum(config[[fam]][[chr]] %in% c(3,7,9))),rep(4,sum(config[[fam]][[chr]] %in% c(3,7,9)))),which(config[[fam]][[chr]] %in% c(3,7,9)))] <- "B"
parHap[cbind(c(rep(1,sum(config[[fam]][[chr]] %in% c(5,8,9))),rep(2,sum(config[[fam]][[chr]] %in% c(5,8,9)))),which(config[[fam]][[chr]] %in% c(5,8,9)))] <- "B"
parHap[cbind(c(sample(1:2,size=sum(config[[fam]][[chr]]==1),replace=T),sample(3:4,size=sum(config[[fam]][[chr]]==1),replace=T)),rep(which(config[[fam]][[chr]]==1),2))] <- "B"
if(any(parHap[1:2,] == "B") && parHap[1,which(apply(parHap[1:2,],2,function(x) !(all(x=='A'))))[1]] == 'B')
parHap[1:2,] <- parHap[2:1,]
if(any(parHap[3:4,] == "B") && parHap[3,which(apply(parHap[3:4,],2,function(x) !(all(x=='A'))))[1]] == 'B')
parHap[3:4,] <- parHap[4:3,]
OPGP[[chr]][[fam]] <- as.integer(parHapToOPGP(parHap))
#### Simulate the data sets
set.seed(seed2 + fam*5 + chr*119)
#### Simulate the true Meiosis for each individual at each SNP.
mIndx <- matrix(c(sample(c(0,1),size=2*nInd[fam],replace=T)),ncol=1)
for(i in 1:(nSnps[[chr]]-1)){
newmIndx_f <- numeric(nInd[fam])
newmIndx_m <- numeric(nInd[fam])
for(j in 1:(nInd[fam])){
newmIndx_f[j] <- sample(c(0,1),size=1,prob=c(rVec_f[[i]][(mIndx[j,i]==0)+1],rVec_f[[i]][(mIndx[j,i]==1)+1]))
newmIndx_m[j] <- sample(c(0,1),size=1,prob=c(rVec_m[[i]][(mIndx[j+nInd[fam],i]==0)+1],rVec_m[[i]][(mIndx[j+nInd[fam],i]==1)+1]))
}
mIndx <- cbind(mIndx,c(newmIndx_f,newmIndx_m))
}
# Determine the true genotype calls
geno <- rbind(sapply(1:nSnps[[chr]],function(x) parHap[mIndx[1:nInd[fam],x]+1,x]),sapply(1:nSnps[[chr]],function(x) parHap[mIndx[1:nInd[fam]+nInd[fam],x]+3,x]))
geno <- sapply(1:nSnps[[chr]],function(y) {
tempGeno <- geno[,y]
sapply(1:nInd[fam], function(x) paste(sort(c(tempGeno[x],tempGeno[x+nInd[fam]])),collapse=""))
})
geno <- (geno=="AA")*2 + (geno=="AB")*1
### Now generate the sequencing data
# 1: Simulate Depths
depth <- matrix(0,nrow=nInd[fam], ncol=nSnps[[chr]])
if(rd_dist=="NegBinom")
depth[which(!is.na(geno))] <- stats::rnbinom(sum(!is.na(geno)),mu=meanDepth,size=2)
else
depth[which(!is.na(geno))] <- stats::rpois(sum(!is.na(geno)),meanDepth)
# 2: simulate sequencing genotypes (with sequencing error rate of epsilon)
aCounts <- matrix(stats::rbinom(nInd[fam]*nSnps[[chr]],depth,geno/2),ncol=nSnps[[chr]])
bCounts <- depth - aCounts
aCountsFinal <- matrix(stats::rbinom(nInd[fam]*nSnps[[chr]],aCounts,prob=1-epsilon),ncol=nSnps[[chr]]) +
matrix(stats::rbinom(nInd[fam]*nSnps[[chr]],bCounts,prob=epsilon),ncol=nSnps[[chr]])
SEQgeno <- aCountsFinal/depth
SEQgeno[which(SEQgeno^2-SEQgeno<0)] <- 0.5
SEQgeno <- 2* SEQgeno ## GBS genotype call
genon[[fam]] <- cbind(genon[[fam]], SEQgeno)
ref[[fam]] <- cbind(ref[[fam]], aCountsFinal)
alt[[fam]] <- cbind(alt[[fam]], matrix(as.integer(depth-aCountsFinal), nrow=nInd, ncol=nSnps[[chr]]))
}
}
chrom <- unlist(sapply(1:noChr, function(x) rep(x,nSnps[[x]]), simplify = F))
pos <- unlist(sapply(1:noChr, function(x) 1:(nSnps[[x]]), simplify = F))
temp <- lapply(1:noFam, function(x) matrix(unlist(lapply(config[[x]], unlist)), nrow=noFam, byrow=T))
group <- summaryInfo <- list()
if(noFam == 1){
group$BI <- which(apply(temp[[1]],2,function(x) all(x==1)))
group$PI <- which(apply(temp[[1]],2,function(x) all(x %in% c(2,3))))
group$MI <- which(apply(temp[[1]],2,function(x) all(x %in% c(4,5))))
nSnps <- as.integer(sum(nSnps))
config <- list(as.vector(temp[[1]]))
## Create summary info
temp <- ref[[1]] + alt [[1]]
summaryInfo$data <- paste0(c(
"Single Family Linkage analysis:\n\n",
"Data Summary:\n",
"Data file:\t", "Simulated dataset","\n",
"Mean Depth:\t", round(mean(temp),4),"\n",
"Mean Call Rate:\t",round(sum(temp!=0)/length(temp),4),"\n",
"Number of ...\n",
" Progeny:\t",nInd,"\n",
" MI SNPs:\t",length(group$MI),"\n",
" PI SNPs:\t",length(group$PI),"\n",
" BI SNPs:\t",length(group$BI),"\n",
" Total SNPs:\t",length(unlist(group)),"\n\n"))
}
else{
group.temp <- list()
config <- lapply(config, unlist)
# count the seg types for each SNP
# MI
count_MI <- apply(sapply(1:noFam,function(y) config[[y]] %in% c(4,5)),1,sum)*apply(sapply(1:noFam,function(y) !(config[[y]] %in% c(1,2,3))),1,all)
group.temp$MI <- which(count_MI > (MNIF-1))
# PI
count_PI <- apply(sapply(1:noFam,function(y) config[[y]] %in% c(2,3)),1,sum)*apply(sapply(1:noFam,function(y) !(config[[y]] %in% c(1,4,5))),1,all)
group.temp$PI <- which(count_PI > (MNIF-1))
# BI
count_BI <- apply(sapply(1:noFam,function(y) config[[y]] %in% c(1:5)),1,sum)*(
apply(sapply(1:noFam,function(y) config[[y]] %in% c(1)),1,any) | apply(sapply(1:noFam,function(y) config[[y]] %in% c(2,3)),1,any) & apply(sapply(1:noFam,function(y) config[[y]] %in% c(4,5)),1,any))
group.temp$BI <- which(count_BI > (MNIF-1))
## create the groups
group$BI <- which(sort(unique(unlist(group.temp))) %in% group.temp$BI)
group$MI <- which(sort(unique(unlist(group.temp))) %in% group.temp$MI)
group$PI <- which(sort(unique(unlist(group.temp))) %in% group.temp$PI)
## work out which SNPs to keep
indx <- logical(sum(nSnps))
indx[unlist(group.temp, use.names = F)] <- TRUE
## Subset the data
for(fam in 1:noFam){
genon[[fam]] <- genon[[fam]][,indx]
ref[[fam]] <- ref[[fam]][,indx]
alt[[fam]] <- alt[[fam]][,indx]
config[[fam]] <- config[[fam]][indx]
}
chrom <- chrom[indx]
pos <- pos[indx]
nSnps <- sum(indx)
}
obj <- GUSbase::RA$new(
list(genon = genon, ref = ref, alt = alt, chrom = chrom, pos = pos, noFam = as.integer(noFam),
SNP_Names = NULL, indID = 1:sum(nInd), nSnps = nSnps, nInd = lapply(as.list(nInd), as.integer),
gform = "simFS", AFrq = NULL, infilename="Simulated dataset")
)
newObj <- FS$new(obj)
newObj$.__enclos_env__$private$updatePrivate(
list(group = group, config = config, noFam = noFam, summaryInfo=summaryInfo, config_orig=config,
simPara=list(OPGP=OPGP,nSnps=nSnps,config=config,meanDepth=meanDepth, nInd=nInd, seed1=seed1, seed2=seed2, ep=epsilon))
)
return(newObj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.