R/simulate.R

#' @title Simulate haploid parental genotypes along a chromosome
#' 
#' @description This function creates divergent genotypes between two parents. The first parent's genotypes
#' are denoted by a vector of zeros of length \code{L} while the second parent's genotypes are a vector of ones.
#' 
#' @param \code{snps} a vector gving the locations of snps along a chromosome
#' 
#' @return an object of class \code{parent.genomes} giving the parental ancestry at each snp. 
#' 
#' @references none.
#' 
#' @seealso \code{\link{recombine_index}}, \code{\link{recobmine}}
#' 
#' @author Tyler D. Hether
#' 
#'@importFrom Rcpp evalCpp
#'@useDynLib HMMancestry
#' @export make_parents
#' 
#' @examples
#' make_parents(snps=c(1:10)*10)

make_parents <- function(snps){
	if(length(snps)<2){
		stop("The number of loci to simulate needs to be >= 2")
	}

	# Code the parent genomes (0 or 1)
	p1 <- rep(0, length(snps))
	p2 <- rep(1, length(snps))

	out <- list(snps=snps, p1=p1, p2=p2)
	class(out) <- c("list", "parent.genomes")
	return(out)
}

#' @title Simulate recombination points along a chromosome.
#' 
#' @description Simulate recombination events given the vector of snp locations and recombination rate(s). 
#' Haldane's function is used to simulate the probabilty of getting an odd number of crossover events between
#' any two snps given their physical distance and the specified recombination rate.
#' 
#' @param \code{scale} either vector of length 1 specifying the genome-wide recombination rate (Morgans/bp)
#' or a vector of length \code{snps-1} specifying the recombination rate between all snps. In either
#' case rates must be positive.
#'
#' @param \code{snps} a vector gving the locations of snps along a chromosome
#' 
#' @return a vector of length \code{snps-1} specifying the location of recombiantion points.
#' 
#' @seealso \code{\link{make_parents}}
#' 
#' @author Tyler D. Hether
#' 
#' @export recombine_index
#' 
#' @examples
#' 
#' set.seed(1234567)
#' # Simulating recombination across 1000 loci randomly spaced 
#' # along a 200kbp chromosome
#' chromSize <- 2e5 		# Chromosome size
#' l <- 1e3 				# number of loci to simulate
#' c <- 1e-04 				# Morgans/bp
#' snps <- sort(sample(size=l, 1:chromSize, replace=FALSE))
#' # Find recombination points between all the snps
#' recomb_points <- recombine_index(scale=c, snps=snps)
#' recomb_points

recombine_index <- function(scale, snps){

	if(length(scale)==1){
		c <- rep(scale, length(snps)-1)
	} else {
		c <- scale
	}
	if(!all(scale >=0)){
		stop("all elements in scale need to be a positive rate (Morgans / bp)")
	}		
	if(length(snps)<2){
		stop("the vector of snps needs to be greater than 1 in length")
	}
	if(length(c)!=(length(snps)-1)){
		stop(paste("scale needs to either a single rate or vector of length ", length(snps)-1 ," giving the rates between each snp", sep=""))
	}
	if(sum(snps-sort(snps))!=0){
		warning("Input snps were not sorted")
		snps <- sort(snps)
	}
	haldane <- function(d){ (1/2)*(1- exp(-2*d))}
	displacement <- sapply(2:length(snps), function(x){
		return(snps[x]-snps[x-1])
		})
	if(!all(displacement>0)){
		stop("The physical distance between all snps must be >0")
	}

	# 1==recombination occurs, 0==no recombination
	out <- sapply(1:length(displacement), function(i) rbinom(n=1, size=1, prob=haldane(displacement[i]*c[i])))
	return(out)
}


#' @title Simulate recombination (crossovers or non-crossovers) along a chromosome
#' 
#' @description This function simulates recombination between two parent chromosomes generated from class
#' \code{parent.genomes}.
#' 
#' @param parents an object of class \code{parent.genomes} specifying the parental ancestry 
#' at each simulated snp.
#' 
#' @param r.index a vector of length \code{snps-1} specifying whether a recombination is to be 
#' simulated (1) or not (0) in between two adjacent snps. 
#' 
#' @param mu.rate a numeric between 0 and 1 (inclusive) specifying the per snp mutation rate. 
#' 
#' @param f.cross a numeric between 0 and 1 (inclusive) giving the frequency of recombination 
#' events that result in crossing over. This is same as 1 minus the frequency of non-crossovers.
#' 
#' @param f.convert a numeric between 0 and 1 (inclusive) that gives the frequency of gene conversion 
#' during recombination.
#' 
#' @param length.conversion an integer specifying the mean (and variance) of a given gene conversion 
#' tract (in bps).
#' 
#' @details This function simulates crossover events (with or without gene conversions)
#' and non-crossover events given the parental snps and location of recombination events.
#' First, parental chromosomes are duplicated into 2 pairs of sister chromatids. During this step
#' mutations occur at rate \code{mu.rate} that switch parental alleles. Second, recombination is
#' simulated by systematically going down the length of the chromosome. At recombination points, idenitfied in \code{r.index},
#' two non-sister chromatids are picked at random. Third, a CO or NCO is simulated; this is done
#' at a frequency of f.cross or 1-f.cross, respectively. If a CO occurs, the genotypes of the 3'
#' end of the non-sister chromatids are simply switched. If a NCO occurs, no such switch takes place. 
#' Note that is is impossible to detect NCO events without an associated gene conversion tract. 
#' Following every CO or NCO event gene conversion is simulated at a rate of f.convert. During a 
#' gene conversion, one of the two unpicked chromatids' genotypes are switched. The length of the 
#' gene conversion tract is sampled from a poission distribution with mean (variance) of length.conversion.
#' Note that the gene conversion tract will stop at the end of the chromosme if necessary. 
#' 
#' @return an object of class \code{recombine} that contains the following data:
#' \itemize{
#' 	\item{parents}{ input data}
#' 	\item{r.index}{ input data}
#' 	\item{m.rate}{ input data}
#' 	\item{f.cross}{ input data}
#' 	\item{f.convert}{ input data}
#' 	\item{length.conversion}{ input data}
#' 	\item{chromatids.mutated_snps}{ a list giving the genotypes of 4 chromatids (two pairs of sister chromatids) following mutation}
#' 	\item{chromatids.recombined}{ a list giving the genotypes of 4 chromatids following recombination}
#' }
#' 
#' 
#' @seealso \code{\link{recombine_index}}
#' 
#' @author Tyler D. Hether
#' 
#' @export recombine
#' 
#' @examples
#' set.seed(1234567)        # For reproducibility
#' l <- 50                  # number of snps to simulate
#' c <- 3e-05               # recombination rate between snps (Morgan/bp)
#' snps <- c(1:l)*1e4       # snps are evenly spaced 10kbp apart
#' p <- make_parents(snps)  # make the parents
#' #
#' # The recombination indeces are:
#' r_points <- recombine_index(scale=c, snps=snps) 
#' #
#' # Now recombine the parents:
#' recomb_sim <- recombine(parents=p, r.index=r_points, mu.rate=1e-05, 
#' 	f.cross=0.6, f.convert=0.2, length.conversion=15000) 
#' recomb_sim

recombine <- function(parents, r.index, mu.rate=0, f.cross=0.5, f.convert=0, length.conversion=20){
	if(!inherits(parents, "parent.genomes")){
		stop(paste("Object ", parents, " needs to be of class parent.genomes", sep=""))
	}
	l <- length(parents$p1)
	p <- parents 
	# S-phase and mutation
	chromatids.orig <- list(p1_1=parents$p1, p1_2=parents$p1, p2_1=parents$p2, p2_2=parents$p2)
	chromatids.mutated_snps <- lapply(chromatids.orig, function(i, ...){
		mutate_snps(i, mu=mu.rate)
		})
	
	# Recombine chromatids:
	chromatids.recombined <- chromatids.mutated_snps
	# type = 1
	for(i in which(r.index==1)){
		# Cases where there is a recombination event, pick
		# two of the four chromatids to recombine:

		# if(i %in% which(r.index==1)){
			# If recobomination with sister chromatids is allowed:
			# picked.chromatids <- sample(c(1:4), 2, replace=FALSE)

			# If recobomination only with non-sister chromatids is allowed:
			vals <- as.numeric(as.data.frame(chromatids.recombined)[i,])

			# In some rare cases of frequent gene conversion, vals will not have two 0s and two 1s. 
			# If that's the case, sample 1 or 2 and 3 or 4. 
			if(sum(vals)==2){
				picked.chromatids <- c(sample(which(vals==0), 1), sample(which(vals==1), 1))
			} else {
				picked.chromatids <- c(sample(c(1,2), 1), sample(c(3,4), 1))
			}

			# Recombine them:
			chromatids <- data.frame(one=chromatids.recombined[[picked.chromatids[1]]], 
					   two=chromatids.recombined[[picked.chromatids[2]]])
			one <- chromatids[(i+1):l,1]
			two <- chromatids[(i+1):l,2]
				# Does recombination result in a crossover (1) or non-crossover (0)?
				to_cross <- cross_vs_noncross(f.cross=f.cross)
				if(to_cross==1){
					chromatids[(i+1):l,] <- cbind(two, one)
				}			
				# Is there gene conversion at this recombination point?
				if(sample(c(1,0),1,prob=c(f.convert,1-f.convert))==1){
					# simulate gene conversion on one of the non-picked chromatids:
					to_convert <- sample(c(1:4)[-picked.chromatids],1) # non-picked chromotid
					length_of_conversion <- rpois(n=1,lambda=length.conversion)
					# if the length of the tract extends pass the chromosome then stop at the end of the chromosome.
					# Otherwise, create a gene conversion tract of length length_of_conversion starting at the 

					# find which snps are to the 3' of i, x
					x = which(p$snp>p$snp[i])
					# and the snps that are 5' of (length_of_conversion+i), y
					y = which(p$snps<(p$snps[i]+length_of_conversion))
					# If there are snps in both cats then they will be converted:
					if(length(intersect(x,y))!=0){
						chromatids.recombined[[to_convert]][c(intersect(x,y))] <- sapply(chromatids.recombined[[to_convert]][c(intersect(x,y))], function(j){switch_values(j)})
					}
				}
			# Save results:
			chromatids.recombined[[picked.chromatids[1]]] <- chromatids[,1]
			chromatids.recombined[[picked.chromatids[2]]] <- chromatids[,2]
		# }
	}

	out <- list(parents=parents, r.index=r.index, mu.rate=mu.rate, f.cross=f.cross, f.convert=f.convert, length.conversion=length.conversion,
			chromatids.mutated_snps=chromatids.mutated_snps, chromatids.recombined=chromatids.recombined)
	class(out) <- c("list", "recombine")
	return(out)	
}

#' @title Simulate sequencing across simulated data:
#' 
#' @description This function populates simulated data of class \code{recombine} with
#' read counts given a specified coverage and assignment probability.
#' 
#' @param simdata an object of class \code{recombine}
#' 
#' @param p.assign a numeric between 0 and 1 (inclusive) that gives the probability of 
#' correct sequencing assignment. A value of 1 means that no sequencing error or ancestral 
#' polymorphism is to be simulated. 
#' 
#' @param coverage the mean (and variance) of sequencing read coverage to be simlated.  \code{coverage} is 
#' sampled from a poisson distribution.
#'
#' @return an object of class \code{simulated.coverage} providing the a list of simulated sequencing reads for 
#' each parental type for each of the four haploid recombinants. There are four lists in the output -- one for
#' each recombinant. Within each list the read counts for parent 0 and parent 1 are given as well as the snp locations
#' and given (simulated) states. 
#' 
#' @seealso \code{\link{recombine}}
#' 
#' @author Tyler D. Hether 
#' 
#' @export simulate_coverage
#' 
#' @examples
#' set.seed(1234567)        # For reproducibility
#' l <- 50                  # number of snps to simulate
#' c <- 3e-05               # recombination rate between snps (Morgan/bp)
#' snps <- c(1:l)*1e4       # snps are evenly spaced 10kbp apart
#' p_a <- 0.97              # assignment probability
#' coverage <- 1            # mean coverage
#' p <- make_parents(snps)  # make the parents
#' #
#' # The recombination indeces are:
#' r_points <- recombine_index(scale=c, snps=snps) 
#' #
#' # Now recombine the parents:
#' recomb_sim <- recombine(parents=p, r.index=r_points, mu.rate=1e-05, 
#' 	f.cross=0.6, f.convert=0.2, length.conversion=15000) 
#' # And finally simulated read coverage for each haploid recombinant
#' sim_cov <- simulate_coverage(simdata=recomb_sim, 
#'  p.assign=p_a, coverage=coverage) 
#' sim_cov

simulate_coverage <- function(simdata, p.assign, coverage){
	if(!inherits(simdata, "recombine")){
		stop(paste("Object", simdata, " needs to be of class 'recombine'.", sep=""))
	}

	simulated.reads.total <- lapply(simdata$chromatids.recombined, function(i) return(rpois(i, coverage)))
	
	# This block returns the number of simulated reads for each parental type, 0 and 1, for each snp position.
	out <- lapply(1:4, function(i){
		correct_reads <- rbinom(size=simulated.reads.total[[i]], prob=p.assign, n=simulated.reads.total[[i]])
		incorrect_reads <- simulated.reads.total[[i]] - correct_reads 

		p0.assign <- sapply(1:length(simdata$chromatids.recombined[[i]]), function(x){
				if(simdata$chromatids.recombined[[i]][x]==0){
					return(correct_reads[x])
				}
				if(simdata$chromatids.recombined[[i]][x]==1){
					return(incorrect_reads[x])
				}
			})
		p1.assign <- sapply(1:length(simdata$chromatids.recombined[[i]]), function(x){
				if(simdata$chromatids.recombined[[i]][x]==1){
					return(correct_reads[x])
				}
				if(simdata$chromatids.recombined[[i]][x]==0){
					return(incorrect_reads[x])
				}
			})
		return(list(p0.assign=p0.assign, p1.assign=p1.assign, snps=simdata$parents$snps, states_given=simdata$chromatids.recombined[[i]]))

	})
	out[['snps']] <- simdata$parents$snps

	class(out) <- c("list", "simulated.coverage")
	return(out)
}

#' @title Simulate recombination across a given number of meiosis events
#' 
#' @description This is a wrapper function of many other functions that simulates a given number of
#' tetrads each with four haploid spores that are recombinants between two parents.  
#' 
#' @param n.tetrads An integer specifying the number of tetrads to simulate.
#' 
#' @param \code{scale} either vector of length 1 specifying the genome-wide recombination rate (Morgans/bp)
#' or a vector of length \code{snps-1} specifying the recombination rate between all snps. In either
#' case rates must be positive.
#'
#' @param \code{snps} a vector gving the locations of snps along a chromosome
#'
#' @param p.assign a numeric between 0 and 1 (inclusive) that gives the probability of 
#' correct sequencing assignment. A value of 1 means that no sequencing error or ancestral 
#' polymorphism is to be simulated. 
#' 
#' @param mu.rate a numeric between 0 and 1 (inclusive) specifying the per snp mutation rate. 
#' 
#' @param r.index a vector of length \code{snps-1} specifying whether a recombination is to be 
#' simulated (1) or not (0) in between two adjacent snps. 
#' 
#' @param f.cross a numeric between 0 and 1 (inclusive) giving the frequency of recombination 
#' events that result in crossing over. This is same as 1 minus the frequency of non-crossovers.
#' 
#' @param f.convert a numeric between 0 and 1 (inclusive) that gives the frequency of gene conversion 
#' during recombination.
#' 
#' @param length.conversion an integer specifying the mean (and variance) of a given gene conversion 
#' tract (in bps).
#'
#' @param chr.name a numeric or character value specifying the chromosome name (default to "I") 
#' 
#' @return A data.frame of class \code{tetrad}. Each row contains the following information:
#' \itemize{
#' 	\item{Tetrad}{ The tetrad ID}
#' 	\item{Spore}{ The spore ID (1:4)}
#' 	\item{Chr}{ The chromosome name}
#'  \item{Snp}{ The snp location (in bps)}
#'  \item{p0}{ The number of reads that mapped to parent 0}
#'  \item{p1}{ The number of reads that mapped to parent 1}
#'  \item{states_given}{ The simulated states (0 or 1)}
#' } 
#' 
#' @seealso \code{\link{recombine}}, \code{\link{make_parents}}, 
#' \code{\link{simulate_coverage}}, \code{\link{recombine_index}}, \code{\link{id_hotspots}}
#' 
#' @author Tyler D. Hether 
#' 
#' @export sim_tetrad
#' 
#' @examples
#' # Simulating 100 recombination events
#' set.seed(1234567)        # For reproducibility
#' n_tetrads <- 100         # number of tetrads (meiosis events)
#' l <- 50                  # number of snps to simulate
#' c <- 3e-05               # recombination rate between snps (Morgan/bp)
#' snps <- c(1:l)*2e4       # snps are evenly spaced 20kbp apart
#' p_a <- 0.97              # assignment probability
#' coverage <- 1            # mean coverage
#' # Now simulate
#' sim100 <- sim_tetrad(n.tetrads=n_tetrads, scale=c, snps=snps, 
#' 	p.assign=p_a, mu.rate=0, f.cross=0.8, f.convert=0.3, 
#' 	length.conversion=2e3, coverage=coverage)
#' sim100

sim_tetrad <- function(n.tetrads, scale, snps, p.assign, mu.rate, f.cross, f.convert, 
        length.conversion, coverage, chr.name="I"){
	parents <- make_parents(snps)
    
    res <- lapply(1:n.tetrads, function(Z, ...){

        r <- recombine_index(scale, snps)

        recomb_sim <- recombine(parents=parents, r.index=r, mu.rate=mu.rate, f.cross=f.cross, 
                f.convert=f.convert, length.conversion=length.conversion)
        sim_reads <- simulate_coverage(simdata=recomb_sim, p.assign=p.assign, coverage=coverage)
        class(sim_reads) <- list("individual.tetrad")
        return(sim_reads)
    })
    
    # Convert to a dataframe:
    dat0 <- lapply(1:length(res), function(x){

      one <- res[[x]][[1]]
      two <- res[[x]][[2]]
      three <- res[[x]][[3]]
      four <- res[[x]][[4]]

      return(rbind(
        data.frame(Tetrad=x, Spore=1, Chr=chr.name, Snp=one$snps, p0=one$p0.assign, p1=one$p1.assign, states_given=one$states_given),
        data.frame(Tetrad=x, Spore=2, Chr=chr.name, Snp=two$snps, p0=two$p0.assign, p1=two$p1.assign, states_given=two$states_given),
        data.frame(Tetrad=x, Spore=3, Chr=chr.name, Snp=three$snps, p0=three$p0.assign, p1=three$p1.assign, states_given=three$states_given),
        data.frame(Tetrad=x, Spore=4, Chr=chr.name, Snp=four$snps, p0=four$p0.assign, p1=four$p1.assign, states_given=four$states_given)
      ))

      })
    out <- do.call(rbind, dat0)

    class(out) <- c("data.frame", "tetrad")
    return(out)
}


#' @title Simulate random spores en masse
#' 
#' @description This is a wrapper function of many other functions that simulates a given number of
#' haploids each recombinant between two parents. As the name implies, information of the all four 
#' products are lost since spores are random. See \code{\link{sim_tetrad}} if all four haploid recombinants 
#' are desired.
#' 
#' @param n.spores An integer specifying the number of spores to simulate.
#'  
#' @param \code{scale} either vector of length 1 specifying the genome-wide recombination rate (Morgans/bp)
#' or a vector of length \code{snps-1} specifying the recombination rate between all snps. In either
#' case rates must be positive.
#'
#' @param \code{snps} a vector gving the locations of snps along a chromosome
#'
#' @param p.assign a numeric between 0 and 1 (inclusive) that gives the probability of 
#' correct sequencing assignment. A value of 1 means that no sequencing error or ancestral 
#' polymorphism is to be simulated. 
#' 
#' @param mu.rate a numeric between 0 and 1 (inclusive) specifying the per snp mutation rate. 
#' 
#' @param r.index a vector of length \code{snps-1} specifying whether a recombination is to be 
#' simulated (1) or not (0) in between two adjacent snps. 
#' 
#' @param f.cross a numeric between 0 and 1 (inclusive) giving the frequency of recombination 
#' events that result in crossing over. This is same as 1 minus the frequency of non-crossovers.
#' 
#' @param f.convert a numeric between 0 and 1 (inclusive) that gives the frequency of gene conversion 
#' during recombination.
#' 
#' @param length.conversion an integer specifying the mean (and variance) of a given gene conversion 
#' tract (in bps).
#'
#' @param chr.name a numeric or character value specifying the chromosome name (default to "I") 
#' 
#' @return A data.frame of class \code{en.masse}. Each row contains the following information:
#' which contains three elements: 
#' \itemize{
#' 	\item{Spore}{ The spore ID (1:length(n.spores))}
#' 	\item{Chr}{ The chromosome name}
#'  \item{Snp}{ The snp location (in bps)}
#'  \item{p0}{ The number of reads that mapped to parent 0}
#'  \item{p1}{ The number of reads that mapped to parent 1}
#'  \item{states_given}{ The simulated states (0 or 1)}
#' } 
#' 
#' @seealso \code{\link{recombine}}, \code{\link{make_parents}}, 
#' \code{\link{simulate_coverage}}, \code{\link{recombine_index}}, \code{\link{id_hotspots}}
#' 
#' @author Tyler D. Hether 
#' 
#' @export sim_en_masse
#' 
#' @examples
#' # Simulating 5 haploid recombinants
#' set.seed(1234567)        # For reproducibility
#' n_spores <- 5          # number of tetrads (meiosis events)
#' l <- 75                  # number of snps to simulate
#' c <- 3.5e-05             # recombination rate between snps (Morgan/bp)
#' snps <- c(1:l)*1.3e4     # snps are evenly spaced 20kbp apart
#' p_a <- 0.95              # assignment probability
#' coverage <- 2.1          # mean coverage
#' # Now simulate
#' sim5 <- sim_en_masse(n.spores=n_spores, scale=c, snps=snps, 
#' 	p.assign=p_a, mu.rate=0, f.cross=0.8, f.convert=0.3, 
#' 	length.conversion=2e3, coverage=coverage)
#' sim5

sim_en_masse <- function(n.spores, scale, snps, p.assign, mu.rate, f.cross, f.convert, 
		length.conversion, coverage, chr.name="I"){
	
	parents <- make_parents(snps)
    
    res <- lapply(1:n.spores, function(Z, ...){

        r <- recombine_index(scale, snps)

        recomb_sim <- recombine(parents=parents, r.index=r, mu.rate=mu.rate, f.cross=f.cross, 
                f.convert=f.convert, length.conversion=length.conversion)
        sim_reads <- simulate_coverage(simdata=recomb_sim, p.assign=p.assign, coverage=coverage)
        
        return(sim_reads[[sample(1:4, 1)]])
    })
  
    dat0 <- lapply(1:length(res), function(x){

	  return(data.frame(Spore=x, Chr=chr.name, Snp=res[[x]]$snps, p0=res[[x]]$p0, p1=res[[x]]$p1, states_given=res[[x]]$states_given))

      })
    out <- do.call(rbind, dat0)

    class(out) <- c("data.frame", "en.masse")
    return(out)
}

# Minor functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Function to make sure values in a vector, a, are between x and y, inclusive:
check_values <- function(a,x,y){
	if(max(a)>y){
		stop(paste("The maximum of 'a' needs to be <", y, sep=""))
	}
	if(min(a)<x){
		stop(paste("The minimum of 'a' needs to be >", x,sep=""))
	}
}

# Functions to simulate mutation for a vector of snps
switch_values <- function(i){
	if(i==0){
		return(1)
	}
	if(i==1){
		return(0)
	}
}

mutate_snps <- function(xx, mu=mu){
	# MLG <- multilocus genotype
	ll <- length(xx)
	to.mutate_snps <- sample(x=c(0,1), ll, prob=c(1-mu,mu), replace=TRUE)
	if(sum(to.mutate_snps>0)){
		xx[which(to.mutate_snps==1)] <- sapply(xx[which(to.mutate_snps==1)], switch_values)
	}
	return(xx)
}

# Given the frequency of crossover events, f.cross, sample whether one is to occur 
#     (1=yes, crossover; 0=no, non-crossover).
cross_vs_noncross <- function(f.cross){
	sample(c(1,0), 1, prob=c(f.cross, 1-f.cross))
}
tylerhether/fbgenotyper documentation built on May 3, 2019, 1:53 p.m.