mpMap: Multi-Parent RIL Genetic Analysis

#' Simulate data from multi-parent designs
#' 
#' Data is simulated according to a pedigree, map and QTL model
#' @export
#' @importFrom utils head
#' @useDynLib mpMap
#' @param map Linkage map with which to generate data. See \code{\link[qtl]{sim.map}}
#' @param pedigree Pedigree for a multi-parent cross. Can be generated using \code{\link[mpMap]{sim.mpped}}
#' @param qtl QTL model, defined by a matrix with one row per QTL and 6 columns: the chromosome of the QTL, the position in cM on that chromosome, and the four founder effects
#' @param vare Phenotypic error variance
#' @param error.prob Probability of genotyping errors - data will be changed with this probability to one of the other founder values
#' @param missing.prob Probability of missing data in final genotypes
#' @param full.prob Probability of fully informative markers. Markers will be assigned with this probability to retain IBD genotypes from founders rather than being recoded into binary values. See details below for more information
#' @param keep.qtlgeno Flag for whether to retain the QTL genotypes as a component in the output \code{mpcross} object
#' @param transpos Positions of potential translocation (vector)
#' @param transFounder Which founder carries the translocation
#' @param map.function Map function for conversion of linkage map into recombination fractions. Default is "haldane"
#' @param seed Random seed for generation of data
#' @param fg Input founder genotypes (optional) - otherwise generated randomly
#' @param founderld Flag for whether to generate founder genotypes in linkage equilibrium (FALSE=default) or according to recombination map (TRUE)
#' @return Object of class \code{mpcross}. See \code{\link[mpMap]{mpcross}} for further details. Additional components are:
#' \item{ibd}{ Fully informative founder genotypes for all markers}
#' \item{qtlgeno}{ If argument \code{keep.qtlgeno} is \code{TRUE} then QTL genotypes will be retained}
#' @details Data are initially generated by transmitting founder genotypes down through the pedigree to the finals. Errors, missing data, and binary alleles are then overlaid on this data (stored in $ibd). If founderld==FALSE, binary alleles are generated at each locus with probability 0.25 that one founder will have the allele; 0.50 that two founders will have the allele; and 0.25 that three founders will have the allele. The founders with the allele are randomly selected after the number of founders with the allele has been simulated. If founderld==TRUE then some markers may be monomorphic and will need to be removed from the resulting object using \code{\link[mpMap]{clean.mpcross}}.
#' 
#' Note that if founder genotypes are input they should be coded as follows:
#' DArT markers take values in {0,1}
#' SNP markers take values in {0,2}
#' All other markers take some other set of values. 
#' @note Translocations can only be generated when founderld==FALSE and no founder genotypes are input. Note that founder effects in the QTL model are per allele; thus, the phenotypic difference between a line carrying the founder and one that is not will be twice the input founder effect (because all lines are inbred). 
#' @seealso \code{\link[mpMap]{sim.mpped}}, \code{\link[qtl]{sim.map}}
#' @examples
#' map <- qtl::sim.map(len=100, n.mar=11, eq.spacing=TRUE, include.x=FALSE)
#' sim.ped <- sim.mpped(4, 1, 500, 6, 1)
#' sim.dat <- sim.mpcross(map=map, pedigree=sim.ped, 
#'		qtl=matrix(data=c(1, 50, .4, 0, 0, 0), 
#'		nrow=1, ncol=6, byrow=TRUE), seed=1)

sim.mpcross <- function(map, pedigree, qtl=NULL, vare=1, error.prob=0, missing.prob=0, full.prob=0, keep.qtlgeno=TRUE, transpos=integer(0), transFounder=0, map.function=c("haldane", "kosambi"), seed=1, fg=NULL, founderld=FALSE)
{
	set.seed(seed)
	if(length(transFounder) != 1) stop("Input transFounder must be a single number. Zero signals no translocation. ")
	transpos <- sort(transpos)
	#Translocation has to be a single contiguous set of markers, otherwise simulation becomes much more complicated - If you have
	#more than one translocation section per chromosome you need to explicitly model the transition probabilities, in such a way that you get twice as much of the translocation as the other founders
	if(length(transpos) > 1 && any(diff(transpos) != 1))
	{
		stop("Input transpos must be a contiguous set of markers, on a single chromosome")
	}
	#Check that the translocation is within a single chromosome
	if(length(transpos) > 0)
	{
		transMarkers <- unlist(lapply(1:length(map), function(x) rep(x, length(map[[x]]))))[transpos]
		if(length(unique(transMarkers)) > 1)
		{
			stop("Input transpos must be a contiguous set of markers, on a single chromosome")
		}
	}
	pedigree <- convertped(pedigree)
	n.ind <- sum(pedigree[,4])
	genotypedLines <- which(pedigree[,4] != 0)

	map.function <- match.arg(map.function)
	if(error.prob < 0) stop("error.prob shouldn't be < 0!")
	if (error.prob < 1e-50) error.prob <- 1e-50
	if (error.prob > 1) stop("error.prob shouldn't be > 1!")
	n.founders <- sum(pedigree[,2]==0 & pedigree[,3]==0)

	## check dimensions of input founders
	if (!is.null(fg) && (nrow(fg)!=n.founders || ncol(fg) != length(unlist(map))))
	{
		stop("Input founder genotype matrix has the wrong dimensions")
	}

  pednum <- convertped(pedigree)

  #### May need to check some bug about ordering of QTL
  if (!is.null(qtl) && is.matrix(qtl)) 
      qtl <- qtl[order(qtl[, 1], qtl[, 2]), ] 
 
	if (map.function == "kosambi") mf <- kosambiX2R else mf <- haldaneX2R

  QTLm <- NULL
  newmap <- map
  if (is.null(qtl))  
    n.qtl <- 0 else  { 
 	newmap <- check_qtl(QTL=qtl, map=map, n.founders=n.founders)
	if (!is.matrix(qtl))	QTLm <- rbind(qtl) else QTLm <- qtl 
	n.qtl <- nrow(QTLm) 
    }

	adjacentRecombination <- vector(mode="numeric")
	for (i in 1:length(newmap))
	{
		adjacentRecombination <- c(adjacentRecombination, sapply(diff(newmap[[i]]), mf), 0.5)
	}
	#drop last value of 0.5
	adjacentRecombination <- head(adjacentRecombination, -1)
	genotypes <- .Call("generateGenotypes", adjacentRecombination, pedigree, transpos, transFounder, PACKAGE="mpMap")
	colnames(genotypes) <- c(paste("ibd1_", unlist(lapply(newmap, names)), sep=""), paste("ibd2_", unlist(lapply(newmap, names)), sep=""))
	
	geno <- list()
	geno$founders <- genotypes[1:n.founders,,drop=FALSE]
	geno$finals <- genotypes[genotypedLines, ,drop=FALSE]
	geno$id <- pedigree[genotypedLines,1]
	geno$fid <- pedigree[pedigree[,2] == 0 & pedigree[,3] == 0, 1]
	geno$pedigree <- pedigree
	geno$map <- newmap

  qtlgeno <- list()
  # need to select qtlgeno out of the genotypes
  if (n.qtl>0) {
    qtlcol <- grep("QTL", colnames(geno$finals))
    qtlgeno$founders <- geno$founders[,qtlcol, drop=FALSE]
    qtlgeno$finals <- geno$finals[,qtlcol, drop=FALSE]
#    geno$founders <- geno$founders[,-qtlcol, drop=FALSE]
#    geno$finals <- geno$finals[,-qtlcol, drop=FALSE]
  }

  # generate phenotype
  pheno <- generate_pheno(n.founders, qtlgeno$finals, QTLm, vare, n.ind)
  pheno <- as.data.frame(pheno)
  obsgeno <- geno 
### BEH changed this 29/10/12 to generate errors after dominant markers

  # generate dominant markers
  if (full.prob < 1 | !is.null(fg))
    obsgeno <- generate_obs(obsgeno, newmap, full.prob, fg, transpos, transFounder, founderld)

  # generate errors in the data
  if (error.prob>0) {
  if (full.prob==1) 
    obsgeno <- generate_error(obsgeno, error.prob) else
    obsgeno <- generate_error2(obsgeno, error.prob)
  }
  ### note that if you have a mixture of biallelic and fully informative
### need a new function to generate errors 

	## do changes need to be made given that the generate.obs and
	## generate.obs2 may return slightly different values? 
	## need to coordinate so dimensions are the same. 

  if (n.qtl > 0)
  {
	qtlcol <- grep("QTL", colnames(obsgeno$finals))
	obsgeno$founders <- obsgeno$founders[,-qtlcol, drop=FALSE]
	obsgeno$finals <- obsgeno$finals[, -qtlcol, drop=FALSE]
	geno$founders <- geno$founders[,-qtlcol, drop=FALSE]
	geno$finals <- geno$finals[, -qtlcol, drop=FALSE]
#  	colnames(obsgeno$founders) <- unlist(lapply(newmap, names))[-grep("QTL", unlist(lapply(newmap, names)))]
#  	colnames(obsgeno$finals) <- unlist(lapply(newmap, names))[-grep("QTL", unlist(lapply(newmap, names)))]
#  } else {
#	  colnames(obsgeno$founders) <- unlist(lapply(newmap, names))
#	  colnames(obsgeno$finals) <- unlist(lapply(newmap, names))
  }

  obsgeno <- combine_ibd(obsgeno)

  map2 <- lapply(newmap, function(x) if (length(grep("QTL", names(x))>0)) return (x[-grep("QTL", names(x))]) else return(x))
  class(map2) <- "map"

  colnames(obsgeno$founders) <- colnames(obsgeno$finals) <- unlist(lapply(map2,names))

#  colnames(obsgeno$founders) <- substr(colnames(obsgeno$founders), 6, 
#		nchar(colnames(obsgeno$founders)))
#  colnames(obsgeno$finals) <- substr(colnames(obsgeno$finals), 6, 
#		nchar(colnames(obsgeno$finals)))
  
  if (missing.prob > 0) {
    obsgeno$finals[sample(c(TRUE, FALSE), length(obsgeno$finals), replace=TRUE, 
	prob=c(missing.prob, 1-missing.prob))] <- NA
  }

  rownames(obsgeno$finals) <- paste("L", 1:n.ind, sep="")
  rownames(pheno) <- rownames(obsgeno$finals)

  # ibd values are fully informative, while observed may have dominant markers
  mpcross <- list(founders=obsgeno$founders, finals=obsgeno$finals, 
	ibd=list(founders=geno$founders, finals=geno$finals), pheno=pheno, 
	map=map2, pedigree=pedigree, id=geno$id, fid=geno$fid) 

  if (keep.qtlgeno==TRUE)
	mpcross$qtlgeno <- qtlgeno

  attr(mpcross, "type") <- paste("ri", n.founders, "self", sep="")

  class(mpcross) <- "mpcross"  
  clean(mpcross)
  rownames(mpcross$founders) <- LETTERS[1:n.founders]

  mpcross 
}