R/MockData.R

##' MockData: Unknown. 
##'
##' Specifically designed to handle noisy data from amplified DNA on phenylketonuria (PKU) cards. The function is a pipeline using many subfunctions.
##' @title MockData
##' @param N: Unknown, default = 1.
##' @param Wave1: Unknown, default = FALSE.
##' @param Type: Unknown, default = Blood.
##' @param Cores: Number of cores used, default = 1.
##' @return Data frame with predicted CNVs.
##' @author Marcelo Bertalan, Louise K. Hoeffding. 
##' @source \url{http://biopsych.dk/iPsychCNV}
##' @export
##' @examples Unknown.
##'

MockData <- function(N=1, Wave1=FALSE, Type="Blood", Cores=1) # Type: Blood or PKU (Perfect or noise)
{
	# Use Wave1 PFB ? Wave1PFB comes with the package.
	if(!Wave1)
	{
		WaveTmp <- rep(0.5, length(Wave1PFB))
		names(WaveTmp) <- names(Wave1PFB)
		Wave1PFB <- WaveTmp
	}
		
	# CNV Info: Using always the same position. Rdata from the package.
	CNVsSize <- c(50, 100, 150, 300, 450, 600)
	CNVSizeFixed <- sample(CNVsSize, 1000, replace=TRUE)
	names(CNVSizeFixed) = 1:1000
	
	# Always the same CN
	CNFixed <- sample(c(0,1,1,2,2,2,3,3,4), 1000, replace=TRUE) # CNV Type
	names(CNFixed) = 1:1000

	List <- GetMockValues(Type=Type)
	
	Del <- List[["Del"]]
	Dup <- List[["Dup"]] 
	ChrMean <- List[["ChrMean"]]
	ChrMeanProb <- List[["ChrMeanProb"]]
	ChrSD <- List[["ChrSD"]]
	ChrSDProb <- List[["ChrSDProb"]]
	TelomereNoiseSize <- List[["TelomereNoiseSize"]]
	TelomereNoiseEffect <- List[["TelomereNoiseEffect"]]
	
	BadSNPs <- List[["BadSNPs"]]
	BadSNPIntensity <- List[["BadSNPIntensity"]]
	BadSNPIntensityProb <- List[["BadSNPIntensityProb"]]
	
	
	suppressPackageStartupMessages(library(parallel))

	All <- mclapply(1:N, mc.cores=Cores, mc.preschedule = FALSE, function(SampleNum)
	{
		FileName <- paste("MockSample_", SampleNum, ".tab", sep="", collapse="")
	
		tmp <- sapply(unique(CNV$Chr), function(CHR) # Chromosome loop
		{
			# CNV is a RData from the package with the Psycho chip information. 
			# It has SNP position and names. It is used to guide the mock data.
			subCNV <- subset(CNV, Chr %in% CHR) 
			subCNV <- subCNV[order(subCNV$Position),]
			Position <- subCNV$Position
			SNP.Name <- subCNV$SNP.Name
	
			ChrLength <- nrow(subCNV)
			SD=sample(ChrSD, 1, prob=ChrSDProb) # chr sd
			ChrMEAN <- sample(ChrMean[,as.numeric(CHR)], prob=ChrMeanProb[,as.numeric(CHR)], replace=TRUE, size=1)
			#X <- sample(ChrMean[,as.numeric(CHR)], prob=ChrMeanProb[,as.numeric(CHR)], replace=TRUE, size=ChrLength)
			X <- rnorm(n=ChrLength, mean=ChrMEAN, sd=SD)
			
			#
			CNPosition <- rep(2, ChrLength)
			
			
			#Heterozygosity <- sample(seq(from=6, to=30,by=2), size=1, prob=c(0.05,0.1,0.15,0.07,0.05,0.025))
			Heterozygosity <- sample(c(7,10,13,16,20,25,30), size=1)
			BAF_Info <- MakeBAF(Heterozygosity=Heterozygosity)
			BAFs <- BAF_Info[["BAFs"]]
			BAF_Normal <- BAF_Info[["BAF_Normal"]]
			BAF_Del <- BAF_Info[["BAF_Del"]]
			BAF_Dup <- BAF_Info[["BAF_Dup"]]
			BAF_CN4 <- BAF_Info[["BAF_CN4"]]
			BAF_CN0 <- BAF_Info[["BAF_CN0"]]
			
			# Telomere noise
			TelSize1 <- sample(TelomereNoiseSize, 1)
			TelEffect1 <- sample(TelomereNoiseEffect, 1)
			TelSize2 <- sample(TelomereNoiseSize, 1)
			TelEffect2 <- sample(TelomereNoiseEffect, 1)
			# Begining of chromosome
			X[1:TelSize1] <- X[1:TelSize1] + TelEffect1  	
			# End of chrosmoome
			X[(length(X) - TelSize2):length(X)] <- X[(length(X) - TelSize2):length(X)] + TelEffect2
			
			BAF <- sample(BAFs, prob=BAF_Normal, replace=TRUE, size=length(X))
			names(BAF) <- SNP.Name
		
			# Adding CNVs		
			NumCNVs <- ((round(length(X)/1000))-2)
			#cat(NumCNVs, "NumCNVs\n")
			DF <- sapply(1:NumCNVs, function(i) # Adding CNVs in the data.
			{
				PositionIndx <- as.numeric(i) * 1000

				# Using fix size for chr position.
				Size <- CNVSizeFixed[i]
				CN <- CNFixed[i]
				
				# CNV position
				IndxV <- PositionIndx:(PositionIndx+Size)
				
				if(CN == 1)
				{
					Impact <- sample(Del, 1)
					BAFCNV <- sample(BAFs, prob=BAF_Del, replace=TRUE, size=(Size+1))
				}
				if(CN == 3)
				{
					Impact <- sample(Dup, 1)
					BAFCNV <- sample(BAFs, prob=BAF_Dup, replace=TRUE, size=(Size+1))
				}
				if(CN == 0)
				{
					Impact <- sample(Del, 1)
					BAFCNV <- sample(BAFs, prob=BAF_CN0, replace=TRUE, size=(Size+1))
				}
				if(CN == 4)
				{
					Impact <- sample(Dup, 1)
					BAFCNV <- sample(BAFs, prob=BAF_CN4, replace=TRUE, size=(Size+1))
				}
				if(CN == 2) # Noise data. BAF can not match with LRR
				{
					NoiseLRR <- sample(c(1,1,1,1,3), 1) # More likely to have low DNA resulting in lower signal.
					if(NoiseLRR == 1){ Impact <- sample(Del, 1) }
					if(NoiseLRR == 3){ Impact <- sample(Dup, 1) }
					
					if(NoiseLRR == 1)
					{
						NoiseBAF <- sample(c(2,2,2,2,2,2,3,4), 1) # BAF 2 the most common one.
						if(NoiseBAF == 3){ BAFCNV <- sample(BAFs, prob=BAF_Dup, replace=TRUE, size=(Size+1)) }
						if(NoiseBAF == 4){ BAFCNV <- sample(BAFs, prob=BAF_CN4, replace=TRUE, size=(Size+1)) }
						if(NoiseBAF == 2){ BAFCNV <- sample(BAFs, prob=BAF_Normal, replace=TRUE, size=(Size+1)) }
					}
					else
					{	
						NoiseBAF <- sample(c(2,2,2,2,1,1,1,0), 1)
						if(NoiseBAF == 0){ BAFCNV <- sample(BAFs, prob=BAF_CN0, replace=TRUE, size=(Size+1)) }
						if(NoiseBAF == 1){ BAFCNV <- sample(BAFs, prob=BAF_Del, replace=TRUE, size=(Size+1)) }
						if(NoiseBAF == 2){ BAFCNV <- sample(BAFs, prob=BAF_Normal, replace=TRUE, size=(Size+1)) }
					}
				}
				
				## Changing GLOBAL VARIABLES ##
				# LRR = X
				X[IndxV] <<- X[IndxV] + Impact
				
				## Changing GLOBAL VARIABLES ##
				BAF[IndxV] <<- BAFCNV
				
				## Changing GLOBAL VARIABLES ##
				CNPosition[IndxV] <<- CN
				
				df <- data.frame(Start=Position[PositionIndx], Stop=Position[(PositionIndx+Size)], StartIndx=PositionIndx, StopIndx=(PositionIndx+Size), NumSNPs=Size, Chr=CHR, CNVmean=Impact, CN=CN, sd=SD, ID=FileName, NumCNVs=NumCNVs, ChrMean=ChrMEAN, Heterozygosity=Heterozygosity, stringsAsFactors=FALSE)
				return(df)
			})
			df <- MatrixOrList2df(DF)
			#save(df, file="df.RData")
			df2 <- data.frame(SNP.Name=SNP.Name, Chr=rep(CHR, length(X)), Position=Position, Log.R.Ratio=X, B.Allele.Freq=BAF, CN=CNPosition, stringsAsFactors=FALSE)
			return(list(LRR=df2, CNVs=df))
		})
		
		DF <- MatrixOrList2df(tmp["CNVs",])
		LRR <- MatrixOrList2df(tmp["LRR",])
	
		write.table(LRR, sep="\t", quote=FALSE, row.names=FALSE, file=FileName) 
		return(DF)
	})
	
	CNVs <- MatrixOrList2df(All)
	#save(CNVs, file="CNVs.RData")
	CNVs$Length <- CNVs$Stop -  CNVs$Start
	CNVs$CNVID <- 1:nrow(CNVs)
	CNVs$PositionID <- apply(CNVs, 1, function(X){ gsub(" ", "", paste(X["StartIndx"], X["StopIndx"], sep="_", collapse="")) })

	# Adding No-CNVs
	tmp3 <- sapply(unique(CNVs$ID), function(X)
	{
		tmp <- sapply(unique(CNVs$Chr), function(Y)
		{
			# selecting the CNVs
			subCNVs <- subset(CNVs, ID %in% X & Chr %in% Y)
			# Selecting data info from chip
			subCNV <- subset(CNV, Chr %in% Y)
			
			indx <- sort(c(1, subCNVs$StartIndx, subCNVs$StopIndx, nrow(subCNV)))
			Info <- sapply(1:(length(indx)-1), function(X){ rbind(indx[X],indx[(X+1)]) })
			Info <- t(Info)
			Df <- as.data.frame(Info)
			names(Df) <- c("StartIndx", "StopIndx")
			PositionID <- apply(Df, 1, function(X){ gsub(" ", "", paste(X[1], X[2], sep="_", collapse="")) })
			Df$PositionID <- PositionID
			Df2 <- Df[!Df$PositionID %in% subCNVs$PositionID,] # Only the non-CNVs

			subCNV <- subset(CNV, Chr %in% Y)
			subCNV <- subCNV[order(subCNV$Position),]
			Df2$Start <- subCNV$Position[Df2$StartIndx]
			Df2$Stop <- subCNV$Position[Df2$StopIndx]
			Df2$NumSNPs <- Df2$StopIndx - Df2$StartIndx
			Df2$Chr <- rep(Y, nrow(Df2))
			Df2$CNVmean <- rep(0, nrow(Df2))
			Df2$CN <- rep(2, nrow(Df2))
			Df2$sd <- rep(0.2, nrow(Df2))
			Df2$ID <- rep(X, nrow(Df2))
			Df2$NoiseSNP <- rep(unique(subCNVs$NoiseSNP), nrow(Df2))
			Df2$BadSNPs <- rep(unique(subCNVs$BadSNPs), nrow(Df2))
			Df2$NumCNVs <- rep(unique(subCNVs$NumCNVs), nrow(Df2))
			Df2$ChrMean <- rep(unique(subCNVs$ChrMean), nrow(Df2))
			Df2$Length <- Df2$Stop - Df2$Start
			Df2$CNVID <- 1:nrow(Df2)
			Df2$Heterozygosity <- rep(unique(subCNVs$Heterozygosity), nrow(Df2))
			Df2 <- Df2[, colnames(subCNVs)]
			return(Df2)
		})
		tmp2 <- MatrixOrList2df(tmp)
		tmp2 <- tmp2[, !colnames(tmp2) %in% ".id"]
		return(tmp2)
	})
	tmp3 <- MatrixOrList2df(tmp3)
	tmp3 <- tmp3[, !colnames(tmp3) %in% ".id"]

	tmp4 <- rbind(tmp3, CNVs)
	tmp4 <- tmp4[order(tmp4$ID, tmp4$Chr, tmp4$Start),]
	
	return(tmp4)
}	
mbertalan/iPsychCNV documentation built on May 22, 2019, 12:19 p.m.