R/HotspotsCNV.R

##' HotspotsCNV: Identify Copy Number Variation (CNV) hotspots.
##'
##' Specifically designed to handle noisy data from amplified DNA on phenylketonuria (PKU) cards. The function is a pipeline using many subfunctions.
##' @title HotspotsCNV
##' @param Df: Dataframe with CNV predction with chromosome (Chr.), start, and stop position.
##' @param Freq: Minimum number of CNVs to be considered a hotspot, default = 1.
##' @param OverlapCutoff: Minimum overlap among CNVs to be considered the same CNV region, default = 0.7.
##' @param Cores: Numeric, Number of cores used, default = 1.
##' @param OverlapMin: Minimum overlap with hotspot to be selected for counting, default = 0.9.
##' @param OverlapMax: Maximum overlap with hotspot to be selected for counting, default = 1.1.
##' @param hg: Human genome version, default = hg19. For full range of possibilities, run ucscGenomes()$db {rtracklayer}
##' @return CNV hotspots.
##' @author Marcelo Bertalan, Louise K. Hoeffding, Ida Elken Sønderby.
##' @source \url{http://biopsych.dk/iPsychCNV}
##' @export
##' @examples
##' MockDataCNVs <- MockData(N=100, Type="PKU", Cores=20)
##' iPsych.Pred <- iPsychCNV(PathRawData=".", MINNumSNPs=20, Cores=20, Pattern="^MockSample", MinLength=10, Skip=0)
##' iPsych.Pred.hotspots <- HotspotsCNV(iPsych.Pred, Freq=2, OverlapCutoff=0.9, Cores=1)

HotspotsCNV <- function(df, Freq=1, OverlapCutoff=0.7, Cores=1, OverlapMin=0.9,  OverlapMax=1.1, hg="hg19")
{
	library(plyr)
	library(parallel)

	df$Length <- df$Stop - df$Start
	if(sum(df$Length < 1) > 0){ stop("You have negative length. Please check your start and stop position\n") }
	
	if(length(unique(df$ID)) < 2){ stop("Need at least two samples or no ID found.") }
	if(length(df$Source[1]) > 0){ Source <- df$Source[1] }else{ stop("Data frame does not have Source\n") }
	OriginalDF <- df

	tmp <- apply(df, 1, function(X)
	{
		Start1 <- as.numeric(X["Start"])
		Stop1 <- as.numeric(X["Stop"])
		Chr1 <- gsub(" ", "", X["Chr"])
		Length1 <- as.numeric(X["Length"])
		CNV_Pos <- paste(Chr1, Start1, Stop1, Length1, sep="_", collapse="")
		return(CNV_Pos)
	})

	tmp2 <- table(tmp)

	# selecting by freq
	tmp3 <- tmp2[tmp2 > Freq]
	tmp4 <- as.vector(tmp3)
	names(tmp4) <- names(tmp3)

	# selecting by size
	tmp <- sapply(1:length(tmp4), function(X)
	{
		cat(X, "\r")
		df1 <- unlist(strsplit(names(tmp4)[X], "_"))
		df2 <- c(df1, tmp4[X], names(tmp4)[X])
		names(df2) <- c("Chr", "Start", "Stop", "Length", "Count", "NewName")
		return(df2)
	})
	tmp4 <- as.data.frame(t(tmp), stringsAsFactors=F)
	tmp4$Source <- rep(Source, nrow(tmp4))

	cat("Running CompressCNVs", nrow(tmp4), "\n")
	tmp5 <- CompressCNVs(tmp4, OverlapCutoff, Cores) # Problem
	cat("Re-running CompressCNVs", nrow(tmp5), "\n")
	NumOfCNVs <- 0
	while(nrow(tmp5) != NumOfCNVs)
	{
		NumOfCNVs <- nrow(tmp5)
		tmp5 <- CompressCNVs(tmp5, OverlapCutoff, Cores)
		if(nrow(tmp5) == NumOfCNVs)
		{
			cat("CompressCNVs done", nrow(tmp5), NumOfCNVs,  "\n")
		}
		else
		{
			cat("Re-running CompressCNVs", nrow(tmp5), NumOfCNVs,  "\n")
		}
	}
	#save(tmp5, file="tmp5.RData")
	# Count CNVs in compressed CNV regions
	cat("Counting CNVs\n")

	#save(tmp5, file="tmp5.RData")
	CNV_Count <- mclapply(1:nrow(tmp5), mc.cores=Cores, mc.preschedule = FALSE, function(X) #tmp
	{
		tmp <- SelectSamplesFromROI(DF=OriginalDF, roi=tmp5[X,], OverlapMin=OverlapMin,  OverlapMax=OverlapMax)
		Counts <- data.frame("CN0"=0, "CN1"=0, "CN2"=0, "CN3"=0, "CN4"=0, "CN5"=0)
		tmp2 <- as.data.frame(table(tmp$CN), stringsAsFactors=F)
		tmp3 <- tmp2$Freq
		names(tmp3) <- tmp2$Var1
		names(tmp3) <- sapply(names(tmp3), function(X){ paste("CN", X, sep="", collapse="") })
		Counts[names(tmp3)] <- tmp3
		return(Counts)
	})

	CNV_Count2 <- MatrixOrList2df(CNV_Count)
	#save(CNV_Count2, file="CNV_Count.RData")
	#save(tmp5, file="tmp5.RData")
	tmp5 <- GetLocus(tmp5, hg)
	tmp5$ID <- rep("ROI", nrow(tmp5))
	tmp5$Class <- rep("ROI", nrow(tmp5))
	tmp6 <- cbind(tmp5, CNV_Count2)
	tmp6$CNTotal <- tmp6$CN0 + tmp6$CN1 + tmp6$CN3 + tmp6$CN4
	return(tmp6)
}
mbertalan/iPsychCNV documentation built on May 22, 2019, 12:19 p.m.