R/deletionCategory.R


#' Counts how many missing values are in the list.
#'
#' @param x: The list of values. It is coerced to a numeric vector.
#'
#' @return The count of missing values (entries equal to 0.0)

countMissing <-function(x){
	numT<-as.numeric(x)
	ret <- length(numT[numT==0.0])
	ret
}
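
# Example (minimal sketch): countMissing() reports how many entries of a vector are exactly zero.
# countMissing(c(1.2, 0, 0.8, 0))   # returns 2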


#' Removes the libraries whose standard deviation is too high.
#'
#' @param covs:  The matrix with the raw coverage counts. The row names have the format "Scaffold:start:end".
#'				Each column represents a library.
#' @param maxSD: The maximum standard deviation allowed for a library to be kept. (default: 0.3)
#'
#' @return The coverage matrix restricted to the libraries that pass the filter.

filterSamplesPerSD<-function(covs, maxSD=0.3) {
	df<-getExonsDF(covs)
	mat<-as.matrix(normalizeCovs(covs, df))
	missing<-apply(mat,1,countMissing)
	libSD<-apply(mat[missing<1,],2,sd)
	fineSD<-libSD < maxSD
	covs[,fineSD]
}
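
# Example (minimal sketch; "coverage.tsv.gz" is a hypothetical file):
# covs <- readCoverageTable("coverage.tsv.gz", is_gz=TRUE)
# goodCovs <- filterSamplesPerSD(covs, maxSD=0.3)   # drops the noisy libraries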

getLibSD<-function(mat) {
	missing<-apply(mat,1,countMissing)
	libSD<-apply(mat[missing<1,],2,sd)
	libSD
}

#' Filters out the exons (rows) whose standard deviation exceeds maxSD, and removes rows with missing values.
filterLowQualityExons <-function(mat, maxSD=0.3){
	exonSD<-apply(mat, 1, sd)
	fineSD<-exonSD < maxSD
	mat<-mat[fineSD,]
	mat<-mat[complete.cases(mat), ]
	mat
}
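
# Example (minimal sketch, assuming 'covs' holds the raw counts returned by readCoverageTable):
# exonsDF  <- getExonsDF(covs)
# normMat  <- as.matrix(normalizeCovs(covs, exonsDF))
# cleanMat <- filterLowQualityExons(normMat, maxSD=0.3)   # keeps only the low-variance exons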


#' Reads the table with the coverages. The table has a header containing the library names and
#' the rows must be named with the format "Scaffold:start:end".
#' 
#' @param filename: The filename with the table. 
#' @param is_gz: Whether the file is gzip-compressed. (default: FALSE)
readCoverageTable<-function(filename, is_gz=FALSE){
	print(filename)
	fn <- filename
	if (is_gz == TRUE ) fn <- gzfile(filename)
	counts<-read.csv(fn, header=TRUE, sep="\t", row.names=1)
	counts$remove <-NULL
	counts<-counts[,colSums(counts,na.rm=T)>0]
	counts
}
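
# Example (minimal sketch; the path is hypothetical):
# covs <- readCoverageTable("data/exon_coverage.tsv")
# dim(covs)   # exons x libraries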



#' Prepares the data frame with the information per exon. This parses the row names to get the scaffold,
#' start and end. It also calculates the exon length and the standard deviation of each exon.
#' @param mat: The matrix with the data
#' @return A data frame with the following columns: Scaffold, Start, Ends, Exon, ExonL (exon length) and sdExon
getExonsDF<-function(mat){
	names<-rownames(mat)
	exons<-strsplit(as.character(names), ":")
    df_names= as.data.frame(t(as.data.frame(exons,stringsAsFactors = F)),stringsAsFactors = F)
    colnames(df_names) <- c("Scaffold", "Start", "Ends")
    df_names$Exon<-names
    df_names$Start<-as.integer(df_names$Start)
    df_names$Ends<-as.integer(df_names$Ends)
	df_names$ExonL<-df_names$Ends-df_names$Start
    df_names$sdExon<-apply(mat,1,sd)
    df_names
}
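
# Example (minimal sketch): parsing the "Scaffold:start:end" row names into a per-exon table.
# exonsDF <- getExonsDF(covs)
# head(exonsDF[, c("Scaffold", "Start", "Ends", "ExonL", "sdExon")])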

getExonsDetails<-function(names){
	exons<-strsplit(as.character(names), ":")
	scaffolds<-sapply(exons, "[[", 1)
	starts<-as.integer(sapply(exons, "[[", 2))
    df<-data.frame(Exon=names, Scaffold=scaffolds, Start=starts)
    df
}



#' Normalizes the coverages to RPKM-like values and then scales each exon (row) to have a mean of 1.
normalizeCovs<-function(counts, exonsDF) {
	exonLengths<-exonsDF$ExonL
	totalReadsPerSample<-apply(counts,2,sum)
	multiplier<-outer(exonLengths, totalReadsPerSample )
	multiplier<-1/multiplier
	multiplier<-1000000000*multiplier
	#"RPKM-like covs"
	multiplier<-counts*multiplier
	rowsToKeep<-rowSums(multiplier)>0
	
	#Filtering out the rows with AVG=0 
	multiplier<-multiplier[rowsToKeep,]
	
	#Normalize by dividing each row by its mean 	
	mean.mult <- apply(multiplier, 1, mean)
	covs<-sweep(multiplier,MARGIN=1,mean.mult,'/')
	covs
}
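
# Example (minimal sketch): after normalization every kept exon has a mean coverage of 1.
# normCovs <- normalizeCovs(covs, exonsDF)
# summary(rowMeans(as.matrix(normCovs)))   # all values should be ~1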


#' Builds a per-cell threshold matrix for calling homozygous deletions, combining the
#' per-exon and per-library standard deviations and capping the threshold at maxValueForDeletion.
homDeletionFilter<-function(localMat, exonsDF, maxValueForDeletion=0.1, minSigmaExon=3, minSigmaLib=3 ) {
	sdExons<-exonsDF$sdExon
	missing<-apply(localMat,1,countMissing)
	sdLibs<-apply(localMat[missing<1,],2,sd)

	sdExons <- 1 - ( minSigmaExon * sdExons)#This is a vector with one threshold per exon
	sdLibs <- 1 - (minSigmaLib * sdLibs )   #One threshold per library

	#Cap both thresholds at maxValueForDeletion
	sdLibs<- ifelse(sdLibs  < maxValueForDeletion, sdLibs , maxValueForDeletion)
	sdExons<-ifelse(sdExons < maxValueForDeletion, sdExons, maxValueForDeletion)
	
	#Expand to full matrices so that every row carries its exon threshold and
	#every column carries its library threshold
	sdLibsMat  <- matrix(sdLibs ,nrow=length(sdExons),ncol=length(sdLibs), byrow=TRUE)
	sdExonsMat <- matrix(sdExons,nrow=length(sdExons),ncol=length(sdLibs), byrow=FALSE)
	#The effective threshold per cell is the stricter (smaller) of the two
	ret <- pmin(sdExonsMat,sdLibsMat)
	ret
}
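
# Example (minimal sketch, assuming 'normMat' is the filtered, normalized matrix and
# 'exonsDF' was recomputed from it so that the rows match):
# filterMat <- homDeletionFilter(normMat, exonsDF, maxValueForDeletion=0.1)
# homDels   <- normMat < filterMat   # TRUE where the coverage falls below the per-cell threshold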

getHomExoDeletions<-function(localMat, exonsDF, maxValueForDeletion=0.1, ...){
	library(reshape2)
	library(plyr)
	filterMat<-homDeletionFilter(localMat, exonsDF, maxValueForDeletion=maxValueForDeletion,...)
	dels <- localMat < filterMat
	meltedDels<-melt(dels)
	meltedDels<-rename(meltedDels, c("Var1"="Exon", "Var2"="Library", "value"="HomDel"))
	meltedDels
}

getHetExoDeletions<-function(localMat, exonsDF, minSigmaExonHet=1) {
	library(reshape2)
	library(plyr)
	sdExons<-exonsDF$sdExon
	sdExons <- 0.5 + (minSigmaExonHet * sdExons)#This is a vector with one threshold per exon
	#Every row of the matrix carries the threshold of its exon
	sdExonsMat <- matrix(sdExons,nrow=nrow(localMat),ncol=ncol(localMat), byrow=FALSE)
	dels <- localMat < sdExonsMat
	meltedDels<-melt(dels)
	#meltedDels<- meltedDels[meltedDels$value,]
	meltedDels<-rename(meltedDels, c("Var1"="Exon", "Var2"="Library", "value"=paste("HetDel",minSigmaExonHet,sep="")))
	meltedDels
}

getAllDeletedExons<-function(localMat, exonsDF){
	library(reshape2)
	library(plyr)
	meltedMat<-melt(as.matrix(localMat),varnames =c("Exon", "Library"), value.name="NormCov")
	#meltedMat<-rename(meltedMat, c("Var1"="Exon", "Var2"="Library", "value"="NormCov"))
	
	dat<-getHomExoDeletions(localMat, exonsDF)
	meltedMat$HomDel <- dat$HomDel
	
	dat<-getHetExoDeletions(localMat, exonsDF, 1)
	meltedMat$HetDel1 <- dat$HetDel1
	#melt() varies the Exon index fastest, so the per-exon annotations recycle in the right order across libraries
	meltedMat$Scaffold<-exonsDF$Scaffold
	meltedMat$Start<-exonsDF$Start
	meltedMat[meltedMat$HetDel1 | meltedMat$HomDel,]
}
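
# Example (minimal sketch): homozygous and heterozygous calls per exon and library in long format.
# delsDF <- getAllDeletedExons(normMat, exonsDF)
# head(delsDF)   # Exon, Library, NormCov, HomDel, HetDel1, Scaffold, Start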



getAllScaffoldsWithDeletions<-function(exonsDF, delsDF,  minValidExonLength=9){
	library(sqldf)
	allScaffcounts <- table(exonsDF$Scaffold)
	scaffoldsDF <- as.data.frame(rownames(allScaffcounts))
	names(scaffoldsDF)[1]<-"Scaffold"
	scaffoldsDF$validExons <- allScaffcounts
	tempMat<-delsDF
	countsHom<-sqldf("SELECT Scaffold, Library, COUNT(*) as HomDels FROM tempMat WHERE HomDel GROUP BY Scaffold, Library ")
	countsHet<-sqldf("SELECT Scaffold, Library, COUNT(*) as HetDels FROM tempMat WHERE HetDel1 GROUP BY Scaffold, Library ")
	mergedScaffDF <- merge(scaffoldsDF, countsHet, all.x=T)
	mergedScaffDF <- merge(mergedScaffDF, countsHom, all.x=T)
	mergedScaffDF$HomDels[is.na(mergedScaffDF$HomDels)] <- 0
	mergedScaffDF$HetDels[is.na(mergedScaffDF$HetDels)] <- 0
	mergedScaffDF<-na.omit(mergedScaffDF)
	mergedScaffDF<-mergedScaffDF[mergedScaffDF$validExons > minValidExonLength,]
	mergedScaffDF
}
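
# Example (minimal sketch): per-scaffold deletion counts, keeping scaffolds with more than 9 usable exons.
# scaffDels <- getAllScaffoldsWithDeletions(exonsDF, delsDF, minValidExonLength=9)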

getScaffoldAveragesByIndex<-function(index, delsMat, localMat, exonsDF){
	row <- delsMat[index,]
	ret <- getScaffoldAverages(row$Scaffold, row$Library, localMat, exonsDF)
	ret
}


getAllScaffoldAveragesParallel<-function(delsMat, localMat, exonsDF, cores=2){
    library(parallel)

	total<-nrow(delsMat)
    cl <- makeForkCluster(nnodes=cores) #Fork clusters are only available on Unix-like systems
    
    result <-parSapply(cl,1:total, getScaffoldAveragesByIndex,delsMat, localMat, exonsDF)
    df<- data.frame(t(result), stringsAsFactors=FALSE)   
	df[, 3:21] <- sapply(df[, 3:21], as.numeric)
    stopCluster(cl)
	df
}
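
# Example (minimal sketch; fork clusters only work on Unix-like systems):
# scaffAvgs <- getAllScaffoldAveragesParallel(scaffDels, normMat, exonsDF, cores=4)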

getScaffoldAverages<-function(scaffold, library, localMat, exonsDF, minSigmaExonHet=1, maxValueForHomDeletion=0.1, minSigmaExon=3) {
	exonsToProcessDF<-exonsDF[exonsDF$Scaffold==scaffold,]
	exonsToProcess<-rownames(exonsToProcessDF)
	scaffMat<-localMat[exonsToProcess,library]
	
	maxForHetDel <- 0.5 + exonsToProcessDF$sdExon * minSigmaExonHet
	maxForDel    <- 1 - minSigmaExon * exonsToProcessDF$sdExon

#	dels3SigmaExon <- length(scaffMat[scaffMat <= maxForDel])

	#allAvg    <- scaffMat
	allNoHom  <- scaffMat[scaffMat > maxValueForHomDeletion]
	allNoDels <- scaffMat[scaffMat > maxForDel] #Doesn't include anything below 3sigma exon
	allNoHet  <- scaffMat[scaffMat > maxForHetDel]
	hetAvg    <- scaffMat[scaffMat <= maxForHetDel]
	homAvg    <- scaffMat[scaffMat <= maxValueForHomDeletion]

	LENall    <-length(scaffMat )
	LENallNoHom  <-length(allNoHom )
	LENallNoDels <-length(allNoDels) #Doesn't include anything below 3sigma exon
	LENallNoHet  <-length(allNoHet )
	LENhet       <-length(hetAvg   )
	LENhom       <-length(homAvg   )

	AVGall       <-mean(scaffMat )
	AVGallNoHom  <-mean(allNoHom )
	AVGallNoDels <-mean(allNoDels) #Doesn't include anything below 3sigma exon
	AVGallNoHet  <-mean(allNoHet )
	AVGhet       <-mean(hetAvg   )
	AVGhom       <-mean(homAvg   )

    SDall       <-sd(scaffMat )
	SDallNoHom  <-sd(allNoHom )
	SDallNoDels <-sd(allNoDels) #Doesn't include anything below 3sigma exon
	SDallNoHet  <-sd(allNoHet )
	SDhet       <-sd(hetAvg   )
	SDhom       <-sd(homAvg   )

	score 		<- LENhet/LENallNoDels

	arg0<-c(Scaffold=toString(scaffold), Library=toString(library), 
		AllCount=LENall, AllAvg=AVGall, AllSD=SDall,
		NoHomCount=LENallNoHom, NoHomAvg=AVGallNoHom, NoHomSD=SDallNoHom,
		No3SigmaDelCount=LENallNoDels, No3SigmaDelAvg=AVGallNoDels, No3SigmaDelSD=SDallNoDels,
		NoHetCount=LENallNoHet, NoHetAvg=AVGallNoHet, NoHetSD=SDallNoHet,
		HetCount=LENhet, HetAvg=AVGhet, HetSD=SDhet,
		HomCount=LENhom, HomAvg=AVGhom, HomSD=SDhom,
		Score=score
		)

	#arg0<-list(Dels3SigmaExon=dels3SigmaExon, AllAvg=allAvg, AllNoDelsAvg3SigmaExon=allNoDels,AllNoHomAvg=allNoHom, AllNoHetAvg=allNoHet, HetAvg=hetAvg, HomAvg=homAvg)
	#arg0<-c(arg0, AllSD=SDall, AllNoDels3SigmaExonSD=SDallNoDels,AllNoHomSD=SDallNoHom, AllNoHetSD=SDallNoHet, HetSD=SDhet, HomSD=SDhom)
	#df<-data.frame(t(unlist(arg0)))
	#df
	arg0
}

getDeletionScore<-function(df) {
	ret <- as.numeric(df$HetCount) / as.numeric(df$No3SigmaDelCount)
	ret
}

getDeletionCategory<-function(df,threshold=0.8) {
	ratio<- as.numeric(df$HomCount) / as.numeric(df$HetCount)  > threshold
	type<-ifelse(ratio, 'Hom', 'Het')
	type
}
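
# Example (minimal sketch, assuming 'scaffAvgs' is the table produced by getAllScaffoldAverages*):
# scaffAvgs$Score    <- getDeletionScore(scaffAvgs)     # HetCount / No3SigmaDelCount
# scaffAvgs$category <- getDeletionCategory(scaffAvgs)  # 'Hom' when HomCount/HetCount > threshold (default 0.8)
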
getDeletionsPerCM<-function(geneticMap,
                            selectedDels, 
                            libraries, 
                            df,
                            minExonCount=5, 
                            groupByCM=FALSE,
                            groupByCMPrecision=0,
                            chromosome=NULL){
    library(sqldf)
    selectedDels$category<-getDeletionCategory(selectedDels, threshold=0.75)
        
    cmGroup<-ifelse(groupByCM,paste0("round(cm,",groupByCMPrecision,")"),"cm")
    cmWhere<-ifelse(is.null(chromosome)," 1 ", paste0( " chr='", chromosome, "' "))
    exonCounts<-sqldf('SELECT Scaffold, Count(*) as exonCount FROM df group by Scaffold')
    
    exonQuery<-paste0('SELECT chr, ',cmGroup ,' as cM, count(*) as ScaffoldCount
FROM geneticMap 
LEFT JOIN exonCounts on Scaffold=contig  
WHERE exonCount > ', minExonCount, ' AND ', cmWhere,
' GROUP BY chr, ',cmGroup ,'
ORDER BY  chr, ',cmGroup)
    usedScaffoldsPerCM<-sqldf(exonQuery)
    
    homDelsQuery<-paste0("SELECT chr, ",cmGroup ," as cM, Library,  count(*) as HomDels 
FROM geneticMap
LEFT JOIN selectedDels on Scaffold = contig
WHERE category = 'Hom' AND ", cmWhere,
" GROUP BY chr, ",cmGroup ,", Library")
    homDels<-sqldf(homDelsQuery)
    
    hetDelsQuery<-paste0("SELECT chr, ",cmGroup ," as cM, Library,  count(*) as HetDels 
FROM geneticMap
LEFT JOIN selectedDels on Scaffold = contig
WHERE category = 'Het' AND ", cmWhere,
" GROUP BY chr, ",cmGroup ,", Library")
    hetDels<-sqldf(hetDelsQuery)
    
    mapsForLibraries<-sqldf("SELECT * FROM usedScaffoldsPerCM, libraries ")
    
    tableWithAllDeletions<-sqldf(
"SELECT mapsForLibraries.*,
    ifnull(hetDels.HetDels, 0) as HetDels, 
    ifnull(homDels.HomDels, 0) as HomDels
FROM mapsForLibraries
LEFT JOIN hetDels 
    ON  mapsForLibraries.chr     = hetDels.chr
    AND mapsForLibraries.cM      = hetDels.cM 
    AND mapsForLibraries.Library = hetDels.Library 
LEFT JOIN homDels 
    ON  mapsForLibraries.chr     = homDels.chr
    AND mapsForLibraries.cM      = homDels.cM 
    AND mapsForLibraries.Library = homDels.Library ")

    tableWithAllDeletions$hetDelsPer<-tableWithAllDeletions$HetDels/tableWithAllDeletions$ScaffoldCount
    tableWithAllDeletions$homDelsPer<-tableWithAllDeletions$HomDels/tableWithAllDeletions$ScaffoldCount
    tableWithAllDeletions
}
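
# Example (minimal sketch; 'geneticMap' is assumed to have chr, cm and contig columns,
# 'libraries' a table with a Library column, and 'scaffAvgs'/'exonsDF' come from the steps above):
# delsPerCM <- getDeletionsPerCM(geneticMap, scaffAvgs, libraries, exonsDF,
#                                minExonCount=5, groupByCM=TRUE)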




getAllScaffoldAverages<-function(delsMat, localMat, exonsDF, showProgressBar = T){

	if(showProgressBar) library(tcltk)
	

	total<-nrow(delsMat)

    df <- data.frame(Scaffold=rep("", total), Library=rep("", total), 
		AllCount=rep(NA, total), AllAvg=rep(NA, total), AllSD=rep(NA, total),
		NoHomCount=rep(NA, total), NoHomAvg=rep(NA, total), NoHomSD=rep(NA, total),
		No3SigmaDelCount=rep(NA, total), No3SigmaDelAvg=rep(NA, total), No3SigmaDelSD=rep(NA, total),
		NoHetCount=rep(NA, total), NoHetAvg=rep(NA, total), NoHetSD=rep(NA, total),
		HetCount=rep(NA, total), HetAvg=rep(NA, total), HetSD=rep(NA, total),
		HomCount=rep(NA, total), HomAvg=rep(NA, total), HomSD=rep(NA, total),
		Score=rep(NA, total),
		stringsAsFactors=FALSE)


	if(showProgressBar) pb <- tkProgressBar(title = "progress bar", min = 0, max = total, width = 300)
	for(i in 1:total) {
	    row <- delsMat[i,]
	    df[i, ] <- getScaffoldAverages(row$Scaffold, row$Library, localMat, exonsDF)
	   
		
		if(showProgressBar) setTkProgressBar(pb, i, label=paste( round(i/total*100, 0),"% done"))
	}
	if(showProgressBar) close(pb)
	df
}


getWholeExonDeletions<-function(scaffoldsDF, exonsDF, localMat, sdLibs, deletions, maxValueForDeletion=0.1, minSigmaExon=3, minSigmaLib=3){
	library(reshape2)
	library(plyr)

	libraries<-colnames(localMat)
	deletions<-sapply(libraries,getWholeExonDeletionsLibrary, scaffoldsDF, exonsDF, localMat, sdLibs, maxValueForDeletion=maxValueForDeletion, minSigmaExon=minSigmaExon, minSigmaLib=minSigmaLib)
	rownames(deletions)<-scaffoldsDF$Scaffold
	deletionsMelted<-melt(deletions)
	
	deletionsMelted<-deletionsMelted[deletionsMelted$value >0,]
	deletionsMelted$Category <- "Whole Exon"
	deletionsMelted<-rename(deletionsMelted, c("Var1"="Scaffold", "Var2"="Library"))
	delsJoined <- merge(x=deletionsMelted, y=scaffoldsDF, by = "Scaffold", all.x = TRUE )
	delsJoined
}




#' Categorizes the deletions per scaffold. Note that this function relies on 'localMat'
#' (the normalized coverage matrix) being available in the calling environment.
categorizeDeletions<-function(deletionsMelted, exonsDF){
	all_scaffold_counts <- sort(table(exonsDF$Scaffold))
	scaffoldsDF <- as.data.frame(rownames(all_scaffold_counts))
	names(scaffoldsDF)[1]<-"Scaffold"
	missing<-apply(localMat,1,countMissing)
	scaffoldsDF$validExons <- all_scaffold_counts
	scaffoldsDF<-subset(scaffoldsDF, validExons > 0)
	sdLibs<-apply(localMat[missing<1,],2,sd)
	deletions<-getWholeExonDeletions(scaffoldsDF, exonsDF, localMat, sdLibs, maxValueForDeletion=0.1, minSigmaExon=3, minSigmaLib=3)
	deletions
}


getHomSelectedDeletions<-function(selectedDels, geneticMap, threshold=0.75){
    library(sqldf)
    selectedDels$category<-getDeletionCategory(selectedDels,threshold=threshold)
    
    selected <- sqldf("SELECT chr, cM, Scaffold, Library, AllAvg, AllSD FROM selectedDels
LEFT JOIN geneticMap ON contig=Scaffold  WHERE category = 'Hom'")
    scaffs <- sqldf("SELECT Scaffold, count(Scaffold) as delsPerScaffold FROM selected group by Scaffold")
    libs <- sqldf("SELECT Library, count(Library) as delsPerLibrary FROM selected group by Library")
    
    sel2 <- sqldf("SELECT selected.*, delsPerScaffold, delsPerLibrary from 
    	selected JOIN scaffs on selected.Scaffold = scaffs.Scaffold
    	JOIN libs on selected.Library = libs.Library" )

    sel2

}
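
# Example (minimal sketch, with the same hypothetical 'scaffAvgs' and 'geneticMap' as above):
# homOnly <- getHomSelectedDeletions(scaffAvgs, geneticMap, threshold=0.75)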