R/validateMaps.R

Defines functions checkIntraGeneOverlaps validateMapSet

Documented in validateMapSet

# validateMapSet.R

# verify that the mapSet (extracted from a .gff file or any other sources) is internally consistent.


validateMapSet <- function( mapset, checkOverlaps=FALSE) {

# local functions...


validateSeqMap <- function( mydf, speciesID) {

	neededColumns <- c( "SEQ_ID", "LENGTH")
	anyBad <- FALSE
	if ( ! all( neededColumns %in% colnames( mydf))) stop( paste( "seqMap is missing required columns: ", neededColumns))
	if ( length( whoBad <- which( duplicated( mydf$SEQ_ID)))) {
		cat( "\nseqMap has ", length(whoBad), " non-unique SEQ_ID entries:\n")
		print( head( mydf[whoBad, ]))
		anyBad <- TRUE
	}
	if ( any( is.na( as.integer( mydf$LENGTH)))) stop( "seqMap has non-integer LENGTH entries")
	if ( length( whoBad <- which( mydf$LENGTH < 2))) {
		cat( "\nseqMap has ", length(whoBad), " invalid LENGTH entries:\n")
		print( head( mydf[whoBad,]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( base::substr( mydf$SEQ_ID, 1, base::nchar(speciesID)) != speciesID))) {
		cat( "\nseqMap has ", length(whoBad), " SEQ_IDs that do not begin with 'speciesID':\n")
		print( head( mydf[whoBad,]))
		anyBad <- TRUE
	}
	if (anyBad) stop( "Fix SeqMap issues before maps can be validated..")
	return( )
}


validateGeneMap <- function( mydf, seqMap, checkOverlaps=FALSE) {

	neededColumns <- c( "GENE_ID", "POSITION", "END", "SEQ_ID", "STRAND")
	anyBad <- FALSE
	if ( ! all( neededColumns %in% colnames( mydf))) 
		stop( paste( "geneMap is missing required columns: ", neededColumns))
	if ( length( whoBad <- which( duplicated( mydf$GENE_ID)))) {
		cat( "geneMap has ", length(whoBad), " non-unique GENE_ID entries:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( ! all( unique.default( mydf$SEQ_ID) %in% seqMap$SEQ_ID)) {
		bad <- setdiff( mydf$SEQ_ID, seqMap$SEQ_ID)
		cat( "\nError: Unexpected gene SEQ_ID terms: ", bad)
		stop( "some geneMap SEQ_ID entries not in seqMap")
	}
	if ( ! all( unique.default( mydf$STRAND) %in% c("+","-","",NA))) stop( "some geneMap STRAND entries not valid")
	if ( any( is.na( as.integer( mydf$POSITION)))) stop( "geneMap has non-integer POSITION entries")
	if ( any( is.na( as.integer( mydf$END)))) stop( "geneMap has non-integer END entries")
	if ( length( whoBad <- which( mydf$POSTION < 1))) {
		cat( "geneMap has ", length(whoBad), " POSITION entries less than 1:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$END < 2))) {
		cat( "geneMap has ", length(whoBad), " END entries less than 2:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	lastBase <- getSeqLength( mydf$SEQ_ID, seqMap)
	if ( length( whoBad <- which( mydf$POSITION > lastBase))) {
		cat( "geneMap has ", length(whoBad), " POSITION entries past SEQ_ID's length:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$END > lastBase))) {
		cat( "geneMap has ", length(whoBad), " END entries past SEQ_ID's length:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if (anyBad) stop( "Fix GeneMap issues before maps can be validated..")

	# force non-decreasing order within each chromosome
	checkNonDecreasingOrder( mydf)

	if ( checkOverlaps) checkGeneOverlaps( mydf)
	return( )
}


checkNonDecreasingOrder <- function( gmap) {

	seqFac <- factor( gmap$SEQ_ID)
	Good <- TRUE
	tapply( 1:nrow(gmap), seqFac, function(x) {
		pos <- gmap$POSITION[x]
		diffs <- diff( pos)
		if ( any( diffs < 0)) {
			cat( "\nGenes out of order: ", gmap$SEQ_ID[x[1]])
			cat( "\nRows: ", x[ which( diffs < 0)])
			Good <<- FALSE
		}
	})
	if (! Good) {
		cat("\n")
		stop( "Gene POSITIONs must be in non-decreasing order")
	} else {
		return(TRUE)
	}
}


checkGeneOverlaps <- function( gmap) {

	prevG <- 0
	prevSeq <- ""
	hasNonGenes <- ( ("REAL_G" %in% colnames( gmap)) && any( gmap$REAL_G == FALSE))

	# run along the genes and see if any overlap
	nOver <- 0
	for( i in 1:nrow( gmap)) {
		if ( hasNonGenes && gmap$REAL_G[i] != TRUE) next

		if ( gmap$SEQ_ID[i] != prevSeq) {
			prevSeq <- gmap$SEQ_ID[i]
			prevG <- i
			next
		}

		if ( gmap$POSITION[i] <= gmap$END[ prevG]) {
			if ( gmap$STRAND[i] == gmap$STRAND[prevG]) {
				cat( "\nOverlap:  ", gmap$GENE_ID[prevG], gmap$GENE_ID[i])
				cat( "\nExtents:  ", gmap$POSITION[prevG], "-",gmap$END[prevG], "  <->  ", gmap$POSITION[i], "-", gmap$END[i])
				cat( "\n")
				nOver <- nOver + 1
			}
		}

		prevG <- i
	}
	cat( "\n\nGene Overlap summary: \nN_Overlapping_Genes = ", nOver, "\n\n")
}


validateExonMap <- function( mydf, seqMap, geneMap, checkOverlaps=FALSE) {

	neededColumns <- c( "GENE_ID", "POSITION", "END", "SEQ_ID", "STRAND")
	anyBad <- FALSE
	if ( ! all( neededColumns %in% colnames( mydf))) stop( paste( "exonMap is missing required columns: ", neededColumns))
	if ( ! all( unique.default( mydf$SEQ_ID) %in% seqMap$SEQ_ID)) stop( "some exonMap SEQ_ID entries not in seqMap")
	if ( ! all( unique.default( mydf$GENE_ID) %in% geneMap$GENE_ID)) stop( "some exonMap GENE_ID entries not in geneMap")
	if ( any( is.na( as.integer( mydf$POSITION)))) stop( "exonMap has non-integer POSITION entries")
	if ( any( is.na( as.integer( mydf$END)))) stop( "exonMap has non-integer END entries")
	if ( length( whoBad <- which( mydf$POSTION < 1))) {
		cat( "exonMap has ", length(whoBad), " POSITION entries less than 1:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$END < 2))) {
		cat( "exonMap has ", length(whoBad), " END entries less than 2:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	geneBases <- getGeneLimits( mydf$GENE_ID, geneMap)
	if ( length( whoBad <- which( mydf$POSITION < geneBases$POSITION))) {
		cat( "exonMap has ", length(whoBad), " POSITION entries before GENE_ID's start:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$POSITION > geneBases$END))) {
		cat( "exonMap has ", length(whoBad), " POSITION entries past GENE_ID's end:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$END < geneBases$POSITION))) {
		cat( "exonMap has ", length(whoBad), " END entries before GENE_ID's start:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$END > geneBases$END))) {
		cat( "exonMap has ", length(whoBad), " END entries past GENE_ID's end:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if (anyBad) stop( "Fix ExonMap issues before maps can be validated..")

	if ( checkOverlaps) checkExonOverlaps( mydf, geneMap)
	return( )
}


validateCdsMap <- function( mydf, seqMap, geneMap, checkOverlaps=FALSE) {

	neededColumns <- c( "GENE_ID", "POSITION", "END", "SEQ_ID", "STRAND")
	anyBad <- FALSE
	if ( ! all( neededColumns %in% colnames( mydf))) stop( paste( "cdsMap is missing required columns: ", neededColumns))
	if ( ! all( unique.default( mydf$SEQ_ID) %in% seqMap$SEQ_ID)) stop( "some cdsMap SEQ_ID entries not in seqMap")
	if ( ! all( unique.default( mydf$GENE_ID) %in% geneMap$GENE_ID)) stop( "some cdsMap GENE_ID entries not in geneMap")
	if ( any( is.na( as.integer( mydf$POSITION)))) stop( "cdsMap has non-integer POSITION entries")
	if ( any( is.na( as.integer( mydf$END)))) stop( "cdsMap has non-integer END entries")
	if ( length( whoBad <- which( mydf$POSTION < 1))) {
		cat( "cdsMap has ", length(whoBad), " POSITION entries less than 1:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$END < 2))) {
		cat( "cdsMap has ", length(whoBad), " END entries less than 2:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	geneBases <- getGeneLimits( mydf$GENE_ID, geneMap)
	if ( length( whoBad <- which( mydf$POSITION < geneBases$POSITION))) {
		cat( "cdsMap has ", length(whoBad), " POSITION entries before GENE_ID's start:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$POSITION > geneBases$END))) {
		cat( "cdsMap has ", length(whoBad), " POSITION entries past GENE_ID's end:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$END < geneBases$POSITION))) {
		cat( "cdsMap has ", length(whoBad), " END entries before GENE_ID's start:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if ( length( whoBad <- which( mydf$END > geneBases$END))) {
		cat( "cdsMap has ", length(whoBad), " END entries past GENE_ID's end:\n")
		print( head( mydf[ whoBad, ]))
		anyBad <- TRUE
	}
	if (anyBad) stop( "Fix CdsMap issues before maps can be validated..")

	if ( checkOverlaps) checkExonOverlaps( mydf, geneMap)
	return( )
}


checkExonOverlaps <- function( emap, geneMap) {

	prevG <- 0
	prevSeq <- ""

	# run along the genes and see if any overlap
	nOver <- 0
	for( i in 1:nrow( emap)) {

		if ( emap$SEQ_ID[i] != prevSeq) {
			prevSeq <- emap$SEQ_ID[i]
			prevG <- i
			next
		}

		if ( emap$POSITION[i] <= emap$END[ prevG]) {
			if ( emap$STRAND[i] == emap$STRAND[prevG]) {
				cat( "\nOverlap:  ", emap$GENE_ID[prevG], emap$GENE_ID[i])
				cat( "\nExtents:  ", emap$POSITION[prevG], "-",emap$END[prevG], "  <->  ", emap$POSITION[i], "-", emap$END[i])
				cat( "\n")
				nOver <- nOver + 1
			}
		}

		# also verify that it lands "in" its gene...
		gptr <- base::match( emap$GENE_ID[i], geneMap$GENE_ID, nomatch=0)
		if ( gptr == 0) {
			cat( "\n\nMissing gene: ", emap$GENE_ID[i], "\n")
		} else {
			if ( emap$POSITION[i] < geneMap$POSITION[gptr]) {
				cat( "\nBoundary error:  (exon too early)", emap$GENE_ID[i], emap$POSITION[i], geneMap$POSITION[gptr])
			}
			if ( emap$END[i] > geneMap$END[gptr]) {
				cat( "\nBoundary error:  (exon too late)", emap$GENE_ID[i], emap$END[i], geneMap$END[gptr])
			}
		}

		prevG <- i
	}
	cat( "\n\nExon Overlap summary: \nN_Overlapping_Exons = ", nOver, "\n\n")
}


validateRRNA_Map <- function( mydf) {

	if ( is.null( mydf)) {
		cat( "\nRibosomal RNA map not found...  skipping.\n")
		return( NULL)
	}
	neededColumns <- c( "GENE_ID", "POSITION", "END", "SEQ_ID", "STRAND")
	if ( ! all( neededColumns %in% colnames( mydf))) 
		stop( paste( "rrnaMap is missing required columns: ", neededColumns))
	return( )
}


getSeqLength <- function( seqids, seqMap) {

	out <- rep( 0, times=length(seqids))
	where <- base::match( seqids, seqMap$SEQ_ID, nomatch=0)
	out[ where > 0] <- seqMap$LENGTH[ where]
	return( out)
}

getGeneLimits <- function( geneids, geneMap) {

	outB <- outE <- rep( 0, times=length(geneids))
	where <- base::match( geneids, geneMap$GENE_ID, nomatch=0)
	outB[ where > 0] <- geneMap$POSITION[ where]
	outE[ where > 0] <- geneMap$END[ where]
	return( data.frame( "POSITION"=outB, "END"=outE))
}

	# end of local functions...


	# check each map
	speciesID <- mapset$speciesID
	seqMap <- mapset$seqMap
	validateSeqMap( seqMap, speciesID)
	geneMap <- mapset$geneMap
	validateGeneMap( geneMap, seqMap, checkOverlaps)
	exonMap <- mapset$exonMap
	validateExonMap( exonMap, seqMap, geneMap, checkOverlaps)
	cdsMap <- mapset$cdsMap
	validateCdsMap( cdsMap, seqMap, geneMap, checkOverlaps)
	rrnaMap <- mapset$rrnaMap
	validateRRNA_Map( rrnaMap)
	return( "OK")
}


# stand alone to see if within each gene, are there multiple gene models that cause exons or cds to overlap
checkIntraGeneOverlaps <- function( emap) {

	# run along the genes and see if any of its sub-sections overlap
	geneFac <- factor( emap$GENE_ID)
	N <- nrow( emap)

	nbad <- 0
	tapply( 1:N, geneFac, function(x) {
			if ( (nEx <- length(x)) < 2) return(NULL)
			for ( i in 1:(nEx-1)) {
				beg1 <- emap$POSITION[ x[ i]]
				end1 <- emap$END[ x[ i]]
				set1 <- beg1 : end1
				for ( j in (i+1):nEx) {
					beg2 <- emap$POSITION[ x[ j]]
					end2 <- emap$END[ x[ j]]
					set2 <- beg2 : end2

					nover <- length( intersect( set1, set2))
					if (nover) {
						nbad <<- nbad + 1
						cat( "\n", emap$GENE_ID[x[i]], i, j, beg1, end1, beg2, end2)
					}
				}
			}
			return()
		})
	
	if ( nbad) {
		cat( "\n\nFound gene sub-section overlaps: ", nbad)
	} else {
		cat( "\n\nNo overlaps detected.")
	}
}
robertdouglasmorrison/DuffyTools documentation built on April 16, 2024, 6:31 a.m.