R/pipe.ConsensusProteinPileups.R

Defines functions `showCPPspliceBoundaries` CPP.AuditSummary `writeAuditRecord` `auditFileName` `createAuditFile` `searchForMisalignedReads` mergePeptideFiles `acceptRealignedConsensus` `realignConsensus` `modifyConsensus` `inspectConsensus` `pipe.ConsensusProteinSetup` `pipe.ConsensusProteinPileups`

# pipe.ConsensusProteinPileups -- use raw reads as peptides to determine true protein sequence in a field isolate

`pipe.ConsensusProteinPileups` <- function( sampleID, geneID, geneName=geneID, optionsFile="Options.txt",
						results.path=NULL, exon=NULL, maxNoHits.pileup=1000000,
						max.depth=60, txt.cex=0.21, forceSetup=FALSE, maxNoHits.setup=1000000,
						mode=c("normal","realigned"), plotOnly=FALSE, max.drawnPerSite=3,
						trim5.aligns=0, trim3.aligns=0, trim5.nohits=0, trim3.nohits=0,
						draw.box=FALSE, chunkSize.pileup=50000, useCutadapt=FALSE,
						startFromReference=FALSE, clipAtStop=TRUE, showFrameShiftPeptides=TRUE) {

	require(Biostrings)

	# make sure we were given valid parameters..
	if ( length(sampleID) > 1) {
		sampleID <- sampleID[1]
		cat( "\nWarning:  only one sampleID allowed")
	}
	gmap <- getCurrentGeneMap()
	if ( ! (geneID %in% gmap$GENE_ID)) {
		cat( "\nNot a known geneID: ", geneID)
		return(NULL)
	}

	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)
	if ( ! file.exists( peptide.path)) dir.create( peptide.path, recursive=T)
	
	# the gene name is used in all filenames, force it to be clean of special characters
	geneName <- file.cleanSpecialCharactersFromFileName( geneName)

	# step 1: make sure all needed peptide and pileup files are in place
	nAA <- pipe.ConsensusProteinSetup( sampleID, geneID=geneID, geneName=geneName, results.path=results.path, 
				exon=exon, maxNoHits=maxNoHits.setup, forceSetup=forceSetup,
				trim5.aligns=trim5.aligns, trim3.aligns=trim3.aligns, 
				trim5.nohits=trim5.nohits, trim3.nohits=trim3.nohits,
				useCutadapt=useCutadapt, startFromReference=startFromReference, clipAtStop=clipAtStop)
	if ( nAA < 1) {
		cat( "\nSetting up for CPP gave an empty protein sequence..")
		return(NULL)
	}

	consensusAAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusAA.fasta", sep="."))
	
	mode <- match.arg( mode)
	if ( mode == "realigned") {
		consensusAAfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedConsensusAA.fasta", sep="."))
	}

	# step 2:  run the tool that aligns peptides to a construct

	# better luck drawing if we close the current graphics window first
	if ( dev.cur() > 1) dev.off()
	x11Width <- max( round( nAA/550 * 25), 10)
	x11Height <- max( round( max.depth/10, digits=1), 6)
	pdfWidth <- max( round( nAA/550 * 20), 10)
	pdfHeight <- max( round( max.depth/12, digits=1), 5)
	checkX11( bg='white', width=x11Width, height=x11Height)

	consensusAns <<- proteinConstructPeptidePileups( sampleID, geneName=geneName, constructFile=consensusAAfile, 
				peptide.path=peptide.path, txt.cex=txt.cex, maxNoHits=maxNoHits.pileup, 
				max.depth=max.depth, max.drawnPerSite=max.drawnPerSite, mode=mode, draw.box=draw.box,
				chunkSize=chunkSize.pileup, showFrameShiftPeptides=showFrameShiftPeptides)

	# record metrics to the audit file
	writeAuditRecord( peptide.path, sampleID, geneName, mode="Pileup", info=consensusAns)

	# perhaps ttry to supplement the image with exon splice boundaries
	if (is.null(exon)) showCPPspliceBoundaries( geneID, construct=consensusAns$Construct, optionsFile=optionsFile)

	pdfFile1 <- file.path( peptide.path, paste( sampleID, geneName, "PeptidePileups.pdf", sep="."))
	pdfFile2 <- file.path( peptide.path, paste( sampleID, geneName, "FinalConsensus.pdf", sep="."))
	if ( mode == "realigned") {
		pdfFile1 <- file.path( peptide.path, paste( sampleID, geneName, "RealignedPeptidePileups.pdf", sep="."))
		pdfFile2 <- file.path( peptide.path, paste( sampleID, geneName, "RealignedFinalConsensus.pdf", sep="."))
	}
	dev.print( pdf, pdfFile1, width=pdfWidth, height=pdfHeight)
	if ( plotOnly) return()

	consensusSaveFile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusAnswer.rda", sep="."))
	save( consensusAns, file=consensusSaveFile)

	# step 3:  summarize how the consensus differs from what it used as its construct
	differences <- proteinConstructPileupSummary( consensusSaveFile, sampleID=sampleID, geneName=geneName, 
						txt.cex=txt.cex)
	dev.print( pdf, pdfFile2, width=pdfWidth, height=pdfHeight)
	return( invisible( differences))
}


`pipe.ConsensusProteinSetup` <- function( sampleID, geneID, geneName=geneID, maxNoHits=1000000, 
					optionsFile="Options.txt", results.path=NULL, exon=NULL, 
					trim5.aligns=0, trim3.aligns=0, trim5.nohits=0, trim3.nohits=0,
					forceSetup=FALSE, useCutadapt=FALSE, startFromReference=FALSE, 
					clipAtStop=TRUE, ...) {

	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)
	if ( ! file.exists( peptide.path)) dir.create( peptide.path, recursive=T)

	madeAnyFiles <- FALSE
	
	# the gene name is used in all filenames, force it to be clean of special characters
	geneName <- file.cleanSpecialCharactersFromFileName( geneName)

	# Step 1:  make sure that all the needed raw read and peptide files are ready
	#		allow use of trimmed reads for the noHits by default
	nohitReadsFile <- file.path( results.path, "fastq", paste( sampleID, "noHits.trimmed.fastq.gz", sep="."))
	nohitReadsFile2 <- file.path( results.path, "fastq", paste( sampleID, "noHits.fastq.gz", sep="."))
	nohitPeptidesFile <- file.path( peptide.path, paste( sampleID, "NoHits", "RawReadPeptides.txt", sep="."))
	geneReadsFile <- file.path( results.path, "fastq", paste( sampleID, geneName, "fastq.gz", sep="."))
	geneReadsTrimmedFile <- file.path( results.path, "fastq", paste( sampleID, geneName, "trimmed.fastq.gz", sep="."))
	genePeptidesFile <- file.path( peptide.path, paste( sampleID, geneName, "RawReadPeptides.txt", sep="."))
	auditFile <- file.path( peptide.path, paste( sampleID, geneName, "AuditTrail.txt", sep="."))

	if ( useCutadapt) {
		if ( ! file.exists( nohitReadsFile)) {
			cat( "\nCutting Adapters off 'NoHit' reads..")
			cutadapt( file1=basename(nohitReadsFile2), path=dirname(nohitReadsFile))
		}
	}
	if ( ! file.exists( nohitReadsFile)) nohitReadsFile <- nohitReadsFile2
	if ( ! file.exists( nohitReadsFile)) {
		stop( "Alignment pipeline 'NoHits' fastq file not found.  Run main pipeline first.")
	}

	if ( forceSetup || ! file.exists( geneReadsFile)) {
		cat( "\nExtracting Gene alignments..")
		nAligns <- pipe.GatherGeneAlignments( sampleID, geneID, tail=500, asFASTQ=T, fastq.keyword=geneName)
		if (nAligns < 1) {
			cat( "\nZero reads aligned to gene..")
			return( 0)
		}
		# new file, so make sure we remake its decendants
		file.delete( genePeptidesFile)
		madeAnyFiles <- TRUE
	}
	if ( forceSetup || ! file.exists( genePeptidesFile)) {
		cat( "\nConverting Gene alignments to peptides..")
		if (useCutadapt) {
			cat( "\nCutting Adapters off 'Gene' reads..")
			cutadapt( file1=basename(geneReadsFile), path=dirname(geneReadsFile))
			geneReadsFile <- geneReadsTrimmedFile
		}
		nPeptides <- fastqToPeptides( geneReadsFile, genePeptidesFile, chunk=100000, lowComplexityFilter=FALSE,
				trim5=trim5.aligns, trim3=trim3.aligns, clipAtStop=FALSE)
		if (nPeptides < 1) {
			cat( "\nGene alignments gave zero peptides..")
			return( 0)
		}
		madeAnyFiles <- TRUE
	}
	if ( maxNoHits && (forceSetup || ! file.exists( nohitPeptidesFile))) {
		cat( "\nConverting 'NoHit' reads to peptides..  Takes quite a while..")
		cat( "\nBase Trimming:     5' =", trim5.nohits, "    3' =", trim3.nohits)
		multicore.setup(10)
		fastqToPeptides( nohitReadsFile, nohitPeptidesFile, chunk=100000, maxPeptides=maxNoHits,
				lowComplexityFilter=TRUE, trim5=trim5.nohits, trim3=trim3.nohits, 
				clipAtStop=clipAtStop, ...)
		madeAnyFiles <- TRUE
	}
	if ( forceSetup || ! file.exists( auditFile)) {
		createAuditFile( peptide.path, sampleID, geneName)
	}

	# Step 2:  extract the consensus sequence from the Bowtie alignment results
	exonSuffix <- "Full"
	gmap <- subset( getCurrentGeneMap(), GENE_ID == geneID)
	# using CDS instead of Exons
	emap <- subset( getCurrentCdsMap(), GENE_ID == geneID)
	exonStart <- min( emap$POSITION)
	exonStop <- max( emap$END)
	if (! is.null(exon)) {
		Nexons <- nrow( emap)
		thisExon <- as.integer(exon)
		if( ! (thisExon %in% 1:Nexons)) {
			cat( "\nError:  'exon' must be an integer in 1..",Nexons)
			stop()
		}
		exonSuffix <- paste( "Exon", thisExon, sep="")
		if ( gmap$STRAND == "-") {
			ord <- order( emap$POSITION, decreasing=T)
			myExon <- ord[ thisExon]
			exonStart <- emap$POSITION[myExon]
			exonStop <- emap$END[myExon]
		} else {
			ord <- order( emap$POSITION, decreasing=F)
			myExon <- ord[ thisExon]
			exonStart <- emap$POSITION[myExon]
			exonStop <- emap$END[myExon]
		}
	}
	consensusAAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusAA.fasta", sep="."))
	consensusDNAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusDNA.fasta", sep="."))
	consensusBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusBaseCalls.txt", sep="."))
	if ( forceSetup || ! all( file.exists( c( consensusBASEfile, consensusAAfile, consensusDNAfile)))) {
		cat( "\nTurning alignment pileups into consensus base calls..")
		ans <- pipe.ConsensusBaseCalls( sampleID, geneID, start=exonStart, stop=exonStop, as.cDNA=TRUE, 
				noReadCalls="blank")
		callsTable <- ans$callsTable
		myAAvec <- callsTable$AA
		myAAvec[ is.na(myAAvec)] <- ""
		myDNAvec <- callsTable$DNA
		myDNAvec[ is.na(myDNAvec)] <- ""
		myAA <- paste( myAAvec, collapse="")
		myDNA <- paste( myDNAvec, collapse="")
		myDesc <- paste( sampleID, geneName, exonSuffix, sep="_")
		writeFasta( as.Fasta( myDesc, myAA), consensusAAfile, line.width=100)
		writeFasta( as.Fasta( myDesc, myDNA), consensusDNAfile, line.width=100)
		write.table( callsTable, consensusBASEfile, sep="\t", quote=F, row.names=FALSE)
		madeAnyFiles <- TRUE
		nAA <- nchar( myAA)
	} else {
		fa <- loadFasta( consensusAAfile, verbose=F)
		nAA <- nchar( fa$seq)
	}

	# allow starting from the expected reference sequence instead of the pileups we actually see
	if ( startFromReference && (forceSetup || madeAnyFiles)) {
		cat( "\nForcing use of Reference Protein as Starting Target..")
		refAns <- gene2Fasta( geneID, getOptionValue( optionsFile, "genomicFastaFile", verbose=F), mode="cdna")
		ref.cdna <- refAns$seq[1]
		ref.aa <- DNAtoAA( ref.cdna, clip=F, read=1)
		myDesc <- paste( sampleID, geneName, exonSuffix, sep="_")
		writeFasta( as.Fasta( myDesc, ref.aa), consensusAAfile, line.width=100)
		writeFasta( as.Fasta( myDesc, ref.cdna), consensusDNAfile, line.width=100)
		# also make a 'fake' base calls table
		Ncdna <- nchar(ref.cdna)
		fakeCallsM <- matrix( 0, nrow=Ncdna, ncol=7)
		colnames(fakeCallsM) <- c( "Ref", "A", "C", "G", "T", "N", "Indel")
		fakeCallsM[ ,1] <- 1
		fakeAaM <- matrix( "", nrow=Ncdna, ncol=4)
		colnames(fakeAaM) <- c( "AA", "Frame1", "Frame2", "Frame3")
		aaStrs <- DNAtoAA( ref.cdna, clip=F, read=1:3)
		aaVecs <- strsplit( aaStrs, split="")
		nChar <- length(aaVecs[[1]])
		fakeAaM[ seq(2,Ncdna,by=3)[1:nChar], 1] <- aaVecs[[1]]
		fakeAaM[ seq(2,Ncdna,by=3)[1:nChar], 2] <- aaVecs[[1]]
		nChar <- length(aaVecs[[2]])
		fakeAaM[ seq(3,Ncdna,by=3)[1:nChar], 3] <- aaVecs[[2]][1:nChar]
		nChar <- length(aaVecs[[3]])
		fakeAaM[ seq(4,Ncdna,by=3)[1:nChar], 4] <- aaVecs[[3]][1:nChar]
		callsTable <- data.frame( "POSITION"=1:Ncdna, fakeCallsM, "DNA"=strsplit(ref.cdna, split="")[[1]],
				fakeAaM, "IndelDetails"="", stringsAsFactors=F)
		write.table( callsTable, consensusBASEfile, sep="\t", quote=F, row.names=FALSE)
		nAA <- nchar( ref.aa)
	}

	if (madeAnyFiles) cat( "\nConsensus Prep done for sample: ", sampleID, "\tGene: ", geneID, "\n")
	return( nAA)
}


`inspectConsensus` <- function( sampleID, geneName="Var2csa", context=NULL, extra.rows=3,
				readingFrame=c("BestFrame","Frame1","Frame2","Frame3"), 
				optionsFile="Options.txt", results.path=NULL, peptide.path=NULL) {

	# the gene name is used in all filenames, force it to be clean of special characters
	geneName <- file.cleanSpecialCharactersFromFileName( geneName)

	# get the current consensus base calls file
	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	if ( is.null(peptide.path)) peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)
	consensusBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusBaseCalls.txt", sep="."))
	calls <- read.delim( consensusBASEfile, as.is=T)
	aaAns <- consensusTranslation( calls$DNA)
	readingFrame <- match.arg( readingFrame)
	aaSeq <- paste( aaAns[[readingFrame]], collapse="")
	dnaSeq <- paste( calls$DNA, collapse="")
	calls$AA <- aaAns[[readingFrame]]
	rownames(calls) <- 1:nrow(calls)

	# there is a tiny chance of no indels at all, which makes that column of "" look like NA
	# when we read in the call table
	if ( all( is.na( calls$IndelDetails))) calls$IndelDetails <- ""

	# find that AA context
	context <- toupper( context)
	contextStart <- gregexpr( context, aaSeq, fixed=T)[[1]]
	if ( contextStart[1] < 1) {
		cat( "\nFailed to find 'context' in AA sequence: ", context)
		return(NULL)
	}
	if ( length(contextStart) > 1) {
		cat( "\nNon-unique 'context' in AA sequence: ", context, "\tN_times: ", length(contextStart))
		cat( "\nSites at: ", contextStart)
		return(NULL)
	}
	contextStop <- contextStart + nchar( context) - 1
	contextStr <- substr( aaSeq, contextStart, contextStop)
	cat( "\nContext location: ", contextStart, contextStop, contextStr)

	dnaContextStart <- (contextStart-1) * 3 + 1 - extra.rows
	dnaContextStop <- dnaContextStart + nchar(context) * 3 + extra.rows*2
	cat( "\nBase Call Profile:\n")
	print( calls[ rownames(calls) %in% (dnaContextStart:dnaContextStop), ])
}


`modifyConsensus` <- function( sampleID, geneName="Var2csa", 
				command=c("delete", "insert", "replace", "frameshift","append"), 
				location=NULL, seq, readingFrame=c("BestFrame","Frame1","Frame2","Frame3"), 
				optionsFile="Options.txt", results.path=NULL, peptide.path=NULL) {

	# the gene name is used in all filenames, force it to be clean of special characters
	geneName <- file.cleanSpecialCharactersFromFileName( geneName)

	# get the current consensus base calls file
	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	if ( is.null(peptide.path)) peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)
	consensusBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusBaseCalls.txt", sep="."))
	calls <- read.delim( consensusBASEfile, as.is=T)
	aaAns <- consensusTranslation( calls$DNA)
	readingFrame <- match.arg( readingFrame)
	aaSeq <- paste( aaAns[[readingFrame]], collapse="")
	dnaSeq <- paste( calls$DNA, collapse="")
	calls$AA <- aaAns[[readingFrame]]
	rownames(calls) <- 1:nrow(calls)	

	# there is a tiny chance of no indels at all, which makes that column of "" look like NA
	# when we read in the call table
	if ( all( is.na( calls$IndelDetails))) calls$IndelDetails <- ""

	# what command do we want
	command <- match.arg( command)
	
	# what rows do we want
	if ( is.null(location)) stop( "Explicit row number location (range) required")
	if( is.character(location)) {
		rowNumbers <- as.integer( strsplit( gsub(" ","",location), split=":", fixed=T)[[1]])
		if ( length( rowNumbers) == 2) rowNumbers <- rowNumbers[1] : rowNumbers[2]
	} else {
		rowNumbers <- location
	}
	rowNumbers <- intersect( rowNumbers, 1:nrow(calls))

	# force complete codon edits!
	if ( (!(command %in% c("frameshift","append"))) && rowNumbers[1] %% 3 != 1) {
		cat( "\nLocation 1 for edits must be exact codon start boundary!!")
		stop()
	}
	if ( command == "append" && rowNumbers[1] %% 3 != 0) {
		cat( "\nLocation for append must be exact codon end boundary!!")
		stop()
	}
	if ( command %in% c( "delete","replace") && rowNumbers[length(rowNumbers)] %% 3 != 0) {
		cat( "\nLocation 2 for edits must be exact codon end boundary!!")
		stop()
	}
	cat( "\nCommand: ", command, "\tRows: ", range(rowNumbers))

	newCalls <- NULL
	oldSeq <- ""
	newSeq <- toupper(seq)
	protLen <- sum( calls$AA != "", na.rm=T)

	if ( command %in% c( "delete", "frameshift")) {
		newCalls <- calls[ -rowNumbers, ]
		cat( "\nDeleted ", length(rowNumbers), " rows.")
		oldSeq <- calls$AA[ rowNumbers]
	} else {
		# we need the sequence
		newAAseq <- toupper( seq)
		newDNAseq <- AAtoCodonOptimizedDNA( newAAseq, dnaSeq)
		newDNAvector <- strsplit( newDNAseq, split="")[[1]]
		nNewBases <- length(newDNAvector)
		newAAvector <- rep.int( "", nNewBases)
		newAAvector[ seq( 2, nNewBases, by=3)] <- strsplit( newAAseq, split="")[[1]]
	
		if ( command == "replace") {
			if ( nNewBases != length( rowNumbers)) {
				cat( "\nReplacement sequence is wrong size for location range!")
				stop()
			}
			# OK to replace
			newCalls <- calls
			newCalls$Ref[ rowNumbers] <- 1
			newCalls[ rowNumbers, c("A","C","G","T","N","Indel")] <- 0
			newCalls$DNA[ rowNumbers] <- newDNAvector
			newCalls$AA[ rowNumbers] <- newAAvector
			newCalls$IndelDetails[ rowNumbers] <- ""
			cat( "\nReplaced ", nNewBases, " rows with new sequence: ", newAAseq)
			oldSeq <- calls$AA[ rowNumbers]
		}
		if ( command == "insert") {	# insert 'before' X
			part1 <- part2 <- data.frame()
			if (rowNumbers[1] > 1) part1 <- calls[ 1:(rowNumbers[1]-1), ]
			if (rowNumbers[1] <= nrow(calls)) part2 <- calls[ rowNumbers[1]:nrow(calls), ]
			copyFrom <- min( rowNumbers[1], nrow(calls))
			newPart <- calls[ rep.int( copyFrom, nNewBases), ]
			newPart$Ref <- 1
			newPart[ , c("A","C","G","T","N","Indel")] <- 0
			newPart$DNA <- newDNAvector
			newPart$AA <- newAAvector
			newCalls$IndelDetails <- ""
			newCalls <- rbind( part1, newPart, part2)
		}
		if ( command == "append") {	# insert 'after' X
			part1 <- part2 <- data.frame()
			part1 <- calls[ 1:rowNumbers[1], ]
			if (rowNumbers[1] < nrow(calls)) part2 <- calls[ (rowNumbers[1]+1):nrow(calls), ]
			copyFrom <- min( rowNumbers[1], nrow(calls))
			newPart <- calls[ rep.int( copyFrom, nNewBases), ]
			newPart$Ref <- 1
			newPart[ , c("A","C","G","T","N","Indel")] <- 0
			newPart$DNA <- newDNAvector
			newPart$AA <- newAAvector
			newCalls$IndelDetails <- ""
			newCalls <- rbind( part1, newPart, part2)
		}
	}
	if ( ! is.null( newCalls)) {
		cat( "\nUpdating sequence after edits...")
		aaAns <- consensusTranslation( newCalls$DNA)
		dnaSeq <- paste( newCalls$DNA, collapse="")

		# if we did an explicit frame shift, force keeping RF 1, don't let the consensus pick the best!
		if ( command == "frameshift") {
			newCalls$AA <- aaAns$Frame1
			aaSeq <- paste( aaAns$Frame1, collapse="")
		} else {
			newCalls$AA <- aaAns[[readingFrame]]
			aaSeq <- paste( aaAns[[readingFrame]], collapse="")
		}
		newCalls[ ,c('Frame1','Frame2','Frame3')] <- aaAns[,c("Frame1","Frame2","Frame3")]
		rownames(newCalls) <- 1:nrow(newCalls)

		# save the current prior to overwriting..
		cat( "\nSaving previous .FASTA and BaseCalls..")
		consensusAAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusAA.fasta", sep="."))
		consensusDNAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusDNA.fasta", sep="."))
		saveAAfile <- file.path( peptide.path, paste( sampleID, geneName, "SaveConsensusAA.fasta", sep="."))
		saveDNAfile <- file.path( peptide.path, paste( sampleID, geneName, "SaveConsensusDNA.fasta", sep="."))
		saveBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "SaveConsensusBaseCalls.txt", sep="."))
		file.copy( consensusAAfile, saveAAfile, overwrite=T)
		file.copy( consensusDNAfile, saveDNAfile, overwrite=T)
		file.copy( consensusBASEfile, saveBASEfile, overwrite=T)
		oldFA <- loadFasta( consensusAAfile, verbose=F)
		myDesc <- oldFA$desc
		writeFasta( as.Fasta( myDesc, aaSeq), consensusAAfile, line.width=100)
		writeFasta( as.Fasta( myDesc, dnaSeq), consensusDNAfile, line.width=100)
		write.table( newCalls, consensusBASEfile, sep="\t", quote=F, row.names=TRUE)
		cat( "\nWrote new edited .FASTA and BaseCalls for sampleID: ", sampleID, "\n")
		protLen <- sum( newCalls$AA != "", na.rm=T)
	} else {
		cat( "\nNo modifications made...\n")
	}

	# record what we did
	writeAuditRecord( peptide.path, sampleID, geneName, mode="Modify", 
			info=list( "Command"=command, "Location"=rowNumbers, "OldSequence"=oldSeq,
			"NewSequence"=newSeq, "ReadingFrame"=readingFrame, 
			"Length_AA"=protLen))
}


`realignConsensus` <- function( sampleID, geneID="PF3D7_1200600", geneName=geneID, 
				readingFrame=c("BestFrame","Frame1","Frame2","Frame3"), 
				optionsFile="Options.txt", annotationFile="Annotation.txt", results.path=NULL,
				peptide.path=NULL, exon=NULL, extra.fastq.keyword=NULL, useCutadapt=FALSE,
				mode=c("normal", "TargetSearch")) {

	require(Biostrings)

	# the gene name is used in all filenames, force it to be clean of special characters
	geneName <- file.cleanSpecialCharactersFromFileName( geneName)

	# given a manually improved consensus sequence, redo the Bowtie alignment step to this custome protein target
	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	if ( is.null(peptide.path)) peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)
	consensusAAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusAA.fasta", sep="."))
	consensusDNAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusDNA.fasta", sep="."))
	consensusBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusBaseCalls.txt", sep="."))

	# step 1:  make sure the consensus data match the FASTA files of the same data
	cat( "\nValidating current consensus constructs..")
	calls <- read.delim( consensusBASEfile, as.is=T)
	aaAns <- consensusTranslation( calls$DNA)
	readingFrame <- match.arg( readingFrame)
	aaSeq <- paste( aaAns[[readingFrame]], collapse="")
	dnaSeq <- paste( calls$DNA, collapse="")
	aaFA <- loadFasta( consensusAAfile, verbose=F)
	dnaFA <- loadFasta( consensusDNAfile, verbose=F)
	myDesc <- dnaFA$desc
	aaMatch <- (aaSeq == aaFA$seq)
	# for the DNA, ignore last bases after last AA
	ncDNAcheck <- nchar( aaSeq) * 3
	dnaMatch <- ( substr( dnaSeq, 1, ncDNAcheck) ==  substr( dnaFA$seq, 1, ncDNAcheck))
	if ( ! aaMatch) cat( "\nAmino acid FASTA sequence does not match consensus base calls AA column!")
	if ( ! dnaMatch) cat( "\nNucleotide FASTA sequence does not match consensus base calls DNA column!")
	if ( ! all( c( aaMatch, dnaMatch))) stop( "Fix sequence disparities before realignment can run")

	# step 2:  make a Bowtie index of this tiny sequence
	# but we want to add a bit of flanking data to make the alignments at the edges be more robust...

	# step 2.1: find the flanks of reference DNA around this construct.  When we have a real gene in the current map set
	if ( ! is.null( geneID)) {
		gmap <- subset( getCurrentGeneMap(), GENE_ID == geneID)
		if ( nrow(gmap) < 1) stop( "'geneID' problem:  no gene detected..")
		myStart <- gmap$POSITION[1]
		myStop <- gmap$END[1]
		cat( "\nExtracting genomic flanking regions..")
		flankLen <- 50
		refAns <- pipe.ConsensusBaseCalls( sampleID, geneID=geneID, start=myStart, stop=myStop, 
				aaToo=FALSE, as.cDNA=FALSE, utr.tail.length=flankLen, noReadCalls="genomic")
		myRefV <- refAns$ref
		myRefStr <- paste( myRefV, collapse="")
		if ( gmap$STRAND[1] == "-") myRefStr <- myReverseComplement(myRefStr)
		# we need the 'exon' info to grab/find the correct flanking data
		emap <- subset( getCurrentCdsMap(), GENE_ID == geneID)
		Nexons <- nrow( emap)
		if (! is.null(exon)) {
			exon <- as.integer(exon)
			if( ! (exon %in% 1:Nexons)) stop( paste( "\nError:  'exon' must be an integer in 1..",Nexons))
		}
		# if we are doing a single exon, then our construct will be found directly
		if ( is.integer( exon) || Nexons == 1) {
			cat( "\nGetting single exon flanking regions..")
			paAns <- pairwiseAlignment( dnaSeq, myRefStr, type="global-local")
			if ( score( paAns) < nchar(dnaSeq)/2) cat( "  low 'flank search alignment score' warning: ", score(paAns))
			myRefStart <- start( subject( paAns))
			myRefStop <- width( subject( paAns)) + myRefStart - 1
			leftFlank <- substr( myRefStr, max( 1, myRefStart-flankLen), myRefStart-1)
			rightFlank <- substr( myRefStr, myRefStop+1, min( myRefStop+flankLen, nchar(myRefStr)))
		} else {
			# but if doing a multi-exon construct, we have to go find the correct edges separately
			cat( "\nGetting multi exon flanking regions..")
			if (gmap$STRAND[1] == "-") emap <- emap[ rev( 1:Nexons), ]
			firstExonStart <- 1
			firstExonStop <- emap$END[1] - emap$POSITION[1] + 1
			firstExonSeq <- substr( dnaSeq, firstExonStart, firstExonStop)
			paAns <- pairwiseAlignment( firstExonSeq, myRefStr, type="global-local")
			if ( score( paAns) < nchar(firstExonSeq)/2) cat( "  low 'flank search alignment score' warning: ", score(paAns))
			myRefStart <- start( subject( paAns))
			myRefStop <- width( subject( paAns)) + myRefStart - 1
			leftFlank <- substr( myRefStr, max( 1, myRefStart-flankLen), myRefStart-1)
			lastExonStop <- nchar( dnaSeq)
			lastExonStart <- lastExonStop - (emap$END[Nexons] - emap$POSITION[Nexons])
			lastExonSeq <- substr( dnaSeq, lastExonStart, lastExonStop)
			paAns <- pairwiseAlignment( lastExonSeq, myRefStr, type="global-local")
			if ( score( paAns) < nchar(lastExonSeq)/2) cat( "  low 'flank search alignment score' warning: ", score(paAns))
			myRefStart <- start( subject( paAns))
			myRefStop <- width( subject( paAns)) + myRefStart - 1
			rightFlank <- substr( myRefStr, myRefStop+1, min( myRefStop+flankLen, nchar(myRefStr)))
		}
		leftFlankLen <- nchar( leftFlank)
		rightFlankLen <- nchar( rightFlank)
	} else {	# no real gene to get flanks from...
		leftFlank <- rightFlank <- ""
		leftFlankLen <- rightFlankLen <- 0
	}

	# step 2.2:  make a temp sequence that has these flanks
	tmpdna <- paste( leftFlank, dnaSeq, rightFlank, sep="", collapse="")
	#tmpdnaV <- strsplit( tmpdna, split="")[[1]]
	consensusDNAflankFile <- sub( "ConsensusDNA", "ConsensusDNAwithFlanks", consensusDNAfile)
	writeFasta( as.Fasta( myDesc, tmpdna), consensusDNAflankFile, line.width=100)
	indexFile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusProteinIndex", sep="."))
	cat( "\nMaking Bowtie index from consensus sequence with flanks..")
	callBowtie2Build( buildBowtie2BuildCommandLine( inputFastaFile=consensusDNAflankFile, outputIndexFile=indexFile, 
			optionsFile=optionsFile, verbose=F))

	# step 3:  realign the raw reads against this new target.  This step needs to be aware of which workflow called it
	mode <- match.arg( mode)
	cat( "\nCalling Bowtie against consensus sequence..")
	if ( mode == "TargetSearch") {
		geneReadsFile <- file.path( peptide.path, paste( sampleID, geneName, "fastq.gz", sep="."))
		allReadsFiles <- geneReadsFile
	} else {
		nohitReadsFile <- file.path( results.path, "fastq", paste( sampleID, "noHits.fastq.gz", sep="."))
		geneReadsFile <- file.path( results.path, "fastq", paste( sampleID, geneName, "fastq.gz", sep="."))
		if (useCutadapt) {
			nohitReadsFile <- file.path( results.path, "fastq", paste( sampleID, "noHits.trimmed.fastq.gz", sep="."))
			geneReadsTrimFile <- file.path( results.path, "fastq", paste( sampleID, geneName, "trimmed.fastq.gz", sep="."))
		}
		allReadsFiles <- c(geneReadsFile, nohitReadsFile)
		if ( ! is.null( extra.fastq.keyword)) {
			cat( "  extra FASTQ:  ")
			for ( keyword in extra.fastq.keyword) {
				extraFile <- file.path( results.path, "fastq", paste( sampleID, keyword, "fastq.gz", sep="."))
				allReadsFiles <- c( allReadsFiles, extraFile)
				cat( "  ", keyword)
			}
		}
	}
	# put the sample and gene name in these temp files, to prevent collisions
	bamFile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusProtein.bam", sep="."))
	nhFile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusProtein.noHits.fastq.gz", sep="."))
	bowtieAns <- fastqToBAM( inputFastqFile=allReadsFiles, outputFile=bamFile, sampleID=sampleID, 
			optionsFile=optionsFile, annotationFile=annotationFile, 
			alignIndex=indexFile, index.path=".", noHitsFile=nhFile, verbose=F)
	nReadsAligned <- bowtieAns$UniqueReads + bowtieAns$MultiReads
	
	# step 4:  get the new consensus from this BAM file
	cat( "\nExtract new consensus from re-aligning reads to previous consensus..")
	ans <- consensusBaseCalls( bamfile=bamFile, genomicFastaFile=consensusDNAflankFile, seqID=myDesc, 
				geneID=NULL, start=leftFlankLen+1, stop=leftFlankLen+nchar(dnaSeq), aaToo=TRUE, 
				noReadCalls="genomic")
	#ans <- consensusBaseCallsToCDNA( ans, geneID=NULL, start=leftFlankLen+1, stop=leftFlankLen+nchar(dnaSeq))
	ans <- consensusBaseCallsToCDNA( ans, geneID=NULL)

	# step 5:  write these out
	calls <- ans$callsTable
	dnaSeq <- paste( calls$DNA, collapse="")
	aaSeq <- paste( calls$AA, collapse="")
	consensusAAfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedConsensusAA.fasta", sep="."))
	consensusDNAfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedConsensusDNA.fasta", sep="."))
	consensusBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedConsensusBaseCalls.txt", sep="."))
	writeFasta( as.Fasta( myDesc, aaSeq), consensusAAfile, line.width=100)
	writeFasta( as.Fasta( myDesc, dnaSeq), consensusDNAfile, line.width=100)
	write.table( calls, consensusBASEfile, sep="\t", quote=F, row.names=TRUE)
	cat( "\nWrote new realigned .FASTA and BaseCalls for sampleID: ", sampleID, "\n")

	# step 6:  turn the alignments back to fastq and then to peptides
	bamFile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusProtein.sorted.bam", sep="."))
	fastqFile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusProtein.AlignedReads.fq.gz", sep="."))
	tempPeptidesFile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusProtein.Peptides.txt", sep="."))
	genePeptidesFile <- file.path( peptide.path, paste( sampleID, geneName, "RawReadPeptides.txt", sep="."))
	bam2fastq( bamfile=bamFile, outfile=fastqFile, verbose=F)
	# since the raw DNA reads wt got aligned are already cutAdapt'ed, no need to do it again...
	fastqToPeptides( filein=fastqFile, fileout=tempPeptidesFile, clipAtStop=FALSE)
	# merge these new peptides into the existing set, instead of overwriting...
	mergePeptideFiles( tempPeptidesFile, genePeptidesFile, outfile=genePeptidesFile, 
			mergeCountsMode="File2")

	# record this step to audit trail
	writeAuditRecord( peptide.path, sampleID, geneName, mode="Realign", 
			info=list( "Length_AA"=nchar(aaSeq), "N_Peptides"=nReadsAligned))

	# lastly, clean up
	system( paste( "rm ", file.path( peptide.path, paste( sampleID, geneName, "ConsensusProtein*bam*", sep="."))))
	system( paste( "rm ", file.path( peptide.path, paste( sampleID, geneName, "ConsensusProteinIndex*", sep="."))))
	system( paste( "rm ", file.path( peptide.path, paste( sampleID, geneName, "ConsensusProtein.noHits.fastq.gz", sep="."))))
	system( paste( "rm ", file.path( peptide.path, paste( sampleID, geneName, "ConsensusProtein.AlignedReads.fq.gz", sep="."))))
	system( paste( "rm ", consensusDNAflankFile))
	system( paste( "rm ", file.path( peptide.path, paste( sampleID, geneName, "ConsensusDNAwithFlanks.fasta.fai", sep="."))))
	system( paste( "rm ", tempPeptidesFile))
}


`acceptRealignedConsensus` <- function( sampleID, geneID="PF3D7_1200600", geneName=geneID, optionsFile="Options.txt", 
				results.path=NULL, peptide.path=NULL) {

	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	if ( is.null(peptide.path)) peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)

	# the gene name is used in all filenames, force it to be clean of special characters
	geneName <- file.cleanSpecialCharactersFromFileName( geneName)

	consensusAAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusAA.fasta", sep="."))
	consensusDNAfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusDNA.fasta", sep="."))
	consensusBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusBaseCalls.txt", sep="."))

	realignedAAfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedConsensusAA.fasta", sep="."))
	realignedDNAfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedConsensusDNA.fasta", sep="."))
	realignedBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedConsensusBaseCalls.txt", sep="."))
	if ( ! all( file.exists( c( realignedAAfile, realignedDNAfile, realignedBASEfile)))) {
		cat( "\nRe-aligned files not found!  Tried: ", basename( realignedAAfile))
		stop()
	}

	saveAAfile <- file.path( peptide.path, paste( sampleID, geneName, "SaveConsensusAA.fasta", sep="."))
	saveDNAfile <- file.path( peptide.path, paste( sampleID, geneName, "SaveConsensusDNA.fasta", sep="."))
	saveBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "SaveConsensusBaseCalls.txt", sep="."))
	file.copy( consensusAAfile, saveAAfile, overwrite=T)
	file.copy( consensusDNAfile, saveDNAfile, overwrite=T)
	file.copy( consensusBASEfile, saveBASEfile, overwrite=T)

	file.copy( realignedAAfile, consensusAAfile, overwrite=T)
	file.copy( realignedDNAfile, consensusDNAfile, overwrite=T)
	file.copy( realignedBASEfile, consensusBASEfile, overwrite=T)
	file.delete( c( realignedAAfile, realignedDNAfile, realignedBASEfile))

	consensusPILEUPfile <- file.path( peptide.path, paste( sampleID, geneName, "PeptidePileups.pdf", sep="."))
	consensusFINALfile <- file.path( peptide.path, paste( sampleID, geneName, "FinalConsensus.pdf", sep="."))
	realignedPILEUPfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedPeptidePileups.pdf", sep="."))
	realignedFINALfile <- file.path( peptide.path, paste( sampleID, geneName, "RealignedFinalConsensus.pdf", sep="."))
	if ( file.exists( realignedPILEUPfile)) {
		file.copy( realignedPILEUPfile, consensusPILEUPfile, overwrite=T)
		file.delete( realignedPILEUPfile)
	}
	if ( file.exists( realignedFINALfile)) {
		file.copy( realignedFINALfile, consensusFINALfile, overwrite=T)
		file.delete( realignedFINALfile)
	}
}


mergePeptideFiles <- function( infile1, infile2, outfile, mergeCountsMode=c("File1","File2","Add")) {

	df1 <- read.delim( infile1, as.is=T)
	df2 <- read.delim( infile2, as.is=T)

	outPep <- union( df1$Peptide, df2$Peptide)
	outCounts1 <- outCounts2 <- rep.int( 0, length(outPep))

	wh1 <- match( outPep, df1$Peptide, nomatch=0)
	wh2 <- match( outPep, df2$Peptide, nomatch=0)
	outCounts1[ wh1 > 0] <- df1$Count[wh1]
	outCounts2[ wh2 > 0] <- df2$Count[wh2]

	mergeCountsMode <- match.arg( mergeCountsMode)
	if ( mergeCountsMode == "File1") {
		missing <- which( outCounts1 == 0)
		outCounts1[ missing] <- outCounts2[ missing]
		outCounts <- outCounts1
	} else if ( mergeCountsMode == "File2") {
		missing <- which( outCounts2 == 0)
		outCounts2[ missing] <- outCounts1[ missing]
		outCounts <- outCounts2
	} else {
		outCounts <- outCounts1 + outCounts2
	}

	out <- data.frame( "Peptide"=outPep, "Count"=outCounts, stringsAsFactors=FALSE)
	write.table( out, outfile, sep="\t", quote=F, row.names=F)
}


`searchForMisalignedReads` <- function( sampleID, geneName="Var2csa", context=NULL, flank.size=50,
				optionsFile="Options.txt", results.path=NULL, peptide.path=NULL) {

	# get the current consensus base calls file
	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	if ( is.null(peptide.path)) peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)

	# the gene name is used in all filenames, force it to be clean of special characters
	geneName <- file.cleanSpecialCharactersFromFileName( geneName)

	consensusBASEfile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusBaseCalls.txt", sep="."))
	calls <- read.delim( consensusBASEfile, as.is=T)
	aaAns <- consensusTranslation( calls$DNA)
	readingFrame <- "Frame1"
	aaSeq <- paste( aaAns[[readingFrame]], collapse="")
	dnaSeq <- paste( calls$DNA, collapse="")
	calls$AA <- aaAns[[readingFrame]]
	rownames(calls) <- 1:nrow(calls)

	# find that AA context
	context <- toupper( context)
	contextStart <- gregexpr( context, aaSeq, fixed=T)[[1]]
	if ( contextStart[1] < 1) {
		cat( "\nFailed to find 'context' in AA sequence: ", context)
		return(NULL)
	}
	if ( length(contextStart) > 1) {
		cat( "\nNon-unique 'context' in AA sequence: ", context, "\tN_times: ", length(contextStart))
		cat( "\nSites at: ", contextStart)
		return(NULL)
	}
	contextStop <- contextStart + nchar( context) - 1
	contextStr <- substr( aaSeq, contextStart, contextStop)
	cat( "\nContext location: ", contextStart, contextStop, contextStr)

	# this gives us the DNA in the area around that AA context, plus flanks
	dnaContextStart <- (contextStart-1) * 3 + 1 - flank.size
	dnaContextStop <- dnaContextStart + nchar(context) * 3 + flank.size * 2
	dnaContext <- substr( dnaSeq, dnaContextStart, dnaContextStop)
	dnaContextRC <- myReverseComplement( dnaContext)
	cat( "\nDNA Context to look for: ", dnaContext)

	# get the DNA for every gene
	geneDNAfile <- file.path( peptide.path, "AllGenes.GenomeDNA.fasta")
	if ( ! file.exists( geneDNAfile)) {
		cat( "\nBuilding FASTA of DNA for all genes..")
		genomicFastaFile <- getOptionValue( optionsFile, "genomicFastaFile", verbose=F)
		gmap <- subset( getCurrentGeneMap(), REAL_G == TRUE)
		allGenes <- gmap$GENE_ID
		fa <- gene2Fasta( allGenes, genomicFastaFile, mode="gdna", verbose=T)
		writeFasta( fa, geneDNAfile, line=100)
	} else {
		fa <- loadFasta( geneDNAfile, verbose=F)
	}

	# find the best hits of this DNA context to any gene in the genome
	cat( "\nSearching DNA of all genes..")
	require( Biostrings)
	submat <- nucleotideSubstitutionMatrix()
	scores1 <- pairwiseAlignment( fa$seq, dnaContext, type="local", substitutionMatrix=submat, scoreOnly=T)
	names(scores1) <- fa$desc
	scores2 <- pairwiseAlignment( fa$seq, dnaContextRC, type="local", substitutionMatrix=submat, scoreOnly=T)
	names(scores2) <- fa$desc

	allScores <- sort( c( scores1, scores2), decreasing=T)
	ans <- allScores[ 1:10]

	out <- data.frame( "GENE_ID"=names(ans), "SCORE"= as.numeric(ans), "PRODUCT"=gene2Product( names(ans)), 
				stringsAsFactors=F)
	return(out)
}


`createAuditFile` <- function( peptide.path, sampleID, geneName) {

	auditFile <- auditFileName( peptide.path, sampleID, geneName)
	con <- file( auditFile, open="wt")
	headerTerms <- c( "Command", "DateTime", "Length_AA", "N_Peptides", "SubCommand", "Location_DNA", 
			"OldSeqFrag", "NewSeqFrag")
	headerText <- paste( headerTerms, collapse="\t")
	writeLines( headerText, con=con)
	close( con)
}


`auditFileName` <- function( peptide.path, sampleID, geneName) {

	# the gene name is used in all filenames, force it to be clean of special characters
	geneName <- file.cleanSpecialCharactersFromFileName( geneName)
	f <- paste( sampleID, geneName, "AuditTrail.txt", sep=".")
	return( file.path( peptide.path, f))
}


`writeAuditRecord` <- function( peptide.path, sampleID, geneName, mode, info) {

	auditFile <- auditFileName( peptide.path, sampleID, geneName)
	con <- file( auditFile, open="at")

	dat <- date()
	cmnd <- mode
	cmd2 <- protLen <- nPep <- locStr <- oldSeqFrag <- newSeqFrag <- ""
	if ( mode == "Pileup") {
		# info is a list of details from the protein pileup step
		nPep <- info$N_Peptides
		# the protein is a vector of single characters
		protLen <- length( info$Construct)
	}
	if ( mode == "Modify") {
		# info is a list of details from the protein modification step
		cmd2 <- info$Command
		loc <- as.numeric( info$Location)
		locStr <- paste( min(loc,na.rm=T), max(loc,na.rm=T), sep=":")
		oldSeqFrag <- gsub( " ", "", paste( info$OldSequence, collapse=""))
		newSeqFrag <- gsub( " ", "", paste( info$NewSequence, collapse=""))
		protLen <- info$Length_AA
	}
	if ( mode == "Realign") {
		# info is a list of details from the bowtie re-alignment step
		nPep <- info$N_Peptides
		cmd2 <- "Bowtie2"
		protLen <- info$Length_AA
	}
	auditText <- paste( c( cmnd, dat, protLen, nPep, cmd2, locStr, oldSeqFrag, 
				newSeqFrag), collapse="\t")
	writeLines( auditText, con=con)
	close( con)
}


CPP.AuditSummary <- function( sampleID, geneName="Varcsa", results.path=getOptionValue( "Options.txt", "results.path", verbose=F)) {

	path <- file.path( results.path, "ConsensusProteins", sampleID)
	f <- auditFileName( path, sampleID, geneName=geneName)
	if ( ! file.exists( f)) {
		cat( "\nCPP audit file not found: ", f)
		return( NULL)
	}

	tbl <- read.delim( f, as.is=T)
	cat( "\n\nAudit Summary: ", sampleID, "\n")
	cat( "\nCounts by CPP Command Type:")
	print( ans1 <- table( tbl$Command))
	cat( "\nCounts by CPP 'Modify' Sub-Command Type:")
	print( ans2 <- table( tbl$SubCommand[ tbl$Command == "Modify"]))

	# if the gene is VAR2CSA, do more detailed analysis
	if ( toupper(geneName) == "VAR2CSA") {
		dmap <- getVar2csaDomainMap( strain="3D7")
		# make a "findInterval" construct, using the midpoint between domains
		domStarts <- dmap$REF_START
		prevDomStops <- c( 1, dmap$REF_STOP[1:(nrow(dmap)-1)])
		domStarts <- round( (prevDomStops + domStarts) / 2)
		names( domStarts) <- dmap$DOMAIN_ID
		# let's look at all the modifcation lines
		isMOD <- which( tbl$Command == "Modify")
		# turn the "Location_DNA" info into  a AA location
		dnaLocTerms <- strsplit( tbl$Location_DNA[ isMOD], split=":")
		aaCenter <- sapply( dnaLocTerms, function( x) return( round( mean( as.numeric(x), na.rm=T) / 3)))
		# bacause the CPP construct tends to be short early, and grow to full size, just using AA locs
		# will have a bias.  Convert to percentages to do the find, show we are being fair
		domStarts <- domStarts / max( domStarts)
		# more precisely, the length to use is the one from "before" this edit, as it was the location this edit occured on
		aaLens <- as.numeric( tbl$Length_AA[ isMOD-1])
		# we except a typical Var2csa to be ~2650 long.  And we know that the worst, most likely missing region is DBL6
		# thus, lets assume the length at a minimum to be more fair with our fractional guesses.
		aaLens <- pmax( aaLens, rep.int(2600,length(aaLens)))
		aaCenter <- aaCenter / aaLens
		hits <- findInterval( aaCenter, domStarts, all.inside=T)
		domHits <- names( domStarts)[hits]
		domHitTbl <- table( factor( domHits, levels=dmap$DOMAIN_ID))
		domHitPct <- round( domHitTbl * 100 / sum(domHitTbl), digits=1)
		cat( "\nLocations of Modify Operations By Var2csa Domain: \n")
		ans3 <- data.frame( "DomainID"=dmap$DOMAIN_ID, "N_Modify_Ops"=as.numeric(domHitTbl), 
				"Pct_Modify_Ops"=as.numeric(domHitPct), stringsAsFactors=F)
		rownames(ans3) <- 1:nrow(ans3)
		print( ans3)

		return( list( "CPP Commands"=ans1, "Modify_Subcommands"=ans2, "Var2csa Domains"=ans3))
	} else {
		return( list( "CPP Commands"=ans1, "Modify_Subcommands"=ans2))
	}
}


`showCPPspliceBoundaries` <- function( geneID, construct, optionsFile="Options.txt") {

	# get the exon details
	cdsMap <- subset( getCurrentCdsMap(), GENE_ID == geneID)
	if ( nrow(cdsMap) < 2) return()
	geneMap <- subset( getCurrentGeneMap(), GENE_ID == geneID)

	# the protein may be a vector of single bases
	if ( length(construct) > 1) construct <- paste( construct, collapse="")

	# get the genomic locations of the splices
	gPos <- sort( c( cdsMap$POSITION, cdsMap$END))
	# the very first and last are the gene ends
	gPos <- gPos[ 2:(length(gPos)-1)]
	# what's left should be end, beg, end, beg, etc.  Let's use jus the begin spots
	gPos <- gPos[ seq( 2, length(gPos), by=2)]
	if ( !length(gPos)) return()

	# map these to protein AA lcoations
	posAns <- convertGenomicDNApositionToAAposition( cdsMap$SEQ_ID[1], DNAposition=gPos, geneID=geneID,
					genemap=geneMap, cdsmap=cdsMap)
	aaPos <- posAns$AA_POSITION
	codonPos <- posAns$CODON_POSITION
	# try to tweak where the exon splice line lands based on codon location
	#toShift <- which( codonPos == 3)
	#aaPos[ toShift] <- aaPos[toShift] - 1
	
	# we need the reference protein to know what these locations should look like
	fa <- gene2Fasta( geneID, getOptionValue( optionsFile, "genomicFastaFile", verbose=F), mode="aa")
	refProtein <- fa$seq[1]
	if ( is.na(refProtein)) return()

	require(Biostrings)
	data( BLOSUM62)
	myExonFrags <- substr( rep.int( refProtein,length(aaPos)), aaPos, aaPos+8)
	myPA <- pairwiseAlignment( myExonFrags, refProtein, type="global-local", scoreOnly=F, 
				substitutionMatrix=BLOSUM62)
	myLocs <- start( subject( myPA))
	for ( i in 1:length(myLocs)) {
		lines( rep.int(myLocs[i]-0.5,2), c(-5,100), lwd=0.5, lty=2, col="steelblue")
	}
	return()
}
robertdouglasmorrison/DuffyNGS documentation built on March 24, 2024, 4:16 p.m.