R/pipe.ConsensusProteinExtraction.R

Defines functions `pipe.RealignExtractedProteins` `cpp.ExtractTopProteins` `pipe.ConsensusProteinExtraction`

# pipe.ConsensusProteinExtraction.R -- tools to extract the final protein calls from CPP
#				allowing for 2+ proteins of different proportions to be resolved by deconvolution


`pipe.ConsensusProteinExtraction` <- function( sampleID, geneID="PF3D7_1200600", geneName="Var2csa", optionsFile="Options.txt",
						results.path=NULL, max.proteins=NULL, min.minor.pct=6, min.mutation.pct=1.0, min.minor.read.count=2,
						verbose=TRUE ) {
						
	# min.minor.pct - at any one amino acid, how frequent must a minor AA call be to not be treated as just noise
	# min.mutation.pct -  for the full length protein, what fraction of AA must be mutated to call it a separate protein call


	# make sure we were given valid parameters..
	if ( length(sampleID) > 1) {
		sampleID <- sampleID[1]
		cat( "\nWarning:  only one sampleID allowed")
	}
	gmap <- getCurrentGeneMap()
	if ( ! (geneID %in% gmap$GENE_ID)) {
		cat( "\nNot a known geneID: ", geneID)
		return(NULL)
	}
	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)
	if ( ! file.exists( peptide.path)) {
		cat( "\nNo 'Consensus Protein' results found for sample: ", sampleID)
		return(NULL)
	}

	# build the filename of the expected results, and get the consensus protein call details 
	summaryFile <- file.path( peptide.path, paste( sampleID, geneName, "ConsensusProteinSummary.txt", sep="."))						
	if ( ! file.exists( summaryFile)) {
		cat( "\nProtein summary file not found: ", summaryFile)
		return(NULL)
	}
	protDF <- read.delim( summaryFile, as.is=T)
	NAA <- nrow( protDF)

	# there is a small chance that this file is old, from befoer the 'Counts' field was added.
	# check and perhaps remake it
	if ( ! "Counts" %in% colnames(protDF)) {
		sumFile <- sub( "ProteinSummary.txt$", "Answer.rda", summaryFile)
		tmpAns <- proteinConstructPileupSummary( sumFile, sampleID=sampleID, geneName=geneName, doPlot=F)
		protDF <- read.delim( summaryFile, as.is=T)
		NAA <- nrow( protDF)
	}
	
	# get the consensus protein(s) themselves, and extract the initial guess about proportions from their names
	topProteins <- cpp.ExtractTopProteins( summaryFile, min.minor.pct=min.minor.pct, min.mutation.pct=min.mutation.pct,
						drop.gaps=FALSE)
	NPROT <- length( topProteins)
	protNames <- names( topProteins)

	# allow the user to put a upper limit on how many proteins we consider
	if ( ! is.null( max.proteins)) {
		topProteins <- topProteins[ 1:min(NPROT,max.proteins)]
		NPROT <- length( topProteins)
		protNames <- names( topProteins)
	}
	protProportions <- rep.int( NA, NPROT)
	protProportions[1] <- 100
	if (NPROT > 1) {
		for ( i in 2:NPROT) protProportions[i] <- as.numeric( sub( "(^MinorVariant.+Pct=)([0-9]+)(%$)", "\\2", protNames[i]))
		protProportions[1] <- 100 - sum( protProportions[2:NPROT], na.rm=T)
	}
	if (verbose) {
		cat( "\nInitial Proportions: \n")
		print( paste( protNames, protProportions, sep=" = "))
	}
	# as defined, all these constructs should be the same length.  Make sure
	if ( any( nchar(topProteins) != NAA)) {
		stop( "\nSize error:  consensus protein lengths not all equal..")
	}
	# turn all the protein calls into one AA matrix
	protV <- strsplit( topProteins, split="")
	aaM <- matrix( unlist(protV), nrow=NAA, ncol=NPROT)
	colnames(aaM) <- protNames
	
	# prep all the protein calls and all the summary percentages & counts
	protV <- strsplit( topProteins, split="")
	pcts <- protDF$Percentages
	cnts <- protDF$Counts
	pctTerms <- strsplit( pcts, split="; ")
	cntTerms <- strsplit( cnts, split="; ")
	pctAAs <- lapply( pctTerms, function(x) sub( ":[0-9]+", "", x))
	pctVals <- lapply( pctTerms, function(x) as.numeric( sub( "[-*A-Z]:", "", x)))
	cntAAs <- lapply( cntTerms, function(x) sub( ":[0-9]+", "", x))
	cntVals <- lapply( cntTerms, function(x) as.numeric( sub( "[-*A-Z]:", "", x)))
	
	# we can tell who is conserved or not by how many AA were called
	nAAterms <- sapply( pctAAs, length)
	useForScoring <- which( nAAterms > 1)
	
	# we will ignore very small read counts as being not seen enough to be considered a true minor variant
	for ( k in useForScoring) {
		myCnts <- cntVals[[k]]
		tooLow <- which( myCnts < min.minor.read.count)
		if ( length(tooLow)) {
			# remove these from both percents and counts
			cntVals[[k]] <- cntVals[[k]][ -tooLow]
			cntAAs[[k]] <- cntAAs[[k]][ -tooLow]
			pctVals[[k]] <- pctVals[[k]][ -tooLow]
			pctAAs[[k]] <- pctAAs[[k]][ -tooLow]
			# re-evaluate the percentages after this trim of low count minor variants
			pctVals[[k]] <- round( cntVals[[k]] * 100 / sum( cntVals[[k]], na.rm=T))
		}
	}
	
	# now we need to reassess who is conserved or not by how many AA were called
	nAAterms <- sapply( pctAAs, length)
	useForScoring <- which( nAAterms > 1)
	if (verbose) cat( "\nN AA with minor variants: ", length(useForScoring))
	
	# use one universal set of all possible AA for doing our compares and distance scoring
	ALL.AA <- sort( unique( c( unlist( pctAAs), unlist(cntAAs))))
	
	# prebuild the static distributions of the called AAs
	aaPctTables <- vector( mode="list", length=NAA)
	for ( k in useForScoring) {
		myAA <- rep( pctAAs[[k]], times=pctVals[[k]])
		myTbl <- table( factor(myAA, levels=ALL.AA))
		aaPctTables[[k]] <- myTbl
	}
	
	
	# local function that scores how well the consensus proteins match/explain the call details
	# puts some results into global storage for speed / later use...
	aaDist <- rep.int( 0, NAA)
	aaM.out <- aaM
	
	consensus.AA.Distance <- function( aaM, protPcts) {
	
		# given one matrix of all the protein AA calls for every location, and the percentage 
		# that each protein contributes to the total, see how this current consensus and percentage
		# fits to the called data from the CPP tool
		protPcts <- as.integer( protPcts)

		# visit each AA that has any minor forms, and see how distant is current from what was called
		for ( k in useForScoring) {
			myAA <- rep( aaM[ k, ], times=protPcts)
			myTbl <- table( factor(myAA, levels=ALL.AA))
			calledTbl <- aaPctTables[[k]]
			myDistTbl <- myTbl - calledTbl
			myDist <- sum( abs( myDistTbl), na.rm=T) 
			# store these where the caller can use them later
			aaDist[k] <<- myDist 
		}
		return( sum(aaDist) / nrow(aaM))
	}
	
	
	optimize.Proportions <- function( aaM, protPcts, verbose=T) {
	
		# given one matrix of all the protein AA calls for every location, and the percentage 
		# that each protein contributes to the total, optimize those percentages to give the best
		# fit to the called data
		
		# ready to do the optimization.   Evaluate at the initial estimate
		startingDist <- consensus.AA.Distance( aaM, protProportions)
		if (verbose) cat( "\nInitial Distance: ", startingDist)

		bestDist <- startingDist
		bestProportion <- protProportions
		iterCount <- 0
		while ( TRUE ) {
			# step over all alternative pairs of proteins
			thisProportion <- lastProportion <- bestProportion
			iterCount <- iterCount + 1
			bestI <- bestJ <- 0
			for (i in 1:NPROT) {
				for ( j in 1:NPROT) {
					if ( i == j) next
					# tweak the proportions between all 2 choices of protein
					thisProportion <- lastProportion
					thisProportion[i] <- thisProportion[i] + 1
					thisProportion[j] <- thisProportion[j] - 1
					# never let a proportion go to zero
					if ( any( thisProportion < 1)) next
					# rescore the distance
					thisDist <- consensus.AA.Distance( aaM, thisProportion)
					if ( thisDist < bestDist) {
						bestDist <- thisDist
						bestProportion <- thisProportion
						bestI <- i
						bestJ <- j
					}
				}
			}
			# we did all possible steps, was any better?
			if ( bestI == 0 || bestJ == 0) break
			if (verbose) cat( "\nIter: ", iterCount, "  Dist: ", bestDist, "  Proportions: ", bestProportion)
		}
		return( list( "Distance"=bestDist, "Proportions"=bestProportion))
	}


	optimize.ResidueCalls <- function( aaM, protPcts, verbose=T) {
	
		# given one matrix of all the protein AA calls for every location, and the percentage 
		# that each protein contributes to the total, optimize the AA residues between the proteins
		# to give the best fit to the called data
		
		# the current distance for each minor site already sits in global storage...

		aaM.out <- aaM
		dist.out <- aaDist
		
		# visit each AA that has any minor forms, and see how distant is affected by permuting the residues
		for ( k in useForScoring) {
		
			# start with the set of AA the proteins were on function entry
			myAAorig <- aaM[ k, ]
			aaSet <- sort( unique( myAAorig))
			nChoice <- length( aaSet)
			nNeed <- ncol(aaM)
			
			# step over all possible combinations of AA
			bestDist <- origDist <- aaDist[k]
			setPtr <- rep.int( 1, nNeed)
			bestAAtry <- NULL
			while (TRUE) {
				# use the set pointers to make this possible choice of AA calls
				myAAtry <- aaSet[ setPtr]
				myAA <- rep( myAAtry, times=protPcts)
				myTbl <- table( factor(myAA, levels=ALL.AA))
				calledTbl <- aaPctTables[[k]]
				myDistTbl <- myTbl - calledTbl
				thisDist <- sum( abs( myDistTbl), na.rm=T) 
				if ( thisDist < bestDist) {
					bestDist <- thisDist
					bestAAtry <- myAAtry
				}
				# now iterate by incrementing the pointers to eventually try all possible combinations
				setPtr[1] <- setPtr[1] + 1
				for (ich in 1:(nNeed-1)) {
					if (setPtr[ich] > nChoice) {
						setPtr[ich] <- 1
						setPtr[ich+1] <- setPtr[ich+1] + 1
					}
				}
				# we are done when the very last pointer is too big
				if ( setPtr[nNeed] > nChoice) break
			}
			# we did all possible steps, was any better?
			if ( is.null(bestAAtry)) next
			if (verbose && any( myAAorig != bestAAtry)) cat( "\nAA Swap: ", k, "  OldDist:", origDist, "  OldAA:", myAAorig, 
									"|  NewDist:", bestDist, "  NewAA:", bestAAtry)
			
			# update the proteins
			aaM.out[ k, ] <- bestAAtry
			dist.out[k] <- bestDist
		}
		
		newDist <- sum( dist.out) / nrow(aaM)
		return( list( "Distance"=newDist, "AA_Matrix"=aaM.out))
	}
	
	
	topDistSites <- function( aaM, aaDist, cutoffDist, nExtraAA=2) {
	
		# gather the top residual distance sites as extra info
		keep <- which( aaDist > cutoffDist)
		if ( ! length(keep)) return(NULL)
		badDist <- aaDist[ keep]
		badMotif <- aaM[ keep, 1]
		for ( i in 1:length(keep)) {
			k <- keep[i]
			badMotif[i] <- paste( aaM[(max(1,k-nExtraAA)):(min(nrow(aaM),k+nExtraAA)), 1], collapse="")
		}
		sml <- data.frame( "AA.Position"=keep, "AA.Motif"=badMotif, "Residual.Distance"=badDist, stringsAsFactors=F)
		ord <- order( sml$Residual.Distance, decreasing=T)
		sml <- sml[ ord, ]
		rownames(sml) <- 1:nrow(sml)
		return(sml)
	}

	
	# ready to do the work...
	# Step 1:  optimize the starting proportions, given what the extractor guessed
	myProportions <- protProportions
	ans1 <- optimize.Proportions( aaM, myProportions, verbose=verbose)
	myProportions <- ans1$Proportions
	myDist <- ans1$Distance
	# note that the optimization may have decreased the number of proteins!...
	NPROT <- length( myProportions)
	
	if ( NPROT > 1) {
		# Step 2:  visit each minor mutation site, and see if swapping residues improves the distance scoring
		ans2 <- optimize.ResidueCalls( aaM, myProportions, verbose=verbose)
		aaM.out <- ans2$AA_Matrix
	
		# Step 3:  re-optimize the  proportions, given the improved residue calls
		ans3 <- optimize.Proportions( aaM.out, myProportions, verbose=verbose)
		myProportions <- ans3$Proportions
		myDist <- ans3$Distance
	}
	
	# make the final answer
	finalProportions <- myProportions
	finalDist <- myDist
	finalAAseqs <- apply( aaM.out, MARGIN=2, FUN=function(x) paste( x, collapse=""))
	finalNames <- rep.int( paste( sampleID, geneName, sep="_"), NPROT)
	finalNames[1] <- paste( finalNames[1], "_Consensus", sep="")
	if (NPROT > 1) {
		finalNames[2:NPROT] <- paste( finalNames[2:NPROT], "_Variant", 2:NPROT, sep="")
	}
	finalSuffix <- paste( round( finalProportions), "%", sep="")
	finalNames <- paste( finalNames, finalSuffix, sep="_")
	
	if (verbose) cat( "\nFinal Extraction: \n", finalNames, "\nFinal Distance: ", finalDist)

	outSeqs <- as.Fasta( finalNames, finalAAseqs)
	finalFile <- file.path( peptide.path, paste( sampleID, geneName, "FinalExtractedAA.fasta", sep="."))
	writeFasta( outSeqs, finalFile, line=100)

	alnFile <- file.path( peptide.path, paste( sampleID, geneName, "FinalExtractedAA.aln", sep="."))
	if ( NPROT > 1) {
		aln <- mafft( finalFile, alnFile)
		writeALN( aln, alnFile, line=100)	
	} else {
		# make sure any earlier result gets removed
		file.delete( alnFile)
	}
	
	ans4 <- topDistSites( aaM.out, aaDist, finalDist*2)
	if ( ! is.null(ans4)) {
		residSiteFile <- file.path( peptide.path, paste( sampleID, geneName, "FinalExtracted.ResidualSites.csv", sep="."))
		write.table( ans4, residSiteFile, sep=",", quote=T, row.names=F)	
	}
	
	out <- list( "AA.Fasta"=outSeqs, "Residual"=finalDist, "Problem.Sites"=ans4)
	return( invisible(out))
}
	


`cpp.ExtractTopProteins` <- function( proteinSummaryFile, min.minor.pct=5.0, min.mutation.pct=1.0, 
					drop.gaps=TRUE) {

	# min.minor.pct - at any one amino acid, how frequent must a minor AA call be to not be treated as just noise
	# min.mutation.pct -  for the full length protein, what fraction of AA must be mutated to call it a separate protein call

	tbl <- read.delim( proteinSummaryFile, as.is=T)

	# the most likely one protein is already called
	bestProtein <- tbl$ConsensusAA
	NAA <- length( bestProtein)
	
	# always return the best
	out <- paste( bestProtein, collapse="")
	names(out) <- "BestConsensus"

	# to know the minor proteins, look to the table of details
	pctDetails <- tbl$Percentages
	pctTerms <- strsplit( pctDetails, split="; ", fixed=T)
	nTerms <- sapply( pctTerms, length)
	has2plus <- which( nTerms > 1)

	# start with the best, and then substitute where the minor call is deep enough
	minorPctsFound <- rep.int( NA, 5)
	for ( minor in 2:5) {
		nMutat <- 0
		myPcts <- vector()
		minorProtein <- bestProtein
		for ( j in has2plus) {
			if ( nTerms[j] < minor) next
			thisTerm <- pctTerms[[j]][minor]
			thisAA <- sub( ":.+", "", thisTerm)
			thisPct <- as.numeric(sub( ".+:", "", thisTerm))
			if ( ! is.na( thisPct) && thisPct >= min.minor.pct) {
				minorProtein[j] <- thisAA
				nMutat <- nMutat + 1
				myPcts[ nMutat] <- thisPct
			}
		}
		
		# did we get enought mutations to call this a new protein variant?
		if ( nMutat < (NAA * (min.mutation.pct/100))) next
		
		minorProtein <- paste( minorProtein, collapse="")
		out <- c( out, minorProtein)
		avgPct <- round( mean( myPcts))
		minorPctsFound[ minor] <- avgPct
	}

	# there is a chance of gaps being in the final set, drop them
	if ( drop.gaps) {
		out <- gsub( "-", "", out, fixed=T)
	}
	
	if ( length(out) > 1) {
		names(out) <- c( "BestConsensus", paste( "MinorVariant", 2:length(out), "_Pct=", minorPctsFound[2:length(out)], "%", sep=""))
	}
	return(out)
}



`pipe.RealignExtractedProteins` <- function( sampleID, geneID="PF3D7_1200600", geneName="Var2csa", optionsFile="Options.txt",
						results.path=NULL) {
						
	# after hand curation of the extracted .ALN file against the pileup image, remake the final FASTA and ALN files

	if ( is.null(results.path)) results.path <- getOptionValue( optionsFile, "results.path", notfound=".", verbose=F)
	peptide.path <- file.path( results.path, "ConsensusProteins", sampleID)
	if ( ! file.exists( peptide.path)) {
		cat( "\nNo 'Consensus Protein' results found for sample: ", sampleID)
		return(NULL)
	}

	finalFile <- file.path( peptide.path, paste( sampleID, geneName, "FinalExtractedAA.fasta", sep="."))
	alnFile <- file.path( peptide.path, paste( sampleID, geneName, "FinalExtractedAA.aln", sep="."))
	if ( ! file.exists( alnFile)) {
		cat( "\nError:  Can not find required Final Extracted ALN file: ", alnFile)
		return(NULL)
	}
	
	# convert the ALN that was hand modified
	ALNtoFasta( alnFile, outfile=finalFile, gap.character="", line.width=100, reorder=NULL)
	
	# now redo the MSA alignment again
	aln <- mafft( finalFile, alnFile)
	writeALN( aln, alnFile, line=100)	
}
robertdouglasmorrison/DuffyNGS documentation built on May 16, 2024, 10:14 a.m.