R/lifeCycleTools.R

Defines functions `recombineLifeCycleStageIntensity` `adjustLifeCycleStageFileSet` `adjustLifeCycleStageMatrix` `drawStageIntensityLine` `drawStageDensityLine` `plotLifeCycleStageIntensity` `plotLifeCycleStageUnitVectors` `bestLifeCycleStageTimeCall` `plotLifeCycleStages` `plotLifeCycleStageFromMatrix` `plotLifeCycleStageFromFileSet` `calcLifeCycleStageFromFile` `calcLifeCycleStage` `buildLifeCycleVectorSpace` `getLifeCycleMatrix` `LifeCycleSetup` `verifyLifeCycleSetup`

# lifeCycleTools.R

`verifyLifeCycleSetup` <- function() {

	isReady <- exists( "VectorSpace", envir=LifeCycleEnv)
	isRightSpecies <- FALSE
	if ( isReady) {
		curSpecies <- get( "Species", envir=LifeCycleEnv)
		if( ! is.null( curSpecies)) isRightSpecies <- ( curSpecies == getCurrentSpecies())
	}
	if ( !isReady || !isRightSpecies) LifeCycleSetup( dataset="RNAseq", unitVectorMode="absolute")
	return()
}


`LifeCycleSetup` <- function( dataset=c( "RNAseq", "YoungWinzeler", "DeRisi", "Custom"), 
				unitVectorMode=c("absolute","relative","none"), 
				stageMode=c("all", "blood.only"),
				custom.file=NULL, preNormalize=TRUE, postNormalize=TRUE, 
				doRMA=FALSE, min.spread=3.0, verbose=FALSE) {

	dataset <- match.arg( dataset)
	unitVectorMode <- match.arg( unitVectorMode)
	stageMode <- match.arg( stageMode)
	ans <- NULL

	cat( "\nSetting up LifeCycle dataset:  ", dataset)

	# get the data from the package, its called 'lifeCycleData'
	if ( dataset == "RNAseq") {
		data( LifeCycleData_RNAseq, package="DuffyTools", envir=environment())

		# use explicit colnames lookup
		sgspzPtrs <- match( c("sgSPZ.3D7","sgSPZ.NF54",
					"Spz1","Spz2","Spz3","Spz4", "SGspz1","SGspz2","SGspz3"), colnames(lifeCycleData))
		#lsd3Ptrs <- match( c("SanariaLS_Day3_1","SanariaLS_Day3_2"), colnames(lifeCycleData))
		lsd3Ptrs <- match( c("SanariaLS_Day3_Clean"), colnames(lifeCycleData))
		merozPtrs <- match( c("OttoHr48","StunHr5","HoeijHr05"), colnames(lifeCycleData))
		eringPtrs <- match( c("OttoHr0","OttoHr8","StunHr10","HoeijHr10"), colnames(lifeCycleData))
		lringPtrs <- match( c("OttoHr16","StunHr15","HoeijHr15","StunHr20","HoeijHr20",
					"Ring1","Ring2"), colnames(lifeCycleData))
		etrophPtrs <- match( c("OttoHr24","StunHr25","HoeijHr25",
					"HoeijHr30","StunHr30"), colnames(lifeCycleData))
		ltrophPtrs <- match( c("OttoHr32","StunHr35","HoeijHr35"), colnames(lifeCycleData))
		schizPtrs <- match( c("OttoHr40","StunHr40","HoeijHr40","SanariaBLOOD_1",
					"SanariaBLOOD_2"), colnames(lifeCycleData))
		#gcyte2Ptrs <- match( c("Su.GcyteII"), colnames(lifeCycleData))
		gcyte5Ptrs <- match( c("Su.GcyteV","AO_Gcyte","SauerGcyte_Female","SauerGcyte_Male1",
					"SauerGcyte_Male2","Gcyte1","Gcyte2"), colnames(lifeCycleData))
		ookinePtrs <- match( c("Su.Ookinete3D7","Su.Ookinete7G8"), colnames(lifeCycleData))
		#oocystPtrs <- match( c("Oocyst1","Oocyst2"), colnames(lifeCycleData))
		oospzPtrs <- match( c("ooSPZ.NF54","OOspz1","OOspz2","OOspz3"), colnames(lifeCycleData))
		#allPtrs <- c( sgspzPtrs, lsd3Ptrs, merozPtrs, eringPtrs, lringPtrs, etrophPtrs, ltrophPtrs,
		#		schizPtrs, gcyte2Ptrs, gcyte5Ptrs, ookinePtrs, oospzPtrs)
		allPtrs <- c( sgspzPtrs, lsd3Ptrs, merozPtrs, eringPtrs, lringPtrs, etrophPtrs, ltrophPtrs,
				#schizPtrs, gcyte5Ptrs, ookinePtrs, oocystPtrs, oospzPtrs)
				schizPtrs, gcyte5Ptrs, ookinePtrs, oospzPtrs)
		if ( any ( is.na( allPtrs))) {
			bad <- which( is.na( allPtrs))
			cat( "\nBad/missing Life Cycle Stage column name: ", bad)
			stop()
		}

		# these are column numbers in the original file
		ans <- buildLifeCycleVectorSpace( file=NULL, tbl=lifeCycleData, geneColumn="Gene",
			#dimensions=list( "SalivarySPZ"=sgspzPtrs, "LiverDay3"=lsd3Ptrs, "Merozoite"=merozPtrs,
			#"EarlyRing"=eringPtrs, "LateRing"=lringPtrs, "EarlyTroph"=etrophPtrs, "LateTroph"=ltrophPtrs, 
			#"Schizont"=schizPtrs, "EarlyGameto"=gcyte2Ptrs, "LateGameto"=gcyte5Ptrs, 
			#"Ookinete"=ookinePtrs, "OocystSPZ"=oospzPtrs), 
			dimensions=list( "SalivarySPZ"=sgspzPtrs, "LiverDay3"=lsd3Ptrs, "Merozoite"=merozPtrs,
			"EarlyRing"=eringPtrs, "LateRing"=lringPtrs, "EarlyTroph"=etrophPtrs, "LateTroph"=ltrophPtrs, 
			"Schizont"=schizPtrs, "Gametocyte"=gcyte5Ptrs, "Ookinete"=ookinePtrs, 
			#"Oocyst"=oocystPtrs, "OocystSPZ"=oospzPtrs), 
			"OocystSPZ"=oospzPtrs), 
			unitVectorMode=unitVectorMode, min.spread=min.spread, preNormalize=preNormalize, postNormalize=postNormalize,
			doRMA=FALSE, stageMode=stageMode, verbose=verbose)
		ans2 <- buildLifeCycleVectorSpace( file=NULL, tbl=lifeCycleData, geneColumn="Gene",
			#dimensions=list( "SalivarySPZ"=sgspzPtrs, "LiverDay3"=lsd3Ptrs, "Merozoite"=merozPtrs,
			#"EarlyRing"=eringPtrs, "LateRing"=lringPtrs, "EarlyTroph"=etrophPtrs, "LateTroph"=ltrophPtrs, 
			#"Schizont"=schizPtrs, "EarlyGameto"=gcyte2Ptrs, "LateGameto"=gcyte5Ptrs, 
			#"Ookinete"=ookinePtrs, "OocystSPZ"=oospzPtrs), 
			dimensions=list( "SalivarySPZ"=sgspzPtrs, "LiverDay3"=lsd3Ptrs, "Merozoite"=merozPtrs,
			"EarlyRing"=eringPtrs, "LateRing"=lringPtrs, "EarlyTroph"=etrophPtrs, "LateTroph"=ltrophPtrs, 
			"Schizont"=schizPtrs, "Gametocyte"=gcyte5Ptrs, "Ookinete"=ookinePtrs, 
			#"Oocyst"=oocystPtrs, "OocystSPZ"=oospzPtrs), 
			"OocystSPZ"=oospzPtrs), 
			unitVectorMode="none", min.spread=NULL, preNormalize=preNormalize, postNormalize=postNormalize,
			doRMA=FALSE, stageMode=stageMode, verbose=FALSE)
	}

	# get the data from the package, its called 'tbl'
	if ( dataset == "YoungWinzeler") {
		data( LifeCycleData_YoungWinzeler, package="DuffyTools", envir=environment())

		# these are column numbers in the original file
		ans <- buildLifeCycleVectorSpace( file=NULL, tbl=tbl, geneColumn="Gene",
			dimensions=list( "Sporozoite"=5, "Merozoite"=c(12,19), "EarlyRing"=c(6,13), 
			"LateRing"=c(7,14), "EarlyTroph"=c(8,15), "LateTroph"=c(9,16), 
			"EarlySchiz"=c(10,17), "LateSchiz"=c(11,18), "EarlyGameto"=c( 24:26), 
			"LateGameto"=c( 27:29)), unitVectorMode=unitVectorMode,
			min.spread=min.spread, preNormalize=preNormalize, postNormalize=postNormalize,
			doRMA=doRMA, verbose=verbose)
		ans2 <- buildLifeCycleVectorSpace( file=NULL, tbl=tbl, geneColumn="Gene",
			dimensions=list( "Sporozoite"=5, "Merozoite"=c(12,19), "EarlyRing"=c(6,13), 
			"LateRing"=c(7,14), "EarlyTroph"=c(8,15), "LateTroph"=c(9,16), 
			"EarlySchiz"=c(10,17), "LateSchiz"=c(11,18), "EarlyGameto"=c( 24:26), 
			"LateGameto"=c( 27:29)), unitVectorMode="none",
			min.spread=NULL, preNormalize=preNormalize, postNormalize=postNormalize,
			doRMA=doRMA, verbose=FALSE)
	}

	if ( dataset == "DeRisi") {
		data( LifeCycleData_DeRisi, package="DuffyTools", envir=environment())
		ans <- buildLifeCycleVectorSpace( file=NULL, tbl=tbl, geneColumn="Gene",
			dimensions=list( "EarlyRing"=1:6, "MidRing"=7:12, "LateRing"=13:18,
			"EarlyTroph"=19:25, "LateTroph"=26:31, "EarlySchiz"=32:39, "LateSchiz"=40:48), 
			unitVectorMode=unitVectorMode, min.spread=min.spread, 
			preNormalize=preNormalize, postNormalize=postNormalize,
			doRMA=doRMA, verbose=verbose)
		ans2 <- buildLifeCycleVectorSpace( file=NULL, tbl=tbl, geneColumn="Gene",
			dimensions=list( "EarlyRing"=1:6, "MidRing"=7:12, "LateRing"=13:18,
			"EarlyTroph"=19:25, "LateTroph"=26:31, "EarlySchiz"=32:39, "LateSchiz"=40:48), 
			unitVectorMode="none", min.spread=NULL, 
			preNormalize=preNormalize, postNormalize=postNormalize,
			doRMA=doRMA, verbose=FALSE)
	}

	if ( dataset == "Custom") {

		# pre read the file to make the dimensions construct
		tmp <- read.delim(custom.file, as.is=T)
		dimNames <- setdiff( colnames(tmp), "Gene")
		dimensions <- vector( mode="list")
		for (k in 1:length(dimNames)) dimensions[[k]] <- match( dimNames[k], colnames(tmp))
		names(dimensions) <- dimNames

		cat( "\nBuilding custom Life Cycle dataset:   Dimensions:\n")
		print( dimensions)
		cat( "\nFirst genes: \n")
		print( head( tmp))


		ans <- buildLifeCycleVectorSpace( file=custom.file, tbl=NULL, geneColumn="Gene",
			dimensions=dimensions,
			unitVectorMode=unitVectorMode, min.spread=min.spread, 
			preNormalize=preNormalize, postNormalize=postNormalize,
			doRMA=doRMA, verbose=verbose)
		ans2 <- buildLifeCycleVectorSpace( file=custom.file, tbl=NULL, geneColumn="Gene",
			dimensions=dimensions,
			unitVectorMode="none", min.spread=min.spread, 
			preNormalize=preNormalize, postNormalize=postNormalize,
			doRMA=doRMA, verbose=verbose)
	}


	LifeCycleEnv[[ "VectorSpace" ]] <- ans
	LifeCycleEnv[[ "IntensitySpace" ]] <- ans2
	LifeCycleEnv[[ "Species" ]] <- getCurrentSpecies()
	if ( ! is.null( ans)) {
		LifeCycleEnv[[ "N_STAGES" ]] <- ncol(ans) - 2   # geneID, Product come first
		LifeCycleEnv[[ "STAGE_NAMES" ]] <- colnames(ans)[3:ncol(ans)]
	}
	cat( "\nDone.\n")
}


`getLifeCycleMatrix` <- function( mode=c("IntensitySpace", "VectorSpace")) {

	verifyLifeCycleSetup()
	mode <- match.arg( mode)

	tbl <- LifeCycleEnv[[ mode]]
	genes <- tbl$GENE_ID

	# get just the values, not the IDs and Products
	m <- as.matrix( tbl[ ,3:ncol(tbl)])
	rownames(m) <- genes
	return(m)
}


`buildLifeCycleVectorSpace` <- function( file="", tbl=NULL, dimensions, geneColumn="Gene", 
					unitVectorMode=c( "absolute", "relative", "none"), 
					stageMode=c("all", "blood.only"),
					min.value=0.01, min.spread=2, preNormalize=FALSE, 
					postNormalize=TRUE, doRMA=TRUE, verbose=FALSE) {

	unitVectorMode <- match.arg( unitVectorMode)
	stageMode <- match.arg( stageMode)

	# get all the original reference set of gene data
	if ( is.null( tbl)) {
		tbl <- read.delim( file, as.is=TRUE)
	}

	# verify needed columns are all present
	if ( !( geneColumn %in% colnames(tbl))) {
		stop( paste( "gene name column not found:   tried: ", 
			geneColumn, "\tfound: ", colnames(tbl)))
	}

	allDs <- base::unlist( dimensions)
	isNumbers <-  ! any( is.na( colNumbers <- as.integer( allDs)))
	if (isNumbers) {
		if ( max( colNumbers) > ncol(tbl)) {
			stop( paste( "Invalid dimension column:   given: :", base::sort(colNumbers), 
				"\tColumns in table: ", ncol(tbl)))
		}
	} else {
		if ( !( all( allDs %in% colnames(tbl)))) {
			stop( paste( "some dimension name columns not found:   tried: ", 
				allD, "\tfound: ", colnames(tbl)))
		}
		allDs <- base::match( allDs, colnames(tbl))
		for( k in 1:length( dimensions)) dimensions[[k]] <- base::match( dimensions[[k]], colnames(tbl))
	}
	if ( verbose) {
		for( k in 1:length(dimensions)) cat( "\n\nStage: ", names(dimensions)[k], 
			"\n\tColumns: ", colnames(tbl)[ dimensions[[k]] ])
		cat( "\n")
	}

	# get the gene names and resolve to current annotation
	oldGenes <- tbl[[ geneColumn]]

	# allow a mapping by ortholog to the current species...  the Life cycle data is PF3D7 only
	thisSpecies <- getCurrentSpecies()
	if ( toupper(thisSpecies) %in% setdiff( toupper(PARASITE_SPECIES), "PF3D7")) {
		setCurrentSpecies( "Pf3D7")
		newGenes <- alias2Gene( oldGenes)
		orthos <- ortholog( newGenes, from="Pf3D7", to=thisSpecies)
		# drop rows with no ortholog
		keep <- which( orthos != "")
		if ( length(keep) < 1000) {
			cat( "\nProblem:  Not enough Pf3D7 GeneIDs detected.  Verify gene names...")
			stop()
		}
		tbl <- tbl[ keep, ]
		newGenes <- orthos[ keep]
		setCurrentSpecies( thisSpecies)
	} else {
		newGenes <- alias2Gene( oldGenes)
	}

	# extract just the wanted columns, impose a minimun intensity, and perhaps normalize the intensity columns
	dataColumns <- allDs
	mIn <- as.matrix( tbl[ , dataColumns])
	colnames( mIn) <- base::paste( "V", dataColumns, sep="")
	mIn[ is.na(mIn)] <- min.value
	mIn[ mIn < min.value] <- min.value

	# with microarrays, RMA is more appropriate, otherwise just plain quantile normalize
	if ( preNormalize) {
		if ( doRMA) {
			mIn <- duffyRMA( mIn, verbose=verbose)
		} else {
			mIn <- duffyMagnitudeNormalize( mIn, verbose=verbose)
		}
	}

	# there may be duplicate rows because of gene re-annotation or orthollogging...
	# combine those by mean average
	uGenes <- unique.default( newGenes)
	nUnique <- length( uGenes)
	mNew <- matrix( 0, nrow=nUnique, ncol=ncol(mIn))
	colnames(mNew) <- colnames(mIn)
	for ( i in 1:nUnique) {
		who <- which( newGenes == uGenes[i])
		if ( length(who) == 1) {
			mNew[ i, ] <- mIn[ who, ]
		} else {
			mNew[ i, ] <- apply( mIn[ who, ], MARGIN=2, FUN=sqrtmean)
		}
	}

	# use the dimensionality list to combine columns
	N_STAGES <- length( dimensions)
	STAGE_NAMES <- names( dimensions)
	mNewer <- matrix( nrow=nrow(mNew), ncol=N_STAGES)
	for( i in 1:N_STAGES) {
		who <- dimensions[[i]]
		where <- base::match( base::paste( "V", who, sep=""), colnames(mNew))
		if ( length(who) == 1) {
			oneV <- mNew[ , where]
		} else {
			oneV <- apply( mNew[ , where], MARGIN=1, FUN=sqrtmean)
		}
		mNewer[ , i] <- oneV
	}
	colnames( mNewer) <- STAGE_NAMES

	# once the stages have been set, now we can eliminate some if wanted...
	if ( stageMode == "blood.only") {
		keepers <- grep( "merozoite|ring|troph|schizont", tolower(colnames(mNewer)))
		# put a second copy of merozoite at the end, to complete the cycle
		mNewer <- mNewer[ , c(keepers,keepers[1])]
		colnames(mNewer)[ncol(mNewer)] <- "Merozoite2"
		N_STAGES <- ncol( mNewer)
		STAGE_NAMES <- colnames( mNewer)
	}

	# with microarrays, RMA is more appropriate, otherwise just plain quantile normalize
	if ( postNormalize) {
		if ( doRMA) {
			mNewer <- duffyRMA( mNewer, verbose=verbose)
		} else {
			mNewer <- duffyMagnitudeNormalize( mNewer, verbose=verbose)
		}
	}
	# now normalize each row to sum to unit vector (length=1)
	if ( unitVectorMode != "none") {
	    for( i in 1:nrow( mNewer)) {
		oneV <- mNewer[ i, ]
		# subtract the min so the proportions have at least one 0%
		if ( unitVectorMode == "relative") oneV <- oneV - min(oneV)
		oneV <- oneV / sum(oneV)
		mNewer[i, ] <- oneV
	    }
	}

	if ( ! is.null( min.spread)) {
		min.spread <- as.numeric( min.spread)
		mins <- apply( mNewer, MARGIN=1, min, na.rm=T)
		maxs <- apply( mNewer, MARGIN=1, max, na.rm=T)
		spreads <- maxs / mins
		drops <- which( spreads < min.spread)
		if ( length(drops) > 0) {
			mNewer <- mNewer[ -drops, ]
			uGenes <- uGenes[ -drops]
			cat( "\nDropping Genes with too little intensity change between dimensions: ", length(drops))
		}
	}

	ans <- data.frame( "GENE_ID"=uGenes, "PRODUCT"=gene2Product( uGenes), mNewer,
			stringsAsFactors=FALSE)
	rownames(ans) <- 1:nrow(ans)
	return( ans)
}
 

`calcLifeCycleStage` <- function( g, inten, dropGenes=vector()) {

	verifyLifeCycleSetup()

	# get the life cycle data...  there is 'geneID, product', and then the intensities...
	vectorSpace <- LifeCycleEnv[[ "VectorSpace"]]
	unitVectors <- vectorSpace[ , 3:ncol(vectorSpace)]
	N_STAGES <- LifeCycleEnv[[ "N_STAGES"]]
	STAGE_NAMES <- LifeCycleEnv[[ "STAGE_NAMES"]]

	# build up a tally of how much intensity goes to each stage
	# do it for all genes, and the make stage histogram from the marker genes
	genes <- alias2Gene(g)

	# allow omitting some GeneIDs...
	if ( length( dropGenes)) {
		droppers <- alias2Gene( dropGenes)
		toDrop <- which( genes %in% droppers)
		if ( length( toDrop)) {
			g <- g[ -toDrop]
			genes <- genes[ -toDrop]
			inten <- inten[ -toDrop]
		}
	}

	# the Vector Space table is now in the current species, no need to ortholog here!
	where <- base::match( genes, vectorSpace$GENE_ID, nomatch=0)

	# build the matrix of stage fractions for every gene we were given.
	# switching to only use genes that are defined... others will not contribute
	myFractions <- matrix( 0, nrow=length(genes), ncol=N_STAGES)
	for( i in 1:N_STAGES) {
		myFractions[ where > 0, i] <- unitVectors[ where, i]
	}

	# account for background intensity to put microarray and RNA, etc., on equal footing .. now done elsewhere.!
	useInten <- inten

	# spread these intensitys over those stages
	myVectors <- myIntens <- matrix( 0, nrow=length(genes), ncol=N_STAGES)
	rownames(myVectors) <- rownames(myIntens) <- g
	colnames(myVectors) <- colnames(myIntens) <- STAGE_NAMES
	for( i in 1:N_STAGES) {
		myVectors[ , i] <- useInten * myFractions[ , i]
		myIntens[ , i] <- inten * myFractions[ , i]
	}

	# now do a summation by stage, and express as percentages...
	allSums <- apply( myVectors, MARGIN=2, FUN=sum, na.rm=T)
	bigSum <- sum( allSums)
	ans <- allSums * 100 / bigSum

	out <- list( "Stage"=ans, "IntensityVectors"=myIntens)
	return( out)
}


`calcLifeCycleStageFromFile` <- function( f, geneColumn="GENE_ID", intensityColumn="INTENSITY", 
					dropGenes=vector(), sep="\t") {

	verifyLifeCycleSetup()

	# open that file and find the needed columns
	tmp <- read.delim( f, as.is=T, sep=sep)
	if ( nrow( tmp) < 1) return(NULL)

	gset <- tmp[[ geneColumn]]
	if ( is.null(gset)) {
		cat( "calcLifeCycleStage:  gene name column not found.\nFile: ",f,
			"\nTried: ", geneColumn)
		return( NULL)
	}

	inten <- tmp[[ intensityColumn]]
	if ( is.null(inten)) {
		cat( "calcLifeCycleStage:  gene intensity column not found.\nFile: ",f,
			"\nTried: ", intensityColumn)
		return( NULL)
	}

	return( calcLifeCycleStage( gset, inten, dropGenes=dropGenes))
}


`plotLifeCycleStageFromFileSet` <- function( fnames, fids, fcolors=NULL, geneColumn="GENE_ID", 
		intensityColumn="RPKM_M", yMax=NULL, barSplines=FALSE, bar.lwd=5, legend.cex=1, max.labels=20,
		dropGenes=vector(), label="your label goes here...", sep="\t") {

	verifyLifeCycleSetup()

	# make sure we can read those files
	filesOK <- file.exists( fnames)
	if ( !all( filesOK)) {
		cat( "\nLifeStage:  Some transcript files not found:\n")
		print( fnames[ !filesOK])
		return(NULL)
	}

	N_STAGES <- LifeCycleEnv[[ "N_STAGES"]]
	STAGE_NAMES <- LifeCycleEnv[[ "STAGE_NAMES"]]

	# build the storage
	nFiles <- length( fnames)
	m <- matrix( nrow=nFiles, ncol=N_STAGES)
	colnames(m) <- STAGE_NAMES
	rownames(m) <- fids

	# load each file in turn
	cat( "\nLoading:  ")
	for( i in 1:nFiles) {
		cat( " ", basename(fnames[i]))
		ans <- calcLifeCycleStageFromFile( fnames[i], geneColumn=geneColumn,
				intensityColumn=intensityColumn, dropGenes=dropGenes, sep=sep)
		m[ i, ] <- ans$Stage
	}
	cat( "\n")

	# plot it
	splineAns <- plotLifeCycleStages(m, col=fcolors, label=label, yMax=yMax, barSplines=barSplines,
			bar.lwd=bar.lwd, legend.cex=legend.cex, max.labels=max.labels)

	if ( is.null( splineAns)) {
		return( m)
	} else {
		return( list( "stage.vectors"=m, "spline.maximums"=splineAns))
	}
}


`plotLifeCycleStageFromMatrix` <- function( geneSet, intenMatrix, fids=colnames(intenMatrix), 
			fcolors=NULL, yMax=NULL, barSplines=FALSE, bar.lwd=5, legend.cex=1, max.labels=20,
			dropGenes=vector(), label="your label goes here...") {

	verifyLifeCycleSetup()

	N_STAGES <- LifeCycleEnv[[ "N_STAGES"]]
	STAGE_NAMES <- LifeCycleEnv[[ "STAGE_NAMES"]]

	# build the storage
	nColumns <- ncol( intenMatrix)
	m <- matrix( nrow=nColumns, ncol=N_STAGES)
	colnames(m) <- STAGE_NAMES
	rownames(m) <- fids

	# load each dataset in turn
	for( i in 1:nColumns) {
		ans <- calcLifeCycleStage( geneSet, intenMatrix[ ,i], dropGenes=dropGenes)
		m[ i, ] <- ans$Stage
	}

	# plot it
	splineAns <- plotLifeCycleStages(m, col=fcolors, label=label, yMax=yMax, barSplines=barSplines,
			bar.lwd=bar.lwd, legend.cex=legend.cex, max.labels=max.labels)

	if ( is.null( splineAns)) {
		return( m)
	} else {
		return( list( "stage.vectors"=m, "spline.maximums"=splineAns))
	}
}


`plotLifeCycleStages` <- function( m, col=NULL, yMax=NULL, label="", barSplines=FALSE, 
				legend.cex=1.0, max.labels=20, bar.lwd=5) {

	N <- nrow(m)
	NC <- ncol(m)

	if ( all( is.na(m))) {
		cat( "\nWarning:  no non-zero gene intensities !!")
		cat( "\nPerhaps expression data does not match current species...")
		return(NULL)
	}

	par( "mai"=c( 1.25, 0.95, 0.85, 0.2))
	las <- 3
	border <- par( "fg")
	if ( is.null( col)) {
		col=gray( seq( 0.2, 1.0, length.out=N))
	} else {
		if ( N >= 20) border <- NA
	}

	barSpace <- c( 0, N/4)

	if ( is.null( yMax)) yMax <- max(m) * 1.1
	mainText <- paste( "Life Cycle Stage Plot:\n", label)

	mp <- barplot(m, beside=T, col=col, border=border, main=mainText, 
		ylab="Percent of Total Gene Intensity", 
		space=barSpace, las=las, font.lab=2, font.axis=2, cex.lab=1.05, cex.axis=1.05, 
		ylim=c(0,yMax), xlim=c( N*0.1,N*(ncol(m)+4.2)))

	splineAns <- NULL
	if ( is.logical( barSplines)) {
		if ( barSplines) {
			barSplines <- 1:N
		} else {
			barSplines <- vector()
		}
	}
	if ( length(barSplines)) {

		# use the "middle" X for all stages, so large Ncolumns doesn't distort the pic.
		x <- apply( mp, MARGIN=2, FUN=mean)
		splineAns <- vector()
		for( j in round( barSplines)) {
			y <- m[ j, ]
			# x <- mp[ j, ]
			ans <- spline( x, y, n=8*length(x), method="natural")
			lines( ans$x, ans$y, col=par("fg"), lwd=bar.lwd*1.4, lty=1)
			lines( ans$x, ans$y, col=col[j], lwd=bar.lwd, lty=1)
			ans2 <- spline( 1:NC, y, n=20*NC, method="natural")
			splineAns[j] <- ans2$x[ which.max(ans2$y)]
		}
		names(splineAns) <- rownames(m)
		splineAns <- splineAns[ ! is.na(splineAns)]
	}
	
	# limit the legend to a reasonable number
	who <- 1:N
	isSubset <- FALSE
	if ( N > 12) legend.cex <- legend.cex * 0.95
	if ( N > 18) legend.cex <- legend.cex * 0.95
	if ( N > max.labels) {
		who <- seq.int( 1, N, length.out=max.labels)
		isSubset <- TRUE
		legend.cex <- legend.cex * 0.95
	}
	ans <- legend( "topright", rownames(m)[who], fill=col[who], cex=legend.cex, bg="white")
	if ( isSubset) mtext( "not all labeled", side=3, adj=1, cex=legend.cex*0.95)
	dev.flush()

	return( splineAns)
}


`bestLifeCycleStageTimeCall` <- function( splineAns, timePoints=c(0,8,16,24,32,40,48)) {

	# time is currently hard coded for "Blood.Only" mode
	IE_MAX <- 48

	lowLimits <- floor( splineAns)
	lowTime <- timePoints[lowLimits]
	hiLimits <- ceiling( splineAns)
	hiTime <- timePoints[hiLimits]
	if ( any( is.na( c( lowTime, hiTime)))) {
		cat( "\nExpected non-NA time hour values for IE cycle stages")
		cat( "\nRequires a 'spline.maximums' object from a LifeCycle plot result.")
		return( NA)
	}

	fraction <- splineAns - lowLimits
	bestTime <- lowTime + fraction * (hiTime - lowTime)
	isNEG <- which( bestTime < 0)
	isBIG <- which( bestTime > IE_MAX)
	if ( length(isNEG)) bestTime[ isNEG] <- bestTime[isNEG] + IE_MAX
	if ( length(isBIG)) bestTime[ isBIG] <- bestTime[isBIG] - IE_MAX

	return( round( bestTime, digits=2))
}


`plotLifeCycleStageUnitVectors` <- function( gSet, col=1, lwd=1, legend=NA, plot=TRUE, yMax=1,
				legend.cex=1, label="", annotate=FALSE) {

	verifyLifeCycleSetup()

	# get the life cycle data...  there is 'geneID, product', and then the intensities...
	vectorSpace <- LifeCycleEnv[[ "VectorSpace"]]
	unitVectors <- vectorSpace[ , 3:ncol(vectorSpace)]
	N_STAGES <- LifeCycleEnv[[ "N_STAGES"]]
	STAGE_NAMES <- LifeCycleEnv[[ "STAGE_NAMES"]]

	if (plot) {

		par( "mai"=c( 1.25, 0.95, 0.85, 0.4))

		plot( 1,1, type="n", main=paste( "Life Cycle Stage:   Gene Unit Vectors\n",label),
			xlim=c(0.5,N_STAGES+0.5), ylim=c(0,yMax), xaxt="n", xlab=NA,
			ylab="Gene 'Projection' per Stage", las=3, font.axis=2, font.lab=2)
		axis( 1, at=1:N_STAGES, label=STAGE_NAMES, las=3, font=2)
	}

	# draw the lines a bit nicer...as step steps...
	colUse <- rep( col, length.out=length(gSet))
	lwdUse <- rep( lwd, length.out=length(gSet))

	where <- base::match( gSet, vectorSpace$GENE_ID, nomatch=NA)
	who <- which( !is.na(where))

	for ( k in who) {
		y <- as.numeric( vectorSpace[ where[k], 3:ncol(vectorSpace)])
		ans <- drawStageDensityLine( y, col=colUse[k], lwd=lwdUse[k])
		xLocation <- ans$x
		if ( annotate) {
			useY <- which.max( y)
			text( xLocation[useY], y[useY], gSet[k], col=colUse[k], pos=3, font=2, cex=1)
		}
	}
	if ( ! is.na( legend)) {
		legend( x=legend, legend=gSet[who], col=colUse[who], lwd=3, cex=legend.cex)
	}
	
	# send back useful info
	units <- vectorSpace[ where, ]
	out <- list( "unitVectors"=units, "x"=xLocation)
	return( out)
}


`plotLifeCycleStageIntensity` <- function( gSet, col=1, lwd=1, pt.cex=1.25, legend=NA, plot=TRUE, 
				legend.cex=1, label="", minYmin=1, minYmax=NULL, threshold=NULL, annotate=FALSE) {

	verifyLifeCycleSetup()

	# get the life cycle data...  there is 'geneID, product', and then the intensities...
	intenSpace <- LifeCycleEnv[[ "IntensitySpace"]]
	intenVectors <- as.matrix( intenSpace[ , 3:ncol(intenSpace)])
	N_STAGES <- LifeCycleEnv[[ "N_STAGES"]]
	STAGE_NAMES <- LifeCycleEnv[[ "STAGE_NAMES"]]

	where <- base::match( gSet, intenSpace$GENE_ID, nomatch=0)
	if (sum(where > 0) < 1) {
		cat( "\nNo Matching Genes found..")
		return()
	}

	intenSpace <- intenSpace[ where, , drop=F]
	intenVectors <- intenVectors[ where, , drop=F]
	gSet <- gSet[ where > 0]
	NG <- length(gSet)

	bigValue <- max( intenVectors, na.rm=T)
	if ( ! is.null( minYmax)) bigValue <-max( bigValue, minYmax, na.rm=T)
	if ( ! is.null( threshold)) bigValue <- max( bigValue, threshold, na.rm=T)
	yMax <- bigValue * 2
	smallValue <- min( intenVectors, na.rm=T)
	if ( ! is.null( threshold)) minYmin <- min( minYmin, threshold)
	yMin <- min( smallValue, minYmin)
	if ( yMin < 0.01) yMin <- 0.01

	if (plot) {

		par( "mai"=c( 1.25, 0.95, 0.85, 0.4))
		mainText <- paste( "Life Cycle Stage:    Gene Intensity Vectors\n",label)
		if ( NG == 1) {
			mainText <- paste( "Life Cycle Stage:    Gene Intensity:    ", gSet)
			if ( (orig <- gene2OrigID(gSet)) != gSet) mainText <- paste( mainText, "    (",orig,")", sep="")
			mainText <- paste( mainText, "\n", gene2Product( gSet))
		}

		plot( 1,1, type="n", main=mainText, xlim=c(0.5,N_STAGES+0.5), ylim=c(yMin,yMax), 
			log="y", xaxt="n", xlab=NA, ylab="Gene Intensity per Stage  (RPKM)", las=3, font.axis=2, font.lab=2)
		axis( 1, at=1:N_STAGES, label=STAGE_NAMES, las=3, font=2)
	}

	# draw the lines a bit nicer...as step steps...
	colUse <- rep( col, length.out=length(gSet))
	lwdUse <- rep( lwd, length.out=length(gSet))

	if ( ! is.null( threshold)) {
		lines( c(-2,20), rep.int(threshold,2), lwd=1, lty=3, col="brown")
		text( c(0.3,0.3), c( threshold,threshold), c( "Expression", "Threshold"), col="brown", pos=c(3,1))
	}

	# when only one gene, color by stage instead
	if ( NG == 1) {
		colUse <- rainbow( ncol(intenVectors), end=0.78)
		y <- intenVectors[ 1, ]
		y <- pmax( y, yMin)
		ans <- drawStageIntensityLine( y, col=colUse, lwd=lwdUse[1], pt.cex=pt.cex, col2=1, useLog=T)
		xLocation <- ans$x
		if ( annotate) {
			useY <- which.max( y)
			text( xLocation[useY], y[useY], gSet[1], col=colUse[useY], pos=3, font=2, cex=1)
		}
	} else {
	    for ( k in 1:length(gSet)) {
		y <- intenVectors[ k, ]
		y <- pmax( y, yMin)
		ans <- drawStageIntensityLine( y, col=colUse[k], lwd=lwdUse[k], pt.cex=pt.cex, useLog=T)
		xLocation <- ans$x
		if ( annotate) {
			useY <- which.max( y)
			text( xLocation[useY], y[useY], gSet[k], col=colUse[k], pos=3, font=2, cex=1)
		}
	    }
	}
	if ( ! is.na( legend)) {
		if ( NG == 1) colUse <- 1
		legend( x=legend, legend=gSet, col=colUse, lwd=3, cex=legend.cex)
	}
	dev.flush()
	
	# send back useful info
	out <- list( "intensityVectors"=intenVectors, "x"=xLocation)
	return( out)
}


`drawStageDensityLine` <- function(y, col, lwd) {

	yUse <- c( 0, rep( y, each=2), 0)
	xmids <- seq( 0, length(y), by=1) + 0.5
	xUse <- base::sort( c( xmids-0.05, xmids+0.05))
	lines( xUse, yUse, type="l", col=col, lwd=lwd) 
	return( invisible( list( "x"=(xmids+0.5))))
}


`drawStageIntensityLine` <- function(y, at=1:length(y), col, lwd, pt.cex, col2=col, useLog=FALSE) {

	N <- length(y)
	if (useLog) {
		splineAns <- spline( at, log2(y+1), n=N*5)
		xSpline <- splineAns$x
		ySpline <- 2 ^ (splineAns$y) - 1
		# spline in log space can look extreme...
		minY <- min(y, na.rm=T) *.1
		ySpline[ ySpline < minY] <- minY
	} else {
		splineAns <- spline( at, y, n=N*5)
		xSpline <- splineAns$x
		ySpline <- splineAns$y
	}
	lines( xSpline, ySpline, col=col2, lty=2, lwd=lwd)
	points( at, y, pch=21, bg=col, cex=pt.cex)
	return( invisible( list( "x"=at)))
}


`adjustLifeCycleStageMatrix` <- function( geneSet, intenMatrix, fids=colnames(intenMatrix), 
		fcolors=NULL, tolerance=1.0, rate=1.0, max.iterations=100, 
		label="your label here...", legend.cex=1, max.labels=20, pause=FALSE) {

	verifyLifeCycleSetup()

	# turn the matrix into some temp transcript files, then call the file set tool
	nFiles <- ncol( intenMatrix)
	fNames <- vector()

	for ( i in 1:nFiles) {
		f <- paste( "matrix", fids[i], "Pf.Transcript.txt", sep=".")
		fNames[i] <- f
		tmp <- data.frame( "GENE_ID"=geneSet, "INTENSITY"=intenMatrix[ ,i], stringsAsFactors=F)
		write.table( tmp, file=f, sep="\t", quote=F, row.names=F)
	}

	ans <- adjustLifeCycleStageFileSet( fNames, fids, fcolors=fcolors, tolerance=tolerance, 
			rate=rate, max.iterations=max.iterations, label=label, legend.cex=legend.cex,
			max.labels=max.labels, pause=pause)

	# with this final adjustment scaling, modify the original matrix
	intenOut <- intenMatrix
	gPtrs <- match( rownames(intenMatrix), rownames(ans$scaleFactors), nomatch=0)
	for ( j in 1:nrow(intenMatrix)) {
		where <- gPtrs[j]
		if ( where == 0) next
		scal <- ans$scaleFactors[ where, ]
		if (any( scal == 0)) next
		intenOut[ j, ] <- intenMatrix[ j, ] * scal
	}

	# remove those temp files
	for ( i in 1:nFiles) {
		f <- paste( "matrix", fids[i], "Pf.Transcript.txt", sep=".")
		file.delete(f)
	}

	return( intenOut)
}


`adjustLifeCycleStageFileSet` <- function( fnames, fids, fcolors=NULL, geneColumn="GENE_ID", 
		intensityColumn="INTENSITY", tolerance=1.0, rate=1.0, max.iterations=100, 
		label="your label here...", legend.cex=1, max.labels=20, pause=FALSE, yMax=NULL) {

	verifyLifeCycleSetup()

	# make sure we can read those files
	filesOK <- file.exists( fnames)
	if ( !all( filesOK)) {
		cat( "\nLifeCycleStage:  Some transcript files not found:\n")
		print( fnames[ !filesOK])
		return(NULL)
	}

	# build local re-writable files and storage for each
	myFiles <- paste( "tmp.stageAdjusted", basename( fnames), "rda", sep=".")
	nFiles <- length( fnames)
	fileList <- geneIntenList <- vector( mode="list", length=nFiles)

	neededColumns <- c( geneColumn, intensityColumn)
	for( i in 1:nFiles) {
		tbl <- read.delim( fnames[i], as.is=T)
		tbl <- tbl[ , neededColumns]
		fileList[[i]] <- tbl
		save( tbl, file=myFiles[i])
	}

	# make one copy of all gene intensities before and after to create final multipliers
	startingInten <- endingInten <- matrix( 1.0, nrow=nrow(tbl), ncol=nFiles)
	rownames(startingInten) <- rownames(endingInten) <- geneNames <- base::sort( tbl[ , geneColumn])
	colnames(startingInten) <- colnames(endingInten) <- fids
	for( i in 1:nFiles) {
		tbl <- fileList[[i]]
		who <- base::match( geneNames, tbl[ , geneColumn])
		startingInten[ ,i] <- endingInten[ , i] <- tbl[ who, intensityColumn]
	}

	# get the life cycle data...  there is 'geneID, product', and then the intensities...
	vectorSpace <- LifeCycleEnv[[ "VectorSpace"]]
	N_STAGES <- LifeCycleEnv[[ "N_STAGES"]]
	unitVectors <- vectorSpace[ , 3:ncol(vectorSpace)]
	STAGE_NAMES <- LifeCycleEnv[[ "STAGE_NAMES"]]

	# storage for the resulting stage histograms
	m <- matrix( nrow=nFiles, ncol=N_STAGES)
	colnames(m) <- STAGE_NAMES
	rownames(m) <- fids

	# repeatedly measure and adjust til convergence
	nIters <- 0
	curDev <- N_STAGES
	repeat {

		# load each file in turn
		for( i in 1:nFiles) {
			tbl <- fileList[[ i]]
			ans <- calcLifeCycleStage( g=tbl[[ geneColumn]],
				inten=tbl[[ intensityColumn]])
			m[ i, ] <- ans$Stage
			geneIntenList[[ i]] <- ans$IntensityVectors
		}

		# plot it
		plotText <- paste( label, "\nIter: ", nIters, "     Max Intra-Stage Dif:  ", 
				formatC(curDev, format="f", digits=3))
		plotLifeCycleStages(m, col=fcolors, label=plotText, yMax=yMax, legend.cex=legend.cex,
				max.labels=max.labels)

		# calculate the average per stage and the factor set to apply to each file
		stageAvg <- apply( m, MARGIN=2, mean)
		deviation <- m
		for (i in 1:nFiles) deviation[ i, ] <- stageAvg - m[ i, ]
		netDifference <- range( as.vector( deviation))
		curDev <- netDifference[2] - netDifference[1]

		nIters <- nIters + 1
		cat( "\rIter: ", nIters, "\tMax Intra-Stage Dif: ", curDev)
		if ( curDev <= tolerance) break
		if ( nIters > max.iterations) break
		if (pause) locator(1)

		# now use these file to file stage deviations to tweak each transcript
		multiplier <- (100 + (deviation*rate)) / 100
		for( i in 1:nFiles) {
			tbl <- fileList[[ i]]
			newInten <- recombineLifeCycleStageIntensity( geneIntenList[[i]], multiplier[ i, ])

			# put this new intensities back for next pass
			tbl[ , intensityColumn] <- newInten
			fileList[[ i]] <- tbl
			save( tbl, file=myFiles[i])

			who <- base::match( geneNames, names( newInten))
			endingInten[ , i] <- newInten[ who]
		}
	}
	cat( "\nDone.")

	# plot it again to show latest numbers
	plotText <- paste( label, "\nDone.   Iter: ", nIters, "     Max Intra-Stage Dif:  ", 
			formatC(curDev, format="f", digits=3))
	plotLifeCycleStages(m, col=fcolors, label=plotText, yMax=yMax, legend.cex=legend.cex,
			max.labels=max.labels)

	finalScaling <- endingInten / startingInten
	finalScaling[ is.nan(finalScaling)] <- 1.0

	out <- list( "deviation"=curDev, "iterations"=nIters, "Stages"=m, 
			"scaleFactors"=finalScaling)

	# clean up the temp files...
	file.delete( myFiles)

	return( out)
}


`recombineLifeCycleStageIntensity` <- function( geneStageInten, multiplier=rep( 1.0, times=ncol(geneStageInten))) {

	# recombine the stage intensity portions into one gene intensity, allowing for
	# a multiplier to be applied to each stage
	myInten <- geneStageInten
	sumBefore <- sum( myInten, na.rm=T)

	for ( i in 1:ncol( geneStageInten)) myInten[ ,i] <- myInten[ ,i] * multiplier[i]

	sumAfter <- sum( myInten, na.rm=T)
	adjFactor <- sumBefore / sumAfter
	myInten <- myInten * adjFactor

	geneInten <- apply( myInten, MARGIN=1, sum, na.rm=TRUE)

	names(geneInten) <- rownames( geneStageInten)
	return( geneInten)
}
robertdouglasmorrison/DuffyTools documentation built on April 16, 2024, 6:31 a.m.