# pipe.MarkerGenes.R -- evaluate transcriptomes based on a set of marker genes
`pipe.ScoreMarkerGenes` <- function( sampleIDset, markerDF, optionsFile="Options.txt", results.path=NULL,
folder=NULL, mode=c("absolute", "relative"),
groupOrder=NULL, col=NULL, main="", legend.cex=1, label.cex=0.75,
nFDRsimulations=100, forceYmax=NULL) {
if ( is.null( results.path)) {
results.path <- getOptionValue( optionsFile, "results.path", notfound="./results", verbose=F)
}
NS <- length( sampleIDset)
grpFac <- factor( markerDF$Group)
NG <- nlevels( grpFac)
m <- pvm <- matrix( NA, nrow=NG, ncol=NS)
rownames(m) <- rownames(pvm) <- levels(grpFac)
colnames(m) <- colnames(pvm) <- sampleIDset
colnames(m) <- sampleIDset
colnames(pvm) <- paste( "FDR", sampleIDset, sep="_")
prefix <- getCurrentSpeciesFilePrefix()
# use either transcriptome or DE results
if ( is.null(folder)) {
myFolder <- "transcript"
mySuffix <- paste( prefix, "Transcript.txt", sep=".")
intensityColumn <- "RPKM_M"
} else {
myFolder <- file.path( "MetaResults", paste( prefix, folder, sep="."))
mySuffix <- paste( prefix, "Meta.UP.txt", sep=".")
intensityColumn <- "LOG2FOLD"
}
for ( i in 1:NS) {
s <- sampleIDset[i]
f <- file.path( results.path, myFolder, paste( s, mySuffix, sep="."))
tbl <- read.delim( f, as.is=T)
ans <- scoreMarkerGenes( tbl, markerDF, intensityColumn=intensityColumn, nFDRsimulations=nFDRsimulations)
if ( is.null( ans)) next
ord <- order( ans$Group)
m[ , i] <- ans$Score[ ord]
pvm[ , i] <- ans$FDR[ ord]
cat( "\n", i, s, "\tBest: ", ans$Group[1], ans$Score[1])
}
cat( "\n")
if ( ! is.null( groupOrder)) {
if ( length(groupOrder) != NG) stop( "'groupOrder' length does not match group count")
m <- m[ groupOrder, , drop=F]
pvm <- pvm[ groupOrder, , drop=F]
} else {
groupOrder <- 1:NG
}
if ( is.null(col)) {
groupColor <- rainbow( NG, end=0.76)
} else {
groupColor <- rep( col, length.out=NG)
}
# spacing for the legend depends on counts and label length...
gap <- 2.5
NSsizing <- max( NS, 3)
bigX <- NSsizing * (NG+gap) * (1 + ((max( nchar(levels(grpFac)))+6)/120))
ylabel <- "Marker Gene Score (Absolute Scale)"
mode <- match.arg( mode)
if ( mode == "relative") {
for ( i in 1:nrow(m)) {
mymed <- mean( m[ i, ])
m[ i, ] <- m[ i, ] - mymed
}
ylabel <- "Marker Gene Score (Relative to Group Means)"
}
yLimits <- range(m,0) * 1.15
if ( ! is.null(forceYmax)) {
yLimits[2] <- as.numeric( forceYmax)
if ( yLimits[1] < -1.0) yLimits[1] <- -(as.numeric(forceYmax))
}
barAns <- barplot( m, beside=T, col=groupColor, main=paste( "Marker Gene Profile: ", main),
las=if(NS<5) 1 else 3, xlim=c(1,bigX), ylim=yLimits, space=c(0,gap),
ylab=ylabel, cex.lab=1.1, cex.axis=1.1, font.lab=2, font.axis=2)
lines( c(-10,bigX*2), c(0,0), lwd=1, lty=1, col=1)
if ( nFDRsimulations > 0) {
text( as.vector(barAns), as.vector(m), paste( "P=",as.vector(pvm),sep=""), pos=ifelse( as.vector(m) > 0, 3, 1), cex=label.cex)
}
legend( "topright", levels(grpFac)[groupOrder], fill=groupColor, bg='white', cex=legend.cex)
return( data.frame( "Group"=rownames(m), m, pvm, row.names=1:NG, stringsAsFactors=F))
}
`pipe.ShowMarkerGenes` <- function( sampleID, markerDF, optionsFile="Options.txt", results.path=NULL,
folder=NULL, groupOrder=NULL, col=NULL, main="", legend.cex=1,
names.cex=1.2, nFDRsimulations=100) {
if ( is.null( results.path)) {
results.path <- getOptionValue( optionsFile, "results.path", notfound="./results", verbose=F)
}
grpFac <- factor( markerDF$Group)
NG <- nlevels( grpFac)
prefix <- getCurrentSpeciesFilePrefix()
# use either transcriptome or DE results
if ( is.null(folder)) {
myFolder <- "transcript"
mySuffix <- paste( prefix, "Transcript.txt", sep=".")
intensityColumn <- "RPKM_M"
ylabel <- "Gene Rank Percentile in Transcriptome"
} else {
myFolder <- file.path( "MetaResults", paste( prefix, folder, sep="."))
mySuffix <- paste( prefix, "Meta.UP.txt", sep=".")
intensityColumn <- "LOG2FOLD"
ylabel <- "Gene Rank Percentile in Diff Expression"
}
f <- file.path( results.path, myFolder, paste( sampleID, mySuffix, sep="."))
tbl <- read.delim( f, as.is=T)
genes <- tbl$GENE_ID
isNG <- grep( "(ng)", genes, fixed=T)
if ( length( isNG)) {
tbl <- tbl[ -isNG, ]
genes <- tbl$GENE_ID
}
genes <- shortGeneName( genes, keep=1)
# offset for log scale...
v <- tbl[[ intensityColumn]]
vRank <- as.rankPercentile(v)
Ngenes <- length( genes)
if ( ! is.null( groupOrder)) {
if ( length(groupOrder) != NG) stop( "'groupOrder' length does not match group count")
} else {
groupOrder <- 1:NG
}
if ( is.null(col)) {
groupColor <- rainbow( NG, end=0.76)
} else {
groupColor <- rep( col, length.out=NG)
}
xlabel <- if ( NG > 4) NA else "Marker Gene Group Name"
plot( 1, 1, main=paste( "Marker Gene Profile: SampleID =", sampleID, main), type="n",
xlim=c(0.4,NG*1.2), ylim=c( -6,105), ylab=ylabel, xlab=xlabel, xaxt="n",
cex.lab=1.1, cex.axis=1.1, font.lab=2, font.axis=2)
# calc and show where the marker genes land
outX <- outY <- outPCH <- outCol <- vector()
for ( j in 1:NG) {
myGrp <- levels(grpFac)[ groupOrder[ j]]
myColor <- groupColor[j]
who <- which( markerDF$Group == myGrp)
myGenes <- markerDF$GENE_ID[ who]
myPCH <- ifelse( markerDF$Direction[ who] == "UP", 24, 6)
where <- match( myGenes, genes, nomatch=0)
use <- which( where > 0)
outX <- c( outX, rep.int( j, length(use)))
outY <- c( outY, vRank[where[use]])
outPCH <- c( outPCH, myPCH[use])
outCol <- c( outCol, rep.int(myColor,length(use)))
}
# extra annotations at the left edge
xLeft <- min( outX) - diff( range(outX))/(NG*1.25)
lines( c(-10, NG*2), c(50,50), lty=2, lwd=2, col='gray50')
text( c(xLeft,xLeft), c(80,28), c( "Positive Weighting", "Negative Weighting"), srt=90,
col='gray50', cex=1.03, font=2)
points( jitter(outX, factor=1.5), outY, pch=outPCH, col=outCol, bg=outCol)
axis( 1, at=1:NG, levels(grpFac)[groupOrder], font=2, cex.axis=names.cex, las=if (NG>4) 3 else 1)
legend( "topright", levels(grpFac)[groupOrder], fill=groupColor, bg='white', cex=legend.cex)
legend( "bottomright", c( "Positve Marker", "Negative Marker"), pch=c(24,6),
col="brown", pt.bg="brown", pt.cex=2, bg='white', cex=legend.cex)
ans <- scoreMarkerGenes( tbl, markerDF, intensityColumn=intensityColumn, nFDRsimulations=nFDRsimulations)
ord <- match( ans$Group, levels(grpFac)[groupOrder])
text( c( xLeft, ord), -3.5, c( "SCORE:", as.character(round( ans$Score, digits=1))), font=2, cex=0.95,
srt=if(NG>5) 90 else 0)
if ( nFDRsimulations > 0 && NG <= 5) {
text( c( xLeft, ord), -6, c( "P-value:", as.character(round( ans$FDR, digits=3))), font=2, cex=0.75)
}
return( ans)
}
`scoreMarkerGenes` <- function( transcript, markerDF, geneColumn="GENE_ID", intensityColumn="RPKM_M",
nFDRsimulations=100) {
genes <- transcript[[ geneColumn]]
if ( is.null( genes)) {
cat( "\nGeneID column not found: ", geneColumn)
return( NULL)
}
isNG <- grep( "(ng)", genes, fixed=T)
if ( length( isNG)) {
transcript <- transcript[ -isNG, ]
genes <- transcript[[ geneColumn]]
}
# we will only want the gene symbol...
genes <- shortGeneName( genes, keep=1)
inten <- transcript[[ intensityColumn]]
if ( is.null( inten)) {
cat( "\nIntensity column not found: ", intensityColumn)
return( NULL)
}
# make sure the transcript is in order
ord <- order( inten, decreasing=T)
if ( any( ord != 1:nrow(transcript))) {
cat( "\nNot in Intensity order.. Resorting..")
genes <- genes[ ord]
inten <- inten[ ord]
}
NG <- length(genes)
generank <- as.rankPercentile( NG:1)
neededColumns <- c( "GENE_ID", "Group", "Direction")
if ( ! all( neededColumns %in% colnames(markerDF))) {
cat( "Expected 'MarkGene' data frame columns not found. Need: ", neededColumns)
return(NULL)
}
# make sure most of the markers are seen
markgenes <- markerDF$GENE_ID
NMG <- nrow(markerDF)
where <- match(markgenes, genes, nomatch=0)
if ( sum( where != 0) < NMG*0.5) {
cat( "\nDebug: N_Missing: ", sum( where == 0), " \tN_Found: ", sum( where > 0),"\n")
cat( "Too many 'Marker Genes' not found in transcriptome.. Check current species..")
return( NULL)
}
markerDF$SCORE <- ifelse( toupper( markerDF$Direction) == "UP", 1, -1)
markerDF$SCORE[ where == 0] <- 0
markerFac <- factor( markerDF$Group)
NGRPS <- nlevels( markerFac)
RANK_MIDPT <- 50
scores <- tapply( 1:NMG, markerFac, function(x) {
mywhere <- where[x]
myscores <- markerDF$SCORE[x]
myrankPcts <- rep.int( RANK_MIDPT, length(x))
myrankPcts[ mywhere > 0] <- generank[ mywhere]
# turn these 1..N into centered about zero and scaled to -1,1
myPtiles <- (myrankPcts - RANK_MIDPT) / RANK_MIDPT
# multiply by their sign and scale to number of markers
score <- sum( myPtiles * myscores) * 100 / sum( mywhere > 0)
return( score)
})
# if we do FDR, permute and redo the scoring
fdr <- rep.int( NA, NGRPS)
if ( nFDRsimulations > 0) {
randScores <- matrix( 0, nrow=NGRPS, ncol=nFDRsimulations)
for ( j in 1:nFDRsimulations) {
randGenes <- sample( genes)
where <- match(markgenes, randGenes, nomatch=0)
randScores[,j] <- tapply( 1:NMG, markerFac, function(x) {
mywhere <- where[x]
myscores <- markerDF$SCORE[x]
myrankPcts <- rep.int( RANK_MIDPT, length(x))
myrankPcts[ mywhere > 0] <- generank[ mywhere]
myPtiles <- (myrankPcts - RANK_MIDPT) / RANK_MIDPT
score <- sum( myPtiles * myscores) * 100 / sum( mywhere > 0)
return( score)
})
}
# now count up how often did we do better than real score
for ( i in 1:NGRPS) {
if ( scores[i] >= 0) {
fdr[i] <- sum( randScores[ i, ] >= scores[i]) / nFDRsimulations
} else {
fdr[i] <- sum( randScores[ i, ] <= scores[i]) / nFDRsimulations
}
}
}
out <- data.frame( "Group"=levels(markerFac), "Score"=round(scores,digits=2),
"FDR"=round( fdr, digit=4), stringsAsFactors=F)
ord <- order( scores, decreasing=T)
out <- out[ ord, ]
rownames(out) <- 1:NGRPS
return( out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.