# metaRank.R -- combine several ranked files of genes
metaRanks <- function( fnames, fids, weightset=rep(1, length(fnames)),
geneColumn="GENE_ID", valueColumn="LOG2FOLD",
pvalueColumn="PVALUE", productColumn="PRODUCT", sep="\t",
rank.average.FUN=sqrtmean, value.average.FUN=mean,
keepIntergenics=FALSE, missingGenes=c("drop", "fill", "na"),
missingValue=0, naDropPercent=0.5, nFDRsimulations=0,
diffExpressPvalueCorrection=FALSE, mode=c("rank", "percentile"),
direction=c("UP","DOWN"), cleanHyperlinks=FALSE, verbose=TRUE) {
missingGenes <- match.arg( missingGenes)
direction <- match.arg( direction)
if (verbose) cat( "\nReading in files...")
allDF <- vector( "mode"="list")
allGenes <- vector()
nfiles <- 0
missing <- vector()
for (i in 1:length(fnames)) {
filename <- fnames[i]
if ( ! file.exists( filename)) {
cat( "\nNot found: ", filename)
missing <- c( missing, i)
next
}
tmp <- read.delim( filename, as.is=T, sep=sep)
if ( ! geneColumn %in% colnames(tmp)) {
cat( "\nGeneID column not found. File=", filename)
next
}
nfiles <- nfiles + 1
if ( ! keepIntergenics) {
isNG <- grep( "(ng)", tmp[[ geneColumn]], fixed=T)
if ( length(isNG) > 0) tmp <- tmp[ -isNG, ]
nonGenes <- subset( getCurrentGeneMap(), REAL_G == FALSE)$GENE_ID
drops <- which( tmp[[ geneColumn]] %in% nonGenes)
if ( length(drops) > 0) tmp <- tmp[ -isNG, ]
}
# optionally strip out hyperlinks, like in GeneSet PathName usages...
if (cleanHyperlinks) {
tmp[[ geneColumn]] <- cleanGeneSetModuleNames( tmp[[ geneColumn]], wrapParentheses=F)
}
# by default, all the files are assumed to be already sorted in "Up-regulated" order
allDF[[ nfiles]] <- tmp
if ( direction == "DOWN") {
# the general rule is to make it be "Down-regulated", flip it over
nr <- nrow( tmp)
tmp <- tmp[ rev( 1:nr), ]
# special cases need special treatment
# 1) Rank Product is not symmetrical. Instead it has explicit columns for down
if ( "RP_VALUE_DOWN" %in% colnames(tmp)) {
tmp <- tmp[ order( tmp$RP_VALUE_DOWN), ]
}
allDF[[ nfiles]] <- tmp
}
allGenes <- base::union( allGenes, tmp[[geneColumn]])
if (verbose) cat( "\n",filename, "\tN_Genes: ", nrow(tmp))
}
allG <- sort( unique( allGenes))
NG <- length( allG)
if ( length( missing) > 0) {
fnames <- fnames[ -missing]
fids <- fids[ -missing]
}
if (verbose) cat( "\nN_Files: ", nfiles, " N_Genes: ", NG)
# get the rank order in all datasets
rankM <- foldM <- pvalM <- matrix( NA, nrow=NG, ncol=nfiles)
rownames(rankM) <- rownames(foldM) <- rownames(pvalM) <- allG
colnames(rankM) <- colnames(foldM) <- colnames(pvalM) <- fids
allProds <- rep( "", NG)
# allow for using a 'rank percentile' mode to better deal with very unevenly
# sized datasets
mode <- match.arg( mode)
for( i in 1:nfiles) {
thisDF <- allDF[[i]]
theseGenes <- thisDF[[geneColumn]]
nGenes.thisDF <- length(theseGenes)
where <- match( allG, theseGenes, nomatch=0)
# so DE tools throw away genes with no difference and/or expression
# they will be 'missing', so give them values saying 'not differentially expressed'
isMissing <- which( where == 0)
missingFold <- 0
missingPval <- 1
missingRank <- round( (NG+nrow(thisDF))/4) # thus the 'midpoint' of the average # of genes
rankM[ where > 0, i] <- where[ where > 0]
rankM[ isMissing, i] <- missingRank
if (mode == "percentile") {
ptile <- ((nGenes.thisDF - rankM[ , i] + 1) / nGenes.thisDF) * 100
ptile[ isMissing] <- 0
rankM[ , i] <- ptile
}
logFoldColumn <- if ( is.null(valueColumn)) integer(0) else grep( valueColumn, colnames(thisDF))
if ( length( logFoldColumn) > 0) {
foldM[ where > 0, i] <- thisDF[[ logFoldColumn[1]]][where]
foldM[ isMissing, i] <- missingFold
}
pvalColumn <- if ( is.null(pvalueColumn)) integer(0) else grep( pvalueColumn, colnames(thisDF))
if ( length( pvalColumn) > 0) {
pvalM[ where > 0, i] <- thisDF[[ pvalColumn[1]]][where]
pvalM[ isMissing, i] <- missingPval
} else {
pvalM[ , i] <- missingPval
}
hasPROD <- ( !is.na(productColumn) && (productColumn %in% colnames(thisDF)))
if ( hasPROD) {
allProds[ where > 0] <- ifelse( allProds[where > 0] == "",
thisDF[[ productColumn]][where], allProds[ where > 0])
}
}
# check for missing genes
whoNA <- vector()
for ( i in 1:nfiles) {
who <- which( is.na( rankM[ ,i]))
if ( length(who) > 0) {
whoNA <- sort( unique( c( whoNA, who)))
if ( missingGenes == "fill") {
rankM[ who, i] <- if ( mode == "rank") nrow(rankM) else 0
foldM[ who, i] <- missingValue
pvalM[ who, i] <- 1.0
}
}
}
if ( missingGenes == "drop" && length(whoNA) > 0) {
rankM <- rankM[ -whoNA, ]
foldM <- foldM[ -whoNA, ]
pvalM <- pvalM[ -whoNA, ]
allG <- allG[ -whoNA]
allProds <- allProds[ -whoNA]
NG <- nrow(rankM)
}
# if 'na', and a gene row has too many na's, drop the whole thing
if ( missingGenes == "na") {
nna <- apply( rankM, 1, function(x) sum(is.na(x)))
whoNA <- which( nna > (nfiles * naDropPercent))
if ( length( whoNA) > 0) {
rankM <- rankM[ -whoNA, ]
foldM <- foldM[ -whoNA, ]
pvalM <- pvalM[ -whoNA, ]
allG <- allG[ -whoNA]
allProds <- allProds[ -whoNA]
NG <- nrow(rankM)
}
}
if (verbose) cat( " after 'drop' and/or 'na' filtering: ", NG)
if (verbose) cat( "\nAveraging Ranks...")
if ( all( weightset == 1)) {
avgRank <- apply( rankM, MARGIN=1, FUN=rank.average.FUN, na.rm=T)
} else {
if ( identical( rank.average.FUN, logmean)) {
avgRank <- apply( rankM, MARGIN=1, FUN=function(x) {
numerator <- sum( log2(x) * weightset, na.rm=T)
denom <- sum( weightset, na.rm=T)
return( 2 ^ (numerator / denom))
})
} else if ( identical( rank.average.FUN, sqrtmean)) {
avgRank <- apply( rankM, MARGIN=1, FUN=function(x) {
numerator <- sum( sqrt(x) * weightset, na.rm=T)
denom <- sum( weightset, na.rm=T)
return( (numerator / denom) ^ 2)
})
} else {
avgRank <- apply( rankM, MARGIN=1, FUN=function(x) {
numerator <- sum( x * weightset, na.rm=T)
denom <- sum( weightset, na.rm=T)
return( numerator / denom)
})
}
}
if (diffExpressPvalueCorrection) {
# the p-values of strongly down-regulated genes are (usually) very small numbers
# if wanted, turn these into bad p-values of being up-regulated
for ( i in 1:nfiles) {
myfolds <- foldM[ ,i]
mypvals <- pvalM[ ,i]
if ( direction == "UP") {
needsCorrect <- which( myfolds < 0 & mypvals < 0.5)
} else {
needsCorrect <- which( myfolds > 0 & mypvals < 0.5)
}
if ( length(needsCorrect)) {
mypvals[ needsCorrect] <- 1.0 - mypvals[ needsCorrect]
pvalM[ ,i] <- mypvals
}
}
}
avgFold <- apply( foldM, MARGIN=1, FUN=value.average.FUN, na.rm=T)
if ( is.null( valueColumn)) valueColumn <- "VALUE"
if ( !is.null(pvalueColumn) && pvalueColumn != "") {
#avgPval <- apply( pvalM, MARGIN=1, FUN=logmean, na.rm=T)
avgPval <- apply( pvalM, MARGIN=1, FUN=p.combine)
} else {
avgPval <- rep.int( NA, nrow(pvalM))
}
if ( ! is.na( productColumn)) {
out <- data.frame( allG, allProds, avgFold, avgPval, avgRank, rankM, stringsAsFactors=FALSE)
colnames(out) <- c( geneColumn, productColumn, valueColumn, "AVG_PVALUE", "AVG_RANK", colnames( rankM))
} else {
out <- data.frame( allG, avgFold, avgPval, avgRank, rankM, stringsAsFactors=FALSE)
colnames(out) <- c( geneColumn, valueColumn, "AVG_PVALUE", "AVG_RANK", colnames( rankM))
}
# do the final ordering by Average Rank
if ( pvalueColumn != "") {
ord <- order( out$AVG_RANK, out$AVG_PVALUE, -out[[ valueColumn]])
} else {
ord <- order( out$AVG_RANK, -out[[ valueColumn]])
}
if ( mode == "percentile") {
if ( pvalueColumn != "") {
ord <- order( -out$AVG_RANK, out$AVG_PVALUE, -out[[ valueColumn]])
} else {
ord <- order( -out$AVG_RANK, -out[[ valueColumn]])
}
}
out <- out[ ord, ]
rownames( out) <- 1:nrow(out)
# do a simulation of random permutations of these ranks
# do the FDR after sorting final row order
if ( nFDRsimulations > 0) {
simM <- rankM
NR <- nrow(simM)
randomAvgRank <- vector( length=NR*nFDRsimulations)
nnow <- 0
if (verbose) cat( " estimating FDR..")
if ( mode == "rank") {
for ( k in 1:nFDRsimulations) {
for ( i in 1:nfiles) simM[ , i] <- sample( NR)
randomNow <- apply( simM, MARGIN=1, FUN=rank.average.FUN, na.rm=T)
randomAvgRank[ (nnow+1):(nnow+NR)] <- randomNow
nnow <- nnow + NR
if (verbose && k %% 100 == 0) cat( ".")
}
# with this pool of 'by-chance average ranks, we can estimate the likelihood of ours
randomAvgRank <- sort( randomAvgRank)
avgRank <- out$AVG_RANK
myLocs <- findInterval( avgRank * 1.00000001, randomAvgRank)
myLocs <- ifelse( myLocs > 0, myLocs - 1, 0)
} else { # different values in the FDR whe we do rank percentiles
for ( k in 1:nFDRsimulations) {
for ( i in 1:nfiles) simM[ , i] <- runif( NR, 0, 100)
randomNow <- apply( simM, MARGIN=1, FUN=rank.average.FUN, na.rm=T)
randomAvgRank[ (nnow+1):(nnow+NR)] <- randomNow
nnow <- nnow + NR
if (verbose) cat( ".")
}
# with this pool of 'by-chance average ranks, we can estimate the likelihood of ours
randomAvgRank <- sort( randomAvgRank)
avgRank <- out$AVG_RANK
myLocs <- findInterval( avgRank * 0.9999999, randomAvgRank)
myLocs <- ifelse( myLocs > 0, myLocs, 0)
# bigger values are more rare, so invert the pointers
myLocs <- length(randomAvgRank) - myLocs
}
myEvalue <- myLocs / nFDRsimulations
myFPrate <- myEvalue / (1:NR)
myFPrate <- ifelse( myFPrate > 1, 1, myFPrate)
out$FDR <- round( myFPrate, digits=4)
}
# correlation test...
ccM <- matrix( NA, nrow=nfiles, ncol=nfiles)
for( i in 1:(nfiles-1)) {
for( j in (i+1):nfiles) {
thisCC <- cor( rankM[ ,i], rankM[ ,j], use="complete")
ccM[i,j] <- ccM[j,i] <- thisCC
}}
colnames( ccM) <- colnames(rankM)
cc <- apply( ccM, MARGIN=2, mean, na.rm=T)
metaRankCC <<- cc
if (verbose) {
cat( "\nRank Correlations:\n")
print( sort( cc, decreasing=T))
}
return( out)
}
metaRank2html <- function( tbl, fileout="metaRanks.html", title="", maxRows=100,
valueColumn="LOG2FOLD", ...) {
# clean up any formatting...
if ( nrow(tbl)) {
tbl[[ valueColumn]] <- formatC( tbl[[ valueColumn]], format="f", digits=3)
if ("AVG_PVALUE" %in% colnames(tbl)) tbl$AVG_PVALUE <- formatC( tbl$AVG_PVALUE, format="e", digits=3)
if ("AVG_RANK" %in% colnames(tbl)) tbl$AVG_RANK <- formatC( tbl$AVG_RANK, format="f", digits=2)
NC <- ncol(tbl)
if ( NC > 3) colnames(tbl)[4:NC] <- gsub( "_", " ", colnames(tbl)[4:NC], fixed=T)
}
title <- paste( "Meta Ranks: ", title)
table2html( tbl, fileout=fileout, title=title, maxRows=maxRows, ...)
return()
}
metaRank.data.frames <- function( df.list, weightset=rep(1, length(df.list)),
geneColumn="GENE_ID", valueColumn="LOG2FOLD",
pvalueColumn="PVALUE", productColumn="PRODUCT",
rank.average.FUN=sqrtmean, value.average.FUN=mean,
missingGenes=c("drop", "fill", "na"), missingValue=0,
naDropPercent=0.5, diffExpressPvalueCorrection=FALSE,
direction="UP", nFDRsimulations=0, cleanHyperlinks=FALSE,
mode=c("rank","percentile"), verbose=TRUE) {
missingGenes <- match.arg( missingGenes)
mode <- match.arg( mode)
allDF <- vector( "mode"="list")
nDF <- 0
allGenes <- vector()
nGenesPerDF <- vector()
for (i in 1:length(df.list)) {
tmp <- df.list[[ i]]
if ( is.null(tmp)) {
cat( "\nData frame is NULL DataFrame: ", i, "\t", names(df.list)[i])
next
}
if ( nrow(tmp) < 1) {
cat( "\nData frame is empty... DataFrame: ", i, "\t", names(df.list)[i])
next
}
if ( ! geneColumn %in% colnames(tmp)) {
cat( "\nGeneID column not found. DataFrame: ", i, "\t", names(df.list)[i])
next
}
# optionally strip out hyperlinks, like in GeneSet PathName usages...
if (cleanHyperlinks) {
tmp[[ geneColumn]] <- cleanGeneSetModuleNames( tmp[[ geneColumn]], wrapParentheses=F)
}
nDF <- nDF + 1
allDF[[ nDF]] <- tmp
names(allDF)[nDF] <- names(df.list)[i]
nGenesPerDF[nDF] <- nrow(tmp)
allGenes <- base::union( allGenes, tmp[[geneColumn]])
if (verbose) cat( "\n",i, names(df.list)[i], "\tN_Genes: ", nrow(tmp))
}
fids <- names( allDF)
allG <- sort( unique( allGenes))
NG <- length( allG)
if (verbose) cat( "\nN_DataFrames: ", nDF, " N_Genes: ", NG)
# get the rank order in all datasets
rankM <- foldM <- pvalM <- matrix( NA, nrow=NG, ncol=nDF)
rownames(rankM) <- rownames(foldM) <- rownames(pvalM) <- allG
colnames(rankM) <- colnames(foldM) <- colnames(pvalM) <- fids
allProds <- rep( "", NG)
missingPval <- 1
for( i in 1:nDF) {
thisDF <- allDF[[i]]
hasPROD <- ( !is.na(productColumn) && (productColumn %in% colnames(thisDF)))
hasPVAL <- ( !is.null(pvalueColumn) && !is.na(pvalueColumn) && (pvalueColumn %in% colnames(thisDF)))
theseGenes <- thisDF[[geneColumn]]
nGenes.thisDF <- length(theseGenes)
where <- match( allG, theseGenes, nomatch=0)
rankM[ where > 0, i] <- where[ where > 0]
if (mode == "percentile") {
ptile <- ((nGenes.thisDF - rankM[ , i] + 1) / nGenes.thisDF) * 100
rankM[ , i] <- ptile
}
logFoldColumn <- if ( is.null(valueColumn)) integer(0) else grep( valueColumn, colnames(thisDF))
if ( length( logFoldColumn) > 0) {
foldM[ where > 0, i] <- thisDF[[ logFoldColumn[1]]][where]
}
if (hasPVAL) {
pvalColumn <- grep( pvalueColumn, colnames(thisDF))
if ( length( pvalColumn) > 0) {
pvalM[ where > 0, i] <- thisDF[[ pvalColumn[1]]][where]
}
} else {
pvalM[ , i] <- missingPval
}
if (hasPROD) {
allProds[ where > 0] <- ifelse( allProds[where > 0] == "",
thisDF[[ productColumn]][where], allProds[ where > 0])
}
}
# check for missing genes
whoNA <- vector()
for ( i in 1:nDF) {
who <- which( is.na( rankM[ ,i]))
if ( length(who) > 0) {
whoNA <- sort( unique( c( whoNA, who)))
if ( missingGenes == "fill") {
rankM[ who, i] <- if ( mode == "rank") nrow(rankM) else 0
foldM[ who, i] <- missingValue
pvalM[ who, i] <- 1.0
}
}
}
if ( missingGenes == "drop" && length(whoNA) > 0) {
rankM <- rankM[ -whoNA, , drop=F]
foldM <- foldM[ -whoNA, , drop=F]
pvalM <- pvalM[ -whoNA, , drop=F]
allG <- allG[ -whoNA]
allProds <- allProds[ -whoNA]
NG <- length(allG)
}
# if 'na', and a gene row has too many na's, drop the whole thing
if ( missingGenes == "na") {
nna <- apply( rankM, 1, function(x) sum(is.na(x)))
whoNA <- which( nna > (nDF * naDropPercent))
if ( length( whoNA) > 0) {
rankM <- rankM[ -whoNA, , drop=F]
foldM <- foldM[ -whoNA, , drop=F]
pvalM <- pvalM[ -whoNA, , drop=F]
allG <- allG[ -whoNA]
allProds <- allProds[ -whoNA]
NG <- length(allG)
}
}
if (verbose) cat( " after 'drop' and/or 'na' filtering: ", NG)
if ( NG < 1) return( data.frame())
if (verbose) cat( "\nAveraging Ranks...")
if ( all( weightset == 1)) {
avgRank <- apply( rankM, MARGIN=1, FUN=rank.average.FUN, na.rm=T)
} else {
if ( identical( average.FUN, logmean)) {
avgRank <- apply( rankM, MARGIN=1, FUN=function(x) {
numerator <- sum( log2(x) * weightset)
denom <- sum( weightset)
return( 2 ^ (numerator / denom))
})
} else if ( identical( average.FUN, sqrtmean)) {
avgRank <- apply( rankM, MARGIN=1, FUN=function(x) {
numerator <- sum( sqrt(x) * weightset)
denom <- sum( weightset)
return( (numerator / denom) ^ 2)
})
} else {
avgRank <- apply( rankM, MARGIN=1, FUN=function(x) {
numerator <- sum( x * weightset)
denom <- sum( weightset)
return( numerator / denom)
})
}
}
if (diffExpressPvalueCorrection && direction == "UP") {
# the p-values of strongly down-regulated genes are (usually) very small numbers
# if wanted, turn these into bad p-values of being up-regulated
for ( i in 1:nDF) {
myfolds <- foldM[ ,i]
mypvals <- pvalM[ ,i]
needsCorrect <- which( myfolds < 0 & mypvals < 0.5)
if ( length(needsCorrect)) {
mypvals[ needsCorrect] <- 1.0 - mypvals[ needsCorrect]
pvalM[ ,i] <- mypvals
}
cat( "\nDebug: ", i, " N_Corrected_Pvalues: ", length( needsCorrect))
}
}
avgFold <- apply( foldM, MARGIN=1, FUN=value.average.FUN, na.rm=T)
#avgPval <- apply( pvalM, MARGIN=1, FUN=logmean, na.rm=T)
avgPval <- apply( pvalM, MARGIN=1, FUN=p.combine)
if ( is.null( valueColumn)) valueColumn <- "VALUE"
if ( ! is.na(productColumn)) {
out <- data.frame( allG, allProds, avgFold, avgRank, avgPval, rankM, stringsAsFactors=FALSE)
colnames(out) <- c( geneColumn, productColumn, valueColumn, "AVG_RANK", "AVG_PVALUE", colnames( rankM))
} else {
out <- data.frame( allG, avgFold, avgRank, avgPval, rankM, stringsAsFactors=FALSE)
colnames(out) <- c( geneColumn, valueColumn, "AVG_RANK", "AVG_PVALUE", colnames( rankM))
}
# do the final ordering by Average Rank
ord <- order( out$AVG_RANK, out$AVG_PVALUE, -out[[ valueColumn]])
if ( mode == "percentile") {
ord <- order( -out$AVG_RANK, out$AVG_PVALUE, -out[[ valueColumn]])
}
out <- out[ ord, ]
rownames( out) <- 1:nrow(out)
# do a simulation of random permutations of these ranks
# do the FDR after sorting final row order
if ( nFDRsimulations > 0) {
simM <- rankM
NR <- nrow(simM)
randomAvgRank <- vector( length=NR*nFDRsimulations)
nnow <- 0
if (verbose) cat( " estimating FDR..")
if ( mode == "rank") {
for ( k in 1:nFDRsimulations) {
for ( i in 1:nDF) simM[ , i] <- sample( NR)
randomNow <- apply( simM, MARGIN=1, FUN=rank.average.FUN, na.rm=T)
randomAvgRank[ (nnow+1):(nnow+NR)] <- randomNow
nnow <- nnow + NR
if (verbose) cat( ".")
}
# with this pool of 'by-chance average ranks, we can estimate the likelihood of ours
randomAvgRank <- sort( randomAvgRank)
avgRank <- out$AVG_RANK
myLocs <- findInterval( avgRank * 1.00000001, randomAvgRank)
myLocs <- ifelse( myLocs > 0, myLocs - 1, 0)
} else {
for ( k in 1:nFDRsimulations) {
for ( i in 1:nDF) simM[ , i] <- runif( NR, 0, 100)
randomNow <- apply( simM, MARGIN=1, FUN=rank.average.FUN, na.rm=T)
randomAvgRank[ (nnow+1):(nnow+NR)] <- randomNow
nnow <- nnow + NR
if (verbose) cat( ".")
}
# with this pool of 'by-chance average ranks, we can estimate the likelihood of ours
randomAvgRank <- sort( randomAvgRank)
avgRank <- out$AVG_RANK
myLocs <- findInterval( avgRank * 0.9999999, randomAvgRank)
myLocs <- ifelse( myLocs > 0, myLocs, 0)
# bigger values are more rare, so invert the pointers
myLocs <- length(randomAvgRank) - myLocs
}
myEvalue <- myLocs / nFDRsimulations
myFPrate <- myEvalue / (1:NR)
myFPrate <- ifelse( myFPrate > 1, 1, myFPrate)
out$FDR <- round( myFPrate, digits=4)
}
# correlation test...
ccM <- matrix( NA, nrow=nDF, ncol=nDF)
if ( nDF > 1) {
for( i in 1:(nDF-1)) {
for( j in (i+1):nDF) {
thisCC <- cor( rankM[ ,i], rankM[ ,j], use="complete")
ccM[i,j] <- ccM[j,i] <- thisCC
}}
}
colnames( ccM) <- colnames(rankM)
cc <- apply( ccM, MARGIN=2, mean, na.rm=T)
metaRankCC <<- cc
if (verbose) {
cat( "\nRank Correlations:\n")
print( sort( cc, decreasing=T))
}
return( out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.