# transcriptDeconvolution.R -- higher level code to do transcript deconvolution
# uses lower level 'transcriptBlend' tool
`getTranscriptDeconvolutionTargetMatrix` <- function( speciesID=getCurrentSpecies(), target=NULL) {
if ( speciesID != getCurrentSpecies()) {
oldSpecies <- getCurrentSpecies()
setCurrentSpecies( speciesID)
on.exit( setCurrentSpecies( oldSpecies))
}
# let's update this system to use the exact same data as from the cell type tools
CellTypeSetup()
targetM <- getCellTypeMatrix()
return( targetM)
# old method...
myPrefix <- getCurrentSpeciesFilePrefix()
CellTypeSetup()
reference <- getCellTypeReference()
datafile <- paste( myPrefix, reference, "TargetMatrix", sep=".")
targetM <- NULL
data( list=datafile, package="DuffyTools", envir=environment())
if ( is.null( targetM)) cat( "\nFailed to load deconvolution target matrix. Tried: ", datafile)
return( targetM)
}
`buildTranscriptDeconvolutionTargetMatrix` <- function( dimensionIDset, transcript.files, de.files, nGenesPerDimension=NULL,
speciesID=getCurrentSpecies(), dropGenePattern=NULL, dropProductPattern=NULL) {
if ( speciesID != getCurrentSpecies()) setCurrentSpecies( speciesID)
prefix <- getCurrentSpeciesFilePrefix()
geneMap <- subset( getCurrentGeneMap(), REAL_G=TRUE)
# step 1: make sure all files and sampleIDs are in agreement
NS <- length(dimensionIDset)
NTF <- length(transcript.files)
NDEF <- length(de.files)
if ( ! all( c( NTF, NDEF) == NS)) stop( "All arguments must be same length: dimensionIDset, transcript.files, de.files")
for ( i in 1:NS) {
sid <- dimensionIDset[i]
tfid <- basename( transcript.files[i])
defid <- basename( de.files[i])
if ( ! grepl( sid, tfid, fixed=T)) stop( paste( "SampleID not in Transcript filename: ", sid, tfid))
if ( ! grepl( sid, defid, fixed=T)) stop( paste( "SampleID not in DE ratios filename: ", sid, defid))
}
if ( ! all( grep( prefix, basename(transcript.files)) %in% 1:NS)) stop( "Expected species prefix not in all transcript files")
if ( ! all( grep( prefix, basename(de.files)) %in% 1:NS)) stop( "Expected species prefix not in all DE ratio files")
if ( ! all( file.exists( transcript.files))) stop( "Some transcript files not found")
if ( ! all( file.exists( de.files))) stop( "Some DE ratio files not found")
# OK, the files and IDs are self consistent
# step 2: get the reference transcriptomes for these samples
cat( "\nLoading reference transcriptomes..\n")
targetM <- expressionFileSetToMatrix( transcript.files, dimensionIDset, verbose=T)
# force check that the genes we see are in this annotation
drops <- which( ! ( rownames(targetM) %in% geneMap$GENE_ID))
if ( length(drops)) {
cat( "\nSome GeneIDs not in current species genome annotation. Dropping: ", length( drops))
targetM <- targetM[ -drops, ]
}
# step 3: gather the genes that will be the marker gene set for each dimension
if ( is.null( nGenesPerDimension)) {
gpd <- round( nrow(geneMap) / NS / 2)
nGenesPerDimension <- gpd
}
cat( "\nLoading marker genes..\n")
markerGenes <- markerRanks <- markerTypes <- vector()
for ( i in 1:NS) {
f <- de.files[i]
tmp <- read.delim( f, as.is=T)
drops1 <- drops2 <- vector()
if ( ! is.null( dropGenePattern)) {
drops1 <- grep( dropGenePattern, tmp$GENE_ID)
}
if ( ! is.null( dropProductPattern)) {
drops2 <- grep( dropProductPattern, tmp$PRODUCT)
}
drops <- sort( unique( c( drops1, drops2)))
if ( length(drops)) {
cat( "\nDropping unwanted genes: ", length(drops), "\n")
tmp <- tmp[ -drops, ]
}
# grap up to 2N here, then trim out excess later
Ng2 <- nGenesPerDimension * 2
if ( Ng2 > nrow(tmp)) Ng2 <- nrow(tmp)
markerGenes <- c( markerGenes, tmp$GENE_ID[1:Ng2])
markerRanks <- c( markerRanks, 1:Ng2)
markerTypes <- c( markerTypes, rep( dimensionIDset[i], Ng2))
cat( "\r", i, dimensionIDset[i], Ng2, length(markerGenes))
}
# step 4: keep just one copy of each gene, and track its type and rank
ord <- order( markerGenes, markerRanks, markerTypes)
best <- match( sort( unique( markerGenes)), markerGenes[ord])
bestMarkerGenes <- markerGenes[ ord[ best]]
bestMarkerRanks <- markerRanks[ ord[ best]]
bestMarkerTypes <- markerTypes[ ord[ best]]
# now enforce the N limit
dropExcess <- vector()
for (ct in sort( unique( bestMarkerTypes))) {
now <- which( bestMarkerTypes == ct)
if ( length(now) > nGenesPerDimension) {
dropNow <- setdiff( now, now[1:nGenesPerDimension])
dropExcess <- c( dropExcess, dropNow)
}
}
if ( length( dropExcess)) {
bestMarkerGenes <- bestMarkerGenes[ -dropExcess]
bestMarkerRanks <- bestMarkerRanks[ -dropExcess]
bestMarkerTypes <- bestMarkerTypes[ -dropExcess]
}
# now finalize with just those best
# given this set as the proposed marker genes, make the target matrix we will use
who <- match( bestMarkerGenes, rownames(targetM))
targetM <- targetM[ who, ]
out <- data.frame( "GENE_ID"=bestMarkerGenes, "Dimension"=bestMarkerTypes,
"Rank"=bestMarkerRanks, "PRODUCT"=gene2Product( bestMarkerGenes),
targetM, stringsAsFactors=F)
outfile <- paste( prefix, "Deconvolution.TargetMatrix.txt", sep=".")
write.table( out, outfile, sep="\t", quote=F, row.names=F)
outfile2 <- paste( prefix, "Deconvolution.TargetMatrix.rda", sep=".")
save( targetM, file=outfile2)
cat( "\nWrote new Deconvolution Target Matrix: ", nGenesPerDimension, " \tN_Row: ", nrow(targetM))
return()
}
`fileSet.TranscriptDeconvolution` <- function( files, fids, targetM=getTranscriptDeconvolutionTargetMatrix(),
geneColumn="GENE_ID", intensityColumn="RPKM_M",
sep="\t", useLog=FALSE, minIntensity=0,
arrayFloorIntensity=NULL, dropLowVarianceGenes=NULL, geneUniverse=NULL,
algorithm=c("port","default","plinear","LM","GenSA","steepDescent"),
startFractions=NULL, plot=TRUE, plot.path=".", plot.col=NULL,
label="", verbose=TRUE) {
if ( length(files) != length(fids)) {
cat( "\nLength mismatch: 'files' and 'fids' must be same length")
return(NULL)
}
# evaluate all files, and then decide how to visualize
found <- file.exists(files)
if ( sum(found) < length(files)) {
cat( "\nSome transcript files not found: ", fids[ !found])
files <- files[ found]
fids <- fids[ found]
}
NS <- length( files)
ND <- ncol(targetM)
ans <- matrix( 0, nrow=ND, ncol=NS)
colnames(ans) <- fids
rownames(ans) <- colnames(targetM)
algorithm <- match.arg( algorithm)
if (algorithm == "GenSA") require( GenSA)
if (algorithm == "LM") require( minpack.lm)
# try to do in multi core
myStarts <- NULL
mcAns <- multicore.lapply( 1:NS, FUN=function(i) {
f <- files[ i]
tbl <- read.delim( f, sep=sep, as.is=T)
if (i > 1) verbose <<- FALSE
# allow usuing specific starting percentages
# note that previous runs output 0 to 100, but the low level tool wants 0 to 1.0
if ( ! is.null( startFractions)) {
wh <- which( colnames(startFractions) == fids[i])[1]
if ( ! is.na(wh)) {
#cat( " Given start %s..")
myStarts <- as.numeric(startFractions[ ,wh]) / 100
}
}
fitAns <- fit.transcriptBlend( tbl, targetM, geneColumn=geneColumn,
intensityColumn=intensityColumn, useLog=useLog,
minIntensity=minIntensity,
arrayFloorIntensity=arrayFloorIntensity,
dropLowVarianceGenes=dropLowVarianceGenes,
geneUniverse=geneUniverse,
algorithm=algorithm, startFractions=myStarts, verbose=verbose)
if ( ! is.null(fitAns)) cat( " ", i, fids[i], "RMS.Dev=", round(fitAns$RMS.Deviation,digits=3))
return( fitAns)
})
rmsDev <- r2cod <- r2p <- pval <- vector( length=NS)
for ( i in 1:NS) {
# catch case of a single element affect lapply
fitAns <- if ( NS == 1) mcAns else mcAns[[i]]
if ( is.null(fitAns)) next
ans[ , i] <- fitAns$BestFit
rmsDev[i] <- fitAns$RMS.Deviation
r2cod[i] <- fitAns$R2.CoD
r2p[i] <- fitAns$R2.Pearson
pval[i] <- fitAns$Pvalue
}
out2 <- data.frame( "SampleID"=fids, "RMS.Deviation"=rmsDev, "R2.CoD"=r2cod, "R2.Pearson"=r2p,
"P.Value"=pval, stringsAsFactors=F)
cat( " Done.\n")
# standardize the values to 100 %
# NO, let's not do this here anymore. Allow original values to be passed back
pcts <- ans
# for ( i in 1:NS) pcts[ , i] <- ans[ , i] * 100 / sum( ans[ , i])
if (plot) {
if ( label == "") label <- paste( "Sample =", fids[1], " Algorithm =", algorithm, " Log =", useLog)
plotTranscriptProportions(pcts, col=plot.col, label=label)
# new plot printing method does not need explicit extension
plotFile <- paste( getCurrentSpeciesFilePrefix(), getCellTypeReference(), "DeconvProportions", algorithm, sep=".")
if ( length(fids) == 1) plotFile <- paste( fids[1], plotFile, sep=".")
plotFile <- file.path( plot.path, plotFile)
printPlot( plotFile)
}
return( list( "BestFit"=pcts, "Statistics"=out2))
}
`matrix.TranscriptDeconvolution` <- function( m, targetM=getTranscriptDeconvolutionTargetMatrix(),
useLog=FALSE, minIntensity=0,
arrayFloorIntensity=NULL, dropLowVarianceGenes=NULL, geneUniverse=NULL,
algorithm=c("port","default","plinear","LM","GenSA","steepDescent"),
plot=TRUE, plot.path=".", plot.col=NULL, label="", verbose=TRUE) {
NS <- ncol(m)
ND <- ncol(targetM)
geneIDs <- rownames(m)
if ( is.null( geneIDs)) stop( "Matrix has no GeneID rownames..")
ans <- matrix( 0, nrow=ND, ncol=NS)
colnames(ans) <- fids <- colnames(m)
rownames(ans) <- colnames(targetM)
algorithm <- match.arg( algorithm)
if (algorithm == "GenSA") require( GenSA)
if (algorithm == "LM") require( minpack.lm)
# try to do in multi core
mcAns <- multicore.lapply( 1:NS, FUN=function(i) {
v <- m[ ,i]
tbl <- data.frame( "GENE_ID"=geneIDs, "INTENSITY"=v, stringsAsFactors=F)
if (i > 1) verbose <<- FALSE
fitAns <- fit.transcriptBlend( tbl, targetM, geneColumn="GENE_ID",
intensityColumn="INTENSITY", useLog=useLog,
minIntensity=minIntensity,
arrayFloorIntensity=arrayFloorIntensity,
dropLowVarianceGenes=dropLowVarianceGenes,
geneUniverse=geneUniverse,
algorithm=algorithm, verbose=verbose)
if ( ! is.null(fitAns)) cat( " ", i, fids[i], "RMS.Dev=", round(fitAns$RMS.Deviation,digits=3))
return( fitAns)
})
rmsDev <- r2cod <- r2p <- pval <- vector( length=NS)
for ( i in 1:NS) {
# catch case of a single element affect lapply
fitAns <- if ( NS == 1) mcAns else mcAns[[i]]
if ( is.null(fitAns)) next
ans[ , i] <- fitAns$BestFit
rmsDev[i] <- fitAns$RMS.Deviation
r2cod[i] <- fitAns$R2.CoD
r2p[i] <- fitAns$R2.Pearson
pval[i] <- fitAns$Pvalue
}
out2 <- data.frame( "SampleID"=fids, "RMS.Deviation"=rmsDev, "R2.CoD"=r2cod, "R2.Pearson"=r2p,
"P.Value"=pval, stringsAsFactors=F)
cat( " Done.\n")
# standardize the values to 100 %
pcts <- ans
for ( i in 1:NS) pcts[ , i] <- ans[ , i] * 100 / sum( ans[ , i])
if (plot) {
if ( label == "") label <- paste( "Sample =", colnames(m)[1], " Algorithm =", algorithm, " Log =", useLog)
plotTranscriptProportions(pcts, col=plot.col, label=label)
plotFile <- paste( getCurrentSpeciesFilePrefix(), getCellTypeReference(), "DeconvProportions", algorithm, sep=".")
if ( length(fids) == 1) plotFile <- paste( fids[1], plotFile, sep=".")
plotFile <- file.path( plot.path, plotFile)
printPlot( plotFile)
}
return( list( "BestFit"=pcts, "Statistics"=out2))
}
`plotTranscriptProportions` <- function( pcts, mode=c("bars","auto","pie","lines"),
col=rainbow( nrow(pcts), end=0.82), label="", useLog=FALSE, min.value.show=0.2,
pch=22, pt.cex=2.5, lwd=4, legend.cex=0.9, label.cex=0.9, cex.axis=1,
text.rotation=if( ncol(pcts) < 9) 0 else 90,
gaps=NULL, significance.values=NULL, significance.scaling=FALSE, ...) {
# evaluate all files, and then decide how to visualize
NS <- ncol(pcts)
ND <- nrow(pcts)
mode <- match.arg( mode)
fids <- colnames(pcts)
# previously, the proportions were alway true percentages, that sum to 100%.
# No longer a given. Force it now
for ( i in 1:NS) pcts[ ,i] <- pcts[ ,i] * 100 / sum( pcts[ ,i], na.rm=T)
doPie <- (( mode == "pie") || (mode == "auto" && NS < 3))
doBars <- (( mode == "bars") || (mode == "auto" && NS > 2))
doLines <- (( mode == "lines") || (mode == "auto" && NS > 16))
# visuals...
if ( doPie) {
par( mfcol=c(1,1))
# set up with useful sizes for this number of pies
maintext <- "Transcript Proportions: "
saveMAI <- par( 'mai')
if (NS > 1) par( mfrow=c(1,2))
if (NS > 2) par( mfrow=c(2,2))
if (NS > 4) par( mfrow=c(2,3))
if (NS > 6) par( mfrow=c(3,3))
if (NS > 9) par( mfrow=c(4,4))
if (NS > 3) maintext <- ""
if (NS > 1 && all( saveMAI == c(1.02,0.82,0.82,0.42))) {
par( mai=c( 0.15, 0.4, 0.6, 0.4))
on.exit( par( 'mai'=saveMAI))
}
for ( i in 1:NS) {
v <- pcts[ ,i]
nams <- paste( rownames(pcts), as.percent( v, big.value=100, digits=1), sep=" ")
toShow <- which( v > min.value.show)
pie( v[toShow], labels=nams[toShow], col=col[toShow], main=paste( maintext, fids[i]), ...)
}
par( mfcol=c(1,1))
dev.flush()
return()
}
LAS <- if ( NS < 5) 1 else 3
X_LIM_SCALE <- 1.3
nextra <- length(gaps)
if ( doLines) {
# more than 2, so try colored points
par( mfcol=c(1,1))
xLimits <- c( 0, NS + 1.15 + ( 0.3 * (NS-1)))
yLimits <- range( pcts)
yLabel <- "Proportion per Component"
Log=""
# perhaps we were given some P values, to weight how we draw things
myCEX <- rep.int( pt.cex, ND)
myLWD <- rep.int( lwd, ND)
myTXTCEX <- rep.int( label.cex, ND)
myTXTCOL <- rep.int( 'black', ND)
if ( ! is.null( significance.values)) {
whSig <- match( rownames(pcts), names(significance.values), nomatch=0)
mySignif <- rep.int( 0.001, ND)
mySignif[ whSig > 0] <- significance.values[whSig]
# best significance keep the given weight/color scheme, worse get less
if (significance.scaling) {
whNow <- which( mySignif > 0.01)
myCEX[whNow] <- myCEX[whNow] * 0.7
myLWD[whNow] <- myLWD[whNow] * 0.6
myTXTCEX[whNow] <- myTXTCEX[whNow] * 0.8
myTXTCOL[whNow] <- adjustColor( myTXTCOL[whNow], 0.2)
whNow <- which( mySignif > 0.05)
myCEX[whNow] <- myCEX[whNow] * 0.7
myLWD[whNow] <- myLWD[whNow] * 0.6
myTXTCEX[whNow] <- myTXTCEX[whNow] * 0.8
myTXTCOL[whNow] <- adjustColor( myTXTCOL[whNow], 0.2)
whNow <- which( mySignif > 0.1)
myCEX[whNow] <- myCEX[whNow] * 0.7
myLWD[whNow] <- myLWD[whNow] * 0.6
myTXTCEX[whNow] <- myTXTCEX[whNow] * 0.8
myTXTCOL[whNow] <- adjustColor( myTXTCOL[whNow], 0.2)
whNow <- which( mySignif > 0.2)
myCEX[whNow] <- myCEX[whNow] * 0.7
myLWD[whNow] <- myLWD[whNow] * 0.6
myTXTCEX[whNow] <- myTXTCEX[whNow] * 0.8
myTXTCOL[whNow] <- adjustColor( myTXTCOL[whNow], 0.2)
}
}
if (useLog) {
Log <- 'y'
yLimits[1] <- min.value.show
pcts[ pcts < min.value.show] <- min.value.show
yLabel <- "Proportion per Component (log scale)"
}
plot( 1, 1, type="n", main=paste( "Transcriptome Proportions: ", label), ylab=yLabel,
xlim=xLimits, ylim=yLimits, font.axis=2, font.lab=2, cex.axis=1.1, cex.lab=1.1,
xaxt="n", xlab=NA, log=Log, ...)
axis( 1, at=1:NS, colnames(pcts), las=LAS, font=2, cex.axis=cex.axis)
for ( i in 1:NS) {
v <- pcts[ ,i]
toShow <- which( v >= min.value.show)
ord <- order( v[toShow])
points( rep.int(i,length(toShow)), v[toShow[ord]], bg=col[toShow[ord]], pch=pch, cex=myCEX[toShow[ord]])
}
# if gaps are wanted, that inserts breaks (NAs) in the lines
nextra <- 0
linePts <- 1:NS
if ( ! is.null(gaps)) {
gaps <- sort( unique( as.integer( gaps)))
for (g in gaps) {
if ( is.na(g)) next
if ( g < 1) next
if ( g+nextra >= ncol(pcts)) next
pcts <- cbind( pcts[,1:(g+nextra)], rep.int(NA,nrow(pcts)), pcts[,(g+nextra+1):ncol(pcts)])
linePts <- c( linePts[1:(g+nextra)], NA, linePts[(g+nextra+1):length(linePts)])
nextra <- nextra + 1
}
NSpts <- ncol(pcts)
}
# try to draw the line in order of pct
pctMeans <- apply( pcts, 1, mean, na.rm=T)
lineOrd <- order( pctMeans)
for( j in lineOrd) {
if ( all(is.na(pcts[j,])) || max(pcts[j,],na.rm=T) < min.value.show) next
lines( linePts, pcts[ j, ], col=col[j], lwd=myLWD[j])
text( 0.9, pcts[j,1], rownames(pcts)[j], pos=2, cex=myTXTCEX[j], col=myTXTCOL[j])
text( NS+0.1, pcts[j,NS+nextra], rownames(pcts)[j], pos=4, cex=myTXTCEX[j], col=myTXTCOL[j])
}
# the line plot has no fixed order, so don't reverese the colors
legend( "topright", rownames(pcts), fill=col, bg='white', cex=legend.cex)
dev.flush()
return()
}
if ( doBars) {
# more than 2, so try a barchart
par( mfcol=c(1,1))
nextra <- 0
barAts <- 1:NS
if ( ! is.null(gaps)) {
gaps <- sort( unique( as.integer( gaps)))
for (g in gaps) {
if ( is.na(g)) next
if ( g < 1) next
if ( g+nextra >= ncol(pcts)) next
pcts <- cbind( pcts[,1:(g+nextra)], rep.int(NA,nrow(pcts)), pcts[,(g+nextra+1):ncol(pcts)])
barAts <- c( barAts[1:g], (barAts[(g+1):length(barAts)] + 1))
nextra <- nextra + 1
}
}
xLimits <- c( 0, max(barAts) + 1.15 + ( 0.3 * (NS+length(gaps)-1)))
ans <- barplot( pcts, col=col, main=paste( "Transcriptome Proportions: ", label),
ylab="Proportion per Component", las=LAS,
xlim=xLimits, font.axis=2, font.lab=2, cex.axis=cex.axis, cex.lab=1.1,
legend=TRUE, args.legend=list( "cex"=legend.cex, "bg"='white'), ...)
# calc the center of each band, to send back to caller
bandCtr <- matrix( 0, nrow(pcts), ncol(pcts))
rownames(bandCtr) <- rownames(pcts)
colnames(bandCtr) <- colnames(pcts)
for ( k in 1:ncol(pcts)) {
myTops <- cumsum( pcts[,k])
myDiffs <- diff( c( 0, myTops))
bandCtr[,k] <- myTops - (myDiffs/2)
}
# draw some labels where we can
samplesToLabel <- barAts
if ( text.rotation == 0) {
if ( NS > 16) samplesToLabel <- samplesToLabel[ seq( 1, NS, by=2)]
if ( NS > 32) samplesToLabel <- samplesToLabel[ seq( 1, NS, by=3)]
if ( NS > 48) samplesToLabel <- samplesToLabel[ seq( 1, NS, by=4)]
}
rotate0.sizeCut <- 3
rotate90.sizeCut <- 12
for ( i in samplesToLabel) {
v <- pcts[ ,i]
if ( all( is.na( v))) next
y0 <- 0
xExtra <- 0.01
for ( j in 1:ND) {
yNow <- v[j]
if ( is.na( yNow)) next
if ( text.rotation == 90 && yNow >= rotate90.sizeCut) {
text( ans[i]+xExtra, (y0+yNow/2), rownames(pcts)[j], col=1, cex=label.cex,
srt=90)
xExtra <- 0.2 - xExtra
}
if ( text.rotation == 0 && yNow >= rotate0.sizeCut) {
text( ans[i], (y0+yNow/2), rownames(pcts)[j], col=1, cex=label.cex, srt=0)
}
y0 <- y0 + yNow
}
}
dev.flush()
# package up a bit of context to send back
barAts <- ans
names(barAts) <- colnames(pcts)
return( invisible( list( "bar.centers"=barAts, "band.centers"=bandCtr)))
}
}
`compareTranscriptProportions` <- function( pcts, groups, levels=sort(unique(as.character(groups))),
col=rainbow(nrow(pcts),end=0.8),
minPerGroup=3, test=t.test, plot=TRUE, plot.path=".",
plot.mode=c("bars","auto","pie","lines"), label="", useLog=FALSE,
wt.fold=1, wt.pvalue=2, min.percent=0.1, gaps=NULL,
stats.color=NULL, significance.scaling=FALSE, ...) {
NC <- ncol(pcts)
NR <- nrow(pcts)
if ( length(groups) != NC) stop( "Length of 'groups' must match columns of 'pcts'")
grpFac <- factor( groups, levels=levels)
grpPtrs <- tapply( 1:NC, grpFac, FUN=NULL)
NG <- nlevels( grpFac)
grpLabels <- base::levels(grpFac)
if ( ! all( 1:NG %in% grpPtrs)) stop( "Some group levels have zero members")
# previously, we were gauranteed that all proportions summed to 100%. No longer always true.
# But we need them to be for comparitive plotting. So do it now with a warning
pctSums <- apply( pcts, 2, sum, na.rm=T)
if ( any( round(pctSums) != 100)) {
cat( "\n Note: normalizing Proportions to sum to 100% for plotting.")
for ( j in 1:NC) pcts[ ,j] <- pcts[ ,j] * 100 / pctSums[j]
}
bigOut <- vector( mode="list")
nOut <- 0
# do all 2-way compares we can
for ( j1 in 1:(NG-1)) {
isGrp1 <- which( grpPtrs == j1)
#if ( length( isGrp1) < minPerGroup) next
label1 <- grpLabels[ j1]
for ( j2 in (j1+1):NG) {
isGrp2 <- which( grpPtrs == j2)
#if ( length( isGrp2) < minPerGroup) next
label2 <- grpLabels[ j2]
# OK to do these 2 groups
v1 <- v2 <- pval <- vector( length=NR)
for ( i in 1:NR) {
x <- pcts[ i, isGrp1]
y <- pcts[ i, isGrp2]
v1[i] <- mean( x, na.rm=T)
v2[i] <- mean( y, na.rm=T)
# values too close to zero can give great P-values. Override that
if ( all( c(x,y) < min.percent)) {
pval[i] <- 1.0
} else {
# make sure we have enough values
if ( (nNow <- length(x)) < minPerGroup) x <- c( x, jitter(sample(x,size=minPerGroup-nNow, replace=T)))
if ( (nNow <- length(y)) < minPerGroup) y <- c( y, jitter(sample(y,size=minPerGroup-nNow, replace=T)))
isXzero <- which( x < min.percent)
if (Nzero <- length(isXzero)) x[ isXzero] <- runif( Nzero, min.percent/10, min.percent)
isYzero <- which( y < min.percent)
if (Nzero <- length(isYzero)) y[ isYzero] <- runif( Nzero, min.percent/10, min.percent)
if ( is.function(test)) {
ans <- test( x, y)
pval[i] <- ans$p.value
} else {
pval[i] <- 1
}
}
}
# report the fold as Group2 over Group1
fold <- log2( (v2+min.percent) / (v1+min.percent))
pval[ is.nan(pval)] <- 1.0
pval[ is.na(pval)] <- 1.0
signif <- rep.int( "", length(pval))
signif[ pval < 0.1] <- "."
signif[ pval < 0.05] <- "*"
signif[ pval < 0.01] <- "**"
signif[ pval < 0.001] <- "***"
out <- data.frame( "Component"=rownames(pcts), "N1"=length(isGrp1), "N2"=length(isGrp2),
"Avg1"=round(v1,digits=2), "Avg2"=round(v2,digits=2),
"Log2Fold"=round(fold,digits=3), "PI.value"=round( piValue(fold,pval), digits=3),
"P.value"=round(pval,digits=5), "Signif"=signif, stringsAsFactors=F)
colnames(out)[2:3] <- paste( "N", c( label1, label2), sep="_")
colnames(out)[4:5] <- paste( "Avg", c( label1, label2), sep="_")
ord <- diffExpressRankOrder( out$Log2Fold, out$P.value, wt.fold=wt.fold, wt.pvalue=wt.pvalue)
out <- out[ ord, ]
rownames(out) <- 1:nrow(out)
nOut <- nOut + 1
bigOut[[nOut]] <- out
names( bigOut)[nOut] <- paste( label1, label2, sep=".vs.")
}
}
# we have the compares done, draw it too
# try to pass info about how significant each was down to the plot tool, to weight/color accordingly
comps <- bigOut[[1]]$Component
nComps <- length(comps)
bestPvals <- rep.int( 1, nComps)
iList <- 0
for ( i1 in 1:(NG-1)) for (i2 in (i1+1):NG) {
iList <- iList + 1
if ( i2 - i1 != 1) next # not an adjacent pair
# don't use stats where the caller put gaps in the plot
if ( ! is.null(gaps) && (i1 %in% gaps)) next
smlDF <- bigOut[[iList]]
wh <- match( comps, smlDF$Component)
thisP <- smlDF$P.value[wh]
bestPvals <- pmin( bestPvals, thisP, na.rm=T)
}
names(bestPvals) <- comps
if (plot) {
plot.mode <- match.arg( plot.mode)
pctsGrp <- t( apply( pcts, 1, function(x) tapply( x, grpFac, mean)))
# show the group counts, if not too many
if (NG <=12) colnames(pctsGrp) <- paste( colnames(pctsGrp), "\n(N=", as.numeric(table(as.numeric(grpPtrs))), ")", sep="")
plotAns <- plotTranscriptProportions( pctsGrp, label=label, col=col, mode=plot.mode, gaps=gaps, useLog=useLog,
significance.values=bestPvals, significance.scaling=significance.scaling, ...)
# perhaps try to show the significance?
# various code for some of the modes
if ( plot.mode == "bars" && NG == 2) {
myStats <- bigOut[[1]]
myBarCenters <- plotAns$bar.centers
myBandCenters <- plotAns$band.centers
toShow <- which( myStats$Signif != "")
if ( length(toShow)) {
where <- match( myStats$Component[toShow], rownames(myBandCenters))
textToShow <- myStats$Signif[toShow]
upDownCall <- myStats$Log2Fold[toShow]
myX <- ifelse( upDownCall > 0, myBarCenters[2] + 0.6, myBarCenters[1] - 0.6)
myY <- ifelse( upDownCall > 0, myBandCenters[ where, 2], myBandCenters[ where, 1])
statsColor <- col[where]
if ( ! is.null(stats.color)) statsColor <- stats.color[1]
text( myX, myY, textToShow, col=statsColor, cex=1.95)
}
}
if ( plot.mode == "lines") {
# we can show the stats between adjacent groups, step along the list of stats
smallX <- if ( NG > 2) 0.15 else -0.1
if ( NG > 8) smallX <- 0.28
smallY <- diff( range( as.vector( pctsGrp), na.rm=T)) * 0.01
if ( useLog) smallY <- 0
iList <- 0
for ( i1 in 1:(NG-1)) for (i2 in (i1+1):NG) {
iList <- iList + 1
if ( i2 - i1 != 1) next # not an adjacent pair
# don'thsow stats where the caller put gaps in the plot
if ( ! is.null(gaps) && (i1 %in% gaps)) next
myStats <- bigOut[[iList]]
toShow <- which( myStats$Signif != "")
if ( length(toShow)) {
where <- match( myStats$Component[toShow], rownames(pctsGrp))
textToShow <- myStats$Signif[toShow]
upDownCall <- myStats$Log2Fold[toShow]
myX <-rep.int( i2 - smallX, length(toShow))
myY <-ifelse( upDownCall > 0, pctsGrp[where,i2]+smallY, pctsGrp[where,i2]-smallY)
statsColor <- col[where]
if ( ! is.null(stats.color)) statsColor <- stats.color[1]
text( myX, myY, textToShow, col=statsColor, cex=2, pos=2, offset=0.15)
}
}
}
}
return( if (nOut == 1) bigOut[[1]] else bigOut)
}
`mergeTranscriptProportions` <- function( pcts, groups, drops=NULL) {
# generic way to reduce the number of cel types by merging similar types
if ( length( groups) != nrow(pcts)) stop( "'groups' must be same length as 'nrow(pcts)'")
out <- data.frame()
tapply( 1:nrow(pcts), grpFac <- factor(groups), function(x) {
sml <- pcts[ x, , drop=F]
smlDF <- as.data.frame(sml[1, , drop=F])
vals <- apply( sml, 2, sum, na.rm=T)
smlDF[ 1, ] <- vals
out <<- rbind( out, smlDF)
})
out <- as.matrix( out)
rownames(out) <- levels(grpFac)
if ( ! is.null(drops)) {
whoDrop <- which( rownames(out) %in% drops)
if ( length(whoDrop)) out <- out[ -whoDrop, ]
}
return( out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.