# combine single-TF FootprintFinder models to predict TF occupancy for other TFs
# Seth Ament
# 2016-5-17
require( caret )
require( pROC )
require( gbm )
# for which TFs do we have feature matrices?
feature_matrices = Sys.glob("FeatureMatrix/*RData")
tflist = gsub("FeatureMatrix\\/" , "" , feature_matrices )
tflist = gsub("\\_(.*)" , "" , tflist )
# load the single TF models
modelfiles = Sys.glob("SingleTFModels/*.RData")
models = list()
for( m in modelfiles ) {
tf.m = gsub( "SingleTFModels\\/" , "" , m )
tf.m = gsub( "\\_(.*)" , "" , tf.m )
load( m )
models[[tf.m]] = fit
}
# compare the variable importance scores across models
nmodels = length(models)
nvars = nrow( as.data.frame( summary( models[[1]] ) ) )
varImp = matrix( NA , ncol = nmodels , nrow = nvars )
for( m in 1:length(models) ) {
rel.inf = as.data.frame( summary( models[[m]] ) )
rel.inf = rel.inf[ order( rel.inf[,1] ) , ]
rel.inf = rel.inf[,2] / max(rel.inf[,2])
varImp[ , m ] = rel.inf
}
colnames(varImp) = names(models)
rownames(varImp) = sort( as.data.frame( summary( models[[m]] ) )[,1] )
# heatmap to visualize
my_palette = colorRampPalette(c("white","blue"))(299)
pdf("varImp.heatmap.43singleTFModels.pdf")
heatmap(varImp ,
col = my_palette ,
scale = "none" ,
margins = c(10,15) )
dev.off()
write.csv( varImp , file="variable.importance.43tfs.csv" )
# predict TFBSs in test set
# majority vote of all other models
pdf("roc.median_probs_43tfs.pdf")
performance.metrics = list()
for( tf in tflist ) {
cat( "working on" , tf , "\n" )
featurefile = paste( "FeatureMatrix/" , tf , "_feature_matrix.RData" , sep="" )
load( featurefile )
y = features$y
x = features[,-c(1:4)]
predictions = matrix( NA , nrow = nrow(x) , ncol = nmodels )
colnames(predictions) = names(models)
roc.single = rep( NA , nmodels )
for( m in 1:nmodels ) {
probs = predict( models[[m]] , newdata = x , type = "prob" )
predictions[,m] = probs[,2]
roc.single[m] = as.numeric( roc( response = y , predictor = probs[,2] , direction = "<" )$auc )
}
median.probs = apply( predictions[ , -which( names(models) == tf ) ] , 1 , quantile , probs = 0.5 )
t2 = table( y == "yesChIP" , predictions[,names(models) == tf] > 0.5 )
sensitivity.train = t2[2,2] / sum(t2[2,])
specificity.train = t2[2,2] / sum(t2[,2])
t = table( y == "yesChIP" , median.probs > 0.5 )
sensitivity.median = t[2,2] / sum(t[2,])
specificity.median = t[2,2] / sum(t[,2])
roc.median = roc( response = y , predictor = median.probs , direction = "<" )
roc.training = roc( response = y , predictor = predictions[ , names(models) == tf ] )
roc.fimo = roc( response = y , predictor = x$min.p.fimo )
roc.wellington = roc( response = y , predictor = x$max.score.wellington )
plot( roc.training , lty = 2 )
par( new = T )
plot( roc.median , lty = 1 , col = "red" , lwd = 3 )
par( new = T )
plot( roc.fimo , lty = 3 )
par( new = T )
plot( roc.wellington , lty = 4 )
legend( x = 0.4 , y = 0.2 ,
lty = c(1,2,3,4) ,
col = c("red",rep("black",3)) ,
lwd = c(3,1,1,1) ,
legend = c( "test (ensemble)" , "training" , "FIMO only" , "Wellington only" )
)
mtext( side = 3 , line = 2 , adj = 0 , font = 4 , cex = 2 , tf )
performance.metrics[[tf]] = list(
probs.singletf = predictions ,
auc.singletf = roc.single ,
probs.median = median.probs ,
confusion.matrix.median = t ,
confusion.matrix.train = t2 ,
auc.train = as.numeric( roc.training$auc ) ,
auc.median = as.numeric( roc.median$auc ) ,
auc.fimo = as.numeric( roc.fimo$auc ) ,
auc.wellington = as.numeric( roc.wellington$auc ) ,
sensitivity.train = sensitivity.train ,
specificity.train = specificity.train ,
sensitivtiy.median = sensitivity.median ,
specificity.median = specificity.median )
}
dev.off()
save( performance.metrics , file= "ensemble.classification.performance.metrics.43tfs.RData" )
auc.median = rep(NA,length(performance.metrics))
auc.train = rep(NA,length(performance.metrics))
auc.fimo = rep(NA,length(performance.metrics))
auc.wellington = rep(NA,length(performance.metrics))
for( i in 1:length( performance.metrics ) ) {
auc.median[i] = performance.metrics[[i]]$auc.median
auc.train[i] = performance.metrics[[i]]$auc.train
auc.fimo[i] = performance.metrics[[i]]$auc.fimo
auc.wellington[i] = performance.metrics[[i]]$auc.wellington
}
auc.res = data.frame( tflist , auc.train , auc.median , auc.fimo , auc.wellington )
quintiles.batf = t( sapply( 1:nrow(predictions) ,
function(x) quantile( predictions[ x ,-which( names(models)==tf)] , probs = seq(0,1,0.2) )))
require(doMC)
registerDoMC( cores = 5 )
ctrl = trainControl( method = "repeatedcv" ,
repeats = 2 )
fit.combined = train( y = y , x = quintiles ,
trControl = ctrl , method = "gbm" , tuneLength = 2 , metric = "Kappa" )
prob.combined = predict( fit.combined , newdata = quintiles , type = "prob" )
plot( roc( response = y , predictor = prob.combined[,2] ) , lty = 2 , col = "green" )
table( y , prob.combined[,2] > 0.5 )
pred = prediction( prob.combined[,2] , y )
perf = performance( pred , measure="prec", x.measure="rec" )
pdf("ikzf1.precrecall.pdf")
plot( performance( prediction(
predictions[,names(models) == tf] , y ) ,
measure="prec", x.measure="rec" ) , lty = 1 , ylim = c(0,1) )
par( new = T )
plot( performance( prediction( prob.combined[,2] , y ) , measure="prec", x.measure="rec" ) ,
lty =2 , ylim = c(0,1) )
par( new = T )
plot( performance( prediction( median.probs , y ) , measure="prec", x.measure="rec" ) ,
lty =2 , ylim = c(0,1) )
dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.