assembleFeatureMatrix = function( tf ) {
require( GenomicRanges )
require( TReNA )
require( doBy )
cat( "initializing databases\n" )
genome.db.uri <- "postgres://whovian/hg38"
project.db.uri <- "postgres://whovian/lymphoblast"
fp <- FootprintFinder(genome.db.uri, project.db.uri, quiet=TRUE)
###### Wellington / FIMO ##########
cat( "loading footprints\n" )
# get Wellington/FIMO footprints for tf
query <-
paste( "select fp.chr, fp.mfpstart, fp.mfpend, fp.motifname, fp.pval, fp.score, mg.motif, mg.tf",
"from footprints fp",
"inner join motifsgenes mg",
"on fp.motifName=mg.motif",
paste("where mg.tf='",tf,"'" , sep="" ) , collapse = " " )
footprints.tf = dbGetQuery( fp@project.db , query )
if( nrow(footprints.tf) == 0 ) {
closeDatabaseConnections(fp)
cat("TF not in footprint database\n")
return(NULL)
}
colnames( footprints.tf ) = c("chrom","start","end","motifname","p.fimo","score.wellington","motif","tf" )
fp.gr = makeGRangesFromDataFrame( footprints.tf , keep.extra.columns = T )
footprints.uniq = unique(footprints.tf[,1:3])
fp.gr.uniq = makeGRangesFromDataFrame( footprints.uniq )
cat( "storing FIMO and Wellington p-values" )
# min FIMO p-value
loc = paste( footprints.tf$chr , footprints.tf$start ,
footprints.tf$end , footprints.tf$tf , sep="." )
x = data.frame( loc , footprints.tf )
x = x[ order( x$p.fimo ) , ]
x = x[ duplicated(x$loc) == F , ]
m = match( unique(loc) , x$loc )
min.p.fimo = x$p.fimo[ m ]
# max Wellington score
x = data.frame( loc , footprints.tf )
x = x[ order( x$score.wellington , decreasing = T ) , ]
x = x[ duplicated(x$loc) == F , ]
m = match( unique(loc) , x$loc )
max.score.wellington = x$score.wellington[ m ]
####### phastCons ########
cat( "loading phastCons\n" )
# get phastCons conserved element annotations
load("/proj/price1/sament/resources/phastConsElements100way.RData")
phastCons.gr = makeGRangesFromDataFrame( phastCons )
# matches to conserved elements
conserved_element = countOverlaps( fp.gr.uniq , phastCons.gr )
###### ChromHMM ########
# imputed 25 state model
# get chromHMM annotations for GM12878
cat( "loading ChromHMM\n" )
hmm.states = read.csv("/proj/price1/sament/resources/ChromHMM/states.25statemodel.csv" , header = F)
chromHMM = read.table( "/proj/price1/sament/lymphoblast_trn/E116_25_imputed12marks_stateno.hg38.bed")
colnames(chromHMM) = c("chr","start","end","state")
chromHMM = chromHMM[ order( chromHMM[,4] ) , ]
chromStates = splitBy( ~ state , data = chromHMM )
# matches to each chromHMM state
fp.chromStates = matrix( NA ,
ncol = length(chromStates) ,
nrow = nrow(footprints.uniq) )
colnames(fp.chromStates) = paste( "chromHMM" , hmm.states[,2] , sep="." )
for( i in 1:length(chromStates) ) {
df = chromStates[[i]]
chromState.gr = makeGRangesFromDataFrame( df )
counts.state = countOverlaps( fp.gr.uniq , chromState.gr )
fp.chromStates[,i] = counts.state
}
####### FANTOM5 #########
cat( "loading FANTOM5\n" )
# FANTOM5 nehancers B-lymphocytes
fantom = read.table( "/proj/price1/sament/resources/fantom5/hg38/facet_expressed_enhancers/CL0000945_lymphocyte_of_B_lineage_expressed_enhancers.bed.hg38.bed")
colnames(fantom)[1:3] = c("chrom","start","end")
fantom = makeGRangesFromDataFrame( fantom )
fantom = countOverlaps( fp.gr.uniq , fantom )
# FANTOM5 enhancers, all tissues
fantom.files = Sys.glob("/proj/price1/sament/resources/fantom5/hg38/facet_expressed_enhancers/*.bed")
fantom.all = read.table(fantom.files[1])[,1:3]
for( f in fantom.files[-1] ) {
tmp = read.table(f)[,1:3]
fantom.all = rbind( fantom.all , tmp )
# cat( f , "\n" )
}
colnames(fantom.all) = c("chrom","start","end")
fantom.all = makeGRangesFromDataFrame( fantom.all )
fantom.all = countOverlaps( fp.gr.uniq , fantom.all )
closeDatabaseConnections(fp)
# assemble feature matrix
cat( "assembling feature matrix\n" )
features = data.frame(
footprints.uniq ,
min.p.fimo ,
max.score.wellington ,
fp.chromStates ,
fantom.lymphocytes = fantom ,
fantom.all ,
conserved_element )
return( features )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.