# findTFsInFootprints.R
#------------------------------------------------------------------------------------------------------------------------
library(GenomicRanges)
library(RUnit)
#------------------------------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#------------------------------------------------------------------------------------------------------------------------
findTFsInFootprints <- function(tbl.fp, tbl.motifToGenes, chromosome, startLoc, endLoc, transcriptionFactors=NA)
{
if(all(is.na(transcriptionFactors)))
transcriptionFactors <- sort(unique(tbl.motifToGenes$tfs))
stopifnot(all(c("chr", "mfpStart", "mfpEnd", "motifName") %in% colnames(tbl.fp)))
tbl.sub <- subset(tbl.fp, chr==chromosome & mfpStart >= startLoc & mfpEnd <= endLoc)
motifs.per.fp <- tbl.sub$motifName
tbl.hits <- subset(tbl.motifToGenes, motif %in% motifs.per.fp)
indicesMatched <- match(transcriptionFactors, tbl.hits$tfs)
tbl.hitAndMatched <- subset(tbl.hits, tfs %in% transcriptionFactors)
motifs <- unique(tbl.hitAndMatched$motif)
tbl.sub <- subset(tbl.sub, motifName %in% motifs)
tfs.per.motif <- lapply(motifs, function(m) subset(tbl.hitAndMatched, motif==m)$tfs)
names(tfs.per.motif) <- motifs
tfsPerRow <- unlist(lapply(tbl.sub$motifName, function(motifName)
paste(intersect(transcriptionFactors, subset(tbl.motifToGenes, motif==motifName)$tfs), collapse=";")))
tbl.augmented <- cbind(tbl.sub, tfsMatched=tfsPerRow, stringsAsFactors=FALSE)
invisible(tbl.augmented)
} # findTFsInFootprints
#------------------------------------------------------------------------------------------------------------------------
# identify some proximal upstream snps, within
# with mef2c at hg39 chr5:88,716,241-88,906,105, coded on the minus strand, here are some proximal upstream snps
# reported by mariette
#
# true overlaps with mfpStart-mfpEnd: rs3814428, rs770463, rs446500
# near misses: rs10038371 rs214136 rs16876775
#
# fivenum(tbl$GRCh38.liftover) [1] 88640561 88740812 88828020 88921378 89001975
# dim(subset(tbl, GRCh38.liftover > 88906105)) [1] 43 18
# tbl.upstream <- tbl[which(tbl$GRCh38.liftover - 88906105 > 0),] # 43 18
# upstream.loc <- tbl.upstream$GRCh38.liftover - 88906105
# head(tbl.upstream[, c(1,3,4,5,6,17:19)], n=20)
# Symbol CHR SNP BP A1 GRCh38.liftover GRCh37 upstream
# 115 MEF2C 5 rs10085009 88241752 A 88910179 88205996 4074
# 150 MEF2C 5 rs13357047 88246411 A 88914838 88210655 8733
# 119 MEF2C 5 rs770466 88251210 C 88919637 88215454 13532
# 81 MEF2C 5 rs304132 88251350 A 88919777 88215594 13672
# 153 MEF2C 5 rs10044186 88254553 C 88922980 88218797 16875
# 26 MEF2C 5 rs10037047 88255720 G 88924147 88219964 18042
# 44 MEF2C 5 rs190982* 88259176 G 88927603 88223420 21498
# 42 MEF2C 5 rs301717 88267164 A 88935591 88231408 29486
# 30 MEF2C 5 rs301718 88267595 G 88936022 88231839 29917
# 34 MEF2C 5 rs301719 88268671 C 88937098 88232915 30993
# 40 MEF2C 5 rs301720 88269410 A 88937837 88233654 31732
# 35 MEF2C 5 rs301723 88271430 A 88939857 88235674 33752
# 36 MEF2C 5 rs301725 88272216 T 88940643 88236460 34538
# 29 MEF2C 5 rs7700950 88272698 T 88941125 88236942 35020
# 33 MEF2C 5 rs301726 88272826 T 88941253 88237070 35148
# 31 MEF2C 5 rs301727 88273286 G 88941713 88237530 35608
# 32 MEF2C 5 rs301728 88273557 T 88941984 88237801 35879
# 39 MEF2C 5 rs301729 88273797 G 88942224 88238041 36119
# 41 MEF2C 5 rs1650626 88274962 G 88943389 88239206 37284
# 37 MEF2C 5 rs1650627 88275084 G 88943511 88239328 37406
findSNPsInFootprints <- function(tbl.fp, tbl.motifToGenes,
chromosome, region.startLoc, region.endLoc,
snp.chromosome, snp.loc,
transcriptionFactors=NA,
padding=0)
{
tbl.fpSub <- findTFsInFootprints(tbl.fp, tbl.motifToGenes,
chromosome, region.startLoc, region.endLoc,
transcriptionFactors)
if(nrow(tbl.fpSub) == 0)
return(data.frame())
gr.fp <- with(tbl.fpSub, GRanges(seqnames=chr, IRanges(start=mfpStart, end=mfpEnd)))
gr.snp <- GRanges(seqnames=snp.chromosome, IRanges(start=snp.loc-padding, end=snp.loc+padding))
#browser()
tbl.overlaps <- as.data.frame(findOverlaps(gr.fp, gr.snp))
snp <- rep(-1, length(gr.fp))
snp[tbl.overlaps$queryHits] <- snp.loc[tbl.overlaps$subjectHits]
tbl.fpSub$snp <- snp
result <- subset(tbl.fpSub, snp != -1)
#browser()
result[, c("chr", "mfpStart", "mfpEnd", "motifStart", "motifEnd", "sequence", "motifName", "snp", "tfsMatched")]
} # findTFsInFootprints
#------------------------------------------------------------------------------------------------------------------------
# me2fc upstream mariette-reported snps in footprints
# seqnames start end width strand motifName score seqnames.1 start.1 end.1 width.1 strand.1
# 1 chr5 88669200 88669211 12 - MA0486.2 50.7 chr5 88669204 88669204 1 *
# 2 chr5 88669200 88669211 12 - MA0771.1 50.1 chr5 88669204 88669204 1 *
# 3 chr5 88884580 88884599 20 - MA0528.1 56.3 chr5 88884588 88884588 1 *
# 4 chr5 88884581 88884590 10 + KLF16_DBD 51.1 chr5 88884588 88884588 1 *
# 5 chr5 88884581 88884590 10 + MA0079.3 60.5 chr5 88884588 88884588 1 *
# 6 chr5 88884581 88884594 14 + MA0516.1 51.6 chr5 88884588 88884588 1 *
# 7 chr5 88884581 88884590 10 + MA0741.1 51.1 chr5 88884588 88884588 1 *
# 8 chr5 88884581 88884590 10 + MA0746.1 50.0 chr5 88884588 88884588 1 *
# 9 chr5 88884583 88884589 7 - MAZ.p2 50.9 chr5 88884588 88884588 1 *
# 10 chr5 88884584 88884591 8 - GTF2I.p2 50.2 chr5 88884588 88884588 1 *
# 11 chr5 88884585 88884593 9 + MA0753.1 50.1 chr5 88884588 88884588 1 *
# 12 chr5 88900174 88900187 14 - MA0505.1 53.7 chr5 88900187 88900187 1 *
#------------------------------------------------------------------------------------------------------------------------
prepareData <- function()
{
f <- 'tfdb_1mb_GRCh38.83.bed' # too big
f <- 'tiny.bed' # for testing
f <- 'uniq_fimo_fp.bed' # using this one
tbl.fpAnnotated <- read.table(f, header=FALSE, sep="\t", as.is=TRUE)
new.colnames <- c("chr", "start", "end", "score", "strand", "name", "info", "sequence", "width", "chrom", "bigStart",
"bigEnd", "x", "y", "z", "zz")
colnames(tbl.fpAnnotated) <- new.colnames
tbl.fpAnnotated$chr <- paste("chr", tbl.fpAnnotated$chr, sep="")
save(tbl.fpAnnotated, file = "tbl.fpAnnotated.RData")
tbl.motifToMultipleGenes <- read.table("motif_to_tf_mappings_with_tfclass_include_multiple.csv", sep=",", header=TRUE, as.is=TRUE)
save(tbl.motifToMultipleGenes, file="tbl.motifToMultipleGenes.RData")
} # prepareData
#------------------------------------------------------------------------------------------------------------------------
intersectingLocs <- function(tbl.bed, chrom, locs, padding=0)
{
# some simple, non-exhaustive checks on tbl.bed. expect chrX start.loc end.loc for cols 1:3
stopifnot(length(grep("chr", tbl.bed[,1])) == nrow(tbl.bed))
stopifnot(is.numeric(tbl.bed[,2]))
stopifnot(is.numeric(tbl.bed[,3]))
gr.tbl <- GRanges(seqnames=tbl.bed[,1], IRanges(start=tbl.bed[,2], end=tbl.bed[,3]))
gr.locs <- GRanges(seqnames=chrom, IRanges(start=locs-padding, end=locs+padding))
overlaps <- sort(unique(subjectHits(findOverlaps(gr.tbl, gr.locs))))
locs[overlaps]
} # intersectingLocs
#------------------------------------------------------------------------------------------------------------------------
displayADgwas <- function(igv, name)
{
if(!exists("tbl.gwas"))
load(tbl.gwas.level_1)
displayGWASTable(igv, tbl.gwas, name)
} # displayADgwas
#------------------------------------------------------------------------------------------------------------------------
# each of the filenames names an RData file, a serialized data frame with our standard colnames,
# chr, mfpStart, mfpEnd, motifName, name
combineBedTables <- function(filenames)
{
tbl.all <- data.frame()
for(filename in filenames){
printf("about to load serialized data.frame from %s", filename)
load(filename)
tbl.bed <- x$bed4igv
stopifnot(all(c("chr", "mfpStart", "mfpEnd", "motifName", "name") %in% colnames (tbl.bed)))
tbl.all <- rbind(tbl.all, tbl.bed)
printf("now %d rows in tbl.all", nrow(tbl.all))
}
gr <- GRanges(Rle(tbl.all$chr), IRanges(tbl.all$mfpStart, tbl.all$mfpEnd))
mcols(gr) <- tbl.all[, c("motifName", "name")]
seqinfo(gr) <- seqinfo(TxDb.Hsapiens.UCSC.hg38.knownGene)[names(seqinfo(gr))]
seqlevels(gr) <- seqlevels(sortSeqlevels(seqinfo(gr)))
gr <- sort(gr)
tbl.out <- data.frame(chr=seqnames(gr), start=start(gr), end=end(gr), mcols(gr)[, "name"])
invisible(tbl.out)
} # combineBedTables
#------------------------------------------------------------------------------------------------------------------------
run.tfGrabber <- function(genes, trn, trnName, sqlite.fileName, sql.tableName)
{
#browser()
x <- lapply(genes, function(gene) tfGrabber.new(gene, trn, trnName=trnName, promoterDist=1e6))
lapply(x, length)
tbl.fp <- do.call("rbind", x)
dim(tbl.fp) # [1] 8470 23
table(tbl.fp$gene)
db <- dbConnect(dbDriver("SQLite"), sqlite.fileName)
dbWriteTable(db, sql.tableName, tbl.fp, append=TRUE)
dbDisconnect(db)
} # run.tfGrabber
#------------------------------------------------------------------------------------------------------------------------
tfGrabber.new <- function(gene, inputTRN, trnName, promoterDist)
{
if(!exists("tbls.fp")) {
printf("splitting tbl.fpAnnotated into chromosome-specific tables...")
tbls.fp <<- split(tbl.fpAnnotated, tbl.fpAnnotated$chr)
printf("done.")
}
if(!gene %in% rownames(inputTRN)){
printf("gene '%s' not in TRN model '%s'", gene, trnName)
return(data.frame())
}
loc.info <- subset(humangene, genename==gene)[, c("chrom", "start", "end")]
if(nrow(loc.info) == 0){ # some genes are missing, eg ACBD6
printf("no loc.info for %s, skipping", gene)
return(data.frame())
}
# if there are multiple entries, take just the first. see AATF
loc.info <- loc.info[1,]
#printf("--- loc.info for %s", gene)
#print(loc.info)
start <- loc.info$start - promoterDist; if(start < 1) start <- 1
end <- start + promoterDist
chrom <- as.character(loc.info$chrom)
if(!grepl("^chr", chrom))
chrom <- sprintf("chr", chrom)
# printf("chromosome for %s: %s (%d)", gene, chrom, nchar(chrom))
#browser()
if(nchar(chrom) > 5){
printf("unexpected chromosome name for %s: %s, skipping", gene, chrom)
return(data.frame())
}
tbl.sub <- subset(tbls.fp[[chrom]], mfpStart >= start & mfpEnd <= end)
if(nrow(tbl.sub) == 0){
printf("no footprints found in region %s:%d-%d", chrom, start, end)
return(data.frame())
}
tbl.sub <- cbind(tbl.sub, data.frame(gene=gene, trnName=trnName, promoterDist=promoterDist, stringsAsFactors=FALSE))
tbl.subTF <- merge(tbl.sub, tbl.motifToMultipleGenes, by.x="motifName", by.y="motif")
colnames(tbl.subTF)[match("tfs", colnames(tbl.subTF))] <- "TF"
tfs <- tbl.subTF$TF
tfs.in.model <- intersect(tfs, colnames(inputTRN))
tfs.with.nonzero.betas <- which(inputTRN[gene, tfs.in.model] != 0)
if(length(tfs.with.nonzero.betas) == 0)
return(data.frame())
tfs.in.model.non.zero <- names(tfs.with.nonzero.betas)
tbl.subTF <- subset(tbl.subTF, TF %in% tfs.in.model.non.zero)
beta <- as.numeric(inputTRN[gene, tbl.subTF$TF])
tbl.subTF$beta <- beta
name <- sprintf("%s|%s|%s|%s|%10.7f", tbl.subTF$gene, tbl.subTF$TF, tbl.subTF$trnName,
tbl.subTF$motifName, tbl.subTF$beta)
tbl.subTF$name <- name
new.order <- c("chr", "mfpStart", "mfpEnd", "name", "beta",
"motifName", "motifStart", "motifEnd",
"motifScore", "motifStrand", "id", "pvalue",
"qvalue", "sequence", "experimentCount", "fpStart",
"fpEnd", "fpScore", "motifFpOverlap",
"gene", "trnName", "promoterDist",
"TF")
stopifnot(sort(new.order) == sort(colnames(tbl.subTF)))
tbl.subTF <- tbl.subTF[, new.order]
colnames(tbl.subTF)[2:3] <- c("start", "end")
invisible(tbl.subTF)
} # tfGrabber.new
#------------------------------------------------------------------------------------------------------------------------
tfGrabber <- function(genelist, inputTRN, trnName=NULL, promoterDist=1000000, threshold=0.01)
{
if(!exists("tbls.fp")) {
printf("splitting tbl.fpAnnotated into chromosome-specific tables...")
tbls.fp <<- split(tbl.fpAnnotated, tbl.fpAnnotated$chr)
printf("done.")
}
if (is.null(trnName)){trnName="trn"}
alltfs=colnames(inputTRN)
allgenes=rownames(inputTRN)
nalltfs=length(alltfs)
genelist=intersect(genelist,allgenes)
if (length(genelist) == 0){warning("Input genes not in TRN")}
allTRN=list()
wholebed=data.frame()
count <- 0
for (gene in genelist){ # get single and summed TRN
count <- count + 1
if(!gene %in% rownames(inputTRN))
next;
singleTRN=inputTRN[gene,]
potential.regulator.count <- length(singleTRN)
keepCols=which(abs(singleTRN)>threshold) # keep only TFs with absolute coefficients above threshold
singleTRN=singleTRN[keepCols]
printf("--- %s %d) %8s regulators above threshold %d/%d", trnName, count, gene, length(singleTRN), potential.regulator.count)
# sumTRN == singleTRN in thse case of input is only one TRN
sumTRN=singleTRN
# get motif of each TF in the sumTRN
imotifs=subset(tbl.motifToMultipleGenes,tfs %in% names(sumTRN))
geneline=humangene[which(humangene$genename==gene),][1,]
ichr=as.character(geneline[,"chrom"])
startPosition=geneline[,"start"]-promoterDist
if(startPosition < 1) startPosition <- 1
endPosition=geneline[,"start"]+promoterDist
printf("sequence length: %8.2f", (endPosition-startPosition)/1000)
tbl.thisgene <- subset(tbls.fp[[ichr]], motifName %in% imotifs$motif &
mfpStart >= startPosition & mfpEnd <= endPosition)
#tbl.thisgene <- subset(tbl.fpAnnotated, chr==ichr & motifName %in% imotifs$motif &
# mfpStart >= startPosition & mfpEnd <= endPosition)
#browser()
tbl.thisgene <- tbl.thisgene[, c("chr", "mfpStart", "mfpEnd", "motifName")]
tbl.thisGeneWithTFs <- merge(tbl.thisgene, tbl.motifToMultipleGenes, by.x="motifName", by.y="motif")
tbl.thisGeneWithTFS.collapsed <- aggregate(tfs~chr+mfpStart+mfpEnd+motifName, c, data=tbl.thisGeneWithTFs)
#x <- merge(tbl.thisgene, tbl.motifToMultipleGenes, by.x="motifName", by.y="motif")
# xxx <- aggregate(tfs~chr+mfpStart+mfpEnd, c, data=xx)
printf("about to loop through %d rows in tbl.thisgene", nrow(tbl.thisgene))
if(nrow(tbl.thisgene) > 0 && nrow(imotifs) > 0){ # add regression coefficients
for (i in 1:nrow(tbl.thisgene)){
thismotif=tbl.thisgene[i,"motifName"]
indices <- which(imotifs$motif==thismotif)
itfs <- imotifs[indices,]
#browser()
info.string <- "<html>";
gene.motif <- sprintf("%s(%s): %s", gene, trnName, thismotif)
info.string <- paste(info.string, gene.motif, sep="")
#printf("about to loop over %d motifs", nrow(itfs))
for (j in 1:nrow(itfs)){
tf.info <- sprintf("<br>%s: %04.2f", itfs[j, "tfs"], sumTRN[itfs[j,"tfs"]])
info.string <- paste(info.string, tf.info, sep="")
} # for j
info.string <- paste(info.string, "</html>", sep="")
tbl.thisgene[i,"name"]=info.string
tbl.thisgene[i, "motifName"] <- thismotif
} # for i
thistrn=list()
thistrn$singleTRN=singleTRN
thistrn$sumTRN=sumTRN
allTRN[[gene]]=thistrn
wholebed=rbind(wholebed,tbl.thisgene)
} # if nrow
} # for gene
result <- list(TRN=allTRN,bed4igv=wholebed)
invisible(result)
} # tfGrabber
#------------------------------------------------------------------------------------------------------------------------
displaySnps <- function(igv, chrom, locs, name)
{
tbl.tmp <- data.frame(chr=rep(chrom, length(locs)), start=locs, end=locs, name=as.character(locs),
stringsAsFactors=FALSE)
displayBedTable(igv, tbl.tmp, name)
} # displaySnps
#------------------------------------------------------------------------------------------------------------------------
selectBestAmongDuplicates <- function(tbl)
{
columns.of.interest <- c("chr", "snp", "tf_name")
stopifnot(all(columns.of.interest %in% colnames(tbl)))
tbl.core <- unique(tbl[, columns.of.interest])
tbl.min <- data.frame()
for(r in 1:nrow(tbl.core)){
chrom <- tbl.core[r, "chr"]
snp.loc <- tbl.core[r, "snp"]
tf <- tbl.core[r, "tf_name"]
tbl.sub <- subset(tbl, chr==chrom & snp==snp.loc & tf_name==tf)
if(nrow(tbl.sub) > 1){
deleters <- which(tbl.sub$pval != min(tbl.sub$pval))
tbl.sub <- tbl.sub[-deleters,]
if(nrow(tbl.sub) > 1)
tbl.sub <- tbl.sub[1,]
} # if duplicates found (and then eliminated)
tbl.min <- rbind(tbl.min, tbl.sub)
} # for r
tbl.min
} # selectBestAmongDuplicates
#------------------------------------------------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.