library(igvR)
library(TrenaProjectErythropoiesis)
library(GenomicScores)
library(phastCons7way.UCSC.hg38); phast.7 <- phastCons7way.UCSC.hg38
library(trenaSGM)
if(!exists("tbl.benchmark")){
tbl.benchmark <- get(load(system.file(package="TrenaValidator", "extdata", "tbl.A.RData")))
tbl.benchmark$pubmed.count <- unlist(lapply(strsplit(tbl.benchmark$pubmedID_from_curated_resources, ","), length))
as.data.frame(t(subset(tbl.benchmark, TF=="GATA1" & target=="NFE2")))
}
# TF GATA1
# target NFE2
# effect 0
# score A
# is_evidence_curateddatabase TRUE
# is_evidence_chipSeq FALSE
# is_evidence_TFbindingMotif TRUE
# is_evidence_coexpression TRUE
# which_curateddatabase HTRIdb,trrust
# which_chipSeq none
# which_TFbindingMotif hocomoco_v11
# which_coexpression ARACNe-GTEx
# pubmedID_from_curated_resources 16648487,20564185
# kegg_pathway -
# pubmed.count 2
#------------------------------------------------------------------------------------------------------------------------
# TF GATA1
# target NFE2
# effect 0
# score A
# is_evidence_curateddatabase TRUE
# is_evidence_chipSeq FALSE
# is_evidence_TFbindingMotif TRUE
# is_evidence_coexpression TRUE
# which_curateddatabase HTRIdb,trrust
# which_chipSeq none
# which_TFbindingMotif hocomoco_v11
# which_coexpression ARACNe-GTEx
# pubmedID_from_curated_resources 16648487,20564185
# kegg_pathway -
# pubmed.count 2
#------------------------------------------------------------------------------------------------------------------------
# https://www.ncbi.nlm.nih.gov/pubmed/16648487
#
# Functional interaction of CP2 with GATA-1 in the regulation of erythroid promoters. 2006
# cp2 -> TFCP2
# Hsapiens-HOCOMOCOv10-TFCP2_HUMAN.H10MO.D
# Hsapiens-jaspar2018-TFCP2-MA0145.3
# We observed that binding sites for the ubiquitously expressed transcription factor CP2 [TFCP2] were
# present in regulatory regions of multiple erythroid genes. In these regions, the CP2 binding site
# was adjacent to a site for the erythroid factor GATA-1. Using three such regulatory regions (from
# genes encoding the transcription factors GATA-1, EKLF, and p45 NF-E2), we demonstrated the
# functional importance of the adjacent CP2/GATA-1 sites. In particular, CP2 binds to the GATA-1 HS2
# enhancer, generating a ternary complex with GATA-1 and DNA. Mutations in the CP2 consensus greatly
# impaired HS2 activity in transient transfection assays with K562 cells. Similar results were
# obtained by transfection of EKLF and p45 NF-E2 mutant constructs. Chromatin immunoprecipitation
# with K562 cells showed that CP2 binds in vivo to all three regulatory elements and that both
# GATA-1 and CP2 were present on the same GATA-1 and EKLF regulatory elements. Adjacent CP2/GATA-1
# sites may represent a novel module for erythroid expression of a number of genes. Additionally,
# coimmunoprecipitation and glutathione S-transferase pull-down experiments demonstrated a physical
# interaction between GATA-1 and CP2. This may contribute to the functional cooperation between
# these factors and provide an explanation for the important role of ubiquitous CP2 in the
# regulation of erythroid genes.
#
# 10778 chr12 54295889 54295904 16 + TFCP2 17.2360 1.56e-07 CCCTGCCTGGGCCAGA Hsapiens-HOCOMOCOv10-TFCP2_HUMAN.H10MO.D 0.95625
#
# lapply(query(MotifDb, c("sapiens", "TFCP2"), c("jaspar2018", "hocomoco")), consensusString)
# $`Hsapiens-jaspar2018-TFCP2-MA0145.3`
# [1] "AAACCGGTT?"
#
# $`Hsapiens-HOCOMOCOv10-TFCP2_HUMAN.H10MO.D`
# [1] "?CCTG??C??GCC?GA" where the trailing GA is followed by TAAGA: perhaps the adjacency mentioned above
#
#------------------------------------------------------------------------------------------------------------------------
# https://www.ncbi.nlm.nih.gov/pubmed/20564185
#
# Over-expression of EDAG in the myeloid cell line 32D: induction of GATA-1 expression and erythroid/megakaryocytic
# phenotype. (2010)
#
# Erythroid differentiation-associated gene (EDAG), a hematopoietic tissue-specific transcription
# regulator, plays a key role in maintaining the homeostasis of hematopoietic lineage
# commitment. However, the mechanism and genes regulated by EDAG remain unknown. In this study, we
# showed that overexpression of EDAG in a myeloid cell line 32D led to an erythroid phenotype with
# increased number of benzidine-positive cells and up-regulation of erythroid specific surface
# marker TER119. The megakaryocytic specific marker CD61 was also induced significantly. Using a
# genome-wide microarray analysis and a twofold change cutoff, we identified 332 genes with reduced
# expression and 288 genes with increased expression. Among up-regulation genes, transcription
# factor GATA-1 and its target genes including EKLF, NF-E2, Gfi-1b, hemogen, SCL, hemoglobin alpha,
# beta and megakaryocytic gene GPIX were increased. Silencing of EDAG by RNA interference in K562
# cells resulted in down-regulation of these genes. Taken together, EDAG functions as a positive
# regulator of erythroid/megakaryocytic differentiation in 32D cells associated with the induction
# of GATA-1 and its target genes.
#
#------------------------------------------------------------------------------------------------------------------------
# from google search: gata1 nf32
#
# Genetic Analysis of Hierarchical Regulation for Gata1 and NF-E2 p45 Gene Expression in Megakaryopoiesis
# mouse, 2010. "a unique in vivo validation of the hierarchical relationship between GATA1 and p45 in megakaryocytes"
#
# Insight into GATA1 transcriptional activity through interrogation of cis elements disrupted in human erythroid disorders
# pnas (2016) very strong paper but seems only to discuss downstream effects of GATA1 and NFE2
#
# https://mcb.asm.org/content/25/4/1215
# minireview: GATA1 Function, a Paradigm for Transcription Factors in Hematopoiesis
# includes this suggestion that gata1 mutant causes significant decrease in nfe2 expression
# (p45 is an alias for nfe2. nfe1 is an alias for gata1)
# (ii) MaFK and p45 NF-E2.In addition to the results seen with other megakaryocyte-specific genes,
# the expression of the transcription factors MafK and p45 NF-E2 is significantly decreased in
# megakaryocytes expressing an N-finger mutant of GATA1 (V205G) and in GATA1-deficient
# megakaryocytes (96, 116). p45 NF-E2 p45 and small Maf factors are critical for terminal
# differentiation of megakaryocytes (71, 100). This suggests that the attenuated expression of the
# essential transcription factors NF-E2 p45 and MafK is a major cause of the megakaryocytic
# phenotype of GATA1 mutations.
# Regulation of Mouse p45 NF-E2 Transcription by an Erythroid-specific GATA-dependent Intronic Alternative Promoter*
# mouse, 2000. two alternate promoters.
#
# The erythroid-enriched transcription factor NF-E2 is composed of two subunits, p45 and p18, the
# former of which is mainly expressed in the hematopoietic system. We have isolated and
# characterized the mouse p45 NF-E2 gene; we show here that, similar to the human gene, the mouse
# gene has two alternative promoters, which are differentially active during development and in
# different hematopoietic cells. Transcripts from the distal promoter are present in both erythroid
# and myeloid cells; however, transcripts from an alternative proximal 1b promoter, lying in the
# first intron, are abundant in erythroid cells, but barely detectable in myeloid cells. During
# development, both transcripts are detectable in yolk sac, fetal liver, and bone
# marrow. Transfection experiments show that proximal promoter 1b has a strong activity in erythroid
# cells, which is completely dependent on the integrity of a palindromic GATA-1 binding site. In
# contrast, the distal promoter 1a is not active in this assay. When the promoter 1b is placed 3′ to
# the promoter 1a and reporter gene, in an arrangement that resembles the natural one, it acts as an
# enhancer to stimulate the activity of the upstream promoter la.
# previous two articles mentioned in plos 2011
# Hierarchical Differentiation of Myeloid Progenitors Is Encoded in the Transcription Factor Network
#
# we examplarily discuss five such cases. (i) NF-E2 is regulated by GATA-1 and SCL (TAL1), but specifically
# important for megakaryocytic development [23]–[25].
#
if(!exists("igv")){
igv <- igvR()
setGenome(igv, "hg38")
showGenomicRegion(igv, "NFE2")
tp <- TrenaProjectErythropoiesis()
setTargetGene(tp, "NFE2")
tbl.gh <- getEnhancers(tp)
tbl.gh$width <- with(tbl.gh, 1 + end - start)
tbl.gh <- subset(tbl.gh, width < 5000)
track <- DataFrameQuantitativeTrack("gh", tbl.gh[, c(1,2,3,11)], color="blue", autoscale=FALSE, min=0, max=50)
displayTrack(igv, track)
showGenomicRegion(igv, "chr12:54,289,590-54,311,542")
showGenomicRegion(igv, "chr12:54,282,156-54,326,061")
showGenomicRegion(igv, "chr12:54,290,497-54,315,164")
}
#------------------------------------------------------------------------------------------------------------------------
round.numeric.columns.in.dataframe <- function(tbl, digits=2, pvalColumnNames="lassoPValue")
{
tbl.pvals <- data.frame()
tbl.main <- tbl
if(!(all(is.na(pvalColumnNames)))){
pval.cols <- grep(pvalColumnNames, colnames(tbl))
stopifnot(length(pval.cols) == length(pvalColumnNames))
tbl.pvals <- tbl[, pval.cols, drop=FALSE]
tbl.main <- tbl[, -pval.cols, drop=FALSE]
}
numeric_columns <- sapply(tbl.main, mode) == 'numeric'
tbl.main[numeric_columns] <- round(tbl.main[numeric_columns], digits)
if(ncol(tbl.pvals) > 0){
tbl.pvals <- apply(tbl.pvals, 2, function(col) as.numeric(formatC(col, format = "e", digits = 2)))
}
tbl.out <- cbind(tbl.main, tbl.pvals)[, colnames(tbl)]
tbl.out
} # round.numeric.columns.in.dataframe
#------------------------------------------------------------------------------------------------------------------------
conservationTrack <- function()
{
loc <- getGenomicRegion(igv)
starts <- with(loc, seq(start, end, by=5))
ends <- starts + 5
count <- length(starts)
tbl.blocks <- data.frame(chrom=rep(loc$chrom, count), start=starts, end=ends, stringsAsFactors=FALSE)
tbl.cons7 <- as.data.frame(gscores(phast.7, GRanges(tbl.blocks)), stringsAsFactors=FALSE)
tbl.cons7$chrom <- as.character(tbl.cons7$seqnames)
tbl.cons7 <- tbl.cons7[, c("chrom", "start", "end", "default")]
track <- DataFrameQuantitativeTrack("phast7", tbl.cons7, autoscale=TRUE, color="red")
displayTrack(igv, track)
} # conservationTrack
#------------------------------------------------------------------------------------------------------------------------
fimoConservationTable <- function()
{
source("~/github/fimoService/batchMode/fimoBatchTools.R") # works on hagfish & khaleesi
meme.file <- "jaspar2018-hocomoco.meme"
motifs <- query(MotifDb, "hsapiens", c("jaspar2018", "hocomoco"))
length(motifs) # 1177
export(motifs, con=meme.file, format="meme")
roi <- getGenomicRegion(igv)
tbl.regions <- with(roi, data.frame(chrom=chrom, start=start, end=end, stringsAsFactors=FALSE))
fimo.threshold <- 1e-5
tbl.match <- fimoBatch(tbl.regions, matchThreshold=fimo.threshold, genomeName="hg38", pwmFile=meme.file)
dim(tbl.match)
tbl.matchCons <- as.data.frame(gscores(phast.7, GRanges(tbl.match)), stringsAsFactors=FALSE)
dim(tbl.matchCons)
tbl.matchCons <- subset(tbl.matchCons, default > 0.95)
dim(tbl.matchCons)
return(tbl.matchCons)
} # fimoConservationTable
#------------------------------------------------------------------------------------------------------------------------
demo_NFE2_models <- function()
{
library(TrenaProjectLymphocyte)
library(org.Hs.eg.db)
tp <- TrenaProjectLymphocyte();
genome <- "hg38"
targetGene <- "NFE2"
setTargetGene(tp, targetGene)
tbl.info <- getTranscriptsTable(tp)
chromosome <- tbl.info$chrom
tss <- tbl.info$tss
# strand-aware start and end: atf1 is on the + strand
start <- tss - 50000
end <- tss + 50000
tbl.regions <- data.frame(chrom=chromosome, start=start, end=end, stringsAsFactors=FALSE)
file <- system.file(package="TrenaProjectLymphocyte", "extdata", "expression","GTEX.wholeBlood.rna-seq-geneSymbols.22330x407.RData")
mtx.blood <- get(load(file))
file <- system.file(package="TrenaProjectLymphocyte", "extdata", "expression",
"GTEX.lymphocyte.rna-seq-geneSymbols.21415x130.RData")
mtx.lymphocyte <- get(load(file))
file <- system.file(package="TrenaProjectErythropoiesis", "extdata", "expression", "brandLabDifferentiationTimeCourse-27171x28.RData")
mtx.marjorie <- get(load(file))
file <- "~/github/TrenaProjectErythropoiesis/prep/import/buenrostro/GSE74246_RNAseq_All_Counts.txt"
file.exists(file)
tbl <- read.table(file, sep="\t", as.is=TRUE, header=TRUE,nrow=-1)
rownames(tbl) <- tbl[, 1]
tbl <- tbl[, -1]
mtx.buenrosto <- asinh(as.matrix(tbl))
fivenum(mtx.buenrosto)
tbl.matchCons <- fimoConservationTable()
candidate.tfs <- unique(tbl.matchCons$tf)
length(candidate.tfs) # 57
noDNA.recipe <- list(title="noDNA.matchCons",
type="noDNA.tfsSupplied",
matrix=mtx.marjorie,
#matrix=mtx.blood,
#matrix=mtx.lymphocyte,
#matrix=mtx.buenrosto,
candidateTFs=candidate.tfs,
tfPool=allKnownTFs(),
tfPrefilterCorrelation=0.2,
annotationDbFile=dbfile(org.Hs.eg.db),
orderModelByColumn="rfScore",
solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman", "xgboost"),
quiet=TRUE)
builder <- NoDnaModelBuilder(genome, targetGene, noDNA.recipe, quiet=TRUE)
x <- build(builder)
tbl.model <- x$model[order(abs(x$model$pearsonCoeff), decreasing=TRUE),]
tbl.tfbs.counts <- as.data.frame(sort(table(tbl.matchCons$tf)))
bindingSiteCount <- merge(tbl.model, tbl.tfbs.counts, by.x="gene", by.y="Var1")$Freq
tbl.model$bindingSites <- bindingSiteCount
tbl.model.strong <- subset(tbl.model, abs(pearsonCoeff) > 0.5)
displayBindingSites(tbl.model.strong, tbl.matchCons)
mtx.model <- as.matrix(tbl.model.strong[, -1])
rownames(mtx.model) <- tbl.model.strong$gene
tbl.model.trimmed <- as.data.frame(round.numeric.columns.in.dataframe(mtx.model))
save(tbl.model.trimmed, file="brand.tbl.model.trimmed")
recipe.all <- noDNA.recipe
recipe.all$candidateTFs <- allKnownTFs()
builder <- NoDnaModelBuilder(genome, targetGene, recipe.all, quiet=TRUE)
x1 <- build(builder)
#----------------------------------------------------------------------------------------------------
# first, build a model with "placenta2", an early version of the placenta footprint database
#----------------------------------------------------------------------------------------------------
recipe <- list(title="NFE2",
type="footprint.database",
regions=tbl.regions,
geneSymbol=targetGene,
tss=tss,
matrix=mtx.blood,
db.host="khaleesi.systemsbiology.net",
db.port=5432,
databases=list("lymphoblast_hint_16", "lymphoblast_hint_20"),
annotationDbFile=dbfile(org.Hs.eg.db),
motifDiscovery="builtinFimo",
tfPool=allKnownTFs(),
tfMapping="MotifDB",
tfPrefilterCorrelation=0.1,
orderModelByColumn="rfScore",
solverNames=c("lasso", "lassopv", "pearson", "randomForest", "ridge", "spearman", "xgboost"))
fpBuilder <- FootprintDatabaseModelBuilder(genome, targetGene, recipe, quiet=FALSE)
x <- build(fpBuilder)
recipe$matrix <- mtx.marjorie
fpBuilder <- FootprintDatabaseModelBuilder(genome, targetGene, recipe, quiet=FALSE)
x2 <- build(fpBuilder)
file <- "~/github/TrenaProjectErythropoiesis/prep/import/buenrostro/GSE74246_RNAseq_All_Counts.txt"
file.exists(file)
tbl <- read.table(file, sep="\t", as.is=TRUE, header=TRUE,nrow=-1)
rownames(tbl) <- tbl[, 1]
tbl <- tbl[, -1]
mtx <- asinh(as.matrix(tbl))
fivenum(mtx)
suppressWarnings(
logTimingInfo("placenta2 db, +/- 5bk generic promoter on ATF1", system.time(x2 <- build(fpBuilder)))
)
} # demo_NFE2_models
#------------------------------------------------------------------------------------------------------------------------
displayBindingSites <- function(tbl.model, tbl.matchCons)
{
tfs <- subset(tbl.model, abs(pearsonCoeff) > 0.5)$gene
for(one.tf in tfs){
tbl.bs <- subset(tbl.matchCons, tf==one.tf)[, c("seqnames", "start", "end", "matched_sequence")]
colnames(tbl.bs) <- c("chrom", "start", "end", "seq")
tbl.bs$chrom <- as.character(tbl.bs$chrom)
track <- DataFrameAnnotationTrack(one.tf, tbl.bs, color="random", trackHeight=25, displayMode="EXPANDED")
displayTrack(igv, track)
}
} # displayBindingSites
#------------------------------------------------------------------------------------------------------------------------
getATACseq <- function()
{
roi <- getGenomicRegion(igv)
chromosome <- roi$chrom
start.loc <- roi$start
end.loc <- roi$end
directory <- "~/github/TrenaProjectErythropoiesis/prep/import/atacPeaks"
files <- grep("narrowPeak$", list.files(directory), value=TRUE)
result <- list()
for(file in files){
full.path <- file.path(directory, file)
track.name <- sub("_hg38_macs2_.*$", "", sub("ATAC_Cord_", "", file))
tbl.atac <- read.table(full.path, sep="\t", as.is=TRUE)
colnames(tbl.atac) <- c("chrom", "start", "end", "name", "c5", "strand", "c7", "c8", "c9", "c10")
tbl.atac.region <- subset(tbl.atac, chrom==chromosome & start >= start.loc & end <= end.loc)
if(nrow(tbl.atac.region) > 0){
tbl.atac.region$sample <- track.name
result[[track.name]] <- tbl.atac.region
}
} # files
tbl.out <- do.call(rbind, result)
rownames(tbl.out) <- NULL
tbl.out
} # getATACseq
#------------------------------------------------------------------------------------------------------------------------
displayATACseq <- function()
{
library (RColorBrewer)
totalColorCount <- 12
colors <- brewer.pal(8, "Dark2")
currentColorNumber <- 0
tbl.all <- getATACseq()
samples <- unique(tbl.all$sample)
current.day.string <- ""
color <- colors[1]
for(current.sample in samples){
this.day.string <- strsplit(current.sample, "_")[[1]][1]
if(this.day.string != current.day.string){
currentColorNumber <- (currentColorNumber %% totalColorCount) + 1
color <- colors[currentColorNumber]
current.day.string <- this.day.string
}
tbl.atac.sub <- subset(tbl.all, sample == current.sample)
track.name <- current.sample
track <- DataFrameQuantitativeTrack(track.name, tbl.atac.sub[, c("chrom", "start", "end", "c10")],
color, autoscale=FALSE, min=0, max=430, trackHeight=30)
displayTrack(igv, track)
} # for samples
tbl.regions.condensed <- as.data.frame(union(GRanges(tbl.all[, c("chrom", "start", "end")]),
GRanges(tbl.all[, c("chrom", "start", "end")])))[, c("seqnames", "start", "end")]
colnames(tbl.regions.condensed) <- c("chrom", "start", "end")
tbl.regions.condensed$chrom <- as.character(tbl.regions.condensed$chrom)
lapply(tbl.regions.condensed, class)
#state$tbl.regions.condensed <- tbl.regions.condensed
track <- DataFrameAnnotationTrack("atac combined", tbl.regions.condensed, color="black")
displayTrack(igv, track)
} # displayATACseq
#------------------------------------------------------------------------------------------------------------------------
# oddly, no methylation data in the immediate vicinity of nfe2.
addMethylationTracks <- function()
{
library(AnnotationHub)
ah <- AnnotationHub()
ah.human <- subset(ah, species == "Homo sapiens")
histone.tracks <- query(ah.human, c("H3K4me3", "Gm12878", "Peak", "narrow")) # 3 tracks
descriptions <- histone.tracks$description
titles <- histone.tracks$title
colors <- rep(terrain.colors(6), 4)
color.index <- 0
tbl.roi <- as.data.frame(getGenomicRegion(igv), stringsAsFactors=FALSE)
for(i in seq_len(length(histone.tracks))){
name <- names(histone.tracks)[i]
color.index <- color.index + 1
gr <- histone.tracks[[name]]
ov <- findOverlaps(gr, GRanges(tbl.roi))
roi.histones <- gr[queryHits(ov)]
track.histones <- GRangesQuantitativeTrack(titles[i], roi.histones[, "pValue"],
color=colors[color.index], trackHeight=50,
autoscale=TRUE)
displayTrack(igv, track.histones)
} # for track
} # addMethylationTracks
#------------------------------------------------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.