About GEO study GSE74363

GSE74363: The Common Stress Responsive Transcription Factor ATF3 Binds Active Enhancers and Bookmarks Genomic Sites for Transcriptional Regulation. Paper. The authors collect wildtype and ATF3 knockout expression using an illumina beadchip array (GEO dataset GSE74362). This should serve as a spike-in dataset.

The raw data turned out to be two probe annotaiton files - no sign of expression. Sigh. I hate bioinformatics.

There is very little signal in this dataset. It will be a hard dataset to extract any signal from.

Import data

Download the unnormalised zipped expression data directly from GEO, then annotate the columns and background normalise using limma::neqc. I tried downloaded the processed files and the P-value distribution was very odd - I suspect and artifact. Also, the raw files

library(data.table)
library(GEOquery)
library(purrr)
library(stringr)
library(magrittr)

parsedGSE74362SOFT <- getGEO(GEO = "GSE74362", GSEMatrix = FALSE, AnnotGPL = FALSE, getGPL = TRUE)


GSE74362_unnormalised_RAW_DT <- fread(
  cmd = "curl ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE74nnn/GSE74362/suppl/GSE74362_non-normalized.txt.gz | zcat",
  check.names = TRUE)

# Copied from the GSE74362 page
GSE74362_sampleAnnot <- fread(
"GSM1918933,HCT116 WT DMSO rep1
GSM1918934,HCT116 WT CPT rep1
GSM1918935,HCT116 E11 DMSO rep1
GSM1918936,HCT116 E11 CPT rep1
GSM1918937,HCT116 WT DMSO rep2
GSM1918938,HCT116 WT CPT rep2
GSM1918939,HCT116 E11 DMSO rep2
GSM1918940,HCT116 E11 CPT rep2
GSM1918941,HCT116 WT DMSO rep3
GSM1918942,HCT116 WT CPT rep3
GSM1918943,HCT116 E11 DMSO rep3
GSM1918944,HCT116 E11 CPT rep3", 
header = FALSE, sep= ",", col.names = c("GSM", "sample"))

# I have to make the uncomfortable assumption that these two sets are sorted in the same way ...
GSE74362_sampleAnnot %<>% .[,.(colName = c(make.names(sample), str_c(make.names(sample),"_pVal") )), by = GSM]


setnames(GSE74362_unnormalised_RAW_DT, GSE74362_sampleAnnot[, c("HumanHT12v4probeID",colName)])


# Matched that of illumina beadchips
GPL10558_annot_DT <- fread(cmd = "curl ftp://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL10nnn/GPL10558/suppl/GPL10558_HumanHT-12_V4_0_R2_15002873_B.txt.gz | zcat", skip = 8, header = TRUE, fill = TRUE)

setkey(GPL10558_annot_DT, "Probe_Id")

Build an annotation frame for the experimental system

samples_GSE74362_metadataDT <- data.table(
  sampleNames = names(GSE74362_unnormalised_RAW_DT) )

samples_GSE74362_metadataDT[, condition := shift(sampleNames)] # label all of the P-value cols

samples_GSE74362_metadataDT[, type := "measure"]
samples_GSE74362_metadataDT[sampleNames %like% "pVal", type := "detect"]
samples_GSE74362_metadataDT[sampleNames == "HumanHT12v4probeID", type := "probeID"]

samples_GSE74362_metadataDT[type == "measure", condition := sampleNames]

samples_GSE74362_metadataDT[, treatment := str_extract(condition,"HCT116\\.\\w+\\.\\w+") %>% str_remove("HCT116\\.") %>% str_replace("\\.", "_")]

samples_GSE74362_metadataDT[GSE74362_sampleAnnot, GSM := i.GSM, on = .(sampleNames == colName)]

Background correct and quantile normalise

We shall make use of the neqc function in the limma package, which is designed for use with Illumina BeadChips; it performs background correction followed by quantile normalisation across all samples, using detection P-values provided by the BeadArray detector.

library(limma)

GSE74362measureColumns <- colnames(GSE74362_unnormalised_RAW_DT)[colnames(GSE74362_unnormalised_RAW_DT) %like% "HCT116"]

exprsMat <- GSE74362_unnormalised_RAW_DT[, 
                                    as.matrix(.SD, rownames = HumanHT12v4probeID),
                    .SDcols = GSE74362measureColumns[!GSE74362measureColumns %like% "pVal"] ]

detectionPvalueMat <- GSE74362_unnormalised_RAW_DT[,
                                    as.matrix(.SD, rownames = HumanHT12v4probeID),
                    .SDcols = GSE74362measureColumns[GSE74362measureColumns %like% "pVal"] ]

GSE74362_expObj <- neqc(x = exprsMat,
                        detection.p = detectionPvalueMat)
GSE74362_designMat <- model.matrix(sampleNames ~ 0 + treatment,
                                   samples_GSE74362_metadataDT[type == "measure",.(sampleNames,treatment)])

colnames(GSE74362_designMat) %<>% str_remove("treatment")
row.names(GSE74362_designMat) <- samples_GSE74362_metadataDT[type == "measure", sampleNames]


GSE74362_ATF3contrastMat <- makeContrasts(
              contrasts = "E11_DMSO-WT_DMSO",
              levels = GSE74362_designMat)

GSE74362_lmFit <- lmFit(GSE74362_expObj,
                        design = GSE74362_designMat) %>%
                  contrasts.fit(contrasts = GSE74362_ATF3contrastMat) %>%
                  eBayes


GSE74362_diffexDT <- data.table(topTable(GSE74362_lmFit, number = Inf), keep.rownames = TRUE )

setnames(GSE74362_diffexDT, "rn", "HumanHT12v4probeID")

setkey(GSE74362_diffexDT, "HumanHT12v4probeID")


GSE74362_diffexDT[GPL10558_annot_DT, geneID := i.Entrez_Gene_ID]
GSE74362_diffexDT[GPL10558_annot_DT, geneSymbol := i.Symbol]

GSE74362_diffexDT[, qplot(P.Value, bins = 100)]

So there is almost no signal in this dataset!! Any method should be robust to this kind of sample. 70+% of calls made at FDR of 0.05 are false positives!! It might serve as good decoy dataset actually.

koATF3_HCT116_diffexDT <- GSE74362_diffexDT[,.(HumanHT12v4probeID, geneID, geneSymbol, koATF3_logFC = logFC, koATF3_pValue = P.Value)]

setorder(koATF3_HCT116_diffexDT, koATF3_pValue)

save(koATF3_HCT116_diffexDT, file = "./data/koATF3_HCT116_diffexDT.RData", compress = "xz")

Play about with some TF-target database sources ...

FANTOM5_hs_regcirc <- fread("zcat /mnt/c/regulatory_circutis/Network_compendium/Tissue-specific_regulatory_networks_FANTOM5-v1/32_high-level_networks/20_gastrointestinal_system.txt.gz") # Arbitrary cut off

FANTOM5_hs_regcirc[V3 > 0.2, qplot(V3)] + scale_x_continuous(breaks = seq(0,1,0.1))
koATF3_geneIDExpress <- koATF3_diffexDT[!is.na(geneSymbol), .SD[koATF3_pValue == min(koATF3_pValue, na.rm = T)][1] , by = geneID]

koATF3_geneIDExpress[,qplot(koATF3_pValue, bins = 100)]

ATF3knockout_betaUniformModel <- fitBetaUniformMixtureDistribution(koATF3_geneIDExpress$koATF3_pValue, nStarts = 20)
# LLH fluctates a bit as we are on the bounds of parameter space!

ATF3knockout_betaUniformModel

noiseFractionUpperBound(ATF3knockout_betaUniformModel)

# If we were to use a P-value cutoff of 0.01, what would the FP rate be?
TP <- truePositiveFraction(ATF3knockout_betaUniformModel, pValueThreshold = 0.01)
FP <- falsePositiveFraction(ATF3knockout_betaUniformModel, pValueThreshold = 0.01)
round(100*FP/(TP+FP), digits = 1) # Over 30% would be false positives!
# Perform meta-analysis using the TTRUST TF->target sets

koATF3_geneIDExpress[, betaUnifScore_FDR0.05 := betaUniformScore(koATF3_pValue, ATF3knockout_betaUniformModel, FDR = 0.05)]



TTRUST_TF_pvalues <- koATF3_geneIDExpress[ unique(TTRUST_TF2targets_DT[!is.na(geneID), .(TF,geneID)]), , on = "geneID"][!is.na(koATF3_pValue),.(pValueSet = list(koATF3_pValue), geneSet = list(geneSymbol), scoreSum = sum(betaUnifScore_FDR0.05)), by = TF]

TTRUST_TF_pvalues[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], ATF3knockout_betaUniformModel), by = TF]
TTRUST_TF_pvalues[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = TF]

TTRUST_TF_pvalues[p.adjust(betaUniformMixtureP, "fdr") < 0.01][order(betaUniformMixtureP)] # No significant gene sets
TTRUST_TF_pvalues[p.adjust(fishersP) < 0.05] # Quite a few ... too many for so little signal?

TTRUST_TF_pvalues[scoreSum > 0]

# I'm not convince that there is any signal in this dataset ...

TTRUST_TF_pvalues %>%
ggplot(aes(x = -log(betaUniformMixtureP), y = -log(fishersP))) +
geom_point() +
labs(title = "Plot to show how much more conservative beta-uniform P-Values are than Fisher's combined P-values",
subtitle = "ATF3 knockout study (which has very little signal).\nBonferroni (conservative) multiple hypothesuis correction cut-off shown.") +
geom_vline(xintercept = -log(0.05/800), linetype = "dashed", colour = "red") +
geom_hline(yintercept = -log(0.05/800), linetype = "dashed", colour = "red")


TTRUST_TF_pvalues <- koATF3_diffexDT[TTRUST_TF2targets_DT[!is.na(geneID), .(TF, geneID)], , on = "geneID"][,.(pValueSet = list(P.Value), geneSet = list(geneSymbol)), by = TF]
# Perform meta-analysis using the TTRUST TF->target sets
TTRUST_TF_pvalues <- koATF3_diffexDT[TTRUST_TF2targets_DT[!is.na(geneID), .(TF, geneID)], , on = "geneID"][,.(pValueSet = list(P.Value), geneSet = list(geneSymbol)), by = TF]
TTRUST_TF_pvalues[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], ATF3knockout_betaUniformModel), by = TF]
TTRUST_TF_pvalues[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = TF]
TTRUST_TF_pvalues[p.adjust(betaUniformMixtureP) < 0.05] # No significant gene sets
TTRUST_TF_pvalues[p.adjust(fishersP) < 0.05] # Quite a few ... too many for so little signal?
TTRUST_TF_pvalues %>%
ggplot(aes(x = -log(betaUniformMixtureP), y = -log(fishersP))) +
geom_point() +
labs(title = "Plot to show how much more conservative beta-uniform P-Values are than Fisher's combined P-values",
subtitle = "ATF3 knockout study (which has very little signal).\nBonferroni (conservative) multiple hypothesuis correction cut-off shown.") +
geom_vline(xintercept = -log(0.05/800), linetype = "dashed", colour = "red") +
geom_hline(yintercept = -log(0.05/800), linetype = "dashed", colour = "red")
ACT3ko_geneIDexpress <- koATF3_HCT116_diffexDT[!is.na(geneID), .SD[koATF3_pValue == min(koATF3_pValue, na.rm = T)][1] , by = geneID]


ATF3knockoutBUM <- fitBetaUniformMixtureDistribution(ACT3ko_geneIDexpress$koATF3_pValue)

plot(ATF3knockoutBUM)

# 55+% of calls made at FDR of 0.05 are false positives!!
falsePositiveFraction(ATF3knockoutBUM, pValueThreshold = 0.05)/(falsePositiveFraction( ATF3knockoutBUM, pValueThreshold = 0.05) + truePositiveFraction(ATF3knockoutBUM, pValueThreshold = 0.05))


TF_pvals <- ACT3ko_geneIDexpress[TTRUST_TF2targets_DT %>% 
                    .[!is.na(geneID), .(TF, geneID)], , on = "geneID"] %>% 
                    .[,.(geneSets = list(koATF3_pValue)), by = TF]

TF_pvals_list <- TF_pvals[, geneSets]
names(TF_pvals_list) <- TF_pvals[, TF]

betaUnifPs <- sapply(TF_pvals_list, betaUniformPvalueSumTest, betaUniformFit = ATF3knockoutBUM)
fisherPs <- sapply(TF_pvals_list, fishersPvalueSumTest)

summaryDT <- data.table(TF = names(TF_pvals_list),
                        metaDEGthP = betaUnifPs,
                        fiserP = fisherPs)

summaryDT[, metaDEGthQ := p.adjust(metaDEGthP, "fdr")]
summaryDT[, fisherQ := p.adjust(fiserP, "fdr")]

summaryDT[order(metaDEGthQ)]


summaryDT[metaDEGthQ <= 1E-3]



summaryDT[ACT3ko_geneIDexpress, geneID := geneID, on = .(TF == geneSymbol)]

ACT3ko_geneIDexpress

TTRUST_TF2targets_DT[TF == "ATF3"] %>%
  merge(summaryDT, by = "geneID") %>% 
  .[!is.na(geneID)] %>%
  mutate(pQ = -log10(metaDEGthQ)) %>%
  ggplot() + geom_histogram(aes(x = pQ))

It's good that it recovers signal in p53. But this is far from a comprehensive test ...

What about with BioPlanet

ACT3ko_geneIDexpress <- koATF3_HCT116_diffexDT[!is.na(geneID), .SD[koATF3_pValue == min(koATF3_pValue, na.rm = T)][1] , by = geneID]


BioPlanet_pvals <- ACT3ko_geneIDexpress[bioplanetPathwaysDT %>% 
                      .[!is.na(geneID), .(BioPlanetName, geneID)], , on = "geneID"] %>% 
                      .[,.(geneSets = list(koATF3_pValue)), by = BioPlanetName]

BP_pvals_list <- BioPlanet_pvals[, geneSets]
names(BP_pvals_list) <- BioPlanet_pvals[, BioPlanetName]

betaUnifPs <- sapply(BP_pvals_list, betaUniformPvalueSumTest, betaUniformFit = ATF3knockoutBUM)
fisherPs <- sapply(BP_pvals_list, fishersPvalueSumTest)


summaryBP_DT <- data.table(TF = names(BP_pvals_list),
                        metaDEGthP = betaUnifPs,
                        fiserP = fisherPs)

summaryBP_DT[, metaDEGthQ := p.adjust(metaDEGthP, "fdr")]
summaryBP_DT[, fisherQ := p.adjust(fiserP, "fdr")]

summaryBP_DT[order(metaDEGthQ)]

 summaryBP_DT[order(metaDEGthQ)][metaDEGthQ <= 0.01] %>% head

Hmmm. Still tricky to see the precise signal.



adamsardar/metaDEGth documentation built on Sept. 28, 2022, 10:12 p.m.