GSE74363: The Common Stress Responsive Transcription Factor ATF3 Binds Active Enhancers and Bookmarks Genomic Sites for Transcriptional Regulation. Paper. The authors collect wildtype and ATF3 knockout expression using an illumina beadchip array (GEO dataset GSE74362). This should serve as a spike-in dataset.
The raw data turned out to be two probe annotaiton files - no sign of expression. Sigh. I hate bioinformatics.
There is very little signal in this dataset. It will be a hard dataset to extract any signal from.
Download the unnormalised zipped expression data directly from GEO, then annotate the columns and background normalise using limma::neqc. I tried downloaded the processed files and the P-value distribution was very odd - I suspect and artifact. Also, the raw files
library(data.table) library(GEOquery) library(purrr) library(stringr) library(magrittr) parsedGSE74362SOFT <- getGEO(GEO = "GSE74362", GSEMatrix = FALSE, AnnotGPL = FALSE, getGPL = TRUE) GSE74362_unnormalised_RAW_DT <- fread( cmd = "curl ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE74nnn/GSE74362/suppl/GSE74362_non-normalized.txt.gz | zcat", check.names = TRUE) # Copied from the GSE74362 page GSE74362_sampleAnnot <- fread( "GSM1918933,HCT116 WT DMSO rep1 GSM1918934,HCT116 WT CPT rep1 GSM1918935,HCT116 E11 DMSO rep1 GSM1918936,HCT116 E11 CPT rep1 GSM1918937,HCT116 WT DMSO rep2 GSM1918938,HCT116 WT CPT rep2 GSM1918939,HCT116 E11 DMSO rep2 GSM1918940,HCT116 E11 CPT rep2 GSM1918941,HCT116 WT DMSO rep3 GSM1918942,HCT116 WT CPT rep3 GSM1918943,HCT116 E11 DMSO rep3 GSM1918944,HCT116 E11 CPT rep3", header = FALSE, sep= ",", col.names = c("GSM", "sample")) # I have to make the uncomfortable assumption that these two sets are sorted in the same way ... GSE74362_sampleAnnot %<>% .[,.(colName = c(make.names(sample), str_c(make.names(sample),"_pVal") )), by = GSM] setnames(GSE74362_unnormalised_RAW_DT, GSE74362_sampleAnnot[, c("HumanHT12v4probeID",colName)]) # Matched that of illumina beadchips GPL10558_annot_DT <- fread(cmd = "curl ftp://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL10nnn/GPL10558/suppl/GPL10558_HumanHT-12_V4_0_R2_15002873_B.txt.gz | zcat", skip = 8, header = TRUE, fill = TRUE) setkey(GPL10558_annot_DT, "Probe_Id")
Build an annotation frame for the experimental system
samples_GSE74362_metadataDT <- data.table( sampleNames = names(GSE74362_unnormalised_RAW_DT) ) samples_GSE74362_metadataDT[, condition := shift(sampleNames)] # label all of the P-value cols samples_GSE74362_metadataDT[, type := "measure"] samples_GSE74362_metadataDT[sampleNames %like% "pVal", type := "detect"] samples_GSE74362_metadataDT[sampleNames == "HumanHT12v4probeID", type := "probeID"] samples_GSE74362_metadataDT[type == "measure", condition := sampleNames] samples_GSE74362_metadataDT[, treatment := str_extract(condition,"HCT116\\.\\w+\\.\\w+") %>% str_remove("HCT116\\.") %>% str_replace("\\.", "_")] samples_GSE74362_metadataDT[GSE74362_sampleAnnot, GSM := i.GSM, on = .(sampleNames == colName)]
We shall make use of the neqc
function in the limma
package, which is designed for use with Illumina BeadChips; it performs background correction followed by quantile normalisation across all samples, using detection P-values provided by the BeadArray detector.
library(limma) GSE74362measureColumns <- colnames(GSE74362_unnormalised_RAW_DT)[colnames(GSE74362_unnormalised_RAW_DT) %like% "HCT116"] exprsMat <- GSE74362_unnormalised_RAW_DT[, as.matrix(.SD, rownames = HumanHT12v4probeID), .SDcols = GSE74362measureColumns[!GSE74362measureColumns %like% "pVal"] ] detectionPvalueMat <- GSE74362_unnormalised_RAW_DT[, as.matrix(.SD, rownames = HumanHT12v4probeID), .SDcols = GSE74362measureColumns[GSE74362measureColumns %like% "pVal"] ] GSE74362_expObj <- neqc(x = exprsMat, detection.p = detectionPvalueMat)
GSE74362_designMat <- model.matrix(sampleNames ~ 0 + treatment, samples_GSE74362_metadataDT[type == "measure",.(sampleNames,treatment)]) colnames(GSE74362_designMat) %<>% str_remove("treatment") row.names(GSE74362_designMat) <- samples_GSE74362_metadataDT[type == "measure", sampleNames] GSE74362_ATF3contrastMat <- makeContrasts( contrasts = "E11_DMSO-WT_DMSO", levels = GSE74362_designMat) GSE74362_lmFit <- lmFit(GSE74362_expObj, design = GSE74362_designMat) %>% contrasts.fit(contrasts = GSE74362_ATF3contrastMat) %>% eBayes GSE74362_diffexDT <- data.table(topTable(GSE74362_lmFit, number = Inf), keep.rownames = TRUE ) setnames(GSE74362_diffexDT, "rn", "HumanHT12v4probeID") setkey(GSE74362_diffexDT, "HumanHT12v4probeID") GSE74362_diffexDT[GPL10558_annot_DT, geneID := i.Entrez_Gene_ID] GSE74362_diffexDT[GPL10558_annot_DT, geneSymbol := i.Symbol] GSE74362_diffexDT[, qplot(P.Value, bins = 100)]
So there is almost no signal in this dataset!! Any method should be robust to this kind of sample. 70+% of calls made at FDR of 0.05 are false positives!! It might serve as good decoy dataset actually.
koATF3_HCT116_diffexDT <- GSE74362_diffexDT[,.(HumanHT12v4probeID, geneID, geneSymbol, koATF3_logFC = logFC, koATF3_pValue = P.Value)] setorder(koATF3_HCT116_diffexDT, koATF3_pValue) save(koATF3_HCT116_diffexDT, file = "./data/koATF3_HCT116_diffexDT.RData", compress = "xz")
FANTOM5_hs_regcirc <- fread("zcat /mnt/c/regulatory_circutis/Network_compendium/Tissue-specific_regulatory_networks_FANTOM5-v1/32_high-level_networks/20_gastrointestinal_system.txt.gz") # Arbitrary cut off FANTOM5_hs_regcirc[V3 > 0.2, qplot(V3)] + scale_x_continuous(breaks = seq(0,1,0.1))
koATF3_geneIDExpress <- koATF3_diffexDT[!is.na(geneSymbol), .SD[koATF3_pValue == min(koATF3_pValue, na.rm = T)][1] , by = geneID] koATF3_geneIDExpress[,qplot(koATF3_pValue, bins = 100)] ATF3knockout_betaUniformModel <- fitBetaUniformMixtureDistribution(koATF3_geneIDExpress$koATF3_pValue, nStarts = 20) # LLH fluctates a bit as we are on the bounds of parameter space! ATF3knockout_betaUniformModel noiseFractionUpperBound(ATF3knockout_betaUniformModel) # If we were to use a P-value cutoff of 0.01, what would the FP rate be? TP <- truePositiveFraction(ATF3knockout_betaUniformModel, pValueThreshold = 0.01) FP <- falsePositiveFraction(ATF3knockout_betaUniformModel, pValueThreshold = 0.01) round(100*FP/(TP+FP), digits = 1) # Over 30% would be false positives! # Perform meta-analysis using the TTRUST TF->target sets koATF3_geneIDExpress[, betaUnifScore_FDR0.05 := betaUniformScore(koATF3_pValue, ATF3knockout_betaUniformModel, FDR = 0.05)] TTRUST_TF_pvalues <- koATF3_geneIDExpress[ unique(TTRUST_TF2targets_DT[!is.na(geneID), .(TF,geneID)]), , on = "geneID"][!is.na(koATF3_pValue),.(pValueSet = list(koATF3_pValue), geneSet = list(geneSymbol), scoreSum = sum(betaUnifScore_FDR0.05)), by = TF] TTRUST_TF_pvalues[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], ATF3knockout_betaUniformModel), by = TF] TTRUST_TF_pvalues[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = TF] TTRUST_TF_pvalues[p.adjust(betaUniformMixtureP, "fdr") < 0.01][order(betaUniformMixtureP)] # No significant gene sets TTRUST_TF_pvalues[p.adjust(fishersP) < 0.05] # Quite a few ... too many for so little signal? TTRUST_TF_pvalues[scoreSum > 0] # I'm not convince that there is any signal in this dataset ... TTRUST_TF_pvalues %>% ggplot(aes(x = -log(betaUniformMixtureP), y = -log(fishersP))) + geom_point() + labs(title = "Plot to show how much more conservative beta-uniform P-Values are than Fisher's combined P-values", subtitle = "ATF3 knockout study (which has very little signal).\nBonferroni (conservative) multiple hypothesuis correction cut-off shown.") + geom_vline(xintercept = -log(0.05/800), linetype = "dashed", colour = "red") + geom_hline(yintercept = -log(0.05/800), linetype = "dashed", colour = "red") TTRUST_TF_pvalues <- koATF3_diffexDT[TTRUST_TF2targets_DT[!is.na(geneID), .(TF, geneID)], , on = "geneID"][,.(pValueSet = list(P.Value), geneSet = list(geneSymbol)), by = TF] # Perform meta-analysis using the TTRUST TF->target sets TTRUST_TF_pvalues <- koATF3_diffexDT[TTRUST_TF2targets_DT[!is.na(geneID), .(TF, geneID)], , on = "geneID"][,.(pValueSet = list(P.Value), geneSet = list(geneSymbol)), by = TF] TTRUST_TF_pvalues[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], ATF3knockout_betaUniformModel), by = TF] TTRUST_TF_pvalues[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = TF] TTRUST_TF_pvalues[p.adjust(betaUniformMixtureP) < 0.05] # No significant gene sets TTRUST_TF_pvalues[p.adjust(fishersP) < 0.05] # Quite a few ... too many for so little signal? TTRUST_TF_pvalues %>% ggplot(aes(x = -log(betaUniformMixtureP), y = -log(fishersP))) + geom_point() + labs(title = "Plot to show how much more conservative beta-uniform P-Values are than Fisher's combined P-values", subtitle = "ATF3 knockout study (which has very little signal).\nBonferroni (conservative) multiple hypothesuis correction cut-off shown.") + geom_vline(xintercept = -log(0.05/800), linetype = "dashed", colour = "red") + geom_hline(yintercept = -log(0.05/800), linetype = "dashed", colour = "red")
ACT3ko_geneIDexpress <- koATF3_HCT116_diffexDT[!is.na(geneID), .SD[koATF3_pValue == min(koATF3_pValue, na.rm = T)][1] , by = geneID] ATF3knockoutBUM <- fitBetaUniformMixtureDistribution(ACT3ko_geneIDexpress$koATF3_pValue) plot(ATF3knockoutBUM) # 55+% of calls made at FDR of 0.05 are false positives!! falsePositiveFraction(ATF3knockoutBUM, pValueThreshold = 0.05)/(falsePositiveFraction( ATF3knockoutBUM, pValueThreshold = 0.05) + truePositiveFraction(ATF3knockoutBUM, pValueThreshold = 0.05)) TF_pvals <- ACT3ko_geneIDexpress[TTRUST_TF2targets_DT %>% .[!is.na(geneID), .(TF, geneID)], , on = "geneID"] %>% .[,.(geneSets = list(koATF3_pValue)), by = TF] TF_pvals_list <- TF_pvals[, geneSets] names(TF_pvals_list) <- TF_pvals[, TF] betaUnifPs <- sapply(TF_pvals_list, betaUniformPvalueSumTest, betaUniformFit = ATF3knockoutBUM) fisherPs <- sapply(TF_pvals_list, fishersPvalueSumTest) summaryDT <- data.table(TF = names(TF_pvals_list), metaDEGthP = betaUnifPs, fiserP = fisherPs) summaryDT[, metaDEGthQ := p.adjust(metaDEGthP, "fdr")] summaryDT[, fisherQ := p.adjust(fiserP, "fdr")] summaryDT[order(metaDEGthQ)] summaryDT[metaDEGthQ <= 1E-3] summaryDT[ACT3ko_geneIDexpress, geneID := geneID, on = .(TF == geneSymbol)] ACT3ko_geneIDexpress TTRUST_TF2targets_DT[TF == "ATF3"] %>% merge(summaryDT, by = "geneID") %>% .[!is.na(geneID)] %>% mutate(pQ = -log10(metaDEGthQ)) %>% ggplot() + geom_histogram(aes(x = pQ))
It's good that it recovers signal in p53. But this is far from a comprehensive test ...
What about with BioPlanet
ACT3ko_geneIDexpress <- koATF3_HCT116_diffexDT[!is.na(geneID), .SD[koATF3_pValue == min(koATF3_pValue, na.rm = T)][1] , by = geneID] BioPlanet_pvals <- ACT3ko_geneIDexpress[bioplanetPathwaysDT %>% .[!is.na(geneID), .(BioPlanetName, geneID)], , on = "geneID"] %>% .[,.(geneSets = list(koATF3_pValue)), by = BioPlanetName] BP_pvals_list <- BioPlanet_pvals[, geneSets] names(BP_pvals_list) <- BioPlanet_pvals[, BioPlanetName] betaUnifPs <- sapply(BP_pvals_list, betaUniformPvalueSumTest, betaUniformFit = ATF3knockoutBUM) fisherPs <- sapply(BP_pvals_list, fishersPvalueSumTest) summaryBP_DT <- data.table(TF = names(BP_pvals_list), metaDEGthP = betaUnifPs, fiserP = fisherPs) summaryBP_DT[, metaDEGthQ := p.adjust(metaDEGthP, "fdr")] summaryBP_DT[, fisherQ := p.adjust(fiserP, "fdr")] summaryBP_DT[order(metaDEGthQ)] summaryBP_DT[order(metaDEGthQ)][metaDEGthQ <= 0.01] %>% head
Hmmm. Still tricky to see the precise signal.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.