GSE100118 is a cool experiment! CRISPR knockdown of the pluripotency transcription factor OCT4 during human embryogenesis. This is a nice complement to the siRNA and small-molecule peturbagen experiments. Plus it's a dataset generated using CRISPR and human embryos

library(data.table)

SRP113531details <-  fread("http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=SRP113531")

fwrite(SRP113531details, "/mnt/c/GSE73555/SRP113531details.tsv", sep = "\t", quote = FALSE)
time cut -f 1 ../SRP113531details.tsv | grep 'SRR' | parallel -j 5 'fasterq-dump {} -e 4 -t /tmp/scratch && pigz -p 4 {}_*.fastq'

This took quite a while! It's almost 100 RNAseq datasets (notice the zip command after fasterqdump - you will have no disk space left otherwise!)

Whilst this is a single-cell RNAseq data set, it's not from bulk tissue so there is no need to perform the cell-type determination study after the collection of data.

This makes things quite easy - we can treat it as though it were a 'normal' RNAseq experiment.

time cut -f 1 ../SRP113531details.tsv | grep 'SRR' | parallel -j 1 ~/Programs/kallisto/kallisto quant -t 10 -i ../Homo_sapiens.GRCh38.cDNA.idx -o ./{} ../FASTQ/{}_1.fastq.gz ../FASTQ/{}_2.fastq.gz

So. Fast. kallisto is a game changed for analysis pipelines. Took 90 minutes (roughly a minute per sample!).

parsedSOFT_GSE100118 <- getGEO(GEO = "GSE100118", GSEMatrix = FALSE, AnnotGPL = FALSE, getGPL = FALSE)


GSE100118_studyMetadata <- map_dfr(parsedSOFT_GSE100118@gsms,
        ~ {sampleAttr <- .x@header$characteristics_ch1

           experimentDetails <- .x@header$characteristics_ch1

           DT <- data.table(
              study = .x@header$geo_accession,
              sample_type = experimentDetails[grepl("sample type", experimentDetails)],
              condition =  experimentDetails[grepl("embryo", experimentDetails)],
              study_title = str_remove(.x@header$title," \\(RNA-Seq\\)") )

            return(DT)}  )

GSE100118_studyMetadata[, type := str_remove(sample_type,"sample type: ") %>% str_replace_all(" ","_") , by = condition]

GSE100118_studyMetadata[, subgroup := str_remove(condition,"embryo number: ") %>% str_replace_all(" ","_") , by = condition]
GSE100118_studyMetadata[, experiment := str_extract(subgroup, "(Cas9_Injected_control|CRISPR)"), by = subgroup]

setkey(GSE100118_studyMetadata, study_title)


GSE100118_studyMetadata[type != "Single_cell", experiment := "biopsy"]
GSE100118_studyMetadata[type != "Single_cell", subgroup := "biopsy"]
library(data.table)

#Suck out the ENST to ENSG (and gene Symbol) mapping from the GTF file, downloaded to /tmp/
ENST2ENSG <- fread('zcat /tmp/Homo_sapiens.GRCh38.cdna.all.fa.gz | grep "^>" | perl -pe "s/(>|chromosome:|gene:|GRCh38:|gene_biotype:|transcript_biotype:|gene_symbol:)//g" | perl -ne "s/\\s+/\\t/g; CORE::say $_" | cut -f1-7 ' , header = FALSE, fill = TRUE)

colnames(ENST2ENSG) <- c("ENST","havanaStatus","chromosome","ENSG","geneStatus","geneStatus","geneSymbol")

Importing transcript data

library(limma)
library(edgeR)
library(tximport)

projDir <- "/mnt/c/GSE100118/"

# Each kallisto alignment run produces a director (labelled by the SRR ID), with an hdf5 file
# We shall add a column with file locations to our experimental details table

SRP113531details[,path := str_c(projDir,"alignedReads/",Run,"/abundance.tsv")]


GSM2study <- GSE100118_studyMetadata[experiment != "biopsy", study]

# Notice that we pass in a transcript to gene mapping (tx2gene) and that we normalise TPM. See tximport documentation for more details
kallistoTranscriptEstimates <- tximport(files = SRP113531details[SampleName %in% GSM2study, path],
                                        type = "kallisto",
                                        tx2gene = ENST2ENSG[,.(ENST,ENSG)],
                                        countsFromAbundance = "lengthScaledTPM",
                                        dropInfReps = TRUE)
# Could use the hdf5 files for faster import, but installing the tools on centOS is a pig

# A function for computing the geometic mean of a series of observation
# See https://en.wikipedia.org/wiki/Geometric_mean
gm_mean = function(x, na.rm=TRUE){ exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) }

geneAbundances <- data.table(ENSG = row.names(kallistoTranscriptEstimates$abundance),
                          gm_abundance = apply(kallistoTranscriptEstimates$abundance, 1, gm_mean) )

colnames(kallistoTranscriptEstimates$counts) <-  SRP113531details[SampleName %in% GSM2study, SampleName]

# Low quantified transcripts/genes can introduce strange artifacts. Filter these out with a very, very low cutoff. We only want to remove the lowest expressed genes
transcriptDGElist <- kallistoTranscriptEstimates$counts %>% 
  .[ geneAbundances[gm_abundance > 1.01, ENSG], ] %>%
  DGEList
transcriptDGElist %<>% calcNormFactors
GSE100118_designMat <- model.matrix(study ~ 0 + experiment + subgroup,
                                   GSE100118_studyMetadata[experiment != "biopsy"])


### Two additional matricies for testing designs and ensuring that we have the best model
GSE100118_designMat2 <- model.matrix(study ~ 0 + subgroup,
                                   GSE100118_studyMetadata[experiment != "biopsy"])

GSE100118_designMat3 <- model.matrix(study ~ 0 + experiment,
                                   GSE100118_studyMetadata[experiment != "biopsy"])
###

row.names(GSE100118_designMat) <- GSE100118_studyMetadata[experiment != "biopsy",study]

colnames(GSE100118_designMat) %<>% str_remove("experiment")

GSE100118_contrastMat <- makeContrasts(
              contrasts = "CRISPR-Cas9_Injected_control",
              levels = GSE100118_designMat)

voomTrendModelFromCounts <- voom(transcriptDGElist, GSE100118_designMat, plot = TRUE)

modePrefs <- selectModel(voomTrendModelFromCounts, list(GSE100118_designMat, GSE100118_designMat2, GSE100118_designMat3), "aic")
table(modePrefs$pref) # Looks like model one is best. It's certainly the easiest for us to process!



GSE100118_fit <- lmFit(voomTrendModelFromCounts, GSE100118_designMat)

GSE100118_model <- contrasts.fit(GSE100118_fit, contrasts = GSE100118_contrastMat) %>% eBayes

GSE100118_diffexDT <- data.table(topTable(GSE100118_model, number = Inf), keep.rownames = TRUE )
setnames(GSE100118_diffexDT, "rn", "ensembl")

GSE100118_diffexDT[ENST2ENSG, geneSymbol := i.geneSymbol, on = .(ensembl == ENSG)][order(P.Value)]

GSE100118_diffexDT[,ensembl := str_extract(ensembl, "ENSG0\\d+")]

ENSG00000169059_values <- transcriptDGElist$counts[row.names(transcriptDGElist$counts) %like% "ENSG00000169059",] %>% 
  data.frame(counts = .) %>% 
  data.table(keep.rownames = T)

GSE100118_studyMetadata[ENSG00000169059_values, , on =.(study == rn)]
# So tending towards being UP in the CRISPR samples

GSE100118_diffexDT[ensembl == "ENSG00000169059"]
uniprotIDmappings <- fread("zcat /mnt/c/Uniprot/HUMAN_9606_idmapping_selected.tab.gz", header = FALSE)

#We only really care about swissprot mappings
#We only really care about swissprot mappings
swissprot_accessions <- fread("zcat /mnt/c/Uniprot/uniprot_sprot.fasta.gz | grep '^>' | perl -ne 'm/sp\\|(\\w+)\\|/; CORE::say $1'", header = FALSE)

swissprotIDmappingssubset <-  uniprotIDmappings[V1 %in% swissprot_accessions$V1][V19 != "", c(1,2,3,19)]
colnames(swissprotIDmappingssubset) <- c("accession","name","geneID","ENSGgrouped")
swissprotIDmappingssubset %<>% .[,.(ENSG = unlist(tstrsplit(ENSGgrouped, "; "))), by = .(accession, name, geneID, ENSGgrouped)]

GSE100118_diffexDT %<>% merge(
  swissprotIDmappingssubset[,.(accession,name,geneID,ENSG)],
  by.x = "ensembl", by.y = "ENSG", all.x = TRUE)


cas9OCT4_hsZygote_diffexDT <- GSE100118_diffexDT[,.(ENSG = ensembl, geneSymbol, accession, name, geneID,
                                            koOCT4_logFC = logFC, koOCT4_pValue = P.Value)] # Only save important fields

setorder(cas9OCT4_hsZygote_diffexDT, koOCT4_pValue)

#Write the results out to a data file for inclusion in the package
save(cas9OCT4_hsZygote_diffexDT, file = "./data/cas9OCT4_hsZygote_diffexDT.RData", compress = "xz")

cas9OCT4_hsZygote_diffexDT[ p.adjust(koOCT4_pValue,"fdr") < 1E-3][order(-koOCT4_logFC)]
OCTcrisprByGeneID <- cas9OCT4_hsZygote_diffexDT[!is.na(geneID), .(geneID = as.integer(unlist(strsplit(geneID, "; ")))), by = .(ENSG, accession, koOCT4_pValue)]

cas9OCT4_BUMod <- fitBetaUniformMixtureDistribution(OCTcrisprByGeneID$koOCT4_pValue)

plot.BetaUniformModel(cas9OCT4_BUMod, outputFormula = FALSE, outputParameters=FALSE)

OCTcrisprByGeneID[, betaUnifScore_FDR0.05 := betaUniformScore(koOCT4_pValue, cas9OCT4_BUMod)]


OCTcrisprByGeneID[TTRUST_TF2targets_DT[TF == "POU5F1"], , on = "geneID"]

falsePositiveFraction(cas9OCT4_BUMod, pValueThreshold = 0.05)/(falsePositiveFraction( cas9OCT4_BUMod, pValueThreshold = 0.05) + truePositiveFraction(cas9OCT4_BUMod, pValueThreshold = 0.05))

noiseFractionUpperBound(cas9OCT4_BUMod)

TF_pvals <- OCTcrisprByGeneID[  unique(TTRUST_TF2targets_DT[!is.na(geneID), .(TF, geneID)]), , on = "geneID"] %>% 
                    .[!is.na(koOCT4_pValue),.(pValueSet = list(koOCT4_pValue), members = list(geneID), sumScore = sum(betaUnifScore_FDR0.05, na.rm=T)), by = TF]

for(i in 1:nrow(TF_pvals)){

 betaUniformPvalueSumTest( TF_pvals[1,  pValueSet[[1]]], cas9OCT4_BUMod)
}

TF_pvals[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], cas9OCT4_BUMod), by = TF]
TF_pvals[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = TF]

TF_pvals %>%
  ggplot(aes(x = -log10( as.numeric(fishersP)), y = -log10(as.numeric(betaUniformMixtureP)))) +
  geom_point()


TF_pvals[, betaUniformMixtureQ := p.adjust(betaUniformMixtureP, "fdr")]
TF_pvals[, fishersQ := p.adjust(fishersP, "fdr")]

TF_pvals[order(betaUniformMixtureQ)]

TF_pvals[TF == "POU5F1"]

TF_pvals[betaUniformMixtureQ < 0.01]

TF_pvals[order(betaUniformMixtureQ)]

TF_pvals[fishersQ < 0.01]

This is another focused dataset where TTRUST fails to pick out signal (or it's metaDEGth - also possible). I notice that NNANOG, a well known POU5F1 target is in there as diffexed ...

FANTOM5_hs_regcirc <- fread("zcat /mnt/c/regulatory_circutis/Network_compendium/Tissue-specific_regulatory_networks_FANTOM5-v1/32_high-level_networks/20_gastrointestinal_system.txt.gz") # Arbitrary cut off

FANTOM5_hs_regcirc[V3 > 0.2, qplot(V3)] + scale_x_continuous(breaks = seq(0,1,0.1))
cas9OCT4_geneSymExpress <- cas9OCT4_hsZygote_diffexDT[!is.na(geneSymbol), .SD[koOCT4_pValue == min(koOCT4_pValue, na.rm = T)][1] , by = geneSymbol]

cas9OCT4_geneSymExpress[,qplot(koOCT4_pValue, bins = 120)]

OCT4knockout_betaUniformModel <- fitBetaUniformMixtureDistribution(cas9OCT4_geneSymExpress$koOCT4_pValue, nStarts = 20)
# LLH fluctates a bit as we are on the bounds of parameter space!

OCT4knockout_betaUniformModel

noiseFractionUpperBound(OCT4knockout_betaUniformModel)

# If we were to use a P-value cutoff of 0.01, what would the FP rate be?
TP <- truePositiveFraction(OCT4knockout_betaUniformModel, pValueThreshold = 0.01)
FP <- falsePositiveFraction(OCT4knockout_betaUniformModel, pValueThreshold = 0.01)
round(100*FP/(TP+FP), digits = 1) # Over 30% would be false positives!
# Perform meta-analysis using the TTRUST TF->target sets

cas9OCT4_geneSymExpress[, betaUnifScore_FDR0.05 := betaUniformScore(koOCT4_pValue, OCT4knockout_betaUniformModel, FDR = 0.05)]



regCirc_pvalues <- cas9OCT4_geneSymExpress[ unique(FANTOM5_hs_regcirc[V2 > 0.3, .(TF = V1, geneSymbol = V2)]), , on = "geneSymbol"][!is.na(koOCT4_pValue),.(pValueSet = list(koOCT4_pValue), geneSet = list(geneSymbol), scoreSum = sum(betaUnifScore_FDR0.05)), by = TF]


regCirc_pvalues[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], OCT4knockout_betaUniformModel), by = TF]
regCirc_pvalues[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = TF]

regCirc_pvalues[p.adjust(betaUniformMixtureP) < 0.01] # No significant gene sets
regCirc_pvalues[p.adjust(fishersP) < 0.01] # Quite a few ... too many for so little signal?

TTRUST_TF_pvalues[scoreSum > 0]
library(fgsea)

hallmark_pathways <- gmtPathways("/mnt/c/broad_genesets/h.all.v7.0.entrez.gmt")
hallmark_pathways_DT <- map2_dfr(hallmark_pathways, names(hallmark_pathways), ~ data.table(name = .y, geneID = .x))


cannonical_pathways <- gmtPathways("/mnt/c/broad_genesets/c2.cp.v7.0.entrez.gmt")
cannonical_pathways_DT <- map2_dfr(cannonical_pathways, names(cannonical_pathways), ~ data.table(name = .y, geneID = .x))



chemical_genetic_peturbations <- gmtPathways("/mnt/c/broad_genesets/c2.cgp.v7.0.entrez.gmt")
CGP_geneSets_DT <- map2_dfr(chemical_genetic_peturbations, names(chemical_genetic_peturbations), ~ data.table(name = .y, geneID = .x))


cas9OCT4_geneIDexpress <- cas9OCT4_hsZygote_diffex_DT[!is.na(geneID), .SD[koOCT4_pValue == min(koOCT4_pValue, na.rm = T)][1] , by = geneID]

cas9OCT4_geneIDexpress[,qplot(koOCT4_pValue, bins = 120)]

OCT4knockout_betaUniformModel <- fitBetaUniformMixtureDistribution(cas9OCT4_geneIDexpress$koOCT4_pValue, nStarts = 20)

cas9OCT4_geneIDexpress[, betaUnifScore_FDR0.05 := betaUniformScore(koOCT4_pValue, OCT4knockout_betaUniformModel, FDR = 0.05)]



####


braodHallmarkSetsDT <- cas9OCT4_geneSymExpress[ unique(hallmark_pathways_DT[, .(pathway = name, geneID)]), , on = "geneID"][!is.na(koOCT4_pValue),.(pValueSet = list(koOCT4_pValue), geneSet = list(geneSymbol), scoreSum = sum(betaUnifScore_FDR0.05)), by = pathway]


braodHallmarkSetsDT[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], OCT4knockout_betaUniformModel), by = pathway]
braodHallmarkSetsDT[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = pathway]

braodHallmarkSetsDT[p.adjust(betaUniformMixtureP, "fdr") < 0.01] 
braodHallmarkSetsDT[p.adjust(fishersP, "fdr") < 0.05]

braodHallmarkSetsDT[scoreSum > 0]


####

braodGCPsetsDT <- cas9OCT4_geneSymExpress[ unique(CGP_geneSets_DT[, .(pathway = name, geneID)]), , on = "geneID"][!is.na(koOCT4_pValue),.(pValueSet = list(koOCT4_pValue), geneSet = list(geneSymbol), scoreSum = sum(betaUnifScore_FDR0.05)), by = pathway]

braodGCPsetsDT[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], OCT4knockout_betaUniformModel), by = pathway]
braodGCPsetsDT[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = pathway]

braodGCPsetsDT[p.adjust(betaUniformMixtureP) < 0.01] 
braodGCPsetsDT[p.adjust(fishersP) < 0.01]

braodGCPsetsDT[scoreSum > 0]


####

braodCanonocalSetsDT <- cas9OCT4_geneSymExpress[ unique(cannonical_pathways_DT[, .(pathway = name, geneID)]), , on = "geneID"][!is.na(koOCT4_pValue),.(pValueSet = list(koOCT4_pValue), geneSet = list(geneSymbol), scoreSum = sum(betaUnifScore_FDR0.05)), by = pathway]

braodCanonocalSetsDT[, betaUniformMixtureP := betaUniformPvalueSumTest(pValueSet[[1]], ATF3knockout_betaUniformModel), by = pathway]
braodCanonocalSetsDT[, fishersP := fishersPvalueSumTest(pValueSet[[1]]), by = pathway]

braodCanonocalSetsDT[p.adjust(betaUniformMixtureP) < 0.01] 
braodCanonocalSetsDT[p.adjust(fishersP) < 0.01]

braodGCPsetsDT[scoreSum > 0]


adamsardar/metaDEGth documentation built on Jan. 28, 2020, 12:17 a.m.