GSE100118 is a cool experiment! CRISPR knockdown of the pluripotency transcription factor OCT4 during human embryogenesis. This is a nice complement to the siRNA and small-molecule peturbagen experiments. Plus it's a dataset generated using CRISPR and human embryos
library(data.table) SRP113531details <- fread("http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=SRP113531") fwrite(SRP113531details, "~/RNAseq/GSE73555/SRP113531details.tsv", sep = "\t", quote = FALSE)
From within a subdirectory (rawReads):
time cut -f 1 ../SRP113531details.tsv | grep 'SRR' | parallel -j 5 'fasterq-dump {} -e 2 -t /dev/shm && pigz -p 2 {}_*.fastq'
This took quite a while! It's almost 100 RNAseq datasets (notice the zip command after fasterqdump - you will have no disk space left otherwise!). This took ~20 hours on my home internet connection.
Whilst this is a single-cell RNAseq data set, it's not from bulk tissue so there is no need to perform the cell-type determination study after the collection of data.
This makes things quite easy - we can treat it as though it were a 'normal' RNAseq experiment.
time cut -f 1 ../SRP113531details.tsv | grep 'SRR' | parallel -j 1 ~/Programs/kallisto/kallisto quant -t 10 -i ../Homo_sapiens.GRCh38.cDNA.idx -o ./{} ../FASTQ/{}_1.fastq.gz ../FASTQ/{}_2.fastq.gz
So. Fast. kallisto is a game changed for analysis pipelines. Took 90 minutes (roughly a minute per sample!).
library(purrr) library(GEOquery) library(stringr) parsedSOFT_GSE100118 <- getGEO(GEO = "GSE100118", GSEMatrix = FALSE, AnnotGPL = FALSE, getGPL = FALSE) GSE100118_studyMetadata <- map_dfr(parsedSOFT_GSE100118@gsms, ~ {sampleAttr <- .x@header$characteristics_ch1 experimentDetails <- .x@header$characteristics_ch1 DT <- data.table( study = .x@header$geo_accession, sample_type = experimentDetails[grepl("sample type", experimentDetails)], condition = experimentDetails[grepl("embryo", experimentDetails)], study_title = str_remove(.x@header$title," \\(RNA-Seq\\)") ) return(DT)} ) GSE100118_studyMetadata[, type := str_remove(sample_type,"sample type: ") %>% str_replace_all(" ","_") , by = condition] GSE100118_studyMetadata[, subgroup := str_remove(condition,"embryo number: ") %>% str_replace_all(" ","_") , by = condition] GSE100118_studyMetadata[, experiment := str_extract(subgroup, "(Cas9_Injected_control|CRISPR)"), by = subgroup] setkey(GSE100118_studyMetadata, study_title) GSE100118_studyMetadata[type != "Single_cell", experiment := "biopsy"] GSE100118_studyMetadata[type != "Single_cell", subgroup := "biopsy"]
library(data.table) #Suck out the ENST to ENSG (and gene Symbol) mapping from the GTF file, downloaded to /tmp/ ENST2ENSG <- fread('zcat /tmp/Homo_sapiens.GRCh38.cdna.all.fa.gz | grep "^>" | perl -pe "s/(>|chromosome:|gene:|GRCh38:|gene_biotype:|transcript_biotype:|gene_symbol:)//g" | perl -ne "s/\\s+/\\t/g; CORE::say $_" | cut -f1-7 ' , header = FALSE, fill = TRUE) colnames(ENST2ENSG) <- c("ENST","havanaStatus","chromosome","ENSG","geneStatus","geneStatus","geneSymbol")
library(limma) library(edgeR) library(tximport) projDir <- "/mnt/c/GSE100118/" # Each kallisto alignment run produces a director (labelled by the SRR ID), with an hdf5 file # We shall add a column with file locations to our experimental details table SRP113531details[,path := str_c(projDir,"alignedReads/",Run,"/abundance.tsv")] GSM2study <- GSE100118_studyMetadata[experiment != "biopsy", study] # Notice that we pass in a transcript to gene mapping (tx2gene) and that we normalise TPM. See tximport documentation for more details kallistoTranscriptEstimates <- tximport(files = SRP113531details[SampleName %in% GSM2study, path], type = "kallisto", tx2gene = ENST2ENSG[,.(ENST,ENSG)], countsFromAbundance = "lengthScaledTPM", dropInfReps = TRUE) # Could use the hdf5 files for faster import, but installing the tools on centOS is a pig # A function for computing the geometic mean of a series of observation # See https://en.wikipedia.org/wiki/Geometric_mean gm_mean = function(x, na.rm=TRUE){ exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) } geneAbundances <- data.table(ENSG = row.names(kallistoTranscriptEstimates$abundance), gm_abundance = apply(kallistoTranscriptEstimates$abundance, 1, gm_mean) ) colnames(kallistoTranscriptEstimates$counts) <- SRP113531details[SampleName %in% GSM2study, SampleName] # Low quantified transcripts/genes can introduce strange artifacts. Filter these out with a very, very low cutoff. We only want to remove the lowest expressed genes transcriptDGElist <- kallistoTranscriptEstimates$counts %>% .[ geneAbundances[gm_abundance > 1.01, ENSG], ] %>% DGEList transcriptDGElist %<>% calcNormFactors
Using voom to process the scRNAseq, in accordance with literature recommendations here
GSE100118_designMat <- model.matrix(study ~ 0 + experiment + subgroup, GSE100118_studyMetadata[experiment != "biopsy"]) ### Two additional matricies for testing designs and ensuring that we have the best model GSE100118_designMat2 <- model.matrix(study ~ 0 + subgroup, GSE100118_studyMetadata[experiment != "biopsy"]) GSE100118_designMat3 <- model.matrix(study ~ 0 + experiment, GSE100118_studyMetadata[experiment != "biopsy"]) ### row.names(GSE100118_designMat) <- GSE100118_studyMetadata[experiment != "biopsy",study] colnames(GSE100118_designMat) %<>% str_remove("experiment") GSE100118_contrastMat <- makeContrasts( contrasts = "CRISPR-Cas9_Injected_control", levels = GSE100118_designMat) voomTrendModelFromCounts <- voom(transcriptDGElist, GSE100118_designMat, plot = TRUE) modePrefs <- selectModel(voomTrendModelFromCounts, list(GSE100118_designMat, GSE100118_designMat2, GSE100118_designMat3), "aic") table(modePrefs$pref) # Looks like model one is best. It's certainly the easiest for us to process! GSE100118_fit <- lmFit(voomTrendModelFromCounts, GSE100118_designMat) GSE100118_model <- contrasts.fit(GSE100118_fit, contrasts = GSE100118_contrastMat) %>% eBayes GSE100118_diffexDT <- data.table(topTable(GSE100118_model, number = Inf), keep.rownames = TRUE ) setnames(GSE100118_diffexDT, "rn", "ensembl") GSE100118_diffexDT[ENST2ENSG, geneSymbol := i.geneSymbol, on = .(ensembl == ENSG)][order(P.Value)] GSE100118_diffexDT[,ensembl := str_extract(ensembl, "ENSG0\\d+")] ENSG00000169059_values <- transcriptDGElist$counts[row.names(transcriptDGElist$counts) %like% "ENSG00000169059",] %>% data.frame(counts = .) %>% data.table(keep.rownames = T) GSE100118_studyMetadata[ENSG00000169059_values, , on =.(study == rn)] # So tending towards being UP in the CRISPR samples GSE100118_diffexDT[ensembl == "ENSG00000169059"]
uniprotIDmappings <- fread("zcat /mnt/c/Uniprot/HUMAN_9606_idmapping_selected.tab.gz", header = FALSE) #We only really care about swissprot mappings #We only really care about swissprot mappings swissprot_accessions <- fread("zcat /mnt/c/Uniprot/uniprot_sprot.fasta.gz | grep '^>' | perl -ne 'm/sp\\|(\\w+)\\|/; CORE::say $1'", header = FALSE) swissprotIDmappingssubset <- uniprotIDmappings[V1 %in% swissprot_accessions$V1][V19 != "", c(1,2,3,19)] colnames(swissprotIDmappingssubset) <- c("accession","name","geneID","ENSGgrouped") swissprotIDmappingssubset %<>% .[,.(ENSG = unlist(tstrsplit(ENSGgrouped, "; "))), by = .(accession, name, geneID, ENSGgrouped)] GSE100118_diffexDT %<>% merge( swissprotIDmappingssubset[,.(accession,name,geneID,ENSG)], by.x = "ensembl", by.y = "ENSG", all.x = TRUE) cas9OCT4_hsZygote_diffexDT <- GSE100118_diffexDT[,.(ENSG = ensembl, geneSymbol, accession, name, geneID, koOCT4_logFC = logFC, koOCT4_pValue = P.Value)] # Only save important fields setorder(cas9OCT4_hsZygote_diffexDT, koOCT4_pValue) #Write the results out to a data file for inclusion in the package save(cas9OCT4_hsZygote_diffexDT, file = "./data/cas9OCT4_hsZygote_diffexDT.RData", compress = "xz") cas9OCT4_hsZygote_diffexDT[ p.adjust(koOCT4_pValue,"fdr") < 1E-3][order(-koOCT4_logFC)]
I notice that NNANOG, a well known POU5F1 target is in there as diffexed ...
Download raw rpkm and compare (to be sure)
rpkmMatMetaData <- fread("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100118/suppl/GSE100118_scRNA_pou5f1crispr_rpkm_170603.csv.gz", nrows = 3) embryoIndex <- unlist(rpkmMatMetaData[2,]) koOCT4rpkm <- fread("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100118/suppl/GSE100118_scRNA_pou5f1crispr_rpkm_170603.csv.gz", header = FALSE, fill = T, skip = 3) setkey(GSE100118_studyMetadata, study_title) GSE100118_studyMetadata[embryoIndex[-1:-2]][is.na(type)] #Need to manually fix the mislabelled columns! embryoIndex[embryoIndex == "8.6."] <- "8.6" embryoIndex[embryoIndex == "C8.TE2"] <- "C8.TE.2" embryoIndex[embryoIndex == "C8.Teb"] <- "C8.TEb" embryoIndex[embryoIndex == "C12.8."] <- "C12.8" names(koOCT4rpkm) <- c(c("ENSG","gene"), GSE100118_studyMetadata[embryoIndex[-1:-2], study]) koOCT4rpkm[1:4,1:5]
This frame is detailed as RPKM from the study authors - but the reads are paired, so surely it's FPKM see here
koOCT4tpm <-
http://luisvalesilva.com/datasimple/rna-seq_units.html#:~:text=Since%20RPKM%20actually%20builds%20on,to%20sequencing%20depth%2Dnormalized%20counts.
https://liorpachter.wordpress.com/tag/tpm/
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.