GSE100118 is a cool experiment! CRISPR knockdown of the pluripotency transcription factor OCT4 during human embryogenesis. This is a nice complement to the siRNA and small-molecule peturbagen experiments. Plus it's a dataset generated using CRISPR and human embryos

library(data.table)

SRP113531details <-  fread("http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=SRP113531")

fwrite(SRP113531details, "~/RNAseq/GSE73555/SRP113531details.tsv", sep = "\t", quote = FALSE)

From within a subdirectory (rawReads):

time cut -f 1 ../SRP113531details.tsv | grep 'SRR' | parallel -j 5 'fasterq-dump {} -e 2 -t /dev/shm && pigz -p 2 {}_*.fastq'

This took quite a while! It's almost 100 RNAseq datasets (notice the zip command after fasterqdump - you will have no disk space left otherwise!). This took ~20 hours on my home internet connection.

Whilst this is a single-cell RNAseq data set, it's not from bulk tissue so there is no need to perform the cell-type determination study after the collection of data.

This makes things quite easy - we can treat it as though it were a 'normal' RNAseq experiment.

time cut -f 1 ../SRP113531details.tsv | grep 'SRR' | parallel -j 1 ~/Programs/kallisto/kallisto quant -t 10 -i ../Homo_sapiens.GRCh38.cDNA.idx -o ./{} ../FASTQ/{}_1.fastq.gz ../FASTQ/{}_2.fastq.gz

So. Fast. kallisto is a game changed for analysis pipelines. Took 90 minutes (roughly a minute per sample!).

library(purrr)
library(GEOquery)
library(stringr)

parsedSOFT_GSE100118 <- getGEO(GEO = "GSE100118", GSEMatrix = FALSE, AnnotGPL = FALSE, getGPL = FALSE)


GSE100118_studyMetadata <- map_dfr(parsedSOFT_GSE100118@gsms,
        ~ {sampleAttr <- .x@header$characteristics_ch1

           experimentDetails <- .x@header$characteristics_ch1

           DT <- data.table(
              study = .x@header$geo_accession,
              sample_type = experimentDetails[grepl("sample type", experimentDetails)],
              condition =  experimentDetails[grepl("embryo", experimentDetails)],
              study_title = str_remove(.x@header$title," \\(RNA-Seq\\)") )

            return(DT)}  )

GSE100118_studyMetadata[, type := str_remove(sample_type,"sample type: ") %>% str_replace_all(" ","_") , by = condition]

GSE100118_studyMetadata[, subgroup := str_remove(condition,"embryo number: ") %>% str_replace_all(" ","_") , by = condition]
GSE100118_studyMetadata[, experiment := str_extract(subgroup, "(Cas9_Injected_control|CRISPR)"), by = subgroup]

setkey(GSE100118_studyMetadata, study_title)


GSE100118_studyMetadata[type != "Single_cell", experiment := "biopsy"]
GSE100118_studyMetadata[type != "Single_cell", subgroup := "biopsy"]
library(data.table)

#Suck out the ENST to ENSG (and gene Symbol) mapping from the GTF file, downloaded to /tmp/
ENST2ENSG <- fread('zcat /tmp/Homo_sapiens.GRCh38.cdna.all.fa.gz | grep "^>" | perl -pe "s/(>|chromosome:|gene:|GRCh38:|gene_biotype:|transcript_biotype:|gene_symbol:)//g" | perl -ne "s/\\s+/\\t/g; CORE::say $_" | cut -f1-7 ' , header = FALSE, fill = TRUE)

colnames(ENST2ENSG) <- c("ENST","havanaStatus","chromosome","ENSG","geneStatus","geneStatus","geneSymbol")

Importing transcript data

library(limma)
library(edgeR)
library(tximport)

projDir <- "/mnt/c/GSE100118/"

# Each kallisto alignment run produces a director (labelled by the SRR ID), with an hdf5 file
# We shall add a column with file locations to our experimental details table

SRP113531details[,path := str_c(projDir,"alignedReads/",Run,"/abundance.tsv")]


GSM2study <- GSE100118_studyMetadata[experiment != "biopsy", study]

# Notice that we pass in a transcript to gene mapping (tx2gene) and that we normalise TPM. See tximport documentation for more details
kallistoTranscriptEstimates <- tximport(files = SRP113531details[SampleName %in% GSM2study, path],
                                        type = "kallisto",
                                        tx2gene = ENST2ENSG[,.(ENST,ENSG)],
                                        countsFromAbundance = "lengthScaledTPM",
                                        dropInfReps = TRUE)
# Could use the hdf5 files for faster import, but installing the tools on centOS is a pig

# A function for computing the geometic mean of a series of observation
# See https://en.wikipedia.org/wiki/Geometric_mean
gm_mean = function(x, na.rm=TRUE){ exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) }

geneAbundances <- data.table(ENSG = row.names(kallistoTranscriptEstimates$abundance),
                          gm_abundance = apply(kallistoTranscriptEstimates$abundance, 1, gm_mean) )

colnames(kallistoTranscriptEstimates$counts) <-  SRP113531details[SampleName %in% GSM2study, SampleName]

# Low quantified transcripts/genes can introduce strange artifacts. Filter these out with a very, very low cutoff. We only want to remove the lowest expressed genes
transcriptDGElist <- kallistoTranscriptEstimates$counts %>% 
  .[ geneAbundances[gm_abundance > 1.01, ENSG], ] %>%
  DGEList
transcriptDGElist %<>% calcNormFactors

Using voom to process the scRNAseq, in accordance with literature recommendations here

GSE100118_designMat <- model.matrix(study ~ 0 + experiment + subgroup,
                                   GSE100118_studyMetadata[experiment != "biopsy"])


### Two additional matricies for testing designs and ensuring that we have the best model
GSE100118_designMat2 <- model.matrix(study ~ 0 + subgroup,
                                   GSE100118_studyMetadata[experiment != "biopsy"])

GSE100118_designMat3 <- model.matrix(study ~ 0 + experiment,
                                   GSE100118_studyMetadata[experiment != "biopsy"])
###

row.names(GSE100118_designMat) <- GSE100118_studyMetadata[experiment != "biopsy",study]

colnames(GSE100118_designMat) %<>% str_remove("experiment")

GSE100118_contrastMat <- makeContrasts(
              contrasts = "CRISPR-Cas9_Injected_control",
              levels = GSE100118_designMat)

voomTrendModelFromCounts <- voom(transcriptDGElist, GSE100118_designMat, plot = TRUE)

modePrefs <- selectModel(voomTrendModelFromCounts, list(GSE100118_designMat, GSE100118_designMat2, GSE100118_designMat3), "aic")
table(modePrefs$pref) # Looks like model one is best. It's certainly the easiest for us to process!



GSE100118_fit <- lmFit(voomTrendModelFromCounts, GSE100118_designMat)

GSE100118_model <- contrasts.fit(GSE100118_fit, contrasts = GSE100118_contrastMat) %>% eBayes

GSE100118_diffexDT <- data.table(topTable(GSE100118_model, number = Inf), keep.rownames = TRUE )
setnames(GSE100118_diffexDT, "rn", "ensembl")

GSE100118_diffexDT[ENST2ENSG, geneSymbol := i.geneSymbol, on = .(ensembl == ENSG)][order(P.Value)]

GSE100118_diffexDT[,ensembl := str_extract(ensembl, "ENSG0\\d+")]

ENSG00000169059_values <- transcriptDGElist$counts[row.names(transcriptDGElist$counts) %like% "ENSG00000169059",] %>% 
  data.frame(counts = .) %>% 
  data.table(keep.rownames = T)

GSE100118_studyMetadata[ENSG00000169059_values, , on =.(study == rn)]
# So tending towards being UP in the CRISPR samples

GSE100118_diffexDT[ensembl == "ENSG00000169059"]
uniprotIDmappings <- fread("zcat /mnt/c/Uniprot/HUMAN_9606_idmapping_selected.tab.gz", header = FALSE)

#We only really care about swissprot mappings
#We only really care about swissprot mappings
swissprot_accessions <- fread("zcat /mnt/c/Uniprot/uniprot_sprot.fasta.gz | grep '^>' | perl -ne 'm/sp\\|(\\w+)\\|/; CORE::say $1'", header = FALSE)

swissprotIDmappingssubset <-  uniprotIDmappings[V1 %in% swissprot_accessions$V1][V19 != "", c(1,2,3,19)]
colnames(swissprotIDmappingssubset) <- c("accession","name","geneID","ENSGgrouped")
swissprotIDmappingssubset %<>% .[,.(ENSG = unlist(tstrsplit(ENSGgrouped, "; "))), by = .(accession, name, geneID, ENSGgrouped)]

GSE100118_diffexDT %<>% merge(
  swissprotIDmappingssubset[,.(accession,name,geneID,ENSG)],
  by.x = "ensembl", by.y = "ENSG", all.x = TRUE)


cas9OCT4_hsZygote_diffexDT <- GSE100118_diffexDT[,.(ENSG = ensembl, geneSymbol, accession, name, geneID,
                                            koOCT4_logFC = logFC, koOCT4_pValue = P.Value)] # Only save important fields

setorder(cas9OCT4_hsZygote_diffexDT, koOCT4_pValue)

#Write the results out to a data file for inclusion in the package
save(cas9OCT4_hsZygote_diffexDT, file = "./data/cas9OCT4_hsZygote_diffexDT.RData", compress = "xz")

cas9OCT4_hsZygote_diffexDT[ p.adjust(koOCT4_pValue,"fdr") < 1E-3][order(-koOCT4_logFC)]

I notice that NNANOG, a well known POU5F1 target is in there as diffexed ...

Comparison againg provided values

Download raw rpkm and compare (to be sure)

rpkmMatMetaData <- fread("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100118/suppl/GSE100118_scRNA_pou5f1crispr_rpkm_170603.csv.gz", nrows = 3)

embryoIndex <- unlist(rpkmMatMetaData[2,])

koOCT4rpkm <- fread("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100118/suppl/GSE100118_scRNA_pou5f1crispr_rpkm_170603.csv.gz", header = FALSE, fill = T, skip = 3)


setkey(GSE100118_studyMetadata, study_title)

GSE100118_studyMetadata[embryoIndex[-1:-2]][is.na(type)]
#Need to manually fix the mislabelled columns!

embryoIndex[embryoIndex == "8.6."] <- "8.6"
embryoIndex[embryoIndex == "C8.TE2"] <- "C8.TE.2"
embryoIndex[embryoIndex == "C8.Teb"] <- "C8.TEb"
embryoIndex[embryoIndex == "C12.8."] <- "C12.8"

names(koOCT4rpkm) <- c(c("ENSG","gene"), GSE100118_studyMetadata[embryoIndex[-1:-2], study])

koOCT4rpkm[1:4,1:5]

This frame is detailed as RPKM from the study authors - but the reads are paired, so surely it's FPKM see here

koOCT4tpm <- 

TODO add gene length to frame, then use edgeR to convert to CPM

http://luisvalesilva.com/datasimple/rna-seq_units.html#:~:text=Since%20RPKM%20actually%20builds%20on,to%20sequencing%20depth%2Dnormalized%20counts.

https://liorpachter.wordpress.com/tag/tpm/



adamsardar/metaDEGth documentation built on Sept. 28, 2022, 10:12 p.m.