Evaluate whether GC content calculated on the RNA sequences using the Ensembl Perl API matches the one calculated using R and the genomic DNA sequence.

library(ensembldb)

edb <- EnsDb("/Users/jo/Projects/EnsDbs/98/EnsDb.Hsapiens.v98.sqlite")

chrs <- c(1:22, "X", "Y")
txs <- transcripts(edb, filter = ~ seq_name %in% chrs)

any(is.na(mcols(txs)$gc_content))

Get the sequences using R functionality.

library(GenomicFeatures)
dna <- getGenomeTwoBitFile(edb)

tx_exons <- exonsBy(edb, "tx", filter = ~ seq_name %in% chrs)
tx_seqs <- extractTranscriptSeqs(dna, tx_exons)
library(Biostrings)
gc_prop <- letterFrequency(tx_seqs, letters = "GC", as.prob = TRUE)[, 1]
names(gc_prop) <- names(tx_seqs)

Compare GC contents...

gc_prop_perl <- mcols(txs)$gc_content
names(gc_prop_perl) <- names(txs)

stopifnot(all(names(gc_prop) %in% names(gc_prop_perl)))
gc_prop_perl <- gc_prop_perl[names(gc_prop)]

head(gc_prop_perl)
head(gc_prop)

library(testthat)
expect_equal(gc_prop_perl, 100 * gc_prop)


jorainer/ensembldb documentation built on Nov. 21, 2023, 1:45 a.m.