context("test-88-gxf-loader.R")
test_that("Register GFF3 / GTF works OK", {
# cat("# Now connect to scidb\n")
e0 = tryCatch({rg_connect()}, error = function(e) {e})
if (!("error" %in% class(e0))) { # do not run this on EE installs, mainly targeted for Travis
init_db(arrays_to_init = get_entity_names(), force = TRUE, silent = TRUE)
df_referenceset = data.frame(
name = c("GRCh37", "GRCh38"),
description = "...",
species = 'homo sapiens',
assembly_id = c("GrCh37", "GrCh38_r85"),
source_uri = "...",
source_accessions = "...",
is_derived = TRUE,
stringsAsFactors = FALSE)
cat("Registering reference set:", pretty_print(df_referenceset$name), "\n")
referenceset_id = register_referenceset(df = df_referenceset)
#### ReferencesSets ####
refsets = get_referenceset()
refset37 = refsets[grep("37", refsets$name), ]
refset38 = refsets[grep("38", refsets$name), ]
stopifnot(nrow(refset37) == 1)
stopifnot(nrow(refset38) == 1)
DIR = '/tmp'
##########################################################################
### GFF3 Format 1 -- GRCh38; ENSEMBL identifiers for transcripts and genes
### Download file ####
skipURL1 = TRUE
if (!skipURL1) { # skip test on Travis, as Travis server is unable to reach ftp.ensembl.org
url1 = 'ftp://ftp.ensembl.org/pub/release-97/gff3/homo_sapiens/Homo_sapiens.GRCh38.97.gff3.gz'
basename1 = basename(url1)
filepath1 = file.path(DIR, basename1)
if (exists('status')) rm(status)
status = download.file(url = url1, destfile = filepath1)
if (!exists('status')) stop("Issue downloading file: ", url1)
### Register to scidb at a FeatureSet ####
ftrset_idx = register_featureset(df = data.frame(
name = 'gxf_ftrset1',
referenceset_id = refset38$referenceset_id,
description = '...', source_uri = '...',
stringsAsFactors = FALSE))
ftr_idx = register_features_from_gxf_file(
featureset_name = 'gxf_ftrset1',
register_transcripts = TRUE,
gxf_filepath = filepath1,
gxf_format = 'GFF3'
)
}
##########################################################################
### GFF3 Format 2 -- GRCh37; refseq identifiers for transcripts, Hugo for genes
### Download file ####
skipURL2 = TRUE
if (!skipURL2) {
url2 = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz'
basename2 = basename(url2)
filepath2 = file.path(DIR, basename2)
if (exists('status')) rm(status)
status = download.file(url = url2, destfile = filepath2)
if (!exists('status')) stop("Issue downloading file: ", url2)
### Register to scidb at a FeatureSet ####
ftrset_idx = register_featureset(df = data.frame(
name = basename2,
referenceset_id = refset37$referenceset_id,
description = '...', source_uri = '...',
stringsAsFactors = FALSE))
ftr_idx = register_features_from_gxf_file(
featureset_name = basename2,
register_transcripts = TRUE,
gxf_filepath = filepath2,
gxf_format = 'GFF3'
)
}
##########################################################################
### GTF format 1 -- GRCh38
### Download file ####
skipURL3 = TRUE
if (!skipURL3) {
url3 = 'ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz'
basename3 = basename(url3)
filepath3 = file.path(DIR, basename3)
if (exists('status')) rm(status)
status = download.file(url = url3, destfile = filepath3)
if (!exists('status')) stop("Issue downloading file: ", url3)
### Register to scidb at a FeatureSet ####
ftrset_idx = register_featureset(df = data.frame(
name = basename3,
referenceset_id = refset38$referenceset_id,
description = '...', source_uri = '...',
stringsAsFactors = FALSE))
ftr_idx = register_features_from_gxf_file(
featureset_name = basename3,
register_transcripts = TRUE,
gxf_filepath = filepath3,
gxf_format = 'GTF'
)
}
########################################################################
#### Verification ####
fs = get_featuresets()
# Spot checks
if (!skipURL1) {
fs1 = fs[grep(basename1, fs$name), ]
# Gene EGFR
ftr_df = search_features(gene_symbol = 'EGFR', feature_type = 'gene', featureset_id = fs1$featureset_id)
testthat::expect_equal(nrow(ftr_df), 1)
# Transcripts linked to EGFR
ftr_df = search_features(gene_symbol = 'EGFR', feature_type = 'transcript', featureset_id = fs1$featureset_id)
testthat::expect_equal(nrow(ftr_df), 11)
# Transcripts linked to EGFR, KRAS
ftr_df = search_features(gene_symbol = c('EGFR', 'KRAS'), feature_type = 'transcript', featureset_id = fs1$featureset_id)
testthat::expect_equal(nrow(ftr_df), 15)
# Full downloads
ftr_df = search_features(featureset_id = fs1$featureset_id, mandatory_fields_only = T)
testthat::expect_true(any(grepl("ensembl", unique(ftr_df$source), ignore.case = T)))
table(ftr_df$source)
# ensembl ensembl_havana ensembl_havana_tagene havana havana_tagene insdc mirbase
# 19743 45302 22 196484 22059 37 3758
table(ftr_df$feature_type)
# gene transcript
# 60617 226788
testthat::expect_equal(nrow(ftr_df[ftr_df$feature_type == 'gene', ]), 60617)
testthat::expect_equal(nrow(ftr_df[ftr_df$feature_type == 'transcript', ]), 226788)
}
# Spot checks
if (!skipURL2) {
fs2 = fs[grep(basename2, fs$name), ]
ftr_df = search_features(gene_symbol = 'EGFR', feature_type = 'gene', featureset_id = fs2$featureset_id)
testthat::expect_equal(nrow(ftr_df), 1)
# Transcripts linked to EGFR
ftr_df = search_features(gene_symbol = 'EGFR', feature_type = 'transcript', featureset_id = fs2$featureset_id)
testthat::expect_equal(nrow(ftr_df), 9)
# Transcripts linked to EGFR, KRAS
ftr_df = search_features(gene_symbol = c('EGFR', 'KRAS'), feature_type = 'transcript', featureset_id = fs2$featureset_id)
testthat::expect_equal(nrow(ftr_df), 11)
# Full downloads
ftr_df = search_features(featureset_id = fs2$featureset_id, mandatory_fields_only = T)
testthat::expect_true(any(grepl("RefSeq", unique(ftr_df$source), ignore.case = T)))
table(ftr_df$source)
# BestRefSeq Curated Genomic RefSeq tRNAscan-SE
# 85023 15258 37 428
table(ftr_df$feature_type)
# gene transcript
# 42283 58463
testthat::expect_equal(nrow(ftr_df[ftr_df$feature_type == 'gene', ]), 42283)
testthat::expect_equal(nrow(ftr_df[ftr_df$feature_type == 'transcript', ]), 58463)
}
# Clean up
init_db(arrays_to_init = get_entity_names(), force = TRUE, silent = TRUE)
}
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.