tests/testthat/test88-gxf-loader.R

context("test-88-gxf-loader.R")

test_that("Register GFF3 / GTF works OK", {
  # cat("# Now connect to scidb\n")
  e0 = tryCatch({rg_connect()}, error = function(e) {e})
  if (!("error" %in% class(e0))) { # do not run this on EE installs, mainly targeted for Travis
    init_db(arrays_to_init = get_entity_names(), force = TRUE, silent = TRUE)
    df_referenceset = data.frame(
      name = c("GRCh37", "GRCh38"), 
      description = "...",
      species = 'homo sapiens',
      assembly_id = c("GrCh37", "GrCh38_r85"),
      source_uri = "...",
      source_accessions = "...",
      is_derived = TRUE, 
      stringsAsFactors = FALSE)
    
    cat("Registering reference set:", pretty_print(df_referenceset$name), "\n")
    referenceset_id = register_referenceset(df = df_referenceset)
    
    #### ReferencesSets ####
    refsets = get_referenceset()
    refset37 = refsets[grep("37", refsets$name), ]
    refset38 = refsets[grep("38", refsets$name), ]
    stopifnot(nrow(refset37) == 1)
    stopifnot(nrow(refset38) == 1)
    
    DIR = '/tmp'
    
    ##########################################################################
    ### GFF3 Format 1 -- GRCh38; ENSEMBL identifiers for transcripts and genes
    ### Download file ####
    skipURL1 = TRUE
    if (!skipURL1) { # skip test on Travis, as Travis server is unable to reach ftp.ensembl.org
      url1 = 'ftp://ftp.ensembl.org/pub/release-97/gff3/homo_sapiens/Homo_sapiens.GRCh38.97.gff3.gz'
      basename1 = basename(url1)
      filepath1 = file.path(DIR, basename1)
      if (exists('status')) rm(status)
      status = download.file(url = url1, destfile = filepath1)
      if (!exists('status')) stop("Issue downloading file: ", url1)
      
      ### Register to scidb at a FeatureSet ####
      ftrset_idx = register_featureset(df = data.frame(
        name = 'gxf_ftrset1', 
        referenceset_id = refset38$referenceset_id, 
        description = '...', source_uri = '...', 
        stringsAsFactors = FALSE))
      ftr_idx = register_features_from_gxf_file(
        featureset_name = 'gxf_ftrset1', 
        register_transcripts = TRUE, 
        gxf_filepath = filepath1, 
        gxf_format = 'GFF3'
      )
    }
    
    ##########################################################################
    ### GFF3 Format 2 -- GRCh37; refseq identifiers for transcripts, Hugo for genes
    ### Download file ####
    skipURL2 = TRUE
    if (!skipURL2) {
      url2 = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz'
      basename2 = basename(url2)
      filepath2 = file.path(DIR, basename2)
      if (exists('status')) rm(status)
      status = download.file(url = url2, destfile = filepath2)
      if (!exists('status')) stop("Issue downloading file: ", url2)
      
      ### Register to scidb at a FeatureSet ####
      ftrset_idx = register_featureset(df = data.frame(
        name = basename2, 
        referenceset_id = refset37$referenceset_id, 
        description = '...', source_uri = '...', 
        stringsAsFactors = FALSE))
      ftr_idx = register_features_from_gxf_file(
        featureset_name = basename2, 
        register_transcripts = TRUE, 
        gxf_filepath = filepath2, 
        gxf_format = 'GFF3'
      )
    }

    ##########################################################################
    ### GTF format 1 -- GRCh38
    ### Download file ####
    skipURL3 = TRUE
    if (!skipURL3) {
      url3 = 'ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz'
      basename3 = basename(url3)
      filepath3 = file.path(DIR, basename3)
      if (exists('status')) rm(status)
      status = download.file(url = url3, destfile = filepath3)
      if (!exists('status')) stop("Issue downloading file: ", url3)
      
      ### Register to scidb at a FeatureSet ####
      ftrset_idx = register_featureset(df = data.frame(
        name = basename3, 
        referenceset_id = refset38$referenceset_id, 
        description = '...', source_uri = '...', 
        stringsAsFactors = FALSE))
      ftr_idx = register_features_from_gxf_file(
        featureset_name = basename3, 
        register_transcripts = TRUE, 
        gxf_filepath = filepath3, 
        gxf_format = 'GTF'
      )
    }
    
    
    ########################################################################
    #### Verification ####
    fs = get_featuresets()
    
    # Spot checks
    if (!skipURL1) {
      fs1 = fs[grep(basename1, fs$name), ]
      # Gene EGFR
      ftr_df = search_features(gene_symbol = 'EGFR', feature_type = 'gene', featureset_id = fs1$featureset_id)
      testthat::expect_equal(nrow(ftr_df), 1)
      # Transcripts linked to EGFR
      ftr_df = search_features(gene_symbol = 'EGFR', feature_type = 'transcript', featureset_id = fs1$featureset_id)
      testthat::expect_equal(nrow(ftr_df), 11)
      # Transcripts linked to EGFR, KRAS
      ftr_df = search_features(gene_symbol = c('EGFR', 'KRAS'), feature_type = 'transcript', featureset_id = fs1$featureset_id)
      testthat::expect_equal(nrow(ftr_df), 15)
      
      # Full downloads
      ftr_df = search_features(featureset_id = fs1$featureset_id, mandatory_fields_only = T)
      testthat::expect_true(any(grepl("ensembl", unique(ftr_df$source), ignore.case = T)))
      table(ftr_df$source)
      #              ensembl        ensembl_havana ensembl_havana_tagene                havana         havana_tagene                 insdc               mirbase 
      #                19743                 45302                    22                196484                 22059                    37                  3758 
      table(ftr_df$feature_type)
      #       gene transcript 
      #      60617     226788
      testthat::expect_equal(nrow(ftr_df[ftr_df$feature_type == 'gene', ]), 60617)
      testthat::expect_equal(nrow(ftr_df[ftr_df$feature_type == 'transcript', ]), 226788)
    }
    
    # Spot checks
    if (!skipURL2) {
      fs2 = fs[grep(basename2, fs$name), ]
      ftr_df = search_features(gene_symbol = 'EGFR', feature_type = 'gene', featureset_id = fs2$featureset_id)
      testthat::expect_equal(nrow(ftr_df), 1)
      # Transcripts linked to EGFR
      ftr_df = search_features(gene_symbol = 'EGFR', feature_type = 'transcript', featureset_id = fs2$featureset_id)
      testthat::expect_equal(nrow(ftr_df), 9)
      # Transcripts linked to EGFR, KRAS
      ftr_df = search_features(gene_symbol = c('EGFR', 'KRAS'), feature_type = 'transcript', featureset_id = fs2$featureset_id)
      testthat::expect_equal(nrow(ftr_df), 11)
      
      # Full downloads
      ftr_df = search_features(featureset_id = fs2$featureset_id, mandatory_fields_only = T)
      testthat::expect_true(any(grepl("RefSeq", unique(ftr_df$source), ignore.case = T)))
      table(ftr_df$source)
      # BestRefSeq Curated Genomic          RefSeq     tRNAscan-SE 
      #      85023           15258              37             428 
      table(ftr_df$feature_type)
      #       gene transcript 
      #      42283      58463 
      testthat::expect_equal(nrow(ftr_df[ftr_df$feature_type == 'gene', ]), 42283)
      testthat::expect_equal(nrow(ftr_df[ftr_df$feature_type == 'transcript', ]), 58463)
    } 
    
    # Clean up
    init_db(arrays_to_init = get_entity_names(), force = TRUE, silent = TRUE)
  }
})
Paradigm4/revealgenomics documentation built on April 7, 2020, 2:01 a.m.