tests/testthat/test99-api-loader.R

context("test-api-loader.R")
library(ggplot2)

test_that("Register entities via workbook works OK", {
  # cat("# Now connect to scidb\n")
  e0 = tryCatch({rg_connect()}, error = function(e) {e})
  if (!("error" %in% class(e0))) { # do not run this on EE installs, mainly targeted for Travis
    init_db(arrays_to_init = get_entity_names(), force = TRUE, silent = TRUE)
    df_referenceset = data.frame(
      name = c("GRCh37", "GRCh38"), 
      description = "...",
      species = 'homo sapiens',
      assembly_id = c("GrCh37", "GrCh38_r85"),
      source_uri = "...",
      source_accessions = "...",
      is_derived = TRUE, 
      stringsAsFactors = FALSE)
    
    cat("Registering reference set:", pretty_print(df_referenceset$name), "\n")
    referenceset_id = register_referenceset(df = df_referenceset)
    
    wb = myExcelLoader(
      filename = system.file(
        "extdata", "scidb_metadata_sample.xlsx", package = "revealgenomics"))
    
    # Load metadata first
    register_entities_workbook(workbook = wb, 
                               register_upto_entity = 'MEASUREMENT')
    
    ##### Spot checks on Dataset #####
    # Spot checks that ontology fields of study have been captured properly
    datasets = get_datasets()
    expect_true(all(!is.na(datasets$study_type)))
    expect_true(all(!is.na(datasets$DAS)))
    expect_true(all(datasets$DAS %in% wb$Studies$DAS))
    expect_true(all(datasets$study_type %in% wb$Studies$study_type))
    
    # Check that get_projects() allows download of mandatory fields only
    p1 = get_projects()
    p2 = get_projects(mandatory_fields_only = TRUE)
    expect_equal(nrow(p1), nrow(p2))
    expect_equal(sort(p1$name), sort(p2$name))
    
    # Similar spot checks using `search_datasets()` function
    p = get_projects()
    stopifnot(nrow(p) == 1) # Expect one project to be loaded at this time
    project_id = p[1, ]$project_id
    datasets = search_datasets(project_id = project_id)
    expect_true(all(!is.na(datasets$study_type)))
    expect_true(all(!is.na(datasets$DAS)))
    expect_true(all(datasets$DAS %in% wb$Studies$DAS))
    expect_true(all(datasets$study_type %in% wb$Studies$study_type))
    
    ######## Update Study ##########
    wb$Studies$URL[2] = "https://clinicaltrials.gov/ct2/show/zzzz" # edit an existing field
    wb$Studies$new_field1 = c('entry1', 'entry2') # add a new column
    register_entities_workbook(workbook = wb, 
                               register_upto_entity = 'DEFINITION', 
                               entity_to_update = 'DATASET')
    datasets = get_datasets()
    expect_true(length(grep("zzzz", datasets$URL)) ==1)
    expect_true(all(datasets$new_field1 %in% c('entry1', 'entry2')))
    
    ######## Update Individual ##########
    ## Edit an entry
    rand_idx = 1
    test_disease_str = 'test disease 1'
    wb$Subjects[rand_idx, ]$primary_disease = test_disease_str
    
    ## Add a new column
    rand_string = stringi::stri_rand_strings(nrow(wb$Subjects), length = 6)
    wb$Subjects$new_field1 = rand_string
    # Introduce new definition for the new field
    new_definition = wb$Definitions[wb$Definitions$attribute_name == 'subject_id', ]
    new_definition$attribute_name = 'new_field1'
    new_definition$attribute_in_Samples = FALSE
    wb$Definitions = rbind(
      wb$Definitions, 
      new_definition
      )
    register_entities_workbook(workbook = wb, 
                               register_upto_entity = 'INDIVIDUAL', 
                               entity_to_update = 'INDIVIDUAL')
    
    # Spot checks for the updates
    ii = get_individuals()
    expect_true(all(ii$new_field1 %in% rand_string))
    expect_equal(
      length(which(ii$primary_disease == test_disease_str)),
      1)
    
    ######## Update MeasurementSet (trickier) ##########
    ## Edit an entry related to `pipeline_choices` sheet;
    # `pipeline_applications` column: "Defuse" ==> "deFuse"
    wb$pipeline_choices[wb$pipeline_choices$pipeline_scidb 
                        == '[external]-[Fusion] Defuse', 
                        ]$pipeline_applications = "deFuse"
    # Edit an entry related to `filter_choices` sheet;
    # `filter_description2` column 
    rand_string2 = stringi::stri_rand_strings(n = 1, length = 10)
    filter_name = 'gene - counts'
    wb$filter_choices[wb$filter_choices$filter_name 
                        == filter_name, 
                        ]$filter_description2 = rand_string2
    register_entities_workbook(workbook = wb, 
                               register_upto_entity = 'MEASUREMENTSET', 
                               entity_to_update = 'MEASUREMENTSET')
    
    # Spot checks
    ms = get_measurementsets()
    expect_equal(
      length(which(ms$pipeline_applications == "deFuse")),
      1)
    expect_true(
      all(ms[ms$filter_name == filter_name, ]$filter_description2 == rand_string2))
    
    #===========================================================================
    #### Geneset based data ####
    fsets = get_featuresets()
    stopifnot(nrow(fsets) == 3)
    fset = fsets[grep("geneset 1", fsets$name), ]
    stopifnot(nrow(fset) == 1)
    
    # Register genesets used for loading geneset based test data
    fidx = register_feature(
      df1 = data.frame(
        name = paste0("geneset_", 1:10), gene_symbol = 'NA', 
        featureset_id = fset$featureset_id, 
        chromosome = 'NA', start = 'NA', end = 'NA', 
        feature_type = 'geneset', source = 'XXX Biosciences'
      )
    )
    # Side-step and check that search features by type works
    fres = search_features(feature_type = 'geneset')
    testthat::expect_identical(sort(fidx$feature_id), sort(fres$feature_id))
    
    # Load data
    register_entities_workbook(workbook = wb, pipeline_name_filter = "continuous")
    
    # Check download
    id1 = find_dataset_id_by_grep("study_a")
    stopifnot(length(id1) == 1)
    ms1 = search_measurementsets(dataset_id = id1)
    ms1_cont = ms1[grep("continuous", ms1$name), ]
    
    expr1 = search_expression(measurementset = ms1_cont)
    testthat::expect_equal(as.character(class(expr1)), 'ExpressionSet')
    testthat::expect_equal(dim(exprs(expr1)), c(5, 5))
    
    expr1_df = search_expression(measurementset = ms1_cont, formExpressionSet = F)
    testthat::expect_equal(as.character(class(expr1_df)), 'data.frame')
    testthat::expect_equal(dim(expr1_df), c(25, 3))
    testthat::expect_true(all(c('value', 'biosample_id', 'feature_id') %in% colnames(expr1_df)))

    # Search by features
    testthat::expect_equal(
      dim(exprs(search_expression(measurementset = ms1_cont, 
                                  feature = fres[grep("geneset_4|geneset_5", fres$name), ]))), 
      c(2, 5)) # 2 features, 5 biosamples

    #===========================================================================
    #### Geneset (categorical) ####
    register_entities_workbook(workbook = wb, pipeline_name_filter = "categorical")
    
    # Now test download
    ms1_categ = ms1[grep("categorical", ms1$name), ]
    
    expr2_df = search_expression(measurementset = ms1_categ, formExpressionSet = F)
    testthat::expect_equal(as.character(class(expr2_df)), 'data.frame')
    testthat::expect_equal(dim(expr2_df), c(25, 3))
    testthat::expect_true(all(c('value', 'biosample_id', 'feature_id') %in% colnames(expr2_df)))
    testthat::expect_equal(class(expr2_df$value), 'character')

    # Download same data as an ExpressionSet
    expr2 = search_expression(measurementset = ms1_categ, formExpressionSet = TRUE)
    testthat::expect_equal(class(expr2)[1], "ExpressionSet")
    testthat::expect_equal(dim(exprs(expr2)), c(5, 5))
    testthat::expect_equal(expr2@featureData@data$name, paste0("geneset_", c(1:5)))
    testthat::expect_equal(class(as.vector(exprs(expr2))), "character")
    
    # Search by features
    testthat::expect_equal(
      dim(search_expression(measurementset = ms1_categ, 
                                  feature = fres[grep("geneset_4|geneset_5", fres$name), ], 
                                  formExpressionSet = FALSE)), 
      c(10, 6)) # data.frame containing 5 samples * 2 features

    ##### FeatureSet #####
    # Build up a featureset to be used for loading data
    target_featureset_id = fsets[grep("37", fsets$name), ]$featureset_id
    ftr_record = build_reference_gene_set(
      featureset_id = target_featureset_id, 
      gene_annotation_file_path = system.file("extdata", 
                                              "gene__hugo__hgnc_complete_set.txt.gz", package="revealgenomics"), # these are grch38 files but we use this to populate the featureset anyway
      gene_location_file_path = system.file("extdata", 
                                            "gene__grch38_release85_homosap_gene__newGene.tsv.gz", package="revealgenomics") # again a grch38 file
                                          ) 
    
    target_featureset_id = fsets[grep("38", fsets$name), ]$featureset_id
    ftr_record = build_reference_gene_set(featureset_id = target_featureset_id)
    
    ########### FUSION DATA ############
    # FMI fusion data has symbol `N/A`, handle it specially
    # FMI fusion data is registered at featureset GRCh37
    fsets = get_featuresets()
    fset = fsets[grep("37", fsets$name), ]
    
    fid = register_feature(df1 = data.frame(
      name = 'N/A', gene_symbol = 'NA', 
      featureset_id = fset$featureset_id, 
      chromosome = 'NA', start = 'NA', end = 'NA', 
      feature_type = 'gene', source = 'FMI_fusion_data', 
      stringsAsFactors = FALSE
    ))
    # Now load the data
    register_entities_workbook(workbook = wb, 
                               register_measurement_entity = 'FUSION')
    
    # Now do some checks on the data load
    ms = get_measurementsets()
    
    # TODO: walk all pipelines and measurementset IDs to verify data.
    ftrs = search_features(gene_symbol = c('TXNIP'))
    if (nrow(ftrs) > 2) {
      stop("If more than two features for TXNIP at this point; need to adjust test")
    }
    mn = ms[ms$pipeline_scidb == '[external]-[Fusion] Tophat Fusion',]
    mn = mn[grep("37", mn$featureset_name), ]
    v1 = search_fusion(measurementset = mn[1, ], feature = ftrs)
    cat('TXNIP feature search\n')
    expect_true(all.equal(dim(v1), c(3, 15)))
    
    ftrs = search_features(gene_symbol = c('IGH'))
    mn = ms[grep("Fusion.*Foundation.*Heme", ms$pipeline_scidb),]
    v1 = search_fusion(measurementset = mn, feature = ftrs)
    cat('IGH feature search\n')
    expect_true(all.equal(dim(v1), c(2, 26)))
    
    # FMI format 2
    ftrs = search_features(gene_symbol = c('TMPRSS2'))
    mn = ms[grep("Fusion.*Foundation.*Assay", ms$pipeline_scidb),]
    v1 = search_fusion(measurementset = mn, feature = ftrs)
    cat('IGH feature search\n')
    expect_true(all.equal(dim(v1), c(3, 27)))
    
    # deFuse fusion (Also this is at GRCh38 in the test Excel sheet)
    ftrs = search_features(gene_symbol = c('KANSL1'))
    mn = ms[ms$pipeline_scidb == '[external]-[Fusion] Defuse',]
    v1 = search_fusion(measurementset = mn, feature = ftrs)
    cat('KANSL1 feature search\n')
    expect_true(all.equal(dim(v1), c(1, 75)))
    
    ftrs = search_features(gene_symbol = c('ARL17A', 'KANSL1', 'AKNA'))
    mn = ms[ms$pipeline_scidb == '[external]-[Fusion] Defuse',]
    v1 = search_fusion(measurementset = mn, feature = ftrs)
    cat('KANSL1 feature search\n')
    expect_true(all.equal(dim(v1), c(2, 75)))
    
    ########### VARIANT DATA ############
    # Now load the variant data
    register_entities_workbook(workbook = wb, 
                               register_measurement_entity = 'VARIANT')
    
    # Now do some checks on the variant data load
    ms = get_measurementsets()
    ms_variant = ms[grepl("Variant", ms$pipeline_scidb) & 
                      grepl("FoundationOne Heme", ms$name), ]
    stopifnot(nrow(ms_variant) == 1)
    
    ftrs = search_features(gene_symbol = c('PARP2', 'RHOA', 'JAK2'))
    v1 = search_variant(measurementset = ms_variant, feature = ftrs)
    expect_true(all.equal(dim(v1), c(3, 22)))
    
    ftrs = search_features(gene_symbol = c('PARP2', 'RHOA', 'JAK2', 'TP53'))
    v2 = search_variant(measurementset = ms_variant, feature = ftrs)
    expect_true(all.equal(dim(v2), c(5, 22)))
    
    # FMI format 2
    ms_variant = ms[grepl("Variant", ms$pipeline_scidb) & 
                      grepl("FoundationOne Assay", ms$name), ]
    stopifnot(nrow(ms_variant) == 1)
    
    v3_all = search_variant(measurementset = ms_variant)
    expect_equal(dim(v3_all), c(28, 20))
    
    ftrs = search_features(gene_symbol = c("TP53BP1", "HSP90AA1", "DIS3"))
    v3_sub = search_variant(measurementset = ms_variant, feature = ftrs)
    expect_equal(dim(v3_sub), c(3, 23))
    
    ########### CNV DATA ############
    # Now load the CNV data
    register_entities_workbook(workbook = wb, 
                               register_measurement_entity = 'COPYNUMBER_VARIANT')
    ms = get_measurementsets()
    ms_cnv = ms[grep("COPYNUMBER", ms$entity), ]
    ms_cnv1 = ms_cnv[grep("Heme", ms_cnv$name), ]
    
    res_cnv1 = search_copy_number_variant(measurementset = ms_cnv1)
    expect_equal(dim(res_cnv1), c(5,11))
    
    ftrs = search_features(gene_symbol = "CDKN2A")
    res_cnv2 = search_copy_number_variant(measurementset = ms_cnv1, feature = ftrs)
    expect_equal(dim(res_cnv2), c(2,14))
    
    ftrs = search_features(gene_symbol = c("CDKN2A", "STAT6"))
    res_cnv3 = search_copy_number_variant(measurementset = ms_cnv1, feature = ftrs)
    expect_equal(dim(res_cnv3), c(3,14))
    
    # FMI format 2
    ms_cnv2 = ms_cnv[grep("Assay", ms_cnv$name), ]

    res_cnv1 = search_copy_number_variant(measurementset = ms_cnv2)
    expect_equal(dim(res_cnv1), c(12,12))
    
    ftrs = search_features(gene_symbol = "PTEN")
    res_cnv2 = search_copy_number_variant(measurementset = ms_cnv2, feature = ftrs)
    expect_equal(dim(res_cnv2), c(1,15))
    
    ftrs = search_features(gene_symbol = c("PTEN", "GRM3"))
    res_cnv3 = search_copy_number_variant(measurementset = ms_cnv2, feature = ftrs)
    expect_equal(dim(res_cnv3), c(2,15))
    
    ########### Check search by selected flex fields ############
    b0 = search_biosamples(dataset_id = 1, requested_attributes = 'sample_cell_type')
    b1 = search_biosamples(dataset_id = 2, requested_attributes = 'VISIT')
    b2 = search_biosamples(requested_attributes = 'sample_cell_type')
    b3 = search_biosamples(requested_attributes = c('sample_cell_type', 'VISIT'))
    expect_equal(dim(b0), c(12, 9))
    expect_equal(dim(b1), c(4, 8))
    expect_equal(dim(b2), c(16, 9))
    expect_equal(dim(b3), c(16, 10))
    ##### Check sample suffix (DNA, RNA by entity type) #### 
    
    ## "For data loaded via Excel template, check that biosamples associated with specific pipelines
    ## have the appropiate suffix: DNA or RNA"
    
    ## Note: Some entity types can allow both DNA and RNA -- tests need to be made more flexible to 
    ##       account for those cases
    
    
    # can generalize this to any study that has been loaded by Excel sheet
    def = get_definitions()
    if (nrow(def) > 0) {
      d_all_excel = get_datasets(dataset_id = unique(def$dataset_id))
      for (study_idx in 1:nrow(d_all_excel)) { 
        cat("=====================\n")
        d = d_all_excel[study_idx, ]
        print(d[, c('dataset_id', 'name')])
        
        ms_all = search_measurementsets(dataset_id = d$dataset_id)
        exp_all = search_experimentsets(dataset_id = d$dataset_id)
        ms_all = merge(ms_all[, c('measurementset_id', 'experimentset_id', 'entity', 'name')],
                       exp_all[, c('experimentset_id', 'experiment_type_API')],
                       by = 'experimentset_id')
        cat("Found", nrow(ms_all), "pipelines:\n\t", 
            pretty_print(ms_all$name), "\n")
        for (idx in 1:nrow(ms_all)) {
          ms = ms_all[idx, ]
          cat("sub-idx:", idx, "Pipeline:", ms$name, "\n")
          ms_stat = iquery(.ghEnv$db, 
                           paste0("aggregate(
                                 filter(", 
                                  revealgenomics:::full_arrayname(ms$entity),
                                  ", measurementset_id = ", ms$measurementset_id, 
                                  "), count(*), biosample_id)"),
                           return = TRUE)
          if (nrow(ms_stat) > 0) {
            bios_df = get_biosamples(biosample_id = ms_stat$biosample_id)
            cat("biosample names at pipeline", ms$measurementset_id, 
                "of study", d$dataset_id, ":\n\t", 
                pretty_print(bios_df$name), "\n")
            suffix = ifelse(
              ms$entity %in% c(.ghEnv$meta$arrVariant, 
                               .ghEnv$meta$arrFusion,
                               .ghEnv$meta$arrCopynumber_mat, 
                               .ghEnv$meta$arrCopynumber_seg, 
                               .ghEnv$meta$arrCopynumber_variant
              ),
              "DNA", # couuld be '__mRNA' or '__RNA'
              "RNA")
            message("Checking suffix")
            testthat::expect_equal(
              length(grep(suffix, bios_df$name, value=TRUE)),
              length(unique(ms_stat$biosample_id))) 
          }
        }
      }
    }
    
    #### Spectral Map creation ####
    ms = get_measurementsets(mandatory_fields_only = T)
    ms = ms[ms$entity == .ghEnv$meta$arrRnaquantification, ]
    for (idx in 1:nrow(ms)) {
      message("Working on idx: ", idx, " of ", nrow(ms))
      msi = ms[idx, ]
      
      es1 = search_expression(measurementset = msi)
      testthat::expect_equal(as.character(class(es1)), "ExpressionSet")
      
      # Now run spectral map creation
      studyName = get_datasets(dataset_id = msi$dataset_id)$name
      library(esetVis)
      library(mpm)
      library(metafolio)
      
      if (all(c('sample_cell_type', 'sample_type') %in% colnames(es1@phenoData@data))) {
        t1 = system.time({
          xx1 = revealgenomics:::createSpectralMap(
            eSetObj = es1, 
            studyName = studyName, 
            group1 = 'sample_cell_type', group2 = 'sample_type', 
            showRShinyProgress = FALSE, returnAnalysis = T)
        })[3]
        
        t2 = system.time({
          xx2 = revealgenomics:::createSpectralMap(
            eSetObj = es1, 
            studyName = studyName, 
            group1 = 'sample_cell_type', group2 = 'sample_type', 
            showRShinyProgress = FALSE, returnAnalysis = T, 
            pcaResult = xx1$pca)
        })[3]
        
        testthat::expect_true(t2 <= t1) # For small matrices, this is not always guaranteed

        ## Test equality of the objects returned in the first and second run
        # Checking equality of the following objects of 'xx1' and 'xx2'
        res1 = lapply(X = c('pca', 'spectral12', 'spectral13', 'spectral23'),
              FUN = function(k) testthat::expect_equal(xx1[[k]], xx2[[k]]))
        # In case of 'spectralHist' equality check fails when applied on the whole of 
        # 'spectralHist' as plot environment fails this test
        res2 = lapply(X = c('data', 'layers','scales','mapping','theme','coordinates','facet','labels'),
               FUN = function(k) testthat::expect_equal(xx1$spectralHist[[k]], 
                                                        xx2$spectralHist[[k]]))
        
        ## Compute truncated SVD
        t3 = system.time({
          xx3 = revealgenomics:::createSpectralMap(
            eSetObj = es1, 
            studyName = studyName, 
            group1 = 'sample_cell_type', group2 = 'sample_type', 
            showRShinyProgress = FALSE, returnAnalysis = T, 
            svd.method = 'truncated_svd')
        })[3]
        
        testthat::expect_true(t3 < t1) # For small matrices, this is not always guaranteed
        
        ## Test equality of the objects returned by calling complete SVD and truncated SVD.
        ## % explanation of variance will not be the same. Hence, pca, spectral[XY] & spectralHist
        ## cannot be equal. Also, due to the same in spectral[XY] plot and analysis wont be the same
        ## Thus, checking only topElements.
        res3 = lapply(X = c('spectral12', 'spectral13', 'spectral23'),
                      FUN = function(k) testthat::expect_equal(xx1[[k]]$topElements,
                                                               xx3[[k]]$topElements))

        ## Compute fast.svd
        t4 = system.time({
          xx4 = revealgenomics:::createSpectralMap(
            eSetObj = es1, 
            studyName = studyName, 
            group1 = 'sample_cell_type', group2 = 'sample_type', 
            showRShinyProgress = FALSE, returnAnalysis = T, 
            svd.method = 'fast_svd')
        })[3]
        
        testthat::expect_true(t4 < t1) # For small matrices, this is not always guaranteed
        
        ## Test equality of the objects returned by calling complete SVD and truncated SVD
        # top elements
        res4 = lapply(X = c('spectral12', 'spectral13', 'spectral23'),
                      FUN = function(k) testthat::expect_equal(xx1[[k]]$topElements,
                                                               xx4[[k]]$topElements))
        # % explanation of variance
        res5 = lapply(X = c('spectral12', 'spectral13', 'spectral23'),
                      FUN = function(k) testthat::expect_equal(
                        xx1[[k]]$analysis$axesContributionsPercentages[
                          1:length(xx4[[k]]$analysis$axesContributionsPercentages)],
                        xx4[[k]]$analysis$axesContributionsPercentages))

      } else {
        stop("Have not covered case where `sample_cell_type`", 
             " and `sample_type` are not present in sample metadata", 
             "\n Current case: measuremenset_name: ", msi$name, 
             "; dataset_name: ", get_datasets(dataset_id = msi$dataset_id)$name)
      }
      
    }
    
    # Clean up
    init_db(arrays_to_init = get_entity_names(), force = TRUE, silent = TRUE)
  }
})
Paradigm4/revealgenomics documentation built on April 7, 2020, 2:01 a.m.