context("test-api-loader.R")
library(ggplot2)
test_that("Register entities via workbook works OK", {
# cat("# Now connect to scidb\n")
e0 = tryCatch({rg_connect()}, error = function(e) {e})
if (!("error" %in% class(e0))) { # do not run this on EE installs, mainly targeted for Travis
init_db(arrays_to_init = get_entity_names(), force = TRUE, silent = TRUE)
df_referenceset = data.frame(
name = c("GRCh37", "GRCh38"),
description = "...",
species = 'homo sapiens',
assembly_id = c("GrCh37", "GrCh38_r85"),
source_uri = "...",
source_accessions = "...",
is_derived = TRUE,
stringsAsFactors = FALSE)
cat("Registering reference set:", pretty_print(df_referenceset$name), "\n")
referenceset_id = register_referenceset(df = df_referenceset)
wb = myExcelLoader(
filename = system.file(
"extdata", "scidb_metadata_sample.xlsx", package = "revealgenomics"))
# Load metadata first
register_entities_workbook(workbook = wb,
register_upto_entity = 'MEASUREMENT')
##### Spot checks on Dataset #####
# Spot checks that ontology fields of study have been captured properly
datasets = get_datasets()
expect_true(all(!is.na(datasets$study_type)))
expect_true(all(!is.na(datasets$DAS)))
expect_true(all(datasets$DAS %in% wb$Studies$DAS))
expect_true(all(datasets$study_type %in% wb$Studies$study_type))
# Check that get_projects() allows download of mandatory fields only
p1 = get_projects()
p2 = get_projects(mandatory_fields_only = TRUE)
expect_equal(nrow(p1), nrow(p2))
expect_equal(sort(p1$name), sort(p2$name))
# Similar spot checks using `search_datasets()` function
p = get_projects()
stopifnot(nrow(p) == 1) # Expect one project to be loaded at this time
project_id = p[1, ]$project_id
datasets = search_datasets(project_id = project_id)
expect_true(all(!is.na(datasets$study_type)))
expect_true(all(!is.na(datasets$DAS)))
expect_true(all(datasets$DAS %in% wb$Studies$DAS))
expect_true(all(datasets$study_type %in% wb$Studies$study_type))
######## Update Study ##########
wb$Studies$URL[2] = "https://clinicaltrials.gov/ct2/show/zzzz" # edit an existing field
wb$Studies$new_field1 = c('entry1', 'entry2') # add a new column
register_entities_workbook(workbook = wb,
register_upto_entity = 'DEFINITION',
entity_to_update = 'DATASET')
datasets = get_datasets()
expect_true(length(grep("zzzz", datasets$URL)) ==1)
expect_true(all(datasets$new_field1 %in% c('entry1', 'entry2')))
######## Update Individual ##########
## Edit an entry
rand_idx = 1
test_disease_str = 'test disease 1'
wb$Subjects[rand_idx, ]$primary_disease = test_disease_str
## Add a new column
rand_string = stringi::stri_rand_strings(nrow(wb$Subjects), length = 6)
wb$Subjects$new_field1 = rand_string
# Introduce new definition for the new field
new_definition = wb$Definitions[wb$Definitions$attribute_name == 'subject_id', ]
new_definition$attribute_name = 'new_field1'
new_definition$attribute_in_Samples = FALSE
wb$Definitions = rbind(
wb$Definitions,
new_definition
)
register_entities_workbook(workbook = wb,
register_upto_entity = 'INDIVIDUAL',
entity_to_update = 'INDIVIDUAL')
# Spot checks for the updates
ii = get_individuals()
expect_true(all(ii$new_field1 %in% rand_string))
expect_equal(
length(which(ii$primary_disease == test_disease_str)),
1)
######## Update MeasurementSet (trickier) ##########
## Edit an entry related to `pipeline_choices` sheet;
# `pipeline_applications` column: "Defuse" ==> "deFuse"
wb$pipeline_choices[wb$pipeline_choices$pipeline_scidb
== '[external]-[Fusion] Defuse',
]$pipeline_applications = "deFuse"
# Edit an entry related to `filter_choices` sheet;
# `filter_description2` column
rand_string2 = stringi::stri_rand_strings(n = 1, length = 10)
filter_name = 'gene - counts'
wb$filter_choices[wb$filter_choices$filter_name
== filter_name,
]$filter_description2 = rand_string2
register_entities_workbook(workbook = wb,
register_upto_entity = 'MEASUREMENTSET',
entity_to_update = 'MEASUREMENTSET')
# Spot checks
ms = get_measurementsets()
expect_equal(
length(which(ms$pipeline_applications == "deFuse")),
1)
expect_true(
all(ms[ms$filter_name == filter_name, ]$filter_description2 == rand_string2))
#===========================================================================
#### Geneset based data ####
fsets = get_featuresets()
stopifnot(nrow(fsets) == 3)
fset = fsets[grep("geneset 1", fsets$name), ]
stopifnot(nrow(fset) == 1)
# Register genesets used for loading geneset based test data
fidx = register_feature(
df1 = data.frame(
name = paste0("geneset_", 1:10), gene_symbol = 'NA',
featureset_id = fset$featureset_id,
chromosome = 'NA', start = 'NA', end = 'NA',
feature_type = 'geneset', source = 'XXX Biosciences'
)
)
# Side-step and check that search features by type works
fres = search_features(feature_type = 'geneset')
testthat::expect_identical(sort(fidx$feature_id), sort(fres$feature_id))
# Load data
register_entities_workbook(workbook = wb, pipeline_name_filter = "continuous")
# Check download
id1 = find_dataset_id_by_grep("study_a")
stopifnot(length(id1) == 1)
ms1 = search_measurementsets(dataset_id = id1)
ms1_cont = ms1[grep("continuous", ms1$name), ]
expr1 = search_expression(measurementset = ms1_cont)
testthat::expect_equal(as.character(class(expr1)), 'ExpressionSet')
testthat::expect_equal(dim(exprs(expr1)), c(5, 5))
expr1_df = search_expression(measurementset = ms1_cont, formExpressionSet = F)
testthat::expect_equal(as.character(class(expr1_df)), 'data.frame')
testthat::expect_equal(dim(expr1_df), c(25, 3))
testthat::expect_true(all(c('value', 'biosample_id', 'feature_id') %in% colnames(expr1_df)))
# Search by features
testthat::expect_equal(
dim(exprs(search_expression(measurementset = ms1_cont,
feature = fres[grep("geneset_4|geneset_5", fres$name), ]))),
c(2, 5)) # 2 features, 5 biosamples
#===========================================================================
#### Geneset (categorical) ####
register_entities_workbook(workbook = wb, pipeline_name_filter = "categorical")
# Now test download
ms1_categ = ms1[grep("categorical", ms1$name), ]
expr2_df = search_expression(measurementset = ms1_categ, formExpressionSet = F)
testthat::expect_equal(as.character(class(expr2_df)), 'data.frame')
testthat::expect_equal(dim(expr2_df), c(25, 3))
testthat::expect_true(all(c('value', 'biosample_id', 'feature_id') %in% colnames(expr2_df)))
testthat::expect_equal(class(expr2_df$value), 'character')
# Download same data as an ExpressionSet
expr2 = search_expression(measurementset = ms1_categ, formExpressionSet = TRUE)
testthat::expect_equal(class(expr2)[1], "ExpressionSet")
testthat::expect_equal(dim(exprs(expr2)), c(5, 5))
testthat::expect_equal(expr2@featureData@data$name, paste0("geneset_", c(1:5)))
testthat::expect_equal(class(as.vector(exprs(expr2))), "character")
# Search by features
testthat::expect_equal(
dim(search_expression(measurementset = ms1_categ,
feature = fres[grep("geneset_4|geneset_5", fres$name), ],
formExpressionSet = FALSE)),
c(10, 6)) # data.frame containing 5 samples * 2 features
##### FeatureSet #####
# Build up a featureset to be used for loading data
target_featureset_id = fsets[grep("37", fsets$name), ]$featureset_id
ftr_record = build_reference_gene_set(
featureset_id = target_featureset_id,
gene_annotation_file_path = system.file("extdata",
"gene__hugo__hgnc_complete_set.txt.gz", package="revealgenomics"), # these are grch38 files but we use this to populate the featureset anyway
gene_location_file_path = system.file("extdata",
"gene__grch38_release85_homosap_gene__newGene.tsv.gz", package="revealgenomics") # again a grch38 file
)
target_featureset_id = fsets[grep("38", fsets$name), ]$featureset_id
ftr_record = build_reference_gene_set(featureset_id = target_featureset_id)
########### FUSION DATA ############
# FMI fusion data has symbol `N/A`, handle it specially
# FMI fusion data is registered at featureset GRCh37
fsets = get_featuresets()
fset = fsets[grep("37", fsets$name), ]
fid = register_feature(df1 = data.frame(
name = 'N/A', gene_symbol = 'NA',
featureset_id = fset$featureset_id,
chromosome = 'NA', start = 'NA', end = 'NA',
feature_type = 'gene', source = 'FMI_fusion_data',
stringsAsFactors = FALSE
))
# Now load the data
register_entities_workbook(workbook = wb,
register_measurement_entity = 'FUSION')
# Now do some checks on the data load
ms = get_measurementsets()
# TODO: walk all pipelines and measurementset IDs to verify data.
ftrs = search_features(gene_symbol = c('TXNIP'))
if (nrow(ftrs) > 2) {
stop("If more than two features for TXNIP at this point; need to adjust test")
}
mn = ms[ms$pipeline_scidb == '[external]-[Fusion] Tophat Fusion',]
mn = mn[grep("37", mn$featureset_name), ]
v1 = search_fusion(measurementset = mn[1, ], feature = ftrs)
cat('TXNIP feature search\n')
expect_true(all.equal(dim(v1), c(3, 15)))
ftrs = search_features(gene_symbol = c('IGH'))
mn = ms[grep("Fusion.*Foundation.*Heme", ms$pipeline_scidb),]
v1 = search_fusion(measurementset = mn, feature = ftrs)
cat('IGH feature search\n')
expect_true(all.equal(dim(v1), c(2, 26)))
# FMI format 2
ftrs = search_features(gene_symbol = c('TMPRSS2'))
mn = ms[grep("Fusion.*Foundation.*Assay", ms$pipeline_scidb),]
v1 = search_fusion(measurementset = mn, feature = ftrs)
cat('IGH feature search\n')
expect_true(all.equal(dim(v1), c(3, 27)))
# deFuse fusion (Also this is at GRCh38 in the test Excel sheet)
ftrs = search_features(gene_symbol = c('KANSL1'))
mn = ms[ms$pipeline_scidb == '[external]-[Fusion] Defuse',]
v1 = search_fusion(measurementset = mn, feature = ftrs)
cat('KANSL1 feature search\n')
expect_true(all.equal(dim(v1), c(1, 75)))
ftrs = search_features(gene_symbol = c('ARL17A', 'KANSL1', 'AKNA'))
mn = ms[ms$pipeline_scidb == '[external]-[Fusion] Defuse',]
v1 = search_fusion(measurementset = mn, feature = ftrs)
cat('KANSL1 feature search\n')
expect_true(all.equal(dim(v1), c(2, 75)))
########### VARIANT DATA ############
# Now load the variant data
register_entities_workbook(workbook = wb,
register_measurement_entity = 'VARIANT')
# Now do some checks on the variant data load
ms = get_measurementsets()
ms_variant = ms[grepl("Variant", ms$pipeline_scidb) &
grepl("FoundationOne Heme", ms$name), ]
stopifnot(nrow(ms_variant) == 1)
ftrs = search_features(gene_symbol = c('PARP2', 'RHOA', 'JAK2'))
v1 = search_variant(measurementset = ms_variant, feature = ftrs)
expect_true(all.equal(dim(v1), c(3, 22)))
ftrs = search_features(gene_symbol = c('PARP2', 'RHOA', 'JAK2', 'TP53'))
v2 = search_variant(measurementset = ms_variant, feature = ftrs)
expect_true(all.equal(dim(v2), c(5, 22)))
# FMI format 2
ms_variant = ms[grepl("Variant", ms$pipeline_scidb) &
grepl("FoundationOne Assay", ms$name), ]
stopifnot(nrow(ms_variant) == 1)
v3_all = search_variant(measurementset = ms_variant)
expect_equal(dim(v3_all), c(28, 20))
ftrs = search_features(gene_symbol = c("TP53BP1", "HSP90AA1", "DIS3"))
v3_sub = search_variant(measurementset = ms_variant, feature = ftrs)
expect_equal(dim(v3_sub), c(3, 23))
########### CNV DATA ############
# Now load the CNV data
register_entities_workbook(workbook = wb,
register_measurement_entity = 'COPYNUMBER_VARIANT')
ms = get_measurementsets()
ms_cnv = ms[grep("COPYNUMBER", ms$entity), ]
ms_cnv1 = ms_cnv[grep("Heme", ms_cnv$name), ]
res_cnv1 = search_copy_number_variant(measurementset = ms_cnv1)
expect_equal(dim(res_cnv1), c(5,11))
ftrs = search_features(gene_symbol = "CDKN2A")
res_cnv2 = search_copy_number_variant(measurementset = ms_cnv1, feature = ftrs)
expect_equal(dim(res_cnv2), c(2,14))
ftrs = search_features(gene_symbol = c("CDKN2A", "STAT6"))
res_cnv3 = search_copy_number_variant(measurementset = ms_cnv1, feature = ftrs)
expect_equal(dim(res_cnv3), c(3,14))
# FMI format 2
ms_cnv2 = ms_cnv[grep("Assay", ms_cnv$name), ]
res_cnv1 = search_copy_number_variant(measurementset = ms_cnv2)
expect_equal(dim(res_cnv1), c(12,12))
ftrs = search_features(gene_symbol = "PTEN")
res_cnv2 = search_copy_number_variant(measurementset = ms_cnv2, feature = ftrs)
expect_equal(dim(res_cnv2), c(1,15))
ftrs = search_features(gene_symbol = c("PTEN", "GRM3"))
res_cnv3 = search_copy_number_variant(measurementset = ms_cnv2, feature = ftrs)
expect_equal(dim(res_cnv3), c(2,15))
########### Check search by selected flex fields ############
b0 = search_biosamples(dataset_id = 1, requested_attributes = 'sample_cell_type')
b1 = search_biosamples(dataset_id = 2, requested_attributes = 'VISIT')
b2 = search_biosamples(requested_attributes = 'sample_cell_type')
b3 = search_biosamples(requested_attributes = c('sample_cell_type', 'VISIT'))
expect_equal(dim(b0), c(12, 9))
expect_equal(dim(b1), c(4, 8))
expect_equal(dim(b2), c(16, 9))
expect_equal(dim(b3), c(16, 10))
##### Check sample suffix (DNA, RNA by entity type) ####
## "For data loaded via Excel template, check that biosamples associated with specific pipelines
## have the appropiate suffix: DNA or RNA"
## Note: Some entity types can allow both DNA and RNA -- tests need to be made more flexible to
## account for those cases
# can generalize this to any study that has been loaded by Excel sheet
def = get_definitions()
if (nrow(def) > 0) {
d_all_excel = get_datasets(dataset_id = unique(def$dataset_id))
for (study_idx in 1:nrow(d_all_excel)) {
cat("=====================\n")
d = d_all_excel[study_idx, ]
print(d[, c('dataset_id', 'name')])
ms_all = search_measurementsets(dataset_id = d$dataset_id)
exp_all = search_experimentsets(dataset_id = d$dataset_id)
ms_all = merge(ms_all[, c('measurementset_id', 'experimentset_id', 'entity', 'name')],
exp_all[, c('experimentset_id', 'experiment_type_API')],
by = 'experimentset_id')
cat("Found", nrow(ms_all), "pipelines:\n\t",
pretty_print(ms_all$name), "\n")
for (idx in 1:nrow(ms_all)) {
ms = ms_all[idx, ]
cat("sub-idx:", idx, "Pipeline:", ms$name, "\n")
ms_stat = iquery(.ghEnv$db,
paste0("aggregate(
filter(",
revealgenomics:::full_arrayname(ms$entity),
", measurementset_id = ", ms$measurementset_id,
"), count(*), biosample_id)"),
return = TRUE)
if (nrow(ms_stat) > 0) {
bios_df = get_biosamples(biosample_id = ms_stat$biosample_id)
cat("biosample names at pipeline", ms$measurementset_id,
"of study", d$dataset_id, ":\n\t",
pretty_print(bios_df$name), "\n")
suffix = ifelse(
ms$entity %in% c(.ghEnv$meta$arrVariant,
.ghEnv$meta$arrFusion,
.ghEnv$meta$arrCopynumber_mat,
.ghEnv$meta$arrCopynumber_seg,
.ghEnv$meta$arrCopynumber_variant
),
"DNA", # couuld be '__mRNA' or '__RNA'
"RNA")
message("Checking suffix")
testthat::expect_equal(
length(grep(suffix, bios_df$name, value=TRUE)),
length(unique(ms_stat$biosample_id)))
}
}
}
}
#### Spectral Map creation ####
ms = get_measurementsets(mandatory_fields_only = T)
ms = ms[ms$entity == .ghEnv$meta$arrRnaquantification, ]
for (idx in 1:nrow(ms)) {
message("Working on idx: ", idx, " of ", nrow(ms))
msi = ms[idx, ]
es1 = search_expression(measurementset = msi)
testthat::expect_equal(as.character(class(es1)), "ExpressionSet")
# Now run spectral map creation
studyName = get_datasets(dataset_id = msi$dataset_id)$name
library(esetVis)
library(mpm)
library(metafolio)
if (all(c('sample_cell_type', 'sample_type') %in% colnames(es1@phenoData@data))) {
t1 = system.time({
xx1 = revealgenomics:::createSpectralMap(
eSetObj = es1,
studyName = studyName,
group1 = 'sample_cell_type', group2 = 'sample_type',
showRShinyProgress = FALSE, returnAnalysis = T)
})[3]
t2 = system.time({
xx2 = revealgenomics:::createSpectralMap(
eSetObj = es1,
studyName = studyName,
group1 = 'sample_cell_type', group2 = 'sample_type',
showRShinyProgress = FALSE, returnAnalysis = T,
pcaResult = xx1$pca)
})[3]
testthat::expect_true(t2 <= t1) # For small matrices, this is not always guaranteed
## Test equality of the objects returned in the first and second run
# Checking equality of the following objects of 'xx1' and 'xx2'
res1 = lapply(X = c('pca', 'spectral12', 'spectral13', 'spectral23'),
FUN = function(k) testthat::expect_equal(xx1[[k]], xx2[[k]]))
# In case of 'spectralHist' equality check fails when applied on the whole of
# 'spectralHist' as plot environment fails this test
res2 = lapply(X = c('data', 'layers','scales','mapping','theme','coordinates','facet','labels'),
FUN = function(k) testthat::expect_equal(xx1$spectralHist[[k]],
xx2$spectralHist[[k]]))
## Compute truncated SVD
t3 = system.time({
xx3 = revealgenomics:::createSpectralMap(
eSetObj = es1,
studyName = studyName,
group1 = 'sample_cell_type', group2 = 'sample_type',
showRShinyProgress = FALSE, returnAnalysis = T,
svd.method = 'truncated_svd')
})[3]
testthat::expect_true(t3 < t1) # For small matrices, this is not always guaranteed
## Test equality of the objects returned by calling complete SVD and truncated SVD.
## % explanation of variance will not be the same. Hence, pca, spectral[XY] & spectralHist
## cannot be equal. Also, due to the same in spectral[XY] plot and analysis wont be the same
## Thus, checking only topElements.
res3 = lapply(X = c('spectral12', 'spectral13', 'spectral23'),
FUN = function(k) testthat::expect_equal(xx1[[k]]$topElements,
xx3[[k]]$topElements))
## Compute fast.svd
t4 = system.time({
xx4 = revealgenomics:::createSpectralMap(
eSetObj = es1,
studyName = studyName,
group1 = 'sample_cell_type', group2 = 'sample_type',
showRShinyProgress = FALSE, returnAnalysis = T,
svd.method = 'fast_svd')
})[3]
testthat::expect_true(t4 < t1) # For small matrices, this is not always guaranteed
## Test equality of the objects returned by calling complete SVD and truncated SVD
# top elements
res4 = lapply(X = c('spectral12', 'spectral13', 'spectral23'),
FUN = function(k) testthat::expect_equal(xx1[[k]]$topElements,
xx4[[k]]$topElements))
# % explanation of variance
res5 = lapply(X = c('spectral12', 'spectral13', 'spectral23'),
FUN = function(k) testthat::expect_equal(
xx1[[k]]$analysis$axesContributionsPercentages[
1:length(xx4[[k]]$analysis$axesContributionsPercentages)],
xx4[[k]]$analysis$axesContributionsPercentages))
} else {
stop("Have not covered case where `sample_cell_type`",
" and `sample_type` are not present in sample metadata",
"\n Current case: measuremenset_name: ", msi$name,
"; dataset_name: ", get_datasets(dataset_id = msi$dataset_id)$name)
}
}
# Clean up
init_db(arrays_to_init = get_entity_names(), force = TRUE, silent = TRUE)
}
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.