Nothing
# Updated mock API response to match the actual v2.0 structure
mock_api_response <- list(
# Model version
model_version = 2,
# Gene order (unchanged)
gene_order = c("ENSG00000000003", "ENSG00000000005", "ENSG00000000419", "ENSG00000000457",
"ENSG00000000460", "ENSG00000000938",
# Add more genes to reach 44,592 total
paste0("ENSG", sprintf("%011d", 1:44586))),
# NEW STRUCTURE: outputs is now a data.frame with list columns
outputs = data.frame(
# counts column - list of integer vectors (one per sample)
counts = I(list(
as.integer(c(904, 0, 539, 115, 239, 0, 1976, 817, 2422, 372, runif(44582, 0, 2000))),
as.integer(c(1350, 0, 343, 120, 151, 5, 937, 947, 1439, 344, runif(44582, 0, 2000))),
as.integer(c(1082, 0, 471, 144, 230, 4, 924, 861, 2093, 179, runif(44582, 0, 2000))),
as.integer(c(851, 5, 423, 147, 139, 3, 725, 1592, 5669, 407, runif(44582, 0, 2000))),
as.integer(c(339, 0, 356, 170, 91, 0, 1119, 748, 2459, 314, runif(44582, 0, 2000))),
as.integer(c(337, 0, 545, 174, 106, 28, 734, 770, 570, 651, runif(44582, 0, 2000))),
as.integer(c(822, 0, 440, 578, 60, 20, 1411, 915, 1004, 500, runif(44582, 0, 2000))),
as.integer(c(591, 0, 360, 212, 104, 53, 883, 1021, 826, 1439, runif(44582, 0, 2000))),
as.integer(c(999, 0, 844, 228, 61, 30, 786, 977, 446, 516, runif(44582, 0, 2000))),
as.integer(c(638, 1, 578, 194, 92, 64, 828, 416, 605, 613, runif(44582, 0, 2000)))
)),
# classifier_probs - data.frame with nested data.frames for each classifier
classifier_probs = I(data.frame(
# Sex probabilities
sex = I(data.frame(
female = c(0.0734, 0.1465, 0.1251, 0.0948, 0.2374, 0.1897, 0.2156, 0.1634, 0.0892, 0.1749),
male = c(0.927, 0.853, 0.875, 0.905, 0.763, 0.810, 0.784, 0.837, 0.911, 0.825)
)),
# Age years probabilities (96 age categories)
age_years = I(data.frame(
matrix(runif(10 * 96, 0, 0.05), nrow = 10, ncol = 96,
dimnames = list(NULL, as.character(0:95)))
)),
# Tissue ontology probabilities (442 categories)
tissue_ontology_id = I(data.frame(
matrix(runif(10 * 442, 0, 1e-3), nrow = 10, ncol = 442,
dimnames = list(NULL, c("CL:0000000", "CL:0000030", "CL:0000031",
paste0("UBERON:", sprintf("%07d", 1:439)))))
)),
# Sample type probabilities
sample_type = I(data.frame(
"cell line" = c(0.937, 0.966, 0.929, 0.909, 0.95, 0.943, 0.901, 0.934, 0.956, 0.912),
organoid = c(0.01129, 0.00439, 0.01091, 0.01375, 0.00753, 0.00891, 0.01234, 0.00876, 0.00543, 0.01098),
other = c(0.01282, 0.00688, 0.01579, 0.01839, 0.01271, 0.01456, 0.01789, 0.01345, 0.00987, 0.01567),
primary = c(0.00893, 0.00553, 0.00975, 0.01319, 0.00601, 0.00834, 0.01287, 0.00756, 0.00612, 0.00945),
"primary cells" = c(0.01198, 0.00467, 0.00902, 0.01929, 0.00837, 0.01023, 0.01567, 0.00934, 0.00723, 0.01134),
"primary tissue" = c(0.00706, 0.00682, 0.00779, 0.0176, 0.00782, 0.00934, 0.01456, 0.00812, 0.00678, 0.00945),
xenograft = c(0.01084, 0.00555, 0.01784, 0.00864, 0.00804, 0.01123, 0.01678, 0.00923, 0.00734, 0.01234),
check.names = FALSE
)),
# Disease ontology probabilities (589 categories)
disease_ontology_id = I(data.frame(
matrix(runif(10 * 589, 0, 1e-3), nrow = 10, ncol = 589,
dimnames = list(NULL, c("CL:0000623", "CL:0017002", "HGNC:11474",
paste0("MONDO:", sprintf("%07d", 1:586)))))
)),
# Cell type ontology probabilities (392 categories)
cell_type_ontology_id = I(data.frame(
matrix(runif(10 * 392, 0, 1e-3), nrow = 10, ncol = 392,
dimnames = list(NULL, paste0("CL:", sprintf("%07d", 1:392))))
)),
# Cell line ontology probabilities (763 categories) - CVCL_0023 dominates
cell_line_ontology_id = I(data.frame(
CVCL_0023 = c(0.862, 0.826, 0.875, 0.836, 0.854, 0.843, 0.821, 0.867, 0.891, 0.838),
matrix(runif(10 * 762, 0, 1e-3), nrow = 10, ncol = 762,
dimnames = list(NULL, paste0("CVCL_", sprintf("%04d", 1:762))))
))
)),
# latents - data.frame with list columns for each latent type
latents = I(data.frame(
biological = I(list(
runif(1024, -3, 3), runif(1024, -3, 3), runif(1024, -3, 3), runif(1024, -3, 3), runif(1024, -3, 3),
runif(1024, -3, 3), runif(1024, -3, 3), runif(1024, -3, 3), runif(1024, -3, 3), runif(1024, -3, 3)
)),
technical = I(list(
c(-0.0271, 0.518, -1.8222, -0.0959, 26.1482, runif(27, -5, 30)),
c(-1.66, 2.05, -3.95, 1.12, 17.84, runif(27, -5, 30)),
c(-0.446, 1.245, -4.278, 0.279, 20.811, runif(27, -5, 30)),
c(-1.73, -1.001, -2.311, 0.122, 13.508, runif(27, -5, 30)),
c(-0.631, 1.919, -2.409, 0.481, 10.967, runif(27, -5, 30)),
c(-1.282, -1.58, -4.221, 0.799, 12.121, runif(27, -5, 30)),
c(-0.823, 0.58, -3.437, 1.953, 17.658, runif(27, -5, 30)),
c(-0.0634, 1.2, -3.6599, 1.7813, 19.5482, runif(27, -5, 30)),
c(-1.018, -1.78, -3.108, -0.326, 15.316, runif(27, -5, 30)),
c(0.267, -0.548, -2.999, 1.454, 13.085, runif(27, -5, 30))
)),
perturbation = I(list(
runif(512, -2, 2), runif(512, -2, 2), runif(512, -2, 2), runif(512, -2, 2), runif(512, -2, 2),
runif(512, -2, 2), runif(512, -2, 2), runif(512, -2, 2), runif(512, -2, 2), runif(512, -2, 2)
))
)),
# metadata - data.frame with sample metadata
metadata = I(data.frame(
age_years = c("", "", "", "", "", "65", "65", "65", "65", "65"),
cell_line_ontology_id = c(rep("CVCL_0023", 5), rep("", 5)),
cell_type_ontology_id = rep("", 10),
developmental_stage = rep("", 10),
disease_ontology_id = c(rep("", 5), rep("MONDO:0011719", 5)),
ethnicity = rep("", 10),
genotype = rep("", 10),
library_layout = rep("", 10),
library_selection = rep("", 10),
modality = rep("bulk", 10),
perturbation_dose = rep("", 10),
perturbation_ontology_id = c(rep("ENSG00000156127", 5), rep("", 5)),
perturbation_time = c(rep("96 hours", 5), rep("", 5)),
perturbation_type = c(rep("crispr", 5), rep("", 5)),
platform = rep("", 10),
race = rep("", 10),
sample_type = c(rep("cell line", 5), rep("primary tissue", 5)),
sex = c(rep("", 5), rep("female", 5)),
study = rep("", 10),
tissue_ontology_id = c(rep("", 5), rep("UBERON:0000945", 5)),
stringsAsFactors = FALSE
)),
stringsAsFactors = FALSE
)
)
# Tests for log_cpm function
test_that("log_cpm transforms data correctly", {
# Create sample raw counts
raw_counts <- data.frame(
sample_id = c("A", "B", "C"),
gene1 = c(100, 200, 300),
gene2 = c(50, 100, 150),
gene3 = c(10, 20, 30)
)
# Transform to log CPM
result <- log_cpm(raw_counts)
# Manually calculate expected values for first row
row1_lib_size <- sum(raw_counts[1, -1 ]) # 100 + 50 + 10 = 160
expected_gene1_cpm <- (100 / 160) * 1e6 # 625000
expected_gene2_cpm <- (50 / 160) * 1e6 # 312500
expected_gene3_cpm <- (10 / 160) * 1e6 # 62500
# Log1p of expected values
expected_gene1_log <- log1p(expected_gene1_cpm)
expected_gene2_log <- log1p(expected_gene2_cpm)
expected_gene3_log <- log1p(expected_gene3_cpm)
# Check column names have _cpm suffix
expect_true(all(grepl("_cpm$", colnames(result[-1]))))
# Check values for first row (with tolerance for floating point differences)
expect_equal(result$gene1_cpm[1], expected_gene1_log, tolerance = 1e-5)
expect_equal(result$gene2_cpm[2], expected_gene2_log, tolerance = 1e-5)
expect_equal(result$gene3_cpm[3], expected_gene3_log, tolerance = 1e-5)
# Check dimensions
expect_equal(nrow(result), nrow(raw_counts))
expect_equal(ncol(result), ncol(raw_counts))
})
test_that("log_cpm handles edge cases correctly", {
# Test with matrix input
matrix_input <- matrix(c(100, 200, 50, 100, 10, 20), nrow = 2)
colnames(matrix_input) <- c("ENSG00001", "ENSG00002", "ENSG00003")
expect_error(log_cpm(matrix_input), NA) # Should not error
# Test with zero counts
zero_counts <- data.frame(
sample_id = c("A", "B", "C"),
gene1 = c(0, 200, 300),
gene2 = c(50, 0, 150),
gene3 = c(10, 20, 0)
)
result_zeros <- log_cpm(zero_counts)
expect_false(any(is.na(result_zeros)))
# Test with negative values (should be converted to 0)
neg_counts <- data.frame(
sample_id = c("A", "B", "C"),
gene1 = c(-10, 200, 300),
gene2 = c(50, -20, 150),
gene3 = c(10, 20, -30)
)
result_neg <- log_cpm(neg_counts)
expect_false(any(is.na(result_neg)))
})
test_that("log_cpm handles invalid inputs correctly", {
# Test with non-data frame/matrix
expect_error(log_cpm(list(a = 1:3, b = 4:6)), "Input must be a data frame or matrix")
# Test with empty data frame
expect_error(log_cpm(data.frame()), "Input must have at least one row and one column")
# Test with data frame with no columns
empty_df <- data.frame(x = integer(0))[, FALSE]
expect_error(log_cpm(empty_df), "Input must have at least one row and one column")
})
# Tests for extract_expression_data function
test_that("extract_expression_data processes API response correctly", {
# Test with as_counts = TRUE (default)
result_counts <- extract_expression_data(mock_api_response)
# Check structure
expect_type(result_counts, "list")
expect_named(result_counts, c("metadata", "expression"))
# Check metadata
expect_s3_class(result_counts$metadata, "data.frame")
expect_equal(nrow(result_counts$metadata), 10)
# Check expression data
expect_s3_class(result_counts$expression, "data.frame")
expect_equal(nrow(result_counts$expression), 10)
expect_equal(colnames(result_counts$expression)[1:4],
c("sample_id", "ENSG00000000003", "ENSG00000000005", "ENSG00000000419"))
# Test with as_counts = FALSE (log CPM transformation)
result_logcpm <- extract_expression_data(mock_api_response, as_counts = FALSE)
# Check expression data has been transformed (no longer integers)
expect_false(all(sapply(result_logcpm$expression, is.integer)))
})
test_that("extract_expression_data correctly assigns sample IDs", {
# Test sample ID generation
result <- extract_expression_data(mock_api_response)
# Check sample IDs match between metadata and expression
expect_equal(
nrow(result$metadata),
nrow(result$expression))
# sample ids should match
expect_equal(result$metadata$sample_id, result$expression$sample_id)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.