# test-data-handlers.R
# In rsynthbio: Synthesize Bio API Wrapper

# Mock API response mirroring the v2.0 structure: a model version, the gene
# order, and an `outputs` data.frame with one row per sample (10 samples) whose
# columns are list columns or nested data.frames.
#
# Filler values are drawn with runif() and are not seeded, so only the
# hand-written leading values are stable between runs.
mock_api_response <- list(
  # Model version
  model_version = 2,

  # Gene order: 6 hand-picked IDs plus 44,586 synthetic IDs = 44,592 total.
  # The synthetic IDs are numbered from 100001 upwards so they cannot collide
  # with the hand-picked ones (numbering from 1 would regenerate e.g.
  # "ENSG00000000003" and leave duplicate gene names in the fixture).
  gene_order = c(
    "ENSG00000000003", "ENSG00000000005", "ENSG00000000419",
    "ENSG00000000457", "ENSG00000000460", "ENSG00000000938",
    sprintf("ENSG%011d", 100000L + seq_len(44586L))
  ),

  # outputs is a data.frame with list columns and nested data.frames
  outputs = data.frame(
    # counts column - list of integer vectors (one per sample). Each vector is
    # 10 hand-written counts followed by 44,582 random ones (44,592 in total,
    # matching gene_order); as.integer() truncates the random doubles.
    counts = I(lapply(
      list(
        c(904, 0, 539, 115, 239, 0, 1976, 817, 2422, 372),
        c(1350, 0, 343, 120, 151, 5, 937, 947, 1439, 344),
        c(1082, 0, 471, 144, 230, 4, 924, 861, 2093, 179),
        c(851, 5, 423, 147, 139, 3, 725, 1592, 5669, 407),
        c(339, 0, 356, 170, 91, 0, 1119, 748, 2459, 314),
        c(337, 0, 545, 174, 106, 28, 734, 770, 570, 651),
        c(822, 0, 440, 578, 60, 20, 1411, 915, 1004, 500),
        c(591, 0, 360, 212, 104, 53, 883, 1021, 826, 1439),
        c(999, 0, 844, 228, 61, 30, 786, 977, 446, 516),
        c(638, 1, 578, 194, 92, 64, 828, 416, 605, 613)
      ),
      function(observed) as.integer(c(observed, runif(44582, 0, 2000)))
    )),

    # classifier_probs - data.frame with nested data.frames for each classifier.
    # Every nested data.frame sets check.names = FALSE so that labels such as
    # "0", "CL:0000000" or "cell line" are kept verbatim instead of being
    # rewritten into syntactic R names ("X0", "CL.0000000", "cell.line").
    classifier_probs = I(data.frame(
      # Sex probabilities
      sex = I(data.frame(
        female = c(0.0734, 0.1465, 0.1251, 0.0948, 0.2374,
                   0.1897, 0.2156, 0.1634, 0.0892, 0.1749),
        male = c(0.927, 0.853, 0.875, 0.905, 0.763,
                 0.810, 0.784, 0.837, 0.911, 0.825),
        check.names = FALSE
      )),

      # Age years probabilities (96 age categories, labelled "0".."95")
      age_years = I(data.frame(
        matrix(runif(10 * 96, 0, 0.05), nrow = 10, ncol = 96,
               dimnames = list(NULL, as.character(0:95))),
        check.names = FALSE
      )),

      # Tissue ontology probabilities (442 categories)
      tissue_ontology_id = I(data.frame(
        matrix(runif(10 * 442, 0, 1e-3), nrow = 10, ncol = 442,
               dimnames = list(NULL, c("CL:0000000", "CL:0000030", "CL:0000031",
                                       sprintf("UBERON:%07d", seq_len(439L))))),
        check.names = FALSE
      )),

      # Sample type probabilities
      sample_type = I(data.frame(
        "cell line" = c(0.937, 0.966, 0.929, 0.909, 0.95, 0.943, 0.901, 0.934, 0.956, 0.912),
        organoid = c(0.01129, 0.00439, 0.01091, 0.01375, 0.00753, 0.00891, 0.01234, 0.00876, 0.00543, 0.01098),
        other = c(0.01282, 0.00688, 0.01579, 0.01839, 0.01271, 0.01456, 0.01789, 0.01345, 0.00987, 0.01567),
        primary = c(0.00893, 0.00553, 0.00975, 0.01319, 0.00601, 0.00834, 0.01287, 0.00756, 0.00612, 0.00945),
        "primary cells" = c(0.01198, 0.00467, 0.00902, 0.01929, 0.00837, 0.01023, 0.01567, 0.00934, 0.00723, 0.01134),
        "primary tissue" = c(0.00706, 0.00682, 0.00779, 0.0176, 0.00782, 0.00934, 0.01456, 0.00812, 0.00678, 0.00945),
        xenograft = c(0.01084, 0.00555, 0.01784, 0.00864, 0.00804, 0.01123, 0.01678, 0.00923, 0.00734, 0.01234),
        check.names = FALSE
      )),

      # Disease ontology probabilities (589 categories)
      disease_ontology_id = I(data.frame(
        matrix(runif(10 * 589, 0, 1e-3), nrow = 10, ncol = 589,
               dimnames = list(NULL, c("CL:0000623", "CL:0017002", "HGNC:11474",
                                       sprintf("MONDO:%07d", seq_len(586L))))),
        check.names = FALSE
      )),

      # Cell type ontology probabilities (392 categories)
      cell_type_ontology_id = I(data.frame(
        matrix(runif(10 * 392, 0, 1e-3), nrow = 10, ncol = 392,
               dimnames = list(NULL, sprintf("CL:%07d", seq_len(392L)))),
        check.names = FALSE
      )),

      # Cell line ontology probabilities (763 categories) - CVCL_0023 dominates.
      # The 762 filler labels skip "CVCL_0023" so the dominant column keeps a
      # unique name; otherwise data.frame() would rename the duplicate.
      cell_line_ontology_id = I(data.frame(
        CVCL_0023 = c(0.862, 0.826, 0.875, 0.836, 0.854,
                      0.843, 0.821, 0.867, 0.891, 0.838),
        matrix(runif(10 * 762, 0, 1e-3), nrow = 10, ncol = 762,
               dimnames = list(NULL, setdiff(sprintf("CVCL_%04d", seq_len(763L)),
                                             "CVCL_0023"))),
        check.names = FALSE
      )),

      check.names = FALSE
    )),

    # latents - data.frame with list columns for each latent type
    latents = I(data.frame(
      # 10 random vectors of length 1024
      biological = I(replicate(10, runif(1024, -3, 3), simplify = FALSE)),
      # 10 vectors of length 32: 5 hand-written values + 27 random ones
      technical = I(lapply(
        list(
          c(-0.0271, 0.518, -1.8222, -0.0959, 26.1482),
          c(-1.66, 2.05, -3.95, 1.12, 17.84),
          c(-0.446, 1.245, -4.278, 0.279, 20.811),
          c(-1.73, -1.001, -2.311, 0.122, 13.508),
          c(-0.631, 1.919, -2.409, 0.481, 10.967),
          c(-1.282, -1.58, -4.221, 0.799, 12.121),
          c(-0.823, 0.58, -3.437, 1.953, 17.658),
          c(-0.0634, 1.2, -3.6599, 1.7813, 19.5482),
          c(-1.018, -1.78, -3.108, -0.326, 15.316),
          c(0.267, -0.548, -2.999, 1.454, 13.085)
        ),
        function(observed) c(observed, runif(27, -5, 30))
      )),
      # 10 random vectors of length 512
      perturbation = I(replicate(10, runif(512, -2, 2), simplify = FALSE))
    )),

    # metadata - data.frame with sample metadata: the first five samples
    # describe a CRISPR-perturbed cell line, the last five a primary tissue
    metadata = I(data.frame(
      age_years = c("", "", "", "", "", "65", "65", "65", "65", "65"),
      cell_line_ontology_id = c(rep("CVCL_0023", 5), rep("", 5)),
      cell_type_ontology_id = rep("", 10),
      developmental_stage = rep("", 10),
      disease_ontology_id = c(rep("", 5), rep("MONDO:0011719", 5)),
      ethnicity = rep("", 10),
      genotype = rep("", 10),
      library_layout = rep("", 10),
      library_selection = rep("", 10),
      modality = rep("bulk", 10),
      perturbation_dose = rep("", 10),
      perturbation_ontology_id = c(rep("ENSG00000156127", 5), rep("", 5)),
      perturbation_time = c(rep("96 hours", 5), rep("", 5)),
      perturbation_type = c(rep("crispr", 5), rep("", 5)),
      platform = rep("", 10),
      race = rep("", 10),
      sample_type = c(rep("cell line", 5), rep("primary tissue", 5)),
      sex = c(rep("", 5), rep("female", 5)),
      study = rep("", 10),
      tissue_ontology_id = c(rep("", 5), rep("UBERON:0000945", 5)),
      stringsAsFactors = FALSE
    )),

    stringsAsFactors = FALSE
  )
)


# Tests for log_cpm function
test_that("log_cpm transforms data correctly", {
  # Sample raw counts: an identifier column followed by three gene columns
  raw_counts <- data.frame(
    sample_id = c("A", "B", "C"),
    gene1 = c(100, 200, 300),
    gene2 = c(50, 100, 150),
    gene3 = c(10, 20, 30)
  )

  # Transform to log CPM
  result <- log_cpm(raw_counts)

  # Expected values for the first sample: each count is scaled by that
  # sample's library size to counts-per-million, then log1p-transformed
  row1_lib_size <- sum(raw_counts[1, -1])  # 100 + 50 + 10 = 160
  expected_gene1_log <- log1p((100 / row1_lib_size) * 1e6)  # CPM 625000
  expected_gene2_log <- log1p((50 / row1_lib_size) * 1e6)   # CPM 312500
  expected_gene3_log <- log1p((10 / row1_lib_size) * 1e6)   # CPM 62500

  # Check column names have _cpm suffix
  expect_true(all(grepl("_cpm$", colnames(result[-1]))))

  # Check values for the first row (with tolerance for floating point
  # differences). All three lookups use row 1, the row the expectations were
  # derived from.
  expect_equal(result$gene1_cpm[1], expected_gene1_log, tolerance = 1e-5)
  expect_equal(result$gene2_cpm[1], expected_gene2_log, tolerance = 1e-5)
  expect_equal(result$gene3_cpm[1], expected_gene3_log, tolerance = 1e-5)

  # Check dimensions are unchanged by the transformation
  expect_equal(nrow(result), nrow(raw_counts))
  expect_equal(ncol(result), ncol(raw_counts))
})

test_that("log_cpm handles edge cases correctly", {
  # Matrix input (2 rows x 3 named gene columns) should not raise an error
  gene_matrix <- matrix(
    c(100, 200, 50, 100, 10, 20),
    nrow = 2,
    dimnames = list(NULL, c("ENSG00001", "ENSG00002", "ENSG00003"))
  )
  expect_error(log_cpm(gene_matrix), NA)

  # Zero counts must not yield missing values in the result
  with_zeros <- data.frame(
    sample_id = c("A", "B", "C"),
    gene1 = c(0, 200, 300),
    gene2 = c(50, 0, 150),
    gene3 = c(10, 20, 0)
  )
  zeros_out <- log_cpm(with_zeros)
  expect_false(any(is.na(zeros_out)))

  # Negative values (should be converted to 0) must not yield missing values
  with_negatives <- data.frame(
    sample_id = c("A", "B", "C"),
    gene1 = c(-10, 200, 300),
    gene2 = c(50, -20, 150),
    gene3 = c(10, 20, -30)
  )
  negatives_out <- log_cpm(with_negatives)
  expect_false(any(is.na(negatives_out)))
})

test_that("log_cpm handles invalid inputs correctly", {
  # Message expected whenever the input has no rows or no columns
  size_error <- "Input must have at least one row and one column"

  # A plain list is neither a data frame nor a matrix
  not_tabular <- list(a = 1:3, b = 4:6)
  expect_error(log_cpm(not_tabular), "Input must be a data frame or matrix")

  # Completely empty data frame
  expect_error(log_cpm(data.frame()), size_error)

  # Data frame whose only column has been dropped
  no_columns <- data.frame(x = integer(0))[, FALSE]
  expect_error(log_cpm(no_columns), size_error)
})

# Tests for extract_expression_data function
test_that("extract_expression_data processes API response correctly", {

  # Test with as_counts = TRUE (default)
  result_counts <- extract_expression_data(mock_api_response)

  # Check structure: a list holding exactly metadata and expression
  expect_type(result_counts, "list")
  expect_named(result_counts, c("metadata", "expression"))

  # Check metadata: one row per mock sample
  expect_s3_class(result_counts$metadata, "data.frame")
  expect_equal(nrow(result_counts$metadata), 10)

  # Check expression data: one row per mock sample, sample_id first and then
  # the genes in gene_order
  expect_s3_class(result_counts$expression, "data.frame")
  expect_equal(nrow(result_counts$expression), 10)
  expect_equal(colnames(result_counts$expression)[1:4],
               c("sample_id", "ENSG00000000003", "ENSG00000000005", "ENSG00000000419"))

  # Test with as_counts = FALSE (log CPM transformation)
  result_logcpm <- extract_expression_data(mock_api_response, as_counts = FALSE)

  # Check expression data has been transformed: no value column may still be
  # integer. The identifier column is left out and any() is used, because a
  # single non-integer column (such as an identifier) is enough to make an
  # all()-based check pass whether or not the counts were transformed.
  logcpm_expr <- result_logcpm$expression
  value_cols <- colnames(logcpm_expr) != "sample_id"
  still_integer <- vapply(
    logcpm_expr[, value_cols, drop = FALSE],
    is.integer,
    logical(1)
  )
  expect_false(any(still_integer))
})


test_that("extract_expression_data correctly assigns sample IDs", {
  # Run the extraction once and look at both tables it returns
  extracted <- extract_expression_data(mock_api_response)
  meta <- extracted$metadata
  expr <- extracted$expression

  # Both tables must describe the same number of samples
  expect_equal(nrow(meta), nrow(expr))

  # The generated sample IDs must agree between the two tables
  expect_equal(meta$sample_id, expr$sample_id)
})