tests/testthat/test-fidelity.R

test_that("GenerateData.R functions have identical output", {
  skip_if_not_installed("MilDistribution") %>%
    suppressMessages()
  # Note: as of updates to 0.3.2 of mildsvm, no longer expect fidelity of
  # `generate_mild_df()`.  Just check for similarity here

  set.seed(8)
  mildsvm_data <- mildsvm::generate_mild_df(nimp_pos = 1:5,
                                            nimp_neg = 1:5,
                                            mean = list(rep(1, 5), rep(2, 5), 0),
                                            sd_of_mean = rep(0, 3))
  set.seed(8)
  MilDistribution_data <-
    MilDistribution::GenerateMilData(positive_dist = "mvt",
                                     negative_dist = "mvnormal",
                                     remainder_dist = "mvnormal",
                                     positive_mean = rep(1, 5),
                                     negative_mean = rep(2, 5),
                                     positive_degree = 3)

  class(MilDistribution_data) <- c("mild_df", "data.frame")
  cols <- 4:13
  suppressWarnings({
    diff <- colMeans(mildsvm_data[, cols]) - colMeans(MilDistribution_data[, cols])
  })
  expect_lte(mean(diff), 0.05)
  expect_equal(dim(mildsvm_data), dim(MilDistribution_data))

})

test_that("kme.R functions have identical output", {
  skip_if_not_installed("MilDistribution")
  set.seed(8)
  mil_data <- mildsvm::generate_mild_df(ncov = 5, nbag = 7, nsample = 7)

  # remove one instance, and one observation to ensure unequal lengths
  ind1 <- which(mil_data$instance_name == unique(mil_data$instance_name)[1])
  ind2 <- which(mil_data$instance_name == unique(mil_data$instance_name)[2])[1]
  mil_data <- mil_data[-c(ind1, ind2), ]
  mil_data2 <- mil_data
  class(mil_data2) <- c("MilData", "data.frame")

  x = data.frame("instance_name" = c("inst_1", "inst_2", "inst_1"),
                 "X1" = c(-0.4, 0.5, 2))
  x2 = data.frame("instance_name" = c("inst_1", "inst_2", "inst_1"),
                  "X1" = c(-1, -2, 3))

  split_ind <- mil_data$bag_name %in% unique(mil_data$bag_name)[1:2]

  expect_equal(mildsvm::kme(mil_data), MilDistribution::kme(mil_data2))
  expect_equal(mildsvm::kme(x), MilDistribution::kme(x))
  expect_equal(mildsvm::kme(x, x2), MilDistribution::kme(x, x2))
  expect_equal(mildsvm::kme(mil_data[split_ind, ], mil_data[!split_ind, ]),
               MilDistribution::kme(mil_data2[split_ind, ], mil_data2[!split_ind, ]))
})

test_that("mismm.R functions have identical output", {
  skip_if_not_installed("gurobi")
  skip_if_not_installed("MilDistribution")
  set.seed(8)
  mil_data <- mildsvm::generate_mild_df(ncov = 5, nbag = 10, nsample = 7,
                                        positive_prob = 0.15,
                                        sd_of_mean = rep(0.1, 3))

  # remove one instance, and one observation to create unequal lengths
  ind1 <- which(mil_data$instance_name == unique(mil_data$instance_name)[1])
  ind2 <- which(mil_data$instance_name == unique(mil_data$instance_name)[2])[1]
  mil_data <- mil_data[-c(ind1, ind2), ]

  mil_data <- mil_data %>%
    dplyr::arrange(bag_label, bag_name, instance_name, X1)
  mil_data2 <- mil_data
  class(mil_data2) <- c("MilData", "data.frame")


  set.seed(8)
  mdl1 <- mildsvm::mismm(mil_data, cost = 1,
                           weights = c("0" = 5 / 19, "1" = 1), # set to n_neg_inst / n_pos_bag
                           control = list(scale = FALSE,
                                          sigma = 0.05))
  mdl2 <- MilDistribution::mil_distribution(mil_data2, cost = 1)

  # models are equal on key components, just differ some naming
  expect_equal(mdl1$ksvm_fit@alpha, mdl2$model$ksvm_res@alpha)
  expect_equal(mdl1$ksvm_fit@b, mdl2$model$ksvm_res@b)
  expect_equal(mdl1$n_step, mdl2$total_step)
  expect_equal(mdl1$repr_inst, mdl2$representative_inst)
  expect_equal(mdl1$x, mdl2$traindata, ignore_attr = TRUE)
  expect_equal(mdl1$useful_inst_idx, mdl2$useful_inst_idx, ignore_attr = TRUE)

  # predictions should match
  expect_equal(
    predict(mdl1, new_data = mil_data, type = "raw", layer = "instance")$.pred %>%
      setNames(NULL) %>%
      suppressMessages(),
    # factor(predict(mdl2, newdata = mil_data_)$final_pred$bag_label)
    predict(mdl2, newdata = mil_data2)$final_pred$instance_score %>%
      suppressMessages(),
    ignore_attr = TRUE
  )

  expect_equal(
    mil_data %>%
      dplyr::bind_cols(predict(mdl1, new_data = mil_data, type = "raw", layer = "bag")) %>%
      dplyr::distinct(bag_name, .pred),
    predict(mdl2, newdata = mil_data2)$final_pred %>%
      dplyr::distinct(bag_name, bag_score) %>%
      suppressMessages(),
    ignore_attr = TRUE
  )

  # it seems that there may have been a bug in Yifei's predict.mild code.  An
  # instance with a bag score of 0.14 gets labels as negative, when it should be
  # positive
  # expect_equal(
  #   mil_data_ %>%
  #     dplyr::bind_cols(predict(mdl1, new_data = mil_data_, type = "raw", layer = "bag")) %>%
  #     dplyr::bind_cols(predict(mdl1, new_data = mil_data_, type = "class", layer = "bag")) %>%
  #     dplyr::distinct(bag_name, .pred, .pred_class),
  #   predict(mdl2, newdata = mil_data_)$final_pred %>%
  #     dplyr::distinct(bag_name, bag_score, bag_label) %>%
  #     dplyr::mutate(bag_label = as.factor(bag_label)),
  #     ignore_attr = TRUE
  # )

  # there is another bug in the mil_distribution code where it will not work
  # when the data is passed out of order.

  # set.seed(8)
  # mildsvm_cv_output <- mildsvm::cv_mild(mil_data, n_fold = 3)
  # set.seed(8)
  # MilDistribution_cv_output <- MilDistribution::cv_mild(mil_data, n_fold = 3)
  # expect_equal(mildsvm_cv_output,
  #              MilDistribution_cv_output)

  # expect_equal(mildsvm::kme(x), MilDistribution::kme(x))
})

test_that("misvm.R functions have identical output.", {
  skip_if_not_installed("gurobi")
  skip_if_not_installed("MilDistribution")
  set.seed(8)
  mil_data <- mildsvm::generate_mild_df(nbag = 10, nsample = 7,
                                        positive_prob = 0.15,
                                        sd_of_mean = rep(0.1, 3))
  df1 <- mildsvm::build_instance_feature(mil_data, seq(0.05, 0.95, length.out = 10)) %>%
    dplyr::arrange(desc(bag_label), bag_name, instance_name)

  set.seed(8)
  mildsvm_output <- mildsvm::misvm(x = df1[, 4:123],
                                   y = df1$bag_label,
                                   bags = df1$bag_name,
                                   cost = 1,
                                   method = "heuristic",
                                   control = list(kernel = "radial",
                                                  sigma = 1 / length(4:123)))
  set.seed(8)
  MilDistribution_output <- MilDistribution::MI_SVM(df1, cost = 1)

  expect_equal(mildsvm_output$n_step, MilDistribution_output$total_step)

  # objects are quite different because of different ordering, but as long as predictions match that is okay
  mildsvm_inst_pred <- df1 %>%
    dplyr::select(bag_label, bag_name) %>%
    dplyr::bind_cols(predict(mildsvm_output, new_data = df1, layer = "instance")) %>%
    dplyr::bind_cols(predict(mildsvm_output, new_data = df1, type = "raw", layer = "instance"))

  mildsvm_bag_pred <- df1 %>%
    dplyr::select(bag_label, bag_name) %>%
    dplyr::bind_cols(predict(mildsvm_output, new_data = df1, type = "raw", layer = "bag")) %>%
    dplyr::bind_cols(predict(mildsvm_output, new_data = df1, layer = "bag")) %>%
    dplyr::distinct()

  MilDistribution_pred <- predict(MilDistribution_output, newdata = df1)

  expect_equal(
    mildsvm_inst_pred,
    MilDistribution_pred$instance_level_prediction,
    ignore_attr = TRUE
  )
  expect_equal(
    mildsvm_bag_pred %>% dplyr::arrange(.pred) %>% dplyr::pull(.pred),
    MilDistribution_pred$bag_level_prediction %>% dplyr::arrange(bag_score_pred) %>% dplyr::pull(bag_score_pred),
    ignore_attr = TRUE
  )


})

test_that("cv_misvm.R functions have identical output.", {
  skip_if_not_installed("gurobi")
  skip_if_not_installed("MilDistribution")
  set.seed(8)
  mil_data <- mildsvm::generate_mild_df(nbag = 10, nsample = 7,
                                        positive_prob = 0.15,
                                        sd_of_mean = rep(0.1, 3))
  df1 <- mildsvm::build_instance_feature(mil_data, seq(0.05, 0.95, length.out = 10)) %>%
    dplyr::arrange(desc(bag_label), bag_name, instance_name)


  set.seed(8)
  mildsvm_cv_output <- mildsvm::cv_misvm(x = df1[, 4:123],
                                         y = df1$bag_label,
                                         bags = df1$bag_name,
                                         cost_seq = 2^(-2:2),
                                         n_fold = 3,
                                         # weights = c("0" = 5 / 19, "1" = 1), # set to n_neg_inst / n_pos_bag
                                         method = "heuristic",
                                         control = list(kernel = "radial",
                                                        sigma = 1 / length(4:123)))
  set.seed(8)
  MilDistribution_cv_output <-
    MilDistribution::cv_MI_SVM(df1, n_fold = 3, cost_seq = 2^(-2:2)) %>%
    suppressMessages()

  mildsvm_cv_output$model$model$call <- NULL
  MilDistribution_cv_output$BestMdl$svm_mdl$call <- NULL
  expect_equal(mildsvm_cv_output$misvm_fit$n_step, MilDistribution_cv_output$BestMdl$total_step)

  mildsvm_inst_pred <- df1 %>%
    dplyr::select(bag_label, bag_name) %>%
    dplyr::bind_cols(predict(mildsvm_cv_output, new_data = df1, layer = "instance")) %>%
    dplyr::bind_cols(predict(mildsvm_cv_output, new_data = df1, type = "raw", layer = "instance"))

  mildsvm_bag_pred <- df1 %>%
    dplyr::select(bag_label, bag_name) %>%
    dplyr::bind_cols(predict(mildsvm_cv_output, new_data = df1, type = "raw", layer = "bag")) %>%
    dplyr::bind_cols(predict(mildsvm_cv_output, new_data = df1, layer = "bag")) %>%
    dplyr::distinct()

  MilDistribution_pred <- predict(MilDistribution_cv_output$BestMdl, newdata = df1) %>%
    suppressMessages()

  expect_equal(
    mildsvm_inst_pred,
    MilDistribution_pred$instance_level_prediction,
    ignore_attr = TRUE
  )
  expect_equal(
    mildsvm_bag_pred %>% dplyr::arrange(.pred) %>% dplyr::pull(.pred),
    MilDistribution_pred$bag_level_prediction %>% dplyr::arrange(bag_score_pred) %>% dplyr::pull(bag_score_pred),
    ignore_attr = TRUE
  )

})

test_that("smm.R functions have identical output", {
  skip_if_not_installed("gurobi")
  skip_if_not_installed("MilDistribution")
  set.seed(8)
  n_instances <- 10
  n_samples <- 20
  y <- rep(c(1, -1), each = n_samples * n_instances / 2)
  instances <- as.character(rep(1:n_instances, each = n_samples))
  x <- data.frame(x1 = rnorm(length(y), mean = 1*(y==1)),
                  x2 = rnorm(length(y), mean = 2*(y==1)),
                  x3 = rnorm(length(y), mean = 3*(y==1)))

  # NOTE: yifei's code has a bug in `SMM()` where instance_names are re-ordered
  # in y, but not in the x.  This comes from a dplyr::summarize ordering by the
  # instance_name, which might not align with the way the data was ordered.
  # Thus, we order by the instance_name here.

  # NOTE: also, we need to pass a factor as `y` for `ksvm`, which wasn't made
  # explicit in yifei's code
  df <- data.frame(instance_label = factor(y), instance_name = instances, x) %>%
    dplyr::arrange(instance_name)

  mdl1 <- mildsvm::smm(x = df[, c("x1", "x2", "x3")],
                       y = df$instance_label,
                       instances = df$instance_name,
                       cost = 1,
                       weights = FALSE,
                       control = list(sigma = 0.05, scale = FALSE))
  mdl2 <- MilDistribution::SMM(df %>% dplyr::arrange(instance_name))

  expect_equal(mdl1$ksvm_fit, mdl2$ksvm_res)
  common_components <- c("sigma", "cost")
  expect_equal(mdl1[common_components], mdl2[common_components])
  expect_equal(mdl1$x,
               mdl2$traindata %>% dplyr::select(-instance_label),
               ignore_attr = TRUE)

})

test_that("misvm.R functions have identical output on MilData object.", {
  skip_if_not_installed("gurobi")
  skip_if_not_installed("MilDistribution")
  set.seed(8)
  mil_data <- mildsvm::generate_mild_df(nbag = 10, nsample = 7,
                                        positive_prob = 0.15,
                                        sd_of_mean = rep(0.1, 3)) %>%
    dplyr::arrange(desc(bag_label), bag_name, instance_name)

  # make the quantile functions
  qtls <- seq(0.05, 0.95, length.out = 10)
  q_funs <- purrr::map(qtls,
                       function(qtl) { return(~quantile(.x, qtl)) })
  names(q_funs) <- paste0("q", qtls)


  set.seed(8)
  mdl1 <- mildsvm::misvm(mil_data, cost = 1,
                         .fns = c(list(mean = base::mean, sd = stats::sd), q_funs),
                         method = "heuristic",
                         control = list(kernel = "radial",
                                        sigma = 1 / 120))

  suppressWarnings({
    mdl2 <- MilDistribution::mil_with_feature(mil_data, cost = 1)
  })


  expect_equal(mdl1$svm_fit$coefs, mdl2$svm_mdl$coefs)
  expect_equal(mdl1$n_step, mdl2$total_step)

  # objects are quite different because of different ordering, but as long as predictions match that is okay
  mildsvm_bag_pred <- mil_data %>%
    dplyr::select(bag_label, bag_name) %>%
    dplyr::bind_cols(predict(mdl1, new_data = mil_data, type = "raw", layer = "bag")) %>%
    dplyr::bind_cols(predict(mdl1, new_data = mil_data, layer = "bag")) %>%
    dplyr::distinct()

  # Note prediction doesn't work in MilDistribution, but this is what it should do
  suppressWarnings({
    MilDistribution_pred <- predict(mdl2, newdata = MilDistribution::build_instance_feature(mil_data))
  })

  expect_equal(
    mildsvm_bag_pred %>% dplyr::arrange(.pred) %>% dplyr::pull(.pred),
    MilDistribution_pred$bag_level_prediction %>% dplyr::arrange(bag_score_pred) %>% dplyr::pull(bag_score_pred),
    ignore_attr = TRUE
  )

})

Try the mildsvm package in your browser

Any scripts or data that you put into this service are public.

mildsvm documentation built on July 14, 2022, 9:08 a.m.