tests/testthat/test_build_lm_1.R

context("test build_lm part 2")
test_that("binary prediction with character target column", {
  test_data <- structure(
    list(
      `CANCELLED X` = c("N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N", "Y", "N"),
      `Carrier Name` = c("Delta Air Lines", "American Eagle", "American Airlines", "Southwest Airlines", "SkyWest Airlines", "Southwest Airlines", "Southwest Airlines", "Delta Air Lines", "Southwest Airlines", "Atlantic Southeast Airlines", "American Airlines", "Southwest Airlines", "US Airways", "US Airways", "Delta Air Lines", "Atlantic Southeast Airlines", NA, "Atlantic Southeast Airlines", "Delta Air Lines", "Delta Air Lines"),
      CARRIER = factor(c(NA, "MQ", "AA", "DL", "MQ", "AA", "DL", "DL", "MQ", "AA", "AA", "WN", "US", "US", "DL", "EV", "9E", "EV", "DL", "DL")), # test with factor with NA
      # testing filtering of Inf, -Inf, NA here.
      DISTANCE = c(Inf, -Inf, NA, 187, 273, 1062, 583, 240, 1123, 851, 852, 862, 361, 507, 1020, 1092, 342, 489, 1184, 545)), row.names = c(NA, -20L),
    class = c("tbl_df", "tbl", "data.frame"), .Names = c("CANCELLED X", "Carrier Name", "CARRIER", "DISTANCE"))

  # Make the target variable logical. (Only a logical target is supported for logistic regression.)
  test_data <- test_data %>% dplyr::mutate(`CANCELLED X` = `CANCELLED X` == 'Y')

  # Duplicate the rows to make the data somewhat predictable;
  # otherwise the prediction result would have 0 rows.
  test_data <- dplyr::bind_rows(test_data, test_data)

  model_data <- build_lm.fast(test_data, `CANCELLED X`, `Carrier Name`, CARRIER, DISTANCE,
                              normalize_predictors = TRUE,
                              model_type = "glm", smote=FALSE, with_marginal_effects=TRUE, with_marginal_effects_confint=TRUE)

  ret <- test_data %>% select(-`CANCELLED X`) %>% add_prediction(model_df=model_data)
  ret <- model_data %>% prediction(data="newdata", data_frame=test_data)

  ret <- model_data %>% tidy_rowwise(model, type="vif")
  ret <- model_data %>% glance_rowwise(model, pretty.name=TRUE)
  expect_equal(colnames(ret), c("AUC","F1 Score","Accuracy Rate","Misclass. Rate","Precision",               
                                "Recall","P Value","Rows","Rows for TRUE","Rows for FALSE",
                                "Log Likelihood","AIC","BIC","Residual Deviance","Residual DF","Null Deviance",
                                "Null Model DF"))
  expect_equal(ret$`Rows`, 34)
  expect_equal(ret$`Rows for TRUE`, 4) # This ends up being 4 after doubling.
  expect_equal(ret$`Rows for FALSE`, 30) # This ends up being 30 after doubling and removing NA rows.
  ret <- model_data %>% tidy_rowwise(model)
  ret <- model_data %>% augment_rowwise(model)

  expect_true(nrow(ret) > 0)
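  # A base-R sanity check of the row counts asserted above, assuming build_lm.fast
  # drops rows whose DISTANCE is Inf, -Inf, or NA before fitting. This is an
  # illustrative sketch, not part of the package API.
  kept <- test_data[is.finite(test_data$DISTANCE), ]
  expect_equal(nrow(kept), 34)               # matches `Rows`
  expect_equal(sum(kept$`CANCELLED X`), 4)   # matches `Rows for TRUE`
  expect_equal(sum(!kept$`CANCELLED X`), 30) # matches `Rows for FALSE`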
})
test_that("binary prediction with factor target column", {
  test_data <- tibble::tibble(
      `CANCELLED X` = factor(c("N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N", "Y", "N"), levels=c("A","N","Y","B")),
      `Carrier Name` = c("Delta Air Lines", "American Eagle", "American Airlines", "Southwest Airlines", "SkyWest Airlines", "Southwest Airlines", "Southwest Airlines", "Delta Air Lines", "Southwest Airlines", "Atlantic Southeast Airlines", "American Airlines", "Southwest Airlines", "US Airways", "US Airways", "Delta Air Lines", "Atlantic Southeast Airlines", NA, "Atlantic Southeast Airlines", "Delta Air Lines", "Delta Air Lines"),
      CARRIER = factor(c(NA, "MQ", "AA", "DL", "MQ", "AA", "DL", "DL", "MQ", "AA", "AA", "WN", "US", "US", "DL", "EV", "9E", "EV", "DL", "DL")), # test with factor with NA
      # testing filtering of Inf, -Inf, NA here.
      DISTANCE = c(Inf, -Inf, NA, 187, 273, 1062, 583, 240, 1123, 851, 852, 862, 361, 507, 1020, 1092, 342, 489, 1184, 545))

  # Make the target variable logical. (Only a logical target is supported for logistic regression.)
  test_data <- test_data %>% dplyr::mutate(`CANCELLED X` = `CANCELLED X` == 'Y')

  # Duplicate the rows to make the data somewhat predictable;
  # otherwise the prediction result would have 0 rows.
  test_data <- dplyr::bind_rows(test_data, test_data)

  model_data <- build_lm.fast(test_data, `CANCELLED X`, `Carrier Name`, CARRIER, DISTANCE, model_type = "glm", smote=FALSE, with_marginal_effects=TRUE, with_marginal_effects_confint=FALSE)
  ret <- model_data %>% prediction(data="newdata", data_frame=test_data)
  ret <- model_data %>% glance_rowwise(model, pretty.name=TRUE)
  expect_equal(ret$`Rows`, 34)
  expect_equal(ret$`Rows for TRUE`, 4) # This ends up being 4 after doubling.
  expect_equal(ret$`Rows for FALSE`, 30) # This ends up being 30 after doubling and removing NA rows.
  ret <- model_data %>% tidy_rowwise(model)
  ret <- model_data %>% augment_rowwise(model)

  expect_true(nrow(ret) > 0)
})

test_that("binary prediction with variable_metric argument", {
  test_data <- structure(
    list(
      `CANCELLED X` = factor(c("N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N", "Y", "N"), levels=c("A","N","Y","B")),
      `Carrier Name` = c("Delta Air Lines", "American Eagle", "American Airlines", "Southwest Airlines", "SkyWest Airlines", "Southwest Airlines", "Southwest Airlines", "Delta Air Lines", "Southwest Airlines", "Atlantic Southeast Airlines", "American Airlines", "Southwest Airlines", "US Airways", "US Airways", "Delta Air Lines", "Atlantic Southeast Airlines", NA, "Atlantic Southeast Airlines", "Delta Air Lines", "Delta Air Lines"),
      CARRIER = factor(c(NA, "MQ", "AA", "DL", "MQ", "AA", "DL", "DL", "MQ", "AA", "AA", "WN", "US", "US", "DL", "EV", "9E", "EV", "DL", "DL")), # test with factor with NA
      # testing filtering of Inf, -Inf, NA here.
      DISTANCE = c(Inf, -Inf, NA, 187, 273, 1062, 583, 240, 1123, 851, 852, 862, 361, 507, 1020, 1092, 342, 489, 1184, 545)), row.names = c(NA, -20L),
    class = c("tbl_df", "tbl", "data.frame"), .Names = c("CANCELLED X", "Carrier Name", "CARRIER", "DISTANCE"))

  # Make the target variable logical. (Only a logical target is supported for logistic regression.)
  test_data <- test_data %>% dplyr::mutate(`CANCELLED X` = `CANCELLED X` == 'Y')

  # Duplicate the rows to make the data somewhat predictable;
  # otherwise the prediction result would have 0 rows.
  test_data <- dplyr::bind_rows(test_data, test_data)

  model_data <- build_lm.fast(test_data, `CANCELLED X`, `Carrier Name`, CARRIER, DISTANCE, model_type = "glm", smote=FALSE, variable_metric="odds_ratio")
  ret <- model_data %>% tidy_rowwise(model, variable_metric="odds_ratio")

  model_data <- build_lm.fast(test_data, `CANCELLED X`, `Carrier Name`, CARRIER, DISTANCE, model_type = "glm", smote=FALSE, variable_metric="coefficient")
  ret <- model_data %>% tidy_rowwise(model, variable_metric="coefficient")

  model_data <- build_lm.fast(test_data, `CANCELLED X`, `Carrier Name`, CARRIER, DISTANCE, model_type = "glm", smote=FALSE, variable_metric="ame")
  ret <- model_data %>% tidy_rowwise(model, variable_metric="ame")
  expect_true(c("ame") %in% colnames(ret))

  expect_true(nrow(ret) > 0)
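  # Illustrative note: for a logistic regression, an odds ratio is just the
  # exponentiated coefficient, which is presumably what variable_metric="odds_ratio"
  # reports. A minimal base-R sketch using the built-in mtcars data (independent
  # of the model above):
  fit <- glm(am ~ wt, data = mtcars, family = binomial)
  odds_ratios <- exp(coef(fit))
  expect_true(all(odds_ratios > 0))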
})

test_data <- tibble::tibble(
      `CANCELLED X` = c("N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N", "Y", "N"),
      `Carrier Name` = c("Delta Air Lines", "American Eagle", "American Airlines", "Southwest Airlines", "SkyWest Airlines", "Southwest Airlines", "Southwest Airlines", "Delta Air Lines", "Southwest Airlines", "Atlantic Southeast Airlines", "American Airlines", "Southwest Airlines", "US Airways", "US Airways", "Delta Air Lines", "Atlantic Southeast Airlines", NA, "Atlantic Southeast Airlines", "Delta Air Lines", "Delta Air Lines"),
      CARRIER = factor(c("AA", "MQ", "AA", "DL", "MQ", "AA", "DL", "DL", "MQ", "AA", "AA", "WN", "US", "US", "DL", "EV", "9E", "EV", "DL", "DL")), # test with factor with NA
      # testing filtering of Inf, -Inf, NA here.
      DISTANCE = c(10, 12, 12, 187, 273, 1062, 583, 240, 1123, 851, 852, 862, 361, 507, 1020, 1092, 342, 489, 1184, 545),
      ARR_TIME = c(10, 32, 321, 342, 123, 98, 10, 21, 80, 211, 121, 87, 821, 213, 213, 923, 121, 76, 34, 50),
      DERAY_TIME = c(12, 42, 321, 31, 3, 43, 342, 764, 123, 43, 50, 12, 876, 12, 34, 45, 84, 25, 87, 352))

# Make the target variable logical. (Only a logical target is supported for logistic regression.)
test_data <- test_data %>% dplyr::mutate(`CANCELLED X` = `CANCELLED X` == 'Y')

test_data$klass <- c(rep("A", 10), rep("B", 10))
test_that("add_prediction with linear regression", {
  model_df <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     target_fun="log",
                                     predictor_funs=list(ARR_TIME="log", DERAY_TIME="none", "Carrier Name"="none"),
                                     model_type = "lm")
  ret <- test_data %>% select(-DISTANCE) %>% add_prediction(model_df=model_df)

  df2 <- test_data %>% select(-DISTANCE)
  ret <- df2 %>% add_prediction(model_df=model_df)
  expect_equal(colnames(df2), colnames(ret)[1:length(colnames(df2))]) # Check that the df2 column order is kept.

  expect_error({
    ret <- test_data %>% select(-DISTANCE, -ARR_TIME) %>% add_prediction(model_df=model_df)
  }, regexp=".*ARR_TIME.*Columns are required for the model, but do not exist.*")
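  # A rough base-R analogue of the transformations requested above, assuming
  # target_fun="log" log-transforms the target and each predictor_funs entry is
  # applied to its predictor. This is an illustrative sketch only, not the
  # package's actual preprocessing.
  lm_sketch <- lm(log(DISTANCE) ~ log(ARR_TIME) + DERAY_TIME + `Carrier Name`, data = test_data)
  expect_true(is.finite(summary(lm_sketch)$r.squared))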
})

test_that("Linear Regression with test rate", {
  ret <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     model_type = "lm",
                                     test_rate = 0.1,
                                     test_split_type = "ordered") # testing ordered split too.
  res <- ret %>% tidy_rowwise(model)
  expect_true("Carrier Name: American Airlines" %in% res$term)
  res <- ret %>% tidy_rowwise(model, type="vif")
  expect_true("Carrier Name" %in% res$term)
  expect_equal(colnames(ret), c("model", ".test_index", "source.data"))
  test_rownum <- length(ret$.test_index[[1]])
  training_rownum <- nrow(test_data) - test_rownum

  suppressWarnings({
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(training_rownum, nrow(pred_training))
    expect_equal(test_rownum, nrow(pred_test))

    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME",
                       "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat",
                       "residual_standard_deviation", "cooks_distance")
    expect_true(all(expected_cols %in% colnames(pred_training)))
    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    expect_equal(res$`Rows`, 17)
    variables <- (ret %>% tidy_rowwise(model, type="importance") %>% arrange(desc(importance)))$variable
    names(variables) <- NULL
    res <- ret %>% lm_partial_dependence()
    expect_equal(levels(res$x_name), variables) # Factor order of the PDP should match the importance order.
    expect_true(all(c("conf_high", "conf_low", "bin_sample_size") %in% colnames(res)))
   })
})


test_that("Linear Regression with outlier filtering", {
  ret <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     model_type = "lm",
                                     test_rate = 0.3,
                                     normalize_predictors = TRUE, # testing predictor normalization too.
                                     target_outlier_filter_type="percentile",
                                     target_outlier_filter_threshold=0.9) # testing outlier filter too.
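  # target_outlier_filter_type="percentile" with threshold 0.9 presumably drops
  # rows whose target lies beyond the 90th percentile. A hedged base-R sketch of
  # the upper-tail version of that idea (not the package's exact implementation):
  upper_cut <- quantile(test_data$DISTANCE, 0.9, na.rm = TRUE)
  filtered_sketch <- test_data[test_data$DISTANCE <= upper_cut, ]
  expect_true(nrow(filtered_sketch) < nrow(test_data))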

  expect_equal(colnames(ret), c("model", ".test_index", "source.data"))
  test_rownum <- length(ret$.test_index[[1]])
  #training_rownum <- nrow(test_data) - test_rownum
  training_rownum <- nrow(ret$source.data[[1]]) - test_rownum

  suppressWarnings({
    pred_new <- ret %>% prediction(data="newdata", data_frame=test_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(training_rownum, nrow(pred_training))
    expect_equal(test_rownum, nrow(pred_test))

    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME",
                       "predicted_value", 
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat",
                       "residual_standard_deviation", "cooks_distance")
    expect_true(all(expected_cols %in% colnames(pred_training)))
    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    expect_equal(res$`Rows`, 12)
   })
})

test_that("Group Linear Regression with test_rate", {
  group_data <- test_data %>% group_by(klass)
  ret <- group_data %>%
           build_lm.fast(`DISTANCE`,
                        `ARR_TIME`,
                        model_type = "lm",
                        test_rate = 0.1)
  expect_equal(colnames(ret), c("klass", "model", ".test_index", "source.data"))
  group_nrows <- group_data %>% summarize(n=n()) %>% `[[`("n")
  test_nrows <- sapply(ret$.test_index, length, simplify=TRUE)
  training_nrows <- group_nrows - test_nrows

  suppressWarnings({
    pred_new <- ret %>% prediction(data="newdata", data_frame=group_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(pred_training %>% summarize(n=n()) %>% `[[`("n"),
                 training_nrows)
    expect_equal(pred_test %>% summarize(n=n()) %>% `[[`("n"),
                 test_nrows)

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat", "residual_standard_deviation",
                       "cooks_distance")
    expect_true(all(expected_cols %in% colnames(pred_training)))

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
   })
})

test_that("GLM - Normal Destribution with test_rate", {
  ret <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     model_type = "glm",
                                     family = "gaussian",
                                     test_rate = 0.1)
  expect_equal(colnames(ret), c("model", ".test_index", "source.data"))
  test_rownum <- length(ret$.test_index[[1]])
  training_rownum <- nrow(test_data) - test_rownum

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=test_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(training_rownum, nrow(pred_training))
    expect_equal(test_rownum, nrow(pred_test))

    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME",
                       "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat",
                       "residual_standard_deviation", "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))
    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% evaluate_lm_training_and_test(pretty.name=TRUE)
    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    res <- ret %>% tidy_rowwise(model, type="permutation_importance")
  })
})

test_that("Group GLM - Normal Destribution with test_rate", {
  group_data <- test_data %>% group_by(klass)
  ret <- group_data %>%
           build_lm.fast(`DISTANCE`,
                        `ARR_TIME`,
                        model_type = "glm",
                        family = "gaussian",
                        test_rate = 0.1)
  expect_equal(colnames(ret), c("klass", "model", ".test_index", "source.data"))
  group_nrows <- group_data %>% summarize(n=n()) %>% `[[`("n")
  test_nrows <- sapply(ret$.test_index, length, simplify=TRUE)
  training_nrows <- group_nrows - test_nrows

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=group_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(pred_training %>% summarize(n=n()) %>% `[[`("n"),
                 training_nrows)
    expect_equal(pred_test %>% summarize(n=n()) %>% `[[`("n"),
                 test_nrows)

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME",
                       "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat", "residual_standard_deviation",
                       "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    res <- ret %>% tidy_rowwise(model, type="permutation_importance")
   })
})

test_that("GLM - Gamma Destribution with test_rate", {
  ret <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     model_type = "glm",
                                     family = "Gamma",
                                     test_rate = 0.1)
  expect_equal(colnames(ret), c("model", ".test_index", "source.data"))
  test_rownum <- length(ret$.test_index[[1]])
  training_rownum <- nrow(test_data) - test_rownum

  suppressWarnings({
    res <- prediction(ret, data = "training_and_test", pretty.name=TRUE)
    pred_new <- prediction(ret, data = "newdata", data_frame=test_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(training_rownum, nrow(pred_training))
    expect_equal(test_rownum, nrow(pred_test))

    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME",
                       "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat",
                       "residual_standard_deviation", "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))
    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
   })
})

test_that("Group GLM - Gamma Destribution with test_rate", {
  group_data <- test_data %>% group_by(klass)
  ret <- group_data %>%
           build_lm.fast(`DISTANCE`,
                        `ARR_TIME`,
                        model_type = "glm",
                        family = "Gamma",
                        test_rate = 0.1)
  expect_equal(colnames(ret), c("klass", "model", ".test_index", "source.data"))
  group_nrows <- group_data %>% summarize(n=n()) %>% `[[`("n")
  test_nrows <- sapply(ret$.test_index, length, simplify=TRUE)
  training_nrows <- group_nrows - test_nrows

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=group_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(pred_training %>% summarize(n=n()) %>% `[[`("n"),
                 training_nrows)
    expect_equal(pred_test %>% summarize(n=n()) %>% `[[`("n"),
                 test_nrows)

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat", "residual_standard_deviation",
                       "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
   })
})

test_that("GLM - Inverse Gaussian Destribution with test_rate", {
  ret <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     model_type = "glm",
                                     family = "inverse.gaussian",
                                     test_rate = 0.1)
  expect_equal(colnames(ret), c("model", ".test_index", "source.data"))
  test_rownum <- length(ret$.test_index[[1]])
  training_rownum <- nrow(test_data) - test_rownum

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=test_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(training_rownum, nrow(pred_training))
    expect_equal(test_rownum, nrow(pred_test))

    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME",
                       "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat",
                       "residual_standard_deviation", "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))
    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
   })
})

test_that("Group GLM - Inverse Gaussian Destribution with test_rate", {
  group_data <- test_data %>% group_by(klass)
  ret <- group_data %>%
           build_lm.fast(`DISTANCE`,
                        `ARR_TIME`,
                        model_type = "glm",
                        family = "inverse.gaussian",
                        test_rate = 0.1)
  expect_equal(colnames(ret), c("klass", "model", ".test_index", "source.data"))
  group_nrows <- group_data %>% summarize(n=n()) %>% `[[`("n")
  test_nrows <- sapply(ret$.test_index, length, simplify=TRUE)
  training_nrows <- group_nrows - test_nrows

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=group_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(pred_training %>% summarize(n=n()) %>% `[[`("n"),
                 training_nrows)
    expect_equal(pred_test %>% summarize(n=n()) %>% `[[`("n"),
                 test_nrows)

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat", "residual_standard_deviation",
                       "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
   })
})

test_that("add_prediction with poisson regression", {
  model_df <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     predictor_funs=list(ARR_TIME="log", DERAY_TIME="none", "Carrier Name"="none"),
                                     model_type = "glm",
                                     family = "poisson",
                                     importance_measure="firm")
  ret <- test_data %>% select(-DISTANCE) %>% add_prediction(model_df=model_df)
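  # For a Poisson GLM with the default log link, predictions on the link scale
  # and on the response scale differ by an exp(). A hedged base-R illustration of
  # that relationship, independent of build_lm.fast's own prediction columns:
  pois_sketch <- glm(DISTANCE ~ ARR_TIME + DERAY_TIME, data = test_data, family = poisson)
  expect_equal(predict(pois_sketch, type = "response"),
               exp(predict(pois_sketch, type = "link")))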
})

test_that("GLM - poisson Destribution with test_rate", {
  ret <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     model_type = "glm",
                                     family = "poisson",
                                     test_rate = 0.1,
                                     importance_measure="firm")
  expect_equal(colnames(ret), c("model", ".test_index", "source.data"))
  test_rownum <- length(ret$.test_index[[1]])
  training_rownum <- nrow(test_data) - test_rownum

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=test_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(training_rownum, nrow(pred_training))
    expect_equal(test_rownum, nrow(pred_test))

    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME",
                       "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat",
                       "residual_standard_deviation", "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))
    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    res <- ret %>% tidy_rowwise(model, type="permutation_importance")
   })
})

test_that("Group GLM - Poisson Destribution with test_rate", {
  group_data <- test_data %>% group_by(klass)
  ret <- group_data %>%
           build_lm.fast(`DISTANCE`,
                        `ARR_TIME`,
                        model_type = "glm",
                        family = "poisson",
                        test_rate = 0.3)
  expect_equal(colnames(ret), c("klass", "model", ".test_index", "source.data"))
  group_nrows <- group_data %>% summarize(n=n()) %>% `[[`("n")
  test_nrows <- sapply(ret$.test_index, length, simplify=TRUE)
  training_nrows <- group_nrows - test_nrows

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=group_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(pred_training %>% summarize(n=n()) %>% `[[`("n"),
                 training_nrows)
    expect_equal(pred_test %>% summarize(n=n()) %>% `[[`("n"),
                 test_nrows)

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat", "residual_standard_deviation",
                       "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    res <- ret %>% tidy_rowwise(model, type="permutation_importance")
    res <- ret %>% lm_partial_dependence()
   })
})

test_that("GLM - Negative Binomial Destribution with test_rate", {
  ret <- test_data %>% build_lm.fast(`DISTANCE`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     model_type = "glm",
                                     family = "negativebinomial",
                                     test_rate = 0.1)
  expect_equal(colnames(ret), c("model", ".test_index", "source.data"))
  test_rownum <- length(ret$.test_index[[1]])
  training_rownum <- nrow(test_data) - test_rownum

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=test_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(training_rownum, nrow(pred_training))
    expect_equal(test_rownum, nrow(pred_test))

    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME",
                       "predicted_value",
                       "conf_low","conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat",
                       "residual_standard_deviation", "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))
    expected_cols <- c("Carrier Name", "DISTANCE", "ARR_TIME", "DERAY_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    res <- ret %>% tidy_rowwise(model, type="permutation_importance")
    res <- ret %>% lm_partial_dependence()
   })
})

test_that("Group GLM - Negative Binomial Destribution with test_rate", {
  group_data <- test_data %>% group_by(klass)
  ret <- group_data %>%
           build_lm.fast(`DISTANCE`,
                        `ARR_TIME`,
                        model_type = "glm",
                        family = "negativebinomial",
                        test_rate = 0.1)
  expect_equal(colnames(ret), c("klass", "model", ".test_index", "source.data"))
  group_nrows <- group_data %>% summarize(n=n()) %>% `[[`("n")
  test_nrows <- sapply(ret$.test_index, length, simplify=TRUE)
  training_nrows <- group_nrows - test_nrows

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=group_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(pred_training %>% summarize(n=n()) %>% `[[`("n"),
                 training_nrows)
    expect_equal(pred_test %>% summarize(n=n()) %>% `[[`("n"),
                 test_nrows)

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low","conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat", "residual_standard_deviation",
                       "cooks_distance", "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_training)))

    expected_cols <- c("klass", "DISTANCE", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    res <- ret %>% tidy_rowwise(model, type="permutation_importance")
    res <- ret %>% lm_partial_dependence()
   })
})

test_that("add_prediction with logistic regression", {
  model_df <- test_data %>% build_lm.fast(`CANCELLED X`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     predictor_funs=list(ARR_TIME="log", DERAY_TIME="none", "Carrier Name"="none"),
                                     model_type = "glm",
                                     importance_measure="firm")
  ret <- test_data %>% select(-`CANCELLED X`) %>% add_prediction(model_df=model_df)
  expect_true(all(c("predicted_probability", "linear_predictor","predicted_label") %in% colnames(ret)))
})

test_that("Logistic Regression with test_rate", {
  ret <- test_data %>% build_lm.fast(`CANCELLED X`,
                                     `ARR_TIME`,
                                     `DERAY_TIME`,
                                     `Carrier Name`,
                                     family = "binomial",
                                     model_type = "glm",
                                     test_rate = 0.1)
  expect_equal(colnames(ret), c("model", ".test_index", "source.data"))
  test_rownum <- length(ret$.test_index[[1]])
  training_rownum <- nrow(test_data) - test_rownum

  variables <- (ret %>% tidy_rowwise(model, type="importance") %>% arrange(desc(importance)))$variable
  names(variables) <- NULL
  res <- ret %>% lm_partial_dependence()
  expect_equal(levels(res$x_name), variables) # Factor order of the PDP should match the importance order.

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=test_data)
    pred_training_and_test <- ret %>% prediction_binary(data = 'training_and_test', threshold = 0.5)
    pred_training_and_test_conf_mat <- ret %>% prediction_training_and_test(prediction_type = 'conf_mat', threshold = 0.5)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(training_rownum, nrow(pred_training))
    expect_equal(test_rownum, nrow(pred_test))

    expected_cols <- c("CANCELLED X", "Carrier Name", "ARR_TIME", "DERAY_TIME",
                       "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat",
                       "residual_standard_deviation", "cooks_distance", "predicted_response", "predicted_label")
    expect_true(all(expected_cols %in% colnames(pred_training)))
    expected_cols <- c("CANCELLED X", "Carrier Name", "ARR_TIME", "DERAY_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response", "predicted_label")
    expect_true(all(expected_cols %in% colnames(pred_test)))
    res <- ret %>% tidy_rowwise(model, pretty.name=TRUE)
    expected_cols <- c("Term", "Coefficient", "Std Error", "t Value", "P Value", "Conf High", "Conf Low", "Odds Ratio", "Base Level")
    expect_true(all(expected_cols %in% colnames(res)))
    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
    res <- ret %>% evaluate_binary_training_and_test(`CANCELLED X`, threshold = 0.5, pretty.name=TRUE)
    expect_equal(nrow(res), 2) # 2 for training and test.
    res <- ret %>% lm_partial_dependence()
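    # The binary metrics reported above (accuracy, precision, recall, ...) are all
    # derived from a confusion matrix of actual vs. predicted labels. A hedged
    # base-R sketch on the training predictions, assuming predicted_label holds
    # the thresholded class:
    conf_mat_sketch <- table(actual = pred_training$`CANCELLED X`,
                             predicted = pred_training$predicted_label,
                             useNA = "ifany")
    expect_equal(sum(conf_mat_sketch), nrow(pred_training))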
   })
})

test_that("Group Logistic Regression with test_rate", {
  group_data <- test_data %>% group_by(klass)
  ret <- group_data %>%
           build_lm.fast(`CANCELLED X`,
                        `ARR_TIME`,
                        model_type = "glm",
                        family = "binomial",
                        link = "logit",
                        test_rate = 0.1)
  expect_equal(colnames(ret), c("klass", "model", ".test_index", "source.data"))
  group_nrows <- group_data %>% summarize(n=n()) %>% `[[`("n")
  test_nrows <- sapply(ret$.test_index, length, simplify=TRUE)
  training_nrows <- group_nrows - test_nrows

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=group_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(pred_training %>% summarize(n=n()) %>% `[[`("n"),
                 training_nrows)
    expect_equal(pred_test %>% summarize(n=n()) %>% `[[`("n"),
                 test_nrows)

    # Since broom 0.7.0, I sometimes see "residuals" missing here, but not consistently. Will keep watching.
    expected_cols <- c("klass", "CANCELLED X", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat", "residual_standard_deviation",
                       "cooks_distance", "predicted_response", "predicted_label")
    expect_true(all(expected_cols %in% colnames(pred_training)))

    expected_cols <- c("klass", "CANCELLED X", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response", "predicted_label")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
   })
})

test_that("Group Logistic Regression with test_rate with weight", {
  group_data <- test_data %>% group_by(klass)
  ret <- group_data %>% mutate(Weight=sin(1:n())+1) %>%
           build_lm.fast(`CANCELLED X`,
                        `ARR_TIME`,
                        weight=`Weight`,
                        model_type = "glm",
                        family = "binomial",
                        link = "logit",
                        test_rate = 0.1)
  # Check the numbers so that we can detect any change in broom or stats in the future.
  expect_equal((ret %>% tidy_rowwise(model))$estimate, c(-24.840867308, 0.001245984, -1.104902459, -0.002945304), tolerance = 0.001)
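  # The weight= argument presumably maps to case weights of the underlying glm
  # call. A hedged base-R sketch of weighted logistic regression on the ungrouped
  # data (not tied to the grouped model above):
  w <- sin(seq_len(nrow(test_data))) + 1
  weighted_sketch <- suppressWarnings(
    glm(`CANCELLED X` ~ ARR_TIME, data = test_data, family = binomial, weights = w))
  expect_equal(length(coef(weighted_sketch)), 2)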
  expect_equal(colnames(ret), c("klass", "model", ".test_index", "source.data"))
  group_nrows <- group_data %>% summarize(n=n()) %>% `[[`("n")
  test_nrows <- sapply(ret$.test_index, length, simplify=TRUE)
  training_nrows <- group_nrows - test_nrows

  suppressWarnings({
    pred_new <- prediction(ret, data = "newdata", data_frame=group_data)
    pred_training <- prediction(ret, data = "training")
    pred_test <- prediction(ret, data = "test")
    expect_equal(pred_training %>% summarize(n=n()) %>% `[[`("n"),
                 training_nrows)
    expect_equal(pred_test %>% summarize(n=n()) %>% `[[`("n"),
                 test_nrows)

    # Since broom 0.7.0, I sometimes see "residuals" missing here, but not consistently. Will keep watching.
    expected_cols <- c("klass", "CANCELLED X", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "residuals", "standardised_residuals", "hat", "residual_standard_deviation",
                       "cooks_distance", "predicted_response", "predicted_label")
    expect_true(all(expected_cols %in% colnames(pred_training)))

    expected_cols <- c("klass", "CANCELLED X", "ARR_TIME", "predicted_value",
                       "conf_low", "conf_high",
                       "standard_error",
                       "predicted_response", "predicted_label")
    expect_true(all(expected_cols %in% colnames(pred_test)))

    res <- ret %>% glance_rowwise(model, pretty.name=TRUE)
   })
})