tests/testthat/test-predict.R

testthat::context("Predictions are equivalent to MASS::predict method for polr")

iris$y <- sample(1:5, size = nrow(iris),
                 replace = TRUE)



ordered_logit <- oglm::oglmx(
  data = iris,
  formulaMEAN = "y ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width",
  link = "logit",
  threshparam = NULL,
  constantSD = FALSE,
  constantMEAN = FALSE,
  delta = 0)

ordered_logit_MASS <- MASS::polr("I(factor(y)) ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width",
                                 data = iris,
                                 Hess = TRUE,
                                 method = "logistic")

ordered_probit <- oglm::oglmx(
  data = iris,
  formulaMEAN = "y ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width",
  link = "probit",
  threshparam = NULL, constantSD = FALSE,
  constantMEAN = FALSE, delta = 0)

ordered_probit_MASS <- MASS::polr("I(factor(y)) ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width",
                                  data = iris,
                                  Hess = TRUE,
                                  method = "probit")


# ERROR WHEN NEWDATA IS NULL -----------------------

testthat::expect_error(predict(ordered_probit), regexp = "fit method not yet implemented")
testthat::expect_error(predict(ordered_logit), regexp = "fit method not yet implemented")


# TESTS FOR PROBIT ------------------------

oglm_class <- predict(ordered_probit, iris)
polr_class <- predict(ordered_probit_MASS, iris)

testthat::test_that("Class predictions are the same with oglmx and polr",{
  testthat::expect_equal(
    as.numeric(as.character(polr_class)),
    as.numeric(as.character(oglm_class))
  )
})


oglm_probs <- predict(ordered_probit, iris, type = "probs")
polr_probs <- predict(ordered_probit_MASS, iris, type = "probs")

head(oglm_probs)
head(polr_probs)


testthat::test_that("Prediction matrix has expected dimensions",{
  testthat::expect_equal(
    dim(oglm_probs),
    dim(polr_probs)
  )
})


testthat::test_that("Prediction matrix has values close from MASS::polr [tolerance: at 1e-5 level]",{
  testthat::expect_equal(
    oglm_probs,
    polr_probs,
    tolerance = 2e-5
  )
})


# TESTS FOR LOGIT -------------------------

oglm_class <- predict(ordered_logit, iris)
polr_class <- predict(ordered_logit_MASS, iris)


testthat::test_that("Class predictions are the same with oglmx and polr",{
  testthat::expect_equal(as.numeric(as.character(polr_class)),
                         as.numeric(as.character(oglm_class))
  )
})


testthat::test_that("Prediction matrix has expected dimensions",{
  testthat::expect_equal(
    dim(oglm_probs),
    dim(polr_probs)
  )
})


testthat::test_that("Prediction matrix has values close from MASS::polr [tolerance: at 1e-5 level]",{
  testthat::expect_equal(
    oglm_probs,
    polr_probs,
    tolerance = 2e-5
  )
})



# TEST LATENT MODEL PREDICTION -----------------


## OUTPUT FORMAT AS EXPECTED

xb_logit <- predict(ordered_logit, iris, type = "latent")
xb_probit <- predict(ordered_probit, iris, type = "latent")



testthat::test_that("Return two elements in list: xb, y_latent_pred and sigma",{
  testthat::expect_equal(typeof(xb_logit), "list")
  testthat::expect_equal(typeof(xb_probit), "list")
  testthat::expect_equal(names(xb_logit), c("xb","y_latent_pred","sigma_vector"))
  testthat::expect_equal(names(xb_probit), c("xb","y_latent_pred","sigma_vector"))
}
)

testthat::test_that("Return three elements in list: xb, y_latent_pred and sigma",{
  testthat::expect_true(is.numeric(xb_logit$xb))
  testthat::expect_true(is.numeric(xb_probit$xb))
  testthat::expect_true(is.numeric(xb_probit$y_latent_pred))
  testthat::expect_true(is.numeric(xb_probit$y_latent_pred))
  #  expect_true(is.numeric(xb_logit$sigma))
  #  expect_true(is.numeric(xb_probit$sigma))
})


## CONSISTENCY BETWEEN LATENT AND CLASS MODEL PREDICTIONS --------------

set.seed(242)

n <- 250

x1 <- sample(c(0, 1), n,
             replace = TRUE,
             prob = c(0.75, 0.25))

x2 <- vector("numeric", n)
x2[x1 == 0] <- sample(c(0, 1),
                      n - sum(x1 == 1),
                      replace = TRUE,
                      prob = c(2/3, 1/3))

z <- rnorm(n, 0.5)
# create latent outcome variable
latenty <- -0.5 + 1.5 * x1 - 0.5 * x2 + 0.5 * z + rnorm(n, sd = exp(0.5 *
                                                                      x1 - 0.5 * x2))
# observed y has four possible values: -1,0,1,2
# threshold values are: -0.5, 0.5, 1.5.
y <- vector("numeric", n)
y[latenty < 0.5] <- -1
y[latenty >= 0.5 & latenty < 1.5] <- 0
y[latenty >= 1.5 & latenty < 2.5] <- 1
y[latenty >= 2.5] <- 2
dataset <- data.frame(y, x1, x2)

bounds <- c(0.5, 1.5, 2.5)
lbounds <- log(bounds)




ordered_probit <- oglm::oglmx(
  data = dataset,
  formulaMEAN = "y ~ x1+x2",
  link = "probit",
  threshparam = lbounds,
  constantSD = TRUE)


xb_probit <- predict(ordered_probit, dataset, type = "xb")
latent_probit <- predict(ordered_probit, newdata = dataset, type = "latent")$y_latent_pred


dataset$pred <- latent_probit

dataset$pred_cut <- cut(dataset$pred, c(-Inf, lbounds, Inf),
                        labels = sort(unique(dataset$y))
)

confusion_matrix <- as.data.frame(table(dataset$pred_cut, dataset$y))


testthat::test_that("Accuracy (y_pred == y_obs) is not bad for well specified models",{
  testthat::expect_true(
    sum(confusion_matrix$Freq[confusion_matrix$Var1 == confusion_matrix$Var2])/sum(confusion_matrix$Freq) > 0.4
  )
}
)


# HETEROSCEDASTICITY -----------------------

latenty <- -0.5 + 1.5 * x1 - 0.5 * x2 + 0.5 * z + rnorm(n, sd = exp(0.5 *
                                                                      x1 - 0.5 * x2))
# observed y has four possible values: -1,0,1,2
# threshold values are: -0.5, 0.5, 1.5.
y <- vector("numeric", n)
y[latenty < 0.5] <- -1
y[latenty >= 0.5 & latenty < 1.5] <- 0
y[latenty >= 1.5 & latenty < 2.5] <- 1
y[latenty >= 2.5] <- 2
dataset <- data.frame(y, x1, x2)


ordered_probit2 <- oglm::oglmx(
  data = dataset,
  formulaMEAN = "y ~ x1+x2",
  formulaSD = "~ x1 + x2",
  link = "probit",
  threshparam = lbounds,
  constantSD = TRUE)


latent_probit <- predict(ordered_probit, newdata = dataset, type = "latent")$y_latent_pred

dataset$pred2 <- latent_probit

dataset$pred_cut2 <- cut(dataset$pred2, c(-Inf, lbounds, Inf),
                        labels = sort(unique(dataset$y))
)

confusion_matrix2 <- as.data.frame(table(dataset$pred_cut, dataset$y))


testthat::test_that("Accuracy (y_pred == y_obs) is not bad for well specified models",{
  testthat::expect_true(
    sum(confusion_matrix2$Freq[confusion_matrix2$Var1 == confusion_matrix2$Var2])/sum(confusion_matrix2$Freq) > 0.5
  )
}
)


testthat::test_that("Accuracy (y_pred == y_obs) is improved for well specified models",{
  testthat::expect_true(
    sum(confusion_matrix2$Freq[confusion_matrix2$Var1 == confusion_matrix2$Var2])/sum(confusion_matrix2$Freq) > sum(confusion_matrix$Freq[confusion_matrix$Var1 == confusion_matrix$Var2])/sum(confusion_matrix$Freq)
  )
}
)






## MONTE-CARLO EXPERIMENT

# df <- data.frame(x = rnorm(10e5))
# df$y_latent   <- df$x + rnorm(nrow(df))
# df$y_observed <- Hmisc::cut2(df$y_latent,
#                                cuts = quantile(df$y_latent, probs = c(0.25,.5,.75)))
#
#
# ordered_probit <- ordered_model_threshold(
#   df,
#   formula = "y_observed ~ 0 + x",
#   link = "probit",
#   thresholds = quantile(df$y_latent, probs = c(0.25,.5,.75))
# )
#
# pred <- predict(ordered_probit, df, type = "latent")
# y_predict <- pred$y_latent_pred
# xb <- pred$xb
linogaliana/oglm documentation built on March 5, 2021, 8:27 p.m.