library(MASS) # library(tikzDevice) library(Rmixmod) library(mvtnorm) library(scoringTools) set.seed(123)
mu0 <- array(0, c(8, 1)) mu1 <- array(1, c(8, 1)) Posdef <- function(n, ev = runif(n, 0, 3)) { Z <- matrix(ncol = n, rnorm(n^2)) decomp <- qr(Z) Q <- qr.Q(decomp) R <- qr.R(decomp) d <- diag(R) ph <- d / abs(d) O <- Q %*% diag(ph) Z <- t(O) %*% diag(ev) %*% O return(Z) } sigma0 <- Posdef(n = 8) sigma1 <- Posdef(n = 8)
m_test <- 100000 m <- 10000 nbvar <- c(4, 8, 15, 30) p0 <- 0.5 set.seed(21) y <- rbinom(m_test, 1, p0) data_test_pred <- array(0, c(4, m_test, 51)) data_test_gen <- array(0, c(4, m_test, 51)) data_test_bayes <- array(0, c(4, 3)) for (n in 2:2) { x <- array(0, c(m_test, nbvar[n])) x[y == 0, ] <- mvrnorm(n = sum(y == 0), mu0[1:nbvar[n]], sigma0[1:nbvar[n], 1:nbvar[n]]) x[y == 1, ] <- mvrnorm(n = sum(y == 1), mu1[1:nbvar[n]], sigma1[1:nbvar[n], 1:nbvar[n]]) data_test_pred[n, , 1:(nbvar[n] + 1)] <- as.matrix(cbind.data.frame(y = y, x = x)) data_test_gen[n, , ] <- data_test_pred[n, , ] data_test_gen[n, , 1] <- ifelse(data_test_pred[n, , 1] == 0, 2, 1) e0 <- log(p0) + dmvnorm(x[y == 0, ], mu0[1:nbvar[n]], sigma0[1:nbvar[n], 1:nbvar[n]], log = TRUE) < log(1 - p0) + dmvnorm(x[y == 0, ], mu1[1:nbvar[n]], sigma1[1:nbvar[n], 1:nbvar[n]], log = TRUE) e1 <- log(p0) + dmvnorm(x[y == 1, ], mu0[1:nbvar[n]], sigma0[1:nbvar[n], 1:nbvar[n]], log = TRUE) > log(1 - p0) + dmvnorm(x[y == 1, ], mu1[1:nbvar[n]], sigma1[1:nbvar[n], 1:nbvar[n]], log = TRUE) eb <- mean(c(e0, e1)) ebconf <- prop.test(sum(c(e0, e1)), m_test)$conf.int[1:2] data_test_bayes[n, ] <- c(eb, ebconf[1], ebconf[2]) rm(x) } rm(y) cut <- seq(0, .9, by = 0.02) tx_erreur <- array(NA, c(4, length(cut), 5, 20)) gini <- array(NA, c(4, length(cut), 5, 20)) rm(ebconf) rm(eb) rm(e0) rm(e1) gc()
Not run: takes roughly 10 to 30 minutes.
for (random in 1:20) { set.seed(random) y <- rbinom(m, 1, p0) data_learn_pred <- array(0, c(4, m, 51)) data_learn_gen <- array(0, c(4, m, 51)) for (n in 2:2) { x <- array(0, c(m, nbvar[n])) x[y == 0, ] <- mvrnorm(n = sum(y == 0), mu0[1:nbvar[n]], sigma0[1:nbvar[n], 1:nbvar[n]]) x[y == 1, ] <- mvrnorm(n = sum(y == 1), mu1[1:nbvar[n]], sigma1[1:nbvar[n], 1:nbvar[n]]) data_learn_pred[n, , 1:(nbvar[n] + 1)] <- as.matrix(cbind.data.frame(y = y, x = x)) data_learn_gen[n, , ] <- data_learn_pred[n, , ] data_learn_gen[n, , 1] <- ifelse(data_learn_gen[n, , 1] == 0, 2, 1) rm(x) } rm(y) gc() #### Eventual loop on number of featuers (frozen to 8) #### for (j in 2:2) { # On convertit l'ensemble d'apprentissage en dataframe learn_pred <- data.frame(data_learn_pred[j, , 1:(nbvar[j] + 1)]) learn_gen <- data.frame(data_learn_gen[j, , 1:(nbvar[j] + 1)]) # On construit la formule appliquee a glm (elle depend du nombre de variables dans le modele) PredictorVariables <- paste("X", 2:(nbvar[j] + 1), sep = "") Formula <- formula(paste("X1 ~ ", paste(PredictorVariables, collapse = " + "))) rm(PredictorVariables) gc() # On entraine un modele glm sur toutes les donnees pour simuler les refuses model_pred_tot <- glm(Formula, "binomial", learn_pred) #### Boucle sur le cut #### for (i in 1:length(cut)) { # On calcule les acceptes et rejetes en fonction du cut et du premier modele accepte <- (model_pred_tot$fitted.values > cut[i]) rejete <- (model_pred_tot$fitted.values <= cut[i]) # On controle qu'il reste des acceptes et des mauvais payeurs parmi les acceptes if (sum(accepte) > 0 & !(sum(learn_pred[accepte, 1]) == nrow(learn_pred[accepte, ]))) { # On construit l'ensemble d'apprentissage partiel learn_pred_part <- learn_pred[accepte, ] learn_gen_part <- learn_gen learn_gen_part[rejete, 1] <- NA # On entra?ne les modeles model_pred_part <- glm(Formula, family = binomial(link = "logit"), learn_pred_part) model_gen_part <- mixmodCluster( data = learn_gen_part[, 2:(nbvar[j] + 1)], knownLabels = learn_gen_part[, 1], nbCluster = 2 ) # On predit leur resultat sur l'ensemble de test et on enregistre le taux d'erreur model_pred_part.pred <- predict(model_pred_part, data.frame(data_test_pred[j, , ]), type = "response") >= 0.5 model_pred_part.erreur <- sum(abs(data_test_pred[j, , 1] - model_pred_part.pred)) / m_test model_pred_part.gini <- normalizedGini( data_test_pred[j, , 1], predict(model_pred_part, data.frame(data_test_pred[j, , ]), type = "response") ) model_gen_res <- mixmodPredict( data.frame(data_test_gen[j, , 2:(nbvar[j] + 1)]), model_gen_part@bestResult ) model_gen_part.pred <- model_gen_res@partition model_gen_part.erreur <- sum(abs(data_test_gen[j, , 1] - model_gen_part.pred)) / m_test model_gen_part.gini <- normalizedGini( data_test_pred[j, , 1], model_gen_res@proba[, 1] ) ## Augmentation ## if (sum(rejete) > 0) { model_pred_augmente <- augmentation( learn_pred_part[, -1], learn_pred[rejete, -1], learn_pred_part[, 1] ) model_pred_augmente.pred <- predict(model_pred_augmente@infered_model, data.frame(x = data.frame(data_test_pred[j, , ])), type = "response" ) >= 0.5 model_pred_augmente.erreur <- sum(abs(data_test_pred[j, , 1] - model_pred_augmente.pred)) / m_test model_pred_augmente.gini <- normalizedGini( data_test_pred[j, , 1], predict(model_pred_augmente@infered_model, data.frame(x = data.frame(data_test_pred[j, , ])), type = "response" ) ) ## Parcelling ## model_pred_parcelling <- parcelling( xf = learn_pred_part[, -1], xnf = learn_pred[rejete, -1], yf = learn_pred_part[, 1] ) model_pred_parcelling.pred <- predict(model_pred_parcelling@infered_model, data.frame(x = data.frame(data_test_pred[j, , ])), type = "response" ) >= 0.5 model_pred_parcelling.erreur <- sum(abs(data_test_pred[j, , 1] - model_pred_parcelling.pred)) / m_test model_pred_parcelling.gini <- normalizedGini( data_test_pred[j, , 1], predict(model_pred_parcelling@infered_model, data.frame(x = data.frame(data_test_pred[j, , ])), type = "response" ) ) ## Reclassification ## model_pred_reclassification <- reclassification( xf = learn_pred_part[, -1], xnf = learn_pred[rejete, -1], yf = learn_pred_part[, 1] ) model_pred_reclassification.pred <- predict( model_pred_reclassification@infered_model, data.frame(x = data.frame(data_test_pred[j, , ])), type = "response" ) >= 0.5 model_pred_reclassification.erreur <- sum(abs(data_test_pred[j, , 1] - model_pred_reclassification.pred)) / m_test model_pred_reclassification.gini <- normalizedGini( data_test_pred[j, , 1], predict(model_pred_reclassification@infered_model, data.frame(x = data.frame(data_test_pred[j, , ])), type = "response" ) ) } # On renvoie le taux d'erreur des modeles if (sum(rejete) > 0) { tx_erreur[j, i, , random] <- c( model_pred_part.erreur, model_pred_augmente.erreur, model_pred_reclassification.erreur, model_pred_parcelling.erreur, model_gen_part.erreur ) gini[j, i, , random] <- c( model_pred_part.gini, model_pred_augmente.gini, model_pred_reclassification.gini, model_pred_parcelling.gini, model_gen_part.gini ) } else { tx_erreur[j, i, , random] <- c( model_pred_part.erreur, model_pred_part.erreur, model_pred_part.erreur, model_pred_part.erreur, model_gen_part.erreur ) gini[j, i, , random] <- c( model_pred_part.gini, model_pred_part.gini, model_pred_part.gini, model_pred_part.gini, model_gen_part.gini ) } gc() } gc() } } }
Consequently, not run either.
# path_figure_tx_erreur <- # file.path( # dirname(rstudioapi::getActiveDocumentContext()$path), # "../TEX_CODE/reintegration_simu_miss_erreur.tex" # ) # # path_figure_gini <- # file.path( # dirname(rstudioapi::getActiveDocumentContext()$path), # "../TEX_CODE/reintegration_simu_miss_gini.tex" # ) ############ Figure error rate ############################################## tx_erreur_moy <- array(0, c(length(cut), 5)) gini_moy <- array(0, c(length(cut), 5)) for (methode in 1:5) { for (random in 1:20) { tx_erreur_moy[, methode] <- tx_erreur_moy[, methode] + tx_erreur[2, , methode, random] gini_moy[, methode] <- gini_moy[, methode] + gini[2, , methode, random] } tx_erreur_moy[, methode] <- tx_erreur_moy[, methode] / 20 gini_moy[, methode] <- gini_moy[, methode] / 20 } # tikz(path_figure_tx_erreur, # width = 7, # height = 4.4, # engine = "pdftex") plot( x = cut, y = tx_erreur_moy[, 1], xlim = c(0, .9), ylim = c(.1, .30), ylab = "Error rate on test set", xlab = "Cut-off value", pch = 15, xaxt = "n", yaxt = "n", type = "o" ) par(new = TRUE) plot( x = cut, y = tx_erreur_moy[, 2], xlim = c(0, .9), ylim = c(.1, .30), ylab = "", xlab = "", pch = 17, col = "green", xaxt = "n", yaxt = "n", type = "o" ) par(new = TRUE) plot( x = cut, y = tx_erreur_moy[, 3], xlim = c(0, .9), ylim = c(.1, .30), ylab = "", xlab = "", pch = 7, col = 2, xaxt = "n", yaxt = "n", type = "o" ) par(new = TRUE) plot( x = cut, y = tx_erreur_moy[, 4], xlim = c(0, .9), ylim = c(.1, .30), ylab = "", xlab = "", pch = 8, col = 590, xaxt = "n", yaxt = "n", type = "o" ) par(new = TRUE) plot( x = cut, y = tx_erreur_moy[, 5], xlim = c(0, .9), ylim = c(.1, .30), ylab = "", xlab = "", pch = 9, col = 376, xaxt = "n", yaxt = "n", type = "o" ) axis(1, at = pretty(cut), lab = pretty(cut), las = TRUE ) axis(2, at = pretty(seq(0.1, 0.3, 0.05), n = 10), lab = pretty(seq(0.1, 0.3, 0.05), n = 10), las = TRUE ) legend( 0.35, 0.3, pch = c(15, 17, 7, 8, 9), col = c(1, "green", 2, 590, 376), legend = c( "Logistic regression", "Augmentation", "Reclassification", "Parcelling", "Gaussian mixture" ) ) # dev.off() ############ Figure gini ############################################## # tikz(path_figure_gini, # width = 7, # height = 4.4, # engine = "pdftex") plot( x = cut, y = gini_moy[, 1], xlim = c(0, .9), ylim = c(.74, .93), ylab = "Gini on test set", xlab = "Cut-off value", pch = 15, xaxt = "n", yaxt = "n", type = "o" ) par(new = TRUE) plot( x = cut, y = gini_moy[, 2], xlim = c(0, .9), ylim = c(.74, .93), ylab = "", xlab = "", pch = 17, col = "green", xaxt = "n", yaxt = "n", type = "o" ) par(new = TRUE) plot( x = cut, y = gini_moy[, 3], xlim = c(0, .9), ylim = c(.74, .93), ylab = "", xlab = "", pch = 7, col = 2, xaxt = "n", yaxt = "n", type = "o" ) par(new = TRUE) plot( x = cut, y = gini_moy[, 4], xlim = c(0, .9), ylim = c(.74, .93), ylab = "", xlab = "", pch = 8, col = 590, xaxt = "n", yaxt = "n", type = "o" ) par(new = TRUE) plot( x = cut, y = gini_moy[, 5], xlim = c(0, .9), ylim = c(.74, .93), ylab = "", xlab = "", pch = 9, col = 376, xaxt = "n", yaxt = "n", type = "o" ) axis(1, at = pretty(cut), lab = pretty(cut), las = TRUE ) axis(2, at = pretty(seq(0.75, 0.95, 0.1), n = 10), lab = pretty(seq(0.75, 0.95, 0.1), n = 10), las = TRUE ) legend( 0, 0.835, pch = c(15, 17, 7, 8, 9), col = c(1, "green", 2, 590, 376), legend = c( "Logistic regression", "Augmentation", "Reclassification", "Parcelling", "Gaussian mixture" ) ) # dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.