#' @title Prediction Function for Dissertation
#'
#'
#' @description This function will run the prediction for my dissertation. The goal is to establish prediction accuracy for each of the models. So, we'll need to generat a design matrix, generate some data, have a predicted value based on the design matrix, have an actual value, then also have an actual group vs a predicted group for each of the models...
#'
#' I want to matrix multiple all of the things, then use logistic to determine which one I should actually use...
#'
#' @param n The sample size to use. This should be 20\% of the actual data used in the simulation.
#' @param c The cutoff to use. This should match that used in the simulation.
#' @param sim_dat Data from the simulation function.
#'
#' @author Michael Floren
pred <- function(sim_dat){
### initial parameters ###
percent_for_test <- .2 #test sample size is 20% of the actual sample. May choose to use 25% instead, but not a big deal...
n <- sim_dat$n*percent_for_test
c <- sim_dat$c
### functions ###
mse <- function(y,yhat){ #mean squared error
sum((yhat-y)^2)/length(y)
}
#the next couple functions need a threshhold to compare to
thresh <- .5
pc <- function(g, ghat){ #percent correct, given decimals for ghat. Remember: 1 is small count, 0 is large count (this works either way, but g and ghat have to be consistent...)
sum((ghat>thresh)==g)/length(g)
}
auc <- function(g, ghat){
tryCatch(pROC::auc(pROC::roc(as.numeric(g),as.numeric(ghat))),error=function(x)NA)
}
# all overall numbers should be the same as for the simulation
k <- nrow(sim_dat$zip_results$count) #the number of rows for any of these should be the same...
p <- ncol(sim_dat$zip_results$count)
mean_of_IV <- 197.7828
sd_of_IV <- 40.21599
if(c==2){
true_sampi <- c(-1.4890070090395, 0.000896026713969103)
true_koppa <- c(2.26780053964254, -0.000373799698134193)
true_gamma <- c(1.79118900609313, -0.000338077141916478)
} else if (c==6){
true_sampi <- c(-0.991816836348344, 0.000632613090524932)
true_koppa <- c(2.51780279408921, -0.000375747468669122)
true_gamma <- c(2.1998559804243, -0.00025944850892789)
} else if (c==8){
true_sampi <- c(-0.989457553026938, 0.000623795679864685)
true_koppa <- c(2.51994084026393, -0.000385423153504593)
true_gamma <- c(2.20836925037315, -0.000300453184670207)
} else{
stop(paste0("Population parameters for c=",c," not set. Simulation stopped."))
}
# making generic lists to hold the information
verbose_results <- list()
results <- list()
#defining matrices for the results
for(i in 1:3){
results[[i]] <- as.data.frame(matrix(ncol=3, nrow=k))
colnames(results[[i]]) <- c("zip","mip","scip")
}
names(results) <- c("mse","percent_cor", "auc")
results$weighted_mse <- as.data.frame(matrix(ncol=3, nrow=k))
colnames(results$weighted_mse) <- c("zip", "scip", "mip")
for(i in 1:k){
# generating data for the run
desmat <- cbind(rep(1,n), rnorm(n, mean=mean_of_IV, sd=sd_of_IV))
actual <- scip_dat(n=n, desmat=desmat, sam=true_sampi, kop=true_koppa, gam=true_gamma, c=c)
verbose_results[[i]] <- as.data.frame(matrix(ncol=p+8, nrow=n)) #the verbose results for the iteration/run. Columns: p for desmat, 2 for out (group and count), 2 for predictions (count and group) for ZIP, SCIP, and MIP
colnames(verbose_results[[i]]) <- c(paste0("desmat", seq(0,ncol(desmat)-1)), "y", "g", "yhat_zip", "ghat_zip", "yhat_scip", "ghat_scip", "yhat_mip", "ghat_mip")
#setting info for the run
for(j in 1:ncol(desmat))
verbose_results[[i]][,paste0("desmat",j-1)] <- desmat[,j]
verbose_results[[i]]$y <- actual$y
verbose_results[[i]]$g <- actual$g
#### Predictions: ZIP ####
zip_group_pred <- exp(desmat%*%sim_dat$zip_results$logistic[i,])/(1+exp(desmat%*%sim_dat$zip_results$logistic[i,])) #exp(x\beta)/(1+exp(x\beta))
zip_count_pred <- exp(desmat%*%sim_dat$zip_results$count[i,])
#setting the weighted mse
results$weighted_mse$zip[i] <- mse(y=actual$y,
yhat=zip_group_pred*0 + (1-zip_group_pred)*zip_count_pred)
#for logistic results that point towards zero, set the predicted count to 0 (use same threshold as above)
zip_count_pred[zip_group_pred>thresh] <- 0
#set the verbose results information
verbose_results[[i]]$yhat_zip <- zip_count_pred
verbose_results[[i]]$ghat_zip <- zip_group_pred
#set the results information
results$mse$zip[i] <- mse(y=actual$y, yhat=zip_count_pred)
results$percent_cor$zip[i] <- pc(g=actual$g, ghat=zip_group_pred)
results$auc$zip[i] <- auc(g=actual$g, ghat=zip_group_pred)
#### Predictions: SCIP ####
# Just treating the logistic piece as the group prediction...
scip_group_pred <- exp(desmat%*%sim_dat$scip_results$logistic[i,])/(1+exp(desmat%*%sim_dat$scip_results$logistic[i,])) #the logistic piece
scip_sc_pred <- exp(desmat%*%sim_dat$scip_results$sc[i,])
scip_lc_pred <- exp(desmat%*%sim_dat$scip_results$lc[i,])
scip_count_pred <- ifelse(scip_group_pred>thresh, scip_sc_pred, scip_lc_pred)
#set the verbose results information
verbose_results[[i]]$yhat_scip <- scip_count_pred
verbose_results[[i]]$ghat_scip <- scip_group_pred
#set the results information
results$mse$scip[i] <- mse(y=actual$y, yhat=scip_count_pred)
results$percent_cor$scip[i] <- pc(g=actual$g, ghat=scip_group_pred)
results$auc$scip[i] <- auc(g=actual$g, ghat=scip_group_pred)
results$weighted_mse$scip[i] <- mse(y=actual$y,
yhat= scip_group_pred*scip_sc_pred + (1-scip_group_pred)*scip_lc_pred)
#### Predictions: MIP ####
names_of_logistic_pieces <- paste0("logistic", 0:c)
#doing the denominator first
denom_for_logistic_mat <- matrix(ncol=c+1, nrow=n)
for(j in 1:(c+1)){ #for all the gammas except for gamma zero, cause this is what the denom sum is. gamma_(c+1) is gamma_J (so including J). Gamma_J is zero.
denom_for_logistic_mat[,j] <- exp(desmat %*% sim_dat$mip_results[[names_of_logistic_pieces[j]]][i,])
}
denom_for_logistic <- 1 + apply(denom_for_logistic_mat, 1, sum)
#determining the logistic prediction pieces
mip_logistic <- as.data.frame(matrix(ncol=c+2, nrow=n))
colnames(mip_logistic) <- c(names_of_logistic_pieces, "logisticJ")
for(j in 1:(c+1)){
mip_logistic[,j] <- exp(desmat%*%sim_dat$mip_results[[names_of_logistic_pieces[j]]][i,])/denom_for_logistic
}
mip_logistic$logisticJ <- 1-apply(mip_logistic[,-ncol(mip_logistic)], 1, sum)
mip_group_pred <- 1-mip_logistic$logisticJ #the probability of being in the small count is 1-\pi_l
mip_count_pred <- exp(desmat%*%sim_dat$mip_results$beta[i,])
#setting the weighted MSE information
weighted_mip <- mip_logistic
for(j in 1:(c+1))
weighted_mip[,j] <- mip_logistic[,j]*(j-1) #the column times its count
weighted_mip$logisticJ <- mip_logistic$logisticJ * mip_count_pred #the count probability times the count prediction
results$weighted_mse$mip[i] <- mse(y=actual$y,
yhat=apply(weighted_mip, 1, sum))
#for each row, check which logistic piece is higher
group <- apply(mip_logistic, 1, function(x) which(x==max(x))[1])-1 #if multiple are tied, just take the first one... subtract 1 to match the column index with the meaning (first column is a count of zero)
for(j in 0:c){ #don't do J, as if the max group is J we want to leave it alone. Only change for
mip_count_pred[group==j] <- j
}
#set the verbose results information
verbose_results[[i]]$yhat_mip <- mip_count_pred
verbose_results[[i]]$ghat_mip <- mip_group_pred
#set the results information
results$mse$mip[i] <- mse(y=actual$y, yhat=mip_count_pred)
results$percent_cor$mip[i] <- pc(g=actual$g, ghat=mip_group_pred)
results$auc$mip[i] <- auc(g=actual$g, ghat=mip_group_pred)
}
list(verbose_results=verbose_results, results=results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.