# modeling
#' crossValidatedPredictor
#' @note DevStatus: one pass - utility ?/5
#' TODO(Albert): Discuss methodology ++ Use caret prob better
#'
#' @description Model selection tool with K-time cross validation and final validation on preserved data.
#'
#' @param data
#' @param dVarname = NULL
#' @param dependantVariable
#' @param independantVariables
#' @param type = "leaveOneOut"
#' @param family = "gaussian"
#' @param kTimes = NULL
#' @param loopOverModels = T
#' @param preservedData = F
#' @param plotInfo = T
#' @param TestWithRandomError = T
#'
#' @export
crossValidatedPredictor = function (data,
idVarname = NULL,
dependantVariable,
independantVariables ,
type = "leaveOneOut",
family = "gaussian",
kTimes = NULL,
loopOverModels = T,
preservedData = F,
plotInfo = T,
FTestWithRandomError = T) {
data = copy(data)
if (is.null(idVarname)) {
data[, crossValId := 1:NROW(data)]
idVarname = "crossValId"
}
## K-time cross validation based on the number of ids
idList = data[, get(idVarname)]
if (is.null(kTimes)) {
kTimes = NROW(idList)
}
writeLines(paste("Starting a", kTimes ,"Times Cross validation algorithm"))
# If presevedData option set
# Preserve test set for a final test with the best model
if (preservedData) {
preservedProportion = 0.3
writeLines(paste("Preserving", preservedProportion,"of data for ultimate test"))
preservedTestSet = sample(idList, round(preservedProportion*NROW(idList)))
idList = idList[idList%nin%preservedTestSet]
kTimes = NROW(idList)
}
# If model selection
# Build formula strings for each combination of dimension for the regression
formulaStrings = NULL
nIndependant = NROW(independantVariables)
if (loopOverModels) {
for (i in 1:nIndependant) {
colnamesCombination = combinations(n = nIndependant, r = i, v = independantVariables)
for (j in 1:NROW(colnamesCombination)) {
formulaString = paste0(dependantVariable, "~",paste0(colnamesCombination[j,], collapse = "+"))
formulaStrings = c(formulaStrings, formulaString)
}
}
} else {
# no model selection
formulaString = paste0(dependantVariable, "~",paste0(independantVariables, collapse = "+"))
formulaStrings = formulaString
}
# setup the output variables
residualLog = NULL
coeficientsLog = NULL
# Start the K-Time loop of crossvalidation
writeLines("Starting K-Times cross validation loop")
for (i in 1:kTimes) {
# test/training depends on type of crossvalidation
if (type=="leaveOneOut") {
# take out the ith subject
trainingIds = idList[(1:kTimes)%nin%i]
testId = idList[i]
}
# setup added row
residualLogToAdd = data.table(kId = i)
# test for all the possible formulas
for (j in 1:NROW(formulaStrings)) {
# get a formula object from the jth formula string
formulaObject = as.formula(formulaStrings[j])
# Linear regression using a gaussian GLM or a betaregression (logitLink with approximated variance)
coefList = rep(0,(nIndependant+1))
if (family == "betareg") {
resGLM = betareg(formula = formulaObject, data = data[id%in%trainingIds,])
coefList[1:NROW(resGLM$coefficients[[1]])] = resGLM$coefficients[[1]]
} else {
resGLM = glm(formula = formulaObject, data = data[id%in%trainingIds,], family = "gaussian")
coefList[1:NROW(resGLM$coefficients)] = resGLM$coefficients
}
# residuals here are the sum of squared residuals SSR
# training error
predictTraining = predict(resGLM, data = data[id%in%trainingIds,])
nonNaIndex = as.numeric(names(predict(resGLM, data = data[id%in%trainingIds,])))
ssrTraining = sum((data[id%in%trainingIds,][[dependantVariable]][nonNaIndex] - predictTraining)^2, na.rm=T)
# SSR of training with the null model
if (is.na(ssrTraining)) {
ssrTrainingNullModel = NA
} else {
ssrTrainingNullModel = sum((data[id%in%trainingIds,][[dependantVariable]] - mean(data[id%in%trainingIds,][[dependantVariable]], na.rm =T))^2, na.rm=T)
}
# predict the test set with the coef obtained with the training set
resPredict = predict(resGLM, data[id==testId,])
# compute the residuals
ssrToAdd = sum((data[id==testId, ][[dependantVariable]] - resPredict)^2)
# get residuals of the null model
if (is.na(ssrToAdd)) {
ssrNullModel = NA
} else {
ssrNullModel = sum((data[id==testId, ][[dependantVariable]] - mean(data[id%in%trainingIds,][[dependantVariable]], na.rm =T))^2)
}
# add the residuals of this specific model
residualLogToAdd[, (formulaStrings[j]) := list(abs(ssrToAdd))]
# log the coeficients for that model and that kValue
coefsToAdd = data.table(kId = i, formula = formulaStrings[j])
coefsToAdd[, c(paste0("Coefs",1:(nIndependant+1))) := as.list(coefList)]
# number of subjects
N = NROW(nonNaIndex)
# number of variables
P = NROW(all.vars(as.formula( formulaStrings[j])))-1
coefsToAdd[, c("trainingSSR", "trainingSSRNullModel", "N", "P","testSSRFullModel", "testSSRNullModel", "F", "pHO") := list(ssrTraining, ssrTrainingNullModel, N, P, ssrToAdd, ssrNullModel)]
coeficientsLog = rbind(coeficientsLog, coefsToAdd)
}
# Add the residuals to the final residual log
residualLog = rbind(residualLog, residualLogToAdd)
}
r2Training = 1 - (sum(coeficientsLog[, trainingSSR]/coeficientsLog[, N], na.rm= T) / sum(coeficientsLog[, trainingSSRNullModel]/coeficientsLog[, N], na.rm=T))
r2Test = 1 - (sum(coeficientsLog[, testSSRFullModel], na.rm= T) / sum(coeficientsLog[, testSSRNullModel], na.rm=T))
writeLines(paste("Full model on training set with mean R2 = ", r2Training, " and sqrt(MSE) = ", sqrt(mean(coeficientsLog[, trainingSSR]/coeficientsLog[, N], na.rm=T))))
writeLines(paste("Full model on test set with mean R2 = ", r2Test, " and sqrt(MSE) = ", sqrt(mean(coeficientsLog[, testSSRFullModel], na.rm=T))))
# F test is not adequate.. if models have the same number of predictors - do a F test between all full models and null models (same df)
# if (NROW(unique(coeficientsLog[, P]))==1) {
# numberOfNonNA = ntrue(!is.na(coeficientsLog[, testSSRFullModel]))
# P = unique(coeficientsLog[, testSSRFullModel])[1]
# probabilityTestFullModelIsSameAsNull = 1-pf(1-r2Test, P-1, numberOfNonNA-P)
# if (probabilityTestFullModelIsSameAsNull<0.05) {
# writeLines(paste("After cross validation, full model predict data significantly better on training (p = ", probabilityTestFullModelIsSameAsNull, ", F = ", 1-r2Test," [",P-1,",",numberOfNonNA-P,"])"))
# }
#
# }
stop()
if (loopOverModels) {
# Find the most predictive formula/beta
if (NCOL(residualLog)==2) {
nameCol = colnames(residualLog)[2]
meanResiduals = residualLog[, nameCol, with=F]/ntrue(!is.na(residualLog[[2]]))
meanResiduals = colSums(meanResiduals, na.rm=T)
} else {
meanResiduals = colSums(residualLog[[2:NCOL(residualLog)]], na.rm=T)/ntrue(!is.na(residualLog[[2]]))
}
bestResidualValue = meanResiduals[which(meanResiduals == min(meanResiduals, na.rm = T))]
bestResidualValueModel = names(meanResiduals)[which(meanResiduals == min(meanResiduals, na.rm = T))]
# get all variables from the formula string - the first one is the independant variable
writeLines(paste("The best model after K-Times crossvalidation is:", bestResidualValueModel))
writeLines(paste("With residuals equal to", bestResidualValue))
variablesInFormula = all.vars(as.formula(bestResidualValueModel))
independantVariables = variablesInFormula[2:NROW(variablesInFormula)]
if (plotInfo) {
meltedResidualLog = melt(residualLog, id.vars = "kId")
ggplot(meltedResidualLog[variable!=bestResidualValueModel,], aes(x=kId, y=value))+
geom_point(col=1) + geom_smooth(col=1)+
geom_point(data = meltedResidualLog[variable==bestResidualValueModel,], col=2)+
geom_smooth(data = meltedResidualLog[variable==bestResidualValueModel,], col=2) + theme_bw()
meltedCoeficients = melt(coeficientsLog[formula==bestResidualValueModel,], id.vars = c("kId","formula") )
ggplot(meltedCoeficients[variable!="Coefs1"], aes(x=kId, y=value, col=variable)) + geom_point()
}
# Test it on the excluded set varying the parameters using those found for each kIteration
# Get the residuals from a fit on the whole excluded set
writeLines("Starting to test on the preserved data")
dataPreserved = copy(data[id%in%preservedTestSet,])
fit = glm(formula = as.formula(bestResidualValueModel), data = dataPreserved, family = "gaussian")
real = data.matrix(dataPreserved[, dependantVariable, with=F])
predicted = predict(fit, dataPreserved)
residuals = abs(predicted-real)
coeficientsForBestModel = coeficientsLog[formula==bestResidualValueModel, ]
residualsVector = NULL
minResidual = 900000
bestFitResiduals = NULL
bestFitPredicted = NULL
bestFitCoeficients = NULL
for(k in 1:NROW(coeficientsForBestModel)) {
coeficientsForK = coeficientsForBestModel[k]
# predict dependant variable
fit$coefficients = as.matrix(coeficientsForK[, paste0("Coefs", 1:(NROW(independantVariables)+1)), with=F])
predicted = predict(fit, dataPreserved)
residuals = abs(predicted-real)
percentError = residuals/real
# manual predict
# Get the columns of the data corresponding to the variables in the formula
featureMatrix = cbind(matrix(data = 1,ncol = 1,nrow = NROW(dataPreserved)),data.matrix(dataPreserved[, independantVariables, with=F]))
predictedMatrix = featureMatrix %*% t(as.matrix(coeficientsForK[, paste0("Coefs", 1:(NROW(independantVariables)+1)), with=F]))
residualMatrix = abs(predictedMatrix - data.matrix(dataPreserved[, dependantVariable, with=F]))
residualsVector = c(residualsVector, mean(residuals, na.rm=T))
# add a FTest here comparing the new model with the best model so far
if (mean(residuals, na.rm=T)<minResidual) {
bestFitResiduals = residuals
bestFitPredicted = predicted
bestFitCoeficients = coeficientsForK
minResidual = mean(residuals, na.rm=T)
bestFitpredicitonAccuracy = mean(1 -residuals / real, na.rm= T)
}
}
if (plotInfo) {
print(ggplot(data.table(realValue = real, residuals = bestFitResiduals), aes(real, residuals)) + geom_point() + geom_smooth())
plot = ggplot(data.table(realValue = real, predicted = bestFitPredicted), aes(real, predicted))+geom_point()+stat_function(fun = function(x){return(x)}, geom="line")
print(plot)
}
fTestconclusionString = ""
if (FTestWithRandomError) {
writeLines("Comparing the selected model to a normally distributed random model")
# Check wether the model better predict the dependant variable than a mixed model with only a normally distriuted error term centered on the mean
# Build the model centered on the mean of the dependant variable and using the variance of the dependant variable
# "training"
meanDependantVariable = mean(data[, get(dependantVariable)], na.rm=T)
sdDependantVariable = sd(data[, get(dependantVariable)], na.rm=T)
writeLines(paste("Model built using a normal distriution of Mean ", meanDependantVariable, " and standard deviation of ", sdDependantVariable))
# Prediction == random sampling for that distribution
sizePreserved = NROW(dataPreserved)
predictedVector = rnorm(n = sizePreserved, mean = meanDependantVariable, sd = sdDependantVariable)
# Get residuals
residualsSimpleModel = abs(predictedVector - real)
writeLines(paste("Mean of residuals for normal model is ", mean(residualsSimpleModel, na.rm=T)))
# Compute F value follows F-distribution (L - L0, N - L)
model1residuals = as.matrix(bestFitResiduals)
model0residuals = as.matrix(residualsSimpleModel)
L = NROW(independantVariables)
L0 = 1
N = NROW(na.omit(model0residuals))
# check with Van der Ville which one is good
fValue = ((t(na.omit(model0residuals)) %*% na.omit(model0residuals)) - (t(na.omit(model1residuals)) %*% na.omit(model1residuals))) / (L-L0)
fValue = fValue / ((t(na.omit(model1residuals)) %*% na.omit(model1residuals))/(N-L))
writeLines(paste("F value (", (L-L0) ,",", (N-L) ,") between best model and normal model : ", fValue))
probabilityH0 = df(fValue, df1 = (L-L0), df2 = (N-L))
fTestconclusionString = paste("The probability that the best fit model gives the same amount of information as a random model is ", probabilityH0)
## Prediction accuracy = 1 - (abs(val-predicted)/val)))
restictedModelPredicitonAccuracy = mean(1 - abs(predictedVector - real) / real, na.rm= T)
## Training on preserved data
# get a formula object from the jth formula string
formulaObject = as.formula(bestResidualValueModel)
# regress using a gaussian GLM on the dimension of independant variables to predict dependant variable
if (family == "betareg") {
resGLM = betareg(formula = formulaObject, data = dataPreserved)
} else {
resGLM = glm(formula = formulaObject, data = dataPreserved, family = "gaussian")
}
# predict conners scores
resPredict = predict(resGLM, dataPreserved)
# compute the residuals
residualsFullPreserved = abs(dataPreserved[[dependantVariable]] - resPredict)
fullPreservedPredicitonAccuracy = mean(1 - residualsFullPreserved / dataPreserved[[dependantVariable]], na.rm= T)
# ## comparison FTest with training data
# fValue = ((t(na.omit(model0residuals)) %*% na.omit(model0residuals)) - (t(na.omit(model1residuals)) %*% na.omit(model1residuals))) / (L-L0)
# fValue = fValue / ((t(na.omit(model1residuals)) %*% na.omit(model1residuals))/(N-L))
# writeLines(paste("F value (", (L-L0) ,",", (N-L) ,") between best model and normal model : ", fValue))
#
# probabilityH0 = df(fValue, df1 = (L-L0), df2 = (N-L))
# fTestconclusionString = paste("The probability that the best fit model gives the same amount of information as a random model is ", probabilityH0)
}
stop()
writeLines("---------------------------------------")
kBest = which(residualsVector==min(residualsVector, na.rm=T))
writeLines(paste("Best model:", bestResidualValueModel))
writeLines(fTestconclusionString)
writeLines(paste0("Mean residual for best model is ", mean(residualsVector, na.rm=T)))
writeLines(paste0("Min Error [k == ", kBest ,"] is ", min(residualsVector, na.rm=T)))
writeLines(paste("Best coeficients:"))
variablesInFormula[1] = "(Intercept)"
setnames(bestFitCoeficients, paste0("Coefs", 1:(NROW(independantVariables)+1)), variablesInFormula)
print(bestFitCoeficients[, variablesInFormula, with=F])
#
# writeLines(paste("Restricted model predict the data with", restictedModelPredicitonAccuracy, "accuracy."))
# writeLines(paste("Full model trained on training set predicts the preserved data with", bestFitpredicitonAccuracy, "accuracy."))
# writeLines(paste("Full model trained on preserved data predicts the data with", fullPreservedPredicitonAccuracy, "accuracy."))
#
}
# writeLines(paste("If preserved dataset used for training and testing, mean residuals would have been : ", mean(residualsIfTrainedOnIt)))
}
#' sigmoid function
#'
#'@description credits: Kyriakos Chatzidimitriou
#'http://kyrcha.info/2012/07/08/tutorials-fitting-a-sigmoid-function-in-r/
#'
sigmoid = function(x, params=c(1,1,0), type = "exp") {
switch(type,
exp={
return(params[1] / (1 + exp(-params[2] * (x - params[3]))))
},
power={
return(param[1]^param[3]/param[1]^param[3]+param[2]^param[3])
},
stop("sigmoid: invalid type (should be exp or power)."))
}
#' Return a function interpolating point between (from_x, from_y) and (to_x, to_y)
#' following a logarithmic decay
#'
#' @param from_x numeric start point x
#' @param from_y numeric start point y
#' @param to_x numeric end point x
#' @param to_y numeric end point y
#' @param t numeric logarithmic slope
#'
#' @export
#'
generate_log_interpolation_function = function (from_x = 0, to_x = 0.05, from_y = 1, to_y = 0, t = 10) {
expa = exp(t*(to_y-from_y))
b = (-to_x*expa+from_x) / (1-expa)
g = (to_x - b) / exp(-t*to_y)
l = function(p) {
return(-log((p-b)/g)/t)
}
return(l)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.