# this function is supposed to automatically generate ne mean or median vriables from the census
# dataset to boost the precission of our survey prediction
# it automatically determines if a variable is metric or not
# and then use the mean or median on the larger (census) sample to account for lacking
# details in the survey sample. The newly generate vaiabkes of the mean are added to the survey data
# and the model equation is updated
#################################
### The function itself ####
#################################
mean.for.regression <- function(mResponse, censusdata, surveydata, model, location_survey){
if(mResponse == "."){ # instead of "." I want this to be a simple dot (.)
# like in the regression as a placeholder for all varaibles except y
#model.split <- unlist(strsplit(model, split="~")) # splits responses from the Y
#model.split <- gsub(pattern = " ", replacement="" , model.split[2])
# removes all the blanks
#model.split <- unlist(strsplit(model.split, split="\\+"))
model.split <- all.vars(model)[-1]
} else if(is.character(mResponse)){
model.split <- gsub(pattern = " ", replacement="" , mResponse)
# removes all the blanks
model.split <- unlist(strsplit(model.split, split="\\+"))
}
# this first if statement chreates the vector of individual variables as strings
# from which the means are supposed to be calculated
var.names <- rep(NA, length(model.split)) # new variable names
# var.means <- as.data.frame(matrix(NA, nrow = nrow(surveydata), ncol = length(model.split)))
# matrix of means and medians
# we need the locations in the surveydata as factors
col.n.survey <- which(colnames(surveydata)==eval(location_survey))
surveydata[,col.n.survey] <- as.factor(surveydata[,col.n.survey])
# as.factor(eval(parse(text=paste("surveydata$", location_survey, sep = ""))))
# for-loop that determines if a varaible is metric or not and then calculates
# the mean or median respectively and names the variables too
for (i in 1:length(model.split)) {
# first we select the variable we want from our input and connect it to the
# census dataset for the mean or median calculation
col.n.census <- which( colnames(censusdata)==eval(model.split[i]))
varcheck <- censusdata[,col.n.census]
#eval(parse(text=paste("censusdata$", model.split[i], sep = "")))
# if statement that determines if mean or median as computet and saved in a vector
if(is.numeric(varcheck) || is.integer(varcheck)){
# var.means[,i] <- rep(mean(varcheck), nrow(surveydata)) # mean
#tapply(df$speed, df$dive, mean)
var.names[i] <- paste(model.split[i], ".Cmean" , sep = "")
location.mean <- aggregate(as.formula(paste(model.split[i], "~", location_survey)), censusdata, mean)
# aggregrate is actually pretty slow
# there are faster options but they require additional packages
# se here: https://stackoverflow.com/questions/11562656/calculate-the-mean-by-group
names(location.mean)[2] <- paste(names(location.mean)[2], ".Cmean" , sep = "")
location.mean[,1] <- as.factor(location.mean[,1])
surveydata <- merge(surveydata, location.mean) #, by = eval(location_survey))
} else if(is.factor(varcheck)){
#var.means[,i] <- rep(median(varcheck), nrow(surveydata)) # median
var.names[i] <- paste(model.split[i], ".Cmedian" , sep = "")
location.median <- aggregate(as.formula(paste(model.split[i], "~", location_survey)), censusdata, median)
names(location.median)[2] <- paste(names(location.median)[2], ".Cmedian" , sep = "")
location.median[,1] <- as.factor(location.median[,1])
surveydata <- merge(surveydata, location.median)
}
}
#colnames(var.means) <- var.names # defines the new variable names
#######------------------------------------------------########
### this could be sped up by using data.table's rbindlist() ###
#surveydata <- cbind(surveydata, var.means)
#######------------------------------------------------########
# the last step is now to recreate tha part that gets pasted into the model
# possible function for this:
# https://stevencarlislewalker.wordpress.com/2012/08/06/merging-combining-adding-together-two-formula-objects-in-r/
model.in.characters <- as.character(model)
#model <- paste( model , paste(model.split, collapse = " + "), sep = " + ") # old version
modelx <- paste(model.in.characters[3] , paste(var.names, collapse = " + "), sep = " + ")
model.new <- paste(model.in.characters[2], modelx, sep = " ~ ")
# we should check if this hyperparametrisation works!!
return(list(model.new, surveydata))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.