R/mean.for.regression2.R

# #################################
# ###    The function itself   ####
# #################################
#
# #input:
#
# all( 1:3 %in%  1:8)
#
# unlist(strsplit("a,b", split=","))
#
#
#
# mean.for.regression <- function(mResponse, censusdata, surveydata, model, location_survey){
#   if(mResponse == ".") {
#     vars_for_mean_calculation <- all.vars(model)[-1]
#   } else if(is.character(mResponse) & length(mResponse == 1)){ # Fall: "a + b + c + d" oder "a, b, c, d"
#     # replace " " by "" --> remove blanks
#     vars_for_mean_calculation <- gsub(pattern = " ", replacement="" , mResponse)
#     vars_for_mean_calculation <- unlist(strsplit(model.split, split="\\+"))
#     vars_for_mean_calculation <- unlist(strsplit(model.split, split=","))
#   } else if(is.character(mResponse)){
#     vars_for_mean_calculation <- mResponse
#   } else {
#     stop("In order to include the means of variables included in the census in the model fit on the surveydata, you have to give a
#          a) string with the variables you want to include separated by \"+\" or \",\" or
#          b) a character vector with your variables
#          c) a \"\'.\'\" as string, indicating that you want to include the mean of all the variables in your model")
#   }
#   if(!all( mResponse %in%  names(censusdata))){
#     stop("your input for mResponse includes variables that are not present in the censusdata set.
#          Means for those variables cannot be calculated")
#   }
#   if(!all( mResponse %in%  names(survey))){
#     warning("your input for mResponse includes variables that are not present in the surveydata set")
#   }
#
#   means_from_census <- cbind(subset(censusdata, select = vars_for_mean_calculation), location_survey)
#   means_from_census <- try(by(data = means_from_census,
#                               INDICES = location_survey, FUN = mean))
#   if(class(means_from_census) == "try-error"){
#     stop("your input for mResponse corresponds to variables in the censusdata set that are not numeric.
#          Means for those variables cannot be calculated")
#   }
#
#   surveydata <- merge(surveydata, means_from_census, by = paste(varname_location_survey)))
#   var.names <- paste(vars_for_mean_calculation, "_meanCensus", sep = "")
#
#
#
#   # for-loop that determines if a varaible is metric or not and then calculates
#   # the mean or median respectively and names the variables too
#   for (i in 1:length(vars_for_mean_calculation)) {
#     numeric_or_factor[i] <- is.numeric(censusdata[i])
#     # first we select the variable we want from our input and connect it to the
#     # census dataset for the mean or median calculation
#     varcheck <-  censusdata[,which(model.split[i] == colnames(censusdata))]
#     #eval(parse(text=paste("censusdata$", model.split[i], sep = "")))
#     # if statement that determines if mean or median as computet and saved in a vector
#     if(is.numeric(varcheck) || is.integer(varcheck)){
#       var.names[i] <- paste(model.split[i], ".Cmean" , sep = "")
#       location.mean <- aggregate(as.formula(paste(model.split[i], "~", location_survey)), censusdata, mean)
#       # aggregrate is actually pretty slow
#       # there are faster options but they require additional packages
#       # se here: https://stackoverflow.com/questions/11562656/calculate-the-mean-by-group
#       names(location.mean)[2] <- paste(names(location.mean)[2], ".Cmean" , sep = "")
#       location.mean[,1] <- as.factor(location.mean[,1])
#
#       surveydata <- merge(surveydata, location.mean) #, by = eval(location_survey))
#     } else if(is.factor(varcheck)){
#       #var.means[,i] <- rep(median(varcheck), nrow(surveydata)) # median
#       var.names[i] <- paste(model.split[i], ".Cmedian" , sep = "")
#       location.median <- aggregate(as.formula(paste(model.split[i], "~", location_survey)), censusdata, median)
#
#       names(location.median)[2] <- paste(names(location.median)[2], ".Cmedian" , sep = "")
#       location.median[,1] <- as.factor(location.median[,1])
#       surveydata <- merge(surveydata, location.median)
#     }
#   }
#   #colnames(var.means) <- var.names # defines the new variable names
#   #######------------------------------------------------########
#   ### this could be sped up by using data.table's rbindlist() ###
#   #surveydata <- cbind(surveydata, var.means)
#   #######------------------------------------------------########
#
#   # the last step is now to recreate tha part that gets pasted into the model
#   # possible function for this:
#   # https://stevencarlislewalker.wordpress.com/2012/08/06/merging-combining-adding-together-two-formula-objects-in-r/
#   model.in.characters <- as.character(model)
#
#   #model <- paste( model , paste(model.split, collapse = " + "), sep = " + ") # old version
#   modelx <- paste(model.in.characters[3] , paste(var.names, collapse = " + "), sep = " + ")
#   model.new <- paste(model.in.characters[2], modelx, sep = " ~ ")
#   # we should check if this hyperparametrisation works!!
#   return(list(model.new, surveydata))
# }
#
nikosbosse/SAE documentation built on May 12, 2019, 4:37 a.m.