R/mean.for.regression.R

# this function is supposed to automatically generate ne mean or median vriables from the census
# dataset to boost the precission of our survey prediction
# it automatically determines if a variable is metric or not
# and then use the mean or median on the larger (census) sample to account for lacking
# details in the survey sample. The newly generate vaiabkes of the mean are added to the survey data
# and the model equation is updated


#################################
###    The function itself   ####
#################################

mean.for.regression <- function(mResponse, censusdata, surveydata, model, location_survey){
  if(mResponse == "."){ # instead of "." I want this to be a simple dot (.)
    # like in the regression as a placeholder for all varaibles except y
    #model.split <- unlist(strsplit(model, split="~")) # splits responses from the Y
    #model.split <- gsub(pattern = " ", replacement="" , model.split[2])
    # removes all the blanks
    #model.split <- unlist(strsplit(model.split, split="\\+"))
    model.split <- all.vars(model)[-1]
  } else if(is.character(mResponse)){
    model.split <- gsub(pattern = " ", replacement="" , mResponse)
    # removes all the blanks
    model.split <- unlist(strsplit(model.split, split="\\+"))
  }
  # this first if statement chreates the vector of individual variables as strings
  # from which the means are supposed to be calculated



  var.names <- rep(NA, length(model.split)) # new variable names
  # var.means <- as.data.frame(matrix(NA, nrow = nrow(surveydata), ncol = length(model.split)))
  # matrix of means and medians

  # we need the locations in the surveydata as factors
  col.n.survey <- which(colnames(surveydata)==eval(location_survey))
  surveydata[,col.n.survey] <- as.factor(surveydata[,col.n.survey])
  # as.factor(eval(parse(text=paste("surveydata$", location_survey, sep = ""))))

  # for-loop that determines if a varaible is metric or not and then calculates
  # the mean or median respectively and names the variables too
  for (i in 1:length(model.split)) {
    # first we select the variable we want from our input and connect it to the
    # census dataset for the mean or median calculation
    col.n.census <- which( colnames(censusdata)==eval(model.split[i]))
    varcheck <-  censusdata[,col.n.census]
    #eval(parse(text=paste("censusdata$", model.split[i], sep = "")))
    # if statement that determines if mean or median as computet and saved in a vector
    if(is.numeric(varcheck) || is.integer(varcheck)){
      # var.means[,i] <- rep(mean(varcheck), nrow(surveydata)) # mean
      #tapply(df$speed, df$dive, mean)
      var.names[i] <- paste(model.split[i], ".Cmean" , sep = "")
      location.mean <- aggregate(as.formula(paste(model.split[i], "~", location_survey)), censusdata, mean)
      # aggregrate is actually pretty slow
      # there are faster options but they require additional packages
      # se here: https://stackoverflow.com/questions/11562656/calculate-the-mean-by-group
      names(location.mean)[2] <- paste(names(location.mean)[2], ".Cmean" , sep = "")
      location.mean[,1] <- as.factor(location.mean[,1])

      surveydata <- merge(surveydata, location.mean) #, by = eval(location_survey))
    } else if(is.factor(varcheck)){
      #var.means[,i] <- rep(median(varcheck), nrow(surveydata)) # median
      var.names[i] <- paste(model.split[i], ".Cmedian" , sep = "")
      location.median <- aggregate(as.formula(paste(model.split[i], "~", location_survey)), censusdata, median)

      names(location.median)[2] <- paste(names(location.median)[2], ".Cmedian" , sep = "")
      location.median[,1] <- as.factor(location.median[,1])
      surveydata <- merge(surveydata, location.median)
    }
  }
  #colnames(var.means) <- var.names # defines the new variable names
  #######------------------------------------------------########
  ### this could be sped up by using data.table's rbindlist() ###
  #surveydata <- cbind(surveydata, var.means)
  #######------------------------------------------------########

  # the last step is now to recreate tha part that gets pasted into the model
  # possible function for this:
  # https://stevencarlislewalker.wordpress.com/2012/08/06/merging-combining-adding-together-two-formula-objects-in-r/
  model.in.characters <- as.character(model)

  #model <- paste( model , paste(model.split, collapse = " + "), sep = " + ") # old version
  modelx <- paste(model.in.characters[3] , paste(var.names, collapse = " + "), sep = " + ")
  model.new <- paste(model.in.characters[2], modelx, sep = " ~ ")
  # we should check if this hyperparametrisation works!!
  return(list(model.new, surveydata))
}
nikosbosse/SAE documentation built on May 12, 2019, 4:37 a.m.