# R/error_bars.R

#' @title error_bars
#' @description Bins a two-column data frame by its first column (x) and
#'   summarises the second column (y) within each bin (mean, min, max,
#'   standard deviation and standard error), then draws the per-bin means with
#'   error bars.
#' @param df A data frame with exactly two columns, in the order (x, y).
#'   Any object inheriting from \code{data.frame} is accepted. 'NA' and 'Inf'
#'   values are not allowed, and at least one row is required.
#' @param s Optional numeric vector of cut points (for example generated by
#'   \code{seq(x, y, z)}) with at least 3 finite values. The cut points are
#'   sorted and de-duplicated before use. Each bin is right-closed and is
#'   labelled by its upper cut point; values above the largest cut point fall
#'   in a bin labelled \code{Inf}. If \code{s} is missing or \code{NULL}, ten
#'   equal-width cut points spanning \code{floor(min(x))} to
#'   \code{ceiling(max(x))} are generated (a single cut point when those two
#'   values coincide).
#' @param draw Which dispersion measure to draw on the error plot; one of
#'   'standard error', 'standard deviation' or 'both'. By default it takes
#'   'both'.
#' @return A list of class 'error_bars.object' with three elements:
#'   (i) \code{plot}, a ggplot dot plot showing the error bars;
#'   (ii) \code{df.summary}, a data frame of per-bin summary variables; and
#'   (iii) \code{df}, a copy of the original data frame with columns renamed
#'   to \code{x} and \code{y} and the \code{bin_x} variable appended as a
#'   third column.
#' @details ggplot2 and dplyr must be installed. They are used through their
#'   namespaces and are not attached to the search path.
#' @examples
#' d <- data.frame(x = c(1, 2, 3, 4), y = c(10, 20, 30, 40))
#' out <- error_bars(d, s = c(1.5, 2.5, 3.5), draw = "standard error")
#' out$df.summary
#' @export
error_bars <- function(df, s = NULL, draw = 'both') {
  # ---- validate 's' -------------------------------------------------------
  # Treat an explicit NULL exactly like an omitted argument. missing() is
  # tested first so that a missing argument forwarded by a wrapper is never
  # evaluated.
  auto_bins <- missing(s) || is.null(s)
  if (!auto_bins) {
    if (!is.numeric(s))
      stop("'s' must be a numeric vector")
    if (length(s) < 3)
      stop("'s' must be at least 3 values long")
    if (any(!is.finite(s)))
      stop("'s' must not contain NA, NaN or infinite values")
    # cut() orders its breaks internally; keep the cut points in the same
    # order so every bin is labelled with its own upper bound.
    s <- sort(unique(s))
  }

  # ---- validate 'df' ------------------------------------------------------
  # inherits() copes with objects carrying several classes (e.g. tibbles),
  # where comparing class(df) to a single string gives a vector condition.
  if (!inherits(df, "data.frame")) {
    stop("df must be a dataframe object and must have only two columns (x,y). If your input looks like a dataframe, check it by using class() and as.data.frame() functions ")
  }
  df <- as.data.frame(df)

  if (ncol(df) != 2) {
    stop("df must have only two columns: (x,y)")
  }
  if (nrow(df) == 0) {
    stop("df must contain at least one row")
  }

  # Check column by column so that mixed column types are not coerced to a
  # character matrix (which would hide infinite values).
  has_bad_values <- vapply(
    df,
    function(column) any(is.na(column) | is.infinite(column)),
    logical(1)
  )
  if (any(has_bad_values)) {
    stop("the data frame contains either Infinite or NA values. These must be removed.")
  }

  # ---- validate 'draw' ----------------------------------------------------
  draw_options <- c('standard error', 'standard deviation', 'both')
  if (!is.character(draw) || length(draw) != 1 || !draw %in% draw_options) {
    stop("'draw can only take the values 'standard error', 'standard deviation', or 'both'.")
  }

  # ---- dependencies -------------------------------------------------------
  # Use the packages through their namespaces instead of attaching them with
  # library(), which would alter the caller's search path.
  for (pkg in c("ggplot2", "dplyr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop(pkg, " must be installed to use error_bars()")
    }
  }

  # ---- cut points ---------------------------------------------------------
  x <- df[[1]]

  if (auto_bins) {
    lower <- floor(min(x))
    upper <- ceiling(max(x))
    if (upper > lower) {
      # Ten equal-width bins; the lower edge is dropped because -Inf is
      # added as the first break below.
      s <- seq(lower, upper, (upper - lower) / 10)[-1]
    } else {
      # All x share one integer value: a zero step cannot build a sequence,
      # so use that value as the only cut point.
      s <- upper
    }
  }

  # ---- bin the data -------------------------------------------------------
  binned <- df
  colnames(binned) <- c('x', 'y')

  # Bins are (-Inf, s1], (s1, s2], ..., (sk, Inf]. Index the upper bounds by
  # the integer bin code rather than round-tripping through character
  # labels, which would lose numeric precision.
  upper_bounds <- c(s, Inf)
  bin_code <- cut(x, breaks = c(-Inf, upper_bounds), labels = FALSE)
  binned$bin_x <- upper_bounds[bin_code]

  # ---- per-bin summary ----------------------------------------------------
  binned_summary <- dplyr::summarise(
    dplyr::group_by(binned, bin_x),
    xmean = mean(x),
    ymin = min(y),
    ymax = max(y),
    ymean = mean(y),
    ysd = sd(y),
    ymean_plus_sd = ymean + ysd,
    ymean_minus_sd = ymean - ysd,
    yse = ysd / sqrt(dplyr::n()),
    ymean_plus_se = ymean + yse,
    ymean_minus_se = ymean - yse,
    num = dplyr::n()
  )

  # ---- plot ---------------------------------------------------------------
  errorbars_plot <- ggplot2::ggplot(binned_summary, ggplot2::aes(bin_x, ymean)) +
    ggplot2::geom_point(size = 2)

  # Standard deviation bars are added first so that, when both are drawn,
  # the standard error bars sit on top of them.
  if (draw %in% c('both', 'standard deviation')) {
    errorbars_plot <- errorbars_plot +
      ggplot2::geom_errorbar(
        ggplot2::aes(ymin = ymean_minus_sd, ymax = ymean_plus_sd,
                     color = 'Standard Deviation'),
        width = 0.75
      )
  }
  if (draw %in% c('both', 'standard error')) {
    errorbars_plot <- errorbars_plot +
      ggplot2::geom_errorbar(
        ggplot2::aes(ymin = ymean_minus_se, ymax = ymean_plus_se,
                     color = 'Standard Error'),
        width = 0.75
      )
  }

  errorbars_plot <- errorbars_plot +
    ggplot2::labs(x = 'x bin', y = 'y values') +
    ggplot2::ggtitle('Plot of error bars') +
    ggplot2::scale_colour_discrete(name = "Measures") +
    ggplot2::theme_bw()

  structure(
    list(plot = errorbars_plot,
         df.summary = binned_summary,
         df = binned),
    class = 'error_bars.object'
  )
}
# bvidgen/RPackage documentation built on May 13, 2019, 9:04 a.m.